# • File: test_encode.py
#   • Full Path: /home/masbinta/public_html/admin/installer/css/sass/sym404/root/usr/local/lib64/python3.6/site-packages/sklearn/utils/tests/test_encode.py
#   • File size: 7.16 KB
#   • MIME-type: text/x-python
#   • Charset: utf-8
import pickle

import numpy as np
import pytest
from numpy.testing import assert_array_equal

from sklearn.utils._encode import _unique
from sklearn.utils._encode import _encode
from sklearn.utils._encode import _check_unknown


@pytest.mark.parametrize(
    "values, expected",
    [
        pytest.param(
            np.array([2, 1, 3, 1, 3], dtype='int64'),
            np.array([1, 2, 3], dtype='int64'),
            id='int64',
        ),
        pytest.param(
            np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
            np.array(['a', 'b', 'c'], dtype=object),
            id='object',
        ),
        pytest.param(
            np.array(['b', 'a', 'c', 'a', 'c']),
            np.array(['a', 'b', 'c']),
            id='str',
        ),
    ],
)
def test_encode_util(values, expected):
    # _unique must return the expected distinct values for each dtype, and
    # _encode must then map every input value to its index in those uniques.
    categories = _unique(values)
    assert_array_equal(categories, expected)

    # All three inputs share the same layout, so the codes are identical.
    expected_codes = np.array([1, 0, 2, 0, 2])
    codes = _encode(values, uniques=categories)
    assert_array_equal(codes, expected_codes)


def test_encode_with_check_unknown():
    # Exercise the check_unknown parameter of _encode() for numeric and
    # object dtypes.
    unseen_msg = 'y contains previously unseen labels'

    known_ints = np.array([1, 2, 3])
    ints_with_extra = np.array([1, 2, 3, 4])

    # check_unknown=True: the label 4 is not in the uniques, so it must raise
    with pytest.raises(ValueError, match=unseen_msg):
        _encode(ints_with_extra, uniques=known_ints, check_unknown=True)

    # check_unknown=False: the very same call must not raise
    _encode(ints_with_extra, uniques=known_ints, check_unknown=False)

    # object dtype: the flag has no effect, the unseen label 'd' still raises
    known_strs = np.array(['a', 'b', 'c'], dtype=object)
    strs_with_extra = np.array(['a', 'b', 'c', 'd'], dtype=object)
    with pytest.raises(ValueError, match=unseen_msg):
        _encode(strs_with_extra, uniques=known_strs, check_unknown=False)


def _assert_check_unknown(values, uniques, expected_diff, expected_mask):
    # Shared helper: verify _check_unknown in both of its calling modes.

    # Plain call: only the unknown values are returned.
    assert_array_equal(_check_unknown(values, uniques), expected_diff)

    # return_mask=True: the unknown values plus a per-element validity mask.
    unknown, mask = _check_unknown(values, uniques, return_mask=True)
    assert_array_equal(unknown, expected_diff)
    assert_array_equal(mask, expected_mask)


@pytest.mark.parametrize(
    "values, uniques, expected_diff, expected_mask",
    [
        # numeric, no NaN: 4 is the only value absent from uniques
        (
            np.array([1, 2, 3, 4]),
            np.array([1, 2, 3]),
            [4],
            [True, True, True, False],
        ),
        # numeric, no NaN: values and uniques in a different order
        (
            np.array([2, 1, 4, 5]),
            np.array([2, 5, 1]),
            [4],
            [True, True, False, True],
        ),
        # NaN in values only: NaN is reported as unknown
        (
            np.array([2, 1, np.nan]),
            np.array([2, 5, 1]),
            [np.nan],
            [True, True, False],
        ),
        # NaN in both values and uniques: NaN counts as known
        (
            np.array([2, 1, 4, np.nan]),
            np.array([2, 5, 1, np.nan]),
            [4],
            [True, True, False, True],
        ),
        # NaN in values only, together with another unknown value
        (
            np.array([2, 1, 4, np.nan]),
            np.array([2, 5, 1]),
            [4, np.nan],
            [True, True, False, False],
        ),
        # NaN in uniques only: it does not show up in the diff
        (
            np.array([2, 1, 4, 5]),
            np.array([2, 5, 1, np.nan]),
            [4],
            [True, True, False, True],
        ),
        # object-dtype strings
        (
            np.array(['a', 'b', 'c', 'd'], dtype=object),
            np.array(['a', 'b', 'c'], dtype=object),
            np.array(['d'], dtype=object),
            [True, True, True, False],
        ),
        # object-dtype strings, shuffled
        (
            np.array(['d', 'c', 'a', 'b'], dtype=object),
            np.array(['a', 'c', 'b'], dtype=object),
            np.array(['d'], dtype=object),
            [False, True, True, True],
        ),
        # native string dtype
        (
            np.array(['a', 'b', 'c', 'd']),
            np.array(['a', 'b', 'c']),
            np.array(['d']),
            [True, True, True, False],
        ),
        # native string dtype, shuffled
        (
            np.array(['d', 'c', 'a', 'b']),
            np.array(['a', 'c', 'b']),
            np.array(['d']),
            [False, True, True, True],
        ),
    ],
)
def test_check_unknown(values, uniques, expected_diff, expected_mask):
    # Each case is checked both with and without return_mask by the helper.
    _assert_check_unknown(values, uniques, expected_diff, expected_mask)


@pytest.mark.parametrize("missing_value", [None, np.nan, float('nan')])
@pytest.mark.parametrize('pickle_uniques', [True, False])
def test_check_unknown_missing_values(missing_value, pickle_uniques):
    # _check_unknown on object-dtype arrays that contain a missing value
    # (None or a NaN), optionally with the uniques round-tripped via pickle.
    scenarios = [
        # (values, uniques, expected_diff, expected_mask)
        # missing value is among the uniques: only 'd' is unknown
        (['d', 'c', 'a', 'b', missing_value],
         ['c', 'a', 'b', missing_value],
         ['d'],
         [False, True, True, True, True]),
        # missing value is not among the uniques: reported next to 'd'
        (['d', 'c', 'a', 'b', missing_value],
         ['c', 'a', 'b'],
         ['d', missing_value],
         [False, True, True, True, False]),
        # the missing value is the only unknown entry
        (['a', missing_value],
         ['a', 'b', 'z'],
         [missing_value],
         [True, False]),
    ]

    for raw_values, raw_uniques, expected_diff, expected_mask in scenarios:
        values = np.array(raw_values, dtype=object)
        uniques = np.array(raw_uniques, dtype=object)
        if pickle_uniques:
            # run the same check against a serialized/deserialized copy
            uniques = pickle.loads(pickle.dumps(uniques))
        _assert_check_unknown(values, uniques, expected_diff, expected_mask)


@pytest.mark.parametrize('missing_value', [np.nan, None, float('nan')])
@pytest.mark.parametrize('pickle_uniques', [True, False])
def test_unique_util_missing_values_objects(missing_value, pickle_uniques):
    # _unique and _encode on an object-dtype array holding one missing value
    values = np.array(['a', 'c', 'c', missing_value, 'b'], dtype=object)
    expected_uniques = np.array(['a', 'b', 'c', missing_value], dtype=object)

    found = _unique(values)

    if missing_value is not None:
        # NaN case: compare the string part, then check the trailing entry
        # with isnan
        assert_array_equal(found[:-1], expected_uniques[:-1])
        assert np.isnan(found[-1])
    else:
        # None case: the whole array can be compared directly
        assert_array_equal(found, expected_uniques)

    if pickle_uniques:
        # encode against a serialized/deserialized copy of the uniques
        found = pickle.loads(pickle.dumps(found))

    # the missing value is encoded as index 3, the last entry of the uniques
    expected_codes = np.array([0, 2, 2, 3, 1])
    assert_array_equal(_encode(values, uniques=found), expected_codes)


def test_unique_util_missing_values_numeric():
    # _unique and _encode on a float array that contains NaN twice
    values = np.array([3, 1, np.nan, 5, 3, np.nan], dtype=float)
    expected_uniques = np.array([1, 3, 5, np.nan], dtype=float)
    expected_inverse = np.array([1, 0, 3, 2, 1, 3])

    # uniques only: a single NaN is expected, as the last entry
    assert_array_equal(_unique(values), expected_uniques)

    # uniques plus inverse indices
    found, inverse = _unique(values, return_inverse=True)
    assert_array_equal(found, expected_uniques)
    assert_array_equal(inverse, expected_inverse)

    # encoding against those uniques must reproduce the inverse indices
    codes = _encode(values, uniques=found)
    assert_array_equal(codes, expected_inverse)


def test_unique_util_with_all_missing_values():
    # object dtype holding every kind of missing value at once:
    # np.nan, float('nan') and None
    values = np.array(
        [np.nan, 'a', 'c', 'c', None, float('nan'), None],
        dtype=object,
    )

    found = _unique(values)
    # strings first, then None ...
    expected_head = ['a', 'c', None]
    assert_array_equal(found[:-1], expected_head)
    # ... and a single NaN as the final entry
    assert np.isnan(found[-1])

    # both NaN objects map to index 3, both None entries to index 2
    expected_inverse = [3, 0, 1, 1, 2, 3, 2]
    _, inverse = _unique(values, return_inverse=True)
    assert_array_equal(inverse, expected_inverse)


def test_check_unknown_with_both_missing_values():
    # object dtype containing both None and NaN, neither of which is known
    values = np.array(
        [np.nan, 'a', 'c', 'c', None, np.nan, None],
        dtype=object,
    )
    known = np.array(['a', 'c'], dtype=object)

    # diff only: None is reported first, NaN second
    unknown = _check_unknown(values, known_values=known)
    assert unknown[0] is None
    assert np.isnan(unknown[1])

    # diff plus mask: same diff, and only the 'a'/'c' positions are valid
    unknown, valid_mask = _check_unknown(
        values, known_values=known, return_mask=True)
    assert unknown[0] is None
    assert np.isnan(unknown[1])

    expected_mask = [False, True, True, True, False, False, False]
    assert_array_equal(valid_mask, expected_mask)