Module idict.function.data

Functions to be used directly within an idict workflow

Expand source code
#  Copyright (c) 2021. Davi Pereira dos Santos
#  This file is part of the i-dict project.
#  Please respect the license - more about this in the section (*) below.
#
#  i-dict is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  i-dict is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with i-dict.  If not, see <http://www.gnu.org/licenses/>.
#
#  (*) Removing authorship by any means, e.g. by distribution of derived
#  works or verbatim, obfuscated, compiled or rewritten versions of any
#  part of this work is illegal and it is unethical regarding the effort and
#  time spent here.
#

"""
Functions to be used directly within an idict workflow
"""

# TODO: break down all sklearn and numpy used inside binarize,
#  so e.g. the fit-wrapper can be used for OHE; and binarize can be a composition.

from idict.macro import isnumber


def nomcols(input="X", output="nomcols", **kwargs):
    """
    >>> import numpy as np
    >>> X = np.array([[0, "a", 1.6], [3.2, "b", 2], [8, "c", 3]])
    >>> nomcols(X=X)
    {'nomcols': [1], '_history': Ellipsis}
    """
    X = kwargs[input]
    idxs = []
    for i, x in enumerate(X.iloc[0] if hasattr(X, "iloc") else X[0]):
        if not isnumber(x):
            idxs.append(i)
    return {output: idxs, "_history": ...}


def binarize(input="X", idxsin="nomcols", output="Xbin", **kwargs):
    """
    >>> import numpy as np
    >>> from pandas import DataFrame as DF
    >>> X = DF(np.array([[0, "a", 1.6], [3.2, "b", 2], [8, "c", 3]]))
    >>> X
         0  1    2
    0    0  a  1.6
    1  3.2  b    2
    2    8  c    3
    >>> binarize(X=X, nomcols=[1])["Xbin"]
         0    2  1_a  1_b  1_c
    0    0  1.6    1    0    0
    1  3.2    2    0    1    0
    2    8    3    0    0    1
    """
    X = kwargs[input]
    cols = kwargs[idxsin]
    if X.__class__.__name__ in ["DataFrame", "Series"]:
        import pandas

        clabels = X.columns[cols]
        Xout = pandas.get_dummies(X, prefix=clabels, columns=clabels)
    else:
        import numpy
        from sklearn.preprocessing import OneHotEncoder

        encoder = OneHotEncoder()
        nom = encoder.fit_transform(X.iloc[:, cols] if hasattr(X, "iloc") else X[:, cols]).toarray()
        num = numpy.delete(X, cols, axis=1).astype(float)
        Xout = numpy.column_stack((nom, num))
    return {output: Xout, "_history": ...}


def df2list(input="df", output="list", **kwargs):
    """
    >>> from idict import idict
    >>> d = idict.fromtoy(output_format="df")
    >>> d >>= df2list
    >>> d.list
    [['attr1', 'attr2', 'class'], [5.1, 6.4, 0.0], [1.1, 2.5, 1.0], [6.1, 3.6, 0.0], [1.1, 3.5, 1.0], [3.1, 2.5, 0.0], [4.7, 4.9, 1.0], [9.1, 3.5, 0.0], [8.3, 2.9, 1.0], [9.1, 7.2, 0.0], [2.5, 4.5, 1.0], [7.1, 6.6, 0.0], [0.1, 4.3, 1.0], [2.1, 0.1, 0.0], [0.1, 4.0, 1.0], [5.1, 4.5, 0.0], [31.1, 4.7, 1.0], [1.1, 3.2, 0.0], [2.2, 8.5, 1.0], [3.1, 2.5, 0.0], [1.1, 8.5, 1.0]]
    """
    M = kwargs[input]
    lst = [list(M.columns)] + M.to_numpy().tolist()
    return {output: lst, "_history": ...}


nomcols.metadata = {
    "id": "idict----------------------------nomcols",
    "name": "nomcols",
    "description": "List column indices of nominal attributes.",
    "parameters": ...,
    "code": ...,
}
binarize.metadata = {
    "id": "sk-1.0.1--pd-1.3.4--np-1.21.4---binarize",
    "name": "binarize",
    "description": "Binarize nominal attributes so they can be handled as numeric.",
    "parameters": ...,
    "code": ...,
}
df2list.metadata = {
    "id": "idict---pandas-1.3.4--np-1.21.4--df2list",
    "name": "df2list",
    "description": "Convert DataFrame to nested lists.",
    "parameters": ...,
    "code": ...,
}

Functions

def binarize(input='X', idxsin='nomcols', output='Xbin', **kwargs)
>>> import numpy as np
>>> from pandas import DataFrame as DF
>>> X = DF(np.array([[0, "a", 1.6], [3.2, "b", 2], [8, "c", 3]]))
>>> X
     0  1    2
0    0  a  1.6
1  3.2  b    2
2    8  c    3
>>> binarize(X=X, nomcols=[1])["Xbin"]
     0    2  1_a  1_b  1_c
0    0  1.6    1    0    0
1  3.2    2    0    1    0
2    8    3    0    0    1
Expand source code
def binarize(input="X", idxsin="nomcols", output="Xbin", **kwargs):
    """
    >>> import numpy as np
    >>> from pandas import DataFrame as DF
    >>> X = DF(np.array([[0, "a", 1.6], [3.2, "b", 2], [8, "c", 3]]))
    >>> X
         0  1    2
    0    0  a  1.6
    1  3.2  b    2
    2    8  c    3
    >>> binarize(X=X, nomcols=[1])["Xbin"]
         0    2  1_a  1_b  1_c
    0    0  1.6    1    0    0
    1  3.2    2    0    1    0
    2    8    3    0    0    1
    """
    X = kwargs[input]
    cols = kwargs[idxsin]
    if X.__class__.__name__ in ["DataFrame", "Series"]:
        import pandas

        clabels = X.columns[cols]
        Xout = pandas.get_dummies(X, prefix=clabels, columns=clabels)
    else:
        import numpy
        from sklearn.preprocessing import OneHotEncoder

        encoder = OneHotEncoder()
        nom = encoder.fit_transform(X.iloc[:, cols] if hasattr(X, "iloc") else X[:, cols]).toarray()
        num = numpy.delete(X, cols, axis=1).astype(float)
        Xout = numpy.column_stack((nom, num))
    return {output: Xout, "_history": ...}
def df2list(input='df', output='list', **kwargs)
>>> from idict import idict
>>> d = idict.fromtoy(output_format="df")
>>> d >>= df2list
>>> d.list
[['attr1', 'attr2', 'class'], [5.1, 6.4, 0.0], [1.1, 2.5, 1.0], [6.1, 3.6, 0.0], [1.1, 3.5, 1.0], [3.1, 2.5, 0.0], [4.7, 4.9, 1.0], [9.1, 3.5, 0.0], [8.3, 2.9, 1.0], [9.1, 7.2, 0.0], [2.5, 4.5, 1.0], [7.1, 6.6, 0.0], [0.1, 4.3, 1.0], [2.1, 0.1, 0.0], [0.1, 4.0, 1.0], [5.1, 4.5, 0.0], [31.1, 4.7, 1.0], [1.1, 3.2, 0.0], [2.2, 8.5, 1.0], [3.1, 2.5, 0.0], [1.1, 8.5, 1.0]]
Expand source code
def df2list(input="df", output="list", **kwargs):
    """
    >>> from idict import idict
    >>> d = idict.fromtoy(output_format="df")
    >>> d >>= df2list
    >>> d.list
    [['attr1', 'attr2', 'class'], [5.1, 6.4, 0.0], [1.1, 2.5, 1.0], [6.1, 3.6, 0.0], [1.1, 3.5, 1.0], [3.1, 2.5, 0.0], [4.7, 4.9, 1.0], [9.1, 3.5, 0.0], [8.3, 2.9, 1.0], [9.1, 7.2, 0.0], [2.5, 4.5, 1.0], [7.1, 6.6, 0.0], [0.1, 4.3, 1.0], [2.1, 0.1, 0.0], [0.1, 4.0, 1.0], [5.1, 4.5, 0.0], [31.1, 4.7, 1.0], [1.1, 3.2, 0.0], [2.2, 8.5, 1.0], [3.1, 2.5, 0.0], [1.1, 8.5, 1.0]]
    """
    M = kwargs[input]
    lst = [list(M.columns)] + M.to_numpy().tolist()
    return {output: lst, "_history": ...}
def nomcols(input='X', output='nomcols', **kwargs)
>>> import numpy as np
>>> X = np.array([[0, "a", 1.6], [3.2, "b", 2], [8, "c", 3]])
>>> nomcols(X=X)
{'nomcols': [1], '_history': Ellipsis}
Expand source code
def nomcols(input="X", output="nomcols", **kwargs):
    """
    >>> import numpy as np
    >>> X = np.array([[0, "a", 1.6], [3.2, "b", 2], [8, "c", 3]])
    >>> nomcols(X=X)
    {'nomcols': [1], '_history': Ellipsis}
    """
    X = kwargs[input]
    idxs = []
    for i, x in enumerate(X.iloc[0] if hasattr(X, "iloc") else X[0]):
        if not isnumber(x):
            idxs.append(i)
    return {output: idxs, "_history": ...}