Module hdict.dataset.dataset
# Copyright (c) 2021. Davi Pereira dos Santos
# This file is part of the hdict project.
# Please respect the license - more about this in the section (*) below.
#
# hdict is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# hdict is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with hdict. If not, see <http://www.gnu.org/licenses/>.
#
# (*) Removing authorship by any means, e.g. by distribution of derived
# works or verbatim, obfuscated, compiled or rewritten versions of any
# part of this work is illegal and it is unethical regarding the effort and
# time spent here.
#
# The arff2pandas library simply disappeared, so its ARFF<->pandas helpers are reproduced below.
import re
def isplit(source, sep=None, regex=False): # pragma: no cover
"""
https://stackoverflow.com/a/9773142/9681577
generator version of str.split()
:param source:
source string (unicode or bytes)
:param sep:
separator to split on.
:param regex:
if True, will treat sep as regular expression.
:returns:
generator yielding elements of string.
"""
if sep is None: # pragma: no cover
# mimic default python behavior
source = source.strip()
sep = "\\s+"
if isinstance(source, bytes):
sep = sep.encode("ascii")
regex = True
if regex:
# version using re.finditer()
if not hasattr(sep, "finditer"):
sep = re.compile(sep)
start = 0
for m in sep.finditer(source):
idx = m.start()
assert idx >= start
yield source[start:idx]
start = m.end()
yield source[start:]
else:
# version using str.find(), less overhead than re.finditer()
sepsize = len(sep)
start = 0
while True:
idx = source.find(sep, start)
if idx == -1:
yield source[start:]
return
yield source[start:idx]
start = idx + sepsize
def liac2pandas(arff):
import pandas as pd
attrs = arff["attributes"]
attrs_t = []
for attr in attrs:
if isinstance(attr[1], list):
attrs_t.append("%s@{%s}" % (attr[0], ",".join(attr[1])))
else:
attrs_t.append("%s@%s" % (attr[0], attr[1]))
df = pd.DataFrame(data=arff["data"], columns=attrs_t)
return df
def load(fp):
import arff as liacarff
data = liacarff.load(fp)
return liac2pandas(data)
def loads(s):
import arff as liacarff
data = liacarff.loads(s)
return liac2pandas(data)
def df2liac(df, relation="data", description=""): # pragma: no cover
attrs = []
for col in df.columns:
attr = col.split("@")
if attr[1].count("{") > 0 and attr[1].count("}") > 0:
vals = attr[1].replace("{", "").replace("}", "").split(",")
attrs.append((attr[0], vals))
else:
attrs.append((attr[0], attr[1]))
data = list(df.values)
result = {"attributes": attrs, "data": data, "description": description, "relation": relation}
return result
# def dump(df, fp):  # todo: save arff/CSV from hdict
# import arff as liacarff
#
# arff = df2liac(df)
# liacarff.dump(arff, fp)
#
#
# def dumps(df):  # todo: output arff/CSV from hdict
# import arff as liacarff
#
# arff = df2liac(df)
# return liacarff.dumps(arff)
def df2Xy(df, target=None):
r"""
>>> from hdict import hdict
>>> from hdict.dataset.dataset import df2Xy
>>> from testfixtures import TempDirectory
>>> arff = "@RELATION mini\n@ATTRIBUTE attr1 REAL\n@ATTRIBUTE attr2 REAL\n@ATTRIBUTE class {0,1}\n@DATA\n5.1,3.5,0\n3.1,4.5,1"
>>> with TempDirectory() as tmp: # doctest:+ELLIPSIS
... tmp.write("mini.arff", arff.encode())
... d = hdict.fromfile(tmp.path + "/mini.arff")
'/tmp/.../mini.arff'
>>> df2Xy(d.df)
{'X': attr1 attr2
0 5.1 3.5
1 3.1 4.5, 'y': array([0, 1])}
>>> df2Xy(d.df, target="attr2")
{'X': attr1 class
0 5.1 0
1 3.1 1, 'y': 0 3.5
1 4.5
Name: attr2, dtype: float64}
"""
if target is None:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X = df.drop(df.columns[[-1]], axis=1)
y = le.fit_transform(df[df.columns[-1]])
else:
y = df[target]
X = df.drop(target, axis=1)
return {"X": X, "y": y}
def nom2bin(X, nomcols):
"""
>>> import numpy as np
>>> from pandas import DataFrame as DF
>>> X = DF(np.array([[0, "a", 1.6], [3.2, "b", 2], [8, "c", 3]]))
>>> X
0 1 2
0 0 a 1.6
1 3.2 b 2
2 8 c 3
>>> nom2bin(X, nomcols=[1])
0 2 1_a 1_b 1_c
0 0 1.6 1 0 0
1 3.2 2 0 1 0
2 8 3 0 0 1
"""
if X.__class__.__name__ in ["DataFrame", "Series"]:
import pandas
clabels = X.columns[nomcols]
return pandas.get_dummies(X, prefix=clabels, columns=clabels, dtype=int)
else:
import numpy
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
nom = encoder.fit_transform(X.iloc[:, nomcols] if hasattr(X, "iloc") else X[:, nomcols]).toarray()
num = numpy.delete(X, nomcols, axis=1).astype(float)
return numpy.column_stack((nom, num))
#
#
# def df2arff(input="df", output="arff", **kwargs):
# """
# >>> from idict import let, idict
# >>> d = idict.fromminiarff()
# >>> d >>= let(df2arff, output="a")
# >>> d.show(colored=False)
# {
# "a": "→(input output df)",
# "_history": "idict---------arff2pandas-1.0.1--df2arff",
# "df": "«{'attr1@REAL': {0: 5.1, 1: 3.1}, 'attr2@REAL': {0: 3.5, 1: 4.5}, 'class@{0,1}': {0: '0', 1: '1'}}»",
# "_id": "Ojq9k7ZSbVjwLGlZ7uuqIyhoJPo.p36mAmav2Wul",
# "_ids": {
# "a": "wVPgESiPwobfFLLm591BKw6i2Zn.p36mAmav2Wul",
# "_history": "D2NpAYrhyJW-.nOhh91ttSjrbGw.nZ8qxps9giws",
# "df": "q3_b71eb05c4be05eba7b6ae5a9245d5dd70b81b (content: 6X_dc8ccea3b2e46f1c78967fae98b692701dc99)"
# }
# }
# >>> d.a
# '@RELATION data\\n\\n@ATTRIBUTE attr1 REAL\\n@ATTRIBUTE attr2 REAL\\n@ATTRIBUTE class {0, 1}\\n\\n@DATA\\n5.1,3.5,0\\n3.1,4.5,1\\n'
# """
# return {output: dumps(kwargs[input]), "_history": ...}
#
#
# def openml(Xout="X", yout="y", name="iris", version=1):
# """
# #>>> from idict import Ø
# #>>> (Ø >> openml).show(colored=False)
# #{
# #"X": "→(Xout yout name version)",
# #"y": "→(Xout yout name version)",
# #"_history": "idict--------------sklearn-1.0.1--openml",
# #"_id": "idict--------------sklearn-1.0.1--openml",
# #"_ids": {
# #"X": "KkSoAvgPmGq52PvPBqGFCEp9cSUCfb7eht9VpUdr",
# #"y": "HOE9E-HnFYen6JNRC9cHgAamJlOWGYTPIVKvQu8W",
# #"_history": "Efw0ebrPTxTiuJ.ZpVt-MvB62ja.kmrOYzrn-ljf"
# #}
# #}
# #>>> (Ø >> openml).X.head()
# #sepallength sepalwidth petallength petalwidth
# #0 5.1 3.5 1.4 0.2
# #1 4.9 3.0 1.4 0.2
# #2 4.7 3.2 1.3 0.2
# #3 4.6 3.1 1.5 0.2
# #4 5.0 3.6 1.4 0.2
# #>>> (Ø >> openml).y.head()
# #0 Iris-setosa
# #1 Iris-setosa
# #2 Iris-setosa
# #3 Iris-setosa
# #4 Iris-setosa
# #Name: class, dtype: category
# #Categories (3, object): ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
# """
# from sklearn.datasets import fetch_openml
#
# X, y = fetch_openml(name=name, version=version, as_frame=True, return_X_y=True)
# return {Xout: X, yout: y, "_history": ...}
#
#
Functions
def df2Xy(df, target=None)
-
>>> from hdict import hdict
>>> from hdict.dataset.dataset import df2Xy
>>> from testfixtures import TempDirectory
>>> arff = "@RELATION mini\n@ATTRIBUTE attr1 REAL\n@ATTRIBUTE attr2 REAL\n@ATTRIBUTE class {0,1}\n@DATA\n5.1,3.5,0\n3.1,4.5,1"
>>> with TempDirectory() as tmp: # doctest:+ELLIPSIS
... tmp.write("mini.arff", arff.encode())
... d = hdict.fromfile(tmp.path + "/mini.arff")
'/tmp/.../mini.arff'
>>> df2Xy(d.df)
{'X': attr1 attr2
0 5.1 3.5
1 3.1 4.5, 'y': array([0, 1])}
>>> df2Xy(d.df, target="attr2")
{'X': attr1 class
0 5.1 0
1 3.1 1, 'y': 0 3.5
1 4.5
Name: attr2, dtype: float64}
def df2liac(df, relation='data', description='')
-
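df2liac is the inverse of liac2pandas: it parses the name@type column convention back into the dictionary layout that the liac-arff package expects. A minimal sketch (the toy DataFrame is illustrative, not taken from the module):

import pandas as pd
df = pd.DataFrame({"attr1@REAL": [5.1, 3.1], "class@{0,1}": ["0", "1"]})
d = df2liac(df, relation="mini")
# d["attributes"] == [('attr1', 'REAL'), ('class', ['0', '1'])]
# d["data"] holds the rows, ready for liac-arff's dump/dumps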
def isplit(source, sep=None, regex=False)
-
https://stackoverflow.com/a/9773142/9681577
generator version of str.split()
:param source: source string (unicode or bytes)
:param sep: separator to split on.
:param regex: if True, will treat sep as regular expression.
:returns: generator yielding elements of string.
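A brief usage sketch (the values each generator is expected to yield are shown as comments):

list(isplit("a,b,,c", ","))                 # ['a', 'b', '', 'c']
list(isplit("a1b22c", r"\d+", regex=True))  # ['a', 'b', 'c']
list(isplit("  a  b "))                     # ['a', 'b'] (sep=None mimics str.split())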
def liac2pandas(arff)
-
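liac2pandas flattens each ARFF attribute into a single column label following the name@type convention; nominal attributes become name@{v1,v2,...}. A minimal sketch with a hand-built liac-arff dictionary:

arff = {
    "attributes": [("attr1", "REAL"), ("class", ["0", "1"])],
    "data": [[5.1, "0"], [3.1, "1"]],
}
df = liac2pandas(arff)
# list(df.columns) == ['attr1@REAL', 'class@{0,1}']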
def load(fp)
-
def loads(s)
-
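load and loads are thin wrappers that feed the liac-arff parser (the arff package) into liac2pandas. A usage sketch, assuming liac-arff is installed:

text = "@RELATION mini\n@ATTRIBUTE attr1 REAL\n@ATTRIBUTE class {0,1}\n@DATA\n5.1,0\n3.1,1"
df = loads(text)
# df.columns: ['attr1@REAL', 'class@{0,1}']; load(fp) accepts an open file handle instead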
def nom2bin(X, nomcols)
-
>>> import numpy as np
>>> from pandas import DataFrame as DF
>>> X = DF(np.array([[0, "a", 1.6], [3.2, "b", 2], [8, "c", 3]]))
>>> X
0 1 2
0 0 a 1.6
1 3.2 b 2
2 8 c 3
>>> nom2bin(X, nomcols=[1])
0 2 1_a 1_b 1_c
0 0 1.6 1 0 0
1 3.2 2 0 1 0
2 8 3 0 0 1
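The doctest above exercises the pandas branch; a plain numpy array takes the scikit-learn OneHotEncoder fallback, which stacks the encoded nominal columns in front of the numeric ones. A sketch of that branch (the result shown as a comment is approximate; exact float formatting depends on numpy's print options):

import numpy as np
X = np.array([[0, "a", 1.6], [3.2, "b", 2]], dtype=object)
nom2bin(X, nomcols=[1])
# array([[1. , 0. , 0. , 1.6],
#        [0. , 1. , 3.2, 2. ]])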