Module hdict.dataset.pandas_handling
Expand source code
# Copyright (c) 2023. Davi Pereira dos Santos
# This file is part of the hdict project.
# Please respect the license - more about this in the section (*) below.
#
# hdict is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# hdict is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with hdict. If not, see <http://www.gnu.org/licenses/>.
#
# (*) Removing authorship by any means, e.g. by distribution of derived
# works or verbatim, obfuscated, compiled or rewritten versions of any
# part of this work is illegal and it is unethical regarding the effort and
# time spent here.
#
def explode_df(df):
    """
    Split a DataFrame into a frozenhdict holding one field per column.

    The resulting mapping starts with an ``"index"`` entry containing the
    frame's index as a Series (``df.index.to_series()``), followed by one entry
    per column keyed by ``str(column_label)``. Entries are written in that
    order, so a column whose label stringifies to ``"index"`` replaces the
    index entry.

    >>> from pandas import DataFrame
    >>> from hdict import hdict, cache
    >>> df = DataFrame({"x": [1,2,3], "y": [5,6,7]}, index=["a", "b", "c"])
    >>> d = hdict(df_=df)
    >>> d.show(colored=False)
    {
        df_: "‹{'x': {'a': 1, 'b': 2, 'c': 3}, 'y': {'a': 5, 'b': 6, 'c': 7}}›",
        df: {
            index: "‹{'a': 'a', 'b': 'b', 'c': 'c'}›",
            x: "‹{'a': 1, 'b': 2, 'c': 3}›",
            y: "‹{'a': 5, 'b': 6, 'c': 7}›",
            _id: efFl2-CCjnStQSpp8QmwkK-7ANKh8bNctOap9X9m,
            _ids: {
                index: HdOKL6NHC9ApFmGygZ54A9f265R6TxhGjBRwdO9r,
                x: n7fkFYWuvJ.MTp7asUu8489mQMwoZiL.5.or1EDt,
                y: N9Of5idOlZFH25hFb9IcjOwJrxt2.tXddWKuYq94
            }
        },
        _id: symsSXy-oMJrIpAisV7aHbn.fZ9yfjfrExxIq6HS,
        _ids: {
            df: efFl2-CCjnStQSpp8QmwkK-7ANKh8bNctOap9X9m,
            df_: e.VIUfoRxV4B6aoIZeJ8AlDwmq7IZNPDNPFI72WZ
        }
    }
    >>> d.df_
       x  y
    a  1  5
    b  2  6
    c  3  7
    >>> c = {}
    >>> d >>= cache(c)
    >>> d.show(colored=False)
    {
        df_: "‹{'x': {'a': 1, 'b': 2, 'c': 3}, 'y': {'a': 5, 'b': 6, 'c': 7}}›",
        df: {
            index: "‹{'a': 'a', 'b': 'b', 'c': 'c'}›",
            x: "‹{'a': 1, 'b': 2, 'c': 3}›",
            y: "‹{'a': 5, 'b': 6, 'c': 7}›",
            _id: efFl2-CCjnStQSpp8QmwkK-7ANKh8bNctOap9X9m,
            _ids: {
                index: HdOKL6NHC9ApFmGygZ54A9f265R6TxhGjBRwdO9r,
                x: n7fkFYWuvJ.MTp7asUu8489mQMwoZiL.5.or1EDt,
                y: N9Of5idOlZFH25hFb9IcjOwJrxt2.tXddWKuYq94
            }
        },
        _id: symsSXy-oMJrIpAisV7aHbn.fZ9yfjfrExxIq6HS,
        _ids: {
            df: efFl2-CCjnStQSpp8QmwkK-7ANKh8bNctOap9X9m,
            df_: e.VIUfoRxV4B6aoIZeJ8AlDwmq7IZNPDNPFI72WZ
        }
    }
    >>> d.df.show(colored=False)
    {
        index: "‹{'a': 'a', 'b': 'b', 'c': 'c'}›",
        x: "‹{'a': 1, 'b': 2, 'c': 3}›",
        y: "‹{'a': 5, 'b': 6, 'c': 7}›",
        _id: efFl2-CCjnStQSpp8QmwkK-7ANKh8bNctOap9X9m,
        _ids: {
            index: HdOKL6NHC9ApFmGygZ54A9f265R6TxhGjBRwdO9r,
            x: n7fkFYWuvJ.MTp7asUu8489mQMwoZiL.5.or1EDt,
            y: N9Of5idOlZFH25hFb9IcjOwJrxt2.tXddWKuYq94
        }
    }
    """
    from hdict.data.frozenhdict import frozenhdict

    # "index" is listed first so it keeps the first position; the unpacked
    # column entries follow in the frame's own column order.
    fields = {
        "index": df.index.to_series(),
        **{str(label): df[label] for label in df},
    }
    return frozenhdict(fields)
def _index_and_transpose(df, index, transpose):
    """Apply the optional first-column indexing and transposition shared by all formats."""
    if not (index or transpose):
        return df
    indexname = df.columns[0]
    # set_index already names the new index after the column it consumed.
    df = df.set_index(indexname)
    if transpose:
        df = df.T
    if index:
        # After a transpose the row index is the former column axis, so the
        # name has to be (re)applied here.
        df.index.name = indexname
    return df


def file2df(filename, hide_types=True, return_name=True, transpose=False, index=False):
    """
    Load an ``.arff`` or ``.csv`` file into a pandas DataFrame.

    Parameters
    ----------
    filename
        Path to the file; the format is chosen from its suffix (``.arff`` or ``.csv``).
    hide_types
        ARFF only: rename every column label to the part before its first ``"@"``
        (presumably a type suffix added by the ARFF loader - confirm against ``load``).
        The rename is applied after the optional transposition.
    return_name
        When true, return a ``(df, name)`` tuple instead of just ``df``. For ARFF
        the name is the text found after ``@RELATION`` (falling back to ``filename``
        when it is missing or empty); for CSV it is always ``filename``.
    transpose
        Use the first column as index and then transpose the frame.
    index
        Use the first column as index and name the resulting row index after it.

    Returns
    -------
    ``df`` or ``(df, name)`` depending on ``return_name``.

    Raises
    ------
    ValueError
        If ``filename`` ends with neither ``.arff`` nor ``.csv``.
    """
    if filename.endswith(".arff"):
        # Imported here so that CSV loading does not depend on the ARFF loader.
        from hdict.dataset.dataset import load

        with open(filename) as f:
            # NOTE(review): line[9:-1] keeps any whitespace between the keyword
            # and the name, and drops the last character (normally the newline).
            relation = next(
                (line[9:-1] for line in f if line[:9].upper() == "@RELATION"),
                None,
            )
            f.seek(0)  # rewind so the loader sees the whole file
            df = load(f)
        df = _index_and_transpose(df, index, transpose)
        if hide_types:
            df = df.rename(columns={k: k.split("@")[0] for k in df.columns})
        name = relation or filename
    elif filename.endswith(".csv"):
        from pandas import read_csv

        df = _index_and_transpose(read_csv(filename), index, transpose)
        name = filename
    else:  # pragma: no cover
        raise ValueError(f"Unknown extension {filename.split('.')[-1]}.")
    return (df, name) if return_name else df
Functions
def explode_df(df)
-
>>> from pandas import DataFrame >>> from hdict import hdict, cache >>> df = DataFrame({"x": [1,2,3], "y": [5,6,7]}, index=["a", "b", "c"]) >>> d = hdict(df_=df) >>> d.show(colored=False) { df_: "‹{'x': {'a': 1, 'b': 2, 'c': 3}, 'y': {'a': 5, 'b': 6, 'c': 7}}›", df: { index: "‹{'a': 'a', 'b': 'b', 'c': 'c'}›", x: "‹{'a': 1, 'b': 2, 'c': 3}›", y: "‹{'a': 5, 'b': 6, 'c': 7}›", _id: efFl2-CCjnStQSpp8QmwkK-7ANKh8bNctOap9X9m, _ids: { index: HdOKL6NHC9ApFmGygZ54A9f265R6TxhGjBRwdO9r, x: n7fkFYWuvJ.MTp7asUu8489mQMwoZiL.5.or1EDt, y: N9Of5idOlZFH25hFb9IcjOwJrxt2.tXddWKuYq94 } }, _id: symsSXy-oMJrIpAisV7aHbn.fZ9yfjfrExxIq6HS, _ids: { df: efFl2-CCjnStQSpp8QmwkK-7ANKh8bNctOap9X9m, df_: e.VIUfoRxV4B6aoIZeJ8AlDwmq7IZNPDNPFI72WZ } } >>> d.df_ x y a 1 5 b 2 6 c 3 7 >>> c = {} >>> d >>= cache(c) >>> d.show(colored=False) { df_: "‹{'x': {'a': 1, 'b': 2, 'c': 3}, 'y': {'a': 5, 'b': 6, 'c': 7}}›", df: { index: "‹{'a': 'a', 'b': 'b', 'c': 'c'}›", x: "‹{'a': 1, 'b': 2, 'c': 3}›", y: "‹{'a': 5, 'b': 6, 'c': 7}›", _id: efFl2-CCjnStQSpp8QmwkK-7ANKh8bNctOap9X9m, _ids: { index: HdOKL6NHC9ApFmGygZ54A9f265R6TxhGjBRwdO9r, x: n7fkFYWuvJ.MTp7asUu8489mQMwoZiL.5.or1EDt, y: N9Of5idOlZFH25hFb9IcjOwJrxt2.tXddWKuYq94 } }, _id: symsSXy-oMJrIpAisV7aHbn.fZ9yfjfrExxIq6HS, _ids: { df: efFl2-CCjnStQSpp8QmwkK-7ANKh8bNctOap9X9m, df_: e.VIUfoRxV4B6aoIZeJ8AlDwmq7IZNPDNPFI72WZ } } >>> d.df.show(colored=False) { index: "‹{'a': 'a', 'b': 'b', 'c': 'c'}›", x: "‹{'a': 1, 'b': 2, 'c': 3}›", y: "‹{'a': 5, 'b': 6, 'c': 7}›", _id: efFl2-CCjnStQSpp8QmwkK-7ANKh8bNctOap9X9m, _ids: { index: HdOKL6NHC9ApFmGygZ54A9f265R6TxhGjBRwdO9r, x: n7fkFYWuvJ.MTp7asUu8489mQMwoZiL.5.or1EDt, y: N9Of5idOlZFH25hFb9IcjOwJrxt2.tXddWKuYq94 } }
Expand source code
def explode_df(df):
    """
    Split a DataFrame into a frozenhdict holding one field per column.

    The resulting mapping starts with an ``"index"`` entry containing the
    frame's index as a Series (``df.index.to_series()``), followed by one entry
    per column keyed by ``str(column_label)``. Entries are written in that
    order, so a column whose label stringifies to ``"index"`` replaces the
    index entry.

    >>> from pandas import DataFrame
    >>> from hdict import hdict, cache
    >>> df = DataFrame({"x": [1,2,3], "y": [5,6,7]}, index=["a", "b", "c"])
    >>> d = hdict(df_=df)
    >>> d.show(colored=False)
    {
        df_: "‹{'x': {'a': 1, 'b': 2, 'c': 3}, 'y': {'a': 5, 'b': 6, 'c': 7}}›",
        df: {
            index: "‹{'a': 'a', 'b': 'b', 'c': 'c'}›",
            x: "‹{'a': 1, 'b': 2, 'c': 3}›",
            y: "‹{'a': 5, 'b': 6, 'c': 7}›",
            _id: efFl2-CCjnStQSpp8QmwkK-7ANKh8bNctOap9X9m,
            _ids: {
                index: HdOKL6NHC9ApFmGygZ54A9f265R6TxhGjBRwdO9r,
                x: n7fkFYWuvJ.MTp7asUu8489mQMwoZiL.5.or1EDt,
                y: N9Of5idOlZFH25hFb9IcjOwJrxt2.tXddWKuYq94
            }
        },
        _id: symsSXy-oMJrIpAisV7aHbn.fZ9yfjfrExxIq6HS,
        _ids: {
            df: efFl2-CCjnStQSpp8QmwkK-7ANKh8bNctOap9X9m,
            df_: e.VIUfoRxV4B6aoIZeJ8AlDwmq7IZNPDNPFI72WZ
        }
    }
    >>> d.df_
       x  y
    a  1  5
    b  2  6
    c  3  7
    >>> c = {}
    >>> d >>= cache(c)
    >>> d.show(colored=False)
    {
        df_: "‹{'x': {'a': 1, 'b': 2, 'c': 3}, 'y': {'a': 5, 'b': 6, 'c': 7}}›",
        df: {
            index: "‹{'a': 'a', 'b': 'b', 'c': 'c'}›",
            x: "‹{'a': 1, 'b': 2, 'c': 3}›",
            y: "‹{'a': 5, 'b': 6, 'c': 7}›",
            _id: efFl2-CCjnStQSpp8QmwkK-7ANKh8bNctOap9X9m,
            _ids: {
                index: HdOKL6NHC9ApFmGygZ54A9f265R6TxhGjBRwdO9r,
                x: n7fkFYWuvJ.MTp7asUu8489mQMwoZiL.5.or1EDt,
                y: N9Of5idOlZFH25hFb9IcjOwJrxt2.tXddWKuYq94
            }
        },
        _id: symsSXy-oMJrIpAisV7aHbn.fZ9yfjfrExxIq6HS,
        _ids: {
            df: efFl2-CCjnStQSpp8QmwkK-7ANKh8bNctOap9X9m,
            df_: e.VIUfoRxV4B6aoIZeJ8AlDwmq7IZNPDNPFI72WZ
        }
    }
    >>> d.df.show(colored=False)
    {
        index: "‹{'a': 'a', 'b': 'b', 'c': 'c'}›",
        x: "‹{'a': 1, 'b': 2, 'c': 3}›",
        y: "‹{'a': 5, 'b': 6, 'c': 7}›",
        _id: efFl2-CCjnStQSpp8QmwkK-7ANKh8bNctOap9X9m,
        _ids: {
            index: HdOKL6NHC9ApFmGygZ54A9f265R6TxhGjBRwdO9r,
            x: n7fkFYWuvJ.MTp7asUu8489mQMwoZiL.5.or1EDt,
            y: N9Of5idOlZFH25hFb9IcjOwJrxt2.tXddWKuYq94
        }
    }
    """
    from hdict.data.frozenhdict import frozenhdict

    # "index" is listed first so it keeps the first position; the unpacked
    # column entries follow in the frame's own column order.
    fields = {
        "index": df.index.to_series(),
        **{str(label): df[label] for label in df},
    }
    return frozenhdict(fields)
def file2df(filename, hide_types=True, return_name=True, transpose=False, index=False)
-
Expand source code
def _index_and_transpose(df, index, transpose):
    """Apply the optional first-column indexing and transposition shared by all formats."""
    if not (index or transpose):
        return df
    indexname = df.columns[0]
    # set_index already names the new index after the column it consumed.
    df = df.set_index(indexname)
    if transpose:
        df = df.T
    if index:
        # After a transpose the row index is the former column axis, so the
        # name has to be (re)applied here.
        df.index.name = indexname
    return df


def file2df(filename, hide_types=True, return_name=True, transpose=False, index=False):
    """
    Load an ``.arff`` or ``.csv`` file into a pandas DataFrame.

    Parameters
    ----------
    filename
        Path to the file; the format is chosen from its suffix (``.arff`` or ``.csv``).
    hide_types
        ARFF only: rename every column label to the part before its first ``"@"``
        (presumably a type suffix added by the ARFF loader - confirm against ``load``).
        The rename is applied after the optional transposition.
    return_name
        When true, return a ``(df, name)`` tuple instead of just ``df``. For ARFF
        the name is the text found after ``@RELATION`` (falling back to ``filename``
        when it is missing or empty); for CSV it is always ``filename``.
    transpose
        Use the first column as index and then transpose the frame.
    index
        Use the first column as index and name the resulting row index after it.

    Returns
    -------
    ``df`` or ``(df, name)`` depending on ``return_name``.

    Raises
    ------
    ValueError
        If ``filename`` ends with neither ``.arff`` nor ``.csv``.
    """
    if filename.endswith(".arff"):
        # Imported here so that CSV loading does not depend on the ARFF loader.
        from hdict.dataset.dataset import load

        with open(filename) as f:
            # NOTE(review): line[9:-1] keeps any whitespace between the keyword
            # and the name, and drops the last character (normally the newline).
            relation = next(
                (line[9:-1] for line in f if line[:9].upper() == "@RELATION"),
                None,
            )
            f.seek(0)  # rewind so the loader sees the whole file
            df = load(f)
        df = _index_and_transpose(df, index, transpose)
        if hide_types:
            df = df.rename(columns={k: k.split("@")[0] for k in df.columns})
        name = relation or filename
    elif filename.endswith(".csv"):
        from pandas import read_csv

        df = _index_and_transpose(read_csv(filename), index, transpose)
        name = filename
    else:  # pragma: no cover
        raise ValueError(f"Unknown extension {filename.split('.')[-1]}.")
    return (df, name) if return_name else df