Module `idict.data.serialization`

Expand source code

#  Copyright (c) 2021. Davi Pereira dos Santos
#  This file is part of the i-dict project.
#  Please respect the license - more about this in the section (*) below.
#
#  i-dict is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  i-dict is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with i-dict.  If not, see <http://www.gnu.org/licenses/>.
#
#  (*) Removing authorship by any means, e.g. by distribution of derived
#  works or verbatim, obfuscated, compiled or rewritten versions of any
#  part of this work is illegal and it is unethical regarding the effort and
#  time spent here.
#
import json
from importlib import import_module

from orjson import dumps, OPT_SORT_KEYS


def import_dependence(dep):
    try:
        return import_module(dep)
    except ImportError as e:
        raise Exception(f"Missing {dep} library. Need a complete install\n" "pip install -U idict[full]")


def custom_orjson_encoder(obj):
    # E.g., pandas dataframes.
    typ = str(type(obj))
    if typ == "<class 'pandas.core.frame.DataFrame'>":
        return obj.to_numpy()
    if typ == "<class 'pandas.core.series.Series'>":
        return obj.to_numpy()
    # if hasattr(obj, 'to_json'):
    #     # REMINDER: default_handler=str is to avoid infinite recursion, e.g., on iris.arff
    #     txt = obj.to_json(force_ascii=False, default_handler=str)
    #     return {"_type_orjson": str(type(obj)), "_obj.to_json()": txt}

    # Numpy objects generic type and ndarray, keeping dtype.
    if typ == "<class 'numpy.ndarray'>":
        print(typ)
        try:
            return serialize_numpy(obj)
        except Exception as e:
            print(e)
            exit()

    # try:
    #     import numpy
    #     if isinstance(obj, numpy.generic):
    #         return {"_type_orjson": str(obj.dtype), "_numpy.asscalar(obj)": numpy.asscalar(obj)}
    #     if isinstance(obj, numpy.ndarray):
    #         return {"_type_orjson": str(obj.dtype), "_numpy.ndarray.tolist()": obj.tolist()}
    # except ImportError as e:
    #     pass

    if isinstance(obj, bytes):
        return obj.decode()  # nem qq byte vira string!
    raise TypeError


def json_object_hook_decoder(dic):
    if "_type_orjson" in dic:
        if "_obj.to_json()" in dic:
            if dic["_type_orjson"] == "<class 'pandas.core.frame.DataFrame'>":
                m = import_dependence("pandas")
                return m.read_json(dic["_obj.to_json()"])  # , default_handler=str)
            if dic["_type_orjson"] == "<class 'pandas.core.series.Series'>":
                m = import_dependence("pandas")
                # default_handler=callable
                return m.read_json(dic["_obj.to_json()"], typ=dic["_type_orjson"])
            else:  # pragma: no cover
                raise Exception(f"Cannot desserialize object of type '{dic['_type_orjson']}'")
        if (c := "_numpy.asscalar(obj)") in dic or (c := "_numpy.ndarray.tolist()") in dic:
            m = import_dependence("numpy")
            dtype = "str" if len(dic["_type_orjson"]) > 10 else dic["_type_orjson"]
            return m.array(dic[c], dtype=dtype)
    return dic


def serialize_json(obj):
    # r"""
    # >>> import numpy as np
    # >>> import math
    # >>> a = np.array([[1/3, 5/4], [1.3**6, "text"]])
    # >>> a
    # array([['0.3333333333333333', '1.25'],
    #        ['4.826809000000001', 'text']], dtype='<U32')
    # >>> b = np.array([[1/3,5/4], [1.3**6, 4]], dtype = np.int64)
    # >>> b
    # array([[0, 1],
    #        [4, 4]])
    # >>> c = np.array([[1/3,5/4], [1.3**6, 4]], dtype = np.int8)
    # >>> c
    # array([[0, 1],
    #        [4, 4]], dtype=int8)
    # >>> serialize_json([math.inf, a, b, c])
    # b'[null,{"_numpy.ndarray.tolist()":[["0.3333333333333333","1.25"],["4.826809000000001","text"]],"_type_orjson":"<U32"},{"_numpy.ndarray.tolist()":[[0,1],[4,4]],"_type_orjson":"int64"},{"_numpy.ndarray.tolist()":[[0,1],[4,4]],"_type_orjson":"int8"}]'
    # >>> import pandas as pd
    # >>> df = pd.DataFrame(
    # ...     [[1/3, 5/4], [1.3**54, "text"]],
    # ...     index=["row 1", "row 2"],
    # ...     columns=["col 1", "col 2"],
    # ... )
    # >>> df
    #               col 1 col 2
    # row 1  3.333333e-01  1.25
    # row 2  1.422136e+06  text
    # >>> serialize_json(df)
    # b'{"_obj.to_json()":"{\\"col 1\\":{\\"row 1\\":0.3333333333,\\"row 2\\":1422135.6537506874},\\"col 2\\":{\\"row 1\\":1.25,\\"row 2\\":\\"text\\"}}","_type_orjson":"<class \'pandas.core.frame.DataFrame\'>"}'
    # >>> s = pd.Series(
    # ...     [1/3, 5/4, (1.3)**54, "text"],
    # ...     index=["row 1", "row 2", "row 3", "row 4"],
    # ... )
    # >>> s
    # row 1          0.333333
    # row 2              1.25
    # row 3    1422135.653751
    # row 4              text
    # dtype: object
    # >>> serialize_json(s)
    # b'{"_obj.to_json()":"{\\"row 1\\":0.3333333333,\\"row 2\\":1.25,\\"row 3\\":1422135.6537506874,\\"row 4\\":\\"text\\"}","_type_orjson":"<class \'pandas.core.series.Series\'>"}'
    # """
    return dumps(obj, default=custom_orjson_encoder, option=OPT_SORT_KEYS)


def deserialize_json(blob):
    r"""
    >>> deserialize_json(b'null')
    >>> deserialize_json(b'{"_numpy.ndarray.tolist()":[["0.3333333333333333","1.25"],["4.826809000000001","text"]],"_type_orjson":"<U32"}')
    array([['0.3333333333333333', '1.25'],
           ['4.826809000000001', 'text']], dtype='<U32')
    >>> deserialize_json(b'{"_numpy.ndarray.tolist()":[[0,1],[4,4]],"_type_orjson":"int64"}')
    array([[0, 1],
           [4, 4]])
    >>> deserialize_json(b'{"_numpy.ndarray.tolist()":[[0,1],[4,4]],"_type_orjson":"int8"}')
    array([[0, 1],
           [4, 4]], dtype=int8)
    >>> deserialize_json(b'{"_obj.to_json()":"{\\"col 1\\":{\\"row 1\\":0.3333333333,\\"row 2\\":1422135.6537506874},\\"col 2\\":{\\"row 1\\":1.25,\\"row 2\\":\\"text\\"}}","_type_orjson":"<class \'pandas.core.frame.DataFrame\'>"}')
                  col 1 col 2
    row 1  3.333333e-01  1.25
    row 2  1.422136e+06  text
    >>> deserialize_json(b'{"_obj.to_json()":"{\\"row 1\\":0.3333333333,\\"row 2\\":1.25,\\"row 3\\":1422135.6537506874,\\"row 4\\":\\"text\\"}","_type_orjson":"<class \'pandas.core.series.Series\'>"}')
    row 1          0.333333
    row 2              1.25
    row 3    1422135.653751
    row 4              text
    dtype: object
    """
    return json.loads(blob, object_hook=json_object_hook_decoder)


def serialize_numpy(obj):
    # r"""
    # >>> import numpy as np
    # >>> m = np.array([1,2,3,4])
    # >>> m
    # array([1, 2, 3, 4])
    # >>> deserialize_numpy(serialize_numpy(m))
    # array([1, 2, 3, 4])
    # >>> m = np.array([[1,2],[3,4]])
    # >>> m
    # array([[1, 2],
    #        [3, 4]])
    # >>> deserialize_numpy(serialize_numpy(m))
    # array([[1, 2],
    #        [3, 4]])
    # >>> m = np.array([1,2.7,3,4])
    # >>> deserialize_numpy(serialize_numpy(m))
    # array([1. , 2.7, 3. , 4. ])
    # >>> m = np.array([[1,2],[3,4/3]])
    # >>> deserialize_numpy(serialize_numpy(m))
    # array([[1.        , 2.        ],
    #        [3.        , 1.33333333]])
    # >>> m = np.array([1,2,3,"txt"])
    # >>> deserialize_numpy(serialize_numpy(m))
    # array(['1', '2', '3', 'txt'], dtype='<U21')
    # >>> m = np.array([[1,"txt"],[3,4]])
    # >>> deserialize_numpy(serialize_numpy(m))
    # array([['1', 'txt'],
    #        ['3', '4']], dtype='<U21')
    # """
    import numpy

    if isinstance(obj, numpy.ndarray):
        dims = len(obj.shape)
        dtype = str(obj.dtype)
        headerlen = 1 + len(dtype)
        header = f"{headerlen}§{dims}§{dtype}§".encode() + integers2bytes(obj.shape)
        return header, obj.data
    raise Exception(f"Cannot handle this type {type(obj)}, check its shape or dtype")


# passar memoryview direto pro compressor, talvez cada serializador tenha que comprimir por si
# faz msm mais sentido o header fora da parte comprimida
# tentar fazer com que json resolva tudo, só q delegando numpy e pandas pra mim. testar depois veocidade
#   vantagem: numpys nested inside dict/lists/sets
#   desvantagem?: converter de bytes p/ str


def deserialize_numpy(blob):
    import numpy

    view = memoryview(blob)
    prefix, dtype, hw = view[:30].split(b"_")
    dims = int(chr(prefix[2]))
    dtype = dtype.decode().rstrip()
    h, w = bytes2integers(hw)
    dump = blob[30:]
    m = numpy.frombuffer(dump, dtype=dtype)
    if dims == 2:
        m = numpy.reshape(m, newshape=(h, w))
    return m


def integers2bytes(lst, n=4) -> bytes:
    """Each int becomes N bytes. max=4294967294 for 4 bytes"""
    return b"".join(d.to_bytes(n, byteorder="little") for d in lst)


def bytes2integers(bytes_content: bytes, n=4):
    """Each 4 bytes become an int."""
    return [int.from_bytes(bytes_content[i : i + n], "little") for i in range(0, len(bytes_content), n)]

Functions

def bytes2integers(bytes_content: bytes, n=4)

Each 4 bytes become an int.

Expand source code

def bytes2integers(bytes_content: bytes, n=4):
    """Each 4 bytes become an int."""
    return [int.from_bytes(bytes_content[i : i + n], "little") for i in range(0, len(bytes_content), n)]

def custom_orjson_encoder(obj)

Expand source code

def custom_orjson_encoder(obj):
    # E.g., pandas dataframes.
    typ = str(type(obj))
    if typ == "<class 'pandas.core.frame.DataFrame'>":
        return obj.to_numpy()
    if typ == "<class 'pandas.core.series.Series'>":
        return obj.to_numpy()
    # if hasattr(obj, 'to_json'):
    #     # REMINDER: default_handler=str is to avoid infinite recursion, e.g., on iris.arff
    #     txt = obj.to_json(force_ascii=False, default_handler=str)
    #     return {"_type_orjson": str(type(obj)), "_obj.to_json()": txt}

    # Numpy objects generic type and ndarray, keeping dtype.
    if typ == "<class 'numpy.ndarray'>":
        print(typ)
        try:
            return serialize_numpy(obj)
        except Exception as e:
            print(e)
            exit()

    # try:
    #     import numpy
    #     if isinstance(obj, numpy.generic):
    #         return {"_type_orjson": str(obj.dtype), "_numpy.asscalar(obj)": numpy.asscalar(obj)}
    #     if isinstance(obj, numpy.ndarray):
    #         return {"_type_orjson": str(obj.dtype), "_numpy.ndarray.tolist()": obj.tolist()}
    # except ImportError as e:
    #     pass

    if isinstance(obj, bytes):
        return obj.decode()  # nem qq byte vira string!
    raise TypeError

def deserialize_json(blob)

>>> deserialize_json(b'null')
>>> deserialize_json(b'{"_numpy.ndarray.tolist()":[["0.3333333333333333","1.25"],["4.826809000000001","text"]],"_type_orjson":"<U32"}')
array([['0.3333333333333333', '1.25'],
       ['4.826809000000001', 'text']], dtype='<U32')
>>> deserialize_json(b'{"_numpy.ndarray.tolist()":[[0,1],[4,4]],"_type_orjson":"int64"}')
array([[0, 1],
       [4, 4]])
>>> deserialize_json(b'{"_numpy.ndarray.tolist()":[[0,1],[4,4]],"_type_orjson":"int8"}')
array([[0, 1],
       [4, 4]], dtype=int8)
>>> deserialize_json(b'{"_obj.to_json()":"{\\"col 1\\":{\\"row 1\\":0.3333333333,\\"row 2\\":1422135.6537506874},\\"col 2\\":{\\"row 1\\":1.25,\\"row 2\\":\\"text\\"}}","_type_orjson":"<class \'pandas.core.frame.DataFrame\'>"}')
              col 1 col 2
row 1  3.333333e-01  1.25
row 2  1.422136e+06  text
>>> deserialize_json(b'{"_obj.to_json()":"{\\"row 1\\":0.3333333333,\\"row 2\\":1.25,\\"row 3\\":1422135.6537506874,\\"row 4\\":\\"text\\"}","_type_orjson":"<class \'pandas.core.series.Series\'>"}')
row 1          0.333333
row 2              1.25
row 3    1422135.653751
row 4              text
dtype: object

Expand source code

def deserialize_json(blob):
    r"""
    >>> deserialize_json(b'null')
    >>> deserialize_json(b'{"_numpy.ndarray.tolist()":[["0.3333333333333333","1.25"],["4.826809000000001","text"]],"_type_orjson":"<U32"}')
    array([['0.3333333333333333', '1.25'],
           ['4.826809000000001', 'text']], dtype='<U32')
    >>> deserialize_json(b'{"_numpy.ndarray.tolist()":[[0,1],[4,4]],"_type_orjson":"int64"}')
    array([[0, 1],
           [4, 4]])
    >>> deserialize_json(b'{"_numpy.ndarray.tolist()":[[0,1],[4,4]],"_type_orjson":"int8"}')
    array([[0, 1],
           [4, 4]], dtype=int8)
    >>> deserialize_json(b'{"_obj.to_json()":"{\\"col 1\\":{\\"row 1\\":0.3333333333,\\"row 2\\":1422135.6537506874},\\"col 2\\":{\\"row 1\\":1.25,\\"row 2\\":\\"text\\"}}","_type_orjson":"<class \'pandas.core.frame.DataFrame\'>"}')
                  col 1 col 2
    row 1  3.333333e-01  1.25
    row 2  1.422136e+06  text
    >>> deserialize_json(b'{"_obj.to_json()":"{\\"row 1\\":0.3333333333,\\"row 2\\":1.25,\\"row 3\\":1422135.6537506874,\\"row 4\\":\\"text\\"}","_type_orjson":"<class \'pandas.core.series.Series\'>"}')
    row 1          0.333333
    row 2              1.25
    row 3    1422135.653751
    row 4              text
    dtype: object
    """
    return json.loads(blob, object_hook=json_object_hook_decoder)

def deserialize_numpy(blob)

Expand source code

def deserialize_numpy(blob):
    import numpy

    view = memoryview(blob)
    prefix, dtype, hw = view[:30].split(b"_")
    dims = int(chr(prefix[2]))
    dtype = dtype.decode().rstrip()
    h, w = bytes2integers(hw)
    dump = blob[30:]
    m = numpy.frombuffer(dump, dtype=dtype)
    if dims == 2:
        m = numpy.reshape(m, newshape=(h, w))
    return m

def import_dependence(dep)

Expand source code

def import_dependence(dep):
    try:
        return import_module(dep)
    except ImportError as e:
        raise Exception(f"Missing {dep} library. Need a complete install\n" "pip install -U idict[full]")

def integers2bytes(lst, n=4) ‑> bytes

Each int becomes N bytes. max=4294967294 for 4 bytes

Expand source code

def integers2bytes(lst, n=4) -> bytes:
    """Each int becomes N bytes. max=4294967294 for 4 bytes"""
    return b"".join(d.to_bytes(n, byteorder="little") for d in lst)

def json_object_hook_decoder(dic)

Expand source code

def json_object_hook_decoder(dic):
    if "_type_orjson" in dic:
        if "_obj.to_json()" in dic:
            if dic["_type_orjson"] == "<class 'pandas.core.frame.DataFrame'>":
                m = import_dependence("pandas")
                return m.read_json(dic["_obj.to_json()"])  # , default_handler=str)
            if dic["_type_orjson"] == "<class 'pandas.core.series.Series'>":
                m = import_dependence("pandas")
                # default_handler=callable
                return m.read_json(dic["_obj.to_json()"], typ=dic["_type_orjson"])
            else:  # pragma: no cover
                raise Exception(f"Cannot desserialize object of type '{dic['_type_orjson']}'")
        if (c := "_numpy.asscalar(obj)") in dic or (c := "_numpy.ndarray.tolist()") in dic:
            m = import_dependence("numpy")
            dtype = "str" if len(dic["_type_orjson"]) > 10 else dic["_type_orjson"]
            return m.array(dic[c], dtype=dtype)
    return dic

def serialize_json(obj)

Expand source code

def serialize_json(obj):
    # r"""
    # >>> import numpy as np
    # >>> import math
    # >>> a = np.array([[1/3, 5/4], [1.3**6, "text"]])
    # >>> a
    # array([['0.3333333333333333', '1.25'],
    #        ['4.826809000000001', 'text']], dtype='<U32')
    # >>> b = np.array([[1/3,5/4], [1.3**6, 4]], dtype = np.int64)
    # >>> b
    # array([[0, 1],
    #        [4, 4]])
    # >>> c = np.array([[1/3,5/4], [1.3**6, 4]], dtype = np.int8)
    # >>> c
    # array([[0, 1],
    #        [4, 4]], dtype=int8)
    # >>> serialize_json([math.inf, a, b, c])
    # b'[null,{"_numpy.ndarray.tolist()":[["0.3333333333333333","1.25"],["4.826809000000001","text"]],"_type_orjson":"<U32"},{"_numpy.ndarray.tolist()":[[0,1],[4,4]],"_type_orjson":"int64"},{"_numpy.ndarray.tolist()":[[0,1],[4,4]],"_type_orjson":"int8"}]'
    # >>> import pandas as pd
    # >>> df = pd.DataFrame(
    # ...     [[1/3, 5/4], [1.3**54, "text"]],
    # ...     index=["row 1", "row 2"],
    # ...     columns=["col 1", "col 2"],
    # ... )
    # >>> df
    #               col 1 col 2
    # row 1  3.333333e-01  1.25
    # row 2  1.422136e+06  text
    # >>> serialize_json(df)
    # b'{"_obj.to_json()":"{\\"col 1\\":{\\"row 1\\":0.3333333333,\\"row 2\\":1422135.6537506874},\\"col 2\\":{\\"row 1\\":1.25,\\"row 2\\":\\"text\\"}}","_type_orjson":"<class \'pandas.core.frame.DataFrame\'>"}'
    # >>> s = pd.Series(
    # ...     [1/3, 5/4, (1.3)**54, "text"],
    # ...     index=["row 1", "row 2", "row 3", "row 4"],
    # ... )
    # >>> s
    # row 1          0.333333
    # row 2              1.25
    # row 3    1422135.653751
    # row 4              text
    # dtype: object
    # >>> serialize_json(s)
    # b'{"_obj.to_json()":"{\\"row 1\\":0.3333333333,\\"row 2\\":1.25,\\"row 3\\":1422135.6537506874,\\"row 4\\":\\"text\\"}","_type_orjson":"<class \'pandas.core.series.Series\'>"}'
    # """
    return dumps(obj, default=custom_orjson_encoder, option=OPT_SORT_KEYS)

def serialize_numpy(obj)

Expand source code

def serialize_numpy(obj):
    # r"""
    # >>> import numpy as np
    # >>> m = np.array([1,2,3,4])
    # >>> m
    # array([1, 2, 3, 4])
    # >>> deserialize_numpy(serialize_numpy(m))
    # array([1, 2, 3, 4])
    # >>> m = np.array([[1,2],[3,4]])
    # >>> m
    # array([[1, 2],
    #        [3, 4]])
    # >>> deserialize_numpy(serialize_numpy(m))
    # array([[1, 2],
    #        [3, 4]])
    # >>> m = np.array([1,2.7,3,4])
    # >>> deserialize_numpy(serialize_numpy(m))
    # array([1. , 2.7, 3. , 4. ])
    # >>> m = np.array([[1,2],[3,4/3]])
    # >>> deserialize_numpy(serialize_numpy(m))
    # array([[1.        , 2.        ],
    #        [3.        , 1.33333333]])
    # >>> m = np.array([1,2,3,"txt"])
    # >>> deserialize_numpy(serialize_numpy(m))
    # array(['1', '2', '3', 'txt'], dtype='<U21')
    # >>> m = np.array([[1,"txt"],[3,4]])
    # >>> deserialize_numpy(serialize_numpy(m))
    # array([['1', 'txt'],
    #        ['3', '4']], dtype='<U21')
    # """
    import numpy

    if isinstance(obj, numpy.ndarray):
        dims = len(obj.shape)
        dtype = str(obj.dtype)
        headerlen = 1 + len(dtype)
        header = f"{headerlen}§{dims}§{dtype}§".encode() + integers2bytes(obj.shape)
        return header, obj.data
    raise Exception(f"Cannot handle this type {type(obj)}, check its shape or dtype")