Module idict.core.identification
Expand source code
# Copyright (c) 2021. Davi Pereira dos Santos
# This file is part of the idict project.
# Please respect the license - more about this in the section (*) below.
#
# idict is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# idict is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with idict. If not, see <http://www.gnu.org/licenses/>.
#
# (*) Removing authorship by any means, e.g. by distribution of derived
# works or verbatim, obfuscated, compiled or rewritten versions of any
# part of this work is illegal and unethical regarding the effort and
# time spent here.
import dis
import pickle
from inspect import signature
from garoupa import Hosh, UT40_4, Identity
from ldict.exception import NoInputException
from orjson import dumps
from idict.data.compression import pack, NondeterminismException
dill_warned = False
def f2bin(f, approach):
    """
    Serialize a function into bytes to be used as content for hashing.

    Except for the "dill" approach, the result is a representation of the
    function code followed by its pickled signature (parameter names mapped
    to their default values, or to None when there is no default).

    Parameters
    ----------
    f
        Callable to be serialized. It should declare at least one parameter.
    approach
        "clean": disassembled bytecode (``dis``) with line numbers removed;
        "direct": raw ``co_code`` followed by ``str(co_consts)``;
        "dill": ``dill.dumps(f)``; a warning is printed on first use.

    Returns
    -------
    The serialized function as bytes, or None when a parameter is named "_".

    Raises
    ------
    NoInputException
        When ``f`` declares no parameters.
    ValueError
        When ``approach`` is not one of "clean", "direct" or "dill".
    """
    # Add signature.
    fields_and_params = {
        param.name: None if param.default is param.empty else param.default
        for param in signature(f).parameters.values()
    }
    if not fields_and_params:
        raise NoInputException("Missing function input parameters.")
    if "_" in fields_and_params:
        return None

    if approach == "clean":
        # Remove line numbers: 'dis' output is split at blank lines and the first
        # non-empty segment of each group is dropped.
        groups = [group for group in dis.Bytecode(f).dis().split("\n\n") if group]
        clean_lines = [[segment for segment in group.split(" ") if segment][1:] for group in groups]
        return dumps(clean_lines) + pickle.dumps(fields_and_params, protocol=5)

    if approach == "direct":
        code = f.__code__
        code_bin = code.co_code + str(code.co_consts).encode()
        # TODO (minor): replace pickle for a deterministic dill if possible?
        #   it could allow a broader range of default values (numpy, models)
        return code_bin + pickle.dumps(fields_and_params, protocol=5)

    if approach == "dill":
        # TODO (minor): one advantage of dill here is to be able to hash a custom callable, instead of only functions.
        #   However, preferably, custom callables are expected to provide custom ids.
        global dill_warned
        if not dill_warned:
            dill_warned = True
            print("WARNING: using 'dill' to hash functions is not determinist")
        import dill

        return dill.dumps(f)

    # Falling through would return None, which callers could mistake for the
    # 'parameter named "_"' case above; fail loudly instead.
    raise ValueError(f"Unknown approach: {approach!r}. Expected 'clean', 'direct' or 'dill'.")
def fhosh(f, version, approach="clean"):
    """
    Create hosh with etype="ordered" using bytecode of "f" as binary content for blake3.

    For some insight on the algorithm choice inside GaROUPa, see, e.g.:
    https://news.ycombinator.com/item?id=22021984

    Usage:

    >>> print(fhosh(lambda x: {"z": x**2}, UT40_4))
    p2MGclmVa-FRxu5kFQ65RNjiK42otvusPZ9LGCi4
    >>> print(fhosh(lambda x, name=[1, 2, Ellipsis, ..., 10]: {"z": x**2}, UT40_4))
    3NPAab2SC5lsIz5ekeIQMeQU9EKRW1dYvpUsywyr

    Parameters
    ----------
    f
        Callable to be identified. If it already has a "hosh" attribute, that
        value is returned as is, regardless of "version" and "approach".
    version
        Forwarded to the Hosh/Identity constructors.
    approach
        Forwarded to f2bin to select how "f" is serialized.

    Returns
    -------
    The hosh of "f": an Identity when f2bin returns None, otherwise an
    "ordered" Hosh built from the serialized function.
    """
    # A previously attached hosh (cached or provided by the user) takes precedence.
    if hasattr(f, "hosh"):
        return f.hosh

    content = f2bin(f, approach)
    if content is None:
        hosh = Identity(version=version)
    else:
        hosh = Hosh(content, "ordered", version=version)

    # Caching on the callable is best-effort: some callables (e.g. bound methods)
    # reject attribute assignment, and that should not prevent hashing them.
    try:
        f.hosh = hosh
    except AttributeError:
        pass
    return hosh
def blobs_hashes_hoshes(data, identity, ids, version):
    """
    Compute the blobs, content hashes and field hoshes for the entries of "data".

    For a field whose name is in "ids", only its hosh is produced, as
    ``identity * ids[field]``. For any other field, a content hash is obtained
    (from the value's own "hosh" when it is an idict, from its packed blob,
    or from "fhosh" when packing raises NondeterminismException) and then
    combined with the encoded field name to produce the field hosh.

    >>> from idict import idict
    >>> idict(x=1, y=2, z=3, _ids={"y": "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"}).show(colored=False)
    {
        "x": 1,
        "y": 2,
        "z": 3,
        "_id": "Sv8G-WU9SZL90Tus885EWBBf3koyyyyyyyyyyyyy",
        "_ids": {
            "x": "fH_5142f0a4338a1da2ca3159e2d1011981ac890 (content: l8_09c7059156c4ed2aea46243e9d4b36c01f272)",
            "y": "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy",
            "z": "Nx_e12377018e5ab54023f91f7c6b7aea6676b60 (content: S5_331b7e710abd1443cd82d6b5cdafb9f04d5ab)"
        }
    }

    Returns
    -------
    A dict with the keys "blobs", "hashes" and "hoshes", each one mapping
    field names to the respective values.
    """
    from idict.core.frozenidentifieddict import FrozenIdentifiedDict
    from idict.core.idict_ import Idict

    blobs, hashes, hoshes = {}, {}, {}
    for k, v in data.items():
        # A field with a provided id needs neither a blob nor a content hash.
        if k in ids:
            hoshes[k] = identity * ids[k]
            continue

        if isinstance(v, (Idict, FrozenIdentifiedDict)):
            # Nested idicts already carry their own hosh.
            content_hosh = v.hosh
        else:
            try:
                packed = pack(v)
                blobs[k] = packed
                content_hosh = identity.h * packed
            except NondeterminismException:
                # The value cannot be packed deterministically: hash it as a function.
                content_hosh = fhosh(v, version)
        hashes[k] = content_hosh

        try:
            hoshes[k] = content_hosh ** k.encode()
        except KeyError as e:  # pragma: no cover
            raise Exception(
                f"{str(e)} is not allowed in field name: {k}. It is only accepted as the first character to indicate a metafield."
            )
    return {"blobs": blobs, "hashes": hashes, "hoshes": hoshes}
Functions
def blobs_hashes_hoshes(data, identity, ids, version)
-
>>> from idict import idict >>> idict(x=1, y=2, z=3, _ids={"y": "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"}).show(colored=False) { "x": 1, "y": 2, "z": 3, "_id": "Sv8G-WU9SZL90Tus885EWBBf3koyyyyyyyyyyyyy", "_ids": { "x": "fH_5142f0a4338a1da2ca3159e2d1011981ac890 (content: l8_09c7059156c4ed2aea46243e9d4b36c01f272)", "y": "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", "z": "Nx_e12377018e5ab54023f91f7c6b7aea6676b60 (content: S5_331b7e710abd1443cd82d6b5cdafb9f04d5ab)" } }
Expand source code
def blobs_hashes_hoshes(data, identity, ids, version): """ >>> from idict import idict >>> idict(x=1, y=2, z=3, _ids={"y": "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"}).show(colored=False) { "x": 1, "y": 2, "z": 3, "_id": "Sv8G-WU9SZL90Tus885EWBBf3koyyyyyyyyyyyyy", "_ids": { "x": "fH_5142f0a4338a1da2ca3159e2d1011981ac890 (content: l8_09c7059156c4ed2aea46243e9d4b36c01f272)", "y": "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", "z": "Nx_e12377018e5ab54023f91f7c6b7aea6676b60 (content: S5_331b7e710abd1443cd82d6b5cdafb9f04d5ab)" } } """ from idict.core.frozenidentifieddict import FrozenIdentifiedDict from idict.core.idict_ import Idict blobs = {} hashes = {} hoshes = {} for k, v in data.items(): if k in ids: hoshes[k] = identity * ids[k] else: if isinstance(v, (Idict, FrozenIdentifiedDict)): hashes[k] = v.hosh else: try: blobs[k] = pack(v) vhosh = identity.h * blobs[k] except NondeterminismException: vhosh = fhosh(v, version) hashes[k] = vhosh try: hoshes[k] = hashes[k] ** k.encode() except KeyError as e: # pragma: no cover raise Exception( f"{str(e)} is not allowed in field name: {k}. It is only accepted as the first character to indicate a metafield." ) return dict(blobs=blobs, hashes=hashes, hoshes=hoshes)
def f2bin(f, approach)
-
Expand source code
def f2bin(f, approach): # Add signature. fields_and_params = signature(f).parameters.values() fields_and_params = {v.name: None if v.default is v.empty else v.default for v in fields_and_params} if not fields_and_params: raise NoInputException(f"Missing function input parameters.") if "_" in fields_and_params: return None if approach == "clean": # Remove line numbers. groups = [l for l in dis.Bytecode(f).dis().split("\n\n") if l] clean_lines = [] for group in groups: lines = [segment for segment in group.split(" ") if segment][1:] clean_lines.append(lines) return dumps(clean_lines) + pickle.dumps(fields_and_params, protocol=5) if approach == "direct": c = f.__code__ code_bin = c.co_code + str(c.co_consts).encode() # TODO (minor): replace pickle for a deterministic dill if possible? # it could allow a broader range of default values (numpy, models) return code_bin + pickle.dumps(fields_and_params, protocol=5) if approach == "dill": # TODO (minor): one advantage of dill here is to be able to hash a custom callable, instead of only functions. # However, preferably, custom callables are expected to provided custom ids. global dill_warned if not dill_warned: dill_warned = True print("WARNING: using 'dill' to hash functions is not determinist") import dill return dill.dumps(f)
def fhosh(f, version, approach='clean')
-
Create hosh with etype="ordered" using bytecode of "f" as binary content for blake3.
For some insight on the algorithm choice inside GaROUPa, see, e.g.: https://news.ycombinator.com/item?id=22021984
Usage:
>>> print(fhosh(lambda x: {"z": x**2}, UT40_4)) p2MGclmVa-FRxu5kFQ65RNjiK42otvusPZ9LGCi4
>>> print(fhosh(lambda x, name=[1, 2, Ellipsis, ..., 10]: {"z": x**2}, UT40_4)) 3NPAab2SC5lsIz5ekeIQMeQU9EKRW1dYvpUsywyr
Parameters
f
version
Returns
Expand source code
def fhosh(f, version, approach="clean"): """ Create hosh with etype="ordered" using bytecode of "f" as binary content for blake3. For some insight on the algorithm choice inside GaROUPa, see, e.g.: https://news.ycombinator.com/item?id=22021984 Usage: >>> print(fhosh(lambda x: {"z": x**2}, UT40_4)) p2MGclmVa-FRxu5kFQ65RNjiK42otvusPZ9LGCi4 >>> print(fhosh(lambda x, name=[1, 2, Ellipsis, ..., 10]: {"z": x**2}, UT40_4)) 3NPAab2SC5lsIz5ekeIQMeQU9EKRW1dYvpUsywyr Parameters ---------- f version Returns ------- """ if hasattr(f, "hosh"): return f.hosh if (bin := f2bin(f, approach)) is None: f.hosh = Identity(version=version) else: f.hosh = Hosh(bin, "ordered", version=version) return f.hosh