Source code for revscoring.datasources.meta.hashing
"""
These meta-datasources operate on a :class:`revscoring.Datasource` that
returns a list of items (e.g. "tokens") and produce a list of portable
integer hashes, one per item.
.. autoclass:: revscoring.datasources.meta.hashing.hash
"""
import json
import mmh3
from ..datasource import Datasource
class hash(Datasource):
    """
    Converts a sequence of items into a sequence of portable hashes (`int`)
    based on the JSON serialization of each item.  E.g. the item `["foo"]`
    is hashed as the string `'["foo"]'`.

    :Parameters:
        items_datasource : :class:`revscoring.Datasource`
            A datasource that generates a list of items to be hashed
        n : `int`
            The number of potential hashes that can be produced.  Every
            hash is reduced modulo `n`.
        name : `str`
            A name for the datasource.
    """

    def __init__(self, items_datasource, n=2 ** 20, name=None):
        name = self._format_name(name, [items_datasource, n])
        super().__init__(name, self.process,
                         depends_on=[items_datasource])
        # Read by process() as the modulus applied to every hash.
        self.n = n

    def process(self, items):
        """
        Hashes every element of `items` with `mmh3_item`, preserving order.

        :Parameters:
            items : `iterable`
                The items to hash; each one is passed to `mmh3_item`
                together with `self.n`.

        :Returns:
            A `list` with one hash per item.
        """
        return [mmh3_item(item, self.n) for item in items]
def mmh3_item(item, n):
    """
    Hashes a single item into an `int` reduced modulo `n`.

    :Parameters:
        item : `mixed`
            Any value accepted by `json.dumps()`; its JSON text is what
            gets hashed.
        n : `int`
            The modulus applied to the shifted hash value.

    :Returns:
        `(2**32 + mmh3.hash(<json text>)) % n`
    """
    serialized = json.dumps(item)
    raw_hash = mmh3.hash(serialized)
    # NOTE(review): mmh3.hash is documented to return a signed 32-bit value
    # by default, so the 2**32 offset presumably keeps the operand positive
    # before the modulo -- confirm against the mmh3 version in use.
    shifted = 2**32 + raw_hash
    return shifted % n