# Source code for revscoring.datasources.meta.gramming

"""
These meta-datasources operate on :class:`revscoring.Datasource` instances
that return a list of strings (i.e. "tokens") and produce a list of
ngram/skipgram sequences.

.. autoclass:: revscoring.datasources.meta.gramming.gram

"""
from ..datasource import Datasource


class gram(Datasource):
    """
    Converts a sequence of items into ngrams.

    :Parameters:
        items_datasource : :class:`revscoring.Datasource`
            A datasource that generates a list of some item
        grams : `list` ( `tuple` ( `int` ) )
            A list of ngram and/or skipgram sequences to produce.  Each tuple
            holds the offsets (relative to the current position) of the items
            that make up one gram.  Defaults to ``[(0,)]``, i.e. one
            single-item gram per position.
        name : `str`
            A name for the datasource.
    """

    def __init__(self, items_datasource, grams=None, name=None):
        # Build a fresh default per instance.  A literal list in the signature
        # would be shared by every instance through ``self.grams``.
        if grams is None:
            grams = [(0,)]

        # Resolve the default before formatting the name so the name is built
        # from the same ``[(0,)]`` value as when it is passed explicitly.
        name = self._format_name(name, [items_datasource, grams])
        super().__init__(name, self.process, depends_on=[items_datasource])
        self.grams = grams

    def process(self, tokens):
        """
        Return the list of grams that :func:`gram_tokens` generates for
        `tokens` using this datasource's ``grams`` patterns.
        """
        return list(gram_tokens(tokens, grams=self.grams))


def gram_tokens(items, grams=None):
    """
    Generates ngram/skipgram tuples from a sequence of items.

    For every position in `items` (in order) and every offset pattern in
    `grams` (in order), yields the tuple of items found at
    ``position + offset`` for each offset in the pattern.  A pattern is
    skipped at positions where any of its offsets would fall outside of
    `items`.

    :Parameters:
        items : `list`
            An indexable sequence of items (it must support ``len()`` and
            integer indexing)
        grams : `iterable` ( `tuple` ( `int` ) )
            Offset patterns to produce, e.g. ``(0, 1)`` for bigrams or
            ``(0, 2)`` for a skipgram.  Defaults to ``[(0,)]``, i.e. one
            single-item gram per position.

    :Returns:
        A generator of `tuple`

    :Raises:
        ValueError : if `items` is non-empty and a pattern contains no
        offsets

    :Example:
        >>> list(gram_tokens(["a", "b", "c"], grams=[(0,), (0, 1)]))
        [('a',), ('a', 'b'), ('b',), ('b', 'c'), ('c',)]
    """
    if grams is None:
        grams = [(0,)]

    item_count = len(items)
    if item_count == 0:
        # Nothing to yield; the patterns are never inspected.
        return

    # Materialise the patterns once, with their lowest and highest offsets.
    # This hoists min()/max() out of the per-position loop and lets a
    # one-shot iterable of patterns be reused for every position.
    patterns = [(offsets, min(offsets), max(offsets)) for offsets in grams]

    for position in range(item_count):
        for offsets, lowest, highest in patterns:
            # Check both ends: without the lower bound, a negative offset
            # would wrap around to the end of `items` through Python's
            # negative indexing instead of being skipped.
            if position + lowest >= 0 and position + highest < item_count:
                yield tuple(items[position + offset] for offset in offsets)