# Source code for revscoring.datasources.meta.selectors

"""
These meta-datasources operate on :class:`revscoring.Datasource`'s that return
a flat `dict` of key-value pairs (aka a "table") and filter ("select") keys
and/or weight values.

.. autoclass:: revscoring.datasources.meta.selectors.tfidf

.. autoclass:: revscoring.datasources.meta.selectors.filter_keys

"""

from collections import defaultdict
from math import log

from ..datasource import Datasource


class tfidf(Datasource):
    """
    Selects a subset of a frequency table based on term utility and applies
    TF-iDF weighting.

    :meth:`fit` must be called before the datasource is processed, because
    :meth:`process` and :meth:`keys` read the statistics that :meth:`fit`
    stores on the instance (``document_freq`` and ``document_n``).

    :Parameters:
        table_datasource : :class:`revscoring.Datasource`
            A datasource that generates a dict of term frequency counts
        max_terms : `int`
            The maximum number of terms that will be selected.  The terms
            with the highest proportional representation in a label class
            are selected.  If `None`, every observed term is kept.
        weight : `bool`
            Should TF-iDF weighting be applied to output counts?
        boolean : `bool`
            Normalize counts to 0 (not in document) and 1 (in document).  Note
            that negative frequencies will be converted to -1.
        name : `str`
            A name for the datasource.
    """

    def __init__(self, table_datasource, max_terms=None, weight=True,
                 boolean=False, name=None):
        name = self._format_name(
            name, [table_datasource, max_terms, weight, boolean])
        super().__init__(name, self.process,
                         depends_on=[table_datasource])
        self.max_terms = int(max_terms) if max_terms is not None else None
        self.weight = weight
        self.boolean = boolean

    def _normalize_freq(self, freq):
        """
        Returns `freq` unchanged, or its sign (-1, 0 or 1) in boolean mode.
        """
        if not self.boolean:
            return freq
        # A zero count means "not in document" and must stay 0 rather than
        # being lumped in with negative frequencies.
        return (freq > 0) - (freq < 0)

    def fit(self, value_labels):
        """
        Learns per-term frequencies from labeled observations.

        :Parameters:
            value_labels : `iterable` ( (`values`, `label`) )
                Pairs of observation values and a label.  The first element
                of `values` is used as the term frequency table.
        """
        # Count up document frequencies and label frequencies.
        # `defaultdict(int)` (rather than a lambda factory) keeps the fitted
        # instance picklable when no term selection replaces the table.
        self.document_freq = defaultdict(int)
        self.document_n = 0
        label_freq = defaultdict(lambda: defaultdict(int))
        label_n = defaultdict(int)
        for values, label in value_labels:
            table = values[0]
            self.document_n += 1
            label_n[label] += 1
            for term, freq in table.items():
                freq = self._normalize_freq(freq)
                self.document_freq[term] += freq
                label_freq[label][term] += freq

        # Select terms
        if self.max_terms is not None:
            self.document_freq = \
                self._select_terms(label_freq, label_n)

    def _select_terms(self, label_freq, label_n):
        """
        Returns a dict of at most `max_terms` terms (mapped to their overall
        frequency) that have the highest absolute utility for any label.
        """
        term_utilities = []
        for label, table in label_freq.items():
            for term, term_label_freq in table.items():
                utility = term_utility(
                    term_label_freq, label_n[label],
                    self.document_freq[term], self.document_n)
                # The label is not needed for selection; leaving it out of
                # the tuple avoids comparing labels when utilities tie.
                term_utilities.append((abs(utility), term))

        term_utilities.sort(reverse=True)
        new_document_freq = {}
        for _, term in term_utilities:
            # A term may show up once per label, so count distinct terms.
            if len(new_document_freq) >= self.max_terms:
                break
            new_document_freq[term] = self.document_freq[term]

        return new_document_freq

    def keys(self):
        """
        Returns the terms retained by :meth:`fit`.
        """
        return self.document_freq.keys()

    def process(self, table):
        """
        Filters `table` down to the fitted terms and, if `weight` is set,
        scales each count by ``log(document_n / document_freq[term])``.
        """
        new_table = {}
        for term, freq in table.items():
            if term not in self.document_freq:
                continue
            freq = self._normalize_freq(freq)
            if self.weight:
                # The denominator is floored at 1 so that zero or negative
                # fitted frequencies cannot break the division or the log.
                new_table[term] = \
                    freq * log(self.document_n /
                               max(self.document_freq[term], 1))
            else:
                new_table[term] = freq

        return new_table


def term_utility(label_freq, label_n, document_freq, document_n,
                 min_extra_prop=0.001):
    """
    Returns the ratio between a term's frequency per document inside a label
    class and its frequency per document outside of that class.

    :Parameters:
        label_freq : `int`
            The term's frequency within the label class
        label_n : `int`
            The number of documents in the label class
        document_freq : `int`
            The term's frequency across all documents
        document_n : `int`
            The total number of documents
        min_extra_prop : `float`
            Lower bound applied to the outside-of-class proportion so that
            the ratio stays finite.
    """
    within_label_prop = label_freq / label_n
    extra_label_n = document_n - label_n
    if extra_label_n:
        extra_label_prop = (document_freq - label_freq) / extra_label_n
    else:
        # Every document belongs to this label class, so there is nothing
        # outside of it to compare against.
        extra_label_prop = 0
    return within_label_prop / max(extra_label_prop, min_extra_prop)


class filter_keys(Datasource):
    """
    Selects a subset of features (key/values) based on a set of keys.

    :Parameters:
        table_datasource : :class:`revscoring.Datasource`
            A datasource that generates the table to select from
        keys : `iterable` ( `hashable` )
            The keys to select from the table
        name : `str`
            A name for the datasource.
    """

    def __init__(self, table_datasource, keys, name=None):
        formatted_name = self._format_name(name, [table_datasource, keys])
        super().__init__(formatted_name, self.process,
                         depends_on=[table_datasource])
        # Stored as a set, so duplicate keys collapse.
        self.keys = set(keys)

    def process(self, table):
        """
        Returns a new dict holding only the selected keys that are present
        in `table`.
        """
        return {key: table[key] for key in self.keys if key in table}
# Source code for revscoring.datasources.meta.selectors
#
# Revscoring
#
# Navigation
#
# Related Topics