# Source code for revscoring.datasources.meta.selectors
"""
These meta-datasources operate on :class:`revscoring.Datasource` instances
that return a flat `dict` of key-value pairs (aka a "table") and filter
("select") keys and/or weight values.
.. autoclass:: revscoring.datasources.meta.selectors.tfidf
.. autoclass:: revscoring.datasources.meta.selectors.filter_keys
"""
from collections import defaultdict
from math import log
from ..datasource import Datasource
class tfidf(Datasource):
    """
    Selects a subset of a frequency table based on term utility and applies
    TF-iDF weighting.

    :Parameters:
        table_datasource : :class:`revscoring.Datasource`
            A datasource that generates a dict of term frequency counts
        max_terms : `int`
            The maximum number of terms that will be selected.  The terms
            with the highest proportional representation in a label class
            are selected.  If `None`, every term seen while fitting is kept.
        weight : `bool`
            Should TF-iDF weighting be applied to output counts?
        boolean : `bool`
            Normalize counts to 1 (frequency greater than zero) or -1 (any
            other frequency present in the table, including exactly zero).
            Terms that are absent from the table are left out of the output.
        name : `str`
            A name for the datasource.
    """

    def __init__(self, table_datasource, max_terms=None, weight=True,
                 boolean=False, name=None):
        name = self._format_name(
            name, [table_datasource, max_terms, weight, boolean])
        super().__init__(name, self.process,
                         depends_on=[table_datasource])
        self.max_terms = int(max_terms) if max_terms is not None else None
        self.weight = weight
        self.boolean = boolean

    def fit(self, value_labels):
        """
        Learns document frequencies from labeled observations.

        :Parameters:
            value_labels : `iterable` ( (`values`, `label`) )
                Pairs of dependency values and a hashable label.  The
                frequency table is read from ``values[0]``.
        """
        # Count up document frequencies and label frequencies.
        # `int` is used as the default factory instead of a lambda: it yields
        # the same default of 0, but a lambda stored on the instance would
        # make the fitted table impossible to pickle.
        self.document_freq = defaultdict(int)
        self.document_n = 0
        label_freq = defaultdict(lambda: defaultdict(int))
        label_n = defaultdict(int)
        for values, label in value_labels:
            table = values[0]
            self.document_n += 1
            label_n[label] += 1
            for term, freq in table.items():
                if self.boolean:
                    freq = 1 if freq > 0 else -1
                self.document_freq[term] += freq
                label_freq[label][term] += freq

        # Select terms
        if self.max_terms is not None:
            self.document_freq = \
                self._select_terms(label_freq, label_n)

    def _select_terms(self, label_freq, label_n):
        """
        Returns a plain `dict` holding the document frequency of at most
        `max_terms` terms, picked in descending order of absolute utility
        (see :func:`term_utility`).
        """
        term_utilities = []
        for label, table in label_freq.items():
            # A distinct name avoids shadowing the `label_freq` argument
            # that is being iterated.
            for term, term_label_freq in table.items():
                utility = term_utility(
                    term_label_freq, label_n[label],
                    self.document_freq[term], self.document_n)
                term_utilities.append((abs(utility), term, label))

        term_utilities.sort(reverse=True)

        # Walk the ranking once, best first.  A term can be listed once per
        # label; the dict collapses those repeats, so the size check counts
        # distinct terms only.
        new_document_freq = {}
        for _, term, _ in term_utilities:
            if len(new_document_freq) >= self.max_terms:
                break
            new_document_freq[term] = self.document_freq[term]

        return new_document_freq

    def keys(self):
        """
        Returns the terms retained by :func:`fit`.
        """
        return self.document_freq.keys()

    def process(self, table):
        """
        Filters `table` down to the fitted terms and, when `weight` is set,
        multiplies each frequency by ``log(document_n / document_freq)``.
        Requires :func:`fit` to have been called first.
        """
        new_table = {}
        for term, freq in table.items():
            if self.boolean:
                freq = 1 if freq > 0 else -1
            if term in self.document_freq:
                if self.weight:
                    # The divisor is floored at 1 so that zero or negative
                    # document frequencies cannot break the logarithm.
                    new_table[term] = \
                        freq * log(self.document_n /
                                   max(self.document_freq[term], 1))
                else:
                    new_table[term] = freq

        return new_table
def term_utility(label_freq, label_n, document_freq, document_n):
    """
    Scores how strongly a term is associated with one label as the ratio of
    its per-document rate inside the label to its rate outside the label.

    :Parameters:
        label_freq : `int`
            Frequency of the term among documents with the label
        label_n : `int`
            Number of documents with the label.  Must not be zero.
        document_freq : `int`
            Frequency of the term among all documents
        document_n : `int`
            Total number of documents

    :Returns:
        A `float`.  The out-of-label rate is floored at 0.001 so that terms
        never seen outside the label still produce a finite score.
    """
    within_label_prop = label_freq / label_n

    extra_label_n = document_n - label_n
    if extra_label_n == 0:
        # Every document carries this label, so there is nothing outside it
        # to compare against.  Treat the outside rate as 0 and let the floor
        # below apply rather than dividing by zero.
        extra_label_prop = 0
    else:
        extra_label_prop = (document_freq - label_freq) / extra_label_n

    return within_label_prop / max(extra_label_prop, 0.001)
class filter_keys(Datasource):
    """
    Restricts a table (key/value pairs) to a fixed set of keys.

    :Parameters:
        table_datasource : :class:`revscoring.Datasource`
            A datasource that generates the table to be filtered
        keys : `iterable` ( `hashable` )
            The keys to select from the table
        name : `str`
            A name for the datasource.
    """

    def __init__(self, table_datasource, keys, name=None):
        formatted_name = self._format_name(name, [table_datasource, keys])
        super().__init__(
            formatted_name, self.process, depends_on=[table_datasource])
        # Stored as a set, so duplicate keys collapse.
        self.keys = set(keys)

    def process(self, table):
        """
        Returns a new `dict` with the entries of `table` whose key is one of
        the configured keys.  Configured keys missing from `table` are
        skipped.
        """
        return {
            wanted: table[wanted]
            for wanted in self.keys
            if wanted in table
        }