Source code for revscoring.languages.features.stopwords.features
from ....datasources.meta import dicts, filters
from ....dependencies import DependentSet
from ....features.meta import aggregators
class Revision(DependentSet):
    """
    Stopword count features for the content of a revision.

    :Parameters:
        name : `str`
            Base name of this feature set.  Nested sets are named
            ``name + ".parent"`` and ``name + ".diff"``.
        revision_datasources
            A set of datasources exposing ``stopwords`` and
            ``non_stopwords`` and, optionally, ``parent`` and ``diff``
            sub-sets.
    """

    def __init__(self, name, revision_datasources):
        super().__init__(name)

        self.datasources = revision_datasources

        self.stopwords = aggregators.len(self.datasources.stopwords)
        "`int` : A count of the number of stopwords in the content"

        self.non_stopwords = aggregators.len(
            self.datasources.non_stopwords
        )
        "`int` : A count of the number of non-stopwords in the content"

        # The nested feature sets only exist when the underlying
        # datasources provide the matching sub-set.
        if hasattr(self.datasources, 'parent'):
            self.parent = Revision(name + ".parent", self.datasources.parent)
            """
            :class:`~revscoring.languages.features.stopwords.Revision` : The
            parent revision
            """

        if hasattr(self.datasources, 'diff'):
            self.diff = Diff(name + '.diff', self.datasources.diff)
            """
            :class:`~revscoring.languages.features.stopwords.Diff` : The
            stopword features of the revision's diff (stopwords added and
            removed, word frequency deltas)
            """
class Diff(DependentSet):
    """
    Stopword features of a diff: counts of (non-)stopwords added and
    removed, plus sums over absolute and proportional word frequency
    deltas.

    :Parameters:
        name : `str`
            Base name of this feature set.  It is used as the prefix of
            every explicitly named aggregate feature
            (e.g. ``name + ".stopword_delta_sum"``).
        diff_datasources
            A set of datasources exposing ``stopwords_added``,
            ``stopwords_removed``, ``non_stopwords_added``,
            ``non_stopwords_removed``, ``stopword_delta``,
            ``non_stopword_delta``, ``stopword_prop_delta`` and
            ``non_stopword_prop_delta``.
    """

    def __init__(self, name, diff_datasources):
        super().__init__(name)

        self.datasources = diff_datasources

        # Simple counts (based on wikitext.edit.diff)
        self.stopwords_added = aggregators.len(
            self.datasources.stopwords_added
        )
        "`int` : A count of stopwords added"

        self.stopwords_removed = aggregators.len(
            self.datasources.stopwords_removed
        )
        "`int` : A count of stopwords removed"

        self.non_stopwords_added = aggregators.len(
            self.datasources.non_stopwords_added
        )
        "`int` : A count of non-stopwords added"

        self.non_stopwords_removed = aggregators.len(
            self.datasources.non_stopwords_removed
        )
        "`int` : A count of non-stopwords removed"

        # Word frequency deltas
        stopword_delta_values = dicts.values(self.datasources.stopword_delta)

        self.stopword_delta_sum = aggregators.sum(
            stopword_delta_values,
            name=name + ".stopword_delta_sum",
            returns=int
        )
        "`int` : The sum of word frequency deltas for stopwords"

        self.stopword_delta_increase = aggregators.sum(
            filters.positive(stopword_delta_values),
            name=name + ".stopword_delta_increase",
            returns=int
        )
        "`int` : The sum of word frequency delta increases for stopwords"

        self.stopword_delta_decrease = aggregators.sum(
            filters.negative(stopword_delta_values),
            name=name + ".stopword_delta_decrease",
            returns=int
        )
        "`int` : The sum of word frequency delta decreases for stopwords"

        non_stopword_delta_values = dicts.values(
            self.datasources.non_stopword_delta
        )

        self.non_stopword_delta_sum = aggregators.sum(
            non_stopword_delta_values,
            name=name + ".non_stopword_delta_sum",
            returns=int
        )
        "`int` : The sum of word frequency deltas for non-stopwords"

        self.non_stopword_delta_increase = aggregators.sum(
            filters.positive(non_stopword_delta_values),
            name=name + ".non_stopword_delta_increase",
            returns=int
        )
        "`int` : The sum of word frequency delta increases for non-stopwords"

        self.non_stopword_delta_decrease = aggregators.sum(
            filters.negative(non_stopword_delta_values),
            name=name + ".non_stopword_delta_decrease",
            returns=int
        )
        "`int` : The sum of word frequency delta decreases for non-stopwords"

        # Proportional word frequency deltas.  Unlike the sums above, no
        # ``returns=int`` is passed; the attribute docs describe these
        # features as `float`.
        stopword_prop_delta_values = dicts.values(
            self.datasources.stopword_prop_delta
        )

        self.stopword_prop_delta_sum = aggregators.sum(
            stopword_prop_delta_values,
            name=name + ".stopword_prop_delta_sum"
        )
        "`float` : The sum of proportional word frequency deltas for stopwords"

        self.stopword_prop_delta_increase = aggregators.sum(
            filters.positive(stopword_prop_delta_values),
            name=name + ".stopword_prop_delta_increase"
        )
        """
        `float` : The sum of proportional word frequency delta increases for
        stopwords
        """

        self.stopword_prop_delta_decrease = aggregators.sum(
            filters.negative(stopword_prop_delta_values),
            name=name + ".stopword_prop_delta_decrease"
        )
        """
        `float` : The sum of proportional word frequency delta decreases for
        stopwords
        """

        non_stopword_prop_delta_values = dicts.values(
            self.datasources.non_stopword_prop_delta
        )

        self.non_stopword_prop_delta_sum = aggregators.sum(
            non_stopword_prop_delta_values,
            name=name + ".non_stopword_prop_delta_sum"
        )
        """
        `float` : The sum of proportional word frequency deltas for
        non-stopwords
        """

        self.non_stopword_prop_delta_increase = aggregators.sum(
            filters.positive(non_stopword_prop_delta_values),
            name=name + ".non_stopword_prop_delta_increase"
        )
        """
        `float` : The sum of proportional word frequency delta increases for
        non-stopwords
        """

        self.non_stopword_prop_delta_decrease = aggregators.sum(
            filters.negative(non_stopword_prop_delta_values),
            name=name + ".non_stopword_prop_delta_decrease"
        )
        """
        `float` : The sum of proportional word frequency delta decreases for
        non-stopwords
        """