Source code for revscoring.features.wikitext.features.parsed

import re

from textstat.textstat import textstat

from ...feature import Feature
from ...meta import aggregators


class Revision:

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.content_chars = aggregators.len(
            self.datasources.content,
            name=self._name + ".content_chars"
        )
        """
        `int` : The number of characters of viewable content (no markup or
        templates
        """

        self.flesh_kincaid = Feature(
            self._name + ".flesh_kincaid",
            textstat.flesch_reading_ease,
            depends_on=[self.datasources.content],
            returns=float
        )
        """
        `float` : returns the Flesch reading ease score.
        (https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests)
        """

        self.headings = aggregators.len(
            self.datasources.headings,
            name=self._name + ".headings"
        )
        "`int` : The number of headings"

        self.external_links = aggregators.len(
            self.datasources.external_links,
            name=self._name + ".external_links"
        )
        "`int` : The number of external links"

        self.wikilinks = aggregators.len(
            self.datasources.wikilinks,
            name=self._name + ".wikilinks"
        )
        "`int` : The number of wikilinks (internal to other pages in the wiki)"

        self.tags = aggregators.len(
            self.datasources.tags,
            name=self._name + ".tags"
        )
        "`int` : The number of HTML tags"

        self.ref_tags = aggregators.len(
            self.datasources.tag_names_matching(r"ref"),
            name=self._name + ".ref_tags"
        )
        "`int` : The number of <ref> tags"

        self.templates = aggregators.len(
            self.datasources.templates,
            name=self._name + ".templates"
        )
        "`int` : The number of templates"

[docs] def heading_titles_matching(self, regex, name=None): """ Constructs a :class:`revscoring.Feature` that that generates a count of header titles that match a regular expression. """ if not hasattr(regex, "pattern"): regex = re.compile(regex, re.I) if name is None: name = "{0}({1})".format(self._name + ".heading_titles_matching", regex.pattern) return aggregators.len( self.datasources.heading_titles_matching(regex), name=name )
[docs] def headings_by_level(self, level, name=None): """ Constructs a :class:`revscoring.Datasource` that generates a count of all headers of a level. """ if name is None: name = "{0}({1})".format(self._name + ".headings_by_level", level) return aggregators.len( self.datasources.headings_by_level(level), name=name )
[docs] def tag_names_matching(self, regex, name=None): """ Constructs a :class:`revscoring.Datasource` that generates a count of tag names that match a regular expression. """ if not hasattr(regex, "pattern"): regex = re.compile(regex, re.I) if name is None: name = "{0}({1})" \ .format(self._name + ".tag_names_matching", regex.pattern) return aggregators.len( self.datasources.tag_names_matching(regex), name=name )
[docs] def template_names_matching(self, regex, name=None): """ Constructs a :class:`revscoring.Feature` that generates a count of template names that match a regular expression. """ if not hasattr(regex, "pattern"): regex = re.compile(regex, re.I) if name is None: name = "{0}({1})" \ .format(self._name + ".template_names_matching", regex.pattern) return aggregators.len( self.datasources.template_names_matching(regex), name=name )