# Source code for revscoring.datasources.meta.extractors
"""
These meta-datasources operate on instances of :class:`revscoring.Datasource`
that return a `str` or a `list` ( `str` ) and extract information from them.
.. autoclass:: revscoring.datasources.meta.extractors.regex
"""
import re
from ..datasource import Datasource
class regex(Datasource):
    """
    Generates a list of strings that match any of a set of provided `regexes`

    :Parameters:
        regexes : `list` ( `str` )
            A list of regexes to find in the text
        text_datasource : :class:`revscoring.Datasource`
            A datasource that returns a `str` or a `list` of `str`
        regex_flags : `int`
            A set of regex flags to use in matching
        wrapping : ( `str`, `str` )
            Wrap all regexes with these values.  This is useful for languages
            that *have* word boundaries.  A false-y value disables wrapping.
        exclusions : `list` ( `str` )
            A list of regexes used to filter the results.  They are joined
            and wrapped the same way as `regexes`; a match is dropped when
            the resulting pattern matches at the start of the matched
            string.  `None` or an empty list disables filtering.
        name : `str`
            A name for the new datasource
    """

    def __init__(self, regexes, text_datasource, regex_flags=re.I,
                 wrapping=(r'\b', r'\b'), exclusions=None, name=None):
        wrapping = wrapping or ("", "")

        def build_pattern(alternatives):
            # Three capture groups: prefix wrapper, the alternation itself,
            # suffix wrapper.  `process` relies on the alternation being
            # group 2.
            return "({0})({1})({2})".format(
                wrapping[0], "|".join(alternatives), wrapping[1])

        self.group_re = re.compile(build_pattern(regexes), flags=regex_flags)

        # Test for truthiness rather than `is not None`: an empty list would
        # otherwise compile to an empty alternation, which matches at the
        # start of (nearly) every string and silently excludes every result.
        if exclusions:
            self.exclude_re = re.compile(build_pattern(exclusions),
                                         flags=regex_flags)
        else:
            self.exclude_re = None

        name = self._format_name(name, [regexes, text_datasource])
        super().__init__(name, self.process, depends_on=[text_datasource])

    def process(self, text_or_texts):
        """
        Returns the list of matched strings found in `text_or_texts`, which
        may be `None` (yields an empty list), a single `str` or an iterable
        of `str`.  Matches accepted by the exclusion pattern are left out.
        """
        if text_or_texts is None:
            return []

        # Treat a single string as a one-item collection so that both input
        # shapes share the same extraction logic.
        if isinstance(text_or_texts, str):
            texts = [text_or_texts]
        else:
            texts = text_or_texts

        # getattr keeps the original tolerance for instances that have no
        # `exclude_re` attribute at all.
        # NOTE(review): presumably this covers instances deserialized from
        # before exclusions were supported -- confirm.
        exclude_re = getattr(self, 'exclude_re', None)

        return [match.group(2)
                for text in texts
                for match in self.group_re.finditer(text)
                if exclude_re is None or
                not exclude_re.match(match.group(2))]