Source code for revscoring.features.wikibase.features.diff

import re

import mwbase

from revscoring.dependencies import DependentSet

from ...feature import Feature
from ...meta import aggregators, bools


class Diff(DependentSet):
    """
    A set of features describing what changed between a parent entity and
    a revision entity, built on top of a collection of diff datasources.

    Most features are simple counts (``aggregators.len``) of the matching
    datasource; the ``proportion_of_*`` features and ``identifiers_changed``
    are computed by the module-level ``_process_*`` functions.

    :Parameters:
        name : `str`
            The base name used to prefix the generated feature names
        datasources : object
            The diff datasources the features read from. It must expose the
            attributes referenced below (e.g. ``sitelinks_added``,
            ``parent_entity``, ``revision_entity``).
    """

    def __init__(self, name, datasources):
        super().__init__(name)
        self.datasources = datasources

        # Sitelinks
        self.sitelinks_added = \
            aggregators.len(self.datasources.sitelinks_added)
        "`int` : The number of sitelinks added"
        self.sitelinks_removed = \
            aggregators.len(self.datasources.sitelinks_removed)
        "`int` : The number of sitelinks removed"
        self.sitelinks_changed = \
            aggregators.len(self.datasources.sitelinks_changed)
        "`int` : The number of sitelinks changed"

        # Labels
        self.labels_added = aggregators.len(self.datasources.labels_added)
        "`int` : The number of labels added"
        self.labels_removed = aggregators.len(self.datasources.labels_removed)
        "`int` : The number of labels removed"
        self.labels_changed = aggregators.len(self.datasources.labels_changed)
        "`int` : The number of labels changed"

        # Aliases
        self.aliases_added = aggregators.len(self.datasources.aliases_added)
        "`int` : The number of aliases added"
        self.aliases_removed = \
            aggregators.len(self.datasources.aliases_removed)
        "`int` : The number of aliases removed"
        self.aliases_changed = \
            aggregators.len(self.datasources.aliases_changed)
        "`int` : The number of aliases changed"

        # Descriptions
        self.descriptions_added = \
            aggregators.len(self.datasources.descriptions_added)
        "`int` : The number of descriptions added"
        self.descriptions_removed = \
            aggregators.len(self.datasources.descriptions_removed)
        "`int` : The number of descriptions removed"
        self.descriptions_changed = \
            aggregators.len(self.datasources.descriptions_changed)
        "`int` : The number of descriptions changed"

        # Properties
        self.properties_added = \
            aggregators.len(self.datasources.properties_added)
        "`int` : The number of properties added"
        self.properties_removed = \
            aggregators.len(self.datasources.properties_removed)
        "`int` : The number of properties removed"
        self.properties_changed = \
            aggregators.len(self.datasources.properties_changed)
        "`int` : The number of properties changed"

        # Claims
        self.statements_added = \
            aggregators.len(self.datasources.statements_added)
        "`int` : The number of statements/claims added"
        self.claims_added = \
            aggregators.len(self.datasources.claims_added)
        # Backwards compatible
        "`int` : The number of statements/claims added"
        self.statements_removed = \
            aggregators.len(self.datasources.statements_removed)
        "`int` : The number of statements/claims removed"
        self.claims_removed = \
            aggregators.len(self.datasources.claims_removed)
        # Backwards compatible
        "`int` : The number of statements/claims removed"
        self.statements_changed = \
            aggregators.len(self.datasources.statements_changed)
        "`int` : The number of statements/claims changed"
        self.claims_changed = \
            aggregators.len(self.datasources.claims_changed)
        # Backwards compatible
        "`int` : The number of statements/claims changed"

        # Sources
        self.sources_added = aggregators.len(self.datasources.sources_added)
        "`int` : The number of sources added"
        self.sources_removed = \
            aggregators.len(self.datasources.sources_removed)
        "`int` : The number of sources removed"

        # Qualifiers
        self.qualifiers_added = \
            aggregators.len(self.datasources.qualifiers_added)
        "`int` : The number of qualifiers added"
        self.qualifiers_removed = \
            aggregators.len(self.datasources.qualifiers_removed)
        "`int` : The number of qualifiers removed"

        # Badges
        self.badges_added = aggregators.len(self.datasources.badges_added)
        "`int` : The number of badges added"
        self.badges_removed = aggregators.len(self.datasources.badges_removed)
        "`int` : The number of badges removed"
        self.badges_changed = aggregators.len(self.datasources.badges_changed)
        "`int` : The number of badges changed"

        # AF/38
        self.proportion_of_qid_added = Feature(
            name + ".proportion_of_qid_added",
            _process_proportion_of_qid_added,
            returns=float,
            depends_on=[self.datasources.parent_entity,
                        self.datasources.revision_entity]
        )
        "`float` : The proportion of Q# added."

        # AF/38
        self.proportion_of_language_added = Feature(
            name + ".proportion_of_language_added",
            _process_proportion_of_language_added,
            returns=float,
            depends_on=[self.datasources.parent_entity,
                        self.datasources.revision_entity]
        )
        "`float` : The proportion of language added."

        self.proportion_of_links_added = Feature(
            name + ".proportion_of_links_added",
            _process_proportion_of_links_added,
            returns=float,
            depends_on=[self.datasources.parent_entity,
                        self.datasources.revision_entity]
        )
        "`float` : The proportion of links added."

        self.identifiers_changed = Feature(
            name + ".identifiers_changed",
            _process_identifiers_changed,
            returns=int,
            depends_on=[self.datasources.claims_changed]
        )
        "`int` : The number of identifiers that were changed"

    def property_changed(self, property, name=None):
        """
        Returns a :class:`revscoring.Feature` that represents whether a
        property was changed.

        :Parameters:
            property : `str`
                The property name
            name : `str`
                A name to associate with the feature.  If not set, the
                feature's name will be 'property_changed(<property>)'
        """
        if name is None:
            # Default name embeds the repr() of the property so that the
            # generated feature name is unambiguous (quotes included).
            name = self._name + ".property_changed({0})" \
                .format(repr(property))

        return bools.item_in_set(property, self.datasources.properties_changed,
                                 name=name)
def _process_proportion_of_qid_added(parent_entity, revision_entity): parent_entity_doc = parent_entity if parent_entity is not None else {} re_qid = re.compile(r'Q\d{1,8}') revision_entity_qids = len(re.findall( re_qid, mwbase.json_dumps(revision_entity))) parent_entity_qids = len(re.findall( re_qid, mwbase.json_dumps(parent_entity_doc))) return float(revision_entity_qids - parent_entity_qids) / \ float(revision_entity_qids + 1) # AF/8 LANGUAGE_REGEXES = (r"(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|" r"[ei]c|ian?|isc?h)|rmenian?|ssamese|azeri|z[eə]rba" r"(ijani?|ycan(ca)?|yjan)|нглийский)|b(ahasa( (indonesia|" r"jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|" r"elarusian?|okmål|osanski|ra[sz]il(ian?)?|ritish( " r"kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?" r"|zech|roat([eo]|ian?)|atal[aà]n?|рпски|antonese)|[cč]" r"(esky|e[sš]tina)|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]" r"nika|ng(els|le(ski|za)|lisc?h)|spa(g?[nñ]h?i?ol|nisc?h)" r"|speranto|stonian|usk[ae]ra)|f(ilipino|innish|ran[cç]" r"(ais|e|ez[ao])|ren[cs]h|arsi|rancese)|g(al(ego|ician)|" r"uja?rati|ree(ce|k)|eorgian|erman[ay]?|ilaki)|h(ayeren|" r"ebrew|indi|rvatski|ungar(y|ian))|i(celandic|ndian?|" r"ndonesian?|ngl[eê]se?|ngilizce|tali(ano?|en(isch)?))|" r"ja(pan(ese)?|vanese)|k(a(nn?ada|zakh)|hmer|o(rean?|" r"sova)|urd[iî])|l(at(in[ao]?|vi(an?|e[sš]u))|ietuvi[uų]" r"|ithuanian?)|m(a[ck]edon(ian?|ski)|agyar|alay(alam?|" r"sian?)?|altese|andarin|arathi|elayu|ontenegro|ongol" r"(ian?)|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|" r"orsk( bokm[aå]l)?|ynorsk)|o(landese|dia)|p(ashto|" r"ersi?an?|ol(n?isc?h|ski)|or?tugu?[eê]se?(( d[eo])? " r"brasil(eiro)?| ?\(brasil\))?|unjabi)|r(om[aâi]ni?[aă]n?" 
r"|um(ano|änisch)|ussi([ao]n?|sch))|s(anskrit|erbian|" r"imple english|inha?la|lov(ak(ian?)?|enš?[cč]ina|" r"en(e|ij?an?)|uomi)|erbisch|pagnolo?|panisc?h|rbeska|" r"rpski|venska|c?wedisc?h|hqip)|t(a(galog|mil)|elugu|" r"hai(land)?|i[eế]ng vi[eệ]t|[uü]rk([cç]e|isc?h|iş|ey))|" r"u(rdu|zbek)|v(alencia(no?)?|ietnamese)|welsh|(англиис|" r"[kк]алмыкс|[kк]азахс|немец|[pр]усс|[yу]збекс|" r"татарс)кий( язык)??|עברית|[kкқ](аза[кқ]ша|ыргызча|" r"ирилл)|українськ(а|ою)|б(еларуская|" r"ългарски( език)?)|ελλ[ηι]" r"νικ(ά|α)|ქართული|हिन्दी|ไทย|[mм]онгол(иа)?|([cс]рп|" r"[mм]акедон)ски|العربية|日本語|한국(말|어)|‌हिनद़ि|" r"বাংলা|ਪੰਜਾਬੀ|मराठी|ಕನ್ನಡ|اُردُو|தமிழ்|తెలుగు|ગુજરાતી|" r"فارسی|پارسی|മലയാളം|پښتو|မြန်မာဘာသာ|中文(简体|繁體)?|" r"中文((简体?|繁體))|简体|繁體)") LANGUAGE_RE = re.compile(LANGUAGE_REGEXES) def _process_proportion_of_language_added(parent_entity, revision_entity): parent_entity_doc = parent_entity if parent_entity is not None else {} revision_entity_res = len(re.findall(LANGUAGE_RE, mwbase.json_dumps(revision_entity))) parent_entity_res = len(re.findall(LANGUAGE_RE, mwbase.json_dumps(parent_entity_doc))) return float(revision_entity_res - parent_entity_res) / \ float(revision_entity_res + 1) def _process_proportion_of_links_added(parent_entity, revision_entity): parent_entity_doc = parent_entity if parent_entity is not None else {} re_qid = re.compile(r'https?\://|wwww\.') revision_entity_res = len(re.findall(re_qid, mwbase.json_dumps(revision_entity))) parent_entity_res = len(re.findall(re_qid, mwbase.json_dumps(parent_entity_doc))) return float(revision_entity_res - parent_entity_res) / \ float(revision_entity_res + 1) def _process_identifiers_changed(changed_claims): counter = 0 for old, new in changed_claims: if isinstance(old.claim.datavalue, mwbase.String): counter += 1 return counter