User:Erutuon/scripts/normalize.py

From Wiktionary, the free dictionary
Jump to navigation Jump to search
#! /usr/bin/env python3
# coding: utf-8

from datetime import timedelta
from edit_log import DiffEditLog
from pywikibot import Site, Page, Timestamp
from pywikibot.pagegenerators import RecentChangesPageGenerator
import regex as re
from sys import argv
from traceback import print_exc

# Section headers (mostly parts of speech) that are expected to be followed
# by a headword-line template; one header name per line.
POSHeaders = """Abbreviation
Acronym
Adjectival noun
Adjective
Adnominal
Adverb
Affix
Article
Circumfix
Classifier
Combining form
Conjugation
Conjunction
Contraction
Counter
Declension
Definitions
Determiner
Diacritical mark
Gerund
Hanja
Hanzi
Idiom
Infix
Initialism
Interfix
Interjection
Kanji
Letter
Ligature
Logogram
Noun
Number
Numeral
Ordinal number
Participle
Particle
Phrase
Postposition
Predicative
Prefix
Preposition
Prepositional phrase
Pronoun
Pronunciation
Proper noun
Proverb
Punctuation mark
Relative
Romanization
Root
Stem
Suffix
Syllable
Symbol
Verb
Verbal noun"""

# Matches one of the headers above (through its closing "=" signs, group 1),
# the following "{{...}}" headword-template line (group 2), and the blank
# lines up to a "#" definition line, so the whitespace between them can be
# rewritten in one substitution.
POS_header_and_template = (r"((?:"
    + POSHeaders.replace("\n", "|")
    + r")\s*=+)[ \t]*(?:\n[ \t]*)+(\{\{[^\n]+)\n+(?=#)")

def normalize(text):
    """Apply [[WT:NORM]]-style whitespace normalizations to entry wikitext
    and return the normalized text."""
    # Tabs become spaces everywhere.
    text = text.replace("\t", " ")
    # Ensure a space after list/definition markers at the start of a line
    # ("#foo" -> "# foo").
    text = re.sub(r"^([;:#*]+)(?=[^;:#*\s])", r"\1 ", text, flags = re.MULTILINE)
    # Exactly one blank line before a header line.
    text = re.sub(r"(?:\n *)*\n==", "\n\n==", text)
    # No blank line between the page's leading non-header content and the
    # first header (pattern is anchored at the start of the string).
    text = re.sub(r"^((?!=)[^\n]*(?:\n(?!=)[^\n]*?)*?)\n+==", r"\1\n==", text)
    # No blank line after a header line.
    text = re.sub(r"==\n\n+(?!=)", "==\n", text)
    # Single newline between PoS header and headword template;
    # two newlines after headword template.
    text = re.sub(POS_header_and_template, r"\1\n\2\n\n", text)
    # Drop horizontal rules at the very start or end of the page...
    text = re.sub(r"^\n*----+\n*|\n+----+\s*$", "", text)
    # ...and give the remaining ones one blank line on each side.
    text = re.sub(r"\n+----+\s+", "\n\n----\n\n", text)
    # Strip trailing and leading spaces on lines.
    text = re.sub(r"\n +| +\n", "\n", text)
    # Remove padding inside header markup ("== Noun ==" -> "==Noun==").
    text = re.sub(r"(^|\n)(=+) *([^\n]+?) *\2(\n|$)", r"\1\2\3\2\4", text)
    return text

def run(site, pages, summary):
    """Normalize each non-redirect page in ``pages`` on ``site`` and save a
    diff log to [[WT:NORM]] with the given edit ``summary``.

    Pages whose text is already normalized are left alone; a failure to save
    one page is reported and does not stop the run.
    """
    log = DiffEditLog(site, "WT:NORM")
    try:
        # preloadpages fetches page text in batches, reducing API round trips.
        for page in site.preloadpages(pages):
            if not page.isRedirectPage():
                old_text = page.text
                new_text = normalize(old_text)
                if old_text != new_text:
                    try:
                        page.text = new_text
                        page.save(summary = "[[WT:NORM|normalize]]")
                        log.add(page.title(), (old_text, new_text))
                    except Exception as e:
                        # Best-effort: report and move on to the next page.
                        print(f"Could not save edit on [[{page.title()}]]: {e}")
    except KeyboardInterrupt:
        # Allow a clean manual stop; edits made so far are still logged below.
        pass
    except Exception:
        # Unused "as e" binding removed; print_exc supplies the details.
        print("Error while processing pages")
        print_exc()
    log.save(summary = summary)

def remove_duplicates(iterable):
    """Yield the items of *iterable* in order, skipping any item that has
    already been yielded."""
    seen = set()
    for element in iterable:
        if element in seen:
            continue
        seen.add(element)
        yield element

# Total number of pages to process, taken from the first command-line
# argument (must be a non-negative integer).
if len(argv) < 2 or not argv[1].isdigit():
    raise TypeError("Expected total number of pages as first argument")
total = int(argv[1])

# Recent changes between two days and one day ago, oldest first, top
# revisions only, skipping redirects; only changes carrying the "WT:NORM"
# tag are considered.  (Indentation here was tabs; normalized to spaces to
# match the rest of the file.)
pages = remove_duplicates(RecentChangesPageGenerator(
    total = total,
    tag = "WT:NORM",
    start = Timestamp.now() - timedelta(days = 2),
    end = Timestamp.now() - timedelta(days = 1),
    reverse = True,
    topOnly = True,
    showRedirects = False))

run(site = Site(code = "en", fam = "wiktionary"), pages = pages, summary = "save bot log")