User:CrowleyBot/task/1

From Wiktionary, the free dictionary
Jump to navigation Jump to search

Technical details[edit]

  • Maintain a list of which section types may legitimately follow which.
    • "Synonyms" is usually a son (child section) of a POS section. It can also be L3.
    • "Alternative forms" is usually L3. It can also be a son of a POS section.
  • If something will be L3 after the process, it should be checked.
  • [1] is caused by [2]. It is partially reverted.
  • The error report of the second batch. Pages in the error report are not touched.

Source code[edit]

from prelude import  *
from botaccount import *

"normal-enwikt.py"

# Open the English Wiktionary site object and log in.
# NOTE(review): mwc, UA, UN and PWD are not defined in this file; presumably
# they come from the star-imports of prelude / botaccount -- confirm.
en = mwc.Site('en.wiktionary.org', clients_useragent = UA)
en.login(UN, PWD)

# pn (page names) and n (number of pages) must already exist before this
# point; the two commented-out lines show how they were once derived from epgl.
#pn = list(map(lambda p: p.name, epgl))
#n = len(pn)
defaultsummary = ''  # not referenced by the code visible in this file

# Per-page working arrays, all indexed by the page's position in pn.
etxtl = [en.Pages[x].text() for x in pn]  # page text as returned by the site, one entry per name in pn
esecl = [list() for i in range(n)]  # section nodes of each page (assigned by process1)
etxtn = [''] * n  # rebuilt page text (assigned by process4)
todo, todo1, todo2, todo3 = [], [], [], []  # not referenced by the code visible in this file
summary = [''] * n  # edit summary of each page (built by process4)
log = []  # (page index, new level, title, parent title) tuples appended by process4


@fct.total_ordering
class node:
    """One section heading of a wikitext page, compared and ordered by level.

    Attributes:
        kyu: current heading level; ``str()`` emits this many ``=`` per side.
        oldkyu: the level the node was constructed with.
        title: heading text. A node is truthy only while this is a ``str``.
        a, b, c: integer offsets supplied by the caller (process1 passes the
            start of the heading, the end of the heading and the end of the
            section body).
        z: text emitted right after the heading (process1 passes the newline
            run that followed it).
        t: section body text; appended by ``repr()``.
        l, r: initialised to None; not used by the code visible in this file.
        f: parent node, or None while unlinked.
        s: list of child nodes.
        tp: section type code, -1 until one is assigned.
    """

    # Nodes are mutable and compare by level only, so they stay unhashable
    # (this is what defining __eq__ already implies; stated here explicitly).
    __hash__ = None

    def __init__(self, kyu=0, title='', a=0, b=0, c=0, z='', t='', dummy=0):
        """Create a heading node.

        Args:
            kyu: heading level.
            title: heading text.
            a, b, c: offsets stored verbatim.
            z: text following the heading, stored verbatim.
            t: body text, stored verbatim.
            dummy: when truthy, the node is a placeholder whose ``title`` is
                set to this value instead of ``title``. Such a node is falsy
                whenever ``dummy`` is not a ``str``.
        """
        # Always initialise every attribute so that placeholder nodes can be
        # printed, compared and walked like regular ones.
        self.kyu, self.title, self.oldkyu = kyu, title, kyu
        self.a, self.b, self.c, self.z, self.t = a, b, c, z, t
        self.l, self.r, self.f, self.s, self.tp = None, None, None, [], -1
        if dummy:
            self.title = dummy

    def __bool__(self):
        """Return True when the title is a string (i.e. not a placeholder)."""
        return isinstance(self.title, str)

    def __eq__(self, other):
        """Nodes are equal when their current levels are equal."""
        # Defer to Python's default handling for objects without a level
        # instead of raising AttributeError (e.g. ``node == None`` is False).
        if not hasattr(other, 'kyu'):
            return NotImplemented
        return self.kyu == other.kyu

    def __lt__(self, other):
        """Order nodes by current level."""
        if not hasattr(other, 'kyu'):
            return NotImplemented
        return self.kyu < other.kyu

    def __str__(self):
        """Return the heading line, e.g. ``===Noun===`` for level 3."""
        bar = '=' * self.kyu
        return f"{bar}{self.title}{bar}"

    def __repr__(self):
        """Return the heading line immediately followed by the body text."""
        return f"{self}{self.t}"

    def printtree(self, i=0):
        """Print this heading and its descendants, indented two spaces per depth.

        Args:
            i: indentation depth of this node.
        """
        print('  ' * i + str(self))
        for child in self.s:
            child.printtree(i + 1)

def process1():
    """Split the text of page ``i`` into heading nodes and store them in ``esecl[i]``.

    The resulting list starts with a level-0 root node covering the text
    before the first heading, followed by one node per heading found. For
    every node, ``c`` is set to the start offset of the next heading (or the
    end of the text for the last one) and ``t`` to the text between ``b``
    and ``c``.
    """
    # The lazy quantifiers (*?) keep the title and the trailing blanks minimal.
    # A heading only matches when it is followed by at least one newline.
    # On en.wikt, User:Erutuon has ensured the sameness of '=', no redundant
    # spaces and no '=' in section titles, as well as no L1.
    heading_rx = re.compile(r'^(===*)\s*([^=]*?)\s*(===*)\s*?(\n+)', flags=re.MULTILINE)
    text = etxtl[i]
    size = len(text)

    sections = [node(0, '', 0, 0, 0)]
    for hit in heading_rx.finditer(text):
        # Use the shorter run of '=' when the two sides disagree.
        level = min(len(hit[1]), len(hit[3]))
        sections.append(node(level, hit[2], hit.start(), hit.end(), hit.end(), hit[4]))

    # Each section ends where the next heading begins; the last one ends at
    # the end of the text.
    stops = [sec.a for sec in sections[1:]] + [size]
    for sec, stop in zip(sections, stops):
        sec.c = stop
        sec.t = text[sec.b:stop]

    esecl[i] = sections

# Section-title table. The position of a sub-list inside typelst is the type
# code given to every title in that sub-list; typedct (built below) is the
# reverse lookup from title to code.
# "Root" has a different meaning in the Reconstruction namespace.
# "Proverbs" and "Citations" can be 9
typelst = [
    # codes 0, 1, 2: no titles
    [], [], [],
    # code 3, then code 4 (no titles)
    ['Etymology', 'Reconstruction'], [],
    # code 5, then code 6 (no titles)
    ['Pronunciation', ], [],
    # code 7
    ['Adjectival noun', 'Adjective', 'Adverb', 'Affix', 'Article', 'Classifier', 'Clitic', 'Combining form', 'Conjunction', 'Contraction', 'Counter', 'Cuneiform sign', 'Definitions', 'Determiner', 'Demonstrative', 'Dependent noun', 'Final', 'Han character', 'Hanja', 'Hanzi', 'Hán tự', 'Ideophone', 'Idiom', 'Infix', 'Interfix', 'Interjection', 'Kanji', 'Letter', 'Noun', 'Number', 'Numeral', 'Participle', 'Particle', 'Phrase', 'Postposition', 'Predicative', 'Prefix', 'Preposition', 'Prepositional phrase', 'Preverb', 'Pronoun', 'Proper noun', 'Proverb', 'Relative', 'Romanization', 'Root', 'Sign values', 'Suffix', 'Syllable', 'Symbol', 'Verb', 'Verbal noun', ],
    # code 8
    ['Conjugation', 'Declension', 'Derived characters', 'Forms', 'Inflection', 'Readings', 'Related characters', ],
    # code 9
    ['Affixed forms', 'Antonyms', 'Compounds', 'Coordinate terms', 'Derived terms', 'Descendants', 'Gallery', 'Holonyms', 'Hypernyms', 'Hyponyms', 'Idioms', 'Meronyms', 'Mutation', 'Paronyms', 'Quotations', 'Related terms', 'Synonyms', 'Translations', 'Trivia', 'Troponyms', 'Usage notes', ],
    # code 10
    ['Alternative forms', 'Alternative reconstructions', 'Alternative scripts', 'Dialectal variants', 'Notes', 'Reconstruction notes', 'Statistics', ],
    # code 11
    ['Further reading', 'See also', 'References', ],
    # code 12: no titles
    [],
    # code 13
    ['Anagrams', 'Glyph origin', ], 
]
# defaultdict(bool): looking up an unknown title yields False (and stores that
# entry in the dict as a side effect of the lookup).
typedct = defaultdict(bool)
typedct[''] = False
# NOTE(review): this loop rebinds the module-level names i, l and x. The
# process* functions read a global i, so whatever drives them has to assign i
# after this point -- confirm.
for i, l in enumerate(typelst):
    for x in l:
        typedct[x] = i

def processt():
    """Add the title of every section of page ``i`` deeper than level 2 to ``st``."""
    deep_titles = (sec.title for sec in esecl[i] if sec.kyu > 2)
    for heading in deep_titles:
        st.add(heading)

def process3():
    """Rebuild the section tree of page ``i`` and assign corrected heading levels.

    Walks the consecutive pairs of nodes in ``esecl[i]``. Each node is given a
    type code (``tp``) and is then attached to the nearest ancestor that may
    hold that type, which also rewrites the node's ``kyu``.

    Returns:
        True when every section was typed and linked. False at the first
        section that cannot be typed ("gentype") or attached ("resolve"); in
        that case a diagnostic tuple is printed and appended to ``fail1`` and
        the page index is appended to ``fail``.
    """
    esecs = esecl[i]
    def gentype(x):
        """Set ``x.tp`` from the original level and title; return False if unknown."""
        # The root (0) and level-2 headings use their level as type code.
        if x.oldkyu in [0, 2]:
            x.tp = x.kyu
            return True
        # Level-1 and level-7 headings are rejected.
        if x.oldkyu in [1, 7]:
            return False
        # Titles containing the word followed by a space (e.g. a numbered
        # "Etymology 1") get their own codes, distinct from the plain titles.
        if 'Etymology ' in x.title or 'Reconstruction ' in x.title:
            x.tp = 4
            return True
        if 'Pronunciation ' in x.title:
            x.tp = 6
            return True
        # Drop any " <digits>" from the title before the table lookup.
        key = re.sub(r' \d+', '', x.title)
        x.tp = typedct[key]
        # typedct yields False for titles it does not know.
        if x.tp == False:
            return False
        return True
    
    def link(x, y):
        """Make ``y`` a child of ``x`` and set ``y.kyu`` one below its parent."""
        x.s.append(y)
        y.f = x
        # Children of the root are level 2; everything else is parent level + 1.
        if x.kyu == 0:
            y.kyu = 2
        else:
            y.kyu = x.kyu + 1
    
    def canlink(x, y):
        """Return True if a node of type ``y.tp`` may be a child of type ``x.tp``."""
        # Allowed (parent type, child type) pairs.
        # NOTE(review): the typelst visible in this file only yields codes up
        # to 13, so the pairs ending in 14 can never match unless code 14 is
        # assigned somewhere else -- confirm whether the codes are off by one.
        linkd = {(0, 2), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (2, 9), (2, 10), (2, 11), (2, 12), (2, 13), (2, 14), (3, 6), (4, 5), (4, 6), (4, 7), (4, 9), (4, 10), (4, 11), (4, 12), (4, 14), (5, 14), (6, 3), (6, 7), (6, 9), (6, 10), (6, 11), (6, 12), (6, 14), (7, 8), (7, 9), (7, 10), (7, 11)}
        return (x.tp, y.tp) in linkd
    
    def trylink(x, y, h):
        """Attach ``y`` to ``x`` or its closest ancestor that accepts it.

        Climbs the parent chain from ``x`` while the current node is truthy and
        its level is at least ``h``. Returns True if ``y`` was linked.
        """
        while x and x.kyu >= h:
            if canlink(x, y):
                link(x, y)
                return True
            x = x.f
        return False
    
    esecs[0].tp = 0
    # x is the previous section (already typed and linked), y the next one.
    for ((p, x), (q, y)) in its.pairwise(enumerate(esecs)):
        if not gentype(y):
            print(("gentype", i, p, q, str(x), str(y)))
            fail.append(i)
            fail1.append(("gentype", i, p, q, str(x), str(y)))
            return False
        if (x.tp == 4 and y.tp in [5, 6, 7, 10]) or (x.tp == 6 and y.tp in [7, 10]):
            # Etymology n and Pronunciation n should have a son
            if trylink(x, y, 0):
                continue
        elif x.oldkyu > y.oldkyu:
            # y was originally shallower than x: climb from x's parent to the
            # first ancestor z whose current level is below y's level.
            z = x.f
            while z.kyu >= y.kyu:
                z = z.f
            # Prefer z or one of its ancestors as the parent ...
            if trylink(z, y, 0):
                continue
            # ... otherwise fall back to x and its ancestors strictly below z.
            if trylink(x, y, z.kyu + 1):
                continue
        elif x.oldkyu == y.oldkyu:
            # Same original level: first try to make y a sibling of x (or hang
            # it higher up), then try to make it a child of x itself.
            if trylink(x.f, y, 0):
                continue
            if trylink(x, y, x.kyu):
                continue
        else:
            # y was originally deeper than x: try x first, then its ancestors.
            if trylink(x, y, 0):
                continue
        # No acceptable parent was found for y.
        print(("resolve", i, p, q, str(x), str(y)))
        fail.append(i)
        fail1.append(("resolve", i, p, q, str(x), str(y)))
        return False
    
    return True

def process4():
    """Build the edit summary and the rebuilt text for page ``i``.

    Every non-root section whose level changed adds an ``L<old> -> L<new>``
    entry to the summary and a tuple to ``log``. When such a section
    originally sat two or more levels below its new parent's original level,
    the summary is reset to a ``Fix L.. after L..`` prefix. The new page text
    is the concatenation of heading, following newlines and body of all nodes.
    """
    sections = esecl[i]
    changes = []
    for sec in sections[1:]:
        if sec.kyu == sec.oldkyu:
            continue
        changes.append('L%d -> L%d: %s' % (sec.oldkyu, sec.kyu, sec.title))
        parent = sec.f
        log.append((i, sec.kyu, sec.title, parent.title))
        gap = sec.oldkyu - parent.oldkyu
        if gap >= 2:
            summary[i] = 'Fix L%d after L%d. ' % (sec.oldkyu, parent.oldkyu)
    summary[i] += ', '.join(changes)

    pieces = [str(sec) + sec.z + sec.t for sec in sections]
    etxtn[i] = ''.join(pieces)

def process5():
    """Hand page ``i``, its rebuilt text and its summary to ``tryedit``."""
    page = en.Pages[pn[i]]
    new_text = etxtn[i]
    note = summary[i]
    tryedit(page, new_text, note, fail=fail)