User:Robert Ullmann/code/xhan

From Wiktionary, the free dictionary
Jump to navigation Jump to search



#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Robert Ullmann/code/xhan


"""
This bot checks Han character entries in en-wikt.xml and writes report pages for each row.

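Command line arguments (both optional):

-probonly   only update the problems summary page, skip the per-row report pages
-recheck    re-read entries that show problems from the current DB and check them again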

Generates (replaces) User:Robert Ullmann/Han/(hexcode) for each row

Generates a problems summary (to-do list)
"""

import wikipedia
import xmlreader
import sys
import re
import pickle
from getwikitext import getwikitext
import xmldate

def safe(s):
    """Return a console-printable rendering of *s*.

    The value is serialized with pickle.dumps() using the default protocol,
    and the result is returned with its first character and its last five
    characters removed.

    NOTE(review): presumably this relies on the text framing of Python 2's
    default pickle protocol, so that non-ASCII text prints as ASCII escapes;
    confirm before running under a different default protocol.
    """
    pickled = pickle.dumps(s)
    # strip the one-character prefix and the five-character trailer
    return pickled[1:-5]

def main():
    """Check every Han character entry in the XML dump and publish the reports.

    Command line flags (matched with startswith on each argument):
      -probonly  only update the problems page, then return
      -recheck   when an entry from the dump shows a problem, re-read the
                 current page text and check that once more

    Steps, as visible in the code below:
      1. Build the lookup tables (language headers, allowed level 3/4
         headers, per-language templates, the Require checklist) and
         precompile the regular expressions.
      2. Log in and walk dump.parse(); keep titles that are one character in
         0x3400-0x9FFF (skipping 0x4BD6-0x4DFF), or a surrogate pair that
         decodes to 0x20000-0x2A6D6.
      3. Scan each kept entry line by line, collecting exception text in
         ``ex``; entries with a non-empty ``ex`` go into ``problems`` and
         every kept entry gets a table row in ``enwikt``.
      4. Save the problems list to User:Robert Ullmann/Han/Problems and,
         unless -probonly was given, one page per 256-character row that
         has at least one entry.

    The code uses Python 2 syntax (print statements, the <> operator).
    """

    probonly = False
    recheck = False
    # flags are matched by prefix; anything else is only reported, not fatal
    for arg in sys.argv[1:]:
        if arg.startswith('-probonly'):
            probonly = True
            print "only updating problems"
        elif arg.startswith('-recheck'):
            recheck = True
            print "rechecking entries from current DB"
        else: print "unknown command line argument %s" % arg

    # report dictionary
    # (index is character code number, value is the wiki table row for that character)
    enwikt = { 0:"blank" }

    # problems, index is character code number, value is char + text of problem
    problems = { }

    # header levels
    Lang = {'Translingual':'Han char', 'Cantonese':'yue', 'Japanese':'ja', 'Korean':'ko', 'Mandarin':'cmn',
        'Min Nan':'nan', 'Hakka':'hak', 'Gan':'gan', 'Jinyu':'cjy', 'Min Bei':'mnp', 'Min Dong':'cdo',
        'Min Zhong':'czo', 'Wu':'wuu', 'Xiang':'hsn', 'Vietnamese':'vi', 'Chinese':'zh',
        'Old Chinese':'och', 'Middle Chinese':'???', 'Zhuang':'za', 'Old Korean':'oko' }

    L3 = set(['Hanzi', 'Kanji', 'Hanja', 'Han character', 'Pronunciation', 'Proper noun', 'Pronoun',
        'Noun', 'Verb', 'Adjective', 'Number', 'Counter', 'Particle', 'Prefix', 'Suffix', 'Affix', 'Adverb',
        'Etymology', 'Etymology 1', 'Etymology 2','Etymology 3','Etymology 4',
        'Related terms', 'Derived terms', 'Usage notes', 'External links', 'See also',
        'Alternative spellings', 'Alternative forms', 'Preposition', 'Adnominal',
        'References', 'Interjection', 'Measure word', 'Conjunction' ])

    L4 = set(['Compounds', 'References', 'Readings', 'Derived terms', 'Related terms', 'Antonyms',
        'Usage notes', 'Synonyms', 'See also', 'Descendants' ])

    # template list, these are the templates that are used in one specific language and section
    # and should always appear in that section. used to build Tdict and Require

    Tlist = [   ('Han char', 'Translingual', 'Han character'),
                ('Han ref', 'Translingual', 'Han character'),
                ('cmn-hanzi', 'Mandarin', 'Hanzi'),
                ('nan-hanzi', 'Min Nan', 'Hanzi'),
                ('yue-hanzi', 'Cantonese', 'Hanzi'),
                ('ja-kanji', 'Japanese', 'Kanji'),
                ('ko-hanja', 'Korean', 'Hanja'),
                ('vi-hantu', 'Vietnamese', 'Han character') ]

    # dictionary of templates, built from above
    Tdict = {}
    for t, l, s in Tlist:
        Tdict[t] = (l, s)

    # checklist requirements. for each of the first in the tuple, the second must exist, or the entry
    # is in error
    Require = [ ('entry', 'Translingual'),
                ('Translingual', 'Translingual Han character section'),
                ('Mandarin', 'Mandarin Hanzi section'),
                ('Min Nan', 'Min Nan Hanzi section'),
                ('Cantonese', 'Cantonese Hanzi section'),
                ('Japanese', 'Japanese Kanji section'),
                ('Korean', 'Korean Hanja section'),
                ('Vietnamese', 'Vietnamese Han character section') ]
    # add templates to requirements
    for t, l, s in Tlist:
        Require.append( (l + ' ' + s + ' section', t + ' template in ' + l + ' ' + s + ' section') )

    # regex precomp
    rehanchar = re.compile(r'\{\{Han char.*?\}\}')
    reradno = re.compile(r'\|rn=(\d+)[|}]')
    rerad = re.compile(r'\|rad=(.)[|}]')
    reas = re.compile(r'\|as=(\d\d)[|}]')
    rehanref = re.compile(r'\{\{Han ref.*?\}\}')
    reuh = re.compile(r'\|uh=(\w+)[|}]')
    reud = re.compile(r'\|ud=(\d+)[|}]')
    # header, will treat L1 as a special case
    reheader = re.compile(r'(={2,6})\s*(.+?)={2,6}(.*)')
    retemplate = re.compile(r'\{\{([-a-zA-Z ]+)[\}\|]')
    # templater allows for '* ' before Han ref ... 
    retemplater = re.compile(r'\*? ?\{\{(Han ref)[\}\|]')

    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()
    wikipedia.setAction('Han character report')

    # get XML dump
    dump = xmlreader.XmlDump("en-wikt.xml")

    print "reading XML dump from %s" % xmldate.enXMLdate

    entries = 0
    hanchars = 0
    kprobs = 0

    for entry in dump.parse():
        text = entry.text
        title = entry.title

        entries += 1
        if entries % 5000 == 0: print "%d entries, %d characters" % (entries, hanchars)

        # figure out if it is a Han character entry:

        ishanchar = False
        if len(title) == 1:
            a = ord(title[0:1])
            #print "one character entry, code is %x" % a
            if a >= 0x3400 and a < 0xA000: ishanchar = True
            if a > 0x4BD5 and a < 0x4E00: ishanchar = False  # I Ching characters

        # Extension B, in UTF-16 (although XMLreader/Python Lib don't say so):
        if len(title) == 2:
            a = ord(title[0:1])
            b = ord(title[1:2])
            # first unit in 0xD800-0xDBFF: combine the pair into one code point
            if a >= 0xd800 and a < 0xdc00:
                a = (a - 0xd800) * 1024 + (b - 0xdc00) + 0x10000
            if a >= 0x20000 and a <= 0x2A6D6: ishanchar = True

        if not ishanchar: continue
        hanchars += 1

        # do this twice if needed, first with XML, then with current entry if recheck

        rc = True
        reread = recheck

        while rc:
            rc = False

            #u = unichr(i)
            #ucchar = u.encode("UTF-8")
            ucs = '%X' % a
            #title = "&#%d;" % a
            han = '[[' + title + ']]'

            # initialize

            Checklist = set(['entry'])

            simple = ''
            defn = ''
            ex = ''
            wlinkfound = False
            deffound = 0
            inlevel2 = 0
            currlang = ''
            current3 = ''
            langfound = 0
            l3found = 0
            deffound = 0
            extra = 0
            detail = True
            MR = ''
            Yale = ''
            
            # first find Han char and Han ref templates, check a few things
            # NOTE(review): radno, rad and ast are assigned here but not read
            # anywhere else in this function; only their absence is reported.

            mo = rehanchar.search(text)
            if mo:
                hanct = mo.group(0)
                mo = reradno.search(hanct)
                if mo: radno = int(mo.group(1))
                else: ex += 'Radical number missing<br/>'
                mo = rerad.search(hanct)
                if mo: rad = mo.group(1)
                else: ex += 'Radical missing<br/>'
                mo = reas.search(hanct)
                if mo: ast = int(mo.group(1))
                else: ex += 'Additional strokes parameter missing or incorrect<br/>'
            # else: ex += 'Han char template missing<br/>'

            mo = rehanref.search(text)
            if mo:
                hanref = mo.group(0)
                mo = reud.search(hanref)
                if mo:
                    ud = int(mo.group(1), 10)
                    if ud != a: ex += 'Unicode decimal value incorrect<br/>'
                mo = reuh.search(hanref)
                if mo:
                    uh = int(mo.group(1), 16)
                    if uh != a: ex += 'Unicode hex value incorrect<br/>'
                else: ex += 'Unicode hex value missing<br/>'
            # else: ex += 'Han ref template missing<br/>'

            # now parse text line-by-line ...

            for line in text.splitlines():
                # print "line len is %d" % len(line)
                # definition lines start with '#'
                if line[0:1] == '#':
                    deffound = 1
                    if line.find('[[') > 0: wlinkfound = True
                    lang = line.partition('{{defn|')[2]
                    if lang <> '':
                        lang = lang.split('|')[0]
                        lang = lang.split('}')[0]
                        if lang in Lang: defn += ', ' + Lang[lang]
                        else: defn += ', ' + lang
 		    elif simple == '': simple = line[1:140]

                # look for indicators of un-revised format
                if detail:
                    #if line.find('total strokes index') > 0: ex += "NanshuBot header not formatted<br/>"
                    if line.find('Penkyamp') > 0: ex += "Chinese hanzi not formatted<br/>"
                    if line.find('McCune-Reischauer') > 0: ex += "Korean not formatted<br/>"
                    if line.find('Morohashi') > 0: ex += "References not formatted<br/>"

                if line[0:1] == '=' and line[1:2] != '=':
                    ex += "Level one header<br/>"
                    continue

                mo = reheader.match(line)
                if mo:
                    header = mo.group(2).strip()
                    level = len(mo.group(1))
                    if mo.group(3): ex += "Stuff after %s header<br/>" % header
                else: level = 0

                # check headers by level

                if level == 4:
                    if header not in L4 and header not in L3:
                        if detail: ex += "L4 header: %s<br/>" % header
                    # multiple etymologies:
                    if header not in L4 and header in L3:
                        l3found = 1
                        current3 = header
                        Checklist.add(currlang + ' ' + current3 + ' section')

                if level == 3:
                    if header in L3:
                        l3found = 1
                        current3 = header
                        Checklist.add(currlang + ' ' + current3 + ' section')
                    else:
                        if detail: ex += "L3 header: %s<br/>" % header
                        current3 = ''

                # if level is two, close L2 section
                if level == 2:
                   current3 = ''
                   if inlevel2 == 1:
                       if detail: ex += "Missing ---- to end %s section<br/>" % currlang

                # check, pick up new language
                if level == 2:
                    inlevel2 = 1
                    if header in Lang: newlang = header
                    else: newlang = ''
                    if newlang <> '':
                        if newlang <> 'Translingual': langfound = 1
                        if currlang <> '':
                            # check current lang for order
                            if newlang == 'Translingual': ex += '%s before Translingual<br/>' % currlang
                            elif currlang <> 'Translingual':
                                if currlang == newlang: ex += 'two sections for %s<br/>' % currlang
                                if currlang > newlang: ex += '%s out of order<br/>' % currlang
                    else:
                        ex += "L2 header: %s<br/>" % header
                        detail = False
                    # in order, or not, current is language if valid
                    currlang = newlang
                    l3found = 0
                    current3 = ''
                    deffound = 0
                    Checklist.add(currlang)

                # templates
                mo = retemplate.match(line)
                if not mo: mo = retemplater.match(line)    # "* {{Han ref..." case
                if mo:
                    t = mo.group(1).strip()
                    if t in Tdict:
                        l, s = Tdict[t]
                        if currlang != l: ex += "Template %s not in %s section<br/>" % (t, l)
                        elif current3 != s: ex += "Template %s not in %s section<br/>" % (t, s)
                        # (if error, harmless to add to checklist)
                        Checklist.add(t + ' template in ' + l + ' ' + s + ' section')

                # don't require Korean Hanja section on kwukyel notes, should refer to that in Han defn:
                if line.startswith('#') and 'kwukyel' in line:
                    Checklist.add("Korean Hanja section")
                    Checklist.add("ko-hanja template in Korean Hanja section")

                # random things, cruft:
                # NOTE(review): from the "Cruft" line below to the end of main() this
                # listing looks damaged by HTML rendering: string literals are broken
                # across lines, several statements are fused onto one line and the
                # indentation has shifted. The code is reproduced here exactly as found.
                if detail:
                    if inlevel2 and line[0:5] == "* '''":
                        ex += "Cruft: <nowiki>%s
" % line[2:] if "Template:substub" in line: ex += "substub template
"
               # Korean, new format:
               if line.find('ko-hanja') > 0:
                   if line.find('|mr=') > 0: MR = re.sub(r'.*\|mr=(.*?)[|}].*', r'\1', line)
                   if line.find('|y=') > 0: Yale = re.sub(r'.*\|y=(.*?)[|}].*', r'\1', line)
               # line across, exit level 2
               if line[0:4] == '----':
                   if inlevel2 == 0:
                       if extra == 0: ex += 'Extraneous ----
' detail = False inlevel2 = 0 if not l3found and detail: ex += "No L3 header in %s section
" % currlang if not deffound and detail: ex += "No definition line for %s
" % currlang elif inlevel2 == 0: # only other text allowed is templates or blank lines if len(line) > 1: if line[0:2] <> '{{': if extra == 0: if detail: ex += "Extraneous text not in L2 section
" extra = 1
               # enough already!
               if detail and len(ex) > 200:
                   detail = False
                   ex += '...more...
'
               # end for line
           # end of entry
           if detail:
               # close last section, should be in level 2, exit
               if inlevel2 == 0: ex += 'Extraneous ---- at end
' else: if l3found == 0: ex += "No L3 header in %s section
" % currlang if deffound == 0: ex += "No definition line for %s
" % currlang
           # even if no detail, report bad Korean Yale
           if Yale:
               yf = Yale
               if MR.find(u'y\u014f') >= 0 and Yale.find('ey') >= 0:
                   yf = re.sub('ey', 'ye', yf)
                   yf = re.sub('yye', 'yey', yf)
               elif MR.find(u'he') >= 0 and Yale.find('ye') >= 0: yf = re.sub('ye', 'ey', yf)
               if MR.find(u'ya') >= 0 and Yale.find('ay') >= 0: yf = re.sub('ay', 'ya', yf)
               elif MR.find(u'ae') >= 0 and Yale.find('ya') >= 0: yf = re.sub('ya', 'ay', yf)
               if MR.find(u"ch'e") >= 0 and Yale.find('chye') >= 0: yf = re.sub('chye', 'chey', yf)
               if MR.find(u'ke') >= 0 and Yale.find('kye') >= 0: yf = re.sub('kye', 'key', yf)
               if MR.find(u'se') >= 0 and Yale.find('sye') >= 0: yf = re.sub('sye', 'sey', yf)
               if MR.find(u're') >= 0 and Yale.find('lye') >= 0: yf = re.sub('lye', 'ley', yf)
               if MR.find(u'ne') >= 0 and Yale.find('nye') >= 0: yf = re.sub('nye', 'ney', yf)
               if MR.find(u'pe') >= 0 and Yale.find('pye') >= 0: yf = re.sub('pye', 'pey', yf)
               if MR == 'e' and Yale == 'ye': yf = 'ey'
               if yf <> Yale:
                   ex += "Korean Yale %s should be %s
" % (Yale, yf) kprobs += 1
           # run checklist (regardless of detail for now)
           for r, i in Require:
               if r in Checklist and i not in Checklist: ex += i + ' missing
'
           # if there was a problem, reread from current DB?
           if ex and reread:
               page = wikipedia.Page(site, title)
               print "Re-reading character %X" % a
               try:
                   # text = page.get()
                   text = getwikitext(site, page)
                   rc = True
                   reread = False
                   continue # go back to top once more
               except wikipedia.NoPage:
                   print "can't read current page?"
                   pass
               except wikipedia.IsRedirectPage:
                   print "redirect page?"
                   pass
           # add to problems
           if ex: problems[a] = han + ' ' + re.sub('
', ', ', ex)[0:-2]
           # more details, not reported in problem punchlist
           if detail:
               if simple and not wlinkfound: ex += "No wikilink in any definition found
" if langfound == 0: ex += "No language section found
"
           # fixups
           if defn[0:1] == ',': defn = defn[2:]
           # store report line
           enwikt[a] = '|-\n| ' + ucs + ' || ' + han + ' || ' + simple + ' || ' + defn + ' || ' + ex + '\n'
           print "Character %X %s" % (a, safe(ex))


   print "%d Korean Yale problems" % kprobs
   print "%d total problems" % len(problems)
   print "%d entries, %d characters, writing reports" % (entries, hanchars)
   # write report pages
   report = '\nProblems as of ' + xmldate.enXMLdate
   report += ', keep in mind while fixing entries that the check, rather than the entry, may be wrong.\n\n'
   for c in sorted(problems):
       report += '* %X ' % c + problems[c] + '\n'
   report += '\n%d problems\n\n' % len(problems)
   # report page 
   # NOTE(review): if reportpage.get() raises NoPage, the handler only prints,
   # so oldreport has not been assigned when it is compared below.
   try:
       reportpage = wikipedia.Page(site, 'User:Robert Ullmann/Han/Problems')
       oldreport = reportpage.get()
   except wikipedia.NoPage:
       print "No present report for %s" % reportpage.aslink()
   # file the report
   if report.strip(' \n') != oldreport.strip(' \n'): reportpage.put(report)
   if probonly: return # we are done
   for si in range(0x3400, 0x2B000, 256):
       validentry = False
       # save some time
       if si > 0xA000 and si < 0x20000: continue
       # blank and re-initialize
       report = '
Summary of checks on Han character entries from UCS hex ' + "%X"%si + ' to ' + "%X"%(si+255)
       report += ', run on ' + xmldate.enXMLdate + ' XML dump of the en.wikt.


This is one row (sometimes called a block) of the Unified Han characters; see the + "%X"%si + ' Unihan database for this row.

Notes:

  • The simple meaning shown is just the first # definition line in the entry, regardless of language.
  • Exceptions may not be errors, rather things that did not "pass" rather simple checks; some less used level 4 headers, etc. may show up.
  • Some exceptions may mask others, for example if the horizontal rule ending a section is reported missing, missing POS headers or definitions in that section will not be reported.
  • A major error (bad L2 header) will cause details to be suppressed, also if there are simply too many exceptions.
  • Cruft refers to the format, not the content!

This page is generated by 'bot code, and is completely over-written on each run, so it isn't very useful to edit it.


\n' for i in range(si, si+256): if i in enwikt: line = enwikt[i] validentry = True report += line # else: line = '|-\n| ' + '%X'%i + ' || ' + "&#%d;"%i + ' || || || (entry not found)\n' # last rows of Han, Ext A, Ext B if i == 0x9FA5: break if i == 0x4DB5: break if i == 0x2A6D6: break report += '|}\n' if not validentry: continue # report page try: reportpage = wikipedia.Page(site, 'User:Robert Ullmann/Han/%X' % si) oldreport = reportpage.get() except wikipedia.NoPage: print "No present report for %s" % reportpage.aslink() oldreport = if report.strip(' \n') == oldreport.strip(' \n'): print "No change to report for %s" % reportpage.aslink() continue wikipedia.showDiff(oldreport, report) # file the report reportpage.put(report) if __name__ == "__main__": try: main() finally: wikipedia.stopme()</nowiki>
UCS Simple meaning {{rfdef}} languages Exceptions