User:Robert Ullmann/code/xhan
Jump to navigation
Jump to search
#!/usr/bin/python # -*- coding: utf-8 -*- # wikipath en wiktionary User:Robert Ullmann/code/xhan """ This bot checks Han character entries in en-wikt.xml and writes report pages for each row. No command line arguments. Generates (replaces) User:Robert Ullmann/Han/(hexcode) for each row Generates a problems summary (to-do list) """ import wikipedia import xmlreader import sys import re import pickle from getwikitext import getwikitext import xmldate def safe(s): return pickle.dumps(s)[1:-5] def main(): probonly = False recheck = False for arg in sys.argv[1:]: if arg.startswith('-probonly'): probonly = True print "only updating problems" elif arg.startswith('-recheck'): recheck = True print "rechecking entries from current DB" else: print "unknown command line argument %s" % arg # report dictionary enwikt = { 0:"blank" } # problems, index is character code number, value is char + text of problem problems = { } # header levels Lang = {'Translingual':'Han char', 'Cantonese':'yue', 'Japanese':'ja', 'Korean':'ko', 'Mandarin':'cmn', 'Min Nan':'nan', 'Hakka':'hak', 'Gan':'gan', 'Jinyu':'cjy', 'Min Bei':'mnp', 'Min Dong':'cdo', 'Min Zhong':'czo', 'Wu':'wuu', 'Xiang':'hsn', 'Vietnamese':'vi', 'Chinese':'zh', 'Old Chinese':'och', 'Middle Chinese':'???', 'Zhuang':'za', 'Old Korean':'oko' } L3 = set(['Hanzi', 'Kanji', 'Hanja', 'Han character', 'Pronunciation', 'Proper noun', 'Pronoun', 'Noun', 'Verb', 'Adjective', 'Number', 'Counter', 'Particle', 'Prefix', 'Suffix', 'Affix', 'Adverb', 'Etymology', 'Etymology 1', 'Etymology 2','Etymology 3','Etymology 4', 'Related terms', 'Derived terms', 'Usage notes', 'External links', 'See also', 'Alternative spellings', 'Alternative forms', 'Preposition', 'Adnominal', 'References', 'Interjection', 'Measure word', 'Conjunction' ]) L4 = set(['Compounds', 'References', 'Readings', 'Derived terms', 'Related terms', 'Antonyms', 'Usage notes', 'Synonyms', 'See also', 'Descendants' ]) # template list, these are the templates that are used in one 
specific language and section # and should always appear in that section. used to build Tdict and Require Tlist = [ ('Han char', 'Translingual', 'Han character'), ('Han ref', 'Translingual', 'Han character'), ('cmn-hanzi', 'Mandarin', 'Hanzi'), ('nan-hanzi', 'Min Nan', 'Hanzi'), ('yue-hanzi', 'Cantonese', 'Hanzi'), ('ja-kanji', 'Japanese', 'Kanji'), ('ko-hanja', 'Korean', 'Hanja'), ('vi-hantu', 'Vietnamese', 'Han character') ] # dictionary of templates, built from above Tdict = {} for t, l, s in Tlist: Tdict[t] = (l, s) # checklist requirements. for each of the first in the tuple, the second must exist, or the entry # is in error Require = [ ('entry', 'Translingual'), ('Translingual', 'Translingual Han character section'), ('Mandarin', 'Mandarin Hanzi section'), ('Min Nan', 'Min Nan Hanzi section'), ('Cantonese', 'Cantonese Hanzi section'), ('Japanese', 'Japanese Kanji section'), ('Korean', 'Korean Hanja section'), ('Vietnamese', 'Vietnamese Han character section') ] # add templates to requirements for t, l, s in Tlist: Require.append( (l + ' ' + s + ' section', t + ' template in ' + l + ' ' + s + ' section') ) # regex precomp rehanchar = re.compile(r'\{\{Han char.*?\}\}') reradno = re.compile(r'\|rn=(\d+)[|}]') rerad = re.compile(r'\|rad=(.)[|}]') reas = re.compile(r'\|as=(\d\d)[|}]') rehanref = re.compile(r'\{\{Han ref.*?\}\}') reuh = re.compile(r'\|uh=(\w+)[|}]') reud = re.compile(r'\|ud=(\d+)[|}]') # header, will treat L1 as a special case reheader = re.compile(r'(={2,6})\s*(.+?)={2,6}(.*)') retemplate = re.compile(r'\{\{([-a-zA-Z ]+)[\}\|]') # templater allows for '* ' before Han ref ... retemplater = re.compile(r'\*? 
?\{\{(Han ref)[\}\|]') # make sure we are logged in site = wikipedia.getSite() site.forceLogin() wikipedia.setAction('Han character report') # get XML dump dump = xmlreader.XmlDump("en-wikt.xml") print "reading XML dump from %s" % xmldate.enXMLdate entries = 0 hanchars = 0 kprobs = 0 for entry in dump.parse(): text = entry.text title = entry.title entries += 1 if entries % 5000 == 0: print "%d entries, %d characters" % (entries, hanchars) # figure out if it is a Han character entry: ishanchar = False if len(title) == 1: a = ord(title[0:1]) #print "one character entry, code is %x" % a if a >= 0x3400 and a < 0xA000: ishanchar = True if a > 0x4BD5 and a < 0x4E00: ishanchar = False # I Ching characters # Extension B, in UTF-16 (although XMLreader/Python Lib don't say so): if len(title) == 2: a = ord(title[0:1]) b = ord(title[1:2]) if a >= 0xd800 and a < 0xdc00: a = (a - 0xd800) * 1024 + (b - 0xdc00) + 0x10000 if a >= 0x20000 and a <= 0x2A6D6: ishanchar = True if not ishanchar: continue hanchars += 1 # do this twice if needed, first with XML, then with current entry if recheck rc = True reread = recheck while rc: rc = False #u = unichr(i) #ucchar = u.encode("UTF-8") ucs = '%X' % a #title = "&#%d;" % a han = '[[' + title + ']]' #inititalize Checklist = set(['entry']) simple = '' defn = '' ex = '' wlinkfound = False deffound = 0 inlevel2 = 0 currlang = '' current3 = '' langfound = 0 l3found = 0 deffound = 0 extra = 0 detail = True MR = '' Yale = '' # first find Han char and Han ref templates, check a few things mo = rehanchar.search(text) if mo: hanct = mo.group(0) mo = reradno.search(hanct) if mo: radno = int(mo.group(1)) else: ex += 'Radical number missing<br/>' mo = rerad.search(hanct) if mo: rad = mo.group(1) else: ex += 'Radical missing<br/>' mo = reas.search(hanct) if mo: ast = int(mo.group(1)) else: ex += 'Additional strokes parameter missing or incorrect<br/>' # else: ex += 'Han char template missing<br/>' mo = rehanref.search(text) if mo: hanref = mo.group(0) mo 
= reud.search(hanref) if mo: ud = int(mo.group(1), 10) if ud != a: ex += 'Unicode decimal value incorrect<br/>' mo = reuh.search(hanref) if mo: uh = int(mo.group(1), 16) if uh != a: ex += 'Unicode hex value incorrect<br/>' else: ex += 'Unicode hex value missing<br/>' # else: ex += 'Han ref template missing<br/>' # now parse text line-by-line ... for line in text.splitlines(): # print "line len is %d" % len(line) if line[0:1] == '#': deffound = 1 if line.find('[[') > 0: wlinkfound = True lang = line.partition('{{defn|')[2] if lang <> '': lang = lang.split('|')[0] lang = lang.split('}')[0] if lang in Lang: defn += ', ' + Lang[lang] else: defn += ', ' + lang elif simple == '': simple = line[1:140] # look for indicators of un-revised format if detail: #if line.find('total strokes index') > 0: ex += "NanshuBot header not formatted<br/>" if line.find('Penkyamp') > 0: ex += "Chinese hanzi not formatted<br/>" if line.find('McCune-Reischauer') > 0: ex += "Korean not formatted<br/>" if line.find('Morohashi') > 0: ex += "References not formatted<br/>" if line[0:1] == '=' and line[1:2] != '=': ex += "Level one header<br/>" continue mo = reheader.match(line) if mo: header = mo.group(2).strip() level = len(mo.group(1)) if mo.group(3): ex += "Stuff after %s header<br/>" % header else: level = 0 # check headers by level if level == 4: if header not in L4 and header not in L3: if detail: ex += "L4 header: %s<br/>" % header # multiple etymologies: if header not in L4 and header in L3: l3found = 1 current3 = header Checklist.add(currlang + ' ' + current3 + ' section') if level == 3: if header in L3: l3found = 1 current3 = header Checklist.add(currlang + ' ' + current3 + ' section') else: if detail: ex += "L3 header: %s<br/>" % header current3 = '' # if level is two, close L2 section if level == 2: current3 = '' if inlevel2 == 1: if detail: ex += "Missing ---- to end %s section<br/>" % currlang # check, pick up new language if level == 2: inlevel2 = 1 if header in Lang: newlang = 
header else: newlang = '' if newlang <> '': if newlang <> 'Translingual': langfound = 1 if currlang <> '': # check current lang for order if newlang == 'Translingual': ex += '%s before Translingual<br/>' % currlang elif currlang <> 'Translingual': if currlang == newlang: ex += 'two sections for %s<br/>' % currlang if currlang > newlang: ex += '%s out of order<br/>' % currlang else: ex += "L2 header: %s<br/>" % header detail = False # in order, or not, current is language if valid currlang = newlang l3found = 0 current3 = '' deffound = 0 Checklist.add(currlang) # templates mo = retemplate.match(line) if not mo: mo = retemplater.match(line) # "* {{Han ref..." case if mo: t = mo.group(1).strip() if t in Tdict: l, s = Tdict[t] if currlang != l: ex += "Template %s not in %s section<br/>" % (t, l) elif current3 != s: ex += "Template %s not in %s section<br/>" % (t, s) # (if error, harmless to add to checklist) Checklist.add(t + ' template in ' + l + ' ' + s + ' section') # don't require Korean Hanja section on kwukyel notes, should refer to that in Han defn: if line.startswith('#') and 'kwukyel' in line: Checklist.add("Korean Hanja section") Checklist.add("ko-hanja template in Korean Hanja section") # random things, cruft: if detail: if inlevel2 and line[0:5] == "* '''": ex += "Cruft: <nowiki>%s
" % line[2:] if "Template:substub" in line: ex += "substub template
"
# Korean, new format: if line.find('ko-hanja') > 0: if line.find('|mr=') > 0: MR = re.sub(r'.*\|mr=(.*?)[|}].*', r'\1', line) if line.find('|y=') > 0: Yale = re.sub(r'.*\|y=(.*?)[|}].*', r'\1', line)
# line across, exit level 2 if line[0:4] == '----': if inlevel2 == 0: if extra == 0: ex += 'Extraneous ----
' detail = False inlevel2 = 0 if not l3found and detail: ex += "No L3 header in %s section
" % currlang if not deffound and detail: ex += "No definition line for %s
" % currlang elif inlevel2 == 0: # only other text allowed is templates or blank lines if len(line) > 1: if line[0:2] <> '{{': if extra == 0: if detail: ex += "Extraneous text not in L2 section
" extra = 1
# enough already! if detail and len(ex) > 200: detail = False ex += '...more...
'
# end for line
# end of entry if detail: # close last section, should be in level 2, exit if inlevel2 == 0: ex += 'Extraneous ---- at end
' else: if l3found == 0: ex += "No L3 header in %s section
" % currlang if deffound == 0: ex += "No definition line for %s
" % currlang
# even if no detail, report bad Korean Yale if Yale: yf = Yale
if MR.find(u'y\u014f') >= 0 and Yale.find('ey') >= 0: yf = re.sub('ey', 'ye', yf) yf = re.sub('yye', 'yey', yf) elif MR.find(u'he') >= 0 and Yale.find('ye') >= 0: yf = re.sub('ye', 'ey', yf)
if MR.find(u'ya') >= 0 and Yale.find('ay') >= 0: yf = re.sub('ay', 'ya', yf) elif MR.find(u'ae') >= 0 and Yale.find('ya') >= 0: yf = re.sub('ya', 'ay', yf)
if MR.find(u"ch'e") >= 0 and Yale.find('chye') >= 0: yf = re.sub('chye', 'chey', yf) if MR.find(u'ke') >= 0 and Yale.find('kye') >= 0: yf = re.sub('kye', 'key', yf) if MR.find(u'se') >= 0 and Yale.find('sye') >= 0: yf = re.sub('sye', 'sey', yf) if MR.find(u're') >= 0 and Yale.find('lye') >= 0: yf = re.sub('lye', 'ley', yf) if MR.find(u'ne') >= 0 and Yale.find('nye') >= 0: yf = re.sub('nye', 'ney', yf) if MR.find(u'pe') >= 0 and Yale.find('pye') >= 0: yf = re.sub('pye', 'pey', yf) if MR == 'e' and Yale == 'ye': yf = 'ey'
if yf <> Yale: ex += "Korean Yale %s should be %s
" % (Yale, yf) kprobs += 1
# run checklist (regardless of detail for now) for r, i in Require: if r in Checklist and i not in Checklist: ex += i + ' missing
'
# if there was a problem, reread from current DB?
if ex and reread: page = wikipedia.Page(site, title) print "Re-reading character %X" % a try: # text = page.get() text = getwikitext(site, page) rc = True reread = False continue # go back to top once more except wikipedia.NoPage: print "can't read current page?" pass except wikipedia.IsRedirectPage: print "redirect page?" pass
# add to problems if ex: problems[a] = han + ' ' + re.sub('
', ', ', ex)[0:-2]
# more details, not reported in problem punchlist if detail: if simple and not wlinkfound: ex += "No wikilink in any definition found
" if langfound == 0: ex += "No language section found
"
# fixups if defn[0:1] == ',': defn = defn[2:]
# store report line enwikt[a] = '|-\n| ' + ucs + ' || ' + han + ' || ' + simple + ' || ' + defn + ' || ' + ex + '\n'
print "Character %X %s" % (a, safe(ex))
print "%d Korean Yale problems" % kprobs print "%d total problems" % len(problems) print "%d entries, %d characters, writing reports" % (entries, hanchars)
# write report pages
report = '\nProblems as of ' + xmldate.enXMLdate report += ', keep in mind while fixing entries that the check, rather than the entry, may be wrong.\n\n' for c in sorted(problems): report += '* %X ' % c + problems[c] + '\n' report += '\n%d problems\n\n' % len(problems)
# report page try: reportpage = wikipedia.Page(site, 'User:Robert Ullmann/Han/Problems') oldreport = reportpage.get() except wikipedia.NoPage: print "No present report for %s" % reportpage.aslink()
# file the report if report.strip(' \n') != oldreport.strip(' \n'): reportpage.put(report)
if probonly: return # we are done
for si in range(0x3400, 0x2B000, 256): validentry = False
# save some time if si > 0xA000 and si < 0x20000: continue
# blank and re-initialize report = '
- Summary of checks on Han character entries from UCS hex ' + "%X"%si + ' to ' + "%X"%(si+255)
report += ', run on ' + xmldate.enXMLdate + ' XML dump of the en.wikt.
This is one row (sometimes called a block) of the Unified Han characters; see the + "%X"%si + ' Unihan database for this row.
Notes:
- The simple meaning shown is just the first # definition line in the entry, regardless of language.
- Exceptions may not be errors, rather things that did not "pass" rather simple checks; some less used level 4 headers, etc. may show up.
- Some exceptions may mask others, for example if the horizontal rule ending a section is reported missing, missing POS headers or definitions in that section will not be reported.
- A major error (bad L2 header) will cause details to be suppressed, also if there are simply too many exceptions.
- Cruft refers to the format, not the content!
This page is generated by 'bot code, and is completely over-written on each run, so it isn't very useful to edit it.
UCS | Simple meaning | {{rfdef}} languages
|
Exceptions |