# Recovered from the en.wiktionary page "User:Robert Ullmann/code/level2".
# The original page header and "Jump to navigation" / "Jump to search" lines
# were wiki page chrome from the export, not part of the script.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# wikipath en wiktionary User:Robert Ullmann/code/level2
"""
This code looks for valid and invalid L2 headers (languages) in the en.wikt

No command line arguments. writes reports
"""
# NOTE(review): reconstructed from a whitespace-mangled wiki rendering of the
# original script. Targets the Python 2 era "pywikipedia" framework
# (wikipedia/xmlreader modules); print is written in single-argument call
# form so the file behaves identically on Python 2 but also parses on
# Python 3 tooling.

import wikipedia
import xmlreader
import sys
import re
import pickle
import xmldate


def safe(s):
    """Return a terminal-safe (escape-encoded) rendering of s.

    Relies on the Python 2 text pickle format: dumps() yields something
    like "S'...'\\np0\\n."; dropping 1 byte at the front and 5 at the end
    leaves the quoted, escape-encoded payload. TODO(review): confirm the
    slice widths still match the pickle protocol in use.
    """
    ss = pickle.dumps(s)
    return ss[1:len(ss) - 5]


def main():
    """Scan the en.wiktionary XML dump and write the L2-header reports."""
    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()
    wikipedia.setAction('writing report')

    # get XML dump
    dump = xmlreader.XmlDump("en-wikt.xml")

    entries = 0    # pages seen in the dump
    words = 0      # main-namespace pages
    L2headers = 0  # level-2 (language) headers seen

    # valid headers have templates with codes
    Codes2 = {}    # language name -> 2-letter code
    Codes3 = {}    # language name -> 3-letter code
    CodesW = {}    # language name -> other wiki code
    # all headers have occurance counts
    Occurs = {}
    # invalid headers have examples, but we collect for all
    Examples = {}

    # things that look like codes, but aren't; including ISO 639-2 B codes
    # (of which one is missing?):
    Stops = ['alb', 'arm', 'baq', 'bur', 'chi', 'cze', 'dut', 'fre', 'geo',
             'ger', 'gre', 'ice', 'mac', 'may', 'mao', 'per', 'rum', 'scc',
             'scr', 'slo', 'tib', 'wel', 'zh-tc', 'zh-sc', 'gko', 'rfc',
             'rfd', 'rfv', 'top', 'mid', 'pos-n', 'pie']

    # and fix DAVilla silliness:
    Codes2['Chinese'] = 'zh'

    # template titles that could be language codes
    recmatch = re.compile(r'[a-z-]+$')
    # extract the language name from a code template's wikitext
    regood = re.compile(r'(' + re.escape('{{{l|[}}}{{{l|[}}}') +
                        r'|)([^\{\}<]+)(' + re.escape('{{{l|]]}}}') +
                        r'|)<noinclude')

    Reds = {}  # redirecting template name -> target template name
    redirect = re.compile(r'#.*\[\[Template:(.*)\]\]')

    for entry in dump.parse():
        text = entry.text
        title = entry.title
        entries += 1
        if entries % 10000 == 0:
            print("%d entries, %d words, %d L2 headers" %
                  (entries, words, L2headers))

        # look for code templates, just ignore any that are badly formatted
        if title.startswith('Template:'):
            code = title[9:]  # strip 'Template:'
            if code in Stops:
                continue
            if not recmatch.match(code):
                continue
            if len(code) > 10:
                continue
            if text[:1] == '#':
                # record redirects, more breakage July 2010:
                mo = redirect.match(text)
                if mo:
                    Reds[code] = mo.group(1)
                continue
            # gratuitiously broken July 2010, can no longer positively ID
            # language templates
            # if 'Language templates' not in text: continue
            mo = regood.match(text)
            if not mo:
                # can't report bad templates, as we can no longer tell
                # which are lang temps
                # print("bad code template %s: %s" % (safe(code), safe(text)))
                continue
            lang = mo.group(2)
            print("code %s: %s" % (safe(code), safe(lang)))
            if len(code) == 2:
                Codes2[lang] = code
            elif len(code) == 3:
                Codes3[lang] = code
            else:
                CodesW[lang] = code
            continue

        # now skip non main-name-space
        if title.find(':') >= 0:
            continue
        else:
            words += 1

        # if entries > 5000: break

        # parse text ...
        for line in text.splitlines():
            # comments on the (presumed) end of lines
            if line.find('<!--') >= 0:
                line = line.split('<!--')[0]
            if line[0:2] != '==':
                continue
            if line[2:3] == '=':
                continue  # level 3 or deeper, not a language header
            L2headers += 1
            header = line.strip()[2:-2].strip(' []')
            # template mess, might as well keep (from L3 code)
            if header[0:2] == '{{':
                header = re.sub(r'(.*?)\|.*?\}(.*)', r'\1|...}\2', header)
            if header not in Occurs:
                Occurs[header] = 0
            Occurs[header] += 1
            # always collect examples
            if header not in Examples:
                Examples[header] = '[[' + title + ']]'
                continue
            if len(Examples[header]) < 210 or header == 'Slovenian':
                Examples[header] += ' [[' + title + ']]'
        # end of for line
    # end of for entry

    print("%d entries, %d words, %d L2 headers" %
          (entries, words, L2headers))

    # fix up redirects, brokenness from about July 2010:
    for header in Codes2:
        code = Codes2[header]
        for red in Reds:
            if Reds[red] == code:
                print("found redirect from %s to %s" % (red, code))
                if len(red) == 3:
                    Codes3[header] = red
                else:
                    CodesW[header] = red
    # does this case occur?:
    for header in Codes3:
        code = Codes3[header]
        for red in Reds:
            if Reds[red] == code:
                print("found redirect from %s to %s" % (red, code))
                CodesW[header] = red
    # yes, that was sloppy. But what can I do?

    nlangs = 0

    # report valid headers
    report = '\nas of ' + xmldate.enXMLdate + '\n'
    report += """
May include bogus codes/languages as ability to distinguish language
templates by wikitext was broken June/July 2010: category is now buried
in doc page elsewhere in the dump.
"""
    # Codes['Ancient Greek'] = 'grc'
    # report += '(Ancient Greek set to grc for this run)\n'

    # fixes 8.7.10:
    Codes3['Seneca'] = 'see'
    Codes3['Old English'] = 'ang'
    Codes3['!Kung'] = 'knw'
    if 'Simplified Chinese' in CodesW:
        del CodesW['Simplified Chinese']
    if 'Traditional Chinese' in CodesW:
        del CodesW['Traditional Chinese']

    report += ('\n{| border="1" cellpadding="4" cellspacing="0" '
               'style="border-collapse: collapse;"\n')
    report += ('|-\n| | ISO 639-1\n| | ISO 639-3\n| | Wiki code\n'
               '| |Occurs\n| |Language\n| |Category\n')

    for header in sorted(Occurs):
        if (header not in Codes2) and (header not in Codes3) and \
           (header not in CodesW):
            continue
        report += "|-\n| "
        if header in Codes2:
            report += "'''" + Codes2[header] + "''' ||"
        else:
            report += " ||"
        if header in Codes3:
            report += "'''" + Codes3[header] + "''' ||"
        else:
            report += " ||"
        if header in CodesW:
            report += "'''" + CodesW[header] + "''' ||"
        else:
            report += " ||"
        report += (str(Occurs[header]) + '||' + header +
                   ' || [[:Category:' + header + ' language]]\n')
        # del Occurs[header]
        nlangs += 1

    report += "|}\n"

    wikipedia.setAction('writing report')

    # write the report page
    try:
        reportpage = wikipedia.Page(site, 'User:Robert Ullmann/L2/valid')
        oldreport = reportpage.get()
    except wikipedia.NoPage:
        print("No present report for %s" % reportpage.aslink())

    # file the report
    reportpage.put(report)

    print("valid languages: %d" % nlangs)

    # now remove valid, to report all the rest
    # (list() snapshot so we can delete while iterating)
    for header in list(Occurs.keys()):
        if (header not in Codes2) and (header not in Codes3) and \
           (header not in CodesW):
            continue
        del Occurs[header]

    # report invalid headers
    report = '\nas of ' + xmldate.enXMLdate + '\n'
    report += ('\n{| border="1" cellpadding="4" cellspacing="0" '
               'style="border-collapse: collapse;"\n')
    report += '|-\n| | Language\n| |Occurs\n| |Examples\n'

    for header in sorted(Occurs):
        # NOTE(review): the closing </nowiki>''' was swallowed by the wiki
        # rendering this source was recovered from (the stray </nowiki> at
        # end of file); restored here so the cell markup balances.
        report += ("|-\n| '''<nowiki>" + header + "</nowiki>''' ||" +
                   str(Occurs[header]) + '||' + Examples[header] + '\n')

    report += "|}\n"

    wikipedia.setAction('writing report')

    # write the report page
    try:
        reportpage = wikipedia.Page(site, 'User:Robert Ullmann/L2/invalid')
        oldreport = reportpage.get()
    except wikipedia.NoPage:
        print("No present report for %s" % reportpage.aslink())

    # file the report
    reportpage.put(report)


if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()