# User:Robert Ullmann/code/level2
#
# Definition from Wiktionary, the free dictionary
# Jump to: navigation, search



#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Robert Ullmann/code/level2


"""
This code looks for valid and invalid L2 headers (languages) in the en.wikt XML dump.

No command line arguments.

Writes two report pages: User:Robert Ullmann/L2/valid and User:Robert Ullmann/L2/invalid.
"""

import wikipedia
import xmlreader
import sys
import re
import pickle
import xmldate

def safe(s):
    """Return the escaped text form of `s` taken from its protocol-0 pickle.

    The value is pickled with the text protocol (0) and the first character
    plus the last five characters of the result are cut off.  For a single
    string those are the one-character type opcode in front and the memo/stop
    trailer behind, which leaves only the escaped string itself.  Used to make
    dump text printable in console messages.
    """
    # Ask for protocol 0 explicitly: the fixed offsets below are only
    # meaningful for its line-oriented text layout, not for binary protocols.
    ss = pickle.dumps(s, 0)
    # drop 1 leading character and 5 trailing characters
    return ss[1:len(ss) - 5]

def _file_report(site, title, report):
    """Save `report` as the new text of the wiki page `title` on `site`.

    The current page text is fetched first; when the page does not exist yet
    (wikipedia.NoPage) that is only noted on the console and the page is
    written anyway.
    """
    reportpage = wikipedia.Page(site, title)
    try:
        # the fetched text itself is not used
        reportpage.get()
    except wikipedia.NoPage:
        print("No present report for %s" % reportpage.aslink())

    # file the report
    reportpage.put(report)


def main():
    """Scan the en.wikt XML dump for L2 (language) headers and write reports.

    Reads "en-wikt.xml", collects language-code templates (Template:xx pages)
    and every level-2 header found in main-namespace entries, then writes two
    wiki pages:

    * User:Robert Ullmann/L2/valid   - headers that have a language code
    * User:Robert Ullmann/L2/invalid - all other headers, with example entries

    Takes no arguments and returns nothing; progress is printed to stdout.
    """

    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()
    wikipedia.setAction('writing report')

    # get XML dump
    dump = xmlreader.XmlDump("en-wikt.xml")

    entries = 0
    words = 0
    L2headers = 0

    # valid headers have templates with codes
    Codes2 = {}
    Codes3 = {}
    CodesW = {}
    # all headers have occurrence counts
    Occurs = {}
    # invalid headers have examples, but we collect for all
    Examples = {}

    # things that look like codes, but aren't; including ISO 639-2 B codes (of which one is missing?):

    Stops = frozenset([
        'alb', 'arm', 'baq', 'bur', 'chi', 'cze', 'dut', 'fre', 'geo', 'ger',
        'gre', 'ice', 'mac', 'may', 'mao', 'per', 'rum', 'scc', 'scr', 'slo',
        'tib', 'wel',
        'zh-tc', 'zh-sc', 'gko',
        'rfc', 'rfd', 'rfv', 'top', 'mid', 'pos-n', 'pie'])

    # and fix DAVilla silliness:
    Codes2['Chinese'] = 'zh'

    # template names that can be language codes: lower case letters and hyphens
    recmatch = re.compile(r'[a-z-]+$')
    # template text: optional link-open parameter, the language name (group 2),
    # optional link-close parameter, then the start of a noinclude section
    regood = re.compile(r'(' + re.escape('{{{l|[}}}{{{l|[}}}') + \
                        r'|)([^\{\}<]+)(' + re.escape('{{{l|]]}}}') + r'|)<noinclude')

    # redirected code templates: source code -> target template name
    Reds = {}
    redirect = re.compile(r'#.*\[\[Template:(.*)\]\]')

    for entry in dump.parse():
        text = entry.text
        title = entry.title

        entries += 1
        if entries % 10000 == 0:
            print("%d entries, %d words, %d L2 headers" % (entries, words, L2headers))

        # look for code templates, just ignore any that are badly formatted

        if title.startswith('Template:'):
            code = title[9:]
            if code in Stops: continue

            if not recmatch.match(code): continue
            if len(code) > 10: continue

            if text[:1] == '#':
                # record redirects, more breakage July 2010:
                mo = redirect.match(text)
                if mo: Reds[code] = mo.group(1)
                continue

            # gratuitously broken July 2010, can no longer positively ID language templates
            # if 'Language templates' not in text: continue

            mo = regood.match(text)
            if not mo:
                # can't report bad templates, as we can no longer tell which are lang temps
                continue
            lang = mo.group(2)

            print("code %s: %s" % (safe(code), safe(lang)))
            if len(code) == 2: Codes2[lang] = code
            elif len(code) == 3: Codes3[lang] = code
            else: CodesW[lang] = code
            continue

        # now skip non main-name-space

        if title.find(':') >= 0:
            continue
        words += 1

        # if entries > 5000: break

        # parse text ...

        for line in text.splitlines():

            # comments on the (presumed) end of lines
            if line.find('<!--') >= 0: line = line.split('<!--')[0]

            # only level 2: starts with exactly two '='
            if line[0:2] != '==': continue
            if line[2:3] == '=': continue

            L2headers += 1

            # drop the '==' at both ends, then surrounding blanks and link brackets
            header = line.strip()[2:-2].strip(' []')

            # template mess, might as well keep (from L3 code)
            if header[0:2] == '{{': header = re.sub(r'(.*?)\|.*?\}(.*)', r'\1|...}\2', header)

            Occurs[header] = Occurs.get(header, 0) + 1

            # always collect examples
            if header not in Examples:
                Examples[header] = '[[' + title + ']]'
                continue

            # cap the example list at roughly 210 characters (no cap for 'Slovenian')
            if len(Examples[header]) < 210 or header == 'Slovenian':
                Examples[header] += ' [[' + title + ']]'

    print("%d entries, %d words, %d L2 headers" % (entries, words, L2headers))

    # fix up redirects, brokenness from about July 2010:
    for header in Codes2:
        code = Codes2[header]
        for red in Reds:
            if Reds[red] == code:
                print("found redirect from %s to %s" % (red, code))
                if len(red) == 3: Codes3[header] = red
                else: CodesW[header] = red
    # does this case occur?:
    for header in Codes3:
        code = Codes3[header]
        for red in Reds:
            if Reds[red] == code:
                print("found redirect from %s to %s" % (red, code))
                CodesW[header] = red
    # yes, that was sloppy. But what can I do?

    nlangs = 0

    # report valid headers

    report = '\nas of ' + xmldate.enXMLdate + '\n'

    report += """
May include bogus codes/languages as ability to distinguish language templates by wikitext
was broken June/July 2010: category is now buried in doc page elsewhere in the dump.
"""

    #Codes['Ancient Greek'] = 'grc'
    #report += '(Ancient Greek set to grc for this run)\n'

    # fixes 8.7.10:
    Codes3['Seneca'] = 'see'
    Codes3['Old English'] = 'ang'
    Codes3['!Kung'] = 'knw'
    if 'Simplified Chinese' in CodesW: del CodesW['Simplified Chinese']
    if 'Traditional Chinese' in CodesW: del CodesW['Traditional Chinese']

    report += '\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n'
    report += '|-\n| | ISO 639-1\n| | ISO 639-3\n| | Wiki code\n| |Occurs\n| |Language\n| |Category\n'

    def code_cell(codes, header):
        # one table cell: the bold code if the header has one, else empty
        if header in codes:
            return "'''" + codes[header] + "''' ||"
        return " ||"

    for header in sorted(Occurs):
        if (header not in Codes2) and (header not in Codes3) and (header not in CodesW): continue
        report += "|-\n| "
        report += code_cell(Codes2, header)
        report += code_cell(Codes3, header)
        report += code_cell(CodesW, header)
        report += str(Occurs[header]) + '||' + header + ' || [[:Category:' + header + ' language]]\n'
        nlangs += 1
    report += "|}\n"
    wikipedia.setAction('writing report')

    # write the report page
    _file_report(site, 'User:Robert Ullmann/L2/valid', report)

    print("valid languages: %d" % nlangs)

    # now remove valid, to report all the rest (iterate a snapshot so we can delete)
    for header in list(Occurs):
        if (header not in Codes2) and (header not in Codes3) and (header not in CodesW): continue
        del Occurs[header]

    # report invalid headers

    report = '\nas of ' + xmldate.enXMLdate + '\n'
    report += '\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n'
    report += '|-\n| | Language\n| |Occurs\n| |Examples\n'

    for header in sorted(Occurs):
        # the header is shown bold and wrapped in nowiki so odd header text is
        # displayed literally; the tag and the bold must be closed again here
        report += "|-\n| '''<nowiki>" + header + "</nowiki>''' ||" + str(Occurs[header]) + '||' + Examples[header] + '\n'
    report += "|}\n"
    wikipedia.setAction('writing report')

    # write the report page
    _file_report(site, 'User:Robert Ullmann/L2/invalid', report)


if __name__ == "__main__":

    # script entry point: run the scan, and call wikipedia.stopme()
    # whether or not main() raised
    try:
        main()
    finally:
        wikipedia.stopme()