User:Robert Ullmann/Pronunciation exceptions/code

From Wiktionary, the free dictionary
Jump to navigation Jump to search



#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Robert Ullmann/Pronunciation_exceptions/code


"""
This bot looks for and executes replacements, customized for each run

This version looks for unmatched wikisyntax and parens

No command line arguments.

"""

import wikipedia
import xmlreader
import sys
import re
import pickle
import xmldate
import socket
from mwapi import getwikitext

def safe(s):
    # Return an ASCII-safe, printable rendering of s, used both for console
    # output and as a plain-str shelve/dbm key.  pickle protocol 0 renders a
    # str as "S'...'\np0\n." and a unicode as "V...\np0\n."; stripping the
    # first character (the type code) and the last five (the memo/stop
    # suffix) leaves an escaped ASCII form.
    # NOTE(review): the slice offsets assume pickle protocol 0 text output —
    # confirm if the default pickle protocol ever changes.
    return pickle.dumps(s)[1:-5]

# work cache, record time last looked at entry
# each record is key: lc:word, pickled with safe(), value is integer time()

import shelve
cache = shelve.open("pronex")
from time import time

# we want to identify trouble cases, line by line
# they are applied after checking all the AF regex fixes

# literal substrings whose presence flags a pronunciation line as a problem
# (checked last in trouble(), after all regex-based checks)
Flags = set([ '<tt>', "{|", "//", "[[w:", 
              "{{enPR|/", "[[Rhymes:", "[[rhymes:", "hymes:--", "hymes|-",
              "''US''", "''(US)''", "''UK''", "''(UK)''",
              "[[RP]]", "[[WEAE]]",
              "* [[", "* ''", "* (" ])

# template-content extractors: capture the parameter text of each template
# so trouble() can validate the individual parameters
reenpr = re.compile(r'\{\{enPR\|(.*?)}}')
reipa = re.compile(r'\{\{IPA\|(.*?)}}')
resampa = re.compile(r'\{\{SAMPA\|(.*?)}}')
rerfp = re.compile(r'\{\{rfp\|(.*?)}}')
# spans stripped from a line before checking it:
rederef = re.compile(r'<ref.*?/ref>') # not quite correct, but will do for now
redecom = re.compile(r'<!--.*?-->')
rehttp = re.compile(r'\[http:.*?\]')
# match "stackable" format characters at start of lines, so we can have one space exactly
restack = re.compile(r"^([:#\*]+)\s*")
# match entire line is "blank" IPA, SAMPA, etc:
reblank = re.compile(r"^\* ?\[\[(IPA|SAMPA|AHD)\]\]:? *//$")

# exact copies of AF regex it will fix (manually copied)

AFcount = 0   # count of lines skipped because AutoFormat would fix them
Prex = {}     # rule name -> (compiled regex, replacement); filled by preset()

def preset():
    """Populate Prex with the AutoFormat pronunciation-line rules.

    Each entry maps a descriptive rule name to a (compiled regex,
    replacement string) pair.  trouble() applies every rule to a line; if
    any rule would change it, AutoFormat will fix the line itself and it is
    not reported here.  Rules that must anchor on individual lines use
    ^ and $ with re.M.
    """

    # Pronunciate
    # like Regex, but applied line by line only in pronunciation sections
    # use ^ and $ as needed with re.M for prescreen
    Prex['template enPR/IPA/SAMPA'] = \
                    (re.compile(r'^\*? ?([^ \{\|\}/]+), /([^\{\|\}/]+)/, /<tt>([^\|\}/]+)</tt>/$', re.M),
                     r'* {{enPR|\1}}, {{IPA|/\2/}}, {{SAMPA|/\3/}}')
    Prex['template enPR/IPA/SAMPA (RP, UK, US)'] = \
                    (re.compile(r"^\*? ?\(''(RP|UK|US)''\):? *"
                     r'([^ \{\|\}/]+), /([^\{\|\}/]+)/, /<tt>([^\|\}/]+)</tt>/$', re.M),
                     r'* {{a|\1}} {{enPR|\2}}, {{IPA|/\3/}}, {{SAMPA|/\4/}}')
    Prex['template enPR/IPA/SAMPA with {a}'] = \
                    (re.compile(r"^\*? ?(\{\{a\|[^\}]+\}\}):? *"
                     r'([^ \{\|\}/]+), /([^\{\|\}/]+)/, /<tt>([^\|\}/]+)</tt>/$', re.M),
                     r'* \1 {{enPR|\2}}, {{IPA|/\3/}}, {{SAMPA|/\4/}}')

    # raw [[Rhymes:...]] links to {{rhymes}} templates, per language
    Prex['+rhymes template'] = (re.compile(r"'*Rhymes:'* *\[\[[Rr]hymes:English:-(?P<s>.+?)\|-(?P=s)\]\]"),
                                r'{{rhymes|\1}}')
    # w/O "Rhymes:":
    Prex['+rhymes template w/Rhymes: in link'] = \
           (re.compile(r"^([\*:]+) *\[\[[Rr]hymes:English:-(?P<s>.+?)\|Rhymes: -(?P=s)\]\]", re.M),
                                r'\1 {{rhymes|\2}}')
    Prex['+rhymes template (Finnish)'] = (re.compile(r"'*Rhymes:'* *\[\[[Rr]hymes:Finnish:-(?P<s>.+?)\|-(?P=s)\]\]"),
                                r'{{rhymes|\1|lang=fi}}')
    Prex['+rhymes template w/Rhymes: in link (Finnish)'] = \
           (re.compile(r"^([\*:]+) *\[\[[Rr]hymes:Finnish:-(?P<s>.+?)\|Rhymes: -(?P=s)\]\]", re.M),
                                r'\1 {{rhymes|\2|lang=fi}}')
    Prex['+rhymes template w/Rhymes: in link (French)'] = \
           (re.compile(r"^([\*:]+) *\[\[[Rr]hymes:French:-(?P<s>.+?)\|Rhymes: -(?P=s)\]\]", re.M),
                                r'\1 {{rhymes|\2|lang=fr}}')
    Prex['+rhymes template (Icelandic)'] = \
           (re.compile(r"'*Rhymes:'* *\[\[[Rr]hymes:Icelandic:-(?P<s>.+?)\|-(?P=s)\]\]"),
                                r'{{rhymes|\1|lang=is}}')
    Prex['template -Rhymes +rhymes'] = (re.compile(r'\{\{Rhymes([\|\}])'), r'{{rhymes\1')
    # multiple rhymes (assume language matches! ;-)
    Prex['add additional rhyme to template'] = \
           (re.compile(r'(\{\{rhymes\|[^\}]+)\}\} *(,|or|) *\[\[[Rr]hymes:[A-Za-z -]+:-(?P<s>.+?)\| ?-(?P=s)\]\]'),
                                r'\1|\3}}')

    Prex["rm /'s from enPR template"] = (re.compile(r'\{\{enPR\|/([^ /\[\]\{\}]+?)/\}\}'), r'{{enPR|\1}}')

    # RP, UK, and US in a wide variety of cases
    Prex['(RP) to {{a|RP}}'] = (re.compile(r"^\*? ?[\(\[\{']+RP[\]\)\}:']+", re.M), r'* {{a|RP}}')
    Prex['(UK) to {{a|UK}}'] = (re.compile(r"^\*? ?[\(\[\{']+UK[\]\)\}:']+", re.M), r'* {{a|UK}}')
    Prex['(US) to {{a|US}}'] = (re.compile(r"^\*? ?[\(\[\{']+US[\]\)\}:']+", re.M), r'* {{a|US}}')
    Prex['(italbrac RP) to {{a|RP}}'] = (re.compile(r"^\*? ?\{\{italbrac\|\[*RP\]*\}\}:?", re.M), r'* {{a|RP}}')
    Prex['(italbrac UK) to {{a|UK}}'] = (re.compile(r"^\*? ?\{\{italbrac\|\[*UK\]*\}\}:?", re.M), r'* {{a|UK}}')
    Prex['(italbrac US) to {{a|US}}'] = (re.compile(r"^\*? ?\{\{italbrac\|\[*US\]*\}\}:?", re.M), r'* {{a|US}}')
    Prex['IPA: [[WEAE]] to {{a|WEAE}} IPA:'] = \
               (re.compile(r"^\*? ?IPA: [\(\[\{']+WEAE[\]\)\}:']+", re.M), r'* {{a|WEAE}} IPA:')
    # fix: replacement previously emitted {{a|GenAM}} (wrong capitalization,
    # inconsistent with the {{a|GenAm}} emitted by the rule below)
    Prex['(GenAm) to {{a|GenAm}}'] = (re.compile(r"^\*? ?\[\[w:G[^\|]+\|GenAm\]\]", re.M), r'* {{a|GenAm}}')
    Prex['(Canada) to {{a|Canada}}'] = (re.compile(r"^\*? ?[\(\[\{']+Canada[\]\)\}:']+", re.M), r'* {{a|Canada}}')
    Prex['(Australia) to {{a|Australia}}'] = \
               (re.compile(r"^\*? ?[\(\[\{']+Australia[\]\)\}:']+", re.M), r'* {{a|Australia}}')
    Prex['(Aus) to {{a|Aus}}'] = (re.compile(r"^\*? ?[\(\[\{']+Aus[\]\)\}:']+", re.M), r'* {{a|Aus}}')
    Prex['(GenAm|US) to {{a|GenAm}}'] = \
           (re.compile('^' + re.escape("* (''[[General American|US]]'')"), re.M),
            r'* {{a|GenAm}}')
    Prex['(RecPr|UK) to {{a|RP}}'] = \
           (re.compile('^' + re.escape("* (''[[Received Pronunciation|UK]]'')"), re.M),
            r'* {{a|RP}}')

    # untemplated SAMPA and IPA, several combinations, also for "AHD", allow an {{a}} template in front
    Prex['template IPA'] = \
         (re.compile(r"^\*? ?(\{\{a\|.+?\}\} *|)"
             r"\[*(w:IPA\||)IPA\]*:? *([/\[][^\{\|\}/\]]+?[/\]])$", re.M),
             r'* \1{{IPA|\3}}')
    Prex['template IPA -IPAchar'] = \
         (re.compile(r"^\*? ?(\{\{a\|.+?\}\} *|)"
             r"\[*(w:IPA\||)IPA\]*:? *\{\{IPAchar\|([/\[][^\{\|\}/\]]+?[/\]])\}\}$", re.M),
             r'* \1{{IPA|\3}}')
    Prex['template SAMPA'] = \
            (re.compile(r"^\*? ?(\{\{a\|.+?\}\} *|)"
             r"\[*(w:SAMPA\||)SAMPA\]*:? *([/\[])(<tt>|)([^\|\}/]+?)(</tt>|)([/\]])$", re.M),
             r'* \1{{SAMPA|\3\5\7}}')
    Prex['template enPR (was AHD)'] = \
            (re.compile(r"^\*? ?(\{\{a\|.+?\}\} *|)\[*(w:AHD\||)AHD\]*:? *([^ \{\|\}/]+?)$", re.M),
             r'* \1{{enPR|\3}}')
    Prex['template X-SAMPA'] = \
            (re.compile(r"^\*? ?(\{\{a\|.+?\}\} *|)"
             r"\[*(w:X-SAMPA\||)X-SAMPA\]*:? *([/\[])(<tt>|)([^\{\|\}/]+?)(</tt>|)([/\]])$", re.M),
             r'* \1{{X-SAMPA|\3\5\7}}')

    # "/a/, /b/" or "/a/ or /b/" inside one template -> two parameters
    Prex['or/comma to multiple parameters in IPA template'] = \
            (re.compile(r"\{\{IPA\|([^\}]+/)(, ?| or | ''or'' )(/[^\}]+)\}\}"), r'{{IPA|\1|\3}}')
    Prex['or/comma to multiple parameters in enPR template'] = \
            (re.compile(r"\{\{enPR\|([^\}]+/)(, ?| or | ''or'' )(/[^\}]+)\}\}"), r'{{enPR|\1|\3}}')
    Prex['or/comma to multiple parameters in SAMPA template'] = \
            (re.compile(r"\{\{SAMPA\|([^\}]+/)(, ?| or | ''or'' )(/[^\}]+)\}\}"), r'{{SAMPA|\1|\3}}')

    # accent templates, try to cover the A-cai/Min Nan cases and others, up to 4

    Prex['+accent template 1'] = (re.compile(r"^\* \(''"
            r"\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
            r"''\):?", re.M), r'* {{a|\2}}')
    Prex['+accent template 2'] = (re.compile(r"^\* \(''"
            r"\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
            r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
            r"''\):?", re.M), r'* {{a|\2|\4}}')
    Prex['+accent template 3'] = (re.compile(r"^\* \(''"
            r"\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
            r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
            r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
            r"''\):?", re.M), r'* {{a|\2|\4|\6}}')
    Prex['+accent template 4'] = (re.compile(r"^\* \(''"
            r"\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
            r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
            r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
            r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
            r"''\):?", re.M), r'* {{a|\2|\4|\6|\8}}')

    # hyphenation: template it, then convert the various separator dots to |
    Prex['+hyphenation template'] = (re.compile(r"'*Hyphenation:?'*:? *([^ \{\}]+)$", re.M), r'{{hyphenation|\1}}')
    Prex['middot to | in hyphenation template'] = (re.compile(r'(\{\{hyphenation\|.+?)' + u'\u00B7' + r'(.+?\}\})'),
            r'\1|\2')
    Prex['hyphpt to | in hyphenation template'] = (re.compile(r'(\{\{hyphenation\|.+?)' + u'\u2027' + r'(.+?\}\})'),
            r'\1|\2')
    Prex['bullet to | in hyphenation template'] = (re.compile(r'(\{\{hyphenation\|.+?)' + u'\u2022' + r'(.+?\}\})'),
            r'\1|\2')
    Prex['middot (HTML) to | in hyphenation template'] = (re.compile(r'(\{\{hyphenation\|.+?)·' + r'(.+?\}\})'),
            r'\1|\2')

    # sorting enPR, IPA, (X-)SAMPA: canonical order is enPR, IPA, SAMPA
    Prex['enPR before SAMPA'] = (re.compile(r'\{\{(X-|)SAMPA\|([^\}]*)\}\}, \{\{enPR\|([^\}]*)\}\}'),
                                            r'{{enPR|\3}}, {{\1SAMPA|\2}}')
    Prex['IPA before SAMPA'] = (re.compile(r'\{\{(X-|)SAMPA\|([^\}]*)\}\}, \{\{IPA\|([^\}]*)\}\}'),
                                            r'{{IPA|\3}}, {{\1SAMPA|\2}}')
    Prex['enPR before IPA'] = (re.compile(r'\{\{IPA\|([^\}]*)\}\}, \{\{enPR\|([^\}]*)\}\}'),
                                            r'{{enPR|\2}}, {{IPA|\1}}')

def trouble(s):
    # Examine one line from a ===Pronunciation=== section.  Returns a
    # description string for the first problem found, or False if the line
    # is acceptable or is something AutoFormat will fix by itself.
    # Callers rely on simple truth-testing of the result.
    global AFcount

    # normalize the leading ":#*" stack to exactly one trailing space (as
    # AF does), then apply every AF rule; any change means AF will handle it
    s2 = s = restack.sub(r'\1 ', s)
    # skip AF fix(es), do what it will do: 

    for rx in Prex:
        s2 = Prex[rx][0].sub(Prex[rx][1], s2)
    if s2 != s:
        # log only the first 500 such lines to keep the console readable
        if AFcount < 500:
            print "AF will fix:"
            print "   %s" % safe(s)
            print "to %s" % safe(s2)
        AFcount += 1
        return False  # as AF will do something

    # IPA, SAMPA, enPR are in AF.StarTemp:
    if s.startswith(('{{IPA|', '{{SAMPA|', '{{enPR|')):
        AFcount += 1
        return False

    # "blank" non-templates, in general Regex in AF:
    if reblank.match(s):
        AFcount += 1
        return False

    # Egyptian transliteration lines are exempt
    if "Manuel de Codage" in s: return False

    # remove rfp and contents, ref tags, comments, http links
    s = rerfp.sub(' ', s)
    if s == '{{rfap}}': return False
    s = rederef.sub(' ', s)
    s = redecom.sub(' ', s)
    if s.startswith('<!--') or s.endswith('-->'): return False
    s = rehttp.sub(' ', s)

    # non-templates (skip "[Aa]udio-IPA" for now)
    if "IPA" in s and "{{IPA" not in s and "udio-IPA" not in s: return "IPA not template"
    if "enPR" in s and "{{enPR" not in s: return "enPR not template"
    if "SAMPA" in s and "{{SAMPA" not in s and "{{X-SAMPA" not in s: return "SAMPA not template"
    if "AHD" in s: return "AHD found"

    # check sequence: canonical order is enPR, IPA, SAMPA
    # NOTE(review): uses > 0, so a template at column 0 is never flagged as
    # out of order — presumably fine since these lines normally start "* "
    e = s.find("{{enPR|")
    i = s.find("{{IPA|")
    m = s.find("{{SAMPA|")
    if e > 0 and i > 0 and i < e: return "IPA before enPR"
    if i > 0 and m > 0 and m < i: return "SAMPA before IPA"
    if e > 0 and m > 0 and m < e: return "SAMPA before enPR"

    # a must be at start, and only follow wikisyntax (":* " characters)
    a = s.find("{{a|")
    if a > 0 and s[0:a].strip(':* '): return "{a} template not at beginning"

    # now check templates: each parameter of each enPR template
    for c in reenpr.findall(s):
        if not c.strip(): return 'empty enPR template'
        for p in c.split('|'):
            p = p.strip()
            if " or " in p: return '"or" should be multiple template parameters'
            if " ''or'' " in p: return '"or" should be multiple template parameters'
            # next is fixed by AF at present
            # if p.startswith('/') and p.endswith('/'): return "slashes in enPR template"

    # IPA parameters must be bracketed /.../ (phonemic) or [...] (phonetic)
    for c in reipa.findall(s):
        if not c.strip(): return 'empty IPA template'
        for p in c.split('|'):
            p = p.strip()
            if p.startswith('lang='): continue
            if " or " in p: return '"or" should be multiple template parameters'
            if " ''or'' " in p: return '"or" should be multiple template parameters'
            if p.startswith('/'):
                if not p.endswith('/'): return "mismatched /'s in IPA template"
            elif p.startswith('['):
                if not p.endswith(']'): return "mismatched [ ]'s in IPA template"
            else: return "no / / or [ ]'s in IPA template"

    # same for SAMPA; also tolerates numbered parameters like 2=...
    for c in resampa.findall(s):
        if not c.strip(): return 'empty SAMPA template'
        for p in c.split('|'):
            p = p.strip()
            if p.startswith('lang='): continue
            if p[:1].isdigit() and p[1:2] == '=': p = p[2:] 
            if " or " in p: return '"or" should be multiple template parameters'
            if " ''or'' " in p: return '"or" should be multiple template parameters'
            if p.startswith('/'):
                if not p.endswith('/'): return "mismatched /'s in SAMPA template"
            elif p.startswith('['):
                if not p.endswith(']'): return "mismatched [ ]'s in SAMPA template"
            else: return "no / / or [ ]'s in SAMPA template"

    # some simple cases that are just flagged
    # (nowiki tags are split so this source can itself live on a wiki page)
    for flag in Flags:
        if flag in s: return "flag <tt><no" + "wiki>" + flag + "</no" + "wiki></tt>"

    # couple of other randoms
    if s.endswith('/'): return "line ends with /"
    # if s.startswith('[['): return "line starts with <no" + "wiki>[[</no" + "wiki>"
    # if s.startswith('('): return "line starts with ("
    # if s.startswith('{{'): return "line starts with <no" + "wiki>{{</no" + "wiki>"
    # if s.startswith("''") and not s.endswith("''"): return "line starts with <no" + "wiki>''</no" + "wiki>"
    # next rule is fixed by AF in most cases (one of these at start), fix this rule sometime
    # if ('{{enPR|' in s or '{{IPA|' in s or '{{SAMPA|' in s) and not s.startswith('*'):
    #   return 'line does not start with *'

    return False

# (sporked from Tbot/script, no need to keep up to date):

# table of scripts, each is lowest character code point, highest code + 1, ISO script
# tkey() scans this in order to map a leading non-ASCII character to a TOC heading

Scs = [
         (0x0080, 0x0250, 'Latin'),
         (0x0250, 0x02B0, 'IPA'),
         (0x0370, 0x0400, 'Greek'),
         (0x0400, 0x0530, 'Cyrillic'),
         (0x0530, 0x0590, 'Armenian'),
         (0x0590, 0x0600, 'Hebrew'),
         (0x0600, 0x0700, 'Arabic'),
         (0x0700, 0x0750, 'Syriac'),
         (0x0750, 0x0780, 'Arabic Ext'),
         (0x0900, 0x0980, 'Devanagari'),
         (0x0980, 0x0A00, 'Bengali'),
         (0x0C00, 0x0C80, 'Telugu'),
         (0x0D00, 0x0D80, 'Malayalam'),
         # fix: was (0x1A00, 0x1100, ...) — low > high, an empty range that
         # could never match; the Georgian block is U+10A0..U+10FF
         (0x10A0, 0x1100, 'Georgian'),

         (0x1E00, 0x1F00, 'Latin Ext'),
         (0x1F00, 0x2000, 'Greek Ext'),

         (0x3040, 0x30A0, 'Hiragana'),
         (0x30A0, 0x3100, 'Katakana'),
         (0x3400, 0xA000, 'Han'),     # Han Ext A and Unified
         (0xAC00, 0xD800, 'Hangeul'),

         (0x20000, 0x2A6D7, 'Han Ext B') ]  # Han Ext B

def tkey(word):
    """Pick a TOC section key for word: its first character when that is
    <= 'z' (covers ASCII and the empty string), otherwise the name of the
    script its leading character belongs to, per the Scs table."""

    first = word[:1]
    # ASCII range (and '') sorts under the literal first character
    if first <= 'z':
        return first

    code = ord(first)
    # narrow build / "UTF-16" storage: combine a surrogate pair into the
    # real code point so supplementary-plane scripts match
    if 0xd800 <= code < 0xdc00:
        if len(word) < 2:
            return word  # lone high surrogate; nothing sensible to do
        code = (code - 0xd800) * 1024 + (ord(word[1:2]) - 0xdc00) + 0x10000

    for low, high, scode in Scs:
        if low <= code < high:
            return scode

    print ("no match for script for char code %x" % code)
    return first

def main():
    """Scan the XML dump for entries whose Pronunciation sections contain
    problem lines, recheck suspects against the live wiki, and regenerate
    the report pages.  Reads en-wikt.xml; writes the exceptions page and
    the remainder page; never edits entries themselves."""
    global AFcount

    socket.setdefaulttimeout(40)

    # list of entry names to ignore
    Stops = set()

    reports = { }   # title -> formatted report line(s) for the main page
    preset()

    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()

    # read Stops from the on-wiki list (one "* [[title]]" per line)
    page = wikipedia.Page(site, "User:Robert Ullmann/Pronunciation exceptions/stops")
    text = page.get()
    for s in re.findall(r'\* \[\[(.*?)\]\]', text): Stops.add(s)

    print 'found %d stops' % len(Stops)

    # get XML dump
    dump = xmlreader.XmlDump("en-wikt.xml")

    entries = 0
    probs = 0
    fixed = 0       # NOTE(review): never updated or reported — dead counter?
    reps = 0        # entries reported so far
    replimit = 1000 # full reports above this count go to the remainder page
    cis = 0         # cache inserts since last shelve sync
    lasttab = 0

    rem = """
remainder, one per link, not checked against current, one reason for exception
----
"""


    # testing
    test = False
    tmod = 20
    if test:
        replimit /= tmod
        print "in test mode"

    for entry in dump.parse():
        text = entry.text
        title = entry.title
        # mainspace only: skip any namespaced title
        if title.find(':') >= 0: continue
        # if title.find('/') >= 0: continue
        if not title: continue # ?


        entries += 1
        if entries % 10000 == 0: print "%d entries, %d problems" % (entries, probs)

        # if test and title[0:1] != 'c': continue
        if test and entries % tmod != 0: continue

        # skim a lot of the db for now
        # if entries % tmod != 0: continue

        if title in Stops: continue

        # screen entries: scan the dump copy's Pronunciation section lines
        tag = False

        inPron = False
        for line in text.splitlines():
            if '=Pronunciation=' in line:
                inPron = True
                continue
            if line.startswith('='):
                inPron = False
            if not inPron: continue
            a = trouble(line)
            if a:
                if line.startswith('{|') and entries < 300000: lasttab = entries
                tag = True
                break

        # now see if it is something that should be reported:

        if tag:

            ckey = safe(title) # must be string for bsd dbm
            # entries confirmed fixed are cached for 70 days to avoid
            # re-fetching them from the live wiki on every run
            if ckey in cache:
               last = cache[ckey]
               if last > time() - (70 * 24 * 3600):
                  print "%s in 70 day cache, not checked" % safe(title)
                  continue

            probs += 1

            # ... pick up current version from en.wikt

            if reps < replimit:

                print '%s is possible problem, getting current entry' % safe(title)

                try:
                    page = wikipedia.Page(site, title)
                    # text = page.get()
                    text = getwikitext(page)
                except wikipedia.NoPage:
                    print "Can't get %s from en.wikt!" % safe(page.aslink())
                    text = ''
                except wikipedia.IsRedirectPage:
                    print safe(title), 'is now a redirect page'
                    text = '(redirect page)'
                    # will be treated as fixed and added to cache
                except KeyboardInterrupt:
                    raise KeyboardInterrupt
                except Exception, e:
                    print "unknown exception, maybe timeout"
                    continue # do this again next time

            else:
                # over the report limit: just note it in the remainder list,
                # using the reason 'a' from the screening pass above
                print '%s is possible problem' % safe(title)
                rem += '* [[' + title + "]] ''" + a + "''\n"

            if not text: continue

            # check each line for trouble, collecting distinct reasons

            act = ''
            inPron = False
            for line in text.splitlines():
                if '=Pronunciation=' in line:
                    inPron = True
                    continue
                if line.startswith('='):
                    inPron = False
                if not inPron: continue
                a = trouble(line)
                if a and a not in act: act += ', ' + a

            # if fixed, add to cache so we don't keep re-checking

            if not act:
                print "%s has been fixed" % safe(title)
                cache[ckey] = time() # entry has been fixed for now
                cis += 1
                if cis % 20 == 0: cache.sync()
                continue

        else: continue

        # don't write any change to entry, report:

        if act:
            act = "     ''" + act.strip(', ') + "''"
            if reps < replimit:
                xp = wikipedia.Page(site, title)
                url = xp.urlname()
                repline = \
"* [[%s]] [http://en.wiktionary.org/w/index.php?title=%s&action=edit&section=SECTXX (edit)] %s" % (title, url, act)
            # go isolate the lines
            # NOTE(review): when reps >= replimit, repline below still holds
            # the previous entry's value; appears harmless because it is
            # only stored under the reps < replimit guard — confirm
            s = 0    # section counter, to fill in the SECTXX edit link
            se = 0   # section number of the first troubled line
            ts = ''  # NOTE(review): accumulated but never used
            inPron = False
            for line in text.splitlines():
               if line.startswith('='): s += 1
               if '=Pronunciation=' in line:
                    inPron = True
                    continue
               if line.startswith('='):
                    inPron = False
               if not inPron: continue
               if trouble(line):
                   if not se: se = s
                   ts += ', ' + trouble(line)
                   repline += '\n*: <tt><no' + 'wiki>' + line + '</no' + 'wiki></tt>'
                   print reps, safe(title), safe(line)

            if reps < replimit:
                repline = repline.replace('SECTXX', "%d"%se)
                reports[title] = repline
            reps += 1

        if test and reps > replimit: break

        continue
        # no corrections here!

    print "%d entries, %d problems" % (entries, probs)
    cache.close()
    print "last table at entry %d" % lasttab

    if not test:
        page = wikipedia.Page(site, "User:Robert Ullmann/Pronunciation exceptions")
    else:
        page = wikipedia.Page(site, "User:Robert Ullmann/Pronunciation exceptions/test")
    try:
        # NOTE(review): oldrep is fetched but never used — perhaps a
        # diff/merge step was intended; confirm before removing
        oldrep = page.get()
    except wikipedia.NoPage:
        pass

    ss = ', '.join(sorted(Stops))
    fs = ''
    # nowiki tags are split so this source can itself live on a wiki page
    for flag in sorted(Flags): fs += ", <tt><no" + "wiki>" + flag + "</no" + "wiki></tt>"
    fs = fs.lstrip(", ")

    report = """

'''occurances of pronunciation section exceptions'''
----
* from XML dump as of %s, checked against live wiki {{subst:CURRENTDAY}} {{subst:CURRENTMONTHNAME}} {{subst:CURRENTYEAR}}
* see talk page for rules in effect
* checks may not be perfect at this point
* entries are not listed if [[User:AutoFormat|AutoFormat]] would fix something, though perhaps not entirely
* total AF will fix: %d
* some entries are listed as "stops" and not shown
* stops in effect: %s
*: from [[User:Robert Ullmann/Pronunciation exceptions/stops]]
* specific strings flagged: %s
* "blank" IPA, SAMPA, etc (i.e. "* [[SAMPA]]: //") are not reported
* %d total problems, limit of %d shown, remainder listed in [[User:Robert Ullmann/Pronunciation exceptions/remains]]

Please do section edit and remove completed entries, the automation will then recheck them. If you do most of a section but not quite all, feel free to just blank the section, any leftovers will get picked up again.

----

<div class="floatright" style="margin-top:1.5em">__TOC__</div><div class=plainlinks>

""" % (xmldate.enXMLdate, AFcount, ss, fs, reps, replimit)

    if test:
        report += "'''this is a test run, you want to look at [[User:Robert Ullmann/Pronunciation exceptions]]'''\n"

    # emit report lines grouped into TOC sections keyed by tkey(), splitting
    # a section after every 9 entries so section edits stay small
    prev = ''
    s = 0
    i = 1
    for t in sorted(reports):
        if tkey(t) != prev:
            report += '\n==' + tkey(t) + '==\n\n'
            prev = tkey(t)
            s = 0
            i = 1
        s += 1
        if s > 9:
            i += 1
            report += '\n==' + tkey(t) + ' (%d)==\n\n' % i
            s = 0
        report += reports[t] + '\n'

    wikipedia.setAction("regenerate, add more")
    page.put(report)

    if not test:
        wikipedia.setAction("updating remainder")
        page = wikipedia.Page(site, "User:Robert Ullmann/Pronunciation exceptions/remains")
        page.put(rem)

    # done

# script entry point: always release the pywikipedia throttle/lock via
# stopme(), even on exceptions or Ctrl-C
if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()