# User:Robert Ullmann/Mismatched wikisyntax/code

#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Robert Ullmann/Mismatched_wikisyntax/code


"""
This bot looks for and executes replacements, customized for each run

This version looks for unmatched wikisyntax and parens

No command line arguments.

"""

import wikipedia
import xmlreader
import sys
import re
import pickle
import xmldate
import socket
from mwapi import getwikitext

def safe(s):
    """Return a pickled rendering of *s* for printing and for cache keys.

    The value is pickled with the default protocol and the first character
    and the last five characters of the pickle are dropped.

    NOTE(review): presumably relies on the Python 2 text pickle framing
    (a type code in front, memo/stop markers behind) to leave an escaped,
    ASCII-only form of the string -- confirm before changing the Python
    version or the pickle protocol.
    """
    pickled = pickle.dumps(s)
    return pickled[1:-5]

# Work cache: remembers when an entry was last found to be clean, so that
# main() can skip it for a while instead of re-fetching it on every run.
# Each record maps safe(title) (a plain string key) to the time() value at
# which the entry was recorded as fixed.

import shelve
cache = shelve.open("mismatch")
from time import time

# we want to match [] {} (), mrd returns remainder

def mrd(s):
    """Match (), [] and {} in *s* and return the unmatched remainder.

    Returns an empty string when every bracket in *s* is closed by the
    right closer in the right order.  Otherwise returns a non-empty string:
    the openers that were still pending (outermost first) followed by the
    text starting at the first closer that did not match, or just the
    pending openers when the text ran out.  Callers only need to test the
    result for truth.

    The scan keeps an explicit stack instead of recursing once per bracket,
    so a line with a very large number of brackets cannot hit the
    interpreter's recursion limit.
    """

    # Only brackets matter, so surrounding filler is trimmed; this also
    # trims the tail of any remainder that is returned.
    # NOTE: the letter run repeats 'p' and lacks 'u'; it is kept as is so
    # the returned remainders do not change.
    text = s.strip(' .,;abcdefghijklmnopqrpstvwxyz')

    closer_for = {'(': ')', '[': ']', '{': '}'}
    closers = (')', ']', '}')

    pending = []    # openers seen and not yet closed, innermost last
    for pos, ch in enumerate(text):
        if ch in closer_for:
            pending.append(ch)
        elif ch in closers:
            if pending and closer_for[pending[-1]] == ch:
                pending.pop()
            else:
                # stray or wrong closer: report the open brackets plus
                # everything from this closer on
                return ''.join(pending) + text[pos:]

    # okay, nil remaining unless something was left open
    return ''.join(pending)

# multiline templates
# Tstops holds the names of templates whose calls may run over several
# lines; it starts empty and main() fills it from the wiki.
Tstops = set()
# Leading list/indent markup, then "{{" and the template name (group 1).
# The name may contain any digit, including 0.
retstop = re.compile(r'[\#\*\s:\|]*\{\{([-a-zA-Z0-9 ]+)')
# some lines that start with parameter names, not | ... real crock this ...
Pars = ( '1500s=', '1600s=', '1700s=', '1800s=', '1900s=', '2000s=', 'passage=')

# {{SAMPA...}} / {{X-SAMPA...}} calls, whose content may use a lone "{"
resampa = re.compile(r'\{\{(X-|)SAMPA.*?\}\}')
# a single "{" enclosed in /.../ or in [...], i.e. SAMPA-like text used
# outside the template
reslash = re.compile(r'/[^\{\}/]*\{[^\{\}/]*/')
rebrack = re.compile(r'\[[^\{\}\]]*\{[^\{\}\]]*\]')
# {{R:CAL|...}} and {{R:LSJ|...}} reference templates
rercal = re.compile(r'\{\{R:(CAL|LSJ)\|.*?\}\}')
# HTML comments
redecom = re.compile(r'<!--.*?-->')

def mismatch(s):
    """Return True when the wikitext line *s* has unbalanced (), [] or {}.

    Comments are dropped first.  Text on either side of a <ref ...> or
    </ref> tag is judged separately, lines known to be part of a multi-line
    construct are accepted as they are, and SAMPA-like spans and R:CAL /
    R:LSJ templates are blanked before the brackets are matched with mrd().
    """

    # remove comments first
    s = redecom.sub('', s)

    # <ref> tags: check what precedes and what follows on their own
    before, tag, after = s.partition("<ref>")
    if tag:
        return mismatch(before) or mismatch(after)

    before, tag, after = s.partition("<ref")
    if tag:
        attrs, close, rest = after.partition(">")
        if not close:
            # the tag itself is never closed
            return True
        if attrs.endswith('/'):
            # self-closing tag: the text simply continues across it
            return mismatch(before + rest)
        return mismatch(before) or mismatch(rest)

    before, tag, after = s.partition("</ref>")
    if tag:
        return mismatch(before) or mismatch(after)

    # tables, end of templates
    if s.startswith(('{|', '|}', '}}')) or s.lstrip() == '}}':
        return False

    # first line of a template listed as multi-line, left open on this line
    opening = retstop.match(s)
    if opening and opening.group(1).strip() in Tstops and '}}' not in s:
        return False

    # just ignore crap:
    if '{{sh-' in s:
        return False

    # last line of a multi-line template
    if s.endswith('}}') and (s.startswith('|') or s.startswith(Pars)):
        return False

    # blank out, in this order: SAMPA templates, anything that looks like
    # SAMPA between slashes or brackets, then R:CAL / R:LSJ content
    for pattern in (resampa, reslash, rebrack, rercal):
        s = pattern.sub(' ', s)

    return bool(mrd(s))

# (sporked from Tbot/script, no need to keep up to date):

# table of scripts, each is lowest character code point, highest code + 1, ISO script
# tkey() scans the rows in order and uses the first range containing the code point

Scs = [
         (0x0080, 0x0250, 'Latin'),
         (0x0250, 0x02B0, 'IPA'),
         (0x0370, 0x0400, 'Greek'),
         (0x0400, 0x0530, 'Cyrillic'),
         (0x0530, 0x0590, 'Armenian'),
         (0x0590, 0x0600, 'Hebrew'),
         (0x0600, 0x0700, 'Arabic'),
         (0x0700, 0x0750, 'Syriac'),
         (0x0750, 0x0780, 'Arabic Ext'),
         (0x0900, 0x0980, 'Devanagari'),
         (0x0980, 0x0A00, 'Bengali'),
         (0x0C00, 0x0C80, 'Telugu'),
         (0x0D00, 0x0D80, 'Malayalam'),
         (0x10A0, 0x1100, 'Georgian'),  # was 0x1A00..0x1100, an empty range

         (0x1E00, 0x1F00, 'Latin Ext'),
         (0x1F00, 0x2000, 'Greek Ext'),

         (0x3040, 0x30A0, 'Hiragana'),
         (0x30A0, 0x3100, 'Katakana'),
         (0x3400, 0xA000, 'Han'),     # Han Ext A and Unified
         (0xAC00, 0xD800, 'Hangeul'),

         (0x20000, 0x2A6D7, 'Han Ext B') ]  # Han Ext B

def tkey(word):
    """Generate the TOC key (report section name) for *word*.

    A word whose first character sorts at or below 'z' (which includes the
    empty string) is keyed by that character.  Any other word is keyed by
    the script name from Scs that covers its first code point; if no row
    covers it, a message is printed and the first character is used.
    """

    first = word[:1]

    # simple case first, also handles ''
    if first <= 'z':
        return first

    code = ord(first)
    if 0xd800 <= code < 0xdc00:
        # first half of a UTF-16 surrogate pair: fold in the second half
        if len(word) < 2:
            return word  # ouch!
        second = ord(word[1:2])
        code = (code - 0xd800) * 1024 + (second - 0xdc00) + 0x10000

    for low, high, scode in Scs:
        if low <= code < high:
            break
    else:
        scode = ''

    if not scode:
        print("no match for script for char code %x" % code)
        return first

    return scode

# How long an entry recorded as fixed is trusted before it is looked at again.
_RECHECK_SECONDS = 70 * 24 * 3600

# Closes the <tt><nowiki> opened in front of each quoted line.  Written in
# two pieces so the complete closing tag never appears in this source
# (presumably this file is itself kept on a wiki page inside a nowiki
# block, which a literal closing tag would end early -- confirm).
_CLOSE_QUOTE = '</' + 'nowiki></tt>'


def _has_mismatch(text):
    """Return True as soon as one line of *text* fails mismatch()."""
    for line in text.splitlines():
        if mismatch(line):
            return True
    return False


def _fetch_current(site, title):
    """Fetch the live wikitext of *title*.

    Returns the text; '' when there is nothing to check (page gone, or the
    fetch failed and should simply be retried on a later run); or the
    placeholder '(redirect page)' when the entry has become a redirect, so
    that it scans clean and gets recorded as fixed.
    """
    try:
        page = wikipedia.Page(site, title)
        return getwikitext(page)
    except wikipedia.NoPage:
        print("Can't get %s from en.wikt!" % safe(page.aslink()))
        return ''
    except wikipedia.IsRedirectPage:
        print("%s is now a redirect page" % safe(title))
        return '(redirect page)'
    except KeyboardInterrupt:
        raise
    except Exception:
        print("unknown exception, maybe timeout")
        return ''


def _mismatched_lines(text):
    """Return (section, lines) for the mismatched lines of *text*.

    *section* counts the lines starting with '=' seen before the first
    mismatched line (0 means the part before any heading); *lines* holds
    every mismatched line in order.
    """
    section = 0
    first = None
    bad = []
    for line in text.splitlines():
        if line.startswith('='):
            section += 1
        if mismatch(line):
            # None, not 0, marks "not seen yet": a first hit in the lead
            # part (section 0) must not be replaced by a later one
            if first is None:
                first = section
            bad.append(line)
    return first or 0, bad


def _report_entry(site, title, section, lines):
    """Build the wikitext list item for one entry and its quoted lines."""
    url = wikipedia.Page(site, title).urlname()
    parts = ["* [[%s]] [http://en.wiktionary.org/w/index.php"
             "?title=%s&action=edit&section=%d (edit)]" % (title, url, section)]
    for line in lines:
        parts.append('*: <tt><nowiki>' + line + _CLOSE_QUOTE)
    return '\n'.join(parts)


def _scan_dump(site, Stops, replimit, test, tmod):
    """Walk the XML dump and collect report items for entries still broken.

    Entries are screened against the dump text first.  A candidate that is
    not in the work cache is checked again, against the live text while
    fewer than *replimit* problems have been found and against the dump
    text after that.  Entries that now scan clean are written to the cache.

    Returns (entries, probs, reps, reports): entries examined, candidates
    found in the dump, problems confirmed, and a dict mapping title to
    report item (at most *replimit* of them).
    """
    dump = xmlreader.XmlDump("en-wikt.xml")

    reports = {}
    entries = 0
    probs = 0
    reps = 0
    cis = 0

    for entry in dump.parse():
        text = entry.text
        title = entry.title
        # titles containing ':' are skipped, as are empty titles
        if not title or ':' in title:
            continue

        entries += 1
        if entries % 10000 == 0:
            print("%d entries, %d problems" % (entries, probs))

        if test and entries % tmod != 0:
            continue
        if title in Stops:
            continue

        # screen entries:
        if not _has_mismatch(text):
            continue

        ckey = safe(title)  # must be string for bsd dbm
        if ckey in cache and cache[ckey] > time() - _RECHECK_SECONDS:
            print("%s in 70 day cache, not checked" % safe(title))
            continue

        probs += 1

        # ... pick up current version from en.wikt
        if reps < replimit:
            print('%s is possible problem, getting current entry' % safe(title))
            text = _fetch_current(site, title)
        else:
            print('%s is possible problem' % safe(title))

        if not text:
            continue

        section, bad = _mismatched_lines(text)

        # if fixed, add to cache so we don't keep re-checking
        if not bad:
            print("%s has been fixed" % safe(title))
            cache[ckey] = time()  # entry has been fixed for now
            cis += 1
            if cis % 20 == 0:
                cache.sync()
            continue

        # don't write any change to entry, report:
        for line in bad:
            print("%d %s %s" % (reps, safe(title), safe(line)))
        if reps < replimit:
            reports[title] = _report_entry(site, title, section, bad)
        reps += 1

        if test and reps > replimit:
            break

    return entries, probs, reps, reports


def _build_report(reports, Stops, reps, replimit, test):
    """Assemble the wikitext of the report page from the collected items."""
    ss = ', '.join(sorted(Stops))
    ts = ', '.join(sorted(Tstops))

    # NOTE(review): the "checked against live wiki" date is fixed text, and
    # list or bold markup may have been lost from these lines -- confirm
    # against the page this report is written to.
    parts = ["""

occurances of mismatched wikisyntax

from XML dump as of %s, checked against live wiki 9 July 2010
checks that (), [], {} match, correctly nested
matches may not be perfect at this point
of course, some of these are not errors
some entries are listed as "stops" and not shown (smiley ;-)
stops in effect: %s
from User:Robert Ullmann/Mismatched wikisyntax/stops
multiline templates ignored at present: %s
from User:Robert Ullmann/Mismatched wikisyntax/multiline
contents of SAMPA template ignored, as SAMPA uses {, also tries to avoid others by looking for /...{.../, but may miss something or produce spurious errors as a result
also X-SAMPA, and brackets as well as slashes
%d total problems, limit of %d shown

Please do section edit and remove completed entries, the automation will then recheck them. If you do most of a section but not quite all, feel free to just blank the section, any leftovers will get picked up again.

""" % (xmldate.enXMLdate, ss, ts, reps, replimit)]

    if test:
        parts.append(" this is a test run, you want to look at User:Robert Ullmann/Mismatched wikisyntax\n")

    # One heading per TOC key, with numbered continuation headings so that
    # no section grows long.  As before, the first section of a key takes
    # 9 items and each continuation takes 10.
    prev = ''
    count = 0
    part = 1
    for t in sorted(reports):
        key = tkey(t)   # once per title, so a "no match" note prints once
        if key != prev:
            parts.append('\n==' + key + '==\n\n')
            prev = key
            count = 0
            part = 1
        count += 1
        if count > 9:
            part += 1
            parts.append('\n==' + key + ' (%d)==\n\n' % part)
            count = 0
        parts.append(reports[t] + '\n')

    return ''.join(parts)


def main():
    """Scan the dump for mismatched wikisyntax and publish the report.

    Logs in, reads the lists of entries and multi-line templates to ignore
    from the wiki, scans the XML dump, and writes the resulting report to
    the report page (or to its /test subpage in test mode).
    """

    socket.setdefaulttimeout(240)

    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()

    # read Stops (entry names to ignore) and Tstops
    Stops = set()
    page = wikipedia.Page(site, "User:Robert Ullmann/Mismatched wikisyntax/stops")
    Stops.update(re.findall(r'\[\[(.*?)\]\]', getwikitext(page)))

    page = wikipedia.Page(site, "User:Robert Ullmann/Mismatched wikisyntax/multiline")
    Tstops.update(re.findall(r'\{\{temp\|(.*?)\}\}', getwikitext(page)))

    print('found %d stops, %d multilines' % (len(Stops), len(Tstops)))

    replimit = 1000

    # testing: look at every tmod'th entry only, with a smaller limit
    test = False
    tmod = 20
    if test:
        replimit //= tmod
        print("in test mode")

    # close the work cache even if the scan dies part way, so entries
    # recorded since the last sync are not lost
    try:
        entries, probs, reps, reports = _scan_dump(site, Stops, replimit, test, tmod)
    finally:
        cache.close()

    print("%d entries, %d problems" % (entries, probs))

    if not test:
        page = wikipedia.Page(site, "User:Robert Ullmann/Mismatched wikisyntax")
    else:
        page = wikipedia.Page(site, "User:Robert Ullmann/Mismatched wikisyntax/test")

    # The old report text is not used; the page is still loaded before it
    # is saved, as it always was (presumably put() relies on that -- confirm).
    try:
        page.get()
    except wikipedia.NoPage:
        pass

    report = _build_report(reports, Stops, reps, replimit, test)

    wikipedia.setAction("regenerate, add more")
    page.put(report)

    # done

   # done

if __name__ == "__main__":

    # always call stopme() on the way out, whether or not main() raised
    try:
        main()
    finally:
        wikipedia.stopme()

# User:Robert Ullmann/Mismatched wikisyntax/code
#
# Navigation menu
#
# Search