User:Robert Ullmann/Mismatched wikisyntax/code
Jump to navigation
Jump to search
#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Robert Ullmann/Mismatched_wikisyntax/code

"""
This bot looks for and executes replacements, customized for each run

This version looks for unmatched wikisyntax and parens

No command line arguments.

"""

import wikipedia
import xmlreader
import sys
import re
import pickle
import xmldate
import socket
from mwapi import getwikitext

def safe(s):
    """Return a plain-string rendering of s, used for printing and as a cache key.

    The value is pickled and the first character and the last five
    characters of the pickle are sliced off.
    NOTE(review): presumably this relies on the Python 2 text pickle layout
    (one type-code character in front, a short memo/stop trailer behind) --
    confirm before changing the pickle protocol or the Python version.
    """
    return pickle.dumps(s)[1:-5]

# work cache, record time last looked at entry
# each record is key: lc:word, pickled with safe(), value is time() of the check

import shelve
cache = shelve.open("mismatch")
from time import time

# we want to match [] {} (), mrd returns remainder

# opening bracket -> the closing bracket that balances it
_CLOSERS = {'(': ')', '[': ']', '{': '}'}

def mrd(s):
    """Consume balanced (), [] and {} from s and return what is left over.

    Returns '' when every bracket in s is matched and correctly nested.
    Otherwise returns a non-empty remainder that begins at the first
    unmatched bracket (an opener that was never closed, or a stray closer).
    Callers only need the truthiness of the result.
    """

    # blanks, simple punctuation and lower case letters at either end can
    # never be brackets; dropping them up front just shortens the scan
    s = s.strip(' .,;abcdefghijklmnopqrstuvwxyz')

    # look for an open:
    while s:
        first = s[0]

        # if we find a close, return it (the caller decides if it matches)
        if first in (')', ']', '}'): return s

        if first in _CLOSERS:
            # balance everything after the opener, then expect its closer
            rest = mrd(s[1:])
            if rest[0:1] == _CLOSERS[first]: return mrd(rest[1:])
            # wrong closer, or none at all: report from the opener onwards
            return first + rest

        # any other character is irrelevant
        s = s[1:]

    # okay, nil remaining
    return s

# multiline templates: names of templates that may be left open at the end
# of a line; the set starts empty and is filled in at run time
Tstops = set()
# captures the template name after optional list/indent markup and '{{'
# NOTE(review): the digit class is 1-9, so a name containing 0 stops
# matching at that digit -- confirm whether 0-9 was intended
retstop = re.compile(r'[\#\*\s:\|]*\{\{([-a-zA-Z1-9 ]+)')

# some lines that start with parameter names, not | ... real crock this ...
# prefixes of lines that begin with a template parameter name instead of '|';
# such a line is skipped by mismatch() when it also ends with '}}'
Pars = ('1500s=', '1600s=', '1700s=', '1800s=', '1900s=', '2000s=', 'passage=')

resampa = re.compile(r'\{\{(X-|)SAMPA.*?\}\}')
reslash = re.compile(r'/[^\{\}/]*\{[^\{\}/]*/')
# NOTE(review): this pattern starts with ']' rather than '['; left exactly as
# found -- confirm whether '[...{...]' was the intended match
rebrack = re.compile(r'\][^\{\}\]]*\{[^\{\}\]]*\]')
rercal = re.compile(r'\{\{R:(CAL|LSJ)\|.*?\}\}')
redecom = re.compile(r'<!--.*?-->')

def mismatch(s):
    """Return True if the single wikitext line s has unbalanced (), [] or {}.

    Comments are removed first, <ref> markup splits the line into pieces
    that are checked separately, and a number of line shapes that are
    legitimately unbalanced on their own (table markup, the pieces of a
    multiline template, SAMPA and R:CAL/R:LSJ template content) are skipped
    or blanked before the bracket matcher mrd() is applied.
    """

    # remove comments first
    s = redecom.sub('', s)

    # temp: ignore esbot debris (should all be fixed now)
    # if 'esbot:catline' in s: return False

    # remove <ref> tags and handle content separately, recurse
    if "<ref>" in s:
        before, after = s.split("<ref>", 1)
        return mismatch(before) or mismatch(after)
    elif "<ref" in s:
        before, after = s.split("<ref", 1)
        # tag is never closed on this line
        if ">" not in after: return True
        attrs, rest = after.split(">", 1)
        # self-closing <ref ... />: nothing inside, check the line without it
        if attrs.endswith('/'): return mismatch(before + rest)
        return mismatch(before) or mismatch(rest)
    if "</ref>" in s:
        before, after = s.split("</ref>", 1)
        return mismatch(before) or mismatch(after)

    # tables, end of templates
    if s.startswith(('{|', '|}', '}}')): return False
    if s.lstrip() == '}}': return False

    # multiline templates: a listed template opened but not closed here
    mo = retstop.match(s)
    if mo and mo.group(1).strip() in Tstops and '}}' not in s: return False

    # just ignore crap:
    if '{{sh-' in s: return False

    # and end of same
    if s.startswith('|') and s.endswith('}}'): return False
    if s.startswith(Pars) and s.endswith('}}'): return False

    # remove SAMPA template, uses { in some cases
    s = resampa.sub(' ', s)
    # and anything that looks like it, for one {, either / or bracket
    s = reslash.sub(' ', s)
    s = rebrack.sub(' ', s)

    # R:CAL template used in Hebrew entries often contains ), ignore content
    # also R:LSJ
    s = rercal.sub(' ', s)

    # anything mrd() could not pair up is a mismatch
    return bool(mrd(s))

# (sporked from Tbot/script, no need to keep up to date):
# table of scripts, each is lowest character code point, highest code + 1, ISO script

Scs = [
    (0x0080, 0x0250, 'Latin'),
    (0x0250, 0x02B0, 'IPA'),
    (0x0370, 0x0400, 'Greek'),
    (0x0400, 0x0530, 'Cyrillic'),
    (0x0530, 0x0590, 'Armenian'),
    (0x0590, 0x0600, 'Hebrew'),
    (0x0600, 0x0700, 'Arabic'),
    (0x0700, 0x0750, 'Syriac'),
    (0x0750, 0x0780, 'Arabic Ext'),
    (0x0900, 0x0980, 'Devanagari'),
    (0x0980, 0x0A00, 'Bengali'),
    (0x0C00, 0x0C80, 'Telugu'),
    (0x0D00, 0x0D80, 'Malayalam'),
    # was (0x1A00, 0x1100): low above high, so it could never match;
    # the Unicode Georgian block starts at U+10A0
    (0x10A0, 0x1100, 'Georgian'),
    (0x1E00, 0x1F00, 'Latin Ext'),
    (0x1F00, 0x2000, 'Greek Ext'),
    (0x3040, 0x30A0, 'Hiragana'),
    (0x30A0, 0x3100, 'Katakana'),
    (0x3400, 0xA000, 'Han'),            # Han Ext A and Unified
    (0xAC00, 0xD800, 'Hangeul'),
    (0x20000, 0x2A6D7, 'Han Ext B'),    # Han Ext B
]

def tkey(word):
    """Generate a TOC (section heading) key for a given word.

    Returns the first character when it sorts at or below 'z' (this also
    covers the empty string), otherwise the script name from Scs whose range
    holds the first code point.  When no range matches, a message is printed
    and the first character is returned.
    """

    # simple case first, also handles ''
    if word[:1] <= 'z': return word[:1]

    a = ord(word[0:1])
    if 0xd800 <= a < 0xdc00:
        # first half of a "UTF-16" surrogate pair, combine with the second
        if len(word) < 2: return word # ouch!
        b = ord(word[1:2])
        a = (a - 0xd800) * 1024 + (b - 0xdc00) + 0x10000

    for low, high, scode in Scs:
        if low <= a < high: return scode

    print("no match for script for char code %x" % a)
    return word[:1]

def main():
    """Scan the XML dump for mismatched wikisyntax and write the report page.

    Entries whose dump text trips mismatch() are re-read from the live wiki;
    those that now pass are recorded in the shelve cache and skipped for 70
    days, the rest are collected (up to replimit of them) into a report that
    is saved to the wiki.  No entry is ever edited.
    """

    socket.setdefaulttimeout(240)

    # list of entry names to ignore
    Stops = set()

    # title -> wikitext report line(s) for that entry
    reports = {}

    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()

    # read Stops and Tstops
    page = wikipedia.Page(site, "User:Robert Ullmann/Mismatched wikisyntax/stops")
    text = getwikitext(page)
    Stops.update(re.findall(r'\[\[(.*?)\]\]', text))
    page = wikipedia.Page(site, "User:Robert Ullmann/Mismatched wikisyntax/multiline")
    text = getwikitext(page)
    Tstops.update(re.findall(r'\{\{temp\|(.*?)\}\}', text))
    print('found %d stops, %d multilines' % (len(Stops), len(Tstops)))

    # get XML dump
    dump = xmlreader.XmlDump("en-wikt.xml")

    entries = 0
    probs = 0
    reps = 0            # problems found, including those past the limit
    replimit = 1000     # at most this many are put in the report
    cis = 0             # cache inserts since start, for periodic sync

    # testing
    test = False
    tmod = 20
    if test:
        replimit //= tmod   # floor division, the limit stays an integer
        print("in test mode")

    for entry in dump.parse():
        text = entry.text
        title = entry.title
        if ':' in title: continue
        # if title.find('/') >= 0: continue
        if not title: continue # ?

        entries += 1
        if entries % 10000 == 0: print("%d entries, %d problems" % (entries, probs))

        # if test and title[0:1] != 'c': continue
        if test and entries % tmod != 0: continue

        if title in Stops: continue

        # screen entries: any suspicious line in the dump text?
        tag = False
        for line in text.splitlines():
            if mismatch(line):
                tag = True
                break
        if not tag: continue

        # now see if it is something that should be reported:

        ckey = safe(title) # must be string for bsd dbm
        if ckey in cache:
            last = cache[ckey]
            if last > time() - (70 * 24 * 3600):
                print("%s in 70 day cache, not checked" % safe(title))
                continue

        probs += 1

        # ... pick up current version from en.wikt

        if reps < replimit:
            print('%s is possible problem, getting current entry' % safe(title))

            try:
                page = wikipedia.Page(site, title)
                # text = page.get()
                text = getwikitext(page)
            except wikipedia.NoPage:
                print("Can't get %s from en.wikt!" % safe(page.aslink()))
                text = ''
            except wikipedia.IsRedirectPage:
                print("%s is now a redirect page" % safe(title))
                text = '(redirect page)' # will be treated as fixed and added to cache
            except KeyboardInterrupt:
                raise
            except Exception:
                print("unknown exception, maybe timeout")
                continue   # do this again next time
        else:
            # past the report limit: only count it, keep using the dump text
            print('%s is possible problem' % safe(title))

        if not text: continue

        # check each line for mismatches
        act = ''
        for line in text.splitlines():
            if mismatch(line):
                act = 'mismatched syntax'
                break

        # if fixed, add to cache so we don't keep re-checking
        if not act:
            print("%s has been fixed" % safe(title))
            cache[ckey] = time() # entry has been fixed for now
            cis += 1
            if cis % 20 == 0: cache.sync()
            continue

        # don't write any change to entry, report:

        reporting = reps < replimit
        if reporting:
            xp = wikipedia.Page(site, title)
            url = xp.urlname()
            # SECTXX is replaced below, once the section number is known
            repline = \
 "* [[%s]] [http://en.wiktionary.org/w/index.php?title=%s&action=edit&section=SECTXX (edit)]" % (title, url)

        # go isolate the lines
        s = 0       # number of heading lines seen so far
        se = 0      # value of s at the first bad line
        for line in text.splitlines():
            if line.startswith('='): s += 1
            if mismatch(line):
                if not se: se = s
                # only build the report text when it will actually be stored
                if reporting:
                    repline += '\n*: <tt><nowiki>' + line + '</nowiki></tt>'
                print("%s %s %s" % (reps, safe(title), safe(line)))

        if reporting:
            repline = repline.replace('SECTXX', "%d" % se)
            reports[title] = repline
        reps += 1

        if test and reps > replimit: break

        # no corrections here!

    print("%d entries, %d problems" % (entries, probs))
    cache.close()

    if not test: page = wikipedia.Page(site, "User:Robert Ullmann/Mismatched wikisyntax")
    else: page = wikipedia.Page(site, "User:Robert Ullmann/Mismatched wikisyntax/test")

    # result is not used; the read is kept as it was ahead of the put() below
    try:
        oldrep = page.get()
    except wikipedia.NoPage:
        pass

    ss = ', '.join(sorted(Stops))
    ts = ', '.join(sorted(Tstops))

    # NOTE(review): list markup ('*') was reconstructed from a rendered copy
    # of this text -- confirm against the live report page.
    # NOTE(review): the "checked against live wiki" date is hard-coded and
    # presumably edited by hand for each run.
    report = """
occurances of mismatched wikisyntax
* from XML dump as of %s, checked against live wiki 9 July 2010
* checks that (), [], {} match, correctly nested
* matches may not be perfect at this point
* of course, some of these are not errors
* some entries are listed as "stops" and not shown (smiley ;-)
* stops in effect: %s
* multiline templates ignored at present: %s
* contents of SAMPA template ignored, as SAMPA uses {, also tries to avoid others by looking for /...{.../, but may miss something or produce spurious errors as a result
* also X-SAMPA, and brackets as well as slashes
* %d total problems, limit of %d shown

Please do section edit and remove completed entries, the automation will then recheck them. If you do most of a section but not quite all, feel free to just blank the section, any leftovers will get picked up again.
""" % (xmldate.enXMLdate, ss, ts, reps, replimit)

    if test: report += " this is a test run, you want to look at User:Robert Ullmann/Mismatched wikisyntax\n"

    # one section per TOC key, with a numbered continuation section
    # started each time the running count passes nine
    prev = s = 0
    i = 1
    for t in sorted(reports):
        key = tkey(t)
        if key != prev:
            report += '\n==' + key + '==\n\n'
            prev = key
            s = 0
            i = 1
        s += 1
        if s > 9:
            i += 1
            report += '\n==' + key + ' (%d)==\n\n' % i
            s = 0
        report += reports[t] + '\n'

    wikipedia.setAction("regenerate, add more")
    page.put(report)

    # done

if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()