User:Interwicket/code/iwiktrc

#!/usr/bin/python
# -*- coding: utf-8  -*-
 
"""
This bot updates iwiki links between wiktionaries
 
22.1.9: try reading RC from various wikts and adding to en.wikt (just for fun)
 
24.1.9: try hunting down iwikis for new en.wikt entries
 
26.1.9: try adding reciprocals; can then use this in full run?
 
"""
 
import wikipedia
import xmlreader
import sys
import socket
import re
import pickle
import pagegenerators
import time
from random import randrange
from mwapi import getwikitext, getedit
from reciprocal import addrci, replink, plock
# borrow global:
from config import usernames
 
def safe(s):
    return pickle.dumps(s)[1:-5]
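# a sketch of what safe() relies on (assuming the Python 2 protocol-0
# pickle format): pickle.dumps(u'foo') is 'Vfoo\np0\n.', with most
# non-ASCII characters escaped, so slicing off the leading 'V' and the
# 5-byte trailer '\np0\n.' leaves a console-safe representation, e.g.
#   safe(u'\u65e5\u672c') -> '\\u65e5\\u672c'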
 
# Iwiki cache:
 
# not used quite yet:
"""
import shelve
Iwikis = None
 
def iwopen(home):
    global Iwikis
 
    Iwikis = shelve.open(home + "-iwiki-cache")
 
cis = 0
def iwadd(title, iws, upd = True):
    global cis
 
    if safe(title) in Iwikis and not upd: return
    if not iws or not len(iws): return
 
    # with plock: print "iwikis cache %s: %s" % (safe(title), safe(u' '.join(iws)))
    Iwikis[safe(title)] = iws
 
    cis += 1
    if cis % 100 == 0: Iwikis.sync()
 
    return
"""
 
Lcode = { }
Exists = set()
Active = set()
site = { }
naps = { }
 
def now(): return int(time.time()) # wall-clock seconds (time.clock() is CPU time on Unix)
 
# yield title and language code of the FL wikt for recent changes in the other wikts
 
def recent(home = 'en'):
 
    # set up list of wikt codes to look at
 
    qtime = { }
    maxnap = 350 * 60 # almost 6 hours
    for lc in Exists:
        # if lc == home: continue
        site[lc] = wikipedia.getSite(lc, "wiktionary")
        qtime[lc] = now()
        naps[lc] = 60 * randrange(20, 71) # scatter 20 to 70 minutes
        if lc == home: naps[lc] = 300 # five minutes for the home wikt
 
    # entries seen already (just let this grow?)
    seen = set()
    ny = 0
 
    rcex = re.compile(r'title="(.+?)"')
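    # the recentchanges reply is XML with one element per change, roughly
    # (a representative sketch): <rc type="new" ns="0" title="foo" />
    # rcex just pulls out the title attribute values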
 
    while True:
 
        # sleep until next one
        nextq = now() + 1000000
        nextlc = ''
        for lc in qtime:
            if qtime[lc] < nextq:
                nextq = qtime[lc]
                nextlc = lc
        st = nextq - now()
        if st > 90:
            with plock: print "(%d, sleeping %d minutes, %s next)" % (now(), (st+29)/60, nextlc)
        if st > 0:
            time.sleep(st)
        if st < -120:
            with plock: print "(%d minutes behind)" % (-(st-29)/60)
        lc = nextlc
 
        # read recentchanges, new entries, namespace 0, from site:
 
        if True: # [indent]
 
            with plock: print "(%d, reading from %s.wikt)" % (now(), lc)
 
            # set parameters
 
            # one hour ago back to one day ago
            rcend = '&rcend=' + time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(time.time() - 86400))
            rcstart = '&rcstart=' + time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(time.time() - 3600))
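            # e.g. rcstart = '&rcstart=2009-01-26T17:00:00Z' (an hour ago)
            # and  rcend = '&rcend=2009-01-25T18:00:00Z' (a day ago);
            # rcstart is the newer bound because the list runs newest first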
 
            if lc == home:
                rcshow = "&rcshow=patrolled|!bot" # avoid junk, large numbers of bot forms
                sysop = True # need patrol right on login used
            else:
                rcshow = ''
                sysop = False
 
            rclimit = "&rclimit=%d" % min(1 + ny/10, 200)
 
            # with plock: print "(options " + rcend + rcshow + rclimit + ")"
 
            try:
                rct = site[lc].getUrl("/w/api.php?action=query&list=recentchanges&format=xml&rcprop=title" +
                     "&rctype=new&rcnamespace=0"+rcend+rcstart+rcshow+rclimit, sysop = sysop)
            except wikipedia.NoPage:
                with plock: print "can't get recentchanges from %s.wikt" % lc
                # rct = ''
                # time.sleep(30)
                qtime[lc] = now() + 700  # do other things for a bit
                continue
            except KeyError:
                # local bogosity
                with plock: print "keyerror"
                time.sleep(20)
                continue
 
            if '<recentchanges />' in rct:
                # no changes in recent history
                pass
            elif '</recentchanges>' not in rct:
                with plock: print "some bad return from recentchanges, end tag not found"
                with plock: print safe(rct)
                # rct = ''
                time.sleep(30)
                qtime[lc] = now() + 300  # do other things for a bit
                continue
 
            found = False
            for title in rcex.findall(rct):
                if ':' in title: continue # other stray stuff in NS:0
 
                if lc + ':' + title not in seen:
                    seen.add(lc + ':' + title)
                    yield title, lc
                    ny += 1
                    found = True
 
            if found:
                naps[lc] /= 2
                # naps[lc] = max(naps[lc], 30) # thirty seconds
                Active.add(lc)
            else:
                mn = naps[lc]/300 # one-fifth, in minutes
                naps[lc] += 60 * randrange(5, 11 + mn) # five to ten minutes, or longer if we don't find anything
                naps[lc] = min(naps[lc], maxnap)
                if naps[lc] > maxnap/2: Active.discard(lc)
 
            qtime[lc] = now() + naps[lc]
            if naps[lc] > 90:
                with plock: print "(naptime for %s is %d minutes)" % (lc, (naps[lc]+29)/60)
            else:
                with plock: print "(naptime for %s is %d seconds)" % (lc, naps[lc])
 
# wiki-hunt ... see if a word is in other wikts, return a list
# the challenge is to find as many iwikis as possible without taking a huge amount of time
# also used by iwiktll, be careful with changes!
# note that the lcs argument can be a single code or a list of codes
 
re2head = re.compile(r'^==([^=]*)==$', re.M)
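# matches English-wikt language headers, e.g. u'==French==' captures u'French'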
 
def hunt(word, text, lc, lcs = '', home = 'en'):
 
    with plock: print "    ... hunting iwikis"
    totry = set()
    done = set()
    fps = set()
    links = { }
    redirs = { }
 
    reiw = re.compile(r'\[\[([a-z-]{2,11}):' + re.escape(word) + r'\]\]')
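    # e.g. for the word u'chat' this matches [[fr:chat]] and captures 'fr'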
 
    # was: for lc in Active: totry.add(lc) ... instead, magic occurs:
    if lc == home:
        # try hunting 10 most active wikts (11 because home will usually be in this list)
        totry = set( sorted(Active, key=lambda c: naps[c])[:11] )
    else:
        # if we found an FL title, start with that
        totry.add(lc)
 
    # other codes known to the caller (a single code or a list of codes)
    if isinstance(lcs, basestring): lcs = [lcs] if lcs else []
    for lc in lcs: totry.add(lc)
 
    # language header(s) in entry are good candidates (of course!)
    # [code specific to English wikt ...]
 
    for lang in re2head.findall(text):
        if lang in Lcode: totry.add(Lcode[lang])
 
    # simple scan for existing iwikis
 
    for lc in reiw.findall(text):
        lc = str(lc)
        if lc in site:
            totry.add(lc)
 
    # not home:
    totry.discard(home)
    done.add(home)
 
    exceptions = False
 
    while totry:
        lc = totry.pop()
 
        try:
            fpage = wikipedia.Page(site[lc], word)
            text = getwikitext(fpage)
            with plock: print "        found in", lc
        except wikipedia.NoPage:
            with plock: print "        not in", lc
            done.add(lc)
            continue
        except wikipedia.IsRedirectPage:
            redirs[lc] = fpage
            with plock: print "        found in", lc, "(redirect)"
        except Exception, e:
            exceptions = True
            with plock: print "exception testing existence of word", str(e)
            done.add(lc)
            continue
 
        done.add(lc)
        links[lc] = fpage
 
        # add to the set that gets reciprocal links / complete sets; we don't (can't :-) update redirects
        if lc not in redirs: fps.add(fpage)
 
        # look for iwikis in the page, add to to-be-tried if not already done
 
        for lc in reiw.findall(text):
            lc = str(lc) # not in unicode
            if lc not in site: continue # (!) else who knows what junk ...
            if lc not in done and lc not in totry:
                with plock: print "            found further iwiki", lc
                totry.add(lc)
 
    # all done, now add reciprocals
    # don't remove anything if there were exceptions because hunt may be incomplete
    # if no exceptions, hunt is complete for these entries (there may be others not seen,
    # but then they aren't linked, as we've looked at all links ...), so remove any
    # links not found:
 
    for fpage in fps:
        addrci(fpage, site[home], links=links, redirs=redirs, remove=not exceptions)
 
    # return list of all links and redirects, and flag if complete
    return links, redirs, not exceptions
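
# a typical call (a sketch; see main() below for the live call):
#
#   links, redirs, complete = hunt(title, text, lc)
#
# complete is False if any lookup raised an exception; callers then leave
# stale links in place rather than removing them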
 
 
def main():
 
    socket.setdefaulttimeout(40)
 
    home = 'en'
    xml = True
 
    # testing rc:
    xml = False
 
    """ just keep argv code for now
    for arg in sys.argv[1:]:
        if arg.startswith('-start:'):
            start = arg[7:]
            with plock: print "starting at %s" % start
        elif arg.startswith('-stop:'):
            stop = arg[6:]
            with plock: print "stopping at %s" % stop
        elif arg.startswith('-new'):
            newonly = True
            with plock: print "new entries only"
        elif arg.startswith('-sort'):
            sort = True
            with plock: print "do edits for sort"
        elif arg.startswith('-xml'):
            xml = True
            with plock: print "read XML file"
        elif arg.startswith('-update'):
            update = True
            with plock: print "update cache from XML (XML is current!)"
        else:
            with plock: print "unknown command line argument %s" % arg
    """
 
    mysite = wikipedia.getSite(home, 'wiktionary')
    # make sure we are logged in
    mysite.forceLogin()
    meta = wikipedia.getSite(code = "meta", fam = "meta")
 
    # get active wikt list
    # minus crap. Tokipona? what are they thinking? Klingon? ;-)
    Lstops = ['tokipona', 'tlh']
 
    page = wikipedia.Page(meta, "List of Wiktionaries/Table")
    existtab = page.get()
 
    """ entry looks like:
| [[w:Vietnamese language|Vietnamese]]
| [[w:Vietnamese language|Tiếng Việt]]
| [http://vi.wiktionary.org/wiki/ vi]
"""
 
    # reextab = re.compile(r'^\[\[:([a-z-]+):')
    # reextab = re.compile(r'\| \[http://([a-z-]+)\.wiktionary\.org')
    reextab = re.compile(r'^\| \[\[w:.*\|(.*)\]\]\n'
                         r'^\| .*\n'
                         r'^\| \[http://([a-z-]+)\.wiktionary\.org', re.M)
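    # against the sample entry quoted above this captures ('Vietnamese', 'vi'):
    # group(1) is the language name, group(2) the subdomain code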
    for mo in reextab.finditer(existtab):
        if mo.group(2) in Lstops: continue
        Exists.add(mo.group(2))
        Lcode[mo.group(1)] = mo.group(2)
        # see if we have a login in user config, else pretend we do
        # has to be done before any call, or login status gets confused!
        if mo.group(2) not in usernames['wiktionary']:
            usernames['wiktionary'][mo.group(2)] = "Interwicket"
 
    with plock: print "found %d active wikts" % len(Exists)
    if len(Exists) < 150: return
 
    # naps ... ;-)
    naptime = 0
    maxnap = 70
 
    # Iwikis cache [not updated for now]
    # iwopen(home)
 
    # build a table of existing entries from the XML dump
    # since we are following RC new entries, the iwiki itself will be new;
    # all we want here is an index of titles, so we don't have to do lots of en.wikt lookups
 
    enwikt = set()
    if xml:
      # get XML dump
      dump = xmlreader.XmlDump("../hancheck/en-wikt.xml")
 
      ti = 0
      entries = 0
      reds = 0
      iws = { } # in memory cache
 
      for entry in dump.parse():
        text = entry.text
        title = entry.title
        if ':' in title: continue
        # if title < start or (stop and title > stop): continue
        if text.startswith('#'): continue
        entries += 1
        if entries % 20000 == 0:
            with plock: print "prescan %d entries" % entries
        enwikt.add(title)
 
        # test:
        # if entries > 100000: break
 
        continue
 
      with plock: print "total  %d entries" % entries
 
    # now look for iwikis needed
 
    entries = 0
    probs = 0
    fixed = 0
 
    news = 0
    cbase = now() - 86400
    rate = 0.0
 
    for title, lc in recent():
 
        if ':' in title: continue # redundant, but eh?
 
        # canon title
        page = wikipedia.Page(mysite, title)
        title = page.title()
 
        # temp:
        # if lc == 'en' and title.startswith('Da'): continue
 
        if title.lower() == 'main page': continue
 
        news += 1
        rate = news*3600.0/(now()-cbase)
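        # cbase starts a day in the past, so the denominator is at least
        # ~86400 seconds and the observed rate ramps up smoothly from zero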
        if news % 100 == 0:
            with plock: print "(observed creation rate %.4f/hour)" % rate
 
        with plock: print "%s:%s" % (safe(lc), safe(title))
 
        # if looking at home wikt is enabled above, just add things (;-)
        """
        if lc == home:
             with plock: print "    ... added to en.wikt"
             enwikt.add(title)
             continue
        """
        if lc == home: tag = True
 
        # with the XML index, skip titles not in en.wikt; without it, always look at the entry
        if lc != home and xml and title not in enwikt:
             with plock: print "    ... %s not in en.wikt" % safe(title)
             continue
 
        # [look at cache, but unlikely, as this is new]
 
        tag = True
 
        # now see if it is something that should be tagged/replaced:
 
        if tag:
 
            probs += 1
            naptime += 1
 
            # ... pick up current version from en.wikt
 
            # with plock: print '%s is possible update, getting current entry' % safe(title)
 
            try:
                # text = page.get()
                text = getwikitext(page)
                oldtext = text
            except wikipedia.NoPage:
                with plock: print "    ... %s not in en.wikt" % safe(page.title())
                text = ''
            except wikipedia.IsRedirectPage:
                with plock: print "    ... redirect page"
                text = ''
            except KeyError:
                # annoying local error, from crappy framework code
                with plock: print "KeyError"
                time.sleep(200)
                continue
 
            if not text: continue
 
            if lc != home and '[[' + lc + ':' + title + ']]' in text:
                 with plock: print "    ... iwiki %s already in %s" % (safe(lc), safe(title))
                 continue
 
            # go hunt down some iwikis, add reciprocals when needed
            links, redirs, complete = hunt(title, text, lc)
 
            # then update this one (also queued to other thread):
            replink(page = page, links = links, redirs = redirs, remove = complete)
 
        # limit number of fixes for testing
        # if fixed > 7: break
 
        # pace [not used in the same way, reconsider]
        if naptime > maxnap: naptime = maxnap
        """
        if naptime > 4:
            with plock: print "sleeping %d seconds" % naptime
        time.sleep(naptime)
        """
        continue
 
    # [notreached]
    # with plock: print "%d entries, %d possible, %d updated" % (entries, probs, fixed)
 
    # done
 
if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()