User:Interwicket/code/iwikirc

#!/usr/bin/python
# -*- coding: utf-8  -*-

"""
This bot updates iwiki links between wiktionaries

22.1.9: try reading RC from various wikts and adding to en.wikt (just for fun)

24.1.9: try hunting down iwikis for new en.wikt entries

26.1.9: try adding reciprocals; can then use this in full run?

"""

import wikipedia
import xmlreader
import sys
import socket
import re
import pickle
import pagegenerators
import time
from random import randrange
from mwapi import getwikitext, getedit
from reciprocal import addrci
# borrow global:
from config import usernames

def safe(s):
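    # protocol-0 pickle of a unicode string looks like "V<escaped>\np0\n.";
    # stripping the leading "V" and the five trailing bytes leaves an
    # ASCII-escaped form that prints safely on any console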
    return pickle.dumps(s)[1:-5]

# Iwiki cache:

# not used quite yet:
"""
import shelve
Iwikis = None

def iwopen(home):
    global Iwikis

    Iwikis = shelve.open(home + "-iwiki-cache")

cis = 0
def iwadd(title, iws, upd = True):
    global cis

    if safe(title) in Iwikis and not upd: return
    if not iws or not len(iws): return

    # print "iwikis cache %s: %s" % (safe(title), safe(u' '.join(iws)))
    Iwikis[safe(title)] = iws

    cis += 1
    if cis % 100 == 0: Iwikis.sync() # sync every hundred additions

    return
"""

Lcode = { }    # English language name -> wikt language code
Exists = set() # codes of wikts that exist (from the meta table)
Active = set() # wikts that have produced new entries recently
site = { }     # code -> wikipedia.Site object
naps = { }     # code -> seconds to wait between polls

def now(): return int(time.time()) # wall clock; time.clock() is CPU time on POSIX and would stall the scheduler

# generator: yield (title, language code) for each new entry seen in recent changes on any of the wikts

def recent(home = 'en'):

    # set up list of wikt codes to look at

    qtime = { }
    maxnap = 350 * 60 # almost 6 hours
    for lc in Exists:
         # if lc == home: continue
         site[lc] = wikipedia.getSite(lc, "wiktionary")
         qtime[lc] = now()
         naps[lc] = 60 * randrange(20, 71) # scatter 20 to 70 minutes
         if lc == home: naps[lc] = 300 # five min for home wikt
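
    # each wikt gets its own timer: the loop below always wakes for whichever
    # wikt's qtime expires next, so productive wikts get polled often and
    # quiet ones only rarely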

    # entries seen already (just let this grow?)
    seen = set()
    ny = 0

    rcex = re.compile(r'title="(.+?)"')
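    # scrape the titles straight out of the API's XML with a regex instead of
    # a parser; each <rc> element carries a title="..." attribute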

    while True:

        # sleep until next one
        nextq = now() + 1000000
        nextlc = ''
        for lc in qtime:
            if qtime[lc] < nextq:
                nextq = qtime[lc]
                nextlc = lc
        st = nextq - now()
        if st > 90:
            print "(%d, sleeping %d minutes, %s next)" % (now(), (st+29)/60, nextlc)
        if st > 0:
            time.sleep(st)
        lc = nextlc

        # read recentchanges, new entries, namespace 0, from site:

        if True: # [indent kept so this block matches an earlier loop structure]

            print "(%d, reading from %s.wikt)" % (now(), lc)

            # set parameters

            # window: from one hour ago (rcstart) back to one day ago (rcend); the API walks backwards in time
            rcend = '&rcend=' + time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(time.time() - 86400))
            rcstart = '&rcstart=' + time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(time.time() - 3600))

            if lc == home:
                rcshow = "&rcshow=patrolled|!bot" # avoid junk, and the floods of bot-created forms
                sysop = True # filtering on patrolled requires the patrol right on the login used
            else:
                rcshow = ''
                sysop = False

            rclimit = "&rclimit=%d" % min(1 + ny/20, 200)

            # print "(options " + rcend + rcshow + rclimit + ")"

            try:
                rct = site[lc].getUrl("/w/api.php?

action=query&list=recentchanges&format=xml&rcprop=title" +
                     "&rctype=new&rcnamespace=0"+rcend+rcstart+rcshow+rclimit, sysop = sysop)
            except wikipedia.NoPage:
                print "can't get recentchanges from %s.wikt" % lc
                # rct = ''
                # time.sleep(30)
                qtime[lc] = now() + 700  # do other things for a bit
                continue
            if '<recentchanges />' in rct:
                # no changes in recent history
                pass
            elif '</recentchanges>' not in rct:
                print "some bad return from recentchanges, end tag not found"
                print safe(rct)
                # rct = ''
                # time.sleep(30)
                qtime[lc] = now() + 300  # do other things for a bit
                continue

            found = False
            for title in rcex.findall(rct):
                if ':' in title: continue # other stray stuff in NS:0

                if lc + ':' + title not in seen:
                    seen.add(lc + ':' + title)
                    yield title, lc
                    ny += 1
                    found = True

            if found:
                naps[lc] /= 2
                # naps[lc] = max(naps[lc], 30) # thirty seconds
                Active.add(lc)
            else:
                mn = naps[lc]/300 # one-fifth of the current nap, in minutes
                naps[lc] += 60 * randrange(5, 11 + mn) # back off five to ten minutes, or longer if we keep finding nothing
                naps[lc] = min(naps[lc], maxnap)
                if naps[lc] > maxnap/2: Active.discard(lc)

            qtime[lc] = now() + naps[lc]
            if naps[lc] > 90:
                print "(naptime for %s is %d minutes)" % (lc, (naps[lc]+29)/60)
            else:
                print "(naptime for %s is %d seconds)" % (lc, naps[lc])

# wiki-hunt ... see if a word is in other wikts, return the list of codes ...
# the challenge is to find as many as possible without taking a huge amount of time

re2head = re.compile(r'^==([^=]*)==$', re.M)

def hunt(word, text, lc, home = 'en'):

    iwikis = [ ]
    print "    ... hunting iwikis"
    totry = set()   # codes queued to be checked
    done = set()    # codes already checked
    present = set() # codes already iwiki-linked from the entry
    fps = set()     # foreign pages found, for reciprocal linking
    links = { }     # code -> page found there
    redirs = { }    # code -> page that turned out to be a redirect

    reiw = re.compile(r'\[\[([a-z-]{2,11}):' + re.escape(word) + r'\]\]')
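    # matches interwiki links to this exact word, e.g. [[fr:word]]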

    # for lc in Active: totry.add(lc) ... instead, some magic: start with the
    # eleven active wikts having the shortest naptimes (most productive lately)
    totry = set( sorted(Active, key=lambda c: naps[c])[:11] )

    # if we found an FL title, start with that
    if lc != home: totry.add(lc)

    # language header(s) in entry are good candidates (of course!)

    for lang in re2head.findall(text):
        if lang in Lcode: totry.add(Lcode[lang])

    # simple scan for existing iwikis

    for lc in reiw.findall(text):
        if lc in site:
            totry.add(lc)
            present.add(lc)

    # not home:
    totry.discard(home)
    done.add(home)

    while totry:
        lc = totry.pop()

        try:
            fpage = wikipedia.Page(site[lc], word)
            text = getwikitext(fpage)
        except wikipedia.NoPage:
            print "        not in", lc
            done.add(lc)
            continue
        except wikipedia.IsRedirectPage:
            redirs[lc] = fpage
            text = '' # a redirect still counts as found, but don't rescan the previous page's text
        except Exception, e:
            print "exception testing existence of word", str(e)
            done.add(lc)
            continue

        print "        found in", lc
        if lc not in present: iwikis.append(lc)
        done.add(lc)
        links[lc] = fpage

        # add to list to add reciprocal link, or complete set
        fps.add(fpage)

        # look for iwikis in the page, add to to-be-tried if not already done

        for lc in reiw.findall(text):
            if lc not in site: continue # (!) else who knows what junk ...
            if lc not in done and lc not in totry:
                print "            found further iwiki", lc
                totry.add(lc)

    # all done, now add reciprocals, don't remove anything because hunt may be incomplete
    # [we could remove ones we looked for and didn't find, but that is even more cases]

    for fpage in fps:
        addrci(fpage, site[home], links=links, redirs=redirs, remove=False)

    return sorted(iwikis)
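
# so e.g. hunt(u'amiko', text, 'eo') might return ['de', 'eo', 'fr']: codes of
# wikts where the page was found but not already linked (hypothetical example)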
           

def main():

    socket.setdefaulttimeout(40)

    home = 'en'
    xml = True

    # testing rc:
    xml = False

    """ just keep argv code for now
    for arg in sys.argv[1:]:
        if arg.startswith('-start:'):
            start = arg[7:]
            print "starting at %s" % start
        elif arg.startswith('-stop:'):
            stop = arg[6:]
            print "stopping at %s" % stop
        elif arg.startswith('-new'):
            newonly = True
            print "new entries only"
        elif arg.startswith('-sort'):
            sort = True
            print "do edits for sort"
        elif arg.startswith('-xml'):
            xml = True
            print "read XML file"
        elif arg.startswith('-update'):
            update = True
            print "update cache from XML (XML is current!)"
        else: print "unknown command line argument %s" % arg
    """

    mysite = wikipedia.getSite(home, 'wiktionary')
    # make sure we are logged in
    mysite.forceLogin()
    meta = wikipedia.getSite(code = "meta", fam = "meta")

    # get active wikt list
    # minus crap. Tokipona? what are they thinking? Klingon? ;-)
    Lstops = ['tokipona', 'tlh']

    page = wikipedia.Page(meta, "List of Wiktionaries/Table")
    existtab = page.get()

    """ entry looks like:
| [[w:Vietnamese language|Vietnamese]]
| [[w:Vietnamese language|Tiếng Việt]]
| [http://vi.wiktionary.org/wiki/ vi]
"""

    # reextab = re.compile(r'^\[\[:([a-z-]+):')
    # reextab = re.compile(r'\| \[http://([a-z-]+)\.wiktionary\.org')
    reextab = re.compile(r'^\| \[\[w:.*\|(.*)\]\]\n'
                         r'^\| .*\n'
                         r'^\| \[http://([a-z-]+)\.wiktionary\.org', re.M)
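    # matches three consecutive table rows per wikt: the English language name
    # (group 1), the native name (skipped), and the URL row whose subdomain is
    # the language code (group 2)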
    for mo in reextab.finditer(existtab):
        if mo.group(2) in Lstops: continue
        Exists.add(mo.group(2))
        Lcode[mo.group(1)] = mo.group(2)
        # see if we have a login in user config, else pretend we do
        # has to be done before any call, or login status gets confused!
        if mo.group(2) not in usernames['wiktionary']:
            usernames['wiktionary'][mo.group(2)] = "Interwicket"
 
    print "found %d active wikts" % len(Exists)
    if len(Exists) < 150: return

    # naps ... ;-)
    naptime = 0
    maxnap = 70

    # Iwikis cache [not updated for now]
    # iwopen(home)

    # build a table of existing entries from the XML dump
    # since we are watching RC for new entries, the iwiki itself will be new;
    # all we want here is an index of titles, so we don't have to do lots of en.wikt lookups

    enwikt = set()
    if xml:
      # get XML dump
      dump = xmlreader.XmlDump("../hancheck/en-wikt.xml")
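      # parse() yields one entry at a time, streaming the dump rather than
      # loading the whole file into memory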

      ti = 0
      entries = 0
      reds = 0
      iws = { } # in memory cache

      for entry in dump.parse():
        text = entry.text
        title = entry.title
        if ':' in title: continue
        # if title < start or (stop and title > stop): continue
        if text.startswith('#'): continue
        entries += 1
        if entries % 20000 == 0: print "prescan %d entries" % entries
        enwikt.add(title)

        # test:
        # if entries > 100000: break

        continue

      print "total  %d entries" % entries

    # now look for iwikis needed

    entries = 0
    probs = 0
    fixed = 0

    news = 0
    cbase = now() - 86400
    rate = 0.0

    for title, lc in recent():

        if ':' in title: continue # redundant, but eh?

        # temp:
        # if lc == 'en' and title.startswith('Da'): continue

        if title.lower() == 'main page': continue

        news += 1
        rate = news*3600.0/(now()-cbase)
        if news % 100 == 0: print "(observed creation rate %.4f/hour)" % rate

        print "%s:%s" % (safe(lc), safe(title))

        # if looking at home wikt is enabled above, just add things (;-)
        """
        if lc == home:
             print "    ... added to en.wikt"
             enwikt.add(title)
             continue
        """
        if lc == home: tag = True # (redundant for now: tag is set unconditionally below)

        # with the XML index, skip FL titles that aren't on en.wikt; without it, always look at the entry
        if lc != home and xml and title not in enwikt:
             print "    ... %s not in en.wikt" % safe(title)
             continue

        # [look at cache, but unlikely, as this is new]

        tag = True

        # now see if it is something that should be tagged/replaced:

        if tag:

            probs += 1
            naptime += 1

            # ... pick up current version from en.wikt

            # print '%s is possible update, getting current entry' % safe(title)

            try:
                page = wikipedia.Page(mysite, title)
                # text = page.get()
                text = getwikitext(page)
                oldtext = text
            except wikipedia.NoPage:
                print "    ... %s not in en.wikt" % safe(page.title())
                text = ''
            except wikipedia.IsRedirectPage:
                print "    ... redirect page"
                text = ''
            except KeyError:
                # annoying local error, from crappy framework code
                print "KeyError"
                time.sleep(200)
                continue

            if not text: continue

            act = ''

            if lc != home and '[[' + lc + ':' + title + ']]' in text:
                 print "    ... iwiki %s already in %s" % (safe(lc), safe(title))
                 act = 'sort iwikis'
                 # was added manually? so probably wrong ... (;-)
                 iwikis = [ ]
            else:
                # go hunt down some iwikis, add reciprocals when needed

                iwikis = hunt(title, text, lc)
                if iwikis:
                    act = "iwiki +" + ", ".join(iwikis)
                else:
                    print "    ... no iwikis found"

            if not act: continue

            linksites = wikipedia.getLanguageLinks(text)
            for lc in iwikis:
                fpage = wikipedia.Page(site[lc], title)
                linksites[site[lc]] = fpage

            newtext = wikipedia.replaceLanguageLinks(text, linksites, site = mysite)
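            # replaceLanguageLinks rewrites the entire interwiki block in the
            # sort order the framework's family file prescribes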

            newtext = newtext.replace('\r\n', '\n') # wikipedia brain-damage
            if newtext.rstrip(' \n') == text.rstrip(' \n'): continue # didn't change anything
            # wikipedia.showDiff(text, newtext)

            # update cache with links read: [revise, probably don't do anything here]
            # if not act: iwadd(title, oldlinks.keys())

        else: continue

        # some change, write it
        if act:

            fixed += 1
            naptime /= 2

            print "    ... updating %s: %s" % (safe(title), safe(act).strip("'"))

            # try to fix the entry
            try:
                utext = getedit(page)
                # utext = page.get()
                if utext != oldtext:
                    print "page changed during attempted update"
                    continue
                wikipedia.setAction(act)
                page.put(newtext)
                # no cache update [and "links" not set up]
                # iwadd(title, links.keys())
            except wikipedia.EditConflict:
                print "Edit conflict?"
                continue
            except wikipedia.PageNotSaved:
                print "failed to save page"
                # other action?
                continue
            except wikipedia.NoPage:
                print "Can't get %s from en.wikt?" % safe(page.aslink())
                continue
            except wikipedia.IsRedirectPage:
                print "Redirect page now?"
                continue
            except socket.timeout:
                print "socket timeout, maybe not saving page"
                continue
            except socket.error:
                print "socket error, maybe not saving page"
                continue
            except KeyError:
                # annoying local error, from crappy framework code
                print "KeyError"
                time.sleep(200)
                continue

        # limit number of fixes for testing
        # if fixed > 7: break

        # pace [not used in the same way, reconsider]
        if naptime > maxnap: naptime = maxnap
        """
        if naptime > 4:
            print "sleeping %d seconds" % naptime
        time.sleep(naptime)
        """
        continue

    # [notreached]
    # print "%d entries, %d possible, %d updated" % (entries, probs, fixed)

    # done

if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()
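
# to run (a sketch, assuming the pywikipedia framework is set up and
# user-config.py carries an Interwicket login for en.wiktionary):
#   python iwikirc.py
# it then polls recentchanges indefinitely; interrupt to stop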