User:Interwicket/code/iwiktll

Definition from Wiktionary, the free dictionary
Jump to: navigation, search

This is "stabilized" and I don't use it; see User:Interwicket/code/mbwa. If mbwa is given exactly one langlinks file to work with, it will do what this code will do, albeit in a different order. Provided here only because it was here; I may delete this page presently. Robert Ullmann 17:01, 10 February 2009 (UTC)

#!/usr/bin/python
# -*- coding: utf-8  -*-

"""
This bot updates iwiki links between wiktionaries

2.2.9: read langlinks.sql.gz dump, compare to union-index, re-evaluate conflicts

Run with options:

-langlinks:(filename)      it is not necessary to specify the "-langlinks.sql.gz" ending
                           overridden by -date if that is used
-home:(code)
-redir                     add links to redirects on this wikt, important to get right,
                           as it will otherwise remove desired links to redirects (not so yet)
-date:(date)               reads file "langlinks/(home)wiktionary-(date)-langlinks.sql.gz"

"""

import wikipedia
import xmlreader
import sys
import socket
import re
import pickle
import pagegenerators
import time
from random import choice
from mwapi import getwikitext, getedit
from reciprocal import addrci
# borrow global:
from config import usernames

def safe(s):
    """
    Return a trimmed pickle of s, used here to print titles and edit summaries.

    The result is pickle.dumps(s) with its first character and its last
    five characters sliced off.
    NOTE(review): presumably this relies on Python 2's default text pickle
    format, where the first character is a type code and the tail is the
    memo/stop marker -- confirm before running under another protocol.
    """
    pickled = pickle.dumps(s)
    trimmed = pickled[1:-5]
    return trimmed

import shelve

# use hunt routine in iwiktrc, should be able to maintain the same?
from iwiktrc import hunt, Exists, Lcode, site, naps
# things used by hunt(), use same copies!

def now():
    """
    Return the current wall-clock time as whole seconds since the epoch.

    Uses time.time() rather than time.clock(): on Unix time.clock() measures
    CPU time, which barely advances while the bot sleeps or waits on the
    network, and it no longer exists in current Python releases.
    """
    return int(time.time())

# read language links file, sort internally (not in the order we'd like ;-)
# compare to union index, yield titles that do not match, with language codes to hunt

import gzip

def llfile(home = '', filename = '', redirs = False):
    """
    Read a gzipped langlinks SQL dump and yield the titles whose links
    do not match the union index.

    home     -- language code of the wikt the dump belongs to; it is added to
                each title's link set before comparing and removed from the
                yielded set
    filename -- path of the langlinks.sql.gz file; if empty, nothing is yielded
    redirs   -- if true, the index's redirect codes are counted as expected
                links; if false, redirect codes present in the dump are ignored

    Yields (title, lcs, ur): the title, the set of language codes taken from
    the union index entry (minus home), and the entry's third field.
    NOTE(review): the shelf "union-index" is read from the current directory;
    each entry unpacks to (t, ul, ur), with ul/ur presumably the codes where
    the title exists / is a redirect -- confirm against the code that builds it.

    Progress and every comparison result are printed to stdout.
    """
    if not filename: return

    # title -> set of language codes that the dump links it to
    # (the page id in each row is read but not used)
    links = { }
    retuple = re.compile(r"\((\d*?),'(.*?)','(.*?)'\)")
    # dump escapes for quote, double quote and backslash; undone in one pass
    # so that an escaped backslash followed by a quote is handled correctly
    reescape = re.compile(r"\\(['\"\\])")

    print("reading file %s" % filename)

    # row / statement terminators; built with encode() so they are byte
    # strings, like the data read from the file, on both Python 2 and 3
    endstmt = "');".encode('ascii')
    endrow = "'),".encode('ascii')
    leftover = "".encode('ascii')

    f = gzip.open(filename, 'rb')
    try:
        while True:
            content = f.read(4096)
            if not content: break

            content = leftover + content
            # cut after the last complete row in the buffer; the cut follows
            # an ASCII ")" so it can never split a UTF-8 sequence
            i = max(content.rfind(endstmt), content.rfind(endrow))
            if i < 0:
                leftover = content
                continue # need to read some more
            leftover = content[i+3:]
            content = content[:i+2].decode('utf-8', 'ignore')

            for pid, lc, title in retuple.findall(content):
                # skip empty titles and anything with a namespace/prefix
                if not title or ':' in title: continue
                title = reescape.sub(r"\1", title)
                links.setdefault(title, set()).add(lc)
    finally:
        # close the dump even if reading or decoding fails
        f.close()

    print("read links for %d titles" % len(links))

    # now we have all the links, compare to union index

    Uix = shelve.open("union-index")
    try:
        for title in sorted(links):

            key = repr(title)
            if key in Uix: t, ul, ur = Uix[key]
            else: ul = ur = ''

            if redirs: ul += ur

            # add home to the dump's link set, then it should be identical to ul
            ll = links[title]
            ll.add(home)

            # if not redirs, links to redirects that are present are okay
            # (also no point in trying to read them in hunt)
            if not redirs and ur:
                for lc in ur: ll.discard(lc)

            if sorted(ll) != sorted(ul):

                print("    in LL, not in UNION: %s" % ([x for x in ll if x not in ul],))
                print("    in UNION, not in LL: %s" % ([x for x in ul if x not in ll],))

                lcs = set(ul)
                lcs.discard(home)

                yield title, lcs, ur

            else: print("(%s matches)" % repr(title))
    finally:
        # also runs when the consumer abandons the generator early
        Uix.close()


def main():
    """
    Update the interwiki links on the home wikt from a langlinks dump.

    Steps, in order:
      1. parse the command line (-langlinks:, -date:, -home:, -redir; see the
         module docstring); -date overrides -langlinks
      2. log in to the home wikt and read "List of Wiktionaries/Table" from
         meta to fill the shared Exists / Lcode / site / naps structures that
         hunt() (imported from iwiktrc) uses
      3. for each (title, codes, redirect codes) yielded by llfile(), fetch the
         page, compare its language links with the expected codes, call hunt()
         for links to add and codes that are missing, and save the page if the
         text changed

    Returns nothing; progress is printed to stdout.  Returns early, doing no
    edits, if fewer than 150 wikts are found in the table.
    """

    socket.setdefaulttimeout(40)

    home = 'en'
    langlinks = ''
    addredirs = False
    fdate = ''

    # one if/elif chain, so a recognized option is never also reported as unknown
    for arg in sys.argv[1:]:
        if arg.startswith('-langlinks:'):
            langlinks = arg[11:]
            if not langlinks.endswith("-langlinks.sql.gz") and '.' not in langlinks:
                langlinks += "-langlinks.sql.gz"
            print("reading langlinks file %s" % langlinks)
        elif arg.startswith('-date:'):
            fdate = arg[6:]
        elif arg.startswith('-home:'):
            home = arg[6:]
            print("home wikt is %s" % home)
        elif arg.startswith('-redir'):
            addredirs = True
            print("add links to redirects")
        else: print("unknown command line argument %s" % arg)
    if fdate:
        # built after the loop so that -home is honoured whatever the option order
        langlinks = "langlinks/" + home + "wiktionary-" + fdate + "-langlinks.sql.gz"
        print("reading langlinks file %s" % langlinks)

    mysite = wikipedia.getSite(home, 'wiktionary')
    # make sure we are logged in
    mysite.forceLogin()
    meta = wikipedia.getSite(code = "meta", fam = "meta")

    # get active wikt list, minus the codes we never link to
    Lstops = ['tokipona', 'tlh']

    page = wikipedia.Page(meta, "List of Wiktionaries/Table")
    existtab = page.get()

    # an entry in the table looks like:
    # | [[w:Vietnamese language|Vietnamese]]
    # | [[w:Vietnamese language|Tiếng Việt]]
    # | [http://vi.wiktionary.org/wiki/ vi]
    # group 1 is the English language name, group 2 the language code
    reextab = re.compile(r'^\| \[\[w:.*\|(.*)\]\]\n'
                         r'^\| .*\n'
                         r'^\| \[http://([a-z-]+)\.wiktionary\.org', re.M)
    for mo in reextab.finditer(existtab):
        if mo.group(2) in Lstops: continue
        Exists.add(mo.group(2))
        Lcode[mo.group(1)] = mo.group(2)
        # see if we have a login in user config, else pretend we do
        # has to be done before any call, or login status gets confused!
        if mo.group(2) not in usernames['wiktionary']:
            usernames['wiktionary'][mo.group(2)] = "Interwicket"

    print("found %d active wikts" % len(Exists))
    # NOTE(review): presumably a sanity check that the table parsed properly
    if len(Exists) < 150: return

    for lc in Exists:
        site[lc] = wikipedia.getSite(lc, "wiktionary")
        naps[lc] = 0 # nil, might be referenced by hunt()

    # pacing counters; the sleep that used them is disabled (see end of loop)
    naptime = 0
    maxnap = 70

    # now look for iwikis needed

    entries = 0
    probs = 0
    fixed = 0

    for title, lcs, urs in llfile(home = home, filename = langlinks, redirs = addredirs):

        # count every title handed over by llfile, so the summary is meaningful
        entries += 1

        if ':' in title: continue # redundant, llfile already skips these

        if title.lower() == 'main page': continue

        print("%s:%s" % (home, safe(title)))

        probs += 1
        naptime += 1

        # pick up current version from the home wikt

        try:
            page = wikipedia.Page(mysite, title)
            text = getwikitext(page)
            oldtext = text
        except wikipedia.NoPage:
            print("    ... %s not in %s.wikt" % (safe(page.title()), safe(home)))
            text = ''
        except wikipedia.IsRedirectPage:
            print("    ... redirect page")
            text = ''
        except KeyError:
            # annoying local error, from framework code
            print("KeyError")
            time.sleep(200)
            continue

        if not text: continue

        act = ''

        linksites = wikipedia.getLanguageLinks(text)
        ls = [s.lang for s in linksites]

        # list of iwikis in entry should match lcs, if not, we need to update
        if sorted(ls) == sorted(lcs):
            print("    ... is okay")
            continue

        # if not always adding redirs to this wikt, but some present, is ok
        if not addredirs:
            ok = True
            # need to remove something
            for s in ls:
                if s not in lcs and s not in urs: ok = False
            # need to add something
            for s in lcs:
                if s not in ls: ok = False
            if ok:
                print("    ... is okay (may have redirects)")
                continue

        # go hunt down some iwikis, add reciprocals when needed
        # always include en, pass all other lcs

        iwikis, missing = hunt(title, text, 'en', lcs = lcs, home = home, addredirs = addredirs)
        if iwikis:
            act = "iwiki +" + ", ".join(iwikis)
        else:
            print("    ... no new iwikis found")

        # links to remove: those hunt() reported missing, plus any self-link
        rms = [s for s in ls if s in missing]
        # guard against listing home twice, the second del below would raise
        if home in ls and home not in rms: rms.append(home)
        if rms:
            if act: act += " -"
            else: act = "iwiki -"
            act += ", ".join(sorted(rms))

        if not act: continue

        # add links, [don't remove unwanted redirects yet]
        for lc in iwikis:
            fpage = wikipedia.Page(site[lc], title)
            linksites[site[lc]] = fpage
        for lc in rms:
            del linksites[site[lc]]

        try:
            newtext = wikipedia.replaceLanguageLinks(text, linksites, site = mysite)
        except ValueError:
            # thrown when trying to "add to self", just skip the page
            print("    ... replace error in %s" % repr(page.aslink()))
            continue

        newtext = newtext.replace('\r\n', '\n') # framework may return CRLF line ends
        if newtext.rstrip(' \n') == text.rstrip(' \n'): continue # didn't change anything

        # some change, write it

        fixed += 1
        naptime //= 2

        print("    ... updating %s: %s" % (safe(title), safe(act).strip("'")))

        # try to fix the entry
        try:
            # re-read for edit and make sure nobody changed the page meanwhile
            utext = getedit(page)
            if utext != oldtext:
                print("page changed during attempted update")
                continue
            wikipedia.setAction(act)
            page.put(newtext)
            # no cache update [and "links" not set up]
        except wikipedia.EditConflict:
            print("Edit conflict?")
            continue
        except wikipedia.PageNotSaved:
            print("failed to save page")
            # other action?
            continue
        except wikipedia.NoPage:
            print("Can't get %s from en.wikt?" % safe(page.aslink()))
            continue
        except wikipedia.IsRedirectPage:
            print("Redirect page now?")
            continue
        except socket.timeout:
            print("socket timeout, maybe not saving page")
            continue
        except socket.error:
            print("socket error, maybe not saving page")
            continue
        except KeyError:
            # annoying local error, from framework code
            print("KeyError")
            time.sleep(200)
            continue

        # limit number of fixes for testing
        # if fixed > 7: break

        # pace [not used in the same way, reconsider]
        if naptime > maxnap: naptime = maxnap
        # disabled:
        # if naptime > 4:
        #     print "sleeping %d seconds" % naptime
        # time.sleep(naptime)

    print("%d entries, %d possible, %d updated" % (entries, probs, fixed))

    # done

    # done

if __name__ == "__main__":
    # Run the bot only when this file is executed as a script, not on import.
    try:
        main()
    finally:
        # Runs whether main() returns normally, returns early or raises.
        # NOTE(review): presumably tells the pywikipedia framework that this
        # bot has finished -- confirm against the framework documentation.
        wikipedia.stopme()