User:Interwicket/code/mwapi

#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Interwicket/code/mwapi


"""
functions that use the MW API to replace the wikipedia.py browser-client functions

getwikitext(page)               -- get the text of the page, like get
getedit(page)                   -- get the text of the page and an edit token
                                   page can then be saved with wikipedia.Page.put
putedit(page, text, comment)    -- save the page; will never create or recreate, edit only!
                                   works only if getedit() was used, not the framework get

readapi(site, request)          -- read from or post to api, with compression and maxlag built in

optional parameter plock to use as a lock around anything printed

this version uses persistent HTTP connections

"""

import wikipedia
import re
import time
from threading import currentThread, Lock
plockd = Lock() # default plock

import urllib, httplib
from StringIO import StringIO
from gzip import GzipFile

# connection pool
# implemented as a queue, so we can share between threads
# no particular limit, effectively limited by number of threads in program

import Queue
pool = Queue.Queue()
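# readapi() pops a connection with get_nowait(), opens a new one if the pool is
# empty, and puts the connection back only when the HTTP exchange itself succeeded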

# since we aren't using the framework 'throttle', do something better
# this is a "tick-tock" timer, shared on all threads
# clicked down each success, up on each network failure of any type

ticktocklock = Lock()
ticktock = 1.0
def getticktock(): 
    global ticktock
    return ticktock
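
# ticktock is decayed by 5% and slept before each readapi() request, and bumped
# up by 1.0 for each retry within it, clamped to the range 0.1 - 20.0 seconds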

relagged = re.compile(r'<error.*"maxlag".* (\d+) seconds')

def readapi(site, request, plock = Lock(), sysop = False, mode = "GET", par = None):
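    """Read from (GET) or post to (POST) the site's api.php, with gzip
    compression and maxlag handling built in; retries network failures and
    server lag, and returns the response body as a unicode string."""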
    global ticktocklock, ticktock

    url = "http://" + site.hostname() + "/w/api.php?" + request

    done = False
    nap = 5
    maxl = 5
    maxlag = "&maxlag=%d" % maxl

    with ticktocklock:
        ticktock *= 0.95  # decay 5% per call: -0.25 at 5 seconds, -1.0 at 20 seconds
        ticktock = max(ticktock, 0.1)
        ticktock = min(ticktock, 20.0)
        if ticktock >= 10.0:
            with plock: print "(mwapi readapi: tick tock is %.1f)" % ticktock
        time.sleep(ticktock)
        ticktock -= 1.0   # undo first increment in loop

    while not done:
        ticktock += 1.0   # done w/o lock, race condition is rare, not a serious problem, ignored!

        # get a connection from pool
        try:
            conn = pool.get_nowait()
        except Queue.Empty:
            conn = None

        try:
            if not conn:
                # with plock: print "(opening connection to %s)" % site.hostname()
                conn = httplib.HTTPConnection(site.hostname())

            # either get or post
            headers = { 'Cookie': site.cookies(sysop = sysop) or '',
                        'Accept-Encoding': 'gzip',
                        'User-Agent': 'Interwicket/1.0' }
            if mode == "POST":
                headers['Content-Type'] = 'application/x-www-form-urlencoded'
            conn.request(mode, url + maxlag, par, headers)
            resp = conn.getresponse()

            text = resp.read()
            if 'gzip' in resp.getheader('Content-Encoding', ''):
                text = GzipFile(fileobj=StringIO(text)).read()
            text = unicode(text, 'UTF-8', errors = 'ignore')
            done = True
        except Exception, e:
            # work around net problem 5.6.10, quietly ignore the first 3 such failures
            # this is to deal with the atrocious behavior of Iconnect Kenya
            # which is capable of forcing requests through their proxy and then
            # killing 80+% with blank status ('BadStatusLine'), and connections closed

            repre = repr(e)
            if nap < 15 and ('10060' in repre or '10054' in repre or 'BadStatusLine' in repre
                                or 'timeout' in repre or 'gaierror' in repre):
                conn.close()
                conn = None
                time.sleep(nap)
                nap = min(nap + nap/2, 300)
                ticktock -= 0.95   # undo most of increment for this failure
                continue # quietly

            with plock:
                print "(%s: exception reading API: %s)" % (currentThread().name, repr(e))
            text = ''
            conn.close()
            conn = None
            time.sleep(nap)
            nap = min(nap + nap/2, 300)
            continue

        if '<api' not in text and 'NdxICC' in text:
            with plock: print "(mwapi readapi: bad reply from box)"
            # (not) silently ignore bad return from Nomadix box
            conn.close()
            conn = None
            time.sleep(5)
            done = False
            continue

        mo = relagged.search(text)
        if mo:
            replag = int(mo.group(1))
            with plock: print "(%s: server lagged %s seconds)" % (currentThread().name, replag)
            # allow more lag the next time
            maxl += max(maxl/4, replag/20)
            maxlag = "&maxlag=%d" % maxl
            # make some progress even when server crocked ...
            if maxl > 600: maxlag = ""
            if maxlag and maxl > 60:
                with plock: print "(mwapi readapi: next with %s)" % maxlag
            # sleep replag if not more than 70
            time.sleep(min(replag, 70))
            done = False
            pool.put(conn) # should still be good
            conn = None
            continue

    # if we still have the connection without failure, return it to pool
    if conn: pool.put(conn)

    return text

def ts(t): return time.strptime(t, '%Y-%m-%dT%H:%M:%SZ')   # parse an API ISO-8601 timestamp

# regexes to pull attributes out of the API XML replies
retok = re.compile(r' edittoken="(.*?)"')
restartime = re.compile(r' starttimestamp="(.*?)"')
retimestamp = re.compile(r' timestamp="(.*?)"')
rerevid = re.compile(r' revid="(.*?)"')

def getwikitext(page, revid = None, plock = plockd):
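    """Fetch the wikitext of page (a specific revision if revid is given) via the
    API, cache it on page._contents, and return it; raises wikipedia.NoPage for
    missing or invalid titles, wikipedia.IsRedirectPage for redirects."""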

    site = page._site

    if hasattr(page, '_contents'):
        if revid:
            if hasattr(page, '_revisionid') and revid == page._revisionid: return page._contents
        else: return page._contents
    # else we need to get page

    done = False
    nap = 5
    while not done:
        # if revid, get a specific revision
        if revid: rs = "&rvstartid=%s&rvlimit=1" % revid
        else: rs = ''
        # throw various exceptions to caller
        rawt = readapi(site, "action=query&prop=revisions|info&rvprop=content|ids&format=xml"
                     + rs + "&titles=" + page.urlname(), plock = plock)

        i1 = rawt.find("<rev ")
        if i1 > 0:
            i1a = rawt[i1:].find('>')
            if i1a > 0: i1 += i1a + 1
            else: i1 = -1 # something bad ...
        i2 = rawt.find("</rev")
        if i1 < 0 or i2 < i1:
            # deleted/does not exist? bad title, no API return
            if 'missing=""' in rawt: raise wikipedia.NoPage
            if 'invalid=""' in rawt: raise wikipedia.NoPage
            if '<api />' in rawt: raise wikipedia.NoPage
            # else
            with plock: print "(mwapi: no text found, sleeping %d seconds)" % nap
            # print repr(rawt)
            time.sleep(nap)
            nap = min(nap + nap/2, 300)
        else: done = True

    text = rawt[i1:i2]
    text = wikipedia.unescape(text)

    mo = rerevid.search(rawt)
    if mo:
        # print "mwapi (debug): revision id from getwikitext", mo.group(1)
        revid = mo.group(1)
    else:
        revid = ''

    # for us
    page._revisionid = revid

    # did we get redirect?
    if 'redirect=""' in rawt[:i1]: raise wikipedia.IsRedirectPage # and do not set _contents

    # tell wikipedia put etc that we have the contents (else it does *another* get!)
    page._contents = text

    return text

def getedit(page, sysop = False, plock = plockd):
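    """Fetch the page text along with an edit token (stored on the site object)
    and the base/start timestamps that putedit() needs; raises
    wikipedia.UserBlocked if the wiki says editing is not allowed."""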

    site = page._site

    done = False
    nap = 5
    notk = 0
    while not done:
        # throw various exceptions to caller
        rawt = readapi(site, "action=query&prop=info|revisions&intoken=edit&format=xml" +
                     "&titles=" + page.urlname(), sysop = sysop, plock = plock)
    
        # wiki locked; or possibly user blocked? we don't have enough information
        # this is message from locked wiki
        if ">Action 'edit' is not allowed for the current user</info>" in rawt:
            raise wikipedia.UserBlocked

        mo = retok.search(rawt)
        if mo:
            # token is stored in the site (!) silly, I thought it was an *edit* token
            site.putToken(mo.group(1), sysop = sysop)
            done = True
        else:
            notk += 1
            if notk > 20: raise wikipedia.ServerError   # give up eventually!
            with plock:
                print repr(rawt) # probably temporary?
                print "mwapi: no token received trying to edit %s" % repr(page.aslink())
                print "mwapi: sleeping %d seconds" % nap
            time.sleep(nap)
            nap = min(nap + nap/2, 300)

    mo = retimestamp.search(rawt)
    if mo:
        # print "mwapi (debug): timestamp", mo.group(1)
        page._editTime = time.strftime('%Y%m%d%H%M%S', ts(mo.group(1)))
        # and without reformatting, for our putedit:
        page._basetimestamp = mo.group(1)
    else:
        page._editTime = time.strftime('%Y%m%d%H%M%S', time.gmtime())

    mo = restartime.search(rawt)
    if mo:
        # print "mwapi (debug): starttimestamp", mo.group(1)
        page._startTime = time.strftime('%Y%m%d%H%M%S', ts(mo.group(1)))
    else:
        page._startTime = time.strftime('%Y%m%d%H%M%S', time.gmtime())

    mo = rerevid.search(rawt)
    if mo:
        revid = mo.group(1)
    else:
        revid = ''
        if hasattr(page, "_contents"): del page._contents # !

    # print "mwapi (debug): start %s, edit %s, revid %s, token %s" % (page._startTime, page._editTime,
    #      revid, site.getToken())

    return getwikitext(page, revid = revid, plock = plock)

def putedit(page, text, comment = '', sysop = False, plock = plockd):
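    """Save text to page with an edit-only API call (bot, minor, nocreate set),
    never creating or recreating a page; needs a prior getedit() for the token
    and basetimestamp; raises NoPage or LockedPage on the matching API errors."""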

    site = page._site

    done = False
    nap = 5

    while not done:

        token = site.getToken(sysop = sysop)

        # throw various exceptions to caller

        # parameters in order, token last, to make sure text is complete (!)
        par = urllib.urlencode([ ('text', text.encode("UTF-8")),
                                 ('title', page.title().encode("UTF-8")),
                                 ('summary', comment.encode("UTF-8")),
                                 ('basetimestamp', page._basetimestamp),
                                 ('token', token) ])

        rawt = readapi(site, "action=edit&format=xml&bot=1&minor=1&nocreate=1",
                     mode = "POST", par = par, sysop = sysop, plock = plock)

        if 'result="Success"' in rawt:
            done = True
            break

        # various errors [?]
        if 'code="missingtitle"' in rawt:
            if hasattr(page, "_contents"): del page._contents
            raise wikipedia.NoPage
        if 'code="pagedeleted"' in rawt:
            if hasattr(page, "_contents"): del page._contents
            raise wikipedia.NoPage
        if 'code="protectedpage"' in rawt:
            raise wikipedia.LockedPage

        with plock: print "(mwapi putedit error: %s, page %s)" % (repr(rawt[:300]), repr(page.aslink()))

        time.sleep(nap)
        nap = min(nap + nap/2, 300)
        if nap == 300: break # can't go on forever [?]



if __name__ == "__main__":

    print "mwapi tests"

    site = wikipedia.getSite('en', 'wiktionary')

    print "present page foo"
    page = wikipedia.Page(site, 'foo')

    t = getwikitext(page)
    print repr(t)

    print "missing page"
    page = wikipedia.Page(site, 'foo xxx2')

    try:
        t = getwikitext(page)
        print repr(t)
    except Exception, e:
        print "exception", repr(e)

    """
    print "redirect page"
    page = wikipedia.Page(site, 'html')

    try:
        t = getwikitext(page)
        print repr(t)
    except Exception, e:
        print "exception", repr(e)


    print "recent changes"

    try:
        rct = readapi(site,
                     "action=query&list=recentchanges&format=xml&rcprop=title|user" +
                     "&rctype=new&rcnamespace=0&rclimit=10",
                     sysop = True)
        print repr(rct)
    except Exception, e:
        print "exception", repr(e)
    """

    site = wikipedia.getSite('sw', 'wiktionary')

    print "present page cat on sw.wikt"
    page = wikipedia.Page(site, 'cat')

    t = getwikitext(page)
    print repr(t)

    site = wikipedia.getSite('en', 'wiktionary')

    print "try updating page on en.wikt"

    page = wikipedia.Page(site, 'User:Robert Ullmann/t1')

    text = getedit(page)
    text += "\n\nand some more text"
    putedit(page, text, "add some more")  

    print "anna two ..."

    text = getedit(page)
    text += "\n\nand 2 text"
    putedit(page, text, "add 2")   

    print "edit missing page"
    page = wikipedia.Page(site, 'foo xxx2')

    try:
        t = getedit(page)
        print repr(t)
    except Exception, e:
        print "exception", repr(e)

    page._basetimestamp = '0'

    print "... saving"

    try:
        t = putedit(page, "foo")
        print repr(t)
    except Exception, e:
        print "exception", repr(e)
 
    print "done"