User:Interwicket/code/allpages

Note: to copy this code, edit this page and copy it from the edit window! Otherwise you will not get the HTML entities as HTML entities (for example, title.replace('&quot;', '"') renders as the no-op title.replace('"', '"')).

#!/usr/bin/python
# -*- coding: utf-8  -*-

# modified by RLU for iwikt: use the MW API, remove a lot of cruft, add maxlag handling and some more reliability

import re, codecs, sys
import urllib
import time

import wikipedia

# redirects may be None, True, or False (all different ;-): None is all pages,
# True is just redirects, False is just non-redirects.

reapt = re.compile('title ?="(.*?)"')
relagged = re.compile(r'<error.*"maxlag".* (\d+) seconds')
reapfrom = re.compile(r' apfrom="(.*?)"')
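
# For reference, the XML reply these regexes parse has roughly this shape
# (an illustrative sketch of the list=allpages reply of this API era, not
# captured output):
#
#   <api>
#     <query>
#       <allpages>
#         <p pageid="123" ns="0" title="foo" />
#         ...
#       </allpages>
#     </query>
#     <query-continue>
#       <allpages apfrom="next title" />
#     </query-continue>
#   </api>
#
# reapt collects the title="..." attributes, reapfrom finds the continuation
# point, and relagged matches the <error> element returned when the servers
# are more than maxlag seconds behind; " and & inside attribute values
# arrive as &quot; and &amp;.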

def allpages(site = wikipedia.getSite(), start = '!', namespace = '0', redirects = None):
        while True:
            # encode Non-ASCII characters in hexadecimal format (e.g. %F6)
            start = start.encode(site.encoding())
            start = urllib.quote(start)

            path = "/w/api.php?action=query&list=allpages&apfrom=" + start + \
                "&aplimit=480&format=xml&maxlag=2&namespace=" + namespace

            # redirects may be None, False, or True
            if redirects is None: pass
            elif redirects is True: path += "&apfilterredir=redirects"
            elif redirects is False: path += "&apfilterredir=nonredirects"
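
            # an illustrative request path as built above (with start 'Ka',
            # namespace '0', and redirects == True):
            #
            # /w/api.php?action=query&list=allpages&apfrom=Ka&aplimit=480&format=xml&maxlag=2&namespace=0&apfilterredir=redirects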

            print '(getting pages in %s from %s)' % (site.lang, start)

            # add retry logic, Robert Ullmann 25 Sept 07
            done = False
            nap = 5
            while not done:
                atext = site.getUrl(path)

                mo = relagged.search(atext)
                if mo:
                    print "(server lagged %s seconds)" % mo.group(1)
                    time.sleep(20)
                    continue

                if '</api>' in atext:
                    done = True
                else:
                    print "allpages: incomplete reply, sleeping %d seconds" % nap
                    time.sleep(nap)
                    nap = min(nap + nap/2, 300)
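                    # nap grows 5, 7, 10, 15, 22, 33, ... seconds (1.5x each
                    # retry), capped at 300, so a struggling server is not
                    # hammered with rapid retries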

            for title in reapt.findall(atext):
                # &quot; is an HTML entity in this field! Robert Ullmann, 20 January 2008
                # &amp; too! Robert Ullmann, 8 May 2008
                title = title.replace('&quot;', '"')
                title = title.replace('&amp;', '&')
                # others, but not sure we need this at all? Page fixes things.

                # suppress other namespace-like things here, or Page will generate
                # the "wrong" title (a colon in the title can be misread as a
                # namespace or interwiki prefix)
                if ':' in title: continue

                yield wikipedia.Page(site, title)

            # find continuation:
            mo = reapfrom.search(atext)
            if mo:
                start = mo.group(1)
                start = start.replace('&quot;', '"')
                start = start.replace('&amp;', '&')
                continue
            else:
                break # we are done, will raise StopIteration

# define a class so we can instantiate an iterable (the __iter__ method):

class allpagegen:
    def __init__(self, start ='!', namespace = '0', site = wikipedia.getSite(), redirects = None):
        self.start = start
        self.site = site
        self.redir = redirects
        self.namespace = namespace

    def __iter__(self):
        for page in allpages(site = self.site, start = self.start,
                           namespace = self.namespace, redirects = self.redir):
            yield page
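
# a sketch of typical use (the site, and process(), are illustrative here,
# not part of this module):
#
#   for page in allpagegen(site = wikipedia.getSite('fi', 'wiktionary')):
#       process(page)
#
# holding the parameters in an instance means iteration can be restarted:
# each call to __iter__ begins a fresh pass from self.start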

# simple unit test:

if __name__ == "__main__":

    print "testing allpages, 1000 redirects from Ka in en.wikt, print every 20:"

    s = wikipedia.getSite('en', 'wiktionary')
    kagen = allpagegen(site = s, redirects = True, start = 'Ka')

    i = 0
    for page in kagen:
        i += 1
        if i%20 == 0: print repr(page.title())
        if i >= 1000: break