User:Robert Ullmann/Prologue/code

#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Robert Ullmann/Prologue/code


"""
Generates prologue (section 0) examples

"""

import wikipedia
import sys
import re
import socket
import urllib
from iwiktmwapi import getwikitext, getedit, putedit, readapi

def srep(s):
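    # ASCII-safe form of a unicode string for printing to the console;
    # e.g. srep(u'caf\xe9') -> 'caf\\xe9'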
    return repr(u''+s)[2:-1]

def lkey(l):
    # language sort key

    n = l.strip('[]')
    if not n: return n

    if n == 'Translingual': return '3' + n
    # at end for now

    if n == 'English': return '1' + n

    # handle names like !Kung and 'Auhelawa: move (one) non-alpha to the end of key

    if not n[0:1].isalpha(): n = n[1:] + n[0:1]

    return '2' + n
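
# e.g. lkey('English')      -> '1English'      (sorts first)
#      lkey('!Kung')        -> '2Kung!'        (leading non-alpha moved to the end)
#      lkey('Translingual') -> '3Translingual' (sorts last)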


relink = re.compile(r'\[\[(.+?)\]\]')
reh2 =   re.compile(r'==([^=]+)==')
rehead = re.compile(r'=+([^=]+)=+')
# match language "tags" on defn lines
retag =  re.compile(r"[(']+([^)]+)[)']+ (.*)")
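# e.g. retag.match("(Croatian, Serbian) house").groups() -> ('Croatian, Serbian', 'house')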
rexpand = re.compile(r'<expandtemplates.*?>(.*)</expandtemplates>', re.S)

# find a context span in expanded text, at start of definition:
# [ this relies on context working exactly one way ... might be improved]

respan = re.compile(r'<span class="ib-brac"><span class="qualifier-brac">'
               r'\(</span></span><span class="ib-content"><span class="qualifier-content">'
               r'(.*?)'
               r'</span></span><span class="ib-brac"><span class="qualifier-brac">\)</span></span>'
               r'(.*)')

# example:
# <span class="ib-brac"><span class="qualifier-brac">(</span></span><span class="ib-content"><span class="qualifier-content">[[nautical]][[Category:Nautical]]</span></span><span class="ib-brac"><span class="qualifier-brac">)</span></span> A strong tackle used to hoist an anchor to the [[cathead]] of a ship.

recomma = re.compile(r'<span class="ib-comma"><span class="qualifier-comma">,</span></span>')

def expand(text, title = ''):

    site = wikipedia.getSite("en", "wiktionary")

    # call expand templates:

    # parameters (and do a POST op)
    par = urllib.urlencode([ ('text', text.encode("UTF-8")),
                             ('title', title.encode("UTF-8")) ])

    rawt = readapi(site, "action=expandtemplates&format=xml", mode = "POST", par = par)

    mo = rexpand.search(rawt)
    if not mo:
        print "    can't expand templates?"
        print repr(rawt)
        return ''

    return wikipedia.unescape(mo.group(1))
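
# e.g. expand(u'{{nautical}} A strong tackle ...', u'cat') would return the expanded
# text with qualifier spans like the respan example above (template name illustrative)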

recat = re.compile(r'\[\[\s*[Cc]ategory\s*:.*?\]\]')
recattag = re.compile(r'<!--XCAT-->\s*<!--XCAT-->', re.S)

def decat(t):
    """
    remove categories from text. not simple as the general parser is complex

    a cat at the end of the line should be removed without removing the line break,
    but line breaks and even blank lines in between cats should be removed
    and blank lines after (or before?) cats should be removed if multiple, but we
    don't handle that case.
    """

    # replace all cats with uniform tag
    # tag is an HTML comment so if it did by chance occur in the wikitext it would be gone anyway
    tot = recat.sub('<!--XCAT-->', t)

    # now replace any spans around whitespace with singlets
    k = 1
    while k: tot, k = recattag.subn('<!--XCAT-->', tot)

    # and remove tags
    tot = tot.replace('<!--XCAT-->', '')

    return tot
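
# e.g. decat('A ship.[[Category:Nautical]]\n[[Category:Ships]]\nNext line')
#   -> 'A ship.\nNext line' (both cats removed, the trailing line break kept)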

def main():

    socket.setdefaulttimeout(70)

    # read list of the pages we should set up as examples

    site = wikipedia.getSite("en", "wiktionary")
    site.forceLogin()

    page = wikipedia.Page(site, "User:Robert Ullmann/Prologue/feedme")
    feed = getwikitext(page)

    # test:
    # feed = '[[bog]] [[cat]] [[prolog]] [[mama]]'
    # feed = '[[' + sys.argv[1] + ']]'

    for title in relink.findall(feed):

        print "%s:" % srep(title)

        try:
            page = wikipedia.Page(site, title)
            text = getwikitext(page)
        except Exception, e:
            print "    exception getting page", repr(e)
            text = ''
            continue

        # now find language sections, POS, defs
        # lang is language, pos is last header (which may very well not be a POS)
        lang = ''
        pos = ''

        # defs is dict of lang to list of (POS, def) tuples
        defs = { }
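        # e.g. defs['English'] = [ ('noun', 'A strong tackle used to hoist ...') ]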

        for line in text.splitlines():

            mo = reh2.match(line)
            if mo:
                lang = mo.group(1)
                pos = ''
                continue

            if not lang: continue

            mo = rehead.match(line)
            if mo:
                pos = mo.group(1)
                continue

            if line[:2] != '# ': continue

            # skip {defn} and {defn-form}
            if '{{defn' in line: continue

            # def line, add into list
            if lang not in defs: defs[lang] = [ ]
            defs[lang].append( ( pos.lower(), line[2:]) )

            # (that takes care of extracting the basic info)

        # print repr(defs)

        # now we have to reprocess the "Serbo-Croatian" drek:
        # the following is an approximation; doing it "correctly" is not possible,
        # as the forced merger discards information and the format is not tractable

        if "Serbo-Croatian" in defs:

           if "Croatian" in defs or "Serbian" in defs or "Bosnian" in defs \
                         or "Montenegrin" in defs:
               pass
               # use standard language entries

           else:
               dlist = defs["Serbo-Croatian"]
               for lang in [ "Serbian", "Croatian", "Bosnian", "Montegrin" ]:
                   defs[lang] = [ ]

               for pos, defn in dlist:
                   # look for tags
                   mo = retag.match(defn)
                   if mo and ("Croatian" in mo.group(1) or "Serbian" in mo.group(1) \
                         or "Bosnian" in mo.group(1) or "Montenegrin" in mo.group(1)):
                       # add remainder of def to each language tagged:
                       for lang in [ "Serbian", "Croatian", "Bosnian", "Montegrin" ]:
                           if lang in mo.group(1): defs[lang].append( (pos, mo.group(2)) )
                   else:
                       # use default on correct script
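                       # (U+0400..U+052F covers Cyrillic and Cyrillic Supplement)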
                       if ord(title[0]) >= 0x0400 and ord(title[0]) < 0x0530:
                           defs["Serbian"].append( (pos, defn) )
                       else:
                           defs["Croatian"].append( (pos, defn) )

               # now drop blanks
               for lang in [ "Serbian", "Croatian", "Bosnian", "Montegrin" ]:
                   if not defs[lang]: del defs[lang]

           del defs["Serbo-Croatian"]
           # done with crap

        # consolidate defs ...
        # keeping order is the trick
        # use four lists, generate in parallel
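        # e.g. a def shared verbatim by Serbian and Croatian appears only once,
        # with langs[i] == 'Serbian, Croatian'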

        langs = [ ]
        poss = [ ]
        defns = [ ]
        ctxs = [ ]

        for lang in sorted(defs, key=lkey):
           dlist = defs[lang]
           for pos, defn in dlist:

               if pos == "han character": pos = "Han character" # fix, should be cap

               # do a number of things to clean up defn

               # [remove defdate, ref tags, etc, etc]

               ctx = ''
               if defn.startswith('{{'):
                   # try finding a context ...
                   exp = expand(defn, title)
                   print "(expand def)"
                   mo = respan.match(exp)
                   if mo:
                       # normalize non-breaking spaces (from &nbsp; in template output)
                       ctx = decat(recomma.sub(',', mo.group(1))).replace(u'\xa0', ' ')
                       defn = mo.group(2).lstrip()
                       print "matched context"
                   # [at some point might expand the whole entry first? or not bother for examples]

               # seen already? [need some fuzziness in match!]
               i = 0
               while i < len(defns):
                   if pos == poss[i] and defn == defns[i] and ctx == ctxs[i]: break
                   i += 1
               if i >= len(defns):
                   langs.append(lang)
                   poss.append(pos)
                   defns.append(defn)
                   ctxs.append(ctx)
               else:
                   langs[i] += ', ' + lang
       
        # (re)generate prologue:
        # easier because we are working from NS:0 entries, which don't have the
        # prologue in them, and we don't need to reprocess defs as much for examples;
        # harder because we need to expand and kill cats

        newtext = '{{tocright}}\n'

        # first copy the existing stuff (also template, whatever)
        for line in text.splitlines():
           if line[:2] == '==': break
           newtext += line + '\n'

        # generate def lines
        # also need to handle several and many languages, and so on
        # section link languages?
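        # e.g. an output line: # (''Serbian, Croatian, noun'') [[house]]
        # (English gets no language label; any context follows the POS)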

        for i in range(0, len(defns)):
           ln = langs[i]
           if ln == 'English': ln = ''
           else: ln += ', '
           ctx = ctxs[i]
           if ctx: ctx = ', ' + ctx
           newtext += "# (''" + ln + poss[i] + ctx + "'') " + defns[i] + '\n'
           print "    # " + '(' + ln + poss[i] + ctx + ') ' + repr(defns[i])

        # append the rest of the entry: (all after first header)

        text = '\n' + text
        newtext += text[text.find('\n=='):]

        # almost there ...

        newtext = expand(newtext, title)
        
        # kill cats

        newtext = decat(newtext)

        # and write new page

        try:
            xpage = wikipedia.Page(site, "User:Robert Ullmann/Prologue/examples/" + title)
            # otext = getedit(xpage)
            otext = xpage.get() # raises wikipedia.NoPage if the page doesn't exist yet
            # putedit(xpage, newtext, comment = "write example")
            xpage.put(newtext, comment = "write example")
        except wikipedia.NoPage:
            xpage.put(newtext, comment = "write example") # write initial version w/framework
        except Exception, e:
            print "    exception getting/writing example page", repr(e)

        # finished with page loop

    # done

if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()