# User:Tbot/code/createflw
#
# From Wiktionary, the free dictionary
# (page navigation residue: "Jump to navigation" / "Jump to search")



#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Tbot/code/createflw


"""
Create a simple foreign word entry in the en.wikt

Append a section if not already present

"""

import wikipedia
import catlib
import sys
import re
import pickle
import socket
from time import time, sleep
import shelve
from mwapi import getwikitext, getedit

from __main__ import cache, logpage, plock

def safe(s):
    """Return the default pickle of s minus its first and last five characters.

    Presumably this relies on the Python 2 text pickle layout (a one-character
    type code in front, a five-character memo/stop trailer behind) to obtain
    an escaped form of the string that is safe to print -- confirm before
    running under a different pickle protocol.
    """
    pickled = pickle.dumps(s)
    without_trailer = pickled[:-5]
    return without_trailer[1:]

def log(s):
    """Print the safe() form of s, without surrounding quotes, under plock."""
    quote_chars = "'" + '"'
    with plock:
        # single-argument parenthesised form prints exactly like the statement
        print(safe(s).strip(quote_chars))

# entries we've already seen that exist [not looking at sections yet]:
# NOTE(review): in the visible part of this file the set is only referenced
# from a commented-out line at the end of createFLentry.

Exists = set()

# some regex
# All patterns below are compiled once at import and applied to wikitext.

# these only catch default namespace names ...
# group 1 = text after "[[image:" / "[[media:" up to the first "|" or "]",
# matched case-insensitively
reimage = re.compile(r'\[\[image:(.*?)[\|\]]', re.I)
reaudio = re.compile(r'\[\[media:(.*?)[\|\]]', re.I)
# catch image by the |thumb| parameter ;-)
# group 1 = link target of a "[[target|...thumb|" or "[[target|...thumb]" link
rethumb = re.compile(r'\[\[(.*?)\|[^\]]*thumb[\|\]]')
# image by .jpg or .png:
# group 1 = file name (lower-case extension only), which must follow one of
# "|", "{", "[", "=" and be followed by "|", "]" or "}"; group 2 = extension
rejpg = re.compile(r'[\|\{\[=]([^\|\{\[=]*?\.(jpg|png))[\|\]}]')
# and perhaps an ogg file in a template, as in en.wikt?
# group 1 = file name ending in ".ogg", followed by "|" or "}"
reogg = re.compile(r'[\|\{\[=]([^\|\{\[=]*?\.ogg)[\|\}]')

# IPA string
# reIPA: after the text "IPA" on a line, group 1 = the first /.../ or [...]
# span that contains no braces, pipes or square brackets
reIPA = re.compile(r'IPA.*?([/\[][^\{\}\|\[\]]+?[/\]])')
# reIPAt: group 1 = first parameter of an {{IPA|...}} template
reIPAt = re.compile(r'\{\{IPA\|([^\{\}\|]+?)[\}\|]')

# fix glosses, context at start, (1) sense number at end should be removed, each should be subbed with space
# regloss1: leading ''(...)''   regloss2: leading (''...'')   regloss3: trailing (digits)
regloss1 = re.compile(r"^''\(.*?\)''")
regloss2 = re.compile(r"^\(''.*?''\)")
regloss3 = re.compile(r"\(\d+\)$")

# a whole {{jump|...}} template anywhere in the gloss
rejump = re.compile(r'\{\{jump\|[^}]*}}')

# need only do once on load
# Both site handles are created at import time and reused by every call to
# createFLentry.

site = wikipedia.getSite("en", "wiktionary")      # wiki whose pages createFLentry reads and saves
csite = wikipedia.getSite("commons", "commons")   # used to check that borrowed image/audio files exist

# Regex fragments that open a translation line on particular FL wiktionaries,
# keyed by language code. createFLentry falls back to a literal "*" at the
# start of the line for any code that is not listed here.
Tlist = {
    'ru': r'\s*\|en=',
    'uk': r'\s*\|en=',
    'nl': r':?\*',
    'sq': r'<br>\{\{en}}',
    'ga': r'\{\{aistr\|en',  # careful here, next char is | which must match \W
    'lt': r'\{\{env1}}',
    'yi': r'\|EN=',
    'tr': r':?\*\{\{en}}:',
    'mn': r':\*\{\{en}}:',
    'is': r'\|en=',
}

# Wikipedia-link templates as they are written on various FL wiktionaries,
# keyed by language code. createFLentry looks for the template verbatim and
# also for a one-argument form built by dropping the last two characters.
Wlist = {
    'cs': "{{Wikipedie}}",
    'de': "{{W}}",
    'fr': "{{WP}}",
    'ga': u"{{vicip\u00e9id}}",
    # one arg?
    # NOTE(review): single closing brace, unlike every other value here --
    # confirm whether that is intentional before changing it.
    'hu': "{{wp1}",
    'la': "{{vicipaedia}}",
    'lt': "{{vikipedija}}",
    'nl': "{{-info-}}",
    'pt': u"{{Wikip\u00e9dia}}",
    'sl': "{{W}}",
    'vi': "{{-info-}}",
}

# Names of the pronunciation templates that carry IPA on some FL wiktionaries,
# keyed by language code. Each value is spliced into a regex, so it is written
# as a pattern ("." stands in for letters with diacritics and the like).
Plist = {
    'de': "Lautschrift",
    'es': "[Pp]ronunciaci.n",
    'fr': "pron",
}

# File names that turn up in page furniture rather than in the entry itself
# (e.g. the first two on pt.wikt); such images are never borrowed.
Istops = set((
    'LuisdeCamoes4.jpg',
    'Os Lusadas.jpg',
    'Wikipedia.png',
))

# Number of createFLentry calls that got past the cache check; every 20th one
# triggers a cache.sync().
cis = 0

# A word that was tried less than this many seconds ago is not tried again.
_RECHECK_AFTER = 110 * 24 * 3600

# fix codes WMF hasn't yet (or has, but we still don't have set correctly :-)
# (no yue wikt as yet, hopefully will be created as yue, not zh-yue as in pedia)
_FL_SITE_CODES = {'nb': 'no', 'cmn': 'zh', 'nan': 'zh-min-nan'}

# (key in the attribute dict, name of the matching {{infl}} parameter)
_INFL_PARAMS = (('alt', 'head'), ('tra', 'tr'), ('g', 'g'),
                ('g2', 'g2'), ('g3', 'g3'), ('scr', 'sc'))


def _say(msg):
    """Print msg to the console while holding the shared print lock."""
    with plock:
        # a single parenthesised argument prints identically on Python 2
        print(msg)


def _on_commons(name, what):
    """Return True if the page Image:<name> can be fetched from commons.

    what ('image' or 'audio') is only used in the failure message.
    """
    cpage = wikipedia.Page(csite, "Image:" + name)
    try:
        getwikitext(cpage)
    except wikipedia.NoPage:
        _say("not found on commons")
        return False
    except Exception as e:
        # best effort: a failed lookup only means no media gets borrowed
        _say("other exception looking for commons %s: %r" % (what, e))
        return False
    _say("found on commons")
    return True


def _borrow_image(fltext, flw):
    """Return an [[Image:...]] line for the first usable image in fltext, or ''."""
    mo = reimage.search(fltext)
    if not mo and '|thumb|' in fltext:
        mo = rethumb.search(fltext)
    if not mo:
        mo = rejpg.search(fltext)
    if not mo:
        return ''
    img = mo.group(1)
    if ':' in img:
        img = img.split(':')[1]      # drop a namespace prefix
    if not img or img in Istops:
        return ''
    log("found image: %s" % img)
    if _on_commons(img, "image"):
        return '[[Image:%s|thumb|%s]]\n' % (img, flw)
    return ''


def _borrow_audio(fltext, flw, lc):
    """Return an {{audio}} line for the first usable audio file in fltext, or ''."""
    mo = reaudio.search(fltext) or reogg.search(fltext)
    if not mo:
        return ''
    aud = mo.group(1)
    if ':' in aud:
        aud = aud.split(':')[1]      # drop a namespace prefix
    # Only the first two characters are compared with the language code.
    # NOTE(review): a three-letter lc can therefore never match -- confirm
    # whether audio should be borrowed for such languages.
    if aud[0:2].lower() != lc:
        log("audio file name %s does not match language %s" % (aud, lc))
        return ''
    if not aud:
        return ''
    log("found audio: %s" % aud)
    if _on_commons(aud, "audio"):
        return '* {{audio|%s|%s}}\n' % (aud, flw)
    return ''


def _borrow_ipa(fltext, lc):
    """Return an {{IPA}} line if fltext holds exactly one distinct IPA string, else ''."""
    found = set(reIPA.findall(fltext))      # a set, so repeats don't bother us
    found.update(reIPAt.findall(fltext))
    if lc in Plist:
        rp = re.compile(r'\{\{' + Plist[lc] + r'\|(.*?)\}\}')
        found.update(rp.findall(fltext))
    if len(found) > 1:
        _say("more than one IPA?")
        return ''
    if not found:
        return ''
    i = found.pop().strip()
    core = i.strip('[] /')
    # keep square brackets if the source used them, otherwise use slashes
    if i.startswith('['):
        i = '[' + core + ']'
    elif i:
        i = '/' + core + '/'
    # empty or placeholder transcriptions are worthless
    if i in ('', '//', '[]', '/.../', '[...]'):
        return ''
    log("found IPA %s" % i)
    return "* {{IPA|%s|lang=%s}}\n" % (i, lc)


def _infl_params(mod):
    """Build the extra |name=value parameters for {{infl}} from the attribute dict."""
    parts = []
    for key, param in _INFL_PARAMS:
        value = mod.get(key)
        if value:
            parts.append('|' + param + '=' + value)
    return ''.join(parts)


def _clean_gloss(gloss, title):
    """Return the gloss tidied for use in a definition line, or '' to skip the entry.

    A leading context label, a trailing "(n)" sense number and any {{jump}}
    template are each replaced by a space before stripping. '' is returned
    (after logging why) when nothing is left or when the gloss contains the
    word "translation".
    """
    gwas = gloss
    gloss = gloss.strip()
    for pattern in (regloss1, regloss2, regloss3, rejump):
        gloss = pattern.sub(' ', gloss)
    gloss = gloss.strip()
    if not gloss:
        log("nothing left to gloss ...")
        return ''
    # decap gloss (some people insist on capitalizing it, which is wrong) and
    # fix, this is almost always right:
    for word in ('Of ', 'Country ', 'Person '):
        if gloss.startswith(word):
            gloss = word.lower() + gloss[len(word):]
    gl = gloss.lower()
    if "translation" in gl:
        log("word 'translation' in gloss, skipped")
        return ''
    if gl[1:] != gloss[1:]:
        gl = gloss       # caps in string after first, so probably okay
    if gloss.startswith(title):
        gl = gloss       # Proper noun, e.g. "French language"
    if gl != gwas:
        log("gloss changed %s -> %s" % (gwas, gl))
    return gl


def createFLentry(flw, lang, lc, pos, title, gloss, mod):
    """Create a simple entry for a foreign word on en.wikt, or append a section.

    The entry is only written when the word's own-language wiktionary has a
    page for it which lists `title` on a line that looks like a translation
    line, and when the local page has no section for `lang` yet.

    Parameters:
        flw   -- the foreign word (title of the page to create or extend)
        lang  -- language name, used as the section header
        lc    -- language code, used for the FL wiktionary and in templates
        pos   -- part-of-speech header, e.g. "Noun"
        title -- the English entry the translation was found in
        gloss -- gloss of the translated sense; cleaned up before use
        mod   -- attribute dict; the keys read here are 'alt', 'tra', 'g',
                 'g2', 'g3' and 'scr'

    Returns True or False. Going by the comments on the individual return
    statements, True tells the caller that the translation may be converted
    to a t+ template (the FL page exists, or the local section is there or
    has just been saved); False is returned when the word is still in the
    110-day cache, when the FL page is missing or could not be read, and
    when saving failed.

    Exceptions raised while reading the local page, other than NoPage and
    IsRedirectPage, are not handled here and reach the caller.
    """
    global cis

    # for now, don't add to the same page (would cause edit conflict anyway?)
    # doesn't matter because not called with title == flw and return value
    # used (see tbot.py)
    if flw == title:
        return True

    # check cache
    # records last time we tried this word, don't try again for 110 days
    # may need to disable sometimes for debugging!
    ckey = lc + ':' + flw
    if ckey in cache:
        last = cache[ckey]
        if last > time() - _RECHECK_AFTER:
            return False
    cache[ckey] = time()    # assume we will complete check now ...
    cis += 1
    if cis % 20 == 0:
        cache.sync()

    log("createFL %s: %s[%s] %s, %s (%s)" % (flw, lang, lc, pos, title, gloss))

    # get the FL.wikt page
    zlc = _FL_SITE_CODES.get(lc, lc)
    try:
        flsite = wikipedia.getSite(zlc, "wiktionary")
        flpage = wikipedia.Page(flsite, flw)
        fltext = getwikitext(flpage)
    except wikipedia.NoPage:
        _say("page not in FL wikt")
        return False
    except wikipedia.IsRedirectPage:
        _say("FL wikt entry is a redirect")
        return True  # can change to t+
    except KeyboardInterrupt:
        raise        # bare raise keeps the original traceback
    except Exception as e:
        _say("some exception getting page from FL wikt: %r" % (e,))
        return False
    if not fltext:
        _say("page not in FL wikt")
        return False
    _say("FL page exists ...")

    # see if English word in FL page, presumably as a translation
    if title not in fltext:
        _say("FL wikt page does not contain title")
        return True  # we want to insert t+ template, even though not adding entry

    # nl.wikt uses ":*", will be other variations,
    # ru.wikt uses |en= ... etc etc:
    tpre = Tlist.get(lc, r'\*')
    retrans = re.compile(r'^' + tpre + r'.*\W' + re.escape(title) + r'(\W|$)', re.M)

    # look for a line that may be a trans line, with title surrounded by
    # non-word characters
    mo = retrans.search(fltext)
    if not mo:
        _say("title not in translation line?")
        logpage.add("[[%s]] entry [[:%s:%s]] exists, pattern not matched" % (title, lc, flw))
        return True  # we want to insert t+ template, even though not adding entry

    # truncate fltext at that line, so we don't get extra stuff from following
    # sections; cutting at the match position (rather than searching for the
    # matched text again) cannot land on an earlier mid-line occurrence
    fltextall = fltext
    fltext = fltext[:mo.start()]

    # now reconfirm local existence and section absent, get text
    seealso = ''
    addc = 'created %s entry ' % lang
    try:
        log("getting local page %s" % flw)
        page = wikipedia.Page(site, flw)
        text = getedit(page)
        # check language section ...
        if re.search(r'^==\s*\[*' + re.escape(lang) + r'\]*\s*==', text, re.M):
            log("page %s and section %s already exists" % (flw, lang))
            return True  # meaning there is a page and section there now, so convert to t+

        # special case until Norwegian and Nynorsk are sorted out ...
        if lang == "Norwegian" and '==Norwegian ' in text:
            log("page %s and some Norwegian section already exists" % flw)
            return True  # meaning there is a page and section there now, so convert to t+

        # another temporary special case, for Serbo-Croatian sections ...
        if lang in ['Croatian', 'Bosnian', 'Serbian'] and '==Serbo-' in text:
            log("page %s and some Serbo- section already exists" % flw)
            return True  # meaning there is a page and section there now, so convert to t+

        addc = 'added %s section ' % lang
    except wikipedia.NoPage:
        # usual case when entry is new
        text = ''
    except wikipedia.IsRedirectPage:
        # overwrite a redirect if present
        text = ''
        addc = 'replaced redirect with %s entry ' % lang
        seealso = page.getRedirectTarget()
        # limit to case redirects, simple case for now (so we don't "fix" Hebrew)
        if flw.lower() != seealso.lower():
            log("page %s is a redirect to %s, not replaced" % (flw, seealso))
            return True

    # see if we can "borrow" image, audio or IPA from the FL entry
    image = _borrow_image(fltext, flw)
    audio = _borrow_audio(fltext, flw, lc)
    ipa = _borrow_ipa(fltext, lc)

    if audio or ipa:
        pron = '\n===Pronunciation===\n' + ipa + audio
    else:
        pron = ''

    # 'pedia link? look at all original text; often follow trans table
    wplink = ''
    wp_marks = ["{{wikipedia}}", "{{wikipedia|" + flw + '}}']
    if lc in Wlist:
        wp_marks.append(Wlist[lc])
        wp_marks.append(Wlist[lc][:-2] + '|' + flw + '}}')
    if any(mark in fltextall for mark in wp_marks):
        wplink = '{{wikipedia|lang=%s}}\n' % lc
        _say("added wikipedia link")

    # set up additional infl params from attribute dict:
    aip = _infl_params(mod)

    gl = _clean_gloss(gloss, title)
    if not gl:
        return True   # as FL wikt page does exist

    # change Proper noun to Noun if lower case; usually the right answer
    if pos == "Proper noun" and flw[0:1].islower():
        log("changed Proper noun to Noun")
        pos = "Noun"

    # add to or create entry text:
    if text:
        text += '\n\n----\n{{rfc-auto|sort languages}}\n'
    text += """==%s==
%s%s%s
===%s===
{{infl|%s|%s%s}}

# [[%s]] (%s)

{{tbot entry|%s|%s|{{subst:CURRENTYEAR}}|{{subst:CURRENTMONTHNAME}}|%s}}

""" % (lang, wplink, image, pron, pos, lc, pos.lower(), aip, title, gl, lang, title, lc)

    # other special things (no reason not to ;-)
    if lc == 'fr' and pos == 'Verb':
        text = text.replace("{{tbot", "{{rfinfl|type=conjugation|lang=fr}}\n{{tbot")
    # [ others as desired ]

    # add interwiki, let AutoFormat and Interwicket sort things as needed
    iw = '[[%s:%s]]' % (lc, flw)
    if iw not in text:
        text += iw + '\n'

    # if overwriting redirect, add see
    if seealso:
        text = '{{also|' + seealso + '}}\n' + text

    try:
        # the save runs under the print lock, as in the original
        with plock:
            page.put(text, comment=addc + "from translation at [[%s]] and [[:%s:%s]]" %
                     (title, lc, flw), minorEdit=False)
    except wikipedia.PageNotSaved:
        _say("failed to save page")
        return False
    except socket.timeout:
        _say("socket timeout, maybe not saving page")
        return False
    except socket.error:
        _say("socket error, maybe not saving page")
        return False
    except Exception as e:
        _say("some exception saving page " + repr(e))
        return False

    # Exists.add(flw)
    return True