# Source: "User:Visviva/transclusion.py" from en.wiktionary.org.
# (Wiki page chrome -- "Jump to navigation" / "Jump to search" -- removed;
# it was copy-paste residue, not part of the program.)
import re
import re
import xmlreader
import urllib
import sys
from htmlentitydefs import name2codepoint
# ---- module-level configuration ----
magicwords=[]  # reserved for MediaWiki magic words; currently unused
# Parser functions that rinse() knows how to evaluate.
parserfunctions=set(["#if","#ifeq","#switch","#expr","#ifexist","lc","ucfirst","ns"])
pipescape="(\[\[[^\]\{\}]*?(\{\{.*?\}\})*?[^\]\{\}]*?)\|([^\[\{\}]*?(\{\{.*?\}\})*?[^\[\{\}]*?\]\])" #regex for pipes in brackets
# Cached list of English entry titles, one per line; used by parse_list().
entitlesfile="D:\Code\\en_titles.txt"
if __name__ == '__main__':
    # NOTE(review): this guard sits *above* the function definitions, so it
    # cannot call parse_list() directly (NameError).  Importing the file as
    # the "transclusion" module executes the whole file a second time with
    # __name__ != '__main__', after which transclusion.parse_list exists.
    import transclusion
    transclusion.parse_list(sys.argv[1]) #will create a file called "wikt_EN_definitions.txt" in the current directory
def get_stuff(dumpfile="wikt.bz2"):
    """Scan a Wiktionary XML dump and build the module-global template store.

    Populates the globals ``Templates`` (template name -> utf-8 source text,
    plus a few magic-word entries) and ``alltitles`` (every page title in
    the dump), resolving one level of template redirects along the way.
    Returns the pair (Templates, alltitles).
    """
    import datetime
    dump=xmlreader.XmlDump(dumpfile)
    global Templates
    global alltitles
    Templates={}
    alltitles=set()
    for page in dump.parse():
        # capitalize() lowercases everything after the first character, so
        # this matches the "Template:" prefix case-insensitively.
        if "Template:" in page.title.capitalize():
            key=page.title.split("emplate:",1)[1].encode("utf-8","ignore")
            Templates[key]=page.text.encode("utf-8","ignore")
        alltitles.add(page.title)
    # Resolve one level of #REDIRECT [[Template:...]] indirection in place.
    for name in Templates:
        redirect=re.search("(?i)\#redirect\s*\[\[template\:(.*?)\]\]",Templates[name])
        if not redirect:
            continue
        try:
            print("Redirect from "+name+" to "+redirect.group(1))
            Templates[name]=str(Templates[redirect.group(1)])
        except Exception:
            # Redirect target transcludes a non-Template: page; these are
            # seldom found in definitions, so just report and move on.
            print("Error on "+name)
    # Minimal stand-ins for MediaWiki magic words.
    Templates["PAGENAME"]="page.name"
    Templates["CURRENTYEAR"]=str(datetime.date.today().year)
    Templates["NAMESPACE"]=""
    return Templates,alltitles
def rinse(text="",Templates={},alltitles=set()): #"text" must be a full, single template "{{...}}"
    """Expand one template invocation "{{...}}" into (rendered_text, args).

    Looks the template name up in ``Templates``, strips non-transcluded
    markup, parses the invocation's arguments, special-cases a few known
    templates, substitutes {{{...}}} parameters, recursively rinses nested
    templates, and finally evaluates the supported ParserFunctions
    (#if, #ifeq, #switch, #expr, #ifexist, lc, ucfirst, ns).

    NOTE(review): mutable default arguments are shared across calls; safe
    only because they are never mutated here.
    NOTE(review): indentation reconstructed from an unindented paste.
    """
    text=text.strip()
    args={}
    # Reject anything that is not a complete, single {{...}} wrapper.
    if text[0:2] != "{{" or text[-2:] != "}}":
        print "Invalid template.",Templates["PAGENAME"]
        return text,args
    # get & remove template name
    templatename=re.split("(?<=[^\\\]{1})\|",text)[0][2:].replace("}","").strip()
    text=text[2:-2]+"|" #chop off closing brackets for uniformity
    if templatename not in Templates:
        # Unknown template: leave a visible [[Template:...]] link, unless it
        # is a ParserFunction we can evaluate below.
        if templatename.split(":")[0] not in parserfunctions: return "[[Template:"+templatename+"]]",args
        elif ":" not in templatename and ("[" in templatename or "]" in templatename): return "{{"+text[:-1]+"}}",args
        else: template=text
    else: template=Templates[templatename]
    # Strip markup that is not transcluded.
    template=re.sub('\<noinclude\>[\s\S]*?\<\/noinclude\>','',template)
    template=re.sub('\<\!\-\-[\s\S]*?\-\-\>','',template)
    if "<onlyinclude>" in template and "</onlyinclude>" in template:
        template=template.split("<onlyinclude>")[1].split("</onlyinclude>")[0]
    template=template.replace("<includeonly>","").replace("</includeonly>","")
    #get args
    args=get_args(text)
    # print str(args)
    # some very special cases
    if templatename=="isValidPageName":
        # Empty or bracket-containing names are invalid.
        if not args["1"]: template=""
        for badchar in ["[","]","{","}"]:
            if badchar in args["1"]:
                template=""
                break
        if template: template="valid"
    elif templatename=="wlink": # Can get away with this because we just want plain text
        # NOTE(review): found is reset on every pass, so after the loop it
        # reflects only the last key ("w"); later keys also overwrite
        # template.  Looks buggy, but kept as-is.
        for arr in ["2","1","w"]:
            found=False
            if arr in args.keys():
                template=args[arr]
                found=True
        if not found: template=""
    elif templatename=="form of" or templatename=="form_of":
        if "2" in args.keys():
            template=args["1"]+" of "+args["2"]+"."
        else:
            template="Form of "+args["1"]+"."
        if "nocap" in args.keys():
            if args["nocap"]:
                # print "Nocap:"+ args["nocap"]
                template=template[0].lower()+template[1:]
            else:
                template=template[0].upper()+template[1:]
        if "nodot" in args.keys():
            if args["nodot"]:
                template=template[:-1]
    #Render passed args, substituting values gleaned above
    template=render_args(template,args)
    # Re-render while stray {{{ / }}} fragments remain, until a fixed point.
    while re.search("[^\{]+\{\{\{[^\{]+",template) or re.search("[^\}]+\}\}\}[^\}]+",template):
        formerly=template
        template=render_args(template,args)
        if template == formerly:
            break
    #second, templates and magic words
    for r in [x[1] for x in re.findall("((?<=[^\{]{1})|(?<=[\{]{2}))(\{\{[^\{\}]+?\}\})"," "+template+" ")]:
        r=re.sub(pipescape,"\\1\|\\3",r)
        rinsed,argybargy=rinse(r,Templates=Templates)
        if rinsed != template:
            template=template.replace(r,rinsed)
            template=template.replace("\|","|")
            # Return after the first successful nested expansion; the
            # caller's intake()/cycle() loop picks up the rest.
            # NOTE(review): placement of this return reconstructed -- it must
            # be conditional, or the ParserFunction code below is unreachable.
            return template,args
    #is this a ParserFunction?
    if templatename.split(":")[0] in parserfunctions:
        pfunction=templatename.split(":")[0]
        pargs=[n.strip() for n in re.split("(?<=[^\\\]{1})\|",template.split(":",1)[1])]
        if pfunction=="#if":
            # {{#if:test|then|else}} -- else defaults to "".
            if pargs[0].strip():
                template=pargs[1]
            else:
                try: template=pargs[2]
                except IndexError: template=""
        elif pfunction=="#ifeq":
            # {{#ifeq:a|b|then|else}}
            if pargs[0].strip()==pargs[1].strip():
                template=pargs[2]
            else:
                try: template=pargs[3]
                except IndexError: template=""
        elif pfunction=="#switch":
            # {{#switch:key|case=value|...|#default=value|fallback}}
            options=[p.split("=")[0] for p in pargs[1:]]
            if pargs[0] in options:
                reg=re.search("[\{\|]+?\s*"+pargs[0].strip()+"\s*\=(.*?)\s*[\}\|]+",template)
                if reg:
                    template=reg.group(1)
                else:
                    # Fall-through case syntax (|key|...=value).
                    reg=re.search("[\{\|]+?"+pargs[0].strip()+"\|[\{]+?\=(.*?)[\}\|]+",template)
                    try:
                        template=reg.group(1)
                    except:
                        template=""
            elif "#DEFAULT" in options:
                template=pargs[options.index("#DEFAULT")+1].split("=",1)[1]
            elif "#default" in options:
                template=pargs[options.index("#default")+1].split("=",1)[1]
            elif pargs[-1].strip():
                # A bare trailing value acts as the default case.
                if "=" not in pargs[-1]:
                    template=pargs[-1]
                else:
                    template=""
        elif pfunction=="lc":
            template=template.split(":",1)[1][:-1].lower()
        elif pfunction=="ucfirst":
            template=pargs[1].capitalize()
        elif pfunction=="ns":
            # Only namespace 0 (main, rendered as "") is modeled.
            if pargs[0]=="0": template=""
            else: template=pargs[0]
        elif pfunction=="#expr":
            # SECURITY NOTE(review): exec on wiki-supplied text is arbitrary
            # code execution; acceptable only for trusted local dumps.
            exec("template="+pargs[0])
            template=str(template)
        elif pfunction=="#ifexist":
            if pargs[0].strip() in alltitles:
                template=pargs[1]
            else:
                try:
                    template=pargs[2]
                    # print "template:" ,pargs[2]
                except IndexError:
                    template=""
    if template and template[-1]=="|": #undo the hack from the beginning
        template=template[0:-1]
    return template,args
def cycle(text="",Templates={},alltitles=set(),args={},limit=1000):
    """Run intake() over *text* until no template markup remains, a fixed
    point is reached, or *limit* passes have been made.

    Returns (clean, text): clean is False when any brace or underscore
    survives in the final text.
    """
    previous=""
    passes=0
    while "{{" in text and "}}" in text and passes < limit:
        text,args=intake(text,Templates,alltitles,args)
        if text == previous:
            break #nothing left that renders
        # A lone { or } mid-text means a rendering pass went wrong.
        if re.search("[^\{]+\{[^\{]+",text) or re.search("[^\}]+\}[^\}]+",text):
            print("Invalid output. "+Templates["PAGENAME"])
            # break
        previous=text
        passes+=1
    if "{" in text or "}" in text or "_" in text:
        return False,text
    return True,text
def intake(text="",Templates={},alltitles=set(),args={}): #For raw or returned text
    """Strip non-transcluded markup from *text*, then repeatedly rinse()
    the innermost template until nothing changes.

    Returns (text, args) where args is the argument dict of the last
    template rinsed, or {} when there was nothing to do.

    BUGFIX(review): the original early-exit test was
    ``len(text.split("{{"))==0`` -- str.split() always returns at least one
    element, so it could never fire and template-free text fell through to
    the (pointless) render/search machinery.  Replaced with a direct
    substring test; this also means brace-imbalance is no longer reported
    for texts with no "{{" at all, which avoids a KeyError on the default
    empty Templates dict.
    """
    text=re.sub('\<noinclude\>[\s\S]*?\<\/noinclude\>','',text)
    text=re.sub('\<\!\-\-[\s\S]*?\-\-\>','',text)
    text=text.replace("<includeonly>","").replace("</includeonly>","")
    if "{{" not in text:
        # print "Nothing to parse."
        return text,{}
    if text.count("{") != text.count("}"):
        print("Unmatched bracket. "+Templates["PAGENAME"])
        return text,{}
    text=render_args(text,args)
    # Matches an innermost (no nested braces) {{...}} invocation.
    templatecatcher="((?<=[^\{]{1})|(?<=[\{]{2}))(\{\{[^\{]+?\}\})"
    while re.search(templatecatcher," "+text):
        text=re.sub(pipescape,"\\1\|\\3",text) #escape any pipes inside bracketed links
        r=re.search(templatecatcher," "+text).group(2)
        rinsed,args=rinse(r,Templates)
        text=text.replace(r,rinsed)
        text=text.replace("\|","|")
        if r == rinsed: break  # no progress -- avoid an infinite loop
    return text,args
def balanced_triples(startval=3,str=""):
    """Return True when *startval* plus the number of "{{{" openers in the
    string equals the number of "}}}" closers.

    Used by render_args() to decide whether a reassembled parameter
    reference is complete.

    NOTE(review): the second parameter is (unfortunately) named ``str``,
    shadowing the builtin; the name is kept so existing keyword callers
    keep working.  The local previously named ``open`` (shadowing the
    builtin open()) has been renamed.
    """
    opening=startval+str.count("{{{")
    closing=str.count("}}}")
    return opening == closing
def render_arg(r,args,template):
    """Render one {{{...}}} parameter reference *r* inside *template*.

    Resolves the parameter's name (expanding any templates nested inside
    the name via cycle()), then substitutes the caller-supplied value from
    *args* -- or the reference's default, or a "_name_" placeholder -- into
    *template* and returns the result.

    NOTE(review): relies on module globals Templates/alltitles created by
    get_stuff(); raises NameError if called before they exist.
    NOTE(review): indentation reconstructed from an unindented paste.
    """
    argname=re.split("(?<=[^\\\]{1})\|",r)[0]
    # Peel off the surrounding triple braces, if present.
    if argname[-3:] == "}}}":
        argname=argname[:-3]
    if argname[:3] == "{{{":
        argname=argname[3:]
    if "{{" in argname or "}}" in argname:
        if argname.count("}}") == argname.count("{{"):
            null,argname=cycle(argname,Templates,alltitles,args) #Can't do anything until any functions/templates in the argument name are dealt with
        else: #if unbalanced, we must have cut too soon
            # Re-attach the pipe-separated pieces until braces balance.
            argparts=re.split("(?<=[^\\\]{1})\|",r)
            x=1
            while argname.count("{{") != argname.count("}}"):
                try:
                    argname=argname+"|"+argparts[x]
                except IndexError:
                    # NOTE(review): pieces exhausted while still unbalanced --
                    # the loop condition can no longer change, so this loops
                    # (and prints) forever.  Latent hazard, kept as-is.
                    print "We have a problem:", argname
                x+=1
            null,argname=cycle(argname,Templates,alltitles,args)
    # NOTE(review): re.escape() protects regex metacharacters for the re.sub
    # calls below, but the literal .replace() of "{{{"/"}}}" right after it
    # cannot match the escaped form; effective only for brace-free names.
    argname=re.escape(argname)
    argname=argname.replace("{{{","").replace("}}}","").strip()
    if argname in args:
        # Caller supplied a value: it wins over any default.
        template=re.sub("\{\{\{"+argname+"\|.*?\}\}\}",args[argname],template).strip()
        template=template.replace("{{{"+argname+"}}}",args[argname]).strip()
    else:
        # No value: fall back to the inline default, or mark it as _name_.
        template=re.sub("\{\{\{"+argname+"\|(.*?)\}\}\}","\\1",template).strip()
        template=template.replace("{{{"+argname+"}}}","_"+argname+"_").strip() #Bad hack
    return template
def parse_list(dumpfile="C:\Code\\wikt.bz2",limit=0):
    """Walk a Wiktionary XML dump and write tab-separated lines of
    (title, part-of-speech, cleaned definition) for English entries to
    wikt_EN_definitions.txt, then alphabetize a results file and prepend
    a license header.

    limit > 0 stops after that many dump pages (useful for testing).

    NOTE(review): indentation reconstructed from an unindented paste.
    NOTE(review): the alphabetize stage at the bottom reads and rewrites
    "parsed_titles.txt", NOT the "wikt_EN_definitions.txt" written above --
    looks like a stale filename; confirm before trusting the sorted output.
    """
    import xmlreader
    English=set()
    try: #in case the previous attempt was aborted, reuse data
        import transclusion
        Templates=transclusion.Templates
        alltitles=transclusion.alltitles
        print len(transclusion.Templates),len(transclusion.alltitles) #loaded as globals?
        if not len(transclusion.Templates) or not len(transclusion.alltitles): #Zeroed?
            print "Reloading templates and title list."
            Templates,alltitles=transclusion.get_stuff()
    except:
        print "Getting templates and title list."
        Templates,alltitles=transclusion.get_stuff()
    contemplates={"context":""} # for holding all members of the {{context}} family
    # Pass 1: templates that invoke {{context}} directly; remember their label.
    for t in Templates:
        if "{{context" in Templates[t]:
            temptext=re.sub("\[\[[^\]]*?\|(.*?)\]\]","\\1",Templates[t])
            labelmatch=re.search("label\=([^\|\}]*)",temptext)
            if not labelmatch:
                label=t
            else:
                label=labelmatch.group(1)
            contemplates[t]=label
    # Pass 2: templates whose body starts with a known context template.
    for t2 in Templates: #some templates use {{context}} at secondhand
        if t2 in contemplates: continue
        workingtemplate=re.sub("\<noinclude\>[\s\S]*?\<\/noinclude\>","",Templates[t2])
        workingtemplate=re.sub("\<.*?\>","",workingtemplate).strip()
        if workingtemplate[0:2] != "{{": continue
        else:
            if workingtemplate[2:].startswith(tuple(contemplates.keys())):
                labelmatch=re.search("label\=([^\|]+.*)",Templates[t2])
                if labelmatch:
                    label=re.sub("\[\[[^\]]*?\|","",labelmatch.group(1)).split("|")[0]
                else:
                    label=t2
                contemplates[t2]=label
    print "Identified "+str(len(contemplates))+" context templates."
    # Optional cache of known-English titles, one per line.
    try:
        English=set(open(entitlesfile).read().split("\n")) #just as a timesaver
    except:
        pass
    # print len(English)
    # Truncate the output file, then reopen it for appending.
    writefile=open("wikt_EN_definitions.txt","w")
    writefile.close()
    writefile2=open("wikt_EN_definitions.txt","a")
    # badfile=open("bad_pos.txt","w")
    limitcounter=0
    dump=xmlreader.XmlDump(dumpfile)
    for entry in dump.parse():
        limitcounter+=1
        if limit and limitcounter > limit: break
        if entry.title not in English and "==English==" not in entry.text: continue
        if ":" in entry.title: continue  # skip non-mainspace pages
        # try: print entry.title.encode('utf-8','ignore')
        # except: pass
        # Isolate the ==English== section.
        try:
            section=re.split("\n\=\=[^\=]{1}",entry.text.split("English==",1)[1])[0]
        except: continue
        Templates["PAGENAME"]=entry.title.encode("utf-8","ignore")
        # Split the section on ===...===== part-of-speech headers.
        posses=re.split("\n[\=]{3,5}(?=[^\=]{1})",section)[1:]
        valids=["noun","proper noun","verb","adjective","adverb","article","preposition","conjunction","determiner","letter","symbol","initialism","acronym","abbreviation","cardinal number","ordinal number","numeral","pronoun","particle","suffix","prefix","confix","infix","circumfix","interfix","interjection","phrase","proverb","number","contraction","idiom","affix"]
        print entry.title.encode("utf-8","ignore")
        for p in posses:
            if "\n#" not in p: continue  # no definition lines under this header
            pos=p.split("=")[0].replace("{","").replace("}","").capitalize().encode("utf-8","ignore")
            pos=pos.split("|")[0]
            if pos.lower() not in valids:
                continue
            # One definition per leading "#" line (sub-items #:, ##, #* excluded).
            defs=[e.split("\n")[0].strip() for e in re.split("\n[\#]+(?=[^\:\#\*]{1})",p)[1:]]
            for d in defs:
                d=re.sub("\[\[[^\]]*?\|","",d) #remove pipes now, since we're not going to want them anyway
                d=unescape(d)
                d=d.replace("etyl|","").replace("non-gloss definition|","") #change etyl to basic template.
                d=re.sub("\{\{i\|(.*?)\}\}","_\\1_",d)
                d=re.sub("\{\{.*term\|([^\|\}]*)(.*?)\}\}","_\\1_",d) # term and en-term
                begone=["jump\|","rf.*?","cattag.*?",]
                for begonia in begone:
                    d=re.sub("\{\{"+begonia+".*?\}\}","",d)
                d=re.sub("\{\{IPA\|(.*?)\}\}","\\1",d)
                d=re.sub("\<\!\-\-[\s\S]*?\-\-\>","",d)
                d=re.sub("\<ref.*?\>.*?\<\/ref\>","",d)
                # Replace context templates with their parenthesized labels.
                chunks=d.split("{{")
                rebuilt=chunks[0]
                for chunk in chunks[1:]: #context template hunting
                    tempname=chunk.split("|")[0].split("}")[0].strip()
                    if tempname in contemplates:
                        chunk=chunk.replace(tempname,contemplates[tempname])
                        # NOTE(review): result of the next line is immediately
                        # overwritten by the line after it; dead code kept as-is.
                        chunkparts=re.sub("[\|\{]*\=[\|\}]*","",chunk)
                        chunkparts=chunk.split("}}")
                        try:
                            chunkparts[0]=re.sub("\|.*?\=[^\|^\}]*","",chunkparts[0])
                            chunk=chunkparts[0].replace("|_|"," ").replace("|",", ")
                            rebuilt+="("+chunk+")"+chunkparts[1]
                        except IndexError:
                            continue
                    else: rebuilt+="{{"+chunk
                rebuilt=rebuilt.replace("(, ","(") #final tidy
                d=rebuilt
                # print d.encode('utf-8','ignore')
                try: #now we transclude what we can...
                    okay,newdefline=cycle(d,Templates,alltitles,limit=25)
                    if not okay:
                        if "}" not in newdefline or "{" not in newdefline:
                            d=newdefline.replace("}","").replace("{","") #the problem is probably stray brackets
                    else:
                        if "[[Template" not in newdefline: #this almost always means trouble, ergo if present, skip
                            d=newdefline
                except: pass #cycle() is still throwing some errors. If that happens, we just do it the hard way.
                d=d.replace("qualifier|","").replace("ib|","").replace("italbrac|","")#common forms of uglitude
                useless=["Template","Image","File","Category"] #template droppings, images
                for u in useless:
                    d=re.sub("(?i)\[\["+u+"\:[^\]]*?\]\]","",d)
                d=re.sub("\[\[[^\]]*?\|","",d) #visible text only, please
                d=re.sub("\[http[^\]\s]+?\s*(.*?)\]","\\1",d)
                d=re.sub("\(rf.*?\)","",d)
                d=re.sub("\|[^\|]+?\=[^\}\|]*","",d)
                d=re.sub("\<.*?\>","",d) # <span> et al.
                d=d.replace("[[","").replace("]]","")
                d=d.replace("{{","(").replace("}}",")")
                d=d.replace("\t"," ") # no valid use for tabs on definition line
                d=d.replace(" of|"," of ")
                d=d.replace("from=","from ") # Surname /given name templates
                d=d.replace("notcomparable","not comparable")
                d=d.replace("(,)", "")
                # Words after which a template-pipe comma is spurious.
                nocommas=["of","from","mostly","chiefly","usually","often","rarely","seldom","sometimes","extremely","markedly","or","and","except"]
                for n in nocommas:
                    d=d.replace(" "+n+","," "+n+" ").replace("("+n+",","("+n+" ")
                d=d.replace("'''",'"') #Most common use of explicit boldface (heaven knows why...)
                d=d.replace("''(","(").replace(")''",")").replace("(''","(").replace("'')",")")
                d=d.replace(":''",":").replace("):",")")
                d=d.replace("|_|"," ").replace("|",", ")
                d=d.replace('""','"')
                d=d.replace("''","_") #standard plain-text code
                d=d.replace(" )",")").replace("( ","(")
                d=re.sub("\s+"," ",d) #normalize spacing
                d=d.strip()
                try:
                    d=d[0].capitalize()+d[1:]
                except IndexError: d=d.capitalize()
                try:
                    d=d.encode('utf-8','ignore')
                except UnicodeDecodeError:
                    pass
                try:
                    line="\t".join([entry.title.encode('utf-8','ignore'),pos,d])
                    # print line
                except UnicodeDecodeError: continue
                writefile2.write(line+"\n")
    # badfile.close()
    writefile2.close()
    #alphabetize
    print "Alphabetizing..."
    lines=[(x.split("\t",1)[0],x) for x in open("parsed_titles.txt").read().split("\n")]
    lines.sort()
    fileheader="****\n\nThis is a text dump of definitions from Wiktionary, http://en.wiktionary.org, which is licensed under the GNU Free Documentation License. See http://en.wiktionary.org/w/index.php?title=PAGENAME&action=history for the full list of contributors to each entry. \n\n****\n\n"
    writefile=open("parsed_titles.txt","w")
    writefile.write(fileheader)
    for line in lines:
        writefile.write(line[1]+"\n")
    writefile.close()
    make_dictionarylike() # just for fun
def make_dictionarylike(file="parsed_titles.txt",outfile="dictionarylike.txt",searchterm=""):
    """Collapse the tab-separated (title, pos, definition) rows in *file*
    into one dictionary-style wikitext line per headword, sorted and
    written to *outfile*.

    NOTE(review): ``searchterm`` is accepted but never used.
    NOTE(review): ``file`` shadows the Python 2 builtin of the same name.
    NOTE(review): entries are flushed only when the headword changes, so
    the very first flush stores a stub under sortkey "" and the file's
    final entry is never written out.
    """
    currentword=""
    currentpos=""
    currententry=""
    sortkey=""
    outlines={}  # sortkey -> finished wikitext line
    # Skip the "****"-delimited license header when present.
    try:
        lines=open(file).read().split("****",2)[2].split("\n")
    except IndexError:
        lines=open(file).read().split("\n")
    for line in lines:
        parts=line.split("\t")
        if len(parts) != 3: continue  # malformed row
        if not parts[2].strip(): continue  # empty definition
        try:
            definition=parts[2][0].upper()+parts[2][1:]
        except IndexError:
            definition=parts[2].capitalize()
        if re.search("\(.*?\)\W*\n",parts[2]+"\n"):
            continue #skip any words that have a parenthesized (templated) definition only
        elif "participle of" in parts[2] or "tense of" in parts[2] or "past of" in parts[2] or re.search("form of .+\.",parts[2]) or "spelling of" in parts[2]: #Get thee gone, inflected forms!
            continue
        if currentword != parts[0]:
            # New headword: flush the previous entry with an inline edit link.
            outlines[sortkey]="*"+currententry+"<sup>[http://en.wiktionary.org/w/index.php?action=edit&title=%s e]</sup>\n" % urllib.quote(currentword)
            currentword=parts[0]
            currentpos=parts[1]
            currententry="'''"+currentword+"'''. ''"+currentpos+"''. "
            count=1
        elif currentpos != parts[1]:
            # Same headword, new part of speech: restart sense numbering.
            currentpos=parts[1]
            count=1
            currententry+="''%s''. " % currentpos
        else:
            count+=1
        currententry+="'''"+str(count)+".''' "+definition
        sortkey=re.sub("[^\w]","",currentword).lower().strip()+" "+currentword #Use fulll current word for tiebreaking only
        # Ensure every sense ends with ". ".
        if currententry.strip()[-1] != ".":
            currententry+=". "
        elif currententry[-1] != " ":
            currententry+=" "
    writefile=open(outfile,"w")
    sortkeys=outlines.keys()
    sortkeys.sort()  # Python 2 idiom: keys() returns a sortable list
    for s in sortkeys:
        writefile.write(outlines[s])
    writefile.close()
def page_from_word(word="",infile="dictionarylike.txt",outfile="daypage.txt"):
    """Build a two-column HTML "page of the day" centered on *word* from
    the file produced by make_dictionarylike(), writing it to *outfile*.

    With no word given, the current Word of the Day is scraped from the
    en.wiktionary main page, and (in that case only) the result is also
    uploaded to User:Visviva/Page of the day via the pywikibot framework.
    """
    if not word:
        import urllib2
        mainpage=urllib2.urlopen(urllib2.Request("http://en.wiktionary.org/wiki/Wiktionary:Main_Page",'',{'User-agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'})).read()
        word=mainpage.split('<span id="WOTD-rss-title">')[1].split("</span>")[0]
        print word
        daily=True
    else:
        daily=False
    intext=open(infile).read()
    # Everything from the word's entry onward.
    text=intext.split("\n*'''"+word+"'''",1)[1]
    pagenum=int((len(intext)-len(text))/20000) #length of text up to this point, divided by approx. page length
    # Stash edit links as tab-delimited titles so later markup passes
    # cannot mangle them; restored near the bottom.
    text=re.sub("\[[\S]+?title\=(.*?) .+?\]\W*\n","\t\\1\t\n",text) #escape temporarily
    # Take ~20000 chars, extended to the end of the line we landed in.
    edgetext=text[20000:].split("\n",1)[0]
    text=text[0:20000]+edgetext
    length=0
    firstpage=""
    # First column: roughly the first 10000 characters, whole lines only.
    for t in text.split("\n"):
        firstpage+=t+"\n"
        length+=len(t)
        if length > 10000: break
    secondpage=text.replace(firstpage,"")
    firstpage="*'''"+word+"'''"+firstpage #restore
    header='<!-- page of the day, an experimental service based on WOTD --><table><tr><th width="45%" align="left">'+word+'</th><th width="5%" align="center">'+str(pagenum)+'</th><th width="45%" align="right">'+text.split("\n*'''")[-1].split("'''")[0]+'</th></tr><tr valign="top"><td>'
    breaker="</td><td></td><td>"
    footer="</td></tr></table>"
    text="\n".join([header,firstpage,breaker,secondpage,footer])
    # Un-stash the edit links escaped above.
    text=text.replace("\t\n"," e]\n").replace("\t","[http://en.wiktionary.org/w/index.php?action=edit&title=")
    writefile=open(outfile,"w")
    writefile.write(text)
    writefile.close()
    if daily:
        import wikipedia
        site=wikipedia.getSite("en","wiktionary")
        page=wikipedia.Page(site,"User:Visviva/Page of the day")
        page.put(text)
def unescape(text):
#From code by Fredrik Lundh at http://effbot.org/zone/re-sub.htm#-html
# Licensed to the public domain at http://effbot.org/zone/copyright.htm
# Seems to work better than BeautifulSoup for this purpose
def fixup(m):
text = m.group(0)
if text.startswith("&#"):
try:
if text.startswith("&#x"):
return unichr(int(text[3:-1], 16))
else:
return unichr(int(text[2:-1]))
except ValueError:
pass
else:
try:
text = unichr(name2codepoint[text[1:-1]])
except KeyError:
pass
return text
return re.sub("\&\#?\w+\;", fixup, text)
def render_args(template,args):
    """Substitute {{{...}}} parameter references in *template* using *args*.

    First pass: every simple, complete {{{name}}} / {{{name|default}}}
    reference found by the regex is handed to render_arg().  Second pass:
    the text is split on "{{{" and reassembled piecewise so that nested or
    multi-part references -- whose triple braces did not balance within one
    fragment -- get rendered too.

    NOTE(review): indentation reconstructed from an unindented paste.
    """
    matches=re.findall("((?<=[^\{]{1})|(?<=[\{]{2}))(\{\{\{[^\{\}\#]+?\}\}\})"," "+template+" ")
    if not matches:
        return template
    for r in matches:
        template=render_arg(r[1],args,template)
    argparts=template.split("{{{")
    counter=0
    unfinished=""  # accumulates fragments until their triple braces balance
    while counter+1 < len(argparts):
        counter+=1
        workingarg=unfinished+argparts[counter]
        # print "2nd iter"
        if "}}}" not in workingarg:
            # No closer yet: keep accumulating.
            unfinished+="{{{"+workingarg
            continue
        elif balanced_triples(3,workingarg) or balanced_triples(3,workingarg.split("}}}")[0]+"}}}"):
            unfinished=""
            workingarg="{{{"+("}}}".join(workingarg.split("}}}")[:-1])) # We know triple is balanced, so chop off anything after the last "}}}"
            if "{{{" in workingarg[3:]: #possibility of unrendered sub-args?
                workingarg=render_args(workingarg,args)
            template=render_arg(workingarg,args,template)
        else:
            # Still unbalanced: keep accumulating.
            unfinished+="{{{"+workingarg
            continue
    return template
def get_args(text):
    """Parse the pipe-separated body of a template invocation into a dict.

    *text* is the invocation's inner text as prepared by rinse() --
    "name|a|b=c|...|" with a trailing pipe.  Named parameters (b=c) are
    keyed by name; anonymous parameters are keyed by their 1-based pipe
    slot number as a string.  Wiki-link markup is stripped from all values.

    NOTE(review): indentation reconstructed from an unindented paste; the
    "did we go too far?" trim is assumed to run once per outer iteration,
    after the brace-balancing inner loop.
    """
    workingtext=" "+text+" "
    # Named parameters: |name=value, where the value may contain balanced
    # {{...}} but no loose pipes or braces.
    args=dict((y[0].strip(),y[1].strip()) for y in re.findall("(?<=[^\\\]{1})\|([^\|\}\<\>\#]+?)\=(.{0}|[^\{\}\|]*?(\{\{.*?\}\})*[^\{\}\|]*?[^\\\\|]{1})(?=[\|\}]{1})",workingtext))
    x=0
    # Anonymous parameters: pipe-separated slots without an "=".
    anonyparts=re.findall("(?<=[^\\\]{1}\|)([^\{\}\|\=]*([\{\[]{2}[^\}\{]+?[\}\]]{1,2})*[^\{\}\\=|]*?[^\=\\\\|]*[^\\\\=\|]{1}|.{0})[\|]{1}",text)
    nextpart=""  # NOTE(review): unused
    while x < len(anonyparts):
        thispart=anonyparts[x][0]
        # Rejoin slots that were split inside a nested {{...}}.
        while thispart.count("{{") != thispart.count("}}"):
            thispart=thispart+"|"+anonyparts[x+1][0]
            # print thispart
            x+=1
        if "{{" in thispart and "}}" not in thispart.split("{{")[-1]: # did we go too far?
            thispart="}}".join(thispart.split("}}")[:-1])
        args[str(x+1)]=thispart.strip()
        x+=1
    for a in args:
        # Collapse piped links and strip bracket/section markup from values.
        args[a]=re.sub("\[\[.*?\|(.*?)\]\]","\\1",args[a])
        args[a]=args[a].replace("[","").replace("]","") #cheating... don't want this markup for now.
        args[a]=re.sub("(\w+)\#\w+","\\1",args[a]) # section links
    return args