# Source: Wiktionary user page "User:Visviva/Python/wiktiparse.py"
# (wiki page navigation chrome removed from the scraped copy)
import urllib2
import re, time
#from nltk import metrics
#import nlp3
#parser=nlp3.Parser()
# Section titles (lower-cased) recognized as part-of-speech headings;
# a Section with one of these titles is flagged is_pos=True.
posses=["noun","verb","adjective","adverb","determiner","article","preposition","conjunction","proper noun","letter","character","phrase","proverb","idiom","symbol","syllable"]
# Subsection titles whose bullet lists encode semantic relations between senses.
relations=["synonyms","antonyms","hypernyms","hyponyms","meronyms","holonyms","troponyms","related terms","derived terms","coordinate terms"]
class Section:
    """One wikitext section of a Wiktionary entry, recursively parsed.

    Splits the raw text into subsections at the next header level, pulls
    out categories and interwiki links, then parses definition lines
    ("#"), note bullets ("*"), and translation tables into attributes.
    Indentation below was reconstructed from a whitespace-stripped copy;
    NOTE(review) comments mark spots whose original nesting is uncertain.
    """
    def __init__(self,text,level=2,title='entry',parent=False,fancy=True):
        self.subsections={}        # title -> child Section
        self.is_pos=False          # True when title is a part-of-speech heading
        self.content=''
        self.definitions=[]        # Sense objects parsed from "#" lines
        self.notes=[]              # Sense objects parsed from "*" bullets
        self.paragraphs=[]         # remaining plain lines
        self.defmatchers={}        # gloss text -> lines to match against senses
        self.inflection=''
        self.title=title.lower().replace("_"," ")
        self.parent=parent         # parent Section, or False at the root
        self.categories=[]
        if self.parent:
            # Index is the underscore-joined path of titles from the root.
            self.index=self.parent.index+"_"+self.title
        else:
            self.index=self.title
        self.level=level #level of the current header; 1 for a full all-languages page
        if fancy:
            global parser
            # NOTE(review): `global parser` is declared but only self.parser is
            # assigned, so the module-level `parser` used elsewhere is never set.
            import nlp3
            self.parser=nlp3.Parser()
        else:
            self.parser=False
        # print len(text), self.title,len(self.definitions)
        # Strip a pair of mis-encoded bytes (presumably curly quotes from a
        # non-UTF-8 source -- TODO confirm encoding).
        text=text.replace("\xa1\xb0","").replace("\xa1\xb1","")
        if self.title in posses: self.is_pos=True
        #are there any subsections in the section?
        subs=self.split(text,level=level+1)
        if len(subs) >= 2:
            # The '' key holds text that precedes the first sub-header.
            self.content=subs[''].strip()
            subsections=[Section(x.strip(),level=level+1,title=s.strip(),parent=self,fancy=fancy) for s,x in subs.items() if s]
            self.subsections=dict((x.title,x) for x in subsections)
        else:
            self.content=text.strip()
        self.content=self.content.replace("\n----","")
        self.content=re.sub("[\n\r]+\{\{count page.*?\}\}","",self.content)
        lines=[x.strip() for x in self.content.split("\n") if x.strip()]
        if self.content: # check for categories and interwikis
            # Categories are recorded on the language-level (level-2) ancestor.
            main=self.getmain(level=2)
            catlines=[x for x in lines if x.startswith("[[Category:")]
            if catlines:
                for c in catlines:
                    # NOTE(review): this generator looks garbled -- it rebinds
                    # `x` in both clauses and ends up iterating characters of a
                    # string, so category names are almost certainly not
                    # extracted as intended; confirm against a working copy.
                    main.categories.extend(x.strip() for x in x.split("]]")[0] for x in c.split("[[Category:"))
                    if main != self:
                        self.content=self.content.replace(c,"")
            interlines=[x for x in lines if re.match("\[\[[a-z][a-z]+\:",x)]
            for i in interlines:
                self.content=self.content.replace(i,"") #for now, just remove
        self.content=self.content.strip()
        if self.content and self.title != "translations":
            if self.is_pos:
                # First line of a POS section is the inflection/headword line.
                self.inflection=lines[0]
            defsplus=[x for x in lines if x.startswith("#")]
            prevdef=False
            for d in defsplus:
                # "#..." (but not "#:"/"#*") starts a new definition.
                defline=re.match('\#+([^\*\:]+.*)',d)
                if defline:
                    defn=Sense(defline.group(1).strip())
                    self.definitions.append(defn)
                    prevdef=defn
                elif '#:' in d or '#*' in d:
                    # Example/citation lines attach to the preceding definition.
                    if prevdef:
                        prevdef.citations.append(d)
                    else:
                        print "Unprocessed data: "+d
            bullets=[x for x in lines if x.startswith("*")]
            prevnote=False
            for b in bullets:
                noteline=re.match('\*([^\#\:\*]+.*)',b)
                if noteline:
                    noteline=noteline.group(1)
                    noteline=noteline.strip()
                    note=Sense(noteline)
                    if noteline.startswith("{{sense"):
                        sense=noteline.split("{{sense")[1].split("|")[0]
                        rest=noteline.split("}}",1)[1]
                        if self.defmatchers:
                            # NOTE(review): raises KeyError when defmatchers is
                            # non-empty but `sense` is a new key -- TODO confirm.
                            self.defmatchers[sense].append(rest)
                        else:
                            self.defmatchers[sense]=[rest]
                    else:
                        self.notes.append(note)
                    # NOTE(review): indentation reconstructed so prevnote is set
                    # for sense-matcher lines too; original nesting is ambiguous.
                    prevnote=note
                elif '*:' in b or '**' in b:
                    if prevnote:
                        prevnote.citations.append(b)
                    else:
                        print "Unprocessed data: "+b
            self.paragraphs=[x for x in lines if x not in bullets and x not in defsplus]
        if self.content and self.title == "translations":
            content=str(self.content)
            if "{{checktrans" in content:
                # Translations flagged for checking go under the '' key.
                if "{{checktrans-top" in content: catchme="checktrans-top"
                else: catchme="checktrans}}"
                checkit=content.split(catchme)[1].strip()
                content=content.split(catchme)[0].strip()
                self.defmatchers['']=checkit
            if "{{trans-top" in self.content:
                # Each {{trans-top|gloss}} table is keyed by its gloss text.
                chunks=self.content.split("{{trans-top|")[1:]
                self.defmatchers.update(dict(tuple(c.split("}}",1)) for c in chunks if "}}" in c))
            else:
                self.defmatchers['']=self.content
        if fancy: self.process_defmatchers()
def __str__(self):
    """A Section stringifies as its title (lower-cased in __init__)."""
    return self.title
def split(self,text,level=2): #splits wikitext by section at specified level
    """Split wikitext at headers of the given level.

    Returns a dict mapping header title -> section body; the '' key
    holds whatever text precedes the first header at that level.
    """
    marker="=" * level
    pattern="[\n\r]+"+re.escape(marker)+"\s*([^\=]+?)\s*"+re.escape(marker)+"[\n\r]+"
    # re.split with one capture group alternates body, title, body, ...;
    # prepending '' shifts it so even slots are titles, odd slots bodies.
    pieces=['']+re.split(pattern,"\n"+text)
    return dict(zip(pieces[0::2],pieces[1::2]))
def getmain(self,level=1):
    """Walk up the parent chain to the ancestor at `level` (or the root)."""
    node=self
    # Stop either at the requested level or when there is no parent left.
    while node.parent and node.level != level:
        node=node.parent
    return node
def process_defmatchers(self):
if not self.is_pos or not self.definitions:
for s in self.subsections.values():
s.process_defmatchers()
return True
if self.is_pos and len(self.definitions)==1:
onlyone=self.definitions[0]
for s in self.subsections.values():
if s.title.lower() in relations:
onlyone.relations[s.title.lower()] = s.notes
return True
else:
onlyone=False
for s in self.subsections.values():
if not s.defmatchers or not self.parser:
if not onlyone: continue
print "**** %s ****" % s.title
matches=parser.matchup3(s.defmatchers.keys(),[str(x) for x in self.definitions])
if "__unmatched__" in matches.values():
print "Unable to match: "+str([x for x in matches.keys() if matches[x]=="__unmatched__"])
for d, v in s.defmatchers.items():
if not d: continue
if matches[d]=="__unmatched__": continue
match=[x for x in self.definitions if str(x) == matches[d]][0]
match.glosses.append(d)
# senses=sorted((metrics.edit_distance(str(x),d),x) for x in self.definitions)
# print senses[0][0],str(senses[0][1]),d
if s.title != "translations":
match.relations[s.title]=v
# senses[0][1].relations[s.title]=v
else: #translations...
text=str(v)
for t in text.split("\n"):
t=t.strip()
if not t.startswith("*"): continue
tr=re.match("\*[\*\:\#]*(.+)",t)
if tr:
tr=tr.groups(1).strip()
if ":" not in tr: print "Unable to process: "+tr
else:
match.translations[tr.split(":")[0]]=tr.split(":",1)[1]
# senses[0][1].translations[tr.split(":")[0]]=tr.split(":",1)[1]
# print senses[0][1].translations
def all_definitions(self,everything=False,pos=False):
    """Collect definitions from this section and all POS descendants.

    Returns (title, definition) tuples, or with everything=True the
    richer (title, definition, relations, translations, glosses) form.
    A `pos` string restricts results to matching part-of-speech titles.
    """
    collected=[]
    if not self.is_pos:
        # Not a POS section itself: gather from every child section.
        for child in self.subsections.values():
            collected.extend(child.all_definitions(everything,pos))
    elif not pos or pos.lower() in self.title:
        if everything:
            collected.extend([(self.title,str(d),d.relations,d.translations,d.glosses) for d in self.definitions])
        else:
            collected.extend([(self.title,str(d)) for d in self.definitions])
    return collected
def all_subsections(self):
    """Return direct subsections plus their children (one extra level deep).

    BUG FIX: the original did `output=self.subsections`, aliasing the
    instance dict, so the update() calls below injected grandchildren
    into self.subsections itself.  Copying first keeps state intact.
    """
    output=dict(self.subsections)
    for s in self.subsections.values():
        output.update(s.subsections)
    return output
class Sense:
    """One definition (or note) line: content plus template-derived flags.

    Attributes: glosses/citations/translations/relations are filled in
    later by Section.process_defmatchers; exists, only_templates,
    is_non_gloss and form_of are derived here from the wikitext templates.
    """
    def __init__(self,content):
        self.content=''
        self.glosses=[]
        self.citations=[]
        self.translations={}
        self.relations={}
        self.content=content.strip()
        self.exists=True
        self.only_templates=False
        self.is_non_gloss=False
        self.form_of=False
        self.has_templates="{{" in self.content and "}}" in self.content
        # assuming for now that nested templates in sense lines will be vanishingly rare
        sans_templates=re.sub("\{\{[^\}]*?\}\}","",self.content)
        if not sans_templates:
            self.only_templates=True
            if "{{non-gloss definition|" in self.content:
                # BUG FIX: the replacement was "\1", which Python reads as the
                # octal escape chr(1); the raw string r"\1" substitutes the
                # captured gloss text as intended.
                self.content=re.sub("\{\{non\-gloss definition\|(.*)\}\}",r"\1",self.content)
                self.is_non_gloss=True
            elif "{{defn" in self.content:
                # {{defn}} marks an entry with no real definition yet.
                self.content=""
                self.exists=False
            else:
                # e.g. "{{plural of|dog}}" -> form_of = "dog"
                self.form_of=self.content.split("}}")[0].split("|")[-1]
        elif " of|" in self.content:
            self.form_of=self.content.split(" of|")[1].split("}}")[0].strip()
    def __str__(self):
        return self.content
class Entry:
    """A whole wiki page parsed as a Section tree.

    `languages` lists the top-level (language) section titles; when a
    `language` is given, `entry` is narrowed to that language's Section.
    Raises IndexError if the requested language is not present.
    """
    def __init__(self,text='',language=False,fancy=True):
        # (Removed an unused local `posses=[]` that shadowed the module-level
        # constant of the same name.)
        if text:
            self.entry=Section(text,level=1,fancy=fancy)
            self.languages=[x.title for x in self.entry.subsections.values()]
            if language:
                self.entry=[x for x in self.entry.subsections.values() if x.title == language][0]
class Diff:
    """Structural diff between two revisions of an entry.

    Parses both texts as Section trees and records which section indices
    were added, removed, or changed between `text1` and `text2`.
    """
    def __init__(self,text1,text2,title=''):
        self.added=set()      # section indices present only in text2
        self.removed=set()    # section indices present only in text1
        self.changed=set()    # indices in both revisions whose content differs
        self.title=title.replace("_"," ")
        # fancy=False: no NLP parser is needed just to diff structure.
        self.before=Section(text1,level=1,fancy=False)
        self.after=Section(text2,level=1,fancy=False)
        self.beforesections=iterate_content(self.before,[])
        self.aftersections=iterate_content(self.after,[])
        self.beforeindices=set([x[0] for x in self.beforesections])
        self.afterindices=set([x[0] for x in self.aftersections])
        if self.beforesections == self.aftersections:
            print "No changed sections."
#            print self.beforeindices,self.afterindices
        else:
            self.added=self.afterindices-self.beforeindices
            self.removed=self.beforeindices-self.afterindices
            # Sections present in both revisions, keyed by index -> old content.
            stillthere=dict([(x[0],x[1]) for x in self.beforesections if x[0] in self.afterindices])
            self.changed=set()
            for s in stillthere: #index
                before=stillthere[s]
                after=[x[1] for x in self.aftersections if x[0] == s] [0] #content
                if before.strip() != after.strip():
                    self.changed.add(s)
def __str__(self):
    """Summarize added/removed/modified section indices, or report no changes."""
    if not (self.added or self.removed or self.changed):
        return 'No changes to entry "%s"' % self.title
    parts=['Changes to entry "%s"' % self.title]
    if self.added:
        parts.append("Added sections: "+", ".join(self.added))
    if self.removed:
        parts.append("Removed sections: "+", ".join(self.removed))
    if self.changed:
        parts.append("Modified sections: "+", ".join(self.changed))
    return "\n".join(parts)
def iterate_content(section,thelist=None): #create a flat list of tuples (title,content,subsections)
    """Flatten a Section tree into (index, normalized content, subsections) tuples.

    BUG FIXES: (1) the default argument was a shared mutable list, so
    repeated calls without an explicit list accumulated stale results;
    (2) re.sub's return value was discarded, so whitespace was never
    actually stripped from the content before Diff comparisons.
    """
    if thelist is None:
        thelist=[]
    content=section.content.strip()
    # Collapse all whitespace away so Diff compares text, not layout.
    content=re.sub("[\s\r\n]+","",content)
    thelist.append((section.index,content,section.subsections.values()))
    for s in section.subsections.values():
        iterate_content(s,thelist)
    return thelist
def unescape(text):
    #From code by Fredrik Lundh at http://effbot.org/zone/re-sub.htm#-html
    # Licensed to the public domain at http://effbot.org/zone/copyright.htm
    # Seems to work better than BeautifulSoup for this purpose
    """Replace HTML character references (&#65;, &#x41;, &amp;) with their characters."""
    # BUG FIX: name2codepoint was referenced but never imported anywhere in
    # the file, so any named entity (e.g. &amp;) raised NameError.
    # (htmlentitydefs is the Python 2 module name, matching this file.)
    from htmlentitydefs import name2codepoint
    def fixup(m):
        text = m.group(0)
        if text.startswith("&#"):
            # Numeric reference, decimal or (with &#x) hexadecimal.
            try:
                if text.startswith("&#x"):
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # Named entity such as &amp; -- unknown names pass through as-is.
            try:
                text = unichr(name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text
    return re.sub("\&\#?\w+\;", fixup, text)
def cycle(cycletime=60,pause=5,depth=10):
    """Poll the en.wiktionary recent-changes API feed forever.

    Every `cycletime` seconds fetches up to `depth` recent changes,
    downloads the before/after revision texts (sleeping `pause` seconds
    between requests), and prints a Diff summary for each mainspace edit.
    Runs until interrupted; all output goes to stdout.
    """
    last_timestamp=False
    while True:
        url="http://en.wiktionary.org/w/api.php?action=query&list=recentchanges&format=xml&rclimit="+str(depth)
        if last_timestamp:
            # Squash the ISO timestamp to MediaWiki's numeric form and add one
            # second so the newest already-seen change is not re-fetched.
            last_timestamp=str(int(last_timestamp.replace("T","").replace("-","").replace(":","").replace("Z","").split(".")[0])+1)
            url+="&rcend="+last_timestamp
#        print url
        rcpage=unicode(urllib2.urlopen(url).read(),"utf-8","ignore")
        if not rcpage:
            print "Unable to open page"
            time.sleep(cycletime)
            continue
        # Reverse so changes are processed oldest-first (API returns newest-first).
        changes=re.findall("\<rc (.*?)\/\>",rcpage)[::-1]
        print len(changes)
        if not changes:
            print "No updates..."
            time.sleep(cycletime)
            continue
        last_timestamp=changes[-1].split('timestamp="')[1].split('"')[0]
        for c in changes:
            after_id=c.split('revid="')[1].split('"')[0]
            before_id=c.split('old_revid="')[1].split('"')[0]
            title=c.split('title="')[1].split('"')[0]
            if ":" in title: continue   # skip non-mainspace (namespaced) pages
            title=title.replace(" ","_")
            if "&#" in title: title=unescape(title)
            after_url=unicode("http://en.wiktionary.org/w/api.php?action=query&prop=revisions&titles=%s&rvlimit=1&rvstartid=%s&rvprop=comment|content|user&format=xml").encode("utf-8","ignore") % (title,after_id)
            after_url=after_url.encode("utf-8","ignore")
            time.sleep(pause)
            after=False
            # Retry until the "after" revision downloads successfully.
            while not after:
                try:
                    after=unicode(urllib2.urlopen(after_url).read(),"utf-8","ignore").encode("utf-8","ignore")
                except:
                    print "Error in loading %s" % after_url
                    time.sleep(pause)
            if before_id and before_id != "0":
                before_url=unicode("http://en.wiktionary.org/w/api.php?action=query&prop=revisions&titles=%s&rvlimit=1&rvstartid=%s&rvprop=comment|content|user&format=xml").encode("utf-8","ignore") % (title,before_id)
                before_url=before_url.encode("utf-8","ignore")
                time.sleep(pause)
                before=False
                while not before:
                    try:
                        before=unicode(urllib2.urlopen(before_url).read(),"utf-8","ignore").encode("utf-8","ignore")
                    except:
                        print "Error in loading %s" % before_url
                        time.sleep(pause)
            else:
                # old_revid == 0 means this edit created a brand-new page.
                before_text=''
                before=''
            if 'missing=""' in before or 'missing=""' in after: #entry has been deleted
                print "Entry %s deleted." % title.encode("utf-8","ignore")
                continue
            try: after_text=after.split('xml:space="preserve">')[1].split("</rev>")[0].strip()
            except IndexError:
                print after
                print after_url
                continue
            if before:
                try:
                    before_text=before.split('xml:space="preserve">')[1].split("</rev>")[0].strip()
                except:
                    print before
                    print before_url
                    continue
            try:
                summary=after.split('comment="')[1].split('"')[0]
            except IndexError:
                summary=''
            user=unicode(after.split('user="')[1].split('"')[0],"utf-8","ignore")
            diff=Diff(before_text,after_text,title.encode("utf-8","ignore"))
            timestamp=c.split('timestamp="')[1].split('"')[0].encode("utf-8","ignore")
            print "\n\n%s edited %s at %s" % (user.encode("utf-8","ignore"),title.encode("utf-8","ignore"),timestamp)
            print "Summary: %s" % summary
            if not before: print "(New page)"
            print diff #outputs summary when coerced to string
        print "Waiting..."
        time.sleep(cycletime)
def present(section): #human-readable depiction of entry as parsed
print "***** %s *****" % section.title
if section.is_pos: print "POS Section"
print "Content:\n"+section.content[:100]
if section.definitions:
print "Definitions:\n"+"\n".join(str(x) for x in section.definitions)
if section.notes:
print "Notes:\n"+"\n".join(str(x) for x in section.notes)
if section.defmatchers:
print "Defmatchers:\n"+"\n".join(str((x,v)) for x,v in section.defmatchers.items())
if section.subsections:
print "Subsections: "+", ".join(section.subsections.keys())
for s in section.subsections:
present(s)