User:Flubot/Adding sort key to French words
Jump to navigation
Jump to search
How to run
- A PC running Linux is needed; also python and pywikipedia.
- Copy the following script into pywikipedia folder.
- Make a list of words to modify and save it in the same folder, or else choose a whole category
- Run with
- python taxin-fr.py -file:filename
- python taxin-fr.py -cat:CategoryName
- For more options see def main()
Known problems
- The script cannot handle headword templates that include other templates, such as {{l}}, so it will not modify any such pages.
- Other templates outside of the inflexion line are also used for categorizing pages (see for instance Template:past participle of).
- Some pages do not have any headword template and use [[Category:French xxxxx]] instead.
taxin-fr.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import wikipedia, pagegenerators, catlib
import sys, re, codecs
class kleidabot:
    """Add, replace or remove sort keys on French entries.

    For every page yielded by ``generator`` a sort key is derived from the
    page title: accented letters are folded to their base letter, ligatures
    are expanded, hyphens become spaces and some punctuation is dropped.
    If the result differs from the title, the lower-cased key is written
    into the French headword templates, ``[[Category:French ...]]`` links,
    ``{{plural of}}`` and ``{{... past participle of}}`` templates found on
    the page.  If the result equals the title, existing sort keys in those
    places are removed instead.

    Parameters:
        generator: iterable of pywikipedia page objects; only ``title()``
            is called on them.
        debug_bul: when true (the default) the operator is asked to confirm
            each change until they answer "all"; when false every change is
            saved without asking.
    """

    # Punctuation removed from the title when building the sort key.
    _spaces_tag = re.compile(r"([\*\.,'\(\)]*)")
    # Typographic apostrophe (U+2019), also removed from the sort key.
    _apost_tag = re.compile(u'\u2019')
    # {{fr-POS ...|sort=... or |cat=... ...}} : headword template with a key.
    _pos2_tag = re.compile(r'\{\{fr-(?P<pos>noun|adj|adv|abbr|intj|letter|phrase|prep|pron|verb|diacritical|proper|past participle)(?P<parms>.*)(?P<kleidi>\|(sort|cat)\=[^\|\}]*)(\|*)(?P<selse>.*)\}\}')
    # {{fr-POS ...}} : headword template, any parameters.
    _pos1_tag = re.compile(r'\{\{fr-(?P<pos>noun|adj|adv|abbr|intj|letter|phrase|prep|pron|verb|diacritical|proper|past participle)(?P<param>[^\}]*)(\|*)\}\}')
    # Start of a template; used to detect templates nested in parameters.
    _templ_tag = re.compile(r'\{\{')
    # {{infl|fr|POS...|sort=...}} and {{infl|fr|POS...}}
    _inflsort_tag = re.compile(r'\{\{infl\|fr\|(?P<pos>noun|adj|adv|abbr|intj|plural|past participle form|letter|phrase|prep|prefix|pron|verb|diacritical mark|proper noun)(?P<param>[^\}]*)(?P<kleidi>\|sort\=[^\}]*)\}\}')
    _infl_tag = re.compile(r'\{\{infl\|fr\|(?P<pos>noun|adj|adv|abbr|intj|plural|past participle form|letter|phrase|prep|prefix|pron|verb|diacritical mark|proper noun)(?P<param>[^\}]*)(\|*)\}\}')
    # {{... past participle of|VERB|lang=fr[|cat=.../|sort=...]}}
    _ppartsort_tag = re.compile(r'\{\{(?P<formof>.*)past participle of\|(?P<verb>.+)\|lang=fr\|(?P<kleidi>(cat|sort)=([^\}]*))\}\}')
    _ppart_tag = re.compile(r'\{\{(?P<formof>.*)past participle of\|(?P<verb>.+)\|lang=fr\}\}')
    # {{plural of|LEMMA|lang=fr[|cat=.../|sort=...]}}
    _plursort_tag = re.compile(r'\{\{plural of\|(?P<lemma>.+)\|lang=fr\|(?P<kleidi>(cat|sort)=([^\}]*))\}\}')
    _plur_tag = re.compile(r'\{\{plural of\|(?P<lemma>.+)\|lang=fr\}\}')
    # [[Category:French ...|key]] and [[Category:French ...]]
    _cat1sort_tag = re.compile(r'\[\[Category:(?P<fr>French|fr:)(?P<subject>[^/|]+)\|(?P<kleidi>[^\}]*)\]\]')
    _cat1_tag = re.compile(r'\[\[Category:(?P<fr>French|fr:)(?P<subject>[^/|]+)\]\]')

    # Comma-separated source characters and, position by position, what each
    # one becomes in the sort key.  The final pair maps "-" to a space.
    _mtg_apo = u'à,À,â,Â,é,É,è,È,ê,Ê,ë,Ë,î,Î,ï,Ï,ô,Ô,ù,Ù,û,Û,ü,Ü,ç,Ç,Æ,æ,Œ,œ,-'
    _mtg_se = u'a,a,a,a,e,e,e,e,e,e,e,e,i,i,i,i,o,o,u,u,u,u,u,u,c,c,ae,ae,oe,oe, '
    # Code point -> replacement string, in the form unicode.translate expects.
    _trtable = dict(zip(map(ord, _mtg_apo.split(u',')), _mtg_se.split(u',')))

    def __init__(self, generator, debug_bul = True):
        self.generator = generator
        self.debug_bul = debug_bul

    def _sort_params(self, t):
        """Return ``(kleida, kleidi, summary)`` for the page title ``t``.

        ``kleida`` is the template parameter (``|sort=key``), ``kleidi`` the
        category-link suffix (``|key``) and ``summary`` the edit summary.
        Both parameters are empty strings when the folded title equals the
        title itself, i.e. when no sort key is needed.
        """
        kleida = t.translate(self._trtable)
        kleida = self._spaces_tag.sub('', kleida)
        kleida = self._apost_tag.sub('', kleida)
        if kleida == t:
            return '', '', u'remove wrong or redundant sort key'
        # The comparison above is deliberately made before lower-casing, so
        # a title that differs from its key only by case gets no sort key.
        kleida = kleida.lower().strip(' ')
        return '|sort=' + kleida, '|' + kleida, 'Adding sort key'

    def _rewrite_line(self, seira, kleida, kleidi):
        """Return the wikitext line ``seira`` with its sort keys updated.

        ``kleida`` is appended to templates and ``kleidi`` to category
        links; an existing sort/cat parameter is dropped first.  Templates
        whose parameters contain another template are left untouched.
        """
        templ_tag = self._templ_tag

        # --- {{infl|fr|...}} -------------------------------------------
        result1 = self._infl_tag.search(seira)
        if result1 and not templ_tag.search(result1.group('param')):
            infl_repl = u'{{infl|fr|\\g<pos>\\g<param>' + kleida + u'}}'
            if self._inflsort_tag.search(seira):
                seira = self._inflsort_tag.sub(infl_repl, seira)
            else:
                seira = self._infl_tag.sub(infl_repl, seira)

        # --- {{fr-noun ...}} and friends -------------------------------
        pos2_tag = self._pos2_tag
        result = pos2_tag.search(seira)
        if result:
            # The template already carries a sort=/cat= parameter.
            parms = result.group('parms')
            selse = result.group('selse')
            if parms:
                if not templ_tag.search(parms) and result.group('kleidi'):
                    if selse:
                        if not templ_tag.search(selse):
                            seira = pos2_tag.sub(u'{{fr-\\g<pos>\\g<parms>|\\g<selse>' + kleida + u'}}', seira)
                    else:
                        seira = pos2_tag.sub(u'{{fr-\\g<pos>\\g<parms>' + kleida + u'}}', seira)
            elif result.group('kleidi'):
                if selse:
                    if not templ_tag.search(selse):
                        seira = pos2_tag.sub(u'{{fr-\\g<pos>|\\g<selse>' + kleida + u'}}', seira)
                else:
                    # No parameters and nothing else: only the sort/cat
                    # parameter was present.
                    seira = pos2_tag.sub(u'{{fr-\\g<pos>' + kleida + u'}}', seira)
        else:
            pos1_tag = self._pos1_tag
            result4 = pos1_tag.search(seira)
            if result4:
                param = result4.group('param')
                if param:
                    if not templ_tag.search(param):
                        seira = pos1_tag.sub(u'{{fr-\\g<pos>\\g<param>' + kleida + u'}}', seira)
                else:
                    seira = pos1_tag.sub(u'{{fr-\\g<pos>' + kleida + u'}}', seira)

        # --- [[Category:French ...]] -----------------------------------
        cat_repl = u'[[Category:\\g<fr>\\g<subject>' + kleidi + u']]'
        if self._cat1sort_tag.search(seira):
            seira = self._cat1sort_tag.sub(cat_repl, seira)
        else:
            seira = self._cat1_tag.sub(cat_repl, seira)

        # --- {{plural of|...|lang=fr}} ---------------------------------
        plur_repl = u'{{plural of|\\g<lemma>|lang=fr' + kleida + u'}}'
        if self._plursort_tag.search(seira):
            seira = self._plursort_tag.sub(plur_repl, seira)
        elif self._plur_tag.search(seira):
            seira = self._plur_tag.sub(plur_repl, seira)

        # --- {{... past participle of|...|lang=fr}} --------------------
        ppart_repl = u'{{\\g<formof>past participle of|\\g<verb>|lang=fr' + kleida + u'}}'
        if self._ppartsort_tag.search(seira):
            seira = self._ppartsort_tag.sub(ppart_repl, seira)
        else:
            seira = self._ppart_tag.sub(ppart_repl, seira)
        return seira

    def run(self):
        """Process every page of the generator; return 0 when done.

        Each page is fetched, rewritten line by line and, if the text
        changed, shown as a diff and saved (after confirmation while
        interactive mode is on).  Redirects, edit conflicts and spam-filter
        hits reported by ``page.put`` are logged and skipped.
        """
        # Honour the constructor argument; answering "all" below turns the
        # prompt off for the rest of this run only.
        debug_bul = self.debug_bul
        for p in self.generator:
            t = p.title()
            print(t)
            kleida, kleidi, summary = self._sort_params(t)
            wikipedia.setAction(summary)
            # NOTE(review): a fresh Page is built from the title instead of
            # reusing ``p``; kept as in the original - confirm it is needed.
            page = wikipedia.Page(wikipedia.getSite(), t)
            arxiko = page.get(get_redirect=True)
            kainoyrio = [self._rewrite_line(seira, kleida, kleidi)
                         for seira in arxiko.split("\n")]
            keimeno = "\n".join(kainoyrio)
            # Compare against the text already fetched rather than calling
            # page.get() again.  NOTE(review): presumably page.get() without
            # get_redirect raises on redirect pages - confirm.
            if keimeno == arxiko:
                continue
            wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
            wikipedia.showDiff(arxiko, keimeno)
            # Default for non-interactive mode; without it ``choice`` would
            # be undefined when no prompt has ever been shown.
            choice = 'y'
            if debug_bul:
                choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['yes', 'No', 'all'], ['y', 'N', 'a'], 'N')
                if choice == 'a':
                    choice = 'y'
                    debug_bul = False
                elif choice != 'y':
                    choice = 'n'
            if choice != 'y':
                continue
            try:
                # Save the page
                page.put(keimeno)
            except wikipedia.IsRedirectPage:
                wikipedia.output(u'Skipping %s because it is a redirect' % (page.title()))
            except wikipedia.EditConflict:
                wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
            except wikipedia.SpamfilterError as error:
                wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url))
        return 0
def main():
    """Build a page generator from the command line and run the bot.

    Recognised options are ``-start:``, ``-ref:``, ``-links:``, ``-file:``
    and ``-cat:``; when several are given the last one wins.  Any other
    non-empty arguments are joined with spaces into a single page title,
    which takes precedence over the options.  Without a page source the
    help text is shown instead.
    """
    def _page(title):
        return wikipedia.Page(wikipedia.getSite(), title)

    # Option prefix -> factory taking the text after the prefix.
    builders = (
        ('-start:', lambda rest: pagegenerators.AllpagesPageGenerator(rest)),
        ('-ref:', lambda rest: pagegenerators.ReferringPageGenerator(_page(rest))),
        ('-links:', lambda rest: pagegenerators.LinkedPageGenerator(_page(rest))),
        ('-file:', lambda rest: pagegenerators.TextfilePageGenerator(rest)),
        ('-cat:', lambda rest: pagegenerators.CategorizedPageGenerator(
            catlib.Category(wikipedia.getSite(), rest))),
    )

    source = None
    title_words = []
    for arg in wikipedia.handleArgs():
        if not arg:
            continue
        for prefix, build in builders:
            if arg.startswith(prefix):
                source = build(arg[len(prefix):])
                break
        else:
            # Not an option: treat it as part of a page title.
            title_words.append(arg)

    if title_words:
        source = iter([_page(' '.join(title_words))])

    if not source:
        # NOTE(review): the help topic is 'touch', presumably left over from
        # the script this one was derived from - confirm.
        wikipedia.showHelp('touch')
        return

    bot = kleidabot(pagegenerators.PreloadingGenerator(source))
    bot.run()
# Script entry point: run the bot only when executed directly.
if __name__ == "__main__":
    try:
        main()
    finally:
        # Runs whether main() returns normally or raises.
        # NOTE(review): presumably releases pywikipedia's throttle - confirm.
        wikipedia.stopme()
# "kainoyrio" means new
# "seira" means line (of text)
# "kleida" means (sort) key
# "arxiko" means initial
# :P