User:Flubot/Adding sort key to French words

Definition from Wiktionary, the free dictionary
Jump to: navigation, search

How to run

  • You need a PC running Linux with Python and pywikipedia installed.
  • Copy the following script into the pywikipedia folder.
  • Make a list of words to modify and save it in the same folder, or else choose a whole category.
  • Run with
    python taxin-fr.py -file:filename
    python taxin-fr.py -cat:CategoryName
    For more options see def main()


Known problems

  • The script cannot handle headword templates including other templates like {{l}}. So, it will not modify any such pages.
  • Other templates outside of the inflexion line are used also for categorizing pages (see for instance Template:past participle of)
  • Some pages do not have any headword template and use [[Category:French xxxxx]] instead.

taxin-fr.py[edit]

#!/usr/bin/python
# -*- coding: utf-8 -*-
 
import wikipedia, pagegenerators, catlib
import sys, re, codecs
 
class kleidabot:
    """Bot that adds, corrects or removes sort keys on French entries.

    For every page yielded by ``generator`` the bot derives a sort key from
    the page title (accented letters mapped to plain ones, hyphens to spaces,
    some punctuation dropped, lower-cased) and rewrites the ``{{infl|fr|...}}``
    and ``{{fr-...}}`` headword templates, the ``{{plural of}}`` and
    ``{{past participle of}}`` templates and ``[[Category:French ...]]`` /
    ``[[Category:fr:...]]`` links so that they carry that key.  When the
    derived key equals the title, existing keys are removed instead.

    Parameters:
        generator: iterable of page objects; only ``title()`` is used.
        debug_bul: when true (the default) every change is shown and must be
            confirmed interactively; answering "all" stops the prompting for
            the rest of the run.  When false, changes are saved unprompted.
    """

    # --- building the sort key -------------------------------------------
    # Punctuation removed from the title: * . , ' ( )
    _SPACES_TAG = re.compile(r"([\*\.,'\(\)]*)")
    # Typographic apostrophe, removed as well.
    _APOST_TAG = re.compile(u'\\’')
    # Characters to translate (keys) and their replacements (values); the two
    # comma-separated lists are positional pairs.
    _MTG_APO = u'à,À,â,Â,é,É,è,È,ê,Ê,ë,Ë,î,Î,ï,Ï,ô,Ô,ù,Ù,û,Û,ü,Ü,ç,Ç,Æ,æ,Œ,œ,-'
    _MTG_SE = u'a,a,a,a,e,e,e,e,e,e,e,e,i,i,i,i,o,o,u,u,u,u,u,u,c,c,ae,ae,oe,oe, '
    _TRTABLE = dict(zip(map(ord, _MTG_APO.split(u',')), _MTG_SE.split(u',')))

    # --- recognising wikitext --------------------------------------------
    # Start of any template; used to detect templates nested in parameters.
    _TEMPL_TAG = re.compile(r'\{\{')
    # {{fr-xxx ...|sort=...|...}} / {{fr-xxx ...|cat=...|...}}
    _POS2_TAG = re.compile(
        r'\{\{fr-(?P<pos>noun|adj|adv|abbr|intj|letter|phrase|prep|pron|verb'
        r'|diacritical|proper|past participle)'
        r'(?P<parms>.*)(?P<kleidi>\|(sort|cat)\=[^\|\}]*)(\|*)(?P<selse>.*)'
        r'\}\}')
    # {{fr-xxx ...}} without a sort/cat parameter
    _POS1_TAG = re.compile(
        r'\{\{fr-(?P<pos>noun|adj|adv|abbr|intj|letter|phrase|prep|pron|verb'
        r'|diacritical|proper|past participle)'
        r'(?P<param>[^\}]*)(\|*)\}\}')
    # {{infl|fr|xxx ...|sort=...}}; ``rest`` keeps whatever parameters follow
    # the old key so that they are not thrown away with it.
    _INFLSORT_TAG = re.compile(
        r'\{\{infl\|fr\|(?P<pos>noun|adj|adv|abbr|intj|plural'
        r'|past participle form|letter|phrase|prep|prefix|pron|verb'
        r'|diacritical mark|proper noun)'
        r'(?P<param>[^\}]*)(?P<kleidi>\|sort\=[^\|\}]*)(?P<rest>[^\}]*)\}\}')
    # {{infl|fr|xxx ...}}
    _INFL_TAG = re.compile(
        r'\{\{infl\|fr\|(?P<pos>noun|adj|adv|abbr|intj|plural'
        r'|past participle form|letter|phrase|prep|prefix|pron|verb'
        r'|diacritical mark|proper noun)'
        r'(?P<param>[^\}]*)(\|*)\}\}')
    _PPARTSORT_TAG = re.compile(
        r'\{\{(?P<formof>.*)past participle of\|(?P<verb>.+)\|lang=fr'
        r'\|(?P<kleidi>(cat|sort)=([^\}]*))\}\}')
    _PPART_TAG = re.compile(
        r'\{\{(?P<formof>.*)past participle of\|(?P<verb>.+)\|lang=fr\}\}')
    _PLURSORT_TAG = re.compile(
        r'\{\{plural of\|(?P<lemma>.+)\|lang=fr'
        r'\|(?P<kleidi>(cat|sort)=([^\}]*))\}\}')
    _PLUR_TAG = re.compile(r'\{\{plural of\|(?P<lemma>.+)\|lang=fr\}\}')
    _CAT1SORT_TAG = re.compile(
        r'\[\[Category:(?P<fr>French|fr:)(?P<subject>[^/|]+)'
        r'\|(?P<kleidi>[^\}]*)\]\]')
    _CAT1_TAG = re.compile(
        r'\[\[Category:(?P<fr>French|fr:)(?P<subject>[^/|]+)\]\]')

    def __init__(self, generator, debug_bul=True):
        self.generator = generator
        self.debug_bul = debug_bul

    def _sort_params(self, title):
        """Return ``(kleida, kleidi)`` for *title*.

        ``kleida`` is the template parameter (``'|sort=<key>'``) and
        ``kleidi`` the category-link suffix (``'|<key>'``).  Both are empty
        strings when translating and stripping leaves the title unchanged,
        i.e. when the page needs no explicit sort key.
        """
        key = title.translate(self._TRTABLE)
        key = self._SPACES_TAG.sub('', key)
        key = self._APOST_TAG.sub('', key)
        if key == title:
            return '', ''
        key = key.lower().strip(' ')
        return '|sort=' + key, '|' + key

    def _resub(self, sorted_tag, plain_tag, repl, seira):
        """Apply *repl* through *sorted_tag* if it matches, else *plain_tag*."""
        tag = sorted_tag if sorted_tag.search(seira) else plain_tag
        return tag.sub(repl, seira)

    def _rewrite_infl(self, seira, kleida):
        """Set the sort key of ``{{infl|fr|...}}`` templates in *seira*."""
        result = self._INFL_TAG.search(seira)
        if result is None or self._TEMPL_TAG.search(result.group('param')):
            # No headword template, or one with a nested template in its
            # parameters (left alone on purpose).
            return seira
        if self._INFLSORT_TAG.search(seira):
            return self._INFLSORT_TAG.sub(
                r'{{infl|fr|\g<pos>\g<param>\g<rest>' + kleida + '}}', seira)
        return self._INFL_TAG.sub(
            r'{{infl|fr|\g<pos>\g<param>' + kleida + '}}', seira)

    def _rewrite_headword(self, seira, kleida):
        """Set the sort key of ``{{fr-...}}`` templates in *seira*."""
        has_templ = self._TEMPL_TAG.search
        result = self._POS2_TAG.search(seira)
        if result is None:
            # No sort=/cat= parameter yet.
            result = self._POS1_TAG.search(seira)
            if result is None:
                return seira
            param = result.group('param')
            if not param:
                return self._POS1_TAG.sub(
                    r'{{fr-\g<pos>' + kleida + '}}', seira)
            if has_templ(param):
                return seira
            return self._POS1_TAG.sub(
                r'{{fr-\g<pos>\g<param>' + kleida + '}}', seira)

        # An old key is present (the ``kleidi`` group is mandatory in the
        # pattern, so it is never empty here).  Keep the parameters before
        # (``parms``) and after (``selse``) it and append the new key.
        parms = result.group('parms')
        selse = result.group('selse')
        if (parms and has_templ(parms)) or (selse and has_templ(selse)):
            return seira
        repl = r'{{fr-\g<pos>'
        if parms:
            repl += r'\g<parms>'
        if selse:
            repl += r'|\g<selse>'
        return self._POS2_TAG.sub(repl + kleida + '}}', seira)

    def _rewrite_line(self, seira, kleida, kleidi):
        """Return wikitext line *seira* with every sort key brought up to date.

        *kleida* is appended to templates and *kleidi* to category links;
        passing empty strings removes existing keys.
        """
        seira = self._rewrite_infl(seira, kleida)
        seira = self._rewrite_headword(seira, kleida)
        seira = self._resub(
            self._CAT1SORT_TAG, self._CAT1_TAG,
            r'[[Category:\g<fr>\g<subject>' + kleidi + ']]', seira)
        seira = self._resub(
            self._PLURSORT_TAG, self._PLUR_TAG,
            r'{{plural of|\g<lemma>|lang=fr' + kleida + '}}', seira)
        seira = self._resub(
            self._PPARTSORT_TAG, self._PPART_TAG,
            r'{{\g<formof>past participle of|\g<verb>|lang=fr' + kleida + '}}',
            seira)
        return seira

    def run(self):
        """Process every page of the generator; always returns 0.

        Pages whose text is not changed by the rewrite are skipped silently.
        Redirects, edit conflicts and spam-filter rejections reported by
        ``page.put`` are logged and skipped.
        """
        # Honour the constructor argument; "all" switches prompting off for
        # the remainder of this run only.
        ask = self.debug_bul

        for p in self.generator:
            t = p.title()
            print(t)
            kleida, kleidi = self._sort_params(t)
            if kleida:
                wikipedia.setAction('Adding sort key')
            else:
                wikipedia.setAction(u'remove wrong or reduntant sort key')
            page = wikipedia.Page(wikipedia.getSite(), t)
            arxiko = page.get(get_redirect=True)

            # Adding sort parameter, if needed
            seires = arxiko.split("\n")
            kainoyrio = [self._rewrite_line(seira, kleida, kleidi)
                         for seira in seires]
            keimeno = "\n".join(kainoyrio)

            # Compare with the text fetched above rather than calling
            # page.get() again without get_redirect.
            if keimeno == arxiko:
                continue

            # Saving changes
            wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
            wikipedia.showDiff(arxiko, keimeno)
            if ask:
                choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['yes', 'No', 'all'], ['y', 'N', 'a'], 'N')
                if choice == 'a':
                    choice = 'y'
                    ask = False
            else:
                choice = 'y'
            if choice != 'y':
                continue

            try:
                # Save the page
                page.put(keimeno)
            except wikipedia.IsRedirectPage:
                wikipedia.output(u'Skipping %s because it is a redirect' % (page.title()))
            except wikipedia.EditConflict:
                wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
            except wikipedia.SpamfilterError as error:
                wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url))

        return 0
 
def main():
    """Build a page generator from the command line and run the bot on it.

    Recognised arguments (as returned by ``wikipedia.handleArgs()``):
    ``-start:``, ``-ref:``, ``-links:``, ``-file:`` and ``-cat:``, each
    followed by its value.  Any other argument is taken as a word of a single
    page title; if such words are given, that one page replaces any generator
    selected by the options.  Without any page source the help is shown.
    """
    gen = None
    title_words = []

    for arg in wikipedia.handleArgs():
        if not arg:
            continue
        # Split "-option:value" once; ``option`` keeps its trailing colon so
        # that an argument without a colon never matches an option name.
        option, colon, value = arg.partition(':')
        option += colon
        if option == '-start:':
            gen = pagegenerators.AllpagesPageGenerator(value)
        elif option == '-ref:':
            target = wikipedia.Page(wikipedia.getSite(), value)
            gen = pagegenerators.ReferringPageGenerator(target)
        elif option == '-links:':
            source = wikipedia.Page(wikipedia.getSite(), value)
            gen = pagegenerators.LinkedPageGenerator(source)
        elif option == '-file:':
            gen = pagegenerators.TextfilePageGenerator(value)
        elif option == '-cat:':
            category = catlib.Category(wikipedia.getSite(), value)
            gen = pagegenerators.CategorizedPageGenerator(category)
        else:
            title_words.append(arg)

    if title_words:
        single = wikipedia.Page(wikipedia.getSite(), ' '.join(title_words))
        gen = iter([single])

    if gen:
        bot = kleidabot(pagegenerators.PreloadingGenerator(gen))
        bot.run()
    else:
        wikipedia.showHelp('touch')
 
 
# Script entry point: run main() only when the file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    try:
        main()
    finally:
        # Call wikipedia.stopme() whether main() returned or raised.
        wikipedia.stopme()
 
# "kainoyrio" means new
# "seira" means line (of text)
# "kleida" means (sort) key
# "arxiko" means initial
# :P