User:Flubot/How to find sorting problems in French categories

Definition from Wiktionary, the free dictionary
Jump to navigation Jump to search

If we have the complete list of French words in a given category, we can check it for sorting problems, e.g. entries with missing sort keys.

The list of words must be saved in a UTF-8 encoded file named unsorted_list, one entry per line, in the directory from which the script is run.

Run with python find_unsorted.py > list_of_problems

Read the file list_of_problems to find possibly problematic entries.

find_unsorted.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys, re, codecs
page_tag = re.compile('(.*)')

reload(sys)
sys.setdefaultencoding('utf-8')

spaces_tag = re.compile('([\*\.,\'\(\)]*)')
apost_tag = re.compile(u'\’')
page_tag = re.compile('(.*)')

mtg_apo = u'à,À,â,Â,é,É,è,È,ê,Ê,ë,Ë,î,Î,ï,Ï,ô,Ô,ù,Ù,û,Û,ü,Ü,ç,Ç,Æ,æ,Œ,œ,-'
mtg_se =  u'a,a,a,a,e,e,e,e,e,e,e,e,i,i,i,i,o,o,u,u,u,u,u,u,c,c,ae,ae,oe,oe, '

trkeys = mtg_apo.split(u',')
for i in range(len(trkeys)):
       	trkeys[i] = ord(trkeys[i]) 
trvals = mtg_se.split(u',')
trtable = dict(zip(trkeys,trvals)) 


fin = codecs.open('unsorted_list', 'r', 'utf-8')
eof=0
line = fin.readline()
t1 = page_tag.search(line)
pr = t1.group(1)

kleida = pr.translate(trtable)
kleida = spaces_tag.sub('',kleida)
kleida = apost_tag.sub('',kleida)
kleida = kleida.lower()
kleida = kleida.strip(' ')

while not eof:
 line = fin.readline()
 #line = unicode(line, 'utf-8')
 if line == "":
	eof = 1
 else:
	t1 = page_tag.search(line)
	next = t1.group(1)
	kleida1 = next.translate(trtable)
	kleida1 = spaces_tag.sub('',kleida1)
	kleida1 = apost_tag.sub('',kleida1)
	kleida1 = kleida1.lower()
	kleida1 = kleida1.strip(' ')

	if not (kleida1 >= kleida):
		print next
	pr = next
	kleida = kleida1
fin.close()