User:Matthias Buchmeier/trans-en-es.awk

Definition from Wiktionary, the free dictionary
Jump to: navigation, search

Dictionaries from translations sections[edit]

This is a gawk script to create wikified bilingual dictionaries from the translation sections using the database dump.

Usage:[edit]

bzcat enwiktionary-DATE-pages-articles.xml.bz2|gawk -v LANG=language -v ISO=iso-code -f trans-en-es.awk|sort -d -k 1,1 -t"{">en-xx.wiki

Code:[edit]

#  gawk script to extract translations from the database dump of en.wiktionary.org
#
# (c) 2011-2014 by Matthias Buchmeier
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
# TODO Template:{{lang|iso
# TODO: resolve {{unsupported|...}} esp. from glosses
# TODO: remove trans-see links without target
# TODO: inclusion of English "alternative forms/spellings" as trans-see links
# TODO: include blacklist of pages to be excluded, e.g Taumatawhakatangihangakoauauotamateaturipukakapikimaungahoronukupokaiwhenuakitanatahu
# TODO: proper and complete treatment of Module:gender DONE (generalized cxx nounclasses still missing)
# TODO: proper treatment of diacritics following Module:languages.
#       Currently only diacritics on Russian Cyrillic are removed. An option to keep diacritics should be implemented.
#
# BEGIN block: configuration and parser state for the English -> target
# language extraction.
#
# Order of precedence:
#   1. built-in defaults set below,
#   2. the predefined per-language presets (selected by the value of lang),
#   3. command-line switches (-v NAME=value), applied after the presets.
# The block ends by building the qualifier[] lookup, the neststart/exclude
# regexps and the POS header table PHR[]/POSL[] used by the main rules.
BEGIN {
# target language configuration
# to configure the target language edit the following lines
# or configure the target language on the command-line with the following options:
#
# Command-line options:
#######################
#  required gawk command-line switches:
#
#    name of the language to be extracted, or bar-separated list of languages:
#    -v LANG=language or  -v LANG="language1|language2|language3"
#
#    iso-code of the language to be extracted, or bar-separated list of iso-codes:
#    -v ISO=iso-code   or   -v ISO="iso1|iso2|iso3"
#
#    name of the language family as specified on the headline of a nested section:
#    (required only if LANG contains multiple languages)
#    -v GENERIC_LANG=language_family
#
#  optional gawk command-line switches:
#
#    this option has to be used for languages written in non-latin script, e.g. Cyrillic, Greek, etc.:
#    -v LATIN=n
#
#    remove wiki-links and wiki-style bolding, italicizing:
#    -v REMOVE_WIKILINKS=y --re-interval
#
#    don't include trans-see links:
#    -v TRANS_SEE=n
#
#    include English pronunciation (IPA):
#    -v ENABLE_IPA=y
#
#    bar-separated list of languages to be excluded (the default is to include all nested lines):
#    -v EXCLUDE_LANG="language1|language2|language3"
#
#    bar-separated list of qualifiers to be added (specified in the same order as the ISO-list):
#    -v ISO_QUALIFIER="qualifier1|qualifier2|qualifier3"
#
#    bar-separated list of qualifiers to be added (specified in the same order as the LANG-list):
#    -v LANG_QUALIFIER="qualifier1|qualifier2|qualifier3"
#
#    don't include transliterations
#    -v REMOVE_TRANSLIT="y"
#
#########################
# User defined variables:
#########################
# English names of the target language as specified on the beginning of translation lines,
# multiple names have to be separated by "|":
# this list should include both the language family name and the nested section language names
# lang="Spanish";
#
# iso codes of the target language as used in t-template,
# multiple codes have to be separated by "|":
# iso = "es";
#
# unique language family name, as used on the nesting headline
#generic_lang = "Spanish";
generic_lang = "";
#
# language headwords of nested sections to be excluded from the dictionary
# multiple languages have to be separated by "|":
exclude_lang = "";
#
# set to 1/0  for latin/non-latin script
latin = 1;
#
# set to 1 if you want to remove [[]]-wikilinks and wiki-syntax bolding and italicizing
remove_wikilinks=0;
#
# set to 1 if transliterations might contain wikilinks
links_inside_tr=0;
#
# show trans-see links
enable_trans_see=1;
#
# show ttbc sections
enable_ttbc=1;
#
# enable English IPA
enable_ipa=0;
#
# remove transliterations
rmtr=0;
#
# parsing of commandline switches
#
if(LANG!="") lang = LANG;
#
# default excluded languages (used only when EXCLUDE_LANG is not given on the command line)
if(EXCLUDE_LANG=="")
{
if(lang=="French") exclude_lang = "French Creole|Old French|Middle French|Gallo|Norman";
if(lang=="Spanish") exclude_lang = "Old Spanish|Aragonese";
if(lang=="German") exclude_lang = "German Low German|Middle Low German|Low German|Middle High German|Old High German|Alemannic|Alemannic German|Kölsch|Bavarian|Alsatian|Badisch|Berliner|Bernese|Camelottisch|Frankonian|Lichtensteinisch|Luxembourgeois|Moselfraenkisch|Plattdeutsch|Rhoihessisch|Ruhrisch|Saarlaendisch|Saxon|Swabian|Viennese|Alsace|Palatinate German|Swiss German|Kölsch|Silesian German|Saterland";
if(lang=="Italian") exclude_lang = "Sicilian|Old Italian";
if(lang=="Korean") exclude_lang = "Old Korean";
if(lang=="Portuguese") exclude_lang = "Old Portuguese";
}
#
# predefined language options
# Each preset is selected by the single-name value of lang and may then
# replace lang by a bar-separated list, so the order of the presets matters
# (e.g. "Mandarin_nonested" sets lang="Mandarin" only after the "Mandarin"
# preset has been tested).
if(lang == "Norwegian")
{
iso="no|nn|nb";
lang="Norwegian|Nynorsk|Norwegian Nynorsk|Bokmål|Norwegian Bokmål|Norwegian Høgnorsk";
iso_qualifier="|Nynorsk|Bokmål";
lang_qualifier="|Nynorsk|Nynorsk|Bokmål|Bokmål|Høgnorsk";
exclude_lang = "Old Norse|Old Norwegian"
}
#
if(lang == "Dutch")
{
iso="nl|vls";
lang="Dutch|Flemish|West Flemish|Brabantish";
iso_qualifier="|Flemish";
lang_qualifier="|Flemish|Flemish|Brabantish";
exclude_lang = "Dutch Low Saxon|Dutch Low German|Old Dutch|Middle Dutch|Drents|Gronings|Twents|Low German";
}
#
if(lang == "Japanese")
{
iso="ja";
links_inside_tr=1;
latin=0;
}
#
if(lang == "Standard_Arabic")
{
generic_lang="Arabic";
lang="Arabic|MSA|Standard Arabic";
iso="ar|arb";
latin=0;
enable_trans_see=0;
exclude_lang = "Algerian|Andalusian|Bahrani|Chadian|Egyptian|Egyptian Arabic|Gulf|Gulf Arabic|Hassānīya|Iraqi|Iraqi Arabic|Lebanese|Lebanese/Syrian|Levantine|Levantine Arabic|Libyan|Moroccan|Moroccan Arabic|Morocco|North Levantine Arabic|Palestinian|Palestinian Arabic|South Levantine Arabic|Syrian|Sudanese|Tunisian Arabic|UAE|Hadrami Arabic|Hijazi Arabic|Juba Arabic|Egyptian Arabic|North Levantine|Hassaniya"
}
#
#
if(lang == "Mandarin")
{
generic_lang="Chinese";
lang="Mandarin|Central Mandarin|Jianghuai Mandarin|Northern Mandarin|West Mandarin|Wuhan|Xi[']an|Liuzhou|Chengdu|Xuzhou|Yangzhou|Ürümqi|Harbin|Simplified|Traditional|Chinese [(]Mandarin[)]|Chinese traditional[/]simplified|Chinese|Pinyin|Chinese [(]Traditional[)]|Chinese [(]Simplified[)]";
lang_qualifier="|Central China|Jianghuai|Northern China|West China|Wuhan|Xi[']an|Liuzhou|Chengdu|Xuzhou|Yangzhou|Ürümqi|Harbin|"
iso="zh|lzh|zho|chi|cmn|zh-tw|zh-cn|zhx-zho";
iso_qualifier="|Literary Chinese|";
latin=0;
enable_trans_see=0;
exclude_lang = "Amoy|Bai|Cantonese|Changsha|Chaozhou|Dungan|Eastern Hokkien|Eastern Min|Fuzhou|Gan|Guangzhou|Haikou|Hainanese|Hakka|Hangzhou|Hokkien|Hui|Jian[']ou|Jin|Jixi|Meixian|Min Bei|Min Dong|Min-nan|Min nan|Min Nan|Min-Nan|Nanchang|Nanning|Northern Hokkien|Northern Min|Northern Wu|Old Chinese|Pinghua|Shanghai|Shanghainese|Sichuanese|Southern Min|Southern Wu|Suzhou|Taiyuan|Taiwan|Taiwanese|Teochew|Tuhua Dong[']an|Wenzhou|Wu|Xiang|Xiamen|Yangzhou|Yue|Middle Chinese"
}
#
if(lang == "Mandarin_nonested")
{
lang="Mandarin";
generic_lang="Mandarin";
enable_trans_see=0;
enable_ttbc=0;
latin=0;
iso="zh|cmn"
}
#
#
if(lang=="Persian")
{
iso="fa";
exclude_lang="Old Persian|Middle Persian|Eastern Persian";
latin=0;
enable_trans_see=0;
}
#
if(lang=="Kurdish")
{
iso="ku|kmr|kur";
exclude_lang="Sorani|Soranî|Central Kurdish|Southern Kurdish";
lang="Kurmanji|Kurmancî|Kurdish";
generic_lang="Kurdish";
enable_trans_see=0;
latin=0;
}
#
# Modern Greek
if(lang=="Greek")
{
iso="el";
exclude_lang="Ancient Greek|Ancient|Hebrew|Modern Romanization|Ancient Romanization|Mycenaean|Classical|Katharevousa|Katharevoussa|Pontic Greek|Koine|Pontic Greek|Roman|Cappadocian";
lang="Modern Greek|Modern|Greek";
generic_lang="Greek";
latin=0;
# remove transliterations (upon user request)
rmtr=1;
}
#
if(lang=="Indonesian")
{
iso="id";
generic_lang="Indonesian";
# NOTE(review): "Stabdard" looks like a typo for "Standard"; left unchanged
# because it may deliberately mirror a misspelled headword in the dump - confirm.
lang="Indonesian|Standard Indonesian|Stabdard";
exclude_lang="Acehnese|Balinese|Banjar|Banjarese|Buginese|Javanese|Kaili|Madurese|Makasar|Mandar|Minangkabau|Nias|Sasak|Sunda|Sundanese|Indonesian Bajau";
enable_trans_see=0;
}
#
if(lang=="Malay")
{
iso="ms";
generic_lang="Malay";
lang="Rumi|Malay|Latin";
exclude_lang="Malayalam|Malaysian Sign Language|Jawi|Arabic|Malayo-Polynesian";
enable_trans_see=0;
}
#
if(lang == "Catalan")
{
iso="ca";
lang="Catalan|Valencian|Alguerese|Balearic";
iso_qualifier="";
lang_qualifier="|Valencian|Alguerese|Balearic";
exclude_lang = "";
}


#
if(lang=="Serbo-Croatian")
{
iso="sh|bs|hr|sr";
generic_lang="Serbo-Croatian";
lang="Serbo-Croatian|Serbian|Bosnian|Croatian|Roman|Latin|Cyrillic";
#exclude_lang="Cyrillic";
lang_qualifier="|Serbian|Bosnian|Croatian|||";
# has to be configured as non-latin for now because many Cyrillic terms are not tagged
latin=0;
}

#
# END of user defined section
#############################
#
# command-line switches override the defaults and presets above
# (ISO is only used when no preset has already chosen iso)
if(LATIN == "n") latin = 0;
if(REMOVE_WIKILINKS == "y") remove_wikilinks = 1;
if((ISO!="")&&(iso=="")) iso = ISO;
if(GENERIC_LANG!="") generic_lang = GENERIC_LANG;
if((LANG!="")&&(GENERIC_LANG=="")&&(generic_lang=="")) generic_lang = LANG;
if(ISO_QUALIFIER!="")  iso_qualifier = ISO_QUALIFIER;
if(LANG_QUALIFIER!="")  lang_qualifier = LANG_QUALIFIER;
if(TRANS_SEE=="n") enable_trans_see = 0;
if(EXCLUDE_LANG!="") exclude_lang = EXCLUDE_LANG;
if(ENABLE_IPA=="y") enable_ipa = 1;
if(REMOVE_TRANSLIT=="y") rmtr = 1;

#
#print "lang="lang";iso="iso";generic_lang="generic_lang";exclude_lang="exclude_lang;
#
# write iso- and lang-qualifiers into array:
# qualifier[code or name] is "" or " [label] ", paired by list position
n_iso=split(iso,iso_array,"|");
split(iso_qualifier,iso_qualifier_array,"|");
for(i=1;i<=n_iso;i++) {
if(iso_qualifier_array[i] == "") qualifier[iso_array[i]] = "";
else   qualifier[iso_array[i]] = " ["iso_qualifier_array[i]"] ";
#print iso_array[i]" "qualifier[iso_array[i]];
}

n_lang=split(lang,lang_array,"|");
split(lang_qualifier,lang_qualifier_array,"|");
for(i=1;i<=n_lang;i++) {
if(lang_qualifier_array[i] == "") qualifier[lang_array[i]] = "";
else   qualifier[lang_array[i]] = " ["lang_qualifier_array[i]"] ";
#print lang_array[i]" "qualifier[lang_array[i]];
}

#
# initialization of variables used for parsing
#
# english = 0/1 outside/inside English section
english = 0;
# trans = 0/1 outside/inside Translations section
trans = 0;
# gloss = gloss-string or empty
gloss = "";
# pos = part of speech
pos = "";
# title = pagetitle
title = "";
# inside nested section? 0/1
nestsect = 0;
# inside Pronunciation section? 0/1
pron = 0;
# default IPA pronunciation
ipa1 = "";
# default IPA regexp
defipa="\\{\\{a\\|(US|GenAm).*\\{\\{IPA\\|";
# alternative IPA pronunciation
ipa2 = "";
# alternative IPA regexp
altipa = "\\{\\{IPA\\|";
#
# previously seen left-hand side and its accumulated translations
# (flushed by the translation rule when the LHS changes, and in END)
oldLHS = ""; oldRHS = "";
# regexp matching translation line
#transline = "^[*:]*[\\x20]*[[]*("lang")[]]*[\\x20]*[:]|^[*:]*[\\x20]*\\{\\{qualifier\\||\\{\\{ttbc\\|("lang")\\}\\}|\\{\\{ttbc\\|("iso")\\}\\}";
# regexp matching start of nested section
if(enable_ttbc==1)
neststart = "^\\*[ ]*([[]*("generic_lang")|\\{\\{ttbc\\|("generic_lang")\\}\\}|\\{\\{ttbc\\|("iso")\\}\\}|\\{\\{trreq\\|("iso")\\}\\})";
if(enable_ttbc==0)
neststart = "^\\*[ ]*[[]*("generic_lang")";
# regexp matching translation lines to be excluded
# ("^$" can never match here because empty lines are discarded earlier)
exclude = "^$";
if(exclude_lang != "")
#exclude = "^[*:]*[\\x20]*[[]*("exclude_lang")[]]*[\\x20]*[:]";
exclude = "^[*:]*[ ]*[[]*("exclude_lang")";

# array containing POS header regexps and POS label
# indexed by frequency ('for var in array' gives arbitrary array sorting in awk)
PHR[1] = "[=][=][=][ ]*Noun"; POSL[1] = "n";
PHR[2] = "[=][=][=][ ]*Verb"; POSL[2] = "v";
# "Posesive" is kept for backward compatibility; the correctly spelled
# "Possessive adjective" header was previously not recognised at all.
PHR[3] = "[=][=][=][ ]*(Adjective|Possessive[ ]adjective|Posesive[ ]adjective)"; POSL[3] = "adj";
PHR[4] = "[=][=][=][ ]*(Adverb|Adverbial)"; POSL[4] = "adv";
PHR[5] = "[=][=][=][ ]*Interjection"; POSL[5] = "interj";
PHR[6] = "[=][=][=][ ]*Proper[ ]noun"; POSL[6] = "prop";
PHR[7] = "[=][=][=][ ]*Phrase[ ]*[=]"; POSL[7] = "phrase";
PHR[8] = "[=][=][=][ ]*Article"; POSL[8] = "article";
PHR[9] = "[=][=][=][ ]*Preposition[ ]*"; POSL[9] = "prep";
PHR[10] = "[=][=][=][ ]*(Initialism|\\{\\{initialism)"; POSL[10] = "initialism";
PHR[11] = "[=][=][=][ ]*(Number|Numeral)[ ]*"; POSL[11] = "num";
PHR[12] = "[=][=][=][ ]*Cardinal num(ber|eral)[ ]*[=]"; POSL[12] = "num";
PHR[13] = "[=][=][=][ ]*Ordinal number"; POSL[13] = "num";
PHR[14] = "[=][=][=][ ]*(Acronym|\\{\\{acronym)"; POSL[14] = "acronym";
PHR[15] = "[=][=][=][ ]*(Abbreviation[ ]*[=]|\\{\\{abbreviation)"; POSL[15] = "abbr";
PHR[16] = "[=][=][=][ ]*Determiner"; POSL[16] = "determiner";
PHR[17] = "[=][=][=][ ]*Suffix"; POSL[17] = "suffix";
PHR[18] = "[=][=][=][ ]*Pronoun"; POSL[18] = "pron";
PHR[19] = "[=][=][=][ ]*Conjunction"; POSL[19] = "conj";
PHR[20] = "[=][=][=][ ]*Proverb"; POSL[20] = "proverb";
PHR[21] = "[=][=][=][ ]*Contraction"; POSL[21] = "contraction";
PHR[22] = "[=][=][=][ ]*Particle"; POSL[22] = "particle";
PHR[23] = "[=][=][=][ ]*Symbol"; POSL[23] = "symbol";
PHR[24] = "[=][=][=][ ]*Prefix"; POSL[24] = "prefix";
PHR[25] = "[=][=][=][ ]*Prepositional[ ]phrase"; POSL[25] = "prep";
PHR[26] = "[=][=][=][ ]*Interfix"; POSL[26] = "interfix";
NPHR = 26;
# for(i=1;i<= NPHR;i=i+1)  { print i" "POSL[i];} exit(0);

}
# End BEGIN block
##############################################################
 
# printout(outp): clean up one finished dictionary line and print it to stdout.
#
#   outp  the complete output line ("LHS :: RHS" or a trans-see line)
#
# Steps: decode the XML entities used in the dump, drop <ref> footnotes and
# one-line HTML comments, squeeze runs of spaces, then apply the optional
# transformations controlled by the globals remove_LHS_term,
# remove_wikilinks and generic_lang.
#
# head, tail and close_at are local scratch variables (awk declares locals as
# extra parameters); callers pass only outp.
function printout(outp,    head, tail, close_at) {

# convert special xml formatting like &lt; to html
	gsub(/&lt;/,"<",outp);
	gsub(/&gt;/,">",outp);
# "\\&" is a literal ampersand (a bare & would re-insert the matched text)
	gsub(/&amp;/,"\\&",outp);
	gsub(/&quot;/,"\"",outp);
	gsub(/&nbsp;/, " ", outp);
	gsub(/&hellip;/, "...", outp);
	gsub(/&[mn]dash;/, "-", outp);
	gsub(/&thinsp;/, "", outp);

# NOTE: these must be done after converting '&lt;' -> '<'  and '&gt;' -> '>'
# remove self-closing <ref ... />
	gsub(/<ref[^>]*\/>/,"",outp);

# remove each <ref [name=....]> ... </ref> pair separately.
# A single greedy /<ref[^>]*>.*<\/ref>/ would also delete the text that lies
# between two different footnotes, so pair every opener with the nearest
# closing tag instead.
	while (match(outp, /<ref[^>]*>/)) {
		head = substr(outp, 1, RSTART - 1);
		tail = substr(outp, RSTART + RLENGTH);
		close_at = index(tail, "</ref>");
# opener without a closing tag on this line: leave the rest untouched
		if (close_at == 0) break;
		outp = head substr(tail, close_at + length("</ref>"));
	}

# remove one-line <!-- commented text -->
	gsub(/<!--[^>]*-->/,"",outp);

# remove extra spaces
	gsub(/[ ]+/, " ", outp);

# remove remaining "<!--" (will prevent display of wikifile)
	gsub(/<!--/,"", outp);

# optional: remove LHS {{term|...}} wrappers.
# The flag was formerly written remove_LHS-term, which awk parses as the
# subtraction (remove_LHS - term); it is now a single variable that can be
# set with -v remove_LHS_term=1. It is off by default, as before.
	if (remove_LHS_term == 1) {
		gsub(/\|sc=[^\|\}]*/, "", outp);
		gsub(/\|lang=[^\|\}]*/, "", outp);
		gsub(/\{\{term\|/, "", outp);
		gsub(/\}\}/, "", outp);
	}

# optional: [[target|label]] -> label, [[target]] -> target, drop ''/''' markup
	if (remove_wikilinks == 1) {
		outp = gensub(/([[][[])([^]|]*\|)([^]]*)([]][]])/ , "\\3", "g", outp);
		outp = gensub(/([[][[])([^]]*)([]][]])/ , "\\2", "g", outp);
		gsub(/[']{2,}/, "", outp);
	}

# force LR-switch for some characters
	if ((generic_lang == "Arabic") && (remove_wikilinks == 0)) {
		gsub(/[]][ ]*[/]3/, "] {{LR}}/3", outp);
	}

# remove diacritics for some languages:
# the byte pairs CC 81 and CC 80 are the UTF-8 encodings of the combining
# acute and grave accents
	if (generic_lang == "Russian") {
		gsub(/\xCC\x81/, "", outp);
		gsub(/\xCC\x80/, "", outp);
	}

	print outp;
}
# End printout function
 
# template_g(input): expand gender/number templates.
#
# Rewrites every {{g|X|Y|...}} in input to "{X} {Y} ...", where each
# argument is one of the known gender/number codes (optionally joined by
# "-", e.g. m-p). One leading argument is peeled off per pass; the emptied
# {{g}} shell is then deleted together with the spaces in front of it.
#
#   input    a wikitext line
#   returns  the line with the recognised {{g|...}} calls expanded
#
# If a {{g|...}} call is still present after max_genders passes (for example
# because it carries an unknown code) a "# Warning too many genders" line is
# printed and the remaining template is returned unchanged.
#
# result, pattern, pass and max_genders are locals (declared as extra
# parameters) so that the function no longer overwrites the global loop
# index i or other globals of the calling rules.
# TODO: yet no support for generic cXX noun types
function template_g(input,    result, pattern, pass, max_genders) {

	max_genders = 5;
	result = input;
# \3 = first gender argument, \1 = "{{g", \7 = remaining arguments plus "}}"
	pattern = "(\\{\\{g)(\\|)((an|in|pr|impf|pf|m|f|n|c|s|d|p)([-](an|in|pr|impf|pf|m|f|n|c|s|d|p))*)((\\|[^\\}]*)*(\\}\\}))";

	for (pass = 1; ; pass++) {
		result = gensub(pattern, "{\\3} \\1\\7", "g", result);
		gsub("[ ]*\\{\\{g\\}\\}", "", result);

		if (result !~ "\\{\\{g\\|") break;
		if (pass > max_genders - 1) { print "# Warning too many genders"; break; }
	}

	return result;
}
# End template_g function
 
# nested_templates(input): delete helper templates that may occur inside
# other templates: {{NNBS}}, {{,}}, {{LR}} and {{RL}}.
# Should be called before processing any other templates.
#
#   input    a wikitext line
#   returns  the line with all four templates removed
#
# Works directly on the by-value parameter, so no global variable is touched.
function nested_templates(input) {
# Narrow No-Break Space
	gsub(/\{\{NNBS\}\}/, "", input);
# serial comma
	gsub(/\{\{,\}\}/, "", input);
# left-to-right mark, U+200E
	gsub(/\{\{LR\}\}/, "", input);
# right-to-left mark (presumably U+200F; the original note was truncated to "U+200")
	gsub(/\{\{RL\}\}/, "", input);
	return input;
}
 
############################################################
#
# Main Program
#
############################################################
 
# determine page title
# A <title> line starts a new page: extract the text between <title> and
# </title>, reset all per-page parser state, and blank the title for pages in
# the Wiktionary:, Template:, Appendix:, User: and Help: namespaces so that a
# later rule skips the rest of such pages.
/\x3Ctitle/ {
# strip everything before the first "<" and after the last ">", then the tags
	gsub(/^[^\x3C]*/, ""); gsub(/[^\x3E]*$/, ""); gsub(/\x3Ctitle\x3E/, ""); gsub(/\x3C\/title\x3E/, "");
	title = $0;
	english = 0;
	trans = 0; gloss = ""; pos = ""; nestsect = 0;
# A namespace is a prefix of the title; the test is anchored so that an
# ordinary entry whose title merely contains e.g. "User:" is not discarded.
	if (title ~ /^(Wiktionary|Template|Appendix|User|Help):/) title = "";
}
 
# Drop lines that can never contribute to the dictionary. This speeds up the
# run and avoids false "trans-see" hits inside edit comments.

# XML bookkeeping elements of the dump
$0 ~ /<comment>|<\/?page>|<timestamp>|<id>|<\/?contributor>|<\/?revision>|<username>|<minor \/>/ {
	next;
}

# empty lines, and lines starting with "#" or "["
# NOTE(review): because "["-initial lines are dropped here, the "^\[\[" and
# "^[[]" alternatives of later rules appear to be unreachable - confirm intent.
($0 ~ /^$/) || ($0 ~ /^[#\[]/) {
	next;
}

# pages whose title was blanked by the title rule (excluded namespaces)
title == "" {
	next;
}
 
# An "==English==" header opens the English section and resets the
# per-section state, including the pronunciation data.
$0 ~ /[=][=][ ]*English[ ]*[=][=]/ {
	english = 1;
	trans = 0;
	gloss = "";
	pos = "";
	nestsect = 0;
	pron = 0;
	ipa1 = "";
	ipa2 = "";
	next;
}

# Any other level-2 header ("==" followed by a non-"=" character) opens a
# non-English language section.
$0 ~ /^[=][=][^=]+/ {
	english = 0;
	trans = 0;
	gloss = "";
	pos = "";
	nestsect = 0;
	next;
}

# Language and page title detection done; nothing below this point runs for
# lines outside the English section.
english == 0 {
	next;
}
 
#################################################
# Now inside English section
#################################################
 
# A Pronunciation header starts collecting IPA strings afresh.
$0 ~ /[=][=][=][ ]*Pronunciation/ {
	pron = 1;
	ipa1 = "";
	ipa2 = "";
}

# ipa1: first IPA string found on a line matching defipa while inside the
# Pronunciation section. The lang=en argument is stripped from the line first.
($0 ~ defipa) && (pron == 1) && (ipa1 == "") {
	gsub(/\|lang\=en/, "", $0);
	ipa1 = gensub(/(.*\{\{IPA\|[\/\[]*)([^}\|\/]*)([\/\]]*.*)/, "\\2", "g", $0);
# print "def "title" "ipa1 >>"IPA.txt";
	next;
}

# ipa2: first IPA string found on any other line matching altipa
($0 ~ altipa) && (pron == 1) && (ipa2 == "") {
	gsub(/\|lang\=en/, "", $0);
	ipa2 = gensub(/(.*\{\{IPA\|[\/\[]*)([^}\|\/]*)([\/\]]*.*)/, "\\2", "g", $0);
# print "alt "title" "ipa2 >>"IPA.txt";
	next;
}
 
# Part of speech: on a "===" header line, look the header up in the PHR table
# (in index order). The first matching entry sets pos, closes any open
# translation section and ends processing of the line; headers that match
# no entry fall through to the following rules.
$0 ~ /^[ ]*[=][=][=]/ {
	i = 1;
	while (i <= NPHR) {
		if ($0 ~ PHR[i]) {
			pos = POSL[i];
			trans = 0;
			gloss = "";
			next;
		}
		i = i + 1;
	}
}
 
# Any header line (or a line starting with "[[") ends the Translations section.
$0 ~ /^[=][=]|^\[\[/ {
	trans = 0;
	nestsect = 0;
}

# "====Translations====" header: open the Translations section
$0 ~ /[=][=][=][=][ ]*Translations[ ]*[=][=][=][=]/ {
	if (english == 1) {
		trans = 1;
		gloss = "";
		nestsect = 0;
	}
	next;
}

# {{checktrans...}} section: translations to be checked carry no gloss
$0 ~ /\{\{checktrans/ {
	gloss = "";
	nestsect = 0;
}

# {{trans-top|gloss}} / {{trans-top-also|gloss|...}}: remember the gloss.
# {{jump...}} templates and "(1)"-style sense numbers are dropped, and the
# remaining "|" separators become ", see also: ".
$0 ~ /\{\{trans\-top\||\{\{trans\-top\-also\|/ {
	$0 = nested_templates($0)
	gloss = gensub(/(\{\{trans\-top\||\{\{trans\-top\-also\|)(.*)(\}\})/, "\\2", "g", $0);
	gsub(/\{\{jump[^\}]*\}\}/, "", gloss);
	gsub(/\([1-9]\)/, "", gloss);
	gsub(/\|/, ", see also: ", gloss);
	nestsect = 0;
}
 
# {{trans-see|gloss}} and {{trans-see|gloss|target}} links.
# Builds "[[title]] {pos} /ipa/ (gloss)  SEE: [[target]] ::" and hands it to
# printout() unless trans-see output is disabled. With the one-argument form
# the gloss doubles as the link target.
$0 ~ /\{\{trans\-see\|/ {

# a closing </text> tag may trail the last line of a page
	gsub(/<\/text>/, "", $0);
	$0 = nested_templates($0);

# one-argument form first; if the line is unchanged it did not match, so try
# the two-argument form
	gloss = gensub(/(\{\{trans-see\|)([^\}\|]*)(\}\}.*)/, "\\2", "g", $0);
	link = gloss;
	if (gloss == $0) {
		gloss = gensub(/(\{\{trans-see\|)([^\|]+)(\|)([^\}]+)(\}\}.*)/, "\\2", "g", $0);
		link = gensub(/(\{\{trans-see\|)([^\|]+)(\|)([^\}]+)(\}\}.*)/, "\\4", "g", $0);
	}
#	print "TRANS-SEE: "$0" :: "gloss" :: "link;

# the gloss is shown as plain text
	gsub(/\[\[/, "", gloss);
	gsub(/\]\]/, "", gloss);

# left-hand side: headword, part of speech, pronunciation (used only once
# per Pronunciation section), gloss
	LHS = "[[" title "]] ";
	if (pos != "")
		LHS = LHS "{" pos "} ";
	if (enable_ipa == 1) {
		if (ipa1 != "") {
			LHS = LHS "/" ipa1 "/ ";
			ipa1 = ""; ipa2 = "";
		} else if (ipa2 != "") {
			LHS = LHS "/" ipa2 "/ ";
			ipa1 = ""; ipa2 = "";
		}
	}
	if (gloss != "")
		LHS = LHS "(" gloss ") ";

# wrap the target in [[ ]] unless it already contains a wikilink
	if (index(link, "[[") == 0)
		outp = LHS " SEE: [[" link "]] ::";
	else
		outp = LHS " SEE: " link " ::";

	if (enable_trans_see == 1)
		printout(outp);

	gloss = "";
	nestsect = 0;
}
 
# Nested-section tracking: a top-level bullet ("*" not followed by "*" or
# ":"), a ttbc/trans-/trreq template or a "["-initial line closes the
# current nested section ...
$0 ~ /^[*][^*:]|\{\{ttbc|\{\{trans\-|\{\{trreq|^[[]/ {
	nestsect = 0;
}
#/^\*[\x20]*[[]*[A-Z]|\{\{ttbc|\{\{trans\-|\{\{trreq/ {nestsect = 0;}

# ... and a line matching neststart (the target language's own headline)
# opens one again.
$0 ~ neststart {
	nestsect = 1;
}

# skip lines of excluded languages and {{trreq| ... request lines
($0 ~ exclude) || ($0 ~ /\{\{trreq\|/) {
	next;
}

# everything below only applies inside a Translations section
#$0 ~ transline	{
trans == 0 {
	next;
}
# Translation lines: every bullet line ("*...") that reaches this point lies
# inside a Translations section. Lines are converted only while nestsect==1,
# i.e. after a line matching neststart; all other bullet lines are skipped
# by the trailing "next".
#
# The rule builds the left-hand side (LHS: headword, POS, IPA, gloss),
# rewrites the wikitext of the line step by step into the right-hand side
# (TR), and merges consecutive lines with the same LHS into one output line
# joined by "; ". The merged line is printed when the LHS changes (the last
# one is printed by the END rule). The order of the rewriting steps matters.
/^[*]/ {
#print "transline:"$0";trans="trans";nestsect="nestsect;
	if((trans==1)&&(nestsect==1)) {
 
# set LHS
		LHS = sprintf("[[%s]] ", title); 
		if(pos != "") LHS = (LHS sprintf("{%s} ", pos));
# the pronunciation is attached once and then cleared
		if(enable_ipa==1) {
			if(ipa1!="") { LHS = (LHS sprintf("/%s/ ", ipa1)); ipa1=""; ipa2="";}
			if(ipa2!="") { LHS = (LHS sprintf("/%s/ ", ipa2)); ipa1=""; ipa2="";}
			}	
		if (gloss != "") LHS = (LHS sprintf("(%s) ", gloss));
 
# conversion of obsolete/redirected/equivalent/recently changed templates
# 
# nested templates
$0 = nested_templates($0);
 
# the g-template
# gsub(/\{\{g\|/, "{{", $0);
if($0 ~ "\\{\\{g\\|") { $0 = template_g($0); }
# normalise qualifier aliases and the t-template variants to one name each
gsub(/\{\{(i|italbrac|ib|qual)\|/, "{{qualifier|", $0);
gsub(/\{\{(t-simple|apdx-t|t-SOP|t[+]|t[-]|tø|t0)\|/, "{{t|", $0);
 
# on the headline of the generic language itself, prefix every {{t|iso|
# with the qualifier configured for that iso code
regexp="^\\*[ ]*[[]*"generic_lang"[ ]*[]]*[:]";
if($0 ~ regexp) {
 
for(i=1;i<=n_iso;i++) {
regexp="\\{\\{t\\|"iso_array[i]"\\|"
repl=qualifier[iso_array[i]]"&";
#print regexp"; "repl;
gsub(regexp, repl);
} }
 
# replace a leading language name (plain or inside {{ttbc|...}}) by its qualifier
for(i=1;i<=n_lang;i++) {
regexp="^[*:]*[ ]*[[]*"lang_array[i]"[]]*[ :]*|^[*:]*[ ]*\\{\\{ttbc\\|"lang_array[i]"\\}\\}[ :]*";
gsub(regexp, qualifier[lang_array[i]]);
}
 
# same for a leading {{ttbc|iso}}
for(i=1;i<=n_iso;i++) {
regexp="^[*:]*[ ]*\\{\\{ttbc\\|"iso_array[i]"\\}\\}[ :]*";
gsub(regexp, qualifier[iso_array[i]]);
}
 
# remove remaining "^** " from qualifier nested sections
			sub(/^[*:]*[ ]*/, "" ,$0);
 
# remove xs parameter from t-templates:
			gsub(/\|xs=[^\|\}]*/, "", $0);
 
# remove sc script-type parameter from t-templates:
			gsub(/\|sc=[^\|\}]*/, "", $0);
 
# from here on the line is transformed in TR, $0 stays untouched
			TR=$0;	
 
# remove script templates Cyrl, Arab, fa-Arab, Thai, IPA, IPAchar, unicode, Jpan, Latinx, Hani, Hans, Hant, Tfng, Deva, Hebr, Kore, Hang:
			TR=gensub(/(\{\{(Arab|Cyrl|fa-Arab|IPA|IPAchar|Thai|unicode|Jpan|Latinx|Hani|Hans|Hant|Tfng|ku-Arab|Deva|Hebr|Kore|Hang)\|)([^}]*)(\}\})/, "\\3", "g", TR);
 
# language family related templates
	if(generic_lang=="Arabic") {
			gsub(/\{\{LR\}\}/,"",TR);
			gsub(/\{\{dual\}\}/,"{dual}",TR);		
	}
 
	if((generic_lang=="Chinese")||(generic_lang=="Mandarin")) {
# first version without wikilinks inside
			TR=gensub(/(\{\{zh\-ts\|)([^\|\[]*)(\|)([^\|\[]*)(\}\})/, "[[\\2]], [[\\4]]", "g", TR);
			TR=gensub(/(\{\{zh\-zh\-p\|)([^\|\[]*)(\|)([^\|]*)(\}\})/, "[[\\2]] /\\4/", "g", TR);
			TR=gensub(/(\{\{zh\-tsp\|)([^\|\[]*)(\|)([^\|\[]*)(\|)([^\|]*)(\}\})/, "[[\\2]], [[\\4]] /\\6/", "g", TR);
# next allow wikilinks but don't link additionally
                        TR=gensub(/(\{\{zh\-ts\|)([^\|]*)(\|)([^\|]*)(\}\})/, "\\2, \\4", "g", TR);
                        TR=gensub(/(\{\{zh\-zh\-p\|)([^\|]*)(\|)([^\|]*)(\}\})/, "\\2 /\\4/", "g", TR);
                        TR=gensub(/(\{\{zh\-tsp\|)([^\|]*)(\|)([^\|]*)(\|)([^\|]*)(\}\})/, "\\2, \\4 /\\6/", "g", TR);
	}
 
# rm rfr rfscript etc
			gsub(/\{\{(rfr|rfscript|rftranslit)\|[^}]*\}\}/, "", TR);
 
# convert {{term|...|lang=xx}} -> [[...]]
			TR=gensub(/(\{\{term\|)([^}\|]*)([^}]*)(\}\})/, "[[\\2]]", "g", TR);
 
# convert {{l|iso|...|...|tr=...|g=...}} -> [[...|...]] /TR/
# transliterations (moved behind the template, or dropped when rmtr==1)
	if(latin == 0) 	{
			regexp = "(\\{\\{l[^}]*)(\\|tr=)([^|}]*)([^}]*\\}\\})";
			if(rmtr==0) TR=gensub(regexp, "\\1\\4 /\\3/", "g", TR);
			if(rmtr==1) TR=gensub(regexp, "\\1\\4", "g", TR);
			}
# rm gloss and g
			regexp = "(\\{\\{l[^}]*)(\\|)(g=|g2=|gloss=)([^|}]*)([^}]*\\}\\})"
			TR=gensub(regexp, "\\1\\5", "g", TR);
# the l-template (use wikilinks)
			TR=gensub(/(\{\{l[\|\/])([^}\|]*\|)([^}]*)(\}\})/, "[[\\3]]", "g", TR);
#			TR=gensub(/(\{\{l\|)([^}\|]*\|)([^}\|]*)(\|[^}]*\}\})/, "[[\\3]]", "g", TR);
# the lang-template (no wikilinks)
			TR=gensub(/(\{\{lang[\|])([^}\|]*\|)([^}]*)(\}\})/, "\\3", "g", TR);
 
 
# convert {{t|...}} -> [[...]] and determine gender and singular/plural:	
############################################################################
# move Transliteration (tr= arguments) -> /.../
	if(latin == 0) 	{
			regexp = "(\\{\\{[^}]*)(\\|tr=)([^|}]*)([^}]*\\}\\})";
			if(rmtr==0) TR=gensub(regexp, "\\1\\4 /\\3/", "g", TR);
			if(rmtr==1) TR=gensub(regexp, "\\1\\4", "g", TR);
			}
 
# the gender-number g-template calls inside {{t|
# each pass moves one gender argument out of the template as " {gender}";
# the substitution is repeated MAXGENDER times
# TODO: nounclasses cxx not yet supported
			GENDERS="an|in|pr|impf|pf|m|f|n|c|s|d|p"	
			regexp="(\\{\\{t\\|("iso")\\|[^}]*)(\\|)(("GENDERS")([-]("GENDERS"))*)(\\}\\}|\\|[^}]*\\}\\})";
			MAXGENDER=4;
			for(i=0;i<MAXGENDER;i++) 
				{TR=gensub(regexp, "\\1\\8 {\\4}", "g", TR);}
 
# {{t|iso|word}} -> [[word]] when the argument contains no brackets,
# otherwise the argument is kept as it is
			regexp = "(\\{\\{t\\|("iso")\\|)([^}\\]\\[]*)(\\}\\})";
			TR=gensub(regexp, "[[\\3]]", "g", TR);
			regexp = "(\\{\\{t\\|("iso")\\|)([^}]*)(\\}\\})";
			TR=gensub(regexp, "\\3", "g", TR);
# alt= arguments become the link label
			gsub(/\|alt\=/, "|", TR);
			TR=gensub(/([[][[])(alt\=)([^|]*)(\|)([^]]*)([]][]])/, "[[\\5|\\3]]", "g", TR);
 
 
# remove "#Language" section anchors from link targets
			regexp = "\\[\\[\\#("lang")\\|";
			gsub(regexp, "[[", TR);
			regexp = "#("lang")\\|";
			gsub(regexp, "|", TR);
 
# convert common gender "{c}" to "{m-f}" for languages de, es, fr, it, pt
if((iso=="de")||(iso=="es")||(iso=="fr")||(iso=="it")||(iso=="pt")) {
			gsub(/\{\{c\}\}/,"{m-f}",TR);
			gsub(/\{c\}/,"{m-f}",TR);
			}
 
# convert obsolete {{plural}} to {p}
			gsub(/\{\{plural\}\}/,"{p}",TR);
 
# convert {{pf.}}, {{impf}}
			gsub(/\{\{impf\}\}/,"{impf}",TR); gsub(/\{\{pf[.]*\}\}/,"{pf}",TR);
 
# convert {{indeclinable}} {{indecl}}
			gsub(/\{\{(indecl|indeclinable)\}\}/,"{indecl}",TR);
 
# remove {{g|}}, {{attention|}} tags, {{rfc-tbot}}, {{inv}}
			gsub(/\{\{g\|[^\}]*\}\}|\{\{attention\|[^\}]*\}\}|\{\{rfc-tbot\}\}|\{\{inv\}\}/,"",TR);
 
# convert {{not used|iso}} -> Not used in LANG
			regexp = "(\\{\\{not used\\|("iso")\\}\\})";
			repl = "Not used in "generic_lang;
			gsub(regexp, repl, TR);
 
# convert "qualifier" templates -> [...]:
			TR=gensub(/(\{\{qualifier\|)([^}]*)(\}\})/, "[\\2]", "g", TR);
 
# remove (1) and the like
			gsub(/\([0-9 ,;-]*\)/, "", TR);
 
# convert Transliteration in brackets (...) -> /.../
# (or delete it when rmtr==1); links_inside_tr selects whether "[" is
# allowed inside the brackets
		if(latin == 0) {
			if(rmtr==0) {
			if(links_inside_tr==1) TR=gensub(/([^'])(\()([^'][^){]*)(\))/ ,"\\1/\\3/", "g", TR);
			if(links_inside_tr==0) TR=gensub(/([^'])(\()([^'][^)[{+"]*)(\))/ ,"\\1/\\3/", "g", TR);
				}
			if(rmtr==1) {
			if(links_inside_tr==1) TR=gensub(/([^'])(\()([^'][^){]*)(\))/ ,"\\1", "g", TR);
			if(links_inside_tr==0) TR=gensub(/([^'])(\()([^'][^)[{+"]*)(\))/ ,"\\1", "g", TR);
				}
 
# rm empty transliteration "//" (due to empty tr argument)
			gsub(/[/][/]/, "", TR);
			}
# rm empty wikilinks
			gsub(/\[\[[ ]*\]\]/, "", TR);
 
# convert {{gloss|...}},  {{sense|...}} -> (...)
			TR=gensub(/(\{\{(gloss|sense)\|)([^}]*)(\}\})/, "(\\3)", "g", TR);
 
# remove <\/text>, (might be there at the end of page (XML-code)			
			gsub(/<\/text>/,"",TR);
 
# change [ | | ] -> [ , , ]
# repeated until nothing changes (at most 20 passes)
			old=TR;			
			for(i=0;i<20;i++)
			{TR=gensub(/((^|[^\[])\[[^\]\[]*)(\|)([^\]]*])/, "\\1, \\4", "g", old);
  			if(TR==old) break;
  			old=TR;}
 
# same LHS as the previous line: append this translation with "; "
if(LHS == oldLHS) {
if(TR != "") {if(oldRHS != "") oldRHS = oldRHS"; "TR;
		if(oldRHS == "") oldRHS = TR; }
}
 
# new LHS: print the accumulated previous entry and start a new one
if(LHS != oldLHS) {
if(oldRHS != "")  {outp = (oldLHS sprintf(":: %s",oldRHS));
			printout(outp);}
oldLHS = LHS;
oldRHS = TR;
}
 
# print a comment if POS is unknown
			if (pos == "") print "#WARNING: unknown POS on page:\""title"\"";
# end trans=1
} 
next;
# end determine translations	
}
 
# Nested lines starting with "** {{qualifier|" are silently skipped so that
# they do not flood the dictionary with warnings.
$0 ~ /^[*:]*[ ]*\{\{qualifier\|/ {
	next;
}

# Report nested lines seen inside the target language's nested section
# (this line might flood the file with warnings).
($0 ~ /^[*][*:]/) && (trans == 1) && (nestsect == 1) {
	print "#WARNING: unknown nested section headword on page:\""title"\", :",$0;
}
 
# Flush the last accumulated entry, which no later line could trigger.
END {
	if (oldRHS != "") {
		outp = oldLHS ":: " oldRHS;
		printout(outp);
	}
}

Dictionaries from non-English language sections[edit]

This is a gawk script to create wikified bilingual dictionaries from the foreign language (FL) sections using the database dump.

Usage:[edit]

bzcat enwiktionary-DATE-pages-articles.xml.bz2|gawk -v LANG=foreign-language -f trans-FL-en.awk|sort -d -k 1,1 -t"{">xx-en.wiki

  • Currently supported foreign languages: Italian, Spanish, French, Finnish, Portuguese

Code:[edit]

# gawk script to create a Foreign_Language-English dictionary from 
# the Foreign_Language sections of en.wiktionary.org
#
# (c) 2011-2014 by Matthias Buchmeier
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#
# TODO:
# rm lines with {{past participle of| ?
# detect gender from {{es-proper noun|
# include IPA?
# {{soplink|
# {{vern|, {{short form of|
# {{g|: DONE
# {{cx| -> {{context|: DONE
# {{label|ISO| -> {{context|: DONE
# changing context-label code, once the new context template is settled
#
# Command-line options:
########################
#  required gawk command-line switches:
#
#    name of the language to be extracted
#    -v LANG=language 
#
#    iso-code of the language to be extracted
#    -v ISO=iso-code
#
#  optional gawk command-line switches:
#
#    remove wiki-links and wiki-style bolding, italicizing:
#    -v REMOVE_WIKILINKS=y --re-interval
#
#########################
 
# BEGIN block of the foreign-language -> English script: chooses the
# language (default Spanish, overridable with -v LANG / -v ISO), loads the
# language-specific regexps, fills every regexp that the chosen language
# does not define with the placeholder "XXXXXX", and initialises the parser
# state.
BEGIN {
#########################
# User defined variables:
#########################
# English names of the target language
# supported at the moment: Italian, French, Spanish, Finnish, Portuguese
# default language:
lang="Spanish";
# default isocode
iso="es"
#
# command line parsing
#
if(LANG!="") lang = LANG;
if(ISO!="") iso = ISO;
# -1 = not configured by a language preset; replaced by the default below
no_head=-1;
#
# language specific configuration 
# (a preset overwrites an iso code given on the command line)
#
# configuration for es-en dictionary
#
if(lang=="Spanish") {
# iso code of the language
iso = "es";
# exclude entire current POS if regexp is matched
# this regexp typically contains headline-templates of non-lemma (form of) entries
exclude_POS="\\{\\{es\\-verb\\-form\\||\\{\\{es-adj[^\\}]*\\|(m|masculine)\\=|\\{\\{head\\|es\\|(noun|adjective|verb) form|\\{\\{es\\-adj\\-form|\\{\\{head\\|es\\|(misspelling|obsolete)|\\{\\{misspelling of\\||\\{\\{es-past participle|\\{\\{es\\-pp\\|";
# exclude matched definition lines
# this regexp typically contains form of templates
exclude_defn="\\{\\{(es-verb form of|rfdef|defn)\\|";
# regexp matching head line of a masculine noun
m_noun="\\{\\{es\\-noun(\\|m|.*\\|g=m)[\\|\\}]";
# regexp matching head line of a feminine noun
f_noun="\\{\\{es\\-noun(\\|f|.*\\|g=f)[\\|\\}]";
# regexp matching head line of a both masculine  and feminine noun
mf_noun="\\{\\{es\\-noun(\\|mf|.*\\|g=mf)[\\|\\}]";
# regexp matching verb headline
verbhead="\\{\\{es\\-verb[\\|\\}]|\\{\\{head\\|es\\|verb[\\|\\}]";
# discard entries without head-line template:
no_head=0;
# language specific templates to be removed from output lines
rmtemplate="\\{\\{(es\\-demonstrative\\-accent-usage|gloss\\-stub\\|(Spanish|es))[^}]*}}";
}
#
# configuration for it-en dictionary (same variables as for Spanish)
if(lang=="Italian") {
iso = "it";
exclude_POS="\\{head\\|it\\|[^\\:\\|]* form[s]*[\\|\\}]|\\{\\{(head\\|it\\|(misspelling|obsolete|plural|g=)|it-pp)[\\|\\}]|\\{\\{head\\|it\\}";
exclude_defn="Compound of|\\{\\{(present participle of|past participle of|rfdef|defn|misspelling of|conjugation of|feminine plural past participle of|masculine plural past participle of|feminine past participle of|inflection of|masculine plural of)\\|";
m_noun="\\{\\{it\\-noun\\|[^\\|]*(\\|m|.*\\|g=m)[\\|\\}]";
f_noun="\\{\\{it\\-noun\\|[^\\|]*(\\|f|.*\\|g=f)[\\|\\}]";
mf_noun="\\{\\{it\\-noun\\|[^\\|]*(\\|mf|.*\\|g=mf)[\\|\\}]";
verbhead="\\{\\{it\\-verb[\\|\\}]|\\{\\{head\\|it\\|verb[\\|\\}]";
no_head=1;
rmtemplate="\\{\\{gloss\\-stub\\|(Italian|it)[^}]*}}|\\{\\{jump\\|[^}]*}}";
}
#
# configuration for fr-en dictionary
if(lang=="French") {
iso = "fr";
exclude_POS="\\{\\{head\\|fr\\|[^\\|]* form[s]*[\\|\\}]|\\{\\{head\\|fr\\|(misspelling|obsolete|plural|present participle|g=)|\\{\\{(misspelling of|fr-pp|fr-verb-form|fr-verb form|fr-adj-form|fr-past participle)(\\||\\})|\\{\\{head\\|fr\\}";
exclude_defn="\\{\\{past participle of\\||Compound of|masculine plural past participle of|present participle of|feminine plural past participle of|masculine plural of|conjugation of|inflection of|plural of|feminine plural of|feminine past participle of|plural past participle of|\\{\\{(rfdef|defn)\\|";
m_noun="\\{\\{fr\\-noun(\\|m|.*\\|g=m)[\\|\\}]";
f_noun="\\{\\{fr\\-noun(\\|f|.*\\|g=f)[\\|\\}]";
mf_noun="\\{\\{fr\\-noun(\\|mf|.*\\|g=mf)[\\|\\}]";
verbhead="\\{\\{fr\\-verb[\\|\\}]|\\{\\{head\\|fr\\|verb[\\|\\}]";
no_head=1;
rmtemplate="\\{\\{gloss-stub\\|(French|fr)[^}]*}}|\\{\\{jump\\|[^}]*}}";
}
#
# configuration for fi-en dictionary (no noun-gender or verb-headline regexps)
if(lang=="Finnish") {
iso = "fi";
exclude_POS="\\{\\{head\\|fi\\|(noun|adjective|verb|proper noun) form|\\{\\{head\\|fi\\|(misspelling|obsolete)|\\{\\{misspelling of\\||\\{\\{head\\|fi\\}\\}";
exclude_defn="\\{\\{(fi-form of|fi-participle of|infinitive of|inflected form of|agent noun of|fi-verb form of|defn|rfdef|nominative plural of|rftrans)\\|";
no_head=1;
rmtemplate="\\{\\{gloss\\-stub\\|Finnish[^}]*}}";
}
#
# configuration for pt-en dictionary (no exclude_POS, rmtemplate or no_head;
# these get the defaults assigned below)
if(lang=="Portuguese") {
iso = "pt";
exclude_defn="\\{\\{(pt-verb form of|pt-verb-form-of|conjugation of|misspelling of|pt-noun form of|pt-adj form of|feminine past participle of|feminine plural past participle of|masculine plural of|inflection of|pt-ordinal form|pt-adv form of|plural form of|pt-article form of|masculine plural past participle of|pt-cardinal form of|pt-apocopic-verb)\\|";
# regexp matching head line of a masculine noun
m_noun="\\{\\{pt\\-noun(\\|m|.*\\|g=m)[\\|\\}]";
# regexp matching head line of a feminine noun
f_noun="\\{\\{pt\\-noun(\\|f|.*\\|g=f)[\\|\\}]";
# regexp matching head line of a both masculine  and feminine noun
mf_noun="\\{\\{pt\\-noun(\\|mf|.*\\|g=mf)[\\|\\}]";
# regexp matching verb headline
verbhead="\\{\\{pt\\-verb[\\|\\}]|\\{\\{head\\|pt\\|verb[\\|\\}]";
}
#
#
# initialization of undefined lang-specific regexps
# (an empty string used as a regexp would match every line, so a
# placeholder string is stored instead)
if(m_noun=="") m_noun="XXXXXX";
if(f_noun=="") f_noun="XXXXXX";
if(mf_noun=="") mf_noun="XXXXXX";
if(verbhead=="") verbhead="XXXXXX";
if(rmtemplate=="") rmtemplate="XXXXXX";
if(no_head==-1) no_head=1;
if(exclude_POS=="") exclude_POS="XXXXXX";
if(exclude_defn=="") exclude_defn="XXXXXX";
 
#
# initialization of variables used for parsing
#
# set to 0/1 if outside/inside language section 
langsect=0; 
# variable holding POS (part of speech) information 
# pos=="-" means the current POS is a non-lemma form to be excluded from the dictionary 
pos= ""; 
# variable holding additional grammatical information as gender, plural/singular etc ({mfncps}})
gend="";
# variable holding page title
title = ""; 
#
# language dependent regular expressions
#
# command-line options
#
if(REMOVE_WIKILINKS == "y") remove_wikilinks = 1;
#
# regexp matching language section header
# (\x3D is "=", \x20 is a space)
langhead="\\x3D\\x3D[\\x20]*"lang"[\\x20]*\\x3D\\x3D";
# regexp matching {{head|...|noun...
nounhead="\\{\\{head\\|"iso"\\|noun";
# regexp matching the category links of nouns, adjectives and verbs
# NOTE(review): presumably used to warn about entries missing a headline
# template; the rule that uses it is outside this view - confirm.
warnmissing="[[][[]Category:"lang" (nouns|adjectives|verbs)[]][]]";
 
# print lang" "iso" "no_head;
 
# mapping of iso-codes to language-names (no longer supported by templates)
isocodes="en|grc|la|es|ru|pt|LL.|it|gem|cel|ga|eu|de|fr|sv|ar|cel-gae";
languages="English|Ancient Greek|Latin|Spanish|Russian|Portuguese|Late Latin|Italian|Germanic|Celtic|Irish|Basque|German|French|Swedish|Arabic|Goidelic";
# write isocodes and language-names into array
# (language_names[code] = name, paired by list position)
n_iso=split(isocodes,iso_array,"|");
split(languages,languages_array,"|");
for(i=1;i<=n_iso;i++) { language_names[iso_array[i]] = languages_array[i];
#print iso_array[i]" "language_names[iso_array[i]];
}
 
}
 
# printout(outp): clean up one finished dictionary line and print it to stdout.
#
#   outp  the assembled output line ("[[title]] {pos} :: definition")
#
# Steps: decode the XML/HTML entities found in the dump, strip <ref> elements
# and one-line comments, squeeze runs of spaces and, when the global
# remove_wikilinks is 1, strip [[...]] link markup and ''/''' quote markup.
# The names after the extra spaces in the parameter list are locals.
function printout(outp,    ref_open, ref_tail, ref_close) {

# convert special xml formatting like &lt; to html
	gsub(/&lt;/, "<", outp);
	gsub(/&gt;/, ">", outp);
# "\\&" yields a literal ampersand; a bare "&" would re-insert the matched text
	gsub(/&amp;/, "\\&", outp);
	gsub(/&quot;/, "\"", outp);
	gsub(/&nbsp;/, " ", outp);
	gsub(/&hellip;/, "...", outp);
	gsub(/&[mn]dash;/, "-", outp);

# NOTE: these must be done after converting '&lt;' -> '<'  and '&gt;' -> '>'
# remove self-closing <ref ... />
	gsub(/<ref[^>]*\/>/, "", outp);

# remove <ref [name=....]> blabla </ref>, pairing every opening tag with the
# NEAREST closing tag.  A single greedy regexp (<ref...>.*</ref>) would also
# swallow the definition text standing between two separate references.
	while (match(outp, /<ref[^>]*>/)) {
		ref_open = RSTART;
		ref_tail = substr(outp, RSTART + RLENGTH);
		ref_close = index(ref_tail, "</ref>");
# no closing tag after this opening tag: nothing more can be removed
		if (ref_close == 0) break;
# 6 == length("</ref>")
		outp = substr(outp, 1, ref_open - 1) substr(ref_tail, ref_close + 6);
	}

# remove one-line <!-- commented text -->
	gsub(/<!--[^!>]*-->/, "", outp);

# remove extra spaces
	gsub(/[ ]+/, " ", outp);

	if (remove_wikilinks == 1) {
# [[target|label]] -> label
		outp = gensub(/([[][[])([^]|]*\|)([^]]*)([]][]])/, "\\3", "g", outp);
# [[target]] -> target
		outp = gensub(/([[][[])([^]]*)([]][]])/, "\\2", "g", outp);
# drop ''italic'' / '''bold''' quote markup
		gsub(/[']{2,}/, "", outp);
	}

	print outp;
}
 
 
# TODO: yet no support for generic cXX noun types
# template_g(input): expand gender templates in a string.
#
#   input    text possibly containing {{g|...}} templates
#   returns  the text with each recognised gender argument rewritten as
#            "{spec}", e.g. "{{g|m|f}}" -> "{m} {f}"
#
# Every pass peels the first argument off each {{g|...}} template; the emptied
# "{{g}}" shell (and the spaces before it) is then deleted.  At most five
# passes are made; if a "{{g|" is still present after that (more arguments
# than passes, or an argument the regexp does not recognise) a warning line
# is printed and the text is returned as it stands.
#
# The names after the extra spaces in the parameter list are locals, so the
# function no longer overwrites the globals i and regexp used by the rules.
function template_g(input,    text, gender_re, pass, max_passes) {

	max_passes = 5;
	text = input;
	gender_re = "(\\{\\{g)(\\|)((an|in|pr|impf|pf|m|f|n|c|s|d|p)([-](an|in|pr|impf|pf|m|f|n|c|s|d|p))*)((\\|[^\\}]*)*(\\}\\}))";

	for (pass = 1; ; pass++) {
# {{g|X|rest}} -> {X} {{g|rest}}   ({{g|X}} -> {X} {{g}})
		text = gensub(gender_re, "{\\3} \\1\\7", "g", text);
# drop exhausted templates
		gsub("[ ]*\\{\\{g\\}\\}", "", text);

		if (text !~ "\\{\\{g\\|") break;
		if (pass >= max_passes) { print "# Warning too many genders"; break; }
	}

	return text;
}
 
 
# A <title> line starts a new page: reduce $0 to the bare page title, remember
# it, and reset all per-page parser state.  $0 is deliberately left modified;
# the line falls through to the rules below.
/\x3Ctitle/ {
	# cut everything before the first '<' and after the last '>'
	sub(/^[^\x3C]*/, "")
	sub(/[^\x3E]*$/, "")
	# strip the surrounding tags
	gsub(/\x3Ctitle\x3E/, "")
	gsub(/\x3C\/title\x3E/, "")

	title = $0
	langsect = 0
	pos = gend = gend2 = ""
}
 
# Drop dump-metadata lines (faster, and avoids false "trans-see" hits coming
# from edit-comment lines) as well as empty lines.
/<comment>|<\/?page>|<timestamp>|<id>|<\/?contributor>|<\/?revision>|<username>|<minor \/>/ { next }
/^$/ { next }

# Pages whose title contains one of these namespace prefixes are not
# dictionary entries: blank the title (nothing is printed for a page with an
# empty title) and skip the line.
index(title, "Wiktionary:") || index(title, "Template:") || index(title, "Index:") || index(title, "Appendix:") || index(title, "User:") || index(title, "Help:") {
	title = ""
	next
}
 
 
# Header of the wanted language section: switch parsing on.
$0 ~ langhead {
	langsect = 1
	pos = gend = gend2 = ""
	next
}

# Any other level-2 header at the start of a line ends the language section.
/^\x3D\x3D[^\x3D]+/ {
	langsect = 0
	pos = gend = gend2 = ""
	next
}

# Language and title detection done; ignore every line outside the LANG section.
langsect == 0 { next }
 
# Determine the POS (part of speech) from section headers.
#
# Any line containing "===" first resets pos, gend and gend2.  The header
# rules below are then tried in file order; each one that matches sets pos and
# skips the rest of the script for this line ("next"), so the first matching
# rule wins.  A header matched by none of them leaves pos empty.
/\x3D\x3D\x3D/ { pos=""; gend=""; gend2=""; }
# "Noun" and "Proper noun" headers may carry a trailing number (e.g. "Noun 2")
/\x3D\x3D\x3D[\x20]*Noun[\x20]*[1-9]*\x3D\x3D\x3D/ { pos="n"; next;}
# stricter variant, disabled in favour of the prefix match below:
#/\x3D\x3D\x3D[\x20]*Verb[\x20]*\x3D\x3D\x3D/ { pos="v"; next;}
# matches every header whose name starts with "Verb" (no closing "===" required)
/\x3D\x3D\x3D[\x20]*Verb/ { pos="v"; next;}
/\x3D\x3D\x3D[\x20]*Adjective[\x20]*\x3D\x3D\x3D/ { pos="adj"; next;}
/\x3D\x3D\x3D[\x20]*Adverb[\x20]*\x3D\x3D\x3D/ { pos="adv"; next;}
/\x3D\x3D\x3D[\x20]*Interjection[\x20]*\x3D\x3D\x3D/ { pos="interj"; next;}
/\x3D\x3D\x3D[\x20]*Article[\x20]*\x3D\x3D\x3D/ { pos="art"; next;}
/\x3D\x3D\x3D[\x20]*Proper\x20noun[\x20]*[1-9]*\x3D\x3D\x3D/ { pos="prop"; next;}
/\x3D\x3D\x3D[\x20]*Preposition[\x20]*\x3D\x3D\x3D/ { pos="prep"; next;}
/\x3D\x3D\x3D[\x20]*Postposition[\x20]*\x3D\x3D\x3D/ { pos="postp"; next;}
# headers written as a template call, e.g. ==={{initialism...
/\x3D\x3D\x3D[\x20]*\{\{initialism/ { pos="initialism"; next;}
/\x3D\x3D\x3D[\x20]*Numeral[\x20]*\x3D\x3D\x3D/ { pos="num"; next;}
/\x3D\x3D\x3D[\x20]*Cardinal num(ber|eral)[\x20]*\x3D\x3D\x3D/ { pos="cardinal num"; next;}
/\x3D\x3D\x3D[\x20]*Ordinal (number|numeral)[\x20]*\x3D\x3D\x3D/ { pos="ordinal num"; next;}
/\x3D\x3D\x3D[\x20]*Number[\x20]*\x3D\x3D\x3D/ { pos="num"; next;}
/\x3D\x3D\x3D[\x20]*\{\{acronym/ { pos="acronym"; next;}
/\x3D\x3D\x3D[\x20]*Acronym/ { pos="acronym"; next;}
/\x3D\x3D\x3D[\x20]*\{\{abbreviation/ { pos="abbr"; next;}
/\x3D\x3D\x3D[\x20]*Determiner[\x20]*\x3D\x3D\x3D/ { pos="determiner"; next;}
/\x3D\x3D\x3D[\x20]*Phrase[\x20]*\x3D\x3D\x3D/ { pos="phrase"; next;}
/\x3D\x3D\x3D[\x20]*Suffix[\x20]*\x3D\x3D\x3D/ { pos="suffix"; next;}
/\x3D\x3D\x3D[\x20]*Pronoun[\x20]*\x3D\x3D\x3D/ { pos="pron"; next;}
/\x3D\x3D\x3D[\x20]*Conjunction[\x20]*\x3D\x3D\x3D/ { pos="conj"; next;}
/\x3D\x3D\x3D[\x20]*Proverb[\x20]*\x3D\x3D\x3D/ { pos="proverb"; next;}
/\x3D\x3D\x3D[\x20]*Contraction[\x20]*\x3D\x3D\x3D/ { pos="contraction"; next;}
/\x3D\x3D\x3D[\x20]*Particle[\x20]*\x3D\x3D\x3D/ { pos="particle"; next;}
/\x3D\x3D\x3D[\x20]*Symbol[\x20]*\x3D\x3D\x3D/ { pos="symbol"; next;}
/\x3D\x3D\x3D[\x20]*Prefix[\x20]*\x3D\x3D\x3D/ { pos="prefix"; next;}
/\x3D\x3D\x3D[\x20]*Letter[\x20]*\x3D\x3D\x3D/ { pos="letter"; next;}
/\x3D\x3D\x3D[\x20]*Abbreviation[\x20]*\x3D\x3D\x3D/ { pos="abbr"; next;}
/\x3D\x3D\x3D[\x20]*Initialism[\x20]*\x3D\x3D\x3D/ { pos="initialism"; next;}
/\x3D\x3D\x3D[\x20]*Idiom[\x20]*\x3D\x3D\x3D/ { pos="idiom"; next;}
/\x3D\x3D\x3D[\x20]*Affix[\x20]*\x3D\x3D\x3D/ { pos="affix"; next;}
# phrase headers are folded into the POS of their head word
/\x3D\x3D\x3D[\x20]*Adverbial phrase[\x20]*\x3D\x3D\x3D/ { pos="adv"; next;}
/\x3D\x3D\x3D[\x20]*Prepositional phrase[\x20]*\x3D\x3D\x3D/ { pos="prep"; next;}
#
# Usage notes don't contain definitions: pos="-" marks the section as excluded
/\x3D\x3D\x3D[\x20]*Usage notes[\x20]*\x3D\x3D\x3D/ { pos="-"; next;}
 
# Lines containing "#:", "##" or "#*" are treated as examples, not
# definitions: omit them.
/\x23\:|\x23\x23|\x23\*/ { next }

# Rewrite calls of the old {{infl|...}} head-line template to its {{head|...}}
# replacement, so later rules only have to know one spelling.
/\{\{infl\|/ { gsub(/\{\{infl\|/, "{{head|") }

# Rewrite the new {{label|iso|...}} template to the {{cx|...}} form.
/\{\{label\|/ {
	regexp = "\\{\\{label\\|"iso"\\|"
	gsub(regexp, "{{cx|")
}

# "Form of" head lines: mark the current POS section as excluded.
$0 ~ exclude_POS { pos = "-" }
 
# Discard entries without a head-line template.
# With option "no_head=1" (for languages whose non-lemma entries use a plain
# '''WORD''' head line rather than {{head|iso|... form|) a line that starts
# with ''' and contains '''title''' excludes the current adjective, noun or
# verb section.
/^[']['][']/ {
	if (no_head == 1 && index($0, "'''"title"'''") != 0 && (pos == "adj" || pos == "n" || pos == "v"))
		pos = "-"
}
 
# Determine the gender of nouns.
#
# Language-specific head-line templates (m_noun / f_noun / mf_noun are
# regexps configured in BEGIN) set gend directly.
$0 ~ m_noun {gend="m";}
$0 ~ f_noun {gend="f";}
$0 ~ mf_noun {gend="mf";}

# Generic {{head|iso|noun...}} head line: collect the gender letters into gend.
# The letters are extracted with the 3-argument match(), which captures only
# the wanted character.  (Replacing the match with gensub() and keeping the
# result leaked any text preceding "{{head|" into gend, and an empty-string
# 'how' argument is not a valid count for gensub().)
$0 ~ nounhead  {
				gend="";
				headre = "\\{\\{head\\|"iso"\\|noun.*";

# determine gender via g, g2, g3 parameters of head-template, e.g. {{head|blabla|g=m|g2=f}}
				if(match($0, headre "\\|g[=]([mfncps])", gcap)) gend = gcap[1];
# |g=m-p etc.: gender plus plural marker
				if(match($0, headre "\\|g[=][mfnc]-p")) gend = (gend "p");

				if(match($0, headre "\\|g2[=]([mfncps])", gcap)) gend = (gend gcap[1]);
				if(match($0, headre "\\|g2[=][mfnc]-p")) gend = (gend "p");

				if(match($0, headre "\\|g3[=]([mfncps])", gcap)) gend = (gend gcap[1]);

# determine gender via extra template called after head-template, e.g.: {{head|blabla}} {{m|f|blabla}}
# (first, second and third argument of that template)
				tmplre = "\\{\\{head\\|"iso"\\|.*\\{";
				if(match($0, tmplre "([mfncps])[\\|\\}]", gcap)) gend = (gend gcap[1]);
				if(match($0, tmplre "[mfncps]\\|([mfncps])[\\|\\}]", gcap)) gend = (gend gcap[1]);
				if(match($0, tmplre "[mfncps]\\|[mfncps]\\|([mfncps])[\\|\\}]", gcap)) gend = (gend gcap[1]);
			}
 
# Verb head line: derive the transitivity flags from keywords on the line.
#   i = intransitive, t = transitive, it = ambitransitive, r = reflexive
$0 ~ verbhead {
	gend = ""
	if (match($0, "intransitive")) gend = gend "i"
	# "transitive" not directly preceded by 'n' or 'i', i.e. neither
	# "intransitive" nor "ambitransitive"
	if (match($0, "[^ni]transitive")) gend = gend "t"
	if (match($0, "ambitransitive")) gend = gend "it"
	if (match($0, "reflexive")) gend = gend "r"
#	print "# VERBHEAD: "$0" on page: "title" gend: "gend;
}

# Lines matching the language-specific exclusion regexp are never output.
$0 ~ exclude_defn { next }
 
# Main section: format output lines.
#
# Runs for every definition line (first non-blank character is '#') while a
# POS section is active that is not excluded (pos != "-") and the page title
# is non-empty.  The wiki templates on the line are rewritten to plain text
# step by step, then one line of the form
#     [[title]] {pos+gender} [context] :: definition
# is handed to printout().  The order of the rewrite steps matters: later
# patterns rely on what earlier ones have already removed or converted.
/^[\x20]*\x23/ 	{if((langsect==1)&&(pos != "-")&&(title!=""))
 			{
# remove some common template options
		gsub(/\|(lang|pos|sc|nodot|nocap|sort|diminutive|nocat|POS|gender|from|skey)=[^\|\}]*/, "", $0);

# remove {{rf* templates 
		gsub(/\{\{(rfex|rfgloss|R\:|attention\||\,|rfv\-sense|defdate)[^}]*\}\}/, "", $0);

# remove language specific templates
		gsub(rmtemplate, "", $0);

# expand {{g| gender templates to {m}, {f}, ...
if($0 ~ "\\{\\{g\\|") { $0 = template_g($0); } 

# convert language specific templates
if(lang=="Portuguese") {
		$0=gensub(/(\{\{(pt-obsolete|pt-superseded-hyphen|pt-superseded-silent-letter-1990)[^}\|]*\|)([^}\|]*)(\}\})/, "obsolete spelling of [[\\3]]", "g", $0);
}

# discard all "form of" and "plural of" entries
		if(index($0, "{{plural of|") !=0) next;
		if(index($0, "{{form of|") !=0) next;
		if(index($0, "{{archaic form of|") !=0) next;
		if(index($0, "{{Latn-def|") !=0) next;

# convert {{term|...|lang=xx}} -> [[...]]
		$0=gensub(/(\{\{term\|)([^}\|]*)([^}]*)(\}\})/, "[[\\2]]", "g", $0);

# convert {{l|???|...}} -> [[...]]
		$0=gensub(/(\{\{l[\|\/][^\|\}]*\|)([^\}]*)(\}\})/, "[[\\2]]", "g", $0);

# taxlink must come before gloss as it is likely nested inside
# convert {{taxlink|
		gsub(/\|noshow=1/, "", $0);
		$0=gensub(/(\{\{taxlink\|)([^\}]*)(\|(genus|subspecies|species|family|tribe|epithet|section|variety|order|tritaxon|subgenus)\}\})/, "[[\\2]]", "g", $0);

# convert {{vern|1}} etc. to [[1]]
		$0=gensub(/(\{\{vern\|)([^}\|]*)(\}\})/, "[[\\2]]", "g", $0)

# convert {{gloss|...}} -> (...)
		$0=gensub(/(\{\{(gloss)\|)([^}{]*)(\}\})/, "(\\3)", "g", $0);

# convert {{non-gloss definition|...}} , {{n-g|, {{w|, {{spelink| {{&lit| {{pedlink| -> ...
		$0=gensub(/(\{\{(non-gloss definition|non gloss definition|n-g|non-gloss|non gloss|w|spelink|pedlink|[&]amp[;]lit)\|)([^}]*)(\}\})/, "\\3", "g", $0);

# convert {{sense|...}} -> (...)
		$0=gensub(/(\{\{sense\|)([^}]*)(\}\})/, "(\\2)", "g", $0);

# convert {{qualifier|...}} -> [...]
		$0=gensub(/(\{\{qualifier\|)([^}]*)(\}\})/, "[\\2]", "g", $0);

# convert {{italbrac|...}} -> (...)
		$0=gensub(/(\{\{italbrac\|)([^}]*)(\}\})/, "(\\2)", "g", $0);

# convert |_| -> " "
		gsub(/\|_\|/, " ", $0);

# convert {{apocopic form of|...}} etc. 
# regexp not working?
		$0=gensub(/(\{\{(apocopic|obsolete|short|informal) form of\|)([^}]*)(\}\})/, "\\2 form of \\3", "g", $0);

# convert {{feminine of|...}} etc. 
		$0=gensub(/(\{\{(feminine singular|feminine|neuter|diminutive|superlative|comparative|augmentative) of\|)([^}]*)(\}\})/, "\\2 form of \\3", "g", $0);

# convert {{reflexive of|link|word}} -> reflexive form of word
		$0=gensub(/(\{\{(reflexive) of\|)([^}]*[|])([^}|]+)(\}\})/, "\\2 form of \\4", "g", $0);

# convert {{reflexive of|link}} -> reflexive form of link
		$0=gensub(/(\{\{(reflexive) of\|)([^}|]+)(\}\})/, "\\2 form of \\3", "g", $0);

# convert {{alternative spelling of|...}} etc. to "<template name> <first argument>"
		$0=gensub(/(\{\{(contraction of|dated form of|alternative capitalization of|informal spelling of|nonstandard spelling of|alternative spelling of|obsolete spelling of|alternative form of|alternate form of|feminine plural of|abbreviation of|acronym of|rare spelling of|archaic spelling of|singular of|obsolete form of|eye dialect of|agent noun of|initialism of|synonym of|alternate spelling of|rare form of|eye dialect|only used in|medieval spelling of|European Portuguese form of|past participle of|superseded spelling of|European Portuguese spelling of|euphemistic spelling of|gerund of)\|)([^}|]*)([^}]*\}\})/, "\\2 \\3", "g", $0);

# convert {{given name|male/female}} -> "male given name" / "female given name"
		$0=gensub(/(\{\{given name\|)(male|female)(\}\})/, "\\2 given name", "g", $0);

# convert {{surname|...}} -> "<lang> surname"
		replacement=lang" surname";
		$0=gensub(/\{\{surname[^}]*\}\}/, replacement, "g", $0);

# convert {{etyl|iso|iso2}} -> language name, for the iso-codes listed in
# iso_array / language_names (filled in BEGIN); other codes are left untouched
# TODO: {{iso}} -> Language-name
#		$0=gensub(/(\{\{etyl[|])([^|]*)([|][^}]*}})/, "{{\\2}}", "g", $0);
regexp="\\{\\{etyl[|]";
if($0 ~ regexp) {

for(i=1;i<=n_iso;i++) {
regexp="\\{\\{etyl\\|"iso_array[i]"\\|[^}]*\\}\\}"
repl=language_names[iso_array[i]];
#print regexp"; "repl;
gsub(regexp, repl);
} }

# verb: determine if transitive, intransitive, reflexive, pronominal
# (gend2 is recomputed for every verb definition line)
			if(pos=="v") {
			gend2="";
                        if(match($0, "intransitive") != 0) gend2 = (gend2 "i");
                        if(match($0, "[^in]transitive") != 0) gend2 = (gend2 "t");
                        if(match($0, "ambitransitive") != 0) gend2 = (gend2 "it");
                        if(match($0, "reflexive") != 0) gend2 = (gend2 "r");
			if(match($0, "pronominal") != 0) gend2 = (gend2 "p");
			}

# build the {...} tag: POS followed by head-line flags (gend) and per-line flags (gend2)
			pos3=( pos gend );
			pos3=( pos3 gend2 );
# drop the 'n' of a noun when a gender letter follows, e.g. "nm" -> "m"
# NOTE(review): the pattern is not anchored, so it applies to the first
# 'n'+gender-letter pair anywhere in pos3 -- confirm this is intended
			pos3=gensub(/(n)([mfncps])/, "\\2", "1", pos3);

                        LHS = sprintf("[[%s]] {%s}",title,pos3);

# strip the leading '#', a trailing </text> and a final full stop
			gsub(/^[\x20]*\x23[\x20]*/,"",$0);			
                        gsub(/<\/text>/,"",$0);			
			gsub(/\.$/,"",$0);
# NOTE(review): |lang=... options were already removed at the top of this
# rule, so this looks redundant -- confirm before deleting
			regexp="\\|lang\\="iso;
			gsub(regexp,"",$0);	
# remove the transitivity keywords from template argument lists
# (they are already encoded in the {...} tag)
			$0 = gensub(/(\{\{|\|)((reflexive|intransitive|ambitransitive|transitive|pronominal)\|)/,"\\1", "g", $0);
			$0 = gensub(/(\|(reflexive|intransitive|ambitransitive|transitive|pronominal))(\}\})/,"\\3", "g", $0);
# NOTE(review): this pattern has only two groups; "\\3" presumably expands to
# nothing, which deletes the bare template -- confirm that is the intent
			$0 = gensub(/(\{\{(reflexive|intransitive|ambitransitive|transitive|pronominal)\}\})/,"\\3", "g", $0);
#			gsub(/\{\{context\|/,"{{", $0);
# {{context|...}} / {{cx|...}} -> {{...}}; then drop templates left empty
			gsub(/\{\{(context|cx)[\|]*/,"{{", $0);
#			gsub(/\{\{label[\|][^\|]*[\|]/,"{{", $0);
			gsub(/\{\{\}\}/,"", $0);	

			gsub(/\#English/,"",$0);
			gsub(/\[\|/,"[",$0);

                        outp = LHS" :: "$0;

# change :: {{...}} -> [...] ::
# templates at beginning of definition-lines are supposed to be context
# (at most 10 leading templates are moved)
			old=outp;			
			for(i=0;i<10;i++)
			{ 
# move contexts to LHS; a leading {{gloss| or {{sense| stays in the definition
			if(match(old, /::[\x20]*\{\{(gloss|sense)[\|]/) != 0) break;
#			if(match(old, /::[\x20]*\{\{[^\}\|]*of[\|\}]/) != 0) break;
			outp=gensub(/(.*)([\x20]::[\x20])([\x20]*\{\{)([^\}]*)(\}\})(.*)/, "\\1 [\\4] \\2\\6", "g", old);
			if(outp==old) break;
 			old=outp;
			}			

# change [ | | ] -> [ , , ]  (repeated until nothing changes, at most 10 times)
			old=outp;			
			for(i=0;i<10;i++)
			{ outp=gensub(/([^\[]\[[^\]\[]*)(\|)([^\]]*])/, "\\1, \\3", "g", old);
			if(outp==old) break;
 			old=outp;
			}

# remove extra spaces
			gsub(/[\ ]+/, " ", outp);

# convert {{plural}} -> {p}, {{m}} -> {m}, {{f}} -> {f}
			outp=gensub(/(\{\{)([mfncsp])(lural\}\}|\}\})/, "{\\2}", "g", outp);

# rm #blabla from link inside square brackets
# first [[#bla|word]] -> [[title|word]] then other cases
			outp=gensub(/(\[\[[#][^\|\]]*)(\|[^\]]*\]\])/, "[["title"\\2", "g", outp);
			outp=gensub(/(\[\[[^\|\]]*[^ ])([ ]*[#][^\|\]]*)(\|[^\]]*\]\])/, "\\1\\3", "g", outp);

			printout(outp);
# definition line met outside any recognised POS header: flag it
# (the warning is printed to standard output, like the dictionary lines)
			if (pos == "") print "#UNKNOWN POS on page ",title ;
			}
			}
 
# A lemma category line (nouns/adjectives/verbs) reached while the section is
# excluded in no_head mode hints at a lemma page without a head-line template:
# log the page to FIXME-<lang>.txt.
$0 ~ warnmissing {
	if (no_head == 1 && pos == "-")
		print "missing head template on page [["title"]]" > ("FIXME-"lang".txt")
}