User:Matthias Buchmeier/trans-en-es.awk

Definition from Wiktionary, the free dictionary
Jump to: navigation, search

Dictionaries from translations sections[edit]

Below you will find a gawk script that creates wikified bilingual dictionaries from the translation sections of the database dump.

Usage:[edit]

  1. Download the database dump (enwiktionary-DATE-pages-articles.xml.bz2) from here.
  2. Copy the code below to trans-en-es.awk.

on LINUX[edit]

  1. Enter the following command (language is the language name e.g. "Spanish", iso-code is the corresponding iso language code, e.g. "es" for Spanish, as they appear in the wiki-code of the translation sections):
    • bzcat enwiktionary-DATE-pages-articles.xml.bz2|gawk -v LANG=language -v ISO=iso-code -v REMOVE_WIKILINKS="y" -f trans-en-es.awk|sort -d -k 1,1 -t"{">OUTPUT-FILE

on MS-Windows[edit]

  1. Unzip enwiktionary-DATE-pages-articles.xml.bz2
  2. Run the following command from the DOS-window (language is the language name e.g. "Spanish", iso-code is the corresponding iso language code, e.g. "es" for Spanish, as they appear in the wiki-code of the translation sections):
    • gawk -v LANG=language -v ISO=iso-code -v REMOVE_WIKILINKS="y" -f trans-en-es.awk enwiktionary-DATE-pages-articles.xml> OUTPUT-FILE
  3. Optionally sort the OUTPUT-FILE with whatever program is at hand.

Code (translation sections):[edit]

# gawk script to create a Foreign_Language-English dictionary from 
# the Foreign_Language sections of en.wiktionary.org
#
# (c) 2011-2017 by Matthias Buchmeier
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#
# TODO:
# rm lines with {{past participle of| ?
# include IPA
# {{=}}
# Pronunciation sections inside POS
# proper treatment of {{indtr|
# optionally include non-lemma forms
#
#
# Command-line options:
########################
#  required gawk command-line switches:
#
#    name of the language to be extracted
#    -v LANG="language" 
#
#    iso-code of the language to be extracted
#    -v ISO="iso-code"
#
#  optional gawk command-line switches:
#
#    remove wiki-links and wiki-style bolding, italicizing:
#    -v REMOVE_WIKILINKS="y"
#
#########################

BEGIN {
#########################
# User defined variables:
#########################
# English name of the language to be extracted.
# Languages with a dedicated configuration below:
# Spanish, Italian, French, Finnish, Portuguese, Latin, German, Dutch.
# Any other language falls back to the generic defaults further down.
# default language:
lang="Spanish";
# default isocode
iso="es"
#
# -1 marks "not configured"; replaced by the generic default below
no_head=-1;
has_neuter=-1;
#
# command line parsing
#
if(LANG!="") lang = LANG;
if(ISO!="") iso = ISO;
#
###########################################
# language specific configuration: 
###########################################
#
# configuration for Spanish
if(lang=="Spanish") {
# iso code of the language
iso = "es";

# exclude entire current POS-subsection if regexp is matched
# this regexp should typically contain headline-templates of non-lemma (form of) entries
exclude_POS="\\{\\{es-verb-form|\\{\\{es-adj[^\\}]*\\|(m|masculine)\\=|\\{\\{head\\|es\\|(noun|adjective|verb) form|\\{\\{es-adj-form|\\{\\{head\\|es\\|(misspelling|obsolete)|\\{\\{misspelling of\\||\\{\\{es-past participle|\\{\\{es-pp\\||\\{\\{head\\|es\\|participle form|\\{\\{head\\|es\\|past participle form|\\{\\{head\\|es\\|present participle|\\{\\{head\\|es\\|noun plural form(\\||\\})";

# exclude the whole matched definition line
# this regexp typically contains form of definition-line templates
exclude_defn="\\{\\{(es-verb form of|rfdef|def|plural of|form of|inflection of|archaic form of)(\\||\\})";

# language specific templates to be removed from output lines
# keeps the rest of definition line
# regexp MUST MATCH THE TEMPLATE NAME!
rmtemplate="";

# regexp matching headline
# used to set gender from headline
nounhead="\\{\\{head\\|es\\|(noun|proper noun)\\||\\{\\{es-noun\\||\\{\\{es-proper noun\\|";

# regexp matching verb headline
# used to set transitive etc. from headline
verbhead="\\{\\{es-verb[\\|\\}]|\\{\\{head\\|es\\|verb[\\|\\}]";

# 1: a plain '''WORD''' headline marks adj/noun/verb sections as non-lemma
# 0: plain '''WORD''' headlines are ignored
no_head=0;

# set to 1 if the language has neuter gender. otherwise 0:
has_neuter=0;
}
#
if(lang=="Italian") {
iso = "it";
exclude_POS="\\{head\\|it\\|[^\\:\\|]* form[s]*[\\|\\}]|\\{\\{(head\\|it\\|(misspelling|obsolete|plural|g=)|it-pp)[\\|\\}]|\\{\\{head\\|it\\}|\\{\\{it-adj-form(\\||\\})";
exclude_defn="Compound of|\\{\\{(present participle of|past participle of|rfdef|defn|misspelling of|conjugation of|feminine plural past participle of|masculine plural past participle of|feminine past participle of|inflection of|masculine plural of|it-adj form of)|\\{\\{rfdef|\\{\\{misspelling of|\\{\\{uncommon spelling of|\\{\\{(plural of|form of|inflection of|archaic form of)(\\||\\})";
verbhead="\\{\\{it-verb[\\|\\}]|\\{\\{head\\|it\\|verb[\\|\\}]";
nounhead="\\{\\{head\\|it\\|noun\\||\\{\\{it-noun\\|";
no_head=1;
has_neuter=0;
}
#
if(lang=="French") {
iso = "fr";
exclude_POS="\\{\\{head\\|fr\\|[^\\|]* form[s]*[\\|\\}]|\\{\\{head\\|fr\\|(misspelling|obsolete|plural|present participle|g=)|\\{\\{(misspelling of|fr-pp|fr-verb-form|fr-verb form|fr-adj-form|fr-past participle)(\\||\\})|\\{\\{head\\|fr\\}";
exclude_defn="\\{\\{past participle of\\||Compound of|masculine plural past participle of|present participle of|feminine plural past participle of|masculine plural of|conjugation of|inflection of|plural of|feminine plural of|feminine past participle of|plural past participle of|\\{\\{(rfdef|defn|plural of|form of|inflection of|archaic form of)(\\||\\})";
verbhead="\\{\\{fr-verb[\\|\\}]|\\{\\{head\\|fr\\|verb[\\|\\}]";
nounhead="\\{\\{head\\|fr\\|(noun|proper noun)(\\||\\})|\\{\\{fr-noun\\||\\{\\{fr-proper noun(\\||\\})";
no_head=1;
has_neuter=0;
}
#
if(lang=="Finnish") {
iso = "fi";
exclude_POS="\\{\\{head\\|fi\\|(noun|adjective|verb|proper noun) form|\\{\\{head\\|fi\\|(misspelling|obsolete)|\\{\\{misspelling of\\||\\{\\{head\\|fi\\}|\\{\\{head\\|fi\\|infinitive(\\||\\})";
exclude_defn="\\{\\{(fi-form of|fi-participle of|infinitive of|inflected form of|agent noun of|fi-verb form of|defn|rfdef|nominative plural of|rftrans|plural of|form of|inflection of|archaic form of)(\\||\\})";
no_head=1;
# placeholder pattern: no noun headline handling is configured for Finnish
nounhead="XXXXXXXXX";
}
#
if(lang=="Portuguese") {
iso = "pt";
exclude_POS="\\{\\{head\\|pt\\|past participle form(\\||\\})";
exclude_defn="\\{\\{(pt-verb form of|pt-verb-form-of|conjugation of|misspelling of|pt-noun form of|pt-adj form of|feminine past participle of|feminine plural past participle of|masculine plural of|inflection of|pt-ordinal form|pt-adv form of|plural form of|pt-article form of|masculine plural past participle of|pt-cardinal form of|pt-apocopic-verb|rfdef|plural of|form of|inflection of|archaic form of)\\||\\{\\{head\\|pt\\|(verb|noun|adjective) form";
nounhead="\\{\\{head\\|pt\\|(noun|proper noun)\\||\\{\\{pt-noun\\||\\{\\{pt-proper noun\\|";
verbhead="\\{\\{pt-verb[\\|\\}]|\\{\\{head\\|pt\\|verb[\\|\\}]";
no_head=1;
has_neuter=0;
}
#
#
if(lang=="Latin") {
iso = "la";
exclude_POS="\\{\\{la-(verb|part|noun|proper noun|adj|gerund|num)-form(\\||\\})";
exclude_defn="\\{\\{(conjugation of|inflection of|(genitive|nominative|vocative|accusative) (singular|plural) of|la-verb-form|misspelling of|defn|combining form of|inflected form of|rfdef|rftrans|plural of|archaic form of)\\|";
has_neuter=1;
nounhead="\\{\\{head\\|la\\|(noun|proper noun)(\\||\\})|\\{\\{la-noun\\||\\{\\{la-proper noun(\\||\\})";
no_head=1;
}

if(lang=="German") {
iso = "de";
exclude_POS="\\{\\{head\\|de\\|(verb|noun|proper noun|adjective) form(\\||\\})";
exclude_defn="\\{\\{(conjugation of|inflection of|(genitive|nominative|vocative|accusative) (singular|plural) of|de-verb form of|misspelling of|defn|combining form of|inflected form of|rfdef|rftrans|plural of|archaic form of|de-inflected form of|form of|de-form-adj|past tense of|de-form-noun|genitive of|dative plural of|de-umlautless spelling of|accusative of|dative of|dative singular of|de-zu-infinitive of|de-du contraction|obsolete typography of|present participle of)(\\||\\})";
has_neuter=1;
nounhead="\\{\\{head\\|de\\|(noun|proper noun)\\||\\{\\{de-noun\\||\\{\\{de-proper noun\\|";
rmtemplate="gerund of";
no_head=1;
}
#
if(lang=="Dutch") {
iso = "nl";
exclude_POS="\\{\\{(nl-adj-form|nl-verb-form|head\\|nl\\|noun plural form)(\\||\\})";
exclude_defn="\\{\\{(nl-noun form of|nl-adj form of|nl-verb form of|misspelling of|form of|inflection of|archaic form of)(\\||\\})";
nounhead="\\{\\{head\\|nl\\|(noun|proper noun)\\||\\{\\{nl-noun\\||\\{\\{nl-proper noun\\|";
has_neuter=1;
no_head=1;
rmtemplate="g2";
}
#
#
# initialization of undefined lang-specific regexps
# ("XXXXXX" is a placeholder pattern standing for "nothing configured")
# regexp matching {{head|...|noun...
if(nounhead == "") nounhead="\\{\\{head\\|"iso"\\|noun";
if(verbhead=="") verbhead="XXXXXX";
if(rmtemplate=="") rmtemplate="XXXXXX";
if(no_head==-1) no_head=1;
if(has_neuter==-1) has_neuter=0;
if(exclude_POS=="") exclude_POS="XXXXXX";
if(exclude_defn=="") exclude_defn="\\{\\{(rfdef|rftrans|defn)\\|";

#
# initialization of variables used for parsing
#
# set to 0/1 if outside/inside language section 
langsect=0; 
# variable holding POS (part of speech) information 
# pos=="-" means the current POS is a non-lemma form to be excluded from the dictionary 
pos= ""; 
# variables holding additional grammatical information as gender, plural/singular etc ({mfncps}})
# from headline
gend="";
#from definitions
gend2="";
# variable holding page title
title=""; 
#
# command-line options
#
if(REMOVE_WIKILINKS == "y") remove_wikilinks = 1;
	else remove_wikilinks = 0;
#
# language dependent regular expressions
#
# regexp matching language section header
langhead="\\x3D\\x3D[\\x20]*"lang"[\\x20]*\\x3D\\x3D";
# regexp matching an explicit lemma category link for this language
warnmissing="[[][[]Category:"lang" (nouns|adjectives|verbs)[]][]]";

# headline=1 inside headlines =0 elsewhere
headline = 0; 

# mapping of iso-codes to language-names (used by the etyl case of replace_template)
# the two lists are parallel: the n-th code belongs to the n-th name
isocodes="en|grc|la|es|ru|pt|LL.|it|gem|cel|ga|eu|de|fr|sv|ar|cel-gae";
languages="English|Ancient Greek|Latin|Spanish|Russian|Portuguese|Late Latin|Italian|Germanic|Celtic|Irish|Basque|German|French|Swedish|Arabic|Goidelic";
# write isocodes and language-names into array
n_iso=split(isocodes,iso_array,"|");
split(languages,languages_array,"|");
for(i=1;i<=n_iso;i++) { language_names[iso_array[i]] = languages_array[i];
#print iso_array[i]" "language_names[iso_array[i]];
}

}
# end of BEGIN block
########################

########################
# function definitions:

function replace_template(tpar, n_unnamed,     outp, i, j, start)	{
# Returns the replacement string for one already-split template.
#
# Arguments:
#   tpar[0]                        template name
#   tpar[1], ..., tpar[n_unnamed]  unnamed (positional) parameters
#   tpar["name"]                   named parameters
#
# Globals read:    pos, headline, title, lang, template_number, rmtemplate,
#                  sob, scb (escaped single braces), iso_array, language_names, n_iso
# Globals written: gend (grammar info from headline), gend2 (from definition line),
#                  LHS_qualifier, term_label, template_number
#
# Templates that only carry grammatical information return "" and store
# their information in the globals above. Diagnostics are appended to
# the file FIXME-<lang>.txt.

# debug output
# for (j in tpar) print j, tpar[j];
# print tpar[0]; 
MAXGENDERS = 5;

# user-defined remove per language:
if(tpar[0] ~ rmtemplate)
	return "";

switch (tpar[0]) {

# qualifier -> [p1, p2, ...]
case /^(qualifier|i|italbrac|ib|qual|q|a|qf)$/:
	outp = tpar[1];
	for(i=2; i in tpar; i++) outp = outp ", " tpar[i];
	outp = linktotext(outp);
	return "[" outp "]";

# gloss-template -> ({{1}})
case /^(gloss|sense|gl)$/:
	outp = "(" tpar[1] ")";
	return outp;

# l-templates
# TODO: tr parameters
case /^(l|l-self|link|m|mention|m-self|ll|l\/.*)$/:
	if(tpar[2] ~ /\[\[/)
		outp = tpar[2];
	else {
		if(3 in tpar) outp = "[[" tpar[2] "|" tpar[3] "]]";
		else outp = "[[" tpar[2] "]]";
	}
	return outp;

# lb-template (tpar[1] is skipped, the labels start at tpar[2])
# TODO: senseid etc as first template
# TODO: join code with term-label?
case /^(lb|label|lbl|indtr)$/:
	j = 1;
	for(i=2; i in tpar; i++) {
		# verb labels are collected in gend2 instead of being printed
		if(pos=="v") {
			if(tpar[i] == "intransitive") { gend2 = (gend2 "i"); continue;}
			if(tpar[i] == "transitive") { gend2 = (gend2 "t"); continue;}
			if(tpar[i] == "ambitransitive") { gend2 = (gend2 "it"); continue;}
			if(tpar[i] == "reflexive") { gend2 = (gend2 "r"); continue;}
			if(tpar[i] == "pronominal") { gend2 = (gend2 "p"); continue;}
		}
		j++;
		if(j > 2) outp = outp ", ";
		outp = outp tpar[i];
	}
	# cleanup ", _,", ", and," etc
	gsub(/,[\ ]_,/, "", outp);
	outp = gensub(/,[\ ](and|or)[,]*[\ ]/, " \\1 ", "g", outp);
	gsub(/^(and|or)$/, "", outp);

	# there might be labels on the headline
	if((headline == 1)&&(pos=="v")) gend = gend gend2;
	outp = linktotext(outp);
	# labels of the first template on the line qualify the left hand side
	if(template_number == 1) {
		LHS_qualifier = LHS_qualifier outp;
		return "";
	}
	if(j==1) outp = "";
	else outp = "[" outp "]";
	return outp;

# labels on headline for entire pos
case /^(tlb|term-label|term-context|tcx)$/:
	if(headline != 1) {
		print "#ERROR: term-label on definition-line, title: \"" title "\", line: \"" $0 "\"" > ("FIXME-" lang ".txt");
		return "";
	}

	# tlb/term-label: labels start at tpar[2]; the other names start at tpar[1]
	if(tpar[0] ~ /^(tlb|term-label)$/) start = 2;
	else start = 1;
	j = 1;
	for(i=start; i in tpar; i++) {
		if(pos=="v") {
			if(tpar[i] == "intransitive") { gend = (gend "i"); continue;}
			if(tpar[i] == "transitive") { gend = (gend "t"); continue;}
			if(tpar[i] == "ambitransitive") { gend = (gend "it"); continue;}
			if(tpar[i] == "reflexive") { gend = (gend "r"); continue;}
			if(tpar[i] == "pronominal") { gend = (gend "p"); continue;}
		}
		j++;
		if(j > 2) outp = outp ", ";
		outp = outp tpar[i];
	}
	outp = linktotext(outp);
	# cleanup ", _,", ", and," etc
	gsub(/,[\ ]_,/, "", outp);
	outp = gensub(/,[\ ](and|or)[,]*[\ ]/, " \\1 ", "g", outp);
	gsub(/^(and|or)$/, "", outp);

	if(outp != "") {
		if(term_label != "") term_label = term_label ", ";
		term_label = term_label outp;
	}
	return "";

# templates to be removed
# (they do not count as "first template on the line" for the lb case)
case /^(attention|rfc-tbot|inv|rfr|rfscript|rftranslit|NNBS|RL|LR|\,|jump|rfv|rfex|rfgloss|attention|rfv-sense|defdate|gloss-stub|senseid|es-demonstrative-accent-usage|R[:].*|pos_n|rfdef|cite-web|cite|C|cite-book|rft-sense|cite|rfquote-sense|RFV-sense|rfc-sense|datedef)$/:
	template_number -= 1;
	return "";

# get gender from the head-template:
case "head":
	if(headline == 1) {
		#for(i in tpar) print i, tpar[i];
		if("g" in tpar) gend = gend sob tpar["g"] scb;
		if("g2" in tpar) gend = gend sob tpar["g2"] scb;
		if("g3" in tpar) gend = gend sob tpar["g3"] scb;
	}
	else print "#WARNING: misplaced head-template, title: \"" title "\", line: \"" $0 "\"" > ("FIXME-" lang ".txt");
	return "";

# the g-template: printed inline on definition lines, stored in gend on headlines
case "g":
	outp = "";
	for(i=1; i in tpar; i++) outp = outp sob tpar[i] scb;
	if(headline==0) return outp;
	else {gend = gend outp; return "";}

# obsolete term, vern etc {{1}} -> [[1]]
case /^(term|vern|specieslink)$/:
	return "[[" tpar[1] "]]";

# alternative, obsolete etc forms
case "altform":
	return "alternative form of [[" tpar[1] "]]";

case "altcaps":
	return "alternative case form of [[" tpar[1] "]]";

case /^(altspelling|alt-sp)$/:
	return "alternative spelling of [[" tpar[1] "]]";

# template-name [[{{1}}]] or template-name {{1}}
case /^(contraction of|dated form of|alternative capitalization of|informal spelling of|nonstandard spelling of|alternative spelling of|obsolete spelling of|alternative form of|alternate form of|feminine plural of|abbreviation of|acronym of|rare spelling of|archaic spelling of|singular of|obsolete form of|eye dialect of|agent noun of|initialism of|synonym of|alternate spelling of|rare form of|eye dialect|only used in|medieval spelling of|European Portuguese form of|past participle of|superseded spelling of|European Portuguese spelling of|euphemistic spelling of|gerund of|alternative term for|feminine noun of|plural form of|alternative form of|alternative case form of|alt form of|alt form|short of|common misspelling of|only in|pejorative of|attributive of|short for|euphemistic form of|present participle of|eye-dialect of|nonstandard form of|neuter singular of|masculine plural of|short form of|short for|feminine singular of|feminine of|neuter of|diminutive of|superlative of|comparative of|augmentative of|reflexive of|apocopic form of|obsolete form of|short form of|informal form of|dated spelling of|pronunciation spelling of|former name of|superseded form of|clipping of|praenominal abbreviation of|alternative typography of|supine of|nominalization of)$/:
	outp = tpar[0] " ";
	if(tpar[1] ~ /\[\[/)
		outp = outp tpar[1];
	else outp = outp "[[" tpar[1] "]]";
	return outp;

# template-name "of" [[{{1}}]]
case /^(synonym|abbreviation|clipping|abbreviation-old|altname)$/:
	if(tpar[1] ~ /\[\[/) outp = outp tpar[1];
	else outp = outp "[[" tpar[1] "]]";
	return tpar[0] " of " outp;

# templates replaced by first unnamed parameter
case /^(unsupported|w|non-gloss definition|n-g|taxlink|non gloss definition|non-gloss|non gloss|spelink|pedlink|def|IPAchar|def-date)$/:
	return tpar[1];

# templates to be replaced by "{templatename}"
case /^(impf|dual)$/:
	return sob tpar[0] scb;

# templates to be replaced by templatename
case /^(CE|BC|given name|surname|historical given name|AD|praenomen)$/:
	return tpar[0];

# &oth, &lit
case /^[&]amp[;](lit|oth)/:
	outp = "See:";
	for(i=1; i in tpar; i++)
		if(tpar[i] ~ /\[\[/) outp = outp tpar[i];
		else outp = outp " [[" tpar[i] "]]";
	return outp;

case "indeclinable":
	return sob "indecl" scb;

case "pf.":
	return sob "pf" scb;

# templates replaced by 2nd unnamed parameter, e.g. lang-template
case /^(lang|cog)$/:
	return tpar[2];

# construed with (a space separates the phrase from its argument)
case "construed with":
	return "construed with " tpar[1];

# obsolete plural
case "plural":
	return sob "p" scb;

# place-template
# TODO: proper link?
case /^place($|[:].+$)/:
	if(tpar[2] != "") outp = "[[" title "]] (" tpar[2] ")";
	else outp = "[[" title "]] (placename)";
	# print outp;
	return outp;

# etyl template: replace a known iso-code by its language name
case "etyl":
	outp = tpar[1];
	for(i=1; i<=n_iso; i++) {
		regexp = "^" iso_array[i] "$";
		repl = language_names[iso_array[i]];
		sub(regexp, repl, outp);
	}
	return outp;

case "nbsp":
	return " ";

#TODO: from fields
case "standard spelling of":
	return "alternative spelling of " tpar[1];

#############################
# language specific templates
#############################
# German
case "de-superseded spelling of":
	return "obsolete spelling of " tpar[1];

case "de-plural noun":
	if(headline == 1)
		if(1 in tpar) gend = gend sob tpar[1] "p" scb;
	return "";

# Portuguese:
case /^(pt-obsolete.*|pt-superseded.*)$/:
	return "obsolete spelling of " tpar[1];

case "pt-pron def":
	# words are separated by spaces so that the pieces do not run together
	if(tpar[2] ~ /[1-3]/)
		outp = tpar[2] ". person " tpar[3] ". " tpar[1] " pronoun";
	else outp = tpar[3] ". " tpar[4] ". form of " tpar[1] " " tpar[2];
	return outp;

case /^pt-pronoun-with-[nl]/:
	outp = "alternative form of [[";
	if(tpar[1] == "m") outp = outp "o";
	else outp = outp "a";
	# NOTE(review): tpar[1] cannot be "m" and "pl" at once, so "os" is never
	# produced; possibly another parameter was meant here - confirm against the template
	if(tpar[1] == "pl") outp = outp "s";
	outp = outp "]]";
	return outp;

case "+preo":
	outp = " [+ " tpar[2] " (object)";
	if(3 in tpar) outp = outp " = " tpar[3];
	if("means" in tpar) outp = outp " = " tpar["means"];
	outp = outp "]";
	return outp;

case "+obj":
	outp = " [+ " tpar[2];
	if(3 in tpar) outp = outp " " tpar[3];
	if(4 in tpar) outp = outp " " tpar[4];
	if("means" in tpar) outp = outp " = " tpar["means"];
	outp = outp "]";
	return outp;

# Spanish and others
case /^(es|it|pt|fr|de|nl)-noun$/:
	if(headline == 1)
		if(1 in tpar) gend = gend sob tpar[1] scb;
	return "";

# get gender from es-proper noun and others
# all three sources are only read on the headline
case /^(es|fr|pt|it|de|nl)-proper noun$/:
	if(headline == 1) {
		if(1 in tpar) gend = gend sob tpar[1] scb;
		if("g" in tpar) gend = gend sob tpar["g"] scb;
		if("g2" in tpar) gend = gend sob tpar["g2"] scb;
	}
	return "";

# Finnish
case "fi-infinitive of":
	return "infinitive of " tpar[1];

# Latin
case "NL.":
	return "New Latin";

# get gender from the head-template la-noun:
case "la-noun":
	if(headline == 1) {
		#for(i in tpar) print i, tpar[i];
		if(3 in tpar) gend = gend sob tpar[3] scb;
		if("g2" in tpar) gend = gend sob tpar["g2"] scb;
		if("g3" in tpar) gend = gend sob tpar["g3"] scb;
	}
	else print "#WARNING: head-template on defline, title: \"" title "\", pos: \"" pos "\", line: \"" $0 "\"" > ("FIXME-" lang ".txt");
	return "";

# get gender from la-proper noun
case "la-proper noun":
	if(headline == 1)
		if(3 in tpar) gend = gend sob tpar[3] scb;
	return "";

case "nl-pronadv of":
	return "pronominal adverb form of " tpar[1] " + " tpar[2];

case "uncertain":
	return sob "uncertain meaning" scb;

# names of Latin letters
case "Latn-def":
	outp = outp "letter";
	if(4 in tpar) outp = outp ": [[" tpar[4] "]]";
	if(5 in tpar) outp = outp ", [[" tpar[5] "]]";
	if(6 in tpar) outp = outp ", [[" tpar[6] "]]";
	if(7 in tpar) outp = outp ", [[" tpar[7] "]]";
	return outp;

# soplink: parameters without space, hyphen or slash are wikilinked
case "soplink":
	for(i=1; i in tpar; i++) {
		if (tpar[i] !~ /[\ \-\/]/) outp = outp "[[" tpar[i] "]]";
		else outp = outp tpar[i];
	}
	return outp;

case "PAGENAME":
	return title;

####################
# inflected forms:
####################


# unknown templates are deleted (with a warning unless on a headline)
default:
	if(headline != 1)
		print "#WARNING: deleted unknown template: {{" tpar[0] "}} in entry: \"" title "\" on line: \"" $0 "\"" > ("FIXME-" lang ".txt");
	return "";
}
}

#####################################
function parse_templates(input,         i, j, k, ta, sa, nt, ts, na, targs, tpar, rep, outp, eq)
{
# Replaces every innermost template {{...}} (one without braces inside)
# in the string "input" by the result of replace_template() and returns
# the resulting string.
# THIS FUNCTION HAS TO BE CALLED MULTIPLE TIMES FOR STRINGS WITH NESTED TEMPLATES
#
# Globals written: wlbar, sob, scb (escape markers, converted back by printout),
#                  template_number (index of the template within the line)

	# marker replacing bars inside wikilinks
	wlbar="_WLB_";
	# markers replacing single braces
	sob="_SOB_";
	scb="_SCB_";

	# escape single braces so that only double braces delimit templates
	input = gensub(/([^\{])(\{)([^\{])/, "\\1" sob "\\3", "g", input);
	input = gensub(/([^\}])(\})([^\}])/, "\\1" scb "\\3", "g", input);

	# is this necessary?
	delete ta; delete sa;

	# split input string into templates (ta[1, ..., n]) and nontemplate strings (sa[0, ..., n])
	nt = patsplit(input, ta, /\{\{[^}{]*\}\}/, sa);

	for(i=1; i<=nt; i=i+1) {
		ts = ta[i];
		# replace bars inside wikilinks with wlbar so they do not split arguments
		ts = gensub(/(\[\[[^\]]*)(\|)([^\]]*\]\])/, "\\1" wlbar "\\3", "g", ts);

		# split template args into array targs
		sub(/\{\{/, "", ts); sub(/\}\}/, "", ts);
		na = split(ts, targs, "|");

		k = 0; delete tpar;
		for(j=1; j<=na; j=j+1) {
			# name=value is split at the FIRST "=" only, so that values
			# containing "=" are kept complete; the template name itself
			# (j == 1) is always positional
			eq = index(targs[j], "=");
			if((j == 1) || (eq == 0)) { tpar[k] = targs[j]; k = k+1; }
			else tpar[substr(targs[j], 1, eq-1)] = substr(targs[j], eq+1);
		}
		# debug output
		# for (test in tpar) print test, tpar[test];
		# now call replace_template function which returns a replacement string for the template
		template_number = i;
		rep = replace_template(tpar, k-1);
		# print rep;
		ta[i] = rep;
	}

	# reassemble: text before the first template, then replacement/text pairs
	outp = "";
	if(0 in sa) outp = sa[0];
	for(i=1; i<=nt; i=i+1) {
		outp = outp ta[i];
		if(i in sa) outp = outp sa[i];
	}
	return outp;
}

#####################################
function printout(out,     ref_at, rest, close_at) {
# Formats one finished dictionary line and prints it to standard output.
#   out: the line to print (only the first argument is passed by callers;
#        the remaining names are local variables)
# Reads the globals wlbar, sob, scb (escape markers set by parse_templates)
# and remove_wikilinks.

# remove XML code at the end of last line
	gsub(/<\/text>/,"",out);

# remove dots at end of line	
	gsub(/\.[\ ]*$/,"",out);

# convert back escaped special characters (template-parsing)
	gsub(wlbar, "|", out); gsub(sob, "{", out); gsub(scb, "}", out);

# convert special xml formatting like &lt; to html
	gsub(/&lt;/,"<",out);
	gsub(/&gt;/,">",out);
	gsub(/&amp;/,"\\&",out);
	gsub(/&quot;/,"\"",out);
	gsub(/&nbsp;/, " ", out);
	gsub(/&hellip;/, "...", out);
	gsub(/&[mn]dash;/, "-", out);
	gsub(/&thinsp;/, "", out);
	gsub(/&minus;/, "-", out);

# NOTE: these must be done after converting '&lt;' -> '<'  and '&gt;' -> '>'
# remove self-closing <ref ... />
	gsub(/<ref[^>]*\/>/,"",out);

# remove each <ref [name=....]> blabla </ref> pair separately.
# A single greedy regexp would also delete the text between two references,
# therefore every opening tag is paired with the nearest closing tag.
	while(match(out, /<ref[^>]*>/)) {
		ref_at = RSTART;
		rest = substr(out, RSTART + RLENGTH);
		close_at = index(rest, "</ref>");
		# unterminated reference: leave the remainder untouched
		if(close_at == 0) break;
		out = substr(out, 1, ref_at - 1) substr(rest, close_at + length("</ref>"));
	}

# remove one-line <!-- commented text -->
	gsub(/<!--[^>]*-->/,"",out); 

# remove extra spaces
	gsub(/[\ ]+/, " ", out);

# remove remaining "<!--" (will prevent display of wikifile)
	gsub(/<!--/,"", out);

	if(remove_wikilinks==1) {
#		wikilinks and italicizing, bolding
		out = gensub(/([[][[])([^]|]*\|)([^]]*)([]][]])/ , "\\3", "g", out);
		out = gensub(/([[][[])([^]]*)([]][]])/ , "\\2", "g", out);
		gsub(/['][']+/, "", out);

#		<sub> and <sup>
		gsub(/<sup>/, "^", out);  gsub(/<\/sup>/, "", out);
		gsub(/<sub>/, "", out);  gsub(/<\/sub>/, "", out);

#		<nowiki>
		gsub(/<nowiki>/, "", out); gsub(/<\/nowiki>/, "", out);
	}
	print out;
}

function linktotext(text) {
# Returns "text" with wikilinks reduced to their visible text:
# [[target|shown]] -> shown, [[target]] -> target.
# Escaped link bars (_WLB_) are turned back into "|" first so that
# piped links are recognized.
	gsub(/_WLB_/, "|", text);
	# piped links have to be handled before plain links
	text = gensub(/([[][[])([^]|]*\|)([^]]*)([]][]])/ , "\\3", "g", text);
	return gensub(/([[][[])([^]]*)([]][]])/ , "\\2", "g", text);
}

######################################
# 	Main program
######################################

# <title> line: start of a new page.
# Strips everything around the title text and resets the per-entry state.
# term_label is reset as well, so that a label collected on the previous
# page cannot leak into definitions of this page.
/\x3Ctitle/ { 
	gsub(/^[^\x3C]*/, ""); gsub(/[^\x3E]*$/, ""); gsub(/\x3Ctitle\x3E/, ""); gsub(/\x3C\/title\x3E/, "");  
	title=$0; 
	#print title;
	langsect=0; pos=""; gend=""; gend2=""; term_label="";
}

# discard non-useful lines (speedup and false "trans-see" lines from comment lines)
/<comment>|<\/?page>|<timestamp>|<id>|<\/?contributor>|<\/?revision>|<username>|<minor \/>/  {next;}
/^$/ {next;}

# skip pages whose title contains one of these namespace prefixes;
# title is cleared so that no output is produced for the rest of the page
{
	if(title ~ /(Wiktionary|Template|Index|Appendix|User|Help):/) {title=""; next;}
}

# header of the section of the language to be extracted
$0 ~ langhead { 
	langsect=1; pos=""; gend=""; gend2=""; term_label="";
	#print lang, ": ", title; 
	next;
}

# any other level-2 header ends the language section
/^\x3D\x3D[^\x3D]+/ { langsect=0; pos=""; gend=""; gend2=""; term_label=""; next;}

# language and title detection done; skip all lines if not inside LANG section
{if(langsect==0) next;}

# determine POS
# determine POS (part of speech) from the subsection header.
# \x3D is "=" and \x20 is a space, i.e. the rules match "===Noun===" etc.
# The first rule resets the per-POS state on EVERY line containing "===";
# it has no "next", so the specific rules below still see the line.
# Each specific rule sets pos and ends processing of the header line.
# A header matching none of them leaves pos empty.
/\x3D\x3D\x3D/ { pos=""; gend=""; gend2=""; term_label=""; }
/\x3D\x3D\x3D[\x20]*Noun[\x20]*[1-9]*\x3D\x3D\x3D/ { pos="n"; next;}
# stricter variant of the verb rule, disabled:
#/\x3D\x3D\x3D[\x20]*Verb[\x20]*\x3D\x3D\x3D/ { pos="v"; next;}
# matches every header starting with "Verb"
/\x3D\x3D\x3D[\x20]*Verb/ { pos="v"; next;}
/\x3D\x3D\x3D[\x20]*Adjective[\x20]*\x3D\x3D\x3D/ { pos="adj"; next;}
/\x3D\x3D\x3D[\x20]*Adverb[\x20]*\x3D\x3D\x3D/ { pos="adv"; next;}
/\x3D\x3D\x3D[\x20]*Interjection[\x20]*\x3D\x3D\x3D/ { pos="interj"; next;}
/\x3D\x3D\x3D[\x20]*Article[\x20]*\x3D\x3D\x3D/ { pos="art"; next;}
/\x3D\x3D\x3D[\x20]*Proper\x20noun[\x20]*[1-9]*\x3D\x3D\x3D/ { pos="prop"; next;}
/\x3D\x3D\x3D[\x20]*Preposition[\x20]*\x3D\x3D\x3D/ { pos="prep"; next;}
/\x3D\x3D\x3D[\x20]*Postposition[\x20]*\x3D\x3D\x3D/ { pos="postp"; next;}
/\x3D\x3D\x3D[\x20]*\{\{initialism/ { pos="initialism"; next;}
/\x3D\x3D\x3D[\x20]*Numeral[\x20]*\x3D\x3D\x3D/ { pos="num"; next;}
/\x3D\x3D\x3D[\x20]*Cardinal num(ber|eral)[\x20]*\x3D\x3D\x3D/ { pos="cardinal num"; next;}
/\x3D\x3D\x3D[\x20]*Ordinal (number|numeral)[\x20]*\x3D\x3D\x3D/ { pos="ordinal num"; next;}
/\x3D\x3D\x3D[\x20]*Number[\x20]*\x3D\x3D\x3D/ { pos="num"; next;}
/\x3D\x3D\x3D[\x20]*\{\{acronym/ { pos="acronym"; next;}
/\x3D\x3D\x3D[\x20]*Acronym/ { pos="acronym"; next;}
/\x3D\x3D\x3D[\x20]*\{\{abbreviation/ { pos="abbr"; next;}
/\x3D\x3D\x3D[\x20]*Determiner[\x20]*\x3D\x3D\x3D/ { pos="determiner"; next;}
/\x3D\x3D\x3D[\x20]*Phrase[\x20]*\x3D\x3D\x3D/ { pos="phrase"; next;}
/\x3D\x3D\x3D[\x20]*Suffix[\x20]*\x3D\x3D\x3D/ { pos="suffix"; next;}
/\x3D\x3D\x3D[\x20]*Pronoun[\x20]*\x3D\x3D\x3D/ { pos="pron"; next;}
/\x3D\x3D\x3D[\x20]*Conjunction[\x20]*\x3D\x3D\x3D/ { pos="conj"; next;}
/\x3D\x3D\x3D[\x20]*Proverb[\x20]*\x3D\x3D\x3D/ { pos="proverb"; next;}
/\x3D\x3D\x3D[\x20]*Contraction[\x20]*\x3D\x3D\x3D/ { pos="contraction"; next;}
/\x3D\x3D\x3D[\x20]*Particle[\x20]*\x3D\x3D\x3D/ { pos="particle"; next;}
/\x3D\x3D\x3D[\x20]*Symbol[\x20]*\x3D\x3D\x3D/ { pos="symbol"; next;}
/\x3D\x3D\x3D[\x20]*Prefix[\x20]*\x3D\x3D\x3D/ { pos="prefix"; next;}
/\x3D\x3D\x3D[\x20]*Letter[\x20]*\x3D\x3D\x3D/ { pos="letter"; next;}
/\x3D\x3D\x3D[\x20]*Abbreviation[\x20]*\x3D\x3D\x3D/ { pos="abbr"; next;}
/\x3D\x3D\x3D[\x20]*Initialism[\x20]*\x3D\x3D\x3D/ { pos="initialism"; next;}
/\x3D\x3D\x3D[\x20]*Idiom[\x20]*\x3D\x3D\x3D/ { pos="idiom"; next;}
/\x3D\x3D\x3D[\x20]*Affix[\x20]*\x3D\x3D\x3D/ { pos="affix"; next;}
# phrases are filed under the POS of their head word
/\x3D\x3D\x3D[\x20]*Adverbial phrase[\x20]*\x3D\x3D\x3D/ { pos="adv"; next;}
/\x3D\x3D\x3D[\x20]*Prepositional phrase[\x20]*\x3D\x3D\x3D/ { pos="prep"; next;}
# participles and gerunds are filed as verbs
/\x3D\x3D\x3D[\x20]*Participle[\x20]*\x3D\x3D\x3D/ { pos="v"; next;}
/\x3D\x3D\x3D[\x20]*Ambiposition[\x20]*\x3D\x3D\x3D/ { pos="ambip"; next;}
/\x3D\x3D\x3D[\x20]*Gerund[\x20]*\x3D\x3D\x3D/ { pos="v"; next;}
/\x3D\x3D\x3D[\x20]*Circumposition[\x20]*\x3D/ { pos="circump"; next;}
/\x3D\x3D\x3D[\x20]*Circumfix[\x20]*\x3D/ { pos="circumfix"; next;}
/\x3D\x3D\x3D[\x20]*Interfix[\x20]*\x3D/ { pos="interfix"; next;}

#
# Usage notes do not contain definitions: pos="-" excludes the section
/\x3D\x3D\x3D[\x20]*Usage notes[\x20]*\x3D\x3D\x3D/ { pos="-"; next;}

# These are supposed to be examples: ommit
# Lines containing "#:" or "#*" are treated as examples, not definitions: skip them.
/\x23\:|\x23\*/ {next;}

# Plain '''WORD''' headline, i.e. an entry without head-line template.
# With no_head=1 (languages using {{head|iso|... form}} only on lemma
# entries) an adjective, noun or verb section whose headline is the bolded
# page title is marked as excluded (pos="-"). The line is always skipped.
/^[']['][']/ {
	if((no_head==1) && (index($0, "'''"title"'''") != 0) && ((pos=="adj")||(pos=="n")||(pos=="v")))
		pos="-";
	next;
}

# form-of headline templates: exclude the current POS section
$0 ~ exclude_POS { pos="-"; next; }

# Noun headline: collect the gender in gend.
# The gender comes from the g, g2, g3 parameters of the head-template,
# e.g. {{head|blabla|g=m|g2=f}}, or from a g-template following it,
# e.g. {{head|blabla}} {{g|m|f}}; replace_template stores it in gend
# while headline is 1. Headline lines outside noun sections are skipped.
$0 ~ nounhead {
	if((pos!="n") && (pos!="prop")) next;

	gend = "";
	headline = 1;
	HD = parse_templates($0);
	# second pass in case the headline contains nested templates
	parse_templates(HD);
	headline = 0;
	next;
}

# Verb headline: templates are parsed first (two passes for nested
# templates), afterwards the remaining text is searched for verbatim labels.
$0 ~ verbhead {
	gend = "";
	headline = 1;
	HD = parse_templates(parse_templates($0));

	if(HD ~ /intransitive/) gend = (gend "i");
	# "transitive" that is part of neither "intransitive" nor "ambitransitive"
	if(HD ~ /[^ni]transitive/) gend = (gend "t");
	if(HD ~ /ambitransitive/) gend = (gend "it");
	if(HD ~ /reflexive/) gend = (gend "r");
	if(HD ~ /pronominal/) gend = (gend "p");

	headline = 0;
	next;
}

# Term-labels on any other non-definition line: parsed as a headline so
# that replace_template collects the labels. No "next": the line is still
# seen by the rules below.
/^[^#].*\{\{(term-label|tlb|term-context|tcx)/ {
	headline = 1;
	HD = parse_templates(parse_templates($0));
	headline = 0;
}

# form-of definition lines are dropped entirely
$0 ~ exclude_defn { next; }

#/^[\x20]*\x23\x23/ 	{ 
#if((langsect==1)&&(pos != "-")&&(title!=""))
#	print "#WARNING nested definition line \"" $0 "\" on [["title"]]" >"FIXME-"lang".txt";
#next;
#}

# main section: format output lines
## exclude nested definitions
#/^[\x20]*\x23/ 	{ 
## include nested definitions
# Definition lines (lines starting with one or more "#", nested ones included).
# Builds one dictionary line "[[title]] {pos-info} [qualifier] :: definition"
# and hands it to printout(). Nothing is printed for excluded POS sections
# (pos=="-") or skipped pages (title=="").
/^[\x20]*\x23+/ 	{ 
	if((langsect==1)&&(pos != "-")&&(title!=""))
	{
		DL = $0;

		# context -> label
		gsub(/\{\{(cx|context|tcx|term-context)\|/, "{{lb|en|", DL);

		# escape some special characters (template-parsing)
		# convert |_| -> " "
		# gsub(/\|_\|/, " ", DL);

		# NOTE(review): the original comment speaks of {{=}}, but \x23 is "#",
		# so this replaces {{#}} - confirm which template was meant
		gsub(/\{\{\x23\}\}/, " ", DL);
		gsub(/\{\{[!]\}\}/, "_WLB_", DL);

		############################
		# now replace the templates:
		############################
		gend2="";
		LHS_qualifier = "";

		# every pass resolves the innermost templates only
		MAXNESTING = 3;
		for(i=1; i<= MAXNESTING ; i = i+1)
		{
			DL = parse_templates(DL);
			# print DL;
			if(DL !~ /\{\{/) break;
		}

		if(DL ~ /\{\{|\}\}/) {
			print "#WARNING: at entry: \"" title "\": skipping badly formated input line: \"" $0 "\" or maybe to much template nesting, try to increase the \"MAXNESTING\" variable" > ("FIXME-" lang ".txt");
			next;
		}

		# remove "#" (\x23) and space
		#gsub(/^[\x20]*\x23[\x20]*/,"",DL);			
		gsub(/^[\x20]*\x23+[\x20]*/,"",DL);

		# remove XML code at the end of last line
		gsub(/<\/text>/,"",DL);

		# remove leading punctuation
		gsub(/^[\ \.,;\:]+/,"",DL);

		# discard empty definition lines:
		if(DL ~ /^[\ .\{\}\(\)\[\];\:]*$/) next;

		# now formatting LHS:
		pos2="";
		if(pos == "n")
		{
			if(gend == "") {
				# no gender known; "{n}" would mean neuter in languages having one
				if(has_neuter==1) pos2 = "{noun}";
				else pos2 = "{n}";
			}
			else pos2 = gend;
		}
		if(pos=="prop") {
			pos2 = "{" pos "}";
			if(gend != "") pos2 = pos2 " " gend;
		}
		if(pos=="v") {
			# labels of the definition line take precedence over the headline
			if(gend2 == "")
				pos2 = "{" pos gend "}";
			else pos2 = "{" pos gend2 "}";
		}

		# other cases
		if(pos2 == "") pos2 = "{" pos "}";

		# clean up pos2
		gsub(sob, "{", pos2); gsub(scb, "}", pos2);
		sub(/\{m-f\}/, "{mf}", pos2); 
		sub(/\{m\}\{f\}/, "{mf}", pos2);
		gsub(/\}\{/, "} {", pos2);
		pos2 = gensub(/([mfnc])-([sp])/, "\\1\\2", "g", pos2); 

		# now build left hand side
		LHS = sprintf("[[%s]] %s",title,pos2);

		# Latin reconstructed forms: "Reconstruction:Latin/word" -> "*word"
		# (has to be done after LHS has been built from the title)
		if(lang=="Latin") sub(/Reconstruction[:]Latin\//, "*", LHS);

		# format LHS_qualifier and term_label:
		if(term_label != "") {
			if(LHS_qualifier != "") 
				LHS_qualifier = term_label ", " LHS_qualifier;
			else LHS_qualifier = term_label;
		}
		if(LHS_qualifier != "") LHS = LHS " ["  LHS_qualifier "]";

		# the output line:
		outp = LHS " :: " DL;

		# wikilink cleanup:
		###################
		gsub(/\#English/,"",outp);
		gsub(/\[\[\|/,"[[",outp);

		# rm #blabla from link inside square brackets
		# first [[#bla|word]] -> [[title|word]] then other cases
		outp=gensub(/(\[\[[#][^\|\]]*)(\|[^\]]*\]\])/, "[["title"\\2", "g", outp);
		outp=gensub(/(\[\[[^\|\]]*[^ ])([ ]*[#][^\|\]]*)(\|[^\]]*\]\])/, "\\1\\3", "g", outp);

		printout(outp);
		if (pos == "") print "#UNKNOWN POS on page ",title ", line: " $0 > ("FIXME-" lang ".txt");
	}
}

$0 ~ warnmissing { 
# The page carries an explicit lemma category (line matches "warnmissing")
# although the current POS is excluded (pos is "-") and no_head is 1:
# write a warning to the FIXME file of the language.
if(pos=="-") {
	if(no_head==1)
		print "#WARNING missing head template on page [["title"]]" >"FIXME-"lang".txt";
	}
}

Dictionaries from non-English language sections[edit]

This is a gawk script to create wikified bilingual dictionaries from the foreign language (FL) sections of the database dump.

Usage:[edit]

bzcat enwiktionary-DATE-pages-articles.xml.bz2|gawk -v LANG=foreign-language -v REMOVE_WIKILINKS="y" -f trans-FL-en.awk|sort -d -k 1,1 -t"{">OUTPUT-FILE

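  • Currently supported foreign languages: Italian, Spanish, French, Finnish, Portuguese, German, Latin, Dutch (plus the phony targets Italian_with_forms and French_with_forms, which keep inflected forms)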

Code (non-English entries):[edit]

# gawk script to create a Language-English dictionary
# from the Foreign-Language sections of en.wiktionary.org
# Version: 20170613
#
# (c) 2011-2017 by Matthias Buchmeier
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#
# TODO:
# rm lines with {{past participle of| ?
# include IPA
# Pronunciation sections inside POS
# proper treatment of {{indtr|
# optionally include non-lemma forms: currently Italian
#
#
# Command-line options:
########################
#  required gawk command-line switches:
#
#    name of the language to be extracted
#    -v LANG="language" 
#
#    iso-code of the language to be extracted
#    -v ISO="iso-code"
#
#  optional gawk command-line switches:
#
#    remove wiki-links and wiki-style bolding, italicizing:
#    -v REMOVE_WIKILINKS="y"
#
#########################

BEGIN {
# Start-up configuration, run once before any input is read:
#   1. defaults and command-line overrides (LANG, ISO, REMOVE_WIKILINKS)
#   2. per-language regexps (exclude_POS, exclude_defn, nounhead, verbhead,
#      rmtemplate) and flags (rm_headless_pos, has_neuter)
#   3. fall-back values for everything a language block left unset
#   4. parser state variables
#   5. lookup tables: iso-code -> language name, form-of shortcut -> text,
#      template-name shortcut -> full template name
#########################
# User defined variables:
#########################
# English names of the target language
# supported at the moment: 
# Italian, French, Spanish, Finnish, Portuguese, German, Latin, Dutch
# phony target languages for dictionaries with inflected forms: 
# Italian_with_forms, French_with_forms
# default language:
lang="Spanish";
# default isocode
iso="es"
#
# -1 means "not configured"; replaced by 0 further down if no language block sets it
rm_headless_pos=-1;
#
# command line parsing
#
if(LANG!="") lang = LANG;
if(ISO!="") iso = ISO;
#
# debug output filename:
# (built before the *_with_forms blocks below rename lang)
fixme = "FIXME-"lang".txt";
#
###########################################
# language specific configuration: 
###########################################
#
# configuration for Spanish
if(lang=="Spanish") {
# iso code of the language
iso = "es";

# exclude entire current POS-subsection if regexp is matched
# this regexp should typically contain headline-templates of non-lemma (form of) entries
exclude_POS="\\{\\{es-adj[^\\}]*\\|(m|masculine)\\=|\\{\\{(head\\|es\\|(participle form|past participle form|present participle|noun plural form|(noun|adjective|verb) form|misspelling|obsolete)|es-adj-form|es-verb-form)(\\||\\})";

# exclude the whole matched definition line
# this regexp typically contains form of definition-line templates
exclude_defn="\\{\\{(es-verb form of|rfdef|defn|form of|inflection of|archaic form of|misspelling of|es-compound of)(\\||\\})";

# language specific templates to be removed from output lines
# keeps the rest of definition line
# regexp must match TEMPLATE NAME!! not the template
rmtemplate="";

# regexp matching headline
# used to set gender from headline
nounhead="\\{\\{head\\|es\\|(noun|proper noun)\\||\\{\\{es-noun\\||\\{\\{es-proper noun\\|";

# regexp matching verb headline
# used to set transitive etc. from headline
verbhead="\\{\\{es-verb[\\|\\}]|\\{\\{head\\|es\\|verb[\\|\\}]";

# discard entries without head-line template if rm_headless_pos=1:
# rm_headless_pos = 1 is the only effective way to filter form of entries without headline template and definition-line form-of template
rm_headless_pos=0;

# set to 1 if the language has neuter gender. otherwise 0:
has_neuter=0;
}
#
# configuration for Italian (same variables as documented for Spanish)
if(lang=="Italian") {
iso = "it";
exclude_POS="\\{head\\|it\\|[^\\:\\|]* form[s]*[\\|\\}]|\\{\\{(head\\|it\\|(misspelling|obsolete|plural|g=)|it-pp|it-adj-form)[\\|\\}]|\\{\\{head\\|it\\}";
exclude_defn="Compound of|\\{\\{(rfdef|defn|misspelling of|uncommon spelling of|archaic form of|conjugation of|inflection of|form of|feminine (singular|plural) past participle of|masculine plural past participle of|feminine past participle of|(masculine|feminine) plural of|it-adj form of|gerund of)(\\||\\})";
verbhead="\\{\\{it-verb[\\|\\}]|\\{\\{head\\|it\\|verb[\\|\\}]";
nounhead="\\{\\{head\\|it\\|noun\\||\\{\\{it-noun\\|";
rm_headless_pos=1;
# still many form-of entries without headline-template:
has_neuter=0;
}
#
# configuration for French
if(lang=="French") {
iso = "fr";
exclude_POS="\\{\\{head\\|fr\\|[^\\|]* form[s]*[\\|\\}]|\\{\\{head\\|fr\\|(misspelling|obsolete|plural|present participle|g=)|\\{\\{(misspelling of|fr-pp|fr-verb-form|fr-verb form|fr-adj-form|fr-past participle)(\\||\\})|\\{\\{head\\|fr\\}";
exclude_defn="\\{\\{past participle of\\||Compound of|masculine plural past participle of|present participle of|feminine plural past participle of|masculine plural of|conjugation of|inflection of|plural of|feminine plural of|feminine past participle of|plural past participle of|\\{\\{(rfdef|defn|plural of|form of|inflection of|archaic form of)(\\||\\})";
verbhead="\\{\\{fr-verb[\\|\\}]|\\{\\{head\\|fr\\|verb[\\|\\}]";
nounhead="\\{\\{head\\|fr\\|(noun|proper noun)(\\||\\})|\\{\\{fr-noun\\||\\{\\{fr-proper noun(\\||\\})";
# no entries without headline template
rm_headless_pos=0;
has_neuter=0;
}
#
# configuration for Finnish
if(lang=="Finnish") {
iso = "fi";
exclude_POS="\\{\\{head\\|fi\\|(noun|adjective|verb|proper noun) form|\\{\\{head\\|fi\\|(misspelling|obsolete)|\\{\\{misspelling of\\||\\{\\{head\\|fi\\}|\\{\\{head\\|fi\\|infinitive(\\||\\})";
exclude_defn="\\{\\{(fi-form of|fi-participle of|infinitive of|inflected form of|agent noun of|fi-verb form of|defn|rfdef|nominative plural of|rftrans|plural of|form of|inflection of|archaic form of)(\\||\\})";
rm_headless_pos=0;
# placeholder string instead of a real noun-headline regexp
nounhead="XXXXXXXXX";
has_neuter=0;
}
#
# configuration for Portuguese
if(lang=="Portuguese") {
iso = "pt";
exclude_POS="\\{\\{head\\|pt\\|past participle form(\\||\\})"
exclude_defn="\\{\\{(pt-verb form of|pt-verb-form-of|conjugation of|misspelling of|pt-noun form of|pt-adj form of|feminine past participle of|feminine plural past participle of|masculine plural of|inflection of|pt-ordinal form|pt-adv form of|plural form of|pt-article form of|masculine plural past participle of|pt-cardinal form of|pt-apocopic-verb|rfdef|plural of|form of|inflection of|archaic form of)\\||\\{\\{head\\|pt\\|(verb|noun|adjective) form";
nounhead="\\{\\{head\\|pt\\|(noun|proper noun)\\||\\{\\{pt-noun\\||\\{\\{pt-proper noun\\|";
verbhead="\\{\\{pt-verb[\\|\\}]|\\{\\{head\\|pt\\|verb[\\|\\}]";
rm_headless_pos=1;
has_neuter=0;
}
#
#
# configuration for Latin
if(lang=="Latin") {
iso = "la";
exclude_POS="\\{\\{la-(verb|part|noun|proper noun|adj|gerund|num)-form(\\||\\})";
exclude_defn="\\{\\{(conjugation of|inflection of|(genitive|nominative|vocative|accusative) (singular|plural) of|la-verb-form|misspelling of|defn|combining form of|inflected form of|rfdef|rftrans|plural of|archaic form of)\\|";
has_neuter=1;
nounhead="\\{\\{head\\|la\\|(noun|proper noun)(\\||\\})|\\{\\{la-noun\\||\\{\\{la-proper noun(\\||\\})";
rm_headless_pos=1;
}

# configuration for German
if(lang=="German") {
iso = "de";
exclude_POS="\\{\\{head\\|de\\|(verb|noun|proper noun|adjective) form(\\||\\})";
exclude_defn="\\{\\{(conjugation of|inflection of|(genitive|nominative|vocative|accusative) (singular|plural) of|de-verb form of|misspelling of|defn|combining form of|inflected form of|rfdef|rftrans|plural of|archaic form of|de-inflected form of|form of|de-form-adj|past tense of|de-form-noun|genitive of|dative plural of|de-umlautless spelling of|accusative of|dative of|dative singular of|de-zu-infinitive of|de-du contraction|obsolete typography of|present participle of)(\\||\\})";
has_neuter=1;
nounhead="\\{\\{head\\|de\\|(noun|proper noun)\\||\\{\\{de-noun\\||\\{\\{de-proper noun\\||\\{\\{de-plural noun\\|";
rmtemplate="gerund of";
# almost no entries without headline template (06-2017)
rm_headless_pos=0;
}
#
# configuration for Dutch
if(lang=="Dutch") {
iso = "nl";
exclude_POS="\\{\\{(nl-adj-form|nl-verb-form|head\\|nl\\|noun plural form)(\\||\\})";
exclude_defn="\\{\\{(nl-noun form of|nl-adj form of|nl-verb form of|misspelling of|form of|inflection of|archaic form of)(\\||\\})";
nounhead="\\{\\{head\\|nl\\|(noun|proper noun)\\||\\{\\{nl-noun\\||\\{\\{nl-proper noun\\|";
has_neuter=1;
rm_headless_pos=1;
rmtemplate="g2";
}
#
# phony language: Italian including inflected forms.
# lang is renamed to "Italian" here, after the plain Italian block above has
# been passed, so exclude_POS/exclude_defn keep their generic fall-backs
if(lang=="Italian_with_forms") {
iso = "it";
lang="Italian";
verbhead="\\{\\{it-verb[\\|\\}]|\\{\\{head\\|it\\|verb[\\|\\}]";
nounhead="\\{\\{head\\|it\\|noun\\||\\{\\{it-noun\\|";
rm_headless_pos = 0;
has_neuter=0;
}
#
# phony language: French including inflected forms (same scheme as above)
if(lang=="French_with_forms") {
iso = "fr";
lang="French";
verbhead="\\{\\{fr-verb[\\|\\}]|\\{\\{head\\|fr\\|verb[\\|\\}]";
nounhead="\\{\\{head\\|fr\\|(noun|proper noun)(\\||\\})|\\{\\{fr-noun\\||\\{\\{fr-proper noun(\\||\\})";
rm_headless_pos = 0;
has_neuter = 0;
}
#
# initialization of undefined lang-specific regexps
# ("XXXXXX" is used as a placeholder regexp for unset values)
# regexp matching {{head|...|noun...
if(nounhead == "") nounhead="\\{\\{head\\|"iso"\\|noun";
if(verbhead=="") verbhead="XXXXXX";
if(rmtemplate=="") rmtemplate="XXXXXX";
if(rm_headless_pos==-1) rm_headless_pos=0;
# NOTE(review): has_neuter is never initialised to -1 in this block, so this test looks dead -- confirm
if(has_neuter==-1) has_neuter=0;
if(exclude_POS=="") exclude_POS="XXXXXX";
if(exclude_defn=="") exclude_defn="\\{\\{(rfdef|rftrans|defn|misspelling of|archaic form of)\\|";

#
# initialization of variables used for parsing
#
# set to 0/1 if outside/inside language section 
langsect=0; 
# variable holding POS (part of speech) information 
# pos=="-" means the current POS is a non-lemma form to be excluded from the dictionary 
pos= ""; 
# variables holding additional grammatical information as gender, plural/singular etc.
# from headline-templates
gend="";
# from definition-lines
gend2="";
# variable holding page title
title=""; 
# headline=1 inside headlines =0 elsewhere
headline = 0;
#
# command-line options
#
if(REMOVE_WIKILINKS == "y") remove_wikilinks = 1;
	else remove_wikilinks = 0;
#
# language dependent regular expressions
#
# regexp matching language section header
# (\x3D is "=", \x20 is a blank: "==" lang "==" with optional blanks around the name)
langhead="\\x3D\\x3D[\\x20]*"lang"[\\x20]*\\x3D\\x3D";
# 
# regexp matching an explicit lemma category link, e.g. [[Category:<lang> nouns]]
warnmissing="[[][[]Category:"lang" (nouns|adjectives|verbs)[]][]]";

# mapping of iso-codes to language-names (no longer supported by templates)
isocodes="en|grc|la|es|ru|pt|LL.|it|gem|cel|ga|eu|de|fr|sv|ar|cel-gae";
languages="English|Ancient Greek|Latin|Spanish|Russian|Portuguese|Late Latin|Italian|Germanic|Celtic|Irish|Basque|German|French|Swedish|Arabic|Goidelic";
# write isocodes and language-names into array
n_iso=split(isocodes,iso_array,"|");
split(languages,languages_array,"|");
for(i=1;i<=n_iso;i++) { language_names[iso_array[i]] = languages_array[i];
#print iso_array[i]" "language_names[iso_array[i]];
}
#
# mapping of shortcuts used in form of templates:
# shortcuts:
shortcuts="1|2|3|impers|s|sg|p|pl|d|col|m|f|n|c|pres|past|fut|futr|prog|pret|pret|perf|impf|imperf|plup|pluperf|phis|imp|impr|ind|indc|indic|sub|subj|cond|dat|acc|actv|act|part|inf";
# corresponding replacements:
relacement="first-person|second-person|third-person|impersonal|singular|singular|plural|plural|dual|collective|masculine|feminine|neuter|common|present|past|future|future|progressive|preterite|preterite|perfect|imperfect|imperfect|pluperfect|pluperfect|past historic|imperative|imperative|indicative|indicative|indicative|subjunctive|subjunctive|conditional|dative|accusative|active|active|participle|infinitive";
# common strings which are no shortcuts:
non_replacement = ";|,|and|historic|gerund|of the|simple|[[past historic]]";
# the non-shortcut strings and the already expanded words are appended to both
# lists, so that they map onto themselves
shortcuts = shortcuts "|" non_replacement "|" relacement ;
relacement = relacement "|" non_replacement "|" relacement ;
# write replacement text into array
n_shortcuts=split(shortcuts,shortcut_array,"|");
n_replacement=split(relacement,replacement_array,"|");
# both lists must have the same number of "|"-separated fields
if(n_shortcuts != n_replacement) print "#WARNING: badly formated form shortcut strings" >fixme;
for(i=1;i<=n_shortcuts;i++) { replacement_text[shortcut_array[i]] = replacement_array[i];
#print shortcut_array[i] " " replacement_text[shortcut_array[i]];
	}

# shortcuts for template names:
template_shortcuts="altform|alt-form|altcaps|altspelling|alt-sp|synonym|abbreviation|clipping|abbreviation-old|altname|pf.|indeclinable|plural";
template_replacement="alternative form of|alternative form of|alternative letter-case form of|alternative spelling of|alternative spelling of|synonym of|abbreviation of|clipping of|old abbreviation of|alternative name of|pf|indecl|p";
nts=split(template_shortcuts,ts_array,"|");
ntr=split(template_replacement,tr_array,"|");
if(nts != ntr) print "#WARNING: badly formated template shortcut strings" >fixme;
for(i=1;i<=nts;i++) { trep_text[ts_array[i]] = tr_array[i];
#print ts_array[i] " " trep_text[ts_array[i]];
	}

}
# end of BEGIN block
########################

########################
# function definitions:

function sc2txt(shortcut) {
# Expands a form-of shortcut (e.g. a key of replacement_text) to its text.
# An empty shortcut yields the empty string. An unknown shortcut is logged
# to the fixme file and returned unchanged.
if(shortcut == "") return "";
if(!(shortcut in replacement_text)) {
	print  "#WARNING: unknown shortcut:\"" shortcut "\" on title:\"" title "\", line:" $0 >fixme;
	return shortcut;
	}
return replacement_text[shortcut];
}


function iso2lang(isocode) {
# Looks up the language name stored in language_names for an iso-code.
# An empty code yields the empty string. An unknown code is logged to the
# fixme file and returned unchanged.
if(isocode == "") return "";
if(!(isocode in language_names)) {
	print  "#WARNING: unknown iso-code:\"" isocode "\" on title:\"" title "\", line:" $0 >fixme;
	return isocode;
	}
return language_names[isocode];
}


function replace_template(tpar, n_unnamed,     outp, i, j, start)	{
# Returns the replacement text for ONE template whose parameters have
# already been split up by parse_templates().
#
# Parameters:
#   tpar       array: tpar[0] is the template name,
#              tpar[1], ..., tpar[n_unnamed] are the unnamed parameters,
#              tpar["name1"], ..., tpar["nameN"] are the named parameters
#   n_unnamed  number of unnamed parameters (not used by the code below)
#   outp, i, j, start   local variables
#
# Returns: the text to be put in place of the template ("" deletes it).
#
# Globals read:    rmtemplate, pos, headline, template_number, title, fixme,
#                  sob, scb, trep_text
# Globals written: gend (headline information), gend2 (definition-line
#                  information), LHS_qualifier, term_label, template_number,
#                  MAXGENDERS
#
# The "switch" statement with regexp case labels is a gawk extension. Cases
# without a "return" deliberately fall through into the following case.
outp = "";

# debug output
# for (j in tpar) print j, tpar[j];
# print tpar[0]; 
MAXGENDERS = 5;

# user-defined remove per language:
if(tpar[0] ~ rmtemplate)
	return "";

switch (tpar[0]) {

# qualifier
case /^(qualifier|i|italbrac|ib|qual|q|a|qf)$/:
outp =  tpar[1];
for(i=2;i in tpar;i++)	outp = outp ", " tpar[i];
outp = linktotext(outp);
return "[" outp "]";

# gloss-template -> ({{1}})
case /^(gloss|sense|gl)$/:
outp = "(" tpar[1] ")";
return outp;

# l-templates
case /^(l|l-self|link|m|mention|m-self|ll|l\/.*)$/:
# tpar[1] is skipped here; tpar[2] is the linked term, tpar[3] an optional display text
if(tpar[2] ~ /\[\[/)
	 outp = tpar[2];

	else {
		if((3 in tpar)&&(tpar[3]!="")) outp = "[[" tpar[2] "|" tpar[3] "]]";
			else outp = "[[" tpar[2] "]]";
		}
if("tr" in tpar) outp = outp " /" tpar["tr"] "/";
if(4 in tpar) outp = outp " (" tpar[4] ")";
return outp;

# lb-template
# TODO: senseid etc as first template
# TODO: join the code with term-label?
case /^(lb|label|lbl|indtr)$/:
# j-1 counts the labels that are kept in the output
j=1;
for(i=2;i in tpar;i++) {
# for verbs the transitivity labels are moved into gend2 instead of the output
if(pos=="v") 	{
		if(tpar[i] == "intransitive") { gend2 = (gend2 "i"); continue;}
                if(tpar[i] == "transitive") { gend2 = (gend2 "t"); continue;}
                if(tpar[i] == "ambitransitive") { gend2 = (gend2 "it"); continue;}
                if(tpar[i] == "reflexive") { gend2 = (gend2 "r"); continue;}
		if(tpar[i] == "pronominal") { gend2 = (gend2 "p"); continue;}
		}
j++;
		if(j > 2) outp = outp ", ";
		outp = outp tpar[i];
}
# cleanup ", _,", ", and," etc
gsub(/,[\ ]_,/, "", outp);
outp = gensub(/,[\ ](and|or)[,]*[\ ]/, " \\1 ", "g", outp);
gsub(/^(and|or)$/, "", outp);

# there might be labels on the headline
if((headline == 1)&&(pos=="v")) gend = gend gend2;
outp = linktotext(outp);
# a label that is the first template of the line becomes the qualifier of
# the left hand side instead of part of the definition text
if(template_number == 1) {
				LHS_qualifier = LHS_qualifier outp; 
				return ""; 
				}
if(j==1) outp = "";
	else outp = "[" outp "]";

return outp;

# labels on headline for entire pos
case /^(tlb|term-label|term-context|tcx)$/:
if(headline != 1) {
print "#ERROR: term-label on definition-line, title: \"" title "\", line: \"" $0 "\"" >fixme;
return "";}

# tlb/term-label: labels start at the 2nd unnamed parameter, otherwise at the 1st
if(tpar[0] ~  /^(tlb|term-label)$/) start =2;
	else start =1;
j=1;
for(i=start;i in tpar;i++) {
if(pos=="v") 	{
		if(tpar[i] == "intransitive") { gend = (gend "i"); continue;}
                if(tpar[i] == "transitive") { gend = (gend "t"); continue;}
                if(tpar[i] == "ambitransitive") { gend = (gend "it"); continue;}
                if(tpar[i] == "reflexive") { gend = (gend "r"); continue;}
		if(tpar[i] == "pronominal") { gend = (gend "p"); continue;}
		}
j++;
		if(j > 2) outp = outp ", ";
		outp = outp tpar[i];
}
outp = linktotext(outp);
# cleanup ", _,", ", and," etc
gsub(/,[\ ]_,/, "", outp);
outp = gensub(/,[\ ](and|or)[,]*[\ ]/, " \\1 ", "g", outp);
gsub(/^(and|or)$/, "", outp);

# collect the labels of the whole POS section in the global term_label
if(outp != "") {
if(term_label != "") term_label = term_label ", ";
term_label = term_label outp;
}
return "";


# templates to be removed
case /^(attention|rfc-tbot|inv|rfr|rfscript|rftranslit|NNBS|RL|LR|\,|jump|rfv|rfex|rfgloss|attention|rfv-sense|defdate|gloss-stub|senseid|es-demonstrative-accent-usage|R[:].*|pos_n|rfdef|cite-web|cite|C|cite-book|rft-sense|cite|rfquote-sense|RFV-sense|rfc-sense|datedef)$/:
# a removed template does not count as "first template of the line"
template_number -= 1;
return "";

# get gender from the head-template:
case "head":
if(headline ==1) 
{
#for(i in tpar) print i, tpar[i];
if("g" in tpar) gend = gend sob tpar["g"] scb;
if("g2" in tpar) gend = gend sob tpar["g2"] scb;
if("g3" in tpar) gend = gend sob tpar["g3"] scb;
}
else print "#WARNING: misplaced head-template, title: \"" title "\", line: \"" $0 "\"" >fixme;
return "";


# the g-template
case "g":
outp = "";
for(i=1;i in tpar;i++)	outp = outp sob tpar[i] scb;
if(headline==0) return outp;
else {gend = gend outp; return "";}

# obsolete term, vern etc {{1}} -> [[1]]
case /^(term|vern|specieslink)$/:
return "[[" tpar[1] "]]"

# template-name [[{{1}}]] or template-name {{1}}
case /^(altform|alt-form|altcaps|altspelling|alt-sp|synonym|abbreviation|clipping|abbreviation-old|altname)$/:
# expand the shortcut name, then fall through into the next case
tpar[0]=trep_text[tpar[0]];

# template-name [[{{1}}]] or template-name {{1}}
case /^(contraction of|dated form of|alternative capitalization of|informal spelling of|nonstandard spelling of|alternative spelling of|obsolete spelling of|alternative form of|alternate form of|abbreviation of|acronym of|rare spelling of|archaic spelling of|obsolete form of|eye dialect of|agent noun of|initialism of|synonym of|alternate spelling of|rare form of|eye dialect|only used in|medieval spelling of|European Portuguese form of|superseded spelling of|European Portuguese spelling of|euphemistic spelling of|alternative term for|feminine noun of|alternative form of|alternative case form of|alt form of|alt form|short of|common misspelling of|only in|pejorative of|attributive of|short for|euphemistic form of|eye-dialect of|nonstandard form of|short form of|short for|diminutive of|superlative of|comparative of|augmentative of|reflexive of|apocopic form of|obsolete form of|short form of|informal form of|dated spelling of|pronunciation spelling of|former name of|superseded form of|clipping of|praenominal abbreviation of|alternative typography of|supine of|nominalization of|construed with)$/:
outp = tpar[0] " ";
if(tpar[1] ~ /\[\[/)
	 outp = outp tpar[1];
	else outp = outp "[[" tpar[1] "]]";
return outp;

# templates replaced by first unnamed parameter
case /^(unsupported|w|non-gloss definition|n-g|taxlink|non gloss definition|non-gloss|non gloss|spelink|pedlink|def|IPAchar|def-date)$/:
return tpar[1];


# templates to be replaced by "{templatename}"
case /^(indeclinable|pf[.]|plural)$/:
# expand the shortcut name, then fall through into the next case
tpar[0]=trep_text[tpar[0]];

# templates to be replaced by "{templatename}"
case /^(impf|dual)$/:
return sob tpar[0] scb;

# templates to be replaced by templatename
case /^(CE|BC|given name|surname|historical given name|AD|praenomen)$/:
return  tpar[0];


# &oth, &lit
case /^[&]amp[;](lit|oth)/:
outp = "See:";
for(i=1;i in tpar;i++) 
	if(tpar[i] ~ /\[\[/) 	outp = outp tpar[i];
	else outp = outp " [[" tpar[i] "]]";
return outp;


# templates replaced by 2nd unnamed parameter, e.g. lang-template
case /^(lang|cog)$/:
return tpar[2];


# place-template
# TODO: proper link?
case /^place($|[:].+$)/:
if((2 in tpar) && (tpar[2] != "")) outp = "[[" title "]] (" tpar[2] ")";
	else outp = "[[" title "]] (placename)";
# print outp;
return outp;


# etyl template
case "etyl":
return iso2lang(tpar[1]);

case "nbsp":
return " ";

#TODO: from fields
case "standard spelling of":
return "alternative spelling of " tpar[1];

#############################
# language specific templates
#############################
# German
case "de-superseded spelling of":
return "obsolete spelling of " tpar[1];

case "de-plural noun":
# the "else" belongs to the inner "if": on a headline without gender parameter only {p} is added
if(headline == 1) 
	if(1 in tpar) gend = gend sob tpar[1] "p" scb;
		else gend = gend sob "p" scb;
return "";

# Portuguese:
case /^(pt-obsolete.*|pt-superseded.*)$/:
return "obsolete spelling of " tpar[1]; 

case "pt-pron def":
if(tpar[2] ~ /[1-3]/)
	outp = tpar[2] ". person" tpar[3] ". "  tpar[1] "pronoun";
	else outp = tpar[3] ". " tpar[4] ". form of "  tpar[1] " " tpar[2];
return outp;

case /^pt-pronoun-with-[nl]/:
# NOTE(review): both tests below read tpar[1]; the plural test presumably
# was meant for another parameter -- confirm against the template
outp = "alternative form of [[";
if(tpar[1] == "m") outp = outp "o";
	else   outp = outp "a";
if(tpar[1] == "pl")  outp = outp "s";
outp = outp "]]";
return outp;

case "+preo":
outp = " [+ " tpar[2] " (object)";
if(3 in tpar) outp = outp " = " tpar[3];
if("means" in tpar) outp = outp " = " tpar["means"];
outp = outp "]";
return outp;

case "+obj":
outp = " [+ " tpar[2];
if(3 in tpar) outp = outp " " tpar[3];
if(4 in tpar) outp = outp " " tpar[4];
if("means" in tpar) outp = outp " = " tpar["means"];
outp = outp "]";
return outp;


# Spanish and others
case /^(es|it|pt|fr|de|nl)-noun$/:
if(headline == 1) 
	if(1 in tpar) gend = gend sob tpar[1] scb;
return "";

# get gender from es-proper noun and others
case /^(es|fr|pt|it|de|nl)-proper noun$/:
# all three gender sources are read on headlines only; without the braces
# "g" and "g2" were appended to gend on definition lines as well
if(headline == 1) {
	if(1 in tpar) gend = gend sob tpar[1] scb;
	if("g" in tpar) gend = gend sob tpar["g"] scb;
	if("g2" in tpar) gend = gend sob tpar["g2"] scb;	
	}
return "";

# Finnish
case "fi-infinitive of":
return "infinitive of " tpar[1]; 

# Latin
case "NL.":
return "New Latin";

# get gender from the head-template la-noun:
case "la-noun":
if(headline == 1) 
{
#for(i in tpar) print i, tpar[i];
if(3 in tpar) gend = gend sob tpar[3] scb;
if("g2" in tpar) gend = gend sob tpar["g2"] scb;
if("g3" in tpar) gend = gend sob tpar["g3"] scb;
}
else print "#WARNING: head-template on defline, title: \"" title "\", pos: \"" pos "\", line: \"" $0 "\"" >fixme;
return "";

# get gender from la-proper noun
case "la-proper noun":
if(headline == 1)
	if(3 in tpar) gend = gend sob tpar[3] scb;
return "";

case "nl-pronadv of":
return "pronominal adverb form of " tpar[1] " + " tpar[2];

case "uncertain":
return  sob "uncertain meaning" scb;

# names of Latin letters
case "Latn-def":
outp = outp "letter";
if(4 in tpar) outp = outp ": [[" tpar[4] "]]";
if(5 in tpar) outp = outp ", [[" tpar[5] "]]";
if(6 in tpar) outp = outp ", [[" tpar[6] "]]";
if(7 in tpar) outp = outp ", [[" tpar[7] "]]";
return outp;

# soplink
case "soplink":
# parameters containing "-", blank or "/" are copied as they are, all others are linked
for(i=1;i in tpar;i++) {
if (tpar[i] !~ /[-\ \/]/) outp = outp  "[[" tpar[i] "]]";
	else outp = outp  tpar[i];
}
return outp;

case "PAGENAME":
return title;

####################
# inflected forms:
####################
# output template-name {{1}}
case /^(present participle of|past participle of|feminine plural past participle of|feminine singular past participle of|masculine plural past participle of|feminine past participle of|masculine plural of|feminine plural of|plural of|singular of|uncommon spelling of|imperative of|gerund of|plural form of|neuter singular of|feminine singular of|feminine of|neuter of|misspelling of)$/:
outp = tpar[0] " ";
if(tpar[1] ~ /\[\[/) outp = outp tpar[1];
	else outp = outp "[[" tpar[1] "]]";
return outp;

# TODO: gloss, alt-text and tr
case "form of":
outp = tpar[1] " of " tpar[2];
return outp;

# TODO: alt-text, recognised shortcuts from Module:form of/data
case /^(conjugation of|inflection of)$/:
# parameters 3, 4, ... are form shortcuts, each expanded by sc2txt()
for(i=3;i in tpar;i++) {
outp =  outp sc2txt(tpar[i]) " of ";}
if(tpar[1] ~ /\[\[/) outp = outp tpar[1];
	else outp = outp "[[" tpar[1] "]]";
return outp;

case "it-adj form of":
for(i=2;i<=4;i++) 
	if(i in tpar) outp = outp sc2txt(tpar[i]) " ";
outp = outp " of " tpar[1]; 
return outp;


# unknown templates are deleted
default:
# on headlines unknown templates are dropped silently
if(headline != 1)
	print "#WARNING: deleted unknown template: {{" tpar[0] "}} in entry: \"" title "\" on line: \"" $0 "\"" >fixme;
return "";
}
}

#####################################
function parse_templates(input,         i, j, k, ta, sa, nt, ts, na, targs, eq, tpar, rep, outp)
{
# Parses a string for templates, calls replace_template() for each template
# found and returns the string with every template replaced by the result.
#
# Parameters:
#   input   the text to be processed
#   all other parameters are local variables
#
# Only innermost templates ({{...}} without braces inside) are matched, so
# THIS FUNCTION HAS TO BE CALLED MULTIPLE TIMES FOR STRINGS WITH NESTED TEMPLATES
#
# Globals written: wlbar, sob, scb (placeholder strings, also used by the
# callers), output, template_number (index of the template within input,
# read by replace_template())

# replace bars inside wikilinks with wlbar
wlbar="_WLB_";
# replace single braces
sob="_SOB_";
scb="_SCB_";

# a single brace (not neighboured by another one) is no template delimiter
input = gensub(/([^\{])(\{)([^\{])/, "\\1" sob "\\3", "g", input);
input = gensub(/([^\}])(\})([^\}])/, "\\1" scb "\\3", "g", input);

# is this necessary?
delete ta; delete sa;

# split input string into templates (ta[1, ..., n]) and nontemplate strings (sa[0, ..., n])
nt = patsplit(input, ta, /\{\{[^}{]*\}\}/, sa);

output = "";
for(i=1; i<=nt; i=i+1) {
	ts = ta[i]
#	replace bars inside wikilinks with wlbar
#	(otherwise they would be taken for parameter separators below)
	ts = gensub(/(\[\[[^\]]*)(\|)([^\]]*\]\])/, "\\1" wlbar "\\3", "g", ts); 

#	split template args into array targs	
	sub(/\{\{/, "", ts); sub(/\}\}/, "", ts);
	na = split(ts, targs, "|");

#	unnamed arguments go to tpar[0], tpar[1], ... (tpar[0] is the template
#	name), named arguments "name=value" go to tpar["name"]
	k = 0; delete tpar;
	for(j=1; j<=na; j=j+1) {
		# split at the FIRST "=" only: a value that itself contains "="
		# (e.g. "t=a=b") is kept complete instead of being cut at the second "="
		eq = index(targs[j], "=");
		if(eq == 0) {tpar[k] = targs[j]; k=k+1;}
		else        tpar[substr(targs[j], 1, eq-1)] = substr(targs[j], eq+1);
		}
#	debug output
#	for (test in tpar) print test, tpar[test];
#	now call replace_template function which returns a replacement string for the template
	template_number = i;
	rep = replace_template(tpar, k-1);
#	print rep;	
	ta[i] = rep;
	}
# put the pieces together again: sa[0] ta[1] sa[1] ta[2] sa[2] ...
outp = "";
if(0 in sa) outp = sa[0]; 
for(i=1; i<=nt; i=i+1) {outp = outp ta[i]; if(i in sa) outp = outp sa[i];}
return outp;
}

#####################################
function printout(out,     opening, tail, closing) {
# Does the final formatting of one dictionary line, then prints it to stdout.
#
# Parameters:
#   out   the finished line ("left hand side :: definition")
#   opening, tail, closing   local variables
#
# Globals read: wlbar, sob, scb (placeholders set by parse_templates()),
#               remove_wikilinks

# remove XML code at the end of last line
	gsub(/<\/text>/,"",out);

# remove dots at end of line	
	gsub(/\.[\ ]*$/,"",out);

# convert back escaped special characters (template-parsing)
	gsub(wlbar, "|", out); gsub(sob, "{", out); gsub(scb, "}", out);


# convert special xml formating like &lt; to html
# (&amp; is converted before the remaining entities, so that e.g.
#  "&amp;nbsp;" is resolved by the later substitutions as well)
                        gsub(/&lt;/,"<",out);
                        gsub(/&gt;/,">",out);
                        gsub(/&amp;/,"\\&",out);
                        gsub(/&quot;/,"\"",out);
			gsub(/&nbsp;/, " ", out);
                        gsub(/&hellip;/, "...", out);
                        gsub(/&quot;/, "\"", out);
			gsub(/&[mn]dash;/, "-", out);
			gsub(/&thinsp;/, "", out);
			gsub(/&minus;/, "-", out);
			gsub(/&equals;/, "=", out);

# NOTE: these must be done after converting '&lt;' -> '<'  and '&gt;' -> '>'
# remove <ref ... \>
		gsub(/<ref[^>]*\/>/,"",out);

# remove <ref [name=....]> blabla </ref>
# One pair at a time, each opening tag together with the NEXT closing tag.
# A single greedy regexp (<ref[^>]*>.*<\/ref>) would also delete the
# definition text standing between two separate references.
		while(match(out, /<ref[^>]*>/)) {
			opening = RSTART;
			tail = substr(out, RSTART + RLENGTH);
			closing = index(tail, "</ref>");
			# opening tag without closing tag on this line: leave it alone
			if(closing == 0) break;
			# 6 is the length of "</ref>"
			out = substr(out, 1, opening - 1) substr(tail, closing + 6);
			}

# remove one-line <!-- commented text -->
		gsub(/<!--[^>]*-->/,"",out); 

# remove extra spaces
		gsub(/[\ ]+/, " ", out);

# remove remaining "<!--" (will prevent display of wikifile)
		gsub(/<!--/,"", out);

if(remove_wikilinks==1) {
#			wikilinks and italicizing, bolding
#			[[target|text]] -> text, then [[target]] -> target
			out = gensub(/([[][[])([^]|]*\|)([^]]*)([]][]])/ , "\\3", "g", out);
			out = gensub(/([[][[])([^]]*)([]][]])/ , "\\2", "g", out);
			gsub(/['][']+/, "", out);

#			<sub> and <sup>
			gsub(/<sup>/, "^", out);  gsub(/<\/sup>/, "", out);
			gsub(/<sub>/, "", out);  gsub(/<\/sub>/, "", out);
			 
#			<nowiki> 			
			gsub(/<nowiki>/, "", out); gsub(/<\/nowiki>/, "", out);	
			}
print out;
}

function linktotext(text) {
# Returns text with all wikilinks replaced by their visible wording:
# the _WLB_ placeholder is turned back into "|", then [[target|label]]
# becomes label (inner call) and every remaining [[target]] becomes target
# (outer call).
gsub(/_WLB_/, "|", text);
return gensub(/([[][[])([^]]*)([]][]])/ , "\\2", "g",
	gensub(/([[][[])([^]|]*\|)([^]]*)([]][]])/ , "\\3", "g", text));
}

######################################
# 	Main program
######################################

/\x3Ctitle/ { 
# A <title> line starts a new page (\x3C is "<", \x3E is ">").
# Cut away everything before the first "<" and after the last ">",
# then the <title> and </title> tags themselves; $0 is modified in place.
gsub(/^[^\x3C]*/, "");
gsub(/[^\x3E]*$/, "");
gsub(/\x3Ctitle\x3E/, "");
gsub(/\x3C\/title\x3E/, "");
# forget all state of the previous page
gend2 = ""; gend = ""; pos= ""; langsect=0;
# what is left of the line is the page title
title=$0; 
#print title;
}

# discard non-useful lines (speedup and false "trans-see" lines from comment lines)
# skip empty lines and XML bookkeeping lines of the dump
/^$/ || /<comment>|<\/?page>|<timestamp>|<id>|<\/?contributor>|<\/?revision>|<username>|<minor \/>/  { next }

# pages whose title contains one of these namespace prefixes are no
# dictionary entries: forget the title and skip the line
{
if(index(title,"Wiktionary:") != 0 ||
   index(title,"Template:") != 0 ||
   index(title,"Index:") != 0 ||
   index(title,"Appendix:") != 0 ||
   index(title,"User:") != 0 ||
   index(title,"Help:") != 0) {
	title="";
	next;
	}
}


# header of the requested language: the language section starts here
$0 ~ langhead { 
pos= ""; gend = ""; gend2 = "";
langsect=1;
#print lang, ": ", title; 
next;
}

# any other line starting with exactly two "=" ends the language section
/^\x3D\x3D[^\x3D]+/ {
langsect=0;
pos= ""; gend= ""; gend2= "";
next;
}

# language and title detection done; skip all lines if not inside LANG section
langsect==0 { next }

# Determine the part of speech (POS) from the section headings.
# Any line containing "===" first resets the per-POS state (pos, gender
# labels, term label). That rule has no "next", so the same line then falls
# through to the heading patterns below; the first one that matches sets
# "pos" and skips to the next record. The order of the rules matters.
# A heading matched by none of them leaves pos empty and continues on to the
# rules further down.
/\x3D\x3D\x3D/ { pos=""; gend=""; gend2=""; term_label=""; }
# "Noun" and "Proper noun" headings may carry a trailing number (e.g. "Noun 2")
/\x3D\x3D\x3D[\x20]*Noun[\x20]*[1-9]*\x3D\x3D\x3D/ { pos="n"; next;}
#/\x3D\x3D\x3D[\x20]*Verb[\x20]*\x3D\x3D\x3D/ { pos="v"; next;}
# not anchored on the right: any heading text that begins with "Verb" matches
/\x3D\x3D\x3D[\x20]*Verb/ { pos="v"; next;}
/\x3D\x3D\x3D[\x20]*Adjective[\x20]*\x3D\x3D\x3D/ { pos="adj"; next;}
/\x3D\x3D\x3D[\x20]*Adverb[\x20]*\x3D\x3D\x3D/ { pos="adv"; next;}
/\x3D\x3D\x3D[\x20]*Interjection[\x20]*\x3D\x3D\x3D/ { pos="interj"; next;}
/\x3D\x3D\x3D[\x20]*Article[\x20]*\x3D\x3D\x3D/ { pos="art"; next;}
/\x3D\x3D\x3D[\x20]*Proper\x20noun[\x20]*[1-9]*\x3D\x3D\x3D/ { pos="prop"; next;}
/\x3D\x3D\x3D[\x20]*Preposition[\x20]*\x3D\x3D\x3D/ { pos="prep"; next;}
/\x3D\x3D\x3D[\x20]*Postposition[\x20]*\x3D\x3D\x3D/ { pos="postp"; next;}
# headings written as a template ({{initialism..., {{acronym..., {{abbreviation...)
/\x3D\x3D\x3D[\x20]*\{\{initialism/ { pos="initialism"; next;}
/\x3D\x3D\x3D[\x20]*Numeral[\x20]*\x3D\x3D\x3D/ { pos="num"; next;}
/\x3D\x3D\x3D[\x20]*Cardinal num(ber|eral)[\x20]*\x3D\x3D\x3D/ { pos="cardinal num"; next;}
/\x3D\x3D\x3D[\x20]*Ordinal (number|numeral)[\x20]*\x3D\x3D\x3D/ { pos="ordinal num"; next;}
/\x3D\x3D\x3D[\x20]*Number[\x20]*\x3D\x3D\x3D/ { pos="num"; next;}
/\x3D\x3D\x3D[\x20]*\{\{acronym/ { pos="acronym"; next;}
/\x3D\x3D\x3D[\x20]*Acronym/ { pos="acronym"; next;}
/\x3D\x3D\x3D[\x20]*\{\{abbreviation/ { pos="abbr"; next;}
/\x3D\x3D\x3D[\x20]*Determiner[\x20]*\x3D\x3D\x3D/ { pos="determiner"; next;}
/\x3D\x3D\x3D[\x20]*Phrase[\x20]*\x3D\x3D\x3D/ { pos="phrase"; next;}
/\x3D\x3D\x3D[\x20]*Suffix[\x20]*\x3D\x3D\x3D/ { pos="suffix"; next;}
/\x3D\x3D\x3D[\x20]*Pronoun[\x20]*\x3D\x3D\x3D/ { pos="pron"; next;}
/\x3D\x3D\x3D[\x20]*Conjunction[\x20]*\x3D\x3D\x3D/ { pos="conj"; next;}
/\x3D\x3D\x3D[\x20]*Proverb[\x20]*\x3D\x3D\x3D/ { pos="proverb"; next;}
/\x3D\x3D\x3D[\x20]*Contraction[\x20]*\x3D\x3D\x3D/ { pos="contraction"; next;}
/\x3D\x3D\x3D[\x20]*Particle[\x20]*\x3D\x3D\x3D/ { pos="particle"; next;}
/\x3D\x3D\x3D[\x20]*Symbol[\x20]*\x3D\x3D\x3D/ { pos="symbol"; next;}
/\x3D\x3D\x3D[\x20]*Prefix[\x20]*\x3D\x3D\x3D/ { pos="prefix"; next;}
/\x3D\x3D\x3D[\x20]*Letter[\x20]*\x3D\x3D\x3D/ { pos="letter"; next;}
/\x3D\x3D\x3D[\x20]*Abbreviation[\x20]*\x3D\x3D\x3D/ { pos="abbr"; next;}
/\x3D\x3D\x3D[\x20]*Initialism[\x20]*\x3D\x3D\x3D/ { pos="initialism"; next;}
/\x3D\x3D\x3D[\x20]*Idiom[\x20]*\x3D\x3D\x3D/ { pos="idiom"; next;}
/\x3D\x3D\x3D[\x20]*Affix[\x20]*\x3D\x3D\x3D/ { pos="affix"; next;}
# phrase-type headings are folded into the POS of their head word
/\x3D\x3D\x3D[\x20]*Adverbial phrase[\x20]*\x3D\x3D\x3D/ { pos="adv"; next;}
/\x3D\x3D\x3D[\x20]*Prepositional phrase[\x20]*\x3D\x3D\x3D/ { pos="prep"; next;}
# participles and gerunds are filed under verbs
/\x3D\x3D\x3D[\x20]*Participle[\x20]*\x3D\x3D\x3D/ { pos="v"; next;}
/\x3D\x3D\x3D[\x20]*Ambiposition[\x20]*\x3D\x3D\x3D/ { pos="ambip"; next;}
/\x3D\x3D\x3D[\x20]*Gerund[\x20]*\x3D\x3D\x3D/ { pos="v"; next;}
/\x3D\x3D\x3D[\x20]*Circumposition[\x20]*\x3D/ { pos="circump"; next;}
/\x3D\x3D\x3D[\x20]*Circumfix[\x20]*\x3D/ { pos="circumfix"; next;}
/\x3D\x3D\x3D[\x20]*Interfix[\x20]*\x3D/ { pos="interfix"; next;}

#
# Usage notes don't contain definitions: mark the section with pos="-" so
# that the definition-line rule further down ignores its "#" lines
/\x3D\x3D\x3D[\x20]*Usage notes[\x20]*\x3D\x3D\x3D/ { pos="-"; next;}

# Lines containing "#:" or "#*" (anywhere on the line) are supposed to be
# examples/quotations: omit them
/\x23\:|\x23\*/ {next;}

# Lines starting with ''' (a plain bold head word instead of a head template).
# With the option "rm_headless_pos = 1" - meant for languages that use a plain
# '''WORD''' rather than {{head|iso|... form| on the head line of non-lemma
# entries - an adjective, noun or verb section whose head line is the bold
# page title is discarded (pos = "-"); for any other POS the entry is kept
# and a warning is written to the fixme file.
# The line itself is always skipped.
/^[']['][']/ {
	if (rm_headless_pos == 1 && index($0, "'''" title "'''") != 0) {
		if (pos == "adj" || pos == "n" || pos == "v") {
			pos = "-";
		} else {
			print "#WARNING: including entry: \"" title "\" without headline-template, line: " $0 >fixme;
		}
	}
	next;
}

# form-of headers: a line matching "exclude_POS" excludes the current POS section
$0 ~ exclude_POS {
	pos = "-";
	next;
}

# Noun head lines: determine the gender of nouns and proper nouns.
# The line is always consumed; it is only evaluated inside a noun or
# proper-noun section.
$0 ~ nounhead {
	if (pos != "n" && pos != "prop") next;

	gend = "";
	# the "headline" flag is raised while the head line goes through
	# parse_templates (presumably that is where the gender gets recorded -
	# see parse_templates)
	headline = 1;
	HD = $0;
	HD = parse_templates(HD);
	# second pass in case of nested headline templates; result not needed
	parse_templates(HD);
	headline = 0;
	next;
}

# Verb head lines: expand the templates first (with the "headline" flag set),
# then scan the expanded text for verbatim labels and append the matching
# letters to "gend":
#   i = intransitive, t = transitive, it = ambitransitive,
#   r = reflexive,    p = pronominal
$0 ~ verbhead {
	gend = "";
	headline = 1;
	HD = $0;
	HD = parse_templates(HD);
	# second pass in case of nested headline templates
	HD = parse_templates(HD);

	if (match(HD, "intransitive")) gend = gend "i";
	# "[^ni]" keeps "intransitive" and "ambitransitive" from counting as "t"
	if (match(HD, "[^ni]transitive")) gend = gend "t";
	if (match(HD, "ambitransitive")) gend = gend "it";
	if (match(HD, "reflexive")) gend = gend "r";
	if (match(HD, "pronominal")) gend = gend "p";

	headline = 0;
	next;
}

# Term labels on miscellaneous head lines: a line that does not start with "#"
# and contains a {{term-label / {{tlb / {{term-context / {{tcx template is run
# through parse_templates twice (the second pass covers nested templates)
# with the "headline" flag set. There is no "next": the line stays available
# to the rules below.
/^[^#].*\{\{(term-label|tlb|term-context|tcx)/ {
	headline = 1;
	HD = $0;
	HD = parse_templates(HD);
	HD = parse_templates(HD);
	headline = 0;
}

# skip lines matching "exclude_defn"
$0 ~ exclude_defn { next }

#/^[\x20]*\x23\x23/ 	{ 
#if((langsect==1)&&(pos != "-")&&(title!=""))
#	print "#WARNING nested definition line \"" $0 "\" on [["title"]]" >fixme;
#next;
#}

# main section: format output lines
# Every definition line (optional blanks, then one or more "#") becomes one
# output line of the form
#   [[title]] {pos} [qualifiers] :: definition
# provided we are inside the language section, the current POS section is not
# excluded (pos != "-") and a page title is known.
## exclude nested definitions
#/^[\x20]*\x23/ 	{ 
## include nested definitions
/^[\x20]*\x23+/ {
	if ((langsect == 1) && (pos != "-") && (title != "")) {

		DL = $0;

		# context -> label
		gsub(/\{\{(cx|context)\|/, "{{lb|en|", DL);

		# escape some special characters before the templates are parsed:
		#   {{=}} -> "&equals;"  (printout turns the entity back into "=")
		#   {{!}} -> "_WLB_"
		# "\\&" is needed to get a literal "&" in a gsub replacement.
		gsub(/\{\{\x3D\}\}/, "\\&equals;", DL);
		gsub(/\{\{[!]\}\}/, "_WLB_", DL);

		############################
		# now replace the templates:
		############################
		gend2 = "";
		LHS_qualifier = "";

		# one parse_templates pass per nesting level, at most MAXNESTING passes
		MAXNESTING = 3;
		for (i = 1; i <= MAXNESTING; i = i + 1) {
			DL = parse_templates(DL);
			# print DL;
			if (DL !~ /\{\{/) break;
		}

		# braces still left over: malformed line or nesting too deep
		if (DL ~ /\{\{|\}\}/) {
			print "#WARNING: at entry: \"" title "\": skipping badly formated input line: \"" $0 "\" or maybe to much template nesting, try to increase the \"MAXNESTING\" variable" >fixme;
			next;
		}

		# remove "#" (\x23) and space
		#gsub(/^[\x20]*\x23[\x20]*/,"",DL);
		gsub(/^[\x20]*\x23+[\x20]*/,"",DL);

		# remove XML code at the end of last line
		gsub(/<\/text>/,"",DL);

		# remove leading punctuation
		gsub(/^[\ \.,;\:]+/,"",DL);

		# discard empty definition lines:
		if (DL ~ /^[\ .\{\}\(\)\[\];\:]*$/) next;

		# now formatting LHS: build the POS/gender tag
		pos2 = "";
		if (pos == "n") {
			if (gend == "") {
				# no gender known: plain noun tag
				if (has_neuter == 1) pos2 = "{noun}";
				else pos2 = "{n}";
			} else {
				pos2 = gend;
			}
		}
		if (pos == "prop") {
			pos2 = "{" pos "}";
			if (gend != "") pos2 = pos2 " " gend;
		}
		if (pos == "v") {
			# labels found on the definition line (gend2) take precedence
			# over those from the head line (gend)
			if (gend2 == "") pos2 = "{" pos gend "}";
			else pos2 = "{" pos gend2 "}";
		}

		# other cases
		if (pos2 == "") pos2 = "{" pos "}";

		# clean up pos2: whatever matches sob/scb becomes "{" / "}",
		# gender tags are normalized
		gsub(sob, "{", pos2); gsub(scb, "}", pos2);
		sub(/\{m-f\}/, "{mf}", pos2);
		sub(/\{m\}\{f\}/, "{mf}", pos2);
		gsub(/\}\{/, "} {", pos2);
		pos2 = gensub(/([mfnc])-([sp])/, "\\1\\2", "g", pos2);
		if (pos2 == "") pos2 = "?";

		# now build the left hand side
		LHS = sprintf("[[%s]] %s", title, pos2);

		# Latin reconstructed forms: show "Reconstruction:Latin/word" as "*word".
		# This has to be done on the freshly built LHS; earlier in the rule
		# LHS still holds the value of the previous output line.
		if (lang == "Latin") sub(/Reconstruction[:]Latin\//, "*", LHS);

		# format LHS_qualifier and term_label:
		if (term_label != "") {
			if (LHS_qualifier != "") LHS_qualifier = term_label ", " LHS_qualifier;
			else LHS_qualifier = term_label;
		}
		if (LHS_qualifier != "") LHS = LHS " ["  LHS_qualifier "]";

		# the output line:
		outp = LHS " :: " DL;

		# wikilink cleanup:
		###################
		gsub(/\#English/,"",outp);
		gsub(/\[\[\|/,"[[",outp);

		# rm #blabla from link inside square brackets
		# first [[#bla|word]] -> [[title|word]] then other cases
		outp=gensub(/(\[\[[#][^\|\]]*)(\|[^\]]*\]\])/, "[["title"\\2", "g", outp);
		outp=gensub(/(\[\[[^\|\]]*[^ ])([ ]*[#][^\|\]]*)(\|[^\]]*\]\])/, "\\1\\3", "g", outp);

		printout(outp);
		# the line is printed even without a recognized POS heading, but reported
		if (pos == "") print "#UNKNOWN POS on page ",title ", line: " $0 >fixme;

	}
}

# A line matching "warnmissing" (explicit lemma category on the entry) inside
# an excluded POS section hints at a missing head template; report it unless
# headless sections are being removed on purpose (rm_headless_pos set).
$0 ~ warnmissing {
	if (pos == "-" && rm_headless_pos == 0) {
		print "#WARNING possible missing head template (explicit POS-category) on page [["title"]]" >fixme;
	}
}