User:Hippietrail/ajaxtranslinks.js
Jump to navigation
Jump to search
Note: You may have to bypass your browser’s cache to see the changes. In addition, after saving a sitewide CSS file such as MediaWiki:Common.css, it will take 5-10 minutes before the changes take effect, even if you clear your cache.
- Mozilla / Firefox / Safari: hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (Command-R on a Macintosh);
- Konqueror and Chrome: click Reload or press F5;
- Opera: clear the cache in Tools → Preferences;
- Internet Explorer: hold Ctrl while clicking Refresh, or press Ctrl-F5.
- This script lacks a documentation subpage. Please create it.
- Useful links: root page • root page’s subpages • links • redirects • your own
// <nowiki>
// TODO uses of {{top}} in non-translation sections are parsed anyway
// TODO don't treat whitespace as tokens. instead include a prevWhite field for each token
// TODO is the sublang system complete?
// TODO finish the refactoring
// TODO doesn't handle non linked multiword terms like "Sri Lanka"
// TODO == Current bugs (flat dom parser) ==
// TODO can we cope with missing colon after language name?
// TODO sense numbers after terms: Finnish: jargon (1, 2)
// TODO <i><a>trad.</a></i> and <i><a>simpl.</a></i> in some Chinese entries
// TODO (<i>pf.</i>) and (<i>impf.</i>) in some Russian entries
// TODO comma between term and its gender [[rose]] Catalan
// TODO == Old bugs (manual parser) ==
// TODO Cantonese and Mandarin as sublangs can match each other
// TODO handle wikified sublanguages
// TODO handle translations that have both a main entry and subentries ([[corn]] German)
// TODO selflinked language names cause breakage
// TODO == Can we handle these cases?
// TODO single terms wikilinked in separate parts: Hungarian: [[tönköly]] [[búza]]
// TODO non-linked reflexive particles: sich [[treffen]], [[pridružiti]] se
// TODO what to do about "comments" before or after the line or one term? (foo) ''(foo)'' (''foo'') ''foo''
// TODO subentries which are not sublanguages: Chinese and Japanese entries at "Mongolian"
// this seems to require non complicated lookahead
//////////////////////////////////////////////////////////////////
//
// functions for parsing the "other" page in raw wikitext form
//
//////////////////////////////////////////////////////////////////
// Callback run with the raw wikitext of the page for a translated term.
// Scans the wikitext for a level-2 heading naming the wanted language and
// reports the outcome on the translation entry.
//
//   li      - list item (or dd) holding the term; its title is appended to
//             when the term has no link
//   anchor  - the term's <a> element, or a falsy value for plain terms
//   term    - page title of the term (used in messages only)
//   lang    - language name from the translation table
//   sublang - sublanguage name, or a falsy value
//   page    - raw wikitext of the term's page
//
// Outcomes:
//   language heading found -> anchor class cleared, '#Heading' appended to href
//   page is a redirect     -> anchor class 'redirect'
//   otherwise              -> anchor class 'new partlynew'
function parse_other_raw(li, anchor, term, lang, sublang, page) {
  var state = 0; // 0: no entry for lang, 1: entry found, -1: redirect
  var foundlang = null;
  var wanted = sublang ? lang + ' (' + sublang + ')' : lang;
  var langpats;
  var lines;
  var hit;
  var i;

  if (anchor) {
    anchor.className = 'dunno';
    anchor.title = 'Looking...';
  }

  // A redirect is only a redirect when the directive opens the page and is
  // followed by a link target. Matching "#redirect" anywhere would also
  // catch ordinary numbered definition lines such as "# redirect, reroute".
  if (/^\s*#\s*redirect\s*:?\s*\[\[/i.test(page)) {
    state = -1;
  } else {
    // so we can detect synonyms
    langpats = build_lang_patterns(lang, sublang);
    lines = page.split("\n");
    for (i = 0; i < lines.length; i++) {
      // ==Lang== or ==[[Lang]]==, or one of its synonyms or variants?
      hit = lines[i].match(langpats[0]) || lines[i].match(langpats[1]);
      if (hit != null) {
        foundlang = hit[1];
        state = 1;
        break;
      }
    }
  }

  // page exists and has an entry for this language
  if (state == 1) {
    if (anchor) {
      anchor.className = '';
      anchor.title = term + ' (exists in ' + foundlang + ')';
      if (anchor.href.indexOf('#') == -1) {
        // heading ids use underscores in place of spaces
        // TODO unicode in lang names breaks: Guaraní -> #Guaran%C3%AD but #Guaran.C3.AD
        anchor.href += '#' + foundlang.replace(/ /g, '_');
      }
    // plain terms have no <a> so output results some other way
    } else {
      li.title += '; res: ' + term + ' exists in ' + foundlang;
    }
  // page exists but has no entry for this language
  } else if (state == 0) {
    if (anchor) {
      anchor.className = 'new partlynew';
      anchor.title = term + ' exists (but not in ' + wanted + ')';
    } else {
      li.title += '; res: ' + term + ' exists but not in ' + wanted;
    }
  // page exists but is a redirect
  } else {
    if (anchor) {
      anchor.className = 'redirect';
      anchor.title = term + ' exists but is a redirect';
    } else {
      li.title += '; res: ' + term + ' exists but is a redirect';
    }
  }
}
// The language name used in the translation table is not always the name
// used as the heading on the foreign term's own page: it may be a synonym
// or spelling variant, or a language/sublanguage combination.
//
// Builds two regular expressions that match a level-2 heading for `lang`
// (optionally combined with `sublang`) or one of its known alternatives:
//   [0] matches  ==Name==
//   [1] matches  ==[[Name]]==
// In both, capture group 1 is the name as it appears in the heading.
function build_lang_patterns(lang, sublang) {
  // known synonyms and spelling variants, keyed by the name being looked up
  var synonyms = {
    'Chinese': ['Mandarin', 'Cantonese'],
    'Mandarin': ['Chinese'],
    'Cantonese': ['Chinese'],
    'Anglo-Saxon': ['Old English'],
    'Azerbaijani': ['Azeri'],
    'Azeri': ['Azerbaijani'],
    'Burmese': ['Myanmar'],
    'Faeroese': ['Faroese'],
    'Faroese': ['Faeroese'],
    'Farsi': ['Persian'],
    'Guaraní': ['Guarani'],
    'Guarani': ['Guaraní'],
    'Malay': ['Malaysian'],
    'Malaysian': ['Malay'],
    'Maori': ['Māori'],
    'Māori': ['Maori'],
    'Myanmar': ['Burmese'],
    'Old English': ['Anglo-Saxon'],
    'Persian': ['Farsi'],
    'Romani': ['Romany'],
    'Romansch': ['Romansh'],
    'Romansh': ['Romansch'],
    'Romany': ['Romani'],
    'Scots Gaelic': ['Scottish Gaelic'],
    'Scottish Gaelic': ['Scots Gaelic'],
    'Sinhala': ['Sinhalese'],
    'Sinhalese': ['Sinhala'],
    'Slovak': ['Slovakian'],
    'Slovakian': ['Slovak'],
    'Slovene': ['Slovenian'],
    'Slovenian': ['Slovene'],
    'Tupinambá': ['Tupinamba'],
    'Tupinamba': ['Tupinambá'],
    'Uighur': ['Uyghur'],
    'Uyghur': ['Uighur']
  };

  // quote regex metacharacters so a name is always matched literally
  function quote(name) {
    return String(name).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  var names = [lang];
  var quoted = [];
  var n;

  // e.g. Ancient Greek: "Ancient", "Ancient Greek", "Greek (Ancient)", "Greek, Ancient"
  if (sublang)
    names.push(sublang, sublang + ' ' + lang, lang + ' (' + sublang + ')', lang + ', ' + sublang);

  // own-property check so names like "constructor" don't hit Object.prototype
  if (Object.prototype.hasOwnProperty.call(synonyms, lang))
    names = names.concat(synonyms[lang]);
  // CJKV/Han characters
  else if (/(CJKV?|Chinese) [cC]haracters/.test(lang))
    names.push('Translingual');

  for (n = 0; n < names.length; n++)
    quoted.push(quote(names[n]));

  var alternatives = quoted.join('|');
  var rx1 = new RegExp('^==\\s*(' + alternatives + ')\\s*==');
  var rx2 = new RegExp('^==\\s*\\[\\[(' + alternatives + ')]]\\s*==');
  return [rx1, rx2];
}
// Fetch the raw wikitext of the page for one translated term and hand it to
// parse_other_raw together with the entry it belongs to.
//
//   li, anchor - DOM elements passed through to parse_other_raw
//   term       - page title to fetch; nothing is requested when it is falsy
//   lang       - language name from the translation table
//   sublang    - sublanguage name, or null
//
// Failed requests are deliberately ignored, which leaves the entry as it is.
function lookup_langs(li, anchor, term, lang, sublang) {
  if (!term)
    return;

  // Titles use underscores for spaces. The title must also be URL-encoded,
  // otherwise characters such as & # + ? corrupt the query string.
  var title = encodeURIComponent(term.replace(/ /g, '_'));
  var url = mw.config.get('wgScript') + '?title=' + title + '&action=raw';

  ajax(url, function (req) {
    parse_other_raw(li, anchor, term, lang, sublang, req.responseText);
  }, function () {});
}
// TODO misses items not wrapped in { {top}} etc
// TODO gets some related terms etc which also use { {top}} etc
// Collect every <li> that sits inside a translations table of the page body.
// { {top}} and { {trans-top}} both result in <table class="translations">.
// Returns a plain array, which is empty when nothing is found.
function get_trans_listitems() {
  var found = [];

  var content = document.getElementById('bodyContent');
  if (content == null)
    return found;

  var candidates = content.getElementsByTagName('table');
  if (candidates == null)
    return found;

  for (var ti = 0; ti < candidates.length; ti++) {
    var table = candidates[ti];
    // only tables carrying the "translations" class are of interest
    if (!table.className.match(/\btranslations\b/))
      continue;

    var items = table.getElementsByTagName('li');
    for (var k = 0; k < items.length; k++)
      found.push(items[k]);
  }

  return found;
}
// Parse the translation list items, but only once the flat DOM tokenizer
// (window.domtokenizer) has been loaded. Without it nothing is parsed:
// the old parse_translistitems_noflatdom fallback is no longer called.
function parse_translistitems(lis) {
  if (!window.domtokenizer)
    return;
  parse_translistitems_haveflatdom(lis);
}
// Recursive-descent parser for translation list items, driven by the flat
// DOM tokenizer: for each item create a tokenizer, prefetch one token,
// parse, then drop the tokenizer.
//
//   lis - array of <li> elements from the translation tables
//
// Results are stored on the DOM elements themselves:
//   element.title / element.wiktLang - the parsed language name
//   element.wiktTerms                - array of term objects (leaf entries only)
// where the element is the <li>, or a child <dd> for a sublanguage entry.
//
// A term object may carry: p (page title), x (displayed text), a (its <a>),
// tr (array of transliterations), g (gender), n (number), iw ({code, sign}).
//
// Tokenizer interface as used here (domtokenizer is defined elsewhere):
//   toker.tok / toker.nexttok - current and lookahead token
//   toker.gettok()            - advance; its result is used as the new token
//   toker.ungettok()          - step back one token
//   token.t                   - 't' text, 's' start tag, 'e' end tag
//   token.x                   - text of a text token
//   token.n                   - DOM node of a tag token
//   token.isWhite, token.isEOL
//
// A parse error marks the offending element with the 'parserror' class, logs
// the tokens, and abandons that list item only; the loop then continues.
function parse_translistitems_haveflatdom(lis) {
  // members accessible by any parser function
  // item always points to the top-level li
  // subitem always points to the current level
  // which may be the li or one of its child dd
  var toker = null; // dom tokenizer (generator)
  var gItem = null; // lis[i]: eg *Spanish:
  var gSubItem = null; // lis[i] or dd: eg *Serbian: *: Cyrillic
  var gDepth = 0;

  // main loop
  for (var i = 0; i < lis.length; i++) {
    gSubItem = gItem = lis[i];
    toker = new domtokenizer(gItem);
    // get first token
    toker.gettok();
    try {
      parsetransentry();
    }
    // google chrome can't handle "if" here
    //catch (e if e == 'WiktParseException') {
    catch (e) {
      // parser exceptions only abandon the current item; anything else is a real error
      if (e != 'WiktParseException')
        throw e;
    }
    toker = null;
  }

  // SUB FUNCTIONS

  // log the current and lookahead tokens with a severity level and context
  function pp_unexpected(level, msg) {
    var txt = level + ': unexpected token';
    if (msg)
      txt += ' at ' + msg;
    txt += ':';
    consolelog(txt);
    consolelog(toker.tok);
    consolelog(toker.nexttok);
  }

  // consume a text token equal to v, else raise a parse error
  function pp_expect_text(v) {
    if (toker.tok && toker.tok.t == 't' && toker.tok.x == v)
      toker.gettok();
    else
      pp_error('text "' + v + '"');
  }

  // consume a start tag whose nodeName is v, else raise a parse error
  function pp_expect_start(v) {
    if (toker.tok && toker.tok.t == 's' && toker.tok.n.nodeName == v)
      toker.gettok();
    else
      pp_error('<' + v + '>');
  }

  // consume an end tag whose nodeName is v, else raise a parse error
  function pp_expect_end(v) {
    if (toker.tok && toker.tok.t == 'e' && toker.tok.n.nodeName == v)
      toker.gettok();
    else
      pp_error('</' + v + '>');
  }

  // flag the current (sub)item as having a tolerated irregularity
  function pp_warn(msg) {
    addclass(gSubItem, 'parsewarn');
    pp_unexpected('warning', msg);
  }

  // flag the current (sub)item as unparseable and abort parsing it
  function pp_error(msg) {
    addclass(gSubItem, 'parserror');
    pp_unexpected('error', msg);
    throw 'WiktParseException';
  }

  // skip whitespace that should not be there, with a warning
  // TODO accepts any whitespace including nbsp due to mw french punc feature
  function pp_tolerate_space() {
    if (toker.tok.t == 't' && toker.tok.isWhite) {
      toker.gettok();
      pp_warn('tolerate space');
    }
  }

  // skip whitespace that should be there, warning when it is missing
  function pp_tolerate_missing_space() {
    if (toker.tok.t == 't' && toker.tok.isWhite)
      toker.gettok();
    else
      pp_warn('tolerate missing space');
  }

  //////////////////////////////////////////////////////////
  // returns tree
  // this is the only place where gSubItem is changed!
  function pp_sublang() {
    var lat = null;
    var dd = toker.tok.n;
    pp_expect_start('DD');
    gSubItem = dd;
    gDepth ++;
    lat = pp_lang_and_terms();
    gDepth --;
    pp_expect_end('DD');
    pp_expect_text('\n');
    return lat;
  }

  // returns array of sublang trees
  function pp_sublangs() {
    var aot = [];
    pp_expect_start('DL');
    pp_expect_text('\n');
    while (true) {
      if (toker.tok.t == 's' && toker.tok.n.nodeName == 'DD')
        aot.push(pp_sublang());
      else
        break;
    }
    pp_expect_end('DL');
    pp_expect_text('\n');
    return aot;
  }

  // returns tree
  // TODO tolerate comma or missing colon between langname and dl
  function pp_lang_and_terms() {
    var lang = null;
    var sublangs = null;
    var terms = null;
    lang = pp_lang();
    pp_tolerate_space();
    pp_expect_text(':');
    pp_tolerate_missing_space(); // space, or \n if followed by <dl>
    // set the language (and title) for each branch and leaf
    gSubItem.title = lang;
    gSubItem.wiktLang = lang;
    // if we're a branch node
    if (toker.tok.t == 's' && toker.tok.n.nodeName == 'DL' && toker.nexttok.t == 't' && toker.nexttok.isEOL == true) {
      // then process the leaves
      sublangs = pp_sublangs();
    // else we're a leaf node
    } else {
      // set the term array only for leaves
      // only set this for leaf nodes
      // watch out because gSubItem will be set the same for the last leaf
      // node and afterward its parent branch node
      // which would result in nulling the last sublang's terms
      gSubItem.wiktTerms = terms = pp_terms();
    }
    // return a tree
    // branch nodes always have null "terms"
    // leaf nodes always have null "sublangs"
    return { lang: lang, sublangs: sublangs, terms: terms };
  }

  // parse the <li> then call function to parse its lang and terms
  // trreq and ttbc are handled here as they are not expected to
  // occur in sublanguages
  function parsetransentry() {
    pp_expect_start('LI');
    if (toker.tok.t == 's' && toker.tok.n.nodeName == 'SPAN' && toker.tok.n.className == 'trreq')
      trreq();
    else {
      var tree = pp_lang_and_terms();
      consolelog(' lang: ' + tree.lang);
      if (tree.sublangs)
        // indexed loop: for-in over an array would also visit inherited enumerable keys
        for (var s = 0; s < tree.sublangs.length; s++)
          consolelog(' ' + tree.sublangs[s].lang);
    }
    pp_expect_end('LI');
    return;

    // translation request: <span class="trreq">Lang: <i>please add...</i></span>
    function trreq() {
      var lang = null;
      toker.gettok(); // <span>
      lang = parseunlinkedlangname();
      // TODO this should be done in the function above
      toker.gettok();
      pp_expect_text(':');
      pp_tolerate_missing_space();
      pp_expect_start('I');
      while (toker.tok.t == 't')
        toker.gettok(); // please add...
      pp_expect_end('I');
      pp_expect_end('SPAN');
      return lang;
    }
  }

  // language name: plain text, linked, or wrapped in a ttbc span
  // returns null (consuming nothing) when none of those is present
  // TODO handle language names with a linked part and unlinked part: [[Sorbian]] (lower)
  function pp_lang() {
    var lang = null;
    if (toker.tok.t == 't') {
      lang = parseunlinkedlangname();
      // TODO this should be done in the function above
      toker.gettok();
    } else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'A')
      lang = linked_or_ttbc_lang('A');
    else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'SPAN' && toker.tok.n.className == 'ttbc')
      lang = linked_or_ttbc_lang('SPAN');
    return lang;
  }

  // read a language name of up to three words, normalising
  // "English (Old)" and "English, Old" to "Old English"
  // leaves the last token of the name current; callers consume it
  // TODO are there 3-part language names which use comma or parentheses?
  // TODO this function doesn't consume each token as soon as it is recognized
  function parseunlinkedlangname() {
    // declared locally: assigning an undeclared name would create a global
    // first word of lang name
    var lang = toker.tok.x;
    // following words?
    if (toker.nexttok.t == 't') {
      // Old English; English (Old)
      if (toker.nexttok.x == ' ') {
        toker.gettok(); // eat first word
        if (toker.nexttok.t == 't') {
          // English (Old)
          if (toker.nexttok.x == '(') {
            toker.gettok(); // eat space
            lang = toker.nexttok.x + ' ' + lang; // get second word
            toker.gettok(); // eat (
            toker.gettok(); // eat second word
          // Old English; Torres Strait Creole
          } else {
            lang = lang + ' ' + toker.gettok().x; // eat space, get second word
            if (toker.nexttok.t == 't' && toker.nexttok.x == ' ') {
              toker.gettok(); // eat second word
              // Torres Strait Creole
              if (toker.nexttok.t == 't')
                lang = lang + ' ' + toker.gettok().x; // eat space, get third word
            }
          }
        }
      // English, Old or Greek, instead of Greek:
      } else if (toker.nexttok.x == ',') {
        toker.gettok(); // eat first word
        if (toker.nexttok.t == 't' && toker.nexttok.x == ' ') {
          toker.gettok(); // eat comma
          lang = toker.gettok().x + ' ' + lang; // eat space, get second word
        }
        else
          toker.ungettok();
      }
    }
    return lang;
  }

  // language name wrapped in <a> or <span class="ttbc">; tag is the nodeName
  function linked_or_ttbc_lang(tag) {
    toker.gettok(); // <a> or <span class="ttbc">
    var lang = parseunlinkedlangname();
    // TODO this should be done in the function above
    toker.gettok();
    pp_expect_end(tag);
    return lang;
  }

  // parse a list of terms separated by commas or semicolons
  // does not handle sublanguage lists
  // returns array of terms only
  function pp_terms() {
    var terms = [];
    var term = null;
    term = parseterm();
    if (term)
      terms.push(term);
    parseterms_rest();
    return terms;

    // SUB FUNCTIONS
    // term interwiki? ((gender translit?) | (translit gender?))?
    function parseterm() {
      var term = null;
      if (toker.tok.t == 's' && toker.tok.n.nodeName == 'A')
        term = parseterm_link();
      else if (is_script_span(toker.tok))
        term = parseterm_script();
      else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'STRONG' && toker.tok.n.className == 'selflink')
        term = parseterm_selflink();
      else if (toker.tok.t == 't') {
        // an entry that starts with a bracket has a transliteration but no term text
        if (toker.tok.x == '(' || toker.tok.x == '[') {
          term = {};
          parse_translit_gender(term);
        } else
          term = parseterm_plain();
      } else
        pp_unexpected('error', 'term');
      // if we parsed a term now parse its attributes such as gender and transliteration
      if (term) {
        parse_optional_interwiki(term);
        parse_optional_gender_translit(term);
        logterm(term);
      }
      return term;

      // write a one-line summary of the parsed term to the console
      function logterm(term) {
        var outputstr = '';
        if (typeof term.p != 'undefined') {
          outputstr += ' ' + term.p;
          if (typeof term.x != 'undefined' && term.x != term.p)
            outputstr += '|' + term.x;
        }
        if (term.tr)
          outputstr += ' (' + term.tr + ')';
        if (term.g)
          outputstr += ' ' + term.g + '.';
        if (term.n)
          outputstr += ' ' + term.n + '.';
        if (term.iw.code || term.iw.sign) {
          outputstr += ' ';
          if (term.iw.code)
            outputstr += term.iw.code;
          if (term.iw.sign)
            outputstr += term.iw.sign;
        }
        consolelog(outputstr);
      }

      // <a> wrapping plain text, a script span, or a <font>
      function parseterm_link() {
        var a = toker.tok.n;
        var term = null;
        toker.gettok(); // <a>
        if (is_script_span(toker.tok))
          term = parseterm_link_script();
        else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'FONT')
          term = parseterm_link_font();
        else
          term = parseterm_inner();
        pp_expect_end('A');
        term.a = a;
        // the page title comes from the link target, not the displayed text
        term.p = title_from_anchor(a);
        return term;

        function parseterm_link_script() {
          var term = null;
          toker.gettok(); // <span>
          term = parseterm_inner();
          pp_expect_end('SPAN');
          return term;
        }

        function parseterm_link_font() {
          var term = null;
          toker.gettok(); // <font>
          term = parseterm_inner();
          pp_expect_end('FONT');
          return term;
        }
      }

      // script span wrapping either a link or plain text
      function parseterm_script() {
        var term = null;
        toker.gettok(); // <span>
        if (toker.tok.t == 's' && toker.tok.n.nodeName == 'A')
          term = parseterm_script_link();
        else
          term = parseterm_inner();
        pp_expect_end('SPAN');
        return term;

        function parseterm_script_link() {
          var a = toker.tok.n;
          var term = null;
          toker.gettok(); // <a>
          term = parseterm_inner();
          pp_expect_end('A');
          term.a = a;
          term.p = title_from_anchor(a);
          return term;
        }
      }

      // <strong class="selflink">: a link to the page being viewed
      function parseterm_selflink() {
        var term = null;
        toker.gettok(); // <strong>
        term = parseterm_inner();
        pp_expect_end('STRONG');
        return term;
      }

      // get one piece of text
      function parseterm_plain() {
        var term = {};
        term.x = term.p = toker.tok.x;
        toker.gettok(); // term itself
        return term;
      }
    }

    // remaining terms after a separator; pushes onto the enclosing terms array
    // TODO this doesn't really have to be recursive does it?
    function parseterms_rest() {
      var term = null;
      var dorest = false;
      // comma or semicolon possibly preceded by a space
      if (toker.tok.t == 't') {
        if (toker.tok.x == ',' || toker.tok.x == ';' || toker.tok.x == '/')
          dorest = true;
        else if (toker.tok.x == ' ') {
          if (toker.nexttok.t == 't' && (toker.nexttok.x == ',' || toker.nexttok.x == ';' || toker.nexttok.x == '/')) {
            toker.gettok(); // eat whitespace
            dorest = true;
          }
        }
        if (dorest) {
          if (toker.tok.x == '/')
            pp_warn('tolerate / in place of , or ;');
          toker.gettok(); // eat , or ;
          pp_tolerate_missing_space();
          // next term
          term = parseterm();
          if (term)
            terms.push(term);
          // recur
          parseterms_rest();
        }
      }
    }
  }

  // interwiki?
  // always sets term.iw, to an empty object when no interwiki link follows
  function parse_optional_interwiki(term) {
    var iw = {};
    // current (cc) style
    if (toker.tok.t == 't' && toker.tok.isWhite) {
      if (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SUP') {
        toker.gettok(); // space
        // class to tell us if it's a red or blue link?
        if (toker.tok.n.className == 'tpos')
          iw.sign = '+';
        else if (toker.tok.n.className == 'tneg')
          iw.sign = '-';
        else
          iw.sign = '';
        toker.gettok(); // <sup>
        // template:t style
        if (toker.tok.t == 's' && toker.tok.n.nodeName == 'A') {
          toker.gettok(); // <a>
          // blue cross-wikt link or new-style class'd sup
          if (toker.tok.t == 't' && toker.tok.x == '(') {
            toker.gettok(); // (
            iw.code = toker.tok.x;
            //iw.sign = '+';
            toker.gettok(); // language code
            pp_expect_text(')');
          // old-style red cross-wikt link which wrapped sup with span
          } else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'SPAN' && toker.tok.n.className == 'new') {
            toker.gettok(); // <span class="new">
            pp_expect_text('(');
            iw.code = toker.tok.x;
            //iw.sign = '-';
            toker.gettok(); // language code
            pp_expect_text(')');
            pp_expect_end('SPAN');
          }
          pp_expect_end('A');
        }
        // template:he-translation style
        else if (toker.tok.t == 't' && toker.tok.x == '(') {
          toker.gettok(); // (
          pp_expect_start('A');
          iw.code = toker.tok.x;
          iw.sign = '';
          toker.gettok(); // he
          pp_expect_end('A');
          pp_expect_text(')');
        }
        toker.gettok(); // </sup>
      }
      // old ^ style
      else if (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'A' && toker.nexttok.n.className == 'extiw') {
        toker.gettok(); // space
        toker.gettok(); // <a>
        iw.code = '^';
        iw.sign = '';
        toker.gettok(); // <^>
        pp_expect_end('A');
      }
      // ^ style used on [[swan]] Greek
      // AMBIG looks like transliteration
      // TODO since this comes between the transliteration and the gender
      // TODO we should accept all of (transliteration, gender, interwiki) in any order
      // TODO this would entail left factoring (^) and transliteration
      /*
      else if (toker.nexttok.t == 't' && toker.nexttok.x == '(') {
        toker.gettok(); // space
        if (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'A') {
          toker.gettok(); // (
          toker.gettok(); // <a>
          iw.code = '^';
          iw.sign = '';
          toker.gettok(); // <^>
          toker.gettok(); // </a>
          toker.gettok(); // )
        }
        else
          toker.ungettok();
      }
      */
    }
    term.iw = iw;
  }

  // after a term: either (transliteration gender?) or (gender transliteration?)
  function parse_optional_gender_translit(term) {
    if (toker.tok.t == 't' && toker.tok.isWhite) {
      // transliteration?
      if ((toker.nexttok.t == 't' && (toker.nexttok.x == '(' || toker.nexttok.x == '['))
        || (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SPAN'
          && (toker.nexttok.n.className == 'IPA' || toker.nexttok.n.className == 'Unicode'))
        || (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SPAN'
          && toker.nexttok.n.className == 'ib-brac')
        || (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'I')) {
        toker.gettok(); // space
        parse_translit_gender(term);
      }
      // gender?
      else if (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SPAN'
        && toker.nexttok.n.className.match(/\b(gend|numb)er\b/)) {
        toker.gettok(); // space
        parse_gender_translit(term);
      }
    }
  }

  // translit gender?
  function parse_translit_gender(term) {
    parse_translit(term);
    // is there a gender after the transliteration?
    if (toker.tok.t == 't' && toker.tok.isWhite) {
      if (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SPAN'
        && toker.nexttok.n.className.match(/\b(gend|numb)er\b/)) {
        toker.gettok(); // space
        parse_gender_num(term);
      }
    }
  }

  // transliterations
  // stores the array of transliteration strings in term.tr
  function parse_translit(term) {
    // closing bracket expected by outer_list; stays null for the ib-brac form
    var rbrac = null;
    if (toker.tok.t == 's' && toker.tok.n.nodeName == 'SPAN'
      && (toker.tok.n.className == 'IPA' || toker.tok.n.className == 'Unicode')) {
      toker.gettok(); // <span class="IPA|Unicode">
      outer_list(term);
      pp_expect_end('SPAN');
    } else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'I') {
      toker.gettok(); // <i>
      outer_list(term);
      pp_expect_end('I');
    } else if (toker.tok.x == '(' || toker.tok.x == '[')
      outer_list(term);
    else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'SPAN'
      && toker.tok.n.className == 'ib-brac') {
      // TODO this won't handle more than one transliteration inside { {ib}}
      toker.gettok(); // <span class="ib-brac">
      pp_expect_text('(');
      pp_expect_end('SPAN');
      pp_expect_start('SPAN'); // class="ib-content"
      // keep the result, as outer_list does; it used to be thrown away here
      term.tr = inner_list(term);
      pp_expect_end('SPAN'); // class="ib-content"
      pp_expect_start('SPAN');
      pp_expect_text(')');
      pp_expect_end('SPAN'); // class="ib-brac"
    } else
      pp_error('transliteration');
    return;

    // => "(" , transliteration { "," , transliteration } , ")"
    function outer_list(term) {
      if (toker.tok.x == '(') rbrac = ')';
      else if (toker.tok.x == '[') rbrac = ']';
      else pp_error('transliteration list start bracket');
      toker.gettok(); // ( or [
      term.tr = inner_list(term);
      pp_expect_text(rbrac);
    }

    // => transliteration { "," , transliteration }
    function inner_list(term) {
      var translits = [];
      var translit = null;
      // => transliteration
      if (translit = parsetranslit())
        translits.push(translit);
      // => { "," , transliteration }
      parsetranslits_rest();
      return translits;

      /////////////////////////
      // lang="XX" + A + translit || A + translit || translit
      function parsetranslit() {
        var translit = null;
        // japanese may wrap kana transliterations in a font tag
        if (toker.tok.t == 's' && toker.tok.n.nodeName == 'SPAN' && toker.tok.n.className.match(/^[A-Z][A-Z]$/)) {
          translit = lang_anchor();
        } else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'A') {
          translit = anchor();
        } else {
          translit = inner();
        }
        return translit;

        function lang_anchor() {
          var translit = null;
          toker.gettok(); // <span lang="JA">
          translit = anchor();
          pp_expect_end('SPAN');
          return translit;
        }

        function anchor() {
          var translit = null;
          toker.gettok(); // <a>
          translit = inner();
          pp_expect_end('A');
          return translit;
        }

        // concatenate text tokens up to a comma, the closing bracket or a tag
        function inner() {
          var translit = '';
          while (true) {
            translit += toker.tok.x;
            toker.gettok(); // transliteration itself
            // some greek transliterations use [] instead of ()
            if (toker.tok.x == ',' || toker.tok.x == rbrac || toker.tok.t != 't')
              break;
          }
          return translit;
        }
      }

      function parsetranslits_rest() {
        var translit = null;
        var dorest = false;
        // comma possibly preceded by a space
        if (toker.tok.t == 't') {
          if (toker.tok.x == ',')
            dorest = true;
          else if (toker.tok.x == ' ') {
            if (toker.nexttok.t == 't' && toker.nexttok.x == ',') {
              toker.gettok(); // eat whitespace
              dorest = true;
            }
          }
          if (dorest) {
            toker.gettok(); // eat , or ;
            pp_tolerate_missing_space();
            // next translit
            translit = parsetranslit();
            if (translit)
              translits.push(translit);
            // recur
            parsetranslits_rest();
          }
        }
      }
    }
  }

  // gender translit?
  function parse_gender_translit(term) {
    parse_gender_num(term);
    // is there a transliteration after the gender?
    // TODO sense numbers after the gender look like transliterations to the parser
    if (toker.tok.t == 't' && toker.tok.isWhite) {
      if ((toker.nexttok.t == 't' && (toker.nexttok.x == '(' || toker.nexttok.x == '['))
        || (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SPAN'
          && (toker.nexttok.n.className == 'IPA' || toker.nexttok.n.className == 'Unicode'))
        || (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SPAN'
          && toker.nexttok.n.className == 'ib-brac')) {
        toker.gettok(); // space
        parse_translit(term);
      }
    }
  }

  // gender(s) and possibly number
  // sets term.g (gender strings concatenated) and, when present, term.n
  function parse_gender_num(term) {
    var gender = null;
    // TODO we handle { {m}} but not yet ''m''
    gender = parse_gender_or_number();
    // plain comma?
    if (toker.tok.t == 's' && toker.tok.n.nodeName == 'I') {
      while (true) {
        pp_expect_start('I');
        pp_expect_text(',');
        pp_expect_end('I');
        toker.gettok(); // space
        gender += parse_gender_or_number();
        if (toker.tok.t != 's' || toker.tok.n.nodeName != 'I')
          break;
      }
      // NOTE(review): a serial-comma span is required after any plain-comma
      // list here - confirm this matches the template output
      toker.gettok(); // <span class="serial-comma">
      pp_expect_start('I');
      pp_expect_text(',');
      pp_expect_end('I');
      pp_expect_end('SPAN');
    }
    // serial comma?
    if (toker.tok.t == 't' && toker.tok.isWhite
      && toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SPAN' && toker.nexttok.n.className == 'serial-and') {
      toker.gettok(); // space
      toker.gettok(); // <span class="serial-and">
      pp_expect_start('I');
      pp_expect_text('and');
      pp_expect_end('I');
      pp_expect_end('SPAN');
      toker.gettok(); // space
      gender += parse_gender_or_number();
    }
    // and number?
    if (toker.tok.t == 't' && toker.tok.isWhite
      && toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SPAN' && toker.nexttok.n.className.match(/\bnumber\b/)) {
      toker.gettok(); // space
      term.n = parse_gender_or_number();
    }
    term.g = gender;

    // <span class="gender|number"><i>m<span>.</span></i></span> => "m"
    function parse_gender_or_number() {
      var gender = null;
      toker.gettok(); // <span class="gender">
      pp_expect_start('I');
      gender = toker.tok.x;
      toker.gettok(); // gender
      pp_expect_start('SPAN');
      pp_expect_text('.');
      pp_expect_end('SPAN');
      pp_expect_end('I');
      pp_expect_end('SPAN');
      return gender;
    }
  }

  // get all pieces of text
  // concatenates consecutive text tokens; the result is both title and display text
  function parseterm_inner() {
    var px = '';
    while (true) {
      px += toker.tok.x;
      toker.gettok(); // term itself
      if (toker.tok.t != 't')
        break;
    }
    return { p: px, x: px };
  }
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Helper functions
// Write data to the browser console when one exists; otherwise do nothing
// (originally written for the Firebug console).
function consolelog(data) {
  if (typeof window.console == 'undefined')
    return;
  console.log(data);
}
// Is this token the start tag of a "script span"?
// Script spans are mostly inserted by templates. Old ones use a class of two
// capital letters (a language code); some newer ones use ISO script names,
// which may or may not be prefixed by 'sc'. Only 'scHebr' and 'Deva' are
// recognised among the newer ones.
// TODO add 'polytonic' here?
function is_script_span(tok) {
  // anything other than a <span> start tag is out
  if (tok.t != 's' || tok.n.nodeName != 'SPAN')
    return false;

  var cls = tok.n.className;
  if (cls.match(/^[A-Z][A-Z]$/))
    return true;
  return cls == 'scHebr' || cls == 'Deva';
}
// Get an unadorned page title from an anchor.
// The link text cannot be used because it may contain optional characters
// such as Hebrew vowels and Latin or Old English macrons; the link target
// holds the real title.
//
//   a - anchor-like object with `search` and `pathname` properties
//
// Returns the decoded title with underscores turned back into spaces.
function title_from_anchor(a) {
  var encoded = null;

  // red link: the title travels in the query string (?title=Foo&action=edit)
  if (a.search) {
    var m = a.search.match(/[?&]title=([^&]*)/);
    if (m)
      encoded = m[1];
  }

  // Blue link: the title is the last path segment. This also covers a query
  // string without a title parameter, which used to yield a garbage slice.
  if (encoded === null)
    encoded = a.pathname.substring(a.pathname.lastIndexOf('/') + 1);

  return decodeURIComponent(encoded).replace(/_/g, ' ');
}
// Add a CSS class to an element which may or may not already have other
// classes. Will not add a class that is already there.
//
//   ele      - object with a className string
//   newclass - single class name to add
function addclass(ele, newclass) {
  var current = ele.className;
  if (!current) {
    ele.className = newclass;
    return;
  }

  // Compare whole class tokens: a \b regex would treat "parsewarn-x" as
  // already containing "parsewarn".
  var names = current.split(/\s+/);
  for (var i = 0; i < names.length; i++) {
    if (names[i] === newclass)
      return;
  }

  // the separating space matters: without it the two names fuse into one
  ele.className = current + ' ' + newclass;
}
// Entry point. Runs only in namespace 0. Once the DOM is ready it loads the
// two helper scripts, parses every translation entry on the page, and then
// looks up each parsed term on its own page.
// The namespace is read through mw.config rather than the legacy
// wgNamespaceNumber global.
if (mw.config.get('wgNamespaceNumber') === 0)
  jQuery(document).ready(function () {
    // start a lookup for every term the parser stored on one element
    function lookup_terms(node, lang, sublang) {
      var terms = node.wiktTerms;
      if (!terms)
        return;
      for (var t = 0; t < terms.length; t++)
        lookup_langs(node, terms[t].a, terms[t].p, lang, sublang);
    }

    jQuery.when(
      jQuery.getScript(mw.util.getUrl('User:Hippietrail/hippajax.js', { action: 'raw', ctype: 'text/javascript', maxage: 86400, smaxage: 86400 })),
      jQuery.getScript(mw.util.getUrl('User:Hippietrail/domtokenizer.js', { action: 'raw', ctype: 'text/javascript', maxage: 86400, smaxage: 86400 }))
    ).then(function () {
      // find all the translation entries in the dom
      var lis = get_trans_listitems();
      if (lis == null)
        return;

      // parse the language name from each translation entry
      // and an array of terms for each entry
      parse_translistitems(lis);

      // look up the other language term for each entry
      for (var i = 0; i < lis.length; i++) {
        lookup_terms(lis[i], lis[i].wiktLang, null /*sublang*/);

        // do sublanguages: the <li> gives the language, each <dd> the sublanguage
        var dds = lis[i].getElementsByTagName('dd');
        for (var k = 0; k < dds.length; k++)
          lookup_terms(dds[k], lis[i].wiktLang, dds[k].wiktLang);
      }
    });
  });