User:Hippietrail/domtokenizer.js

From Wiktionary, the free dictionary
Jump to navigation Jump to search

Note – after saving, you may have to bypass your browser’s cache to see the changes.

  • Mozilla / Firefox / Safari: hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (Command-R on a Macintosh);
  • Konqueror and Chrome: click Reload or press F5;
  • Opera: clear the cache in Tools → Preferences;
  • Internet Explorer: hold Ctrl while clicking Refresh, or press Ctrl-F5.

// needs at least JavaScript 1.7

/**
 * Token stream over the DOM subtree rooted at `startnode`, with one token of
 * lookahead and a single level of pushback.
 *
 * Tokens are pulled from `domgenerator(startnode)`; a null (or undefined)
 * result from its `next()` marks the end of the stream.
 *
 * Public state, refreshed by every `gettok()` / `ungettok()` call:
 *   lasttok - the token before the current one (null after `ungettok()`)
 *   tok     - the current token (null before the first `gettok()` and at the end)
 *   nexttok - the token the next `gettok()` will return
 *
 * @constructor
 * @param {Node} startnode Root node handed to `domgenerator`.
 */
function domtokenizer(startnode) {
  // public
  this.lasttok = null;
  this.tok = null;
  this.nexttok = null;

  // private
  this.ungot = false;     // true while a pushed-back token is waiting in nextnext
  this.nextnext = null;   // lookahead saved by ungettok()
  this.eof = false;       // set once the generator has returned null

  this.domgen = domgenerator(startnode);

  /**
   * Advances by one token.
   * @returns {Object|null} The new current token; null once past the end.
   */
  this.gettok = function() {
    var self = this;

    // slide the window forward
    self.lasttok = self.tok;
    self.tok = self.nexttok;

    if (self.ungot) {
      // restore the lookahead that ungettok() put aside
      self.nexttok = self.nextnext;
      self.ungot = false;
      return self.tok;
    }

    // because we have one token of lookahead we need to be able to go one
    // token past the end, so once eof is set the generator is left alone
    if (!self.eof) {
      var fetched = self.domgen.next();
      self.nexttok = fetched;
      if (fetched == null) {
        self.eof = true;
      }
    }

    return self.tok;
  };

  /**
   * Steps back by one token. Only one level is remembered: the previous
   * `lasttok` is not recoverable and is reset to null.
   */
  this.ungettok = function() {
    var current = this.tok;
    var previous = this.lasttok;

    this.ungot = true;
    this.nextnext = this.nexttok;
    this.nexttok = current;
    this.tok = previous;
    this.lasttok = null;
  };

  // lookahead
  this.gettok();
}

/**
 * Walks the DOM subtree rooted at `startnode` in document order and hands out
 * one token per call to `next()`.
 *
 * Token shapes:
 *   { t: "s", n: node }  - an element (nodeType 1) is entered
 *   { t: "e", n: node }  - that element is left, after all of its descendants
 *   { t: "o", n: node }  - any other non-text node (comments etc)
 *   { t: "t", x: string, isWhite: boolean [, isEOL: true] }
 *                        - one piece of a text node (nodeType 3)
 *
 * A text node is cut into: single punctuation/other characters, runs of
 * whitespace, runs of language-name characters, and runs of Cyrillic, Hebrew,
 * Arabic or CJK characters. `isEOL` is only present on a whitespace run that
 * is exactly "\r", "\n" or "\r\n".
 *
 * This was originally written with JavaScript 1.7 generators (`yield` inside
 * a plain function), which current engines reject as a syntax error. It is
 * now built on standard ES2015 generator functions, but the returned object
 * keeps the old calling convention: `next()` returns the token itself, not a
 * `{ value, done }` record.
 *
 * The walk never leaves the subtree: it stops as soon as it has finished
 * `startnode`, whatever kind of node that is.
 *
 * @param {Node|null} startnode Root of the subtree to tokenize.
 * @returns {{next: function(): (Object|null)}} Token source. `next()` returns
 *   null once the subtree is exhausted, and on every later call.
 */
function domgenerator(startnode) {
  const ELEMENT_NODE = 1;
  const TEXT_NODE = 3;

  // MediaWiki converts some spaces to non-breaking spaces near punctuation
  // This is a feature for the French language and an unexpected surprise for the rest of us!
  const WHITESPACE = /[\r\n \u00A0]/;

  // Characters used in language names
  const NAME_CHAR = /[-'!a-záåâāàăçéêíñõöüũúA-Z]/;

  // Character classes whose members are glued together into one text token.
  // Anything matching none of them (",", ":", digits, ...) is a token by itself.
  const RUN_CLASSES = [isWhitespace, isNameChar, inCyrillic, inHebrew, inArabic, inCJK];

  const tokens = walk();
  let exhausted = false;

  return {
    next: function() {
      if (exhausted) {
        return null;
      }
      const step = tokens.next();
      if (step.done) {
        exhausted = true;
        return null;
      }
      return step.value;
    }
  };

  // Depth-first traversal of the subtree, yielding every token in order.
  function* walk() {
    let node = startnode;

    while (node != null) {
      // EMIT

      if (node.nodeType === ELEMENT_NODE) {
        // tag nodes
        yield { "t": "s", "n": node };
      } else if (node.nodeType === TEXT_NODE) {
        // text nodes
        yield* texttokenizer(node.nodeValue == null ? '' : String(node.nodeValue));
      } else {
        // other nodes (comments etc)
        yield { "t": "o", "n": node };
      }

      //////////////////////////////////////////////

      // WALK

      // child of this tag
      if (node.firstChild) {
        node = node.firstChild;
        continue;
      }

      // close this tag then go to sibling or parent
      while (node != null) {
        if (node.nodeType === ELEMENT_NODE) {
          yield { "t": "e", "n": node };
        }

        // Checked for every node type, not just elements, so that a text,
        // comment or fragment start node cannot leak into its siblings or
        // ancestors.
        if (node === startnode) {
          node = null;
          break;
        }

        if (node.nextSibling) {
          node = node.nextSibling;
          break;
        }

        node = node.parentNode;
      }
    }
  }

  // Splits the content of one text node into "t" tokens.
  // NOTE: works on UTF-16 code units, so a character outside the BMP is seen
  // as two units and each half becomes its own single-character token.
  function* texttokenizer(text) {
    let pos = 0;

    while (pos < text.length) {
      const first = text.charAt(pos);
      const sameClass = RUN_CLASSES.find(function(inClass) {
        return inClass(first);
      });

      // extend the token while the following characters are in the same class
      let end = pos + 1;
      if (sameClass) {
        while (end < text.length && sameClass(text.charAt(end))) {
          end++;
        }
      }

      const piece = text.slice(pos, end);
      const token = { "t": "t", "x": piece, "isWhite": sameClass === isWhitespace };
      if (token.isWhite && (piece === '\r' || piece === '\n' || piece === '\r\n')) {
        token.isEOL = true;
      }

      yield token;
      pos = end;
    }
  }

  function isWhitespace(ch) {
    return WHITESPACE.test(ch);
  }

  function isNameChar(ch) {
    return NAME_CHAR.test(ch);
  }

  function inCyrillic(ch) {
    const code = ch.charCodeAt(0);
    return code >= 0x0400 && code <= 0x04FF;
  }

  function inHebrew(ch) {
    const code = ch.charCodeAt(0);
    return code >= 0x0590 && code <= 0x05FF;
  }

  function inArabic(ch) {
    const code = ch.charCodeAt(0);
    return code >= 0x0600 && code <= 0x06FF;
  }

  // CJKV characters
  function inCJK(ch) {
    const code = ch.charCodeAt(0);
    return (code >= 0x2E80 && code <= 0x303F)
      || (code >= 0x31C0 && code <= 0x31EF)
      || (code >= 0x3200 && code <= 0x4DBF)
      || (code >= 0x4E00 && code <= 0x9FFF)
      || (code >= 0xF900 && code <= 0xFAFF)
      || (code >= 0xFE30 && code <= 0xFE4F);
  }
}