User:Tbot/code/script

From Wiktionary, the free dictionary
Jump to navigation Jump to search



#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Tbot/code/script


"""
Given a word and a language code, return the name of the script template to use
for that word on the English Wiktionary (en.wikt).

"""

# Script lookup table.  Every entry is a triple
#     (lowest character code point, highest code point + 1, ISO script code)
# so each range is half-open:  low <= code point < high.

Scs = [
    (0x0370, 0x0400, 'Grek'),
    (0x0400, 0x0530, 'Cyrl'),
    (0x0530, 0x0590, 'Armn'),
    (0x0590, 0x0600, 'Hebr'),
    (0x0600, 0x0700, 'Arab'),
    (0x0700, 0x0750, 'Syrc'),
    (0x0750, 0x0780, 'Arab'),
    (0x0900, 0x0980, 'Deva'),
    (0x0980, 0x0A00, 'Beng'),
    (0x0A00, 0x0A80, 'Guru'),
    (0x0A80, 0x0B00, 'Gujr'),
    (0x0B00, 0x0B80, 'Orya'),
    (0x0B80, 0x0C00, 'Taml'),
    (0x0C00, 0x0C80, 'Telu'),
    (0x0C80, 0x0D00, 'Knda'),
    (0x0D00, 0x0D80, 'Mlym'),
    (0x0D80, 0x0E00, 'Sinh'),
    (0x0E00, 0x0E80, 'Thai'),
    (0x0E80, 0x0F00, 'Laoo'),
    (0x0F00, 0x1000, 'Tibt'),
    (0x1000, 0x10A0, 'Mymr'),
    (0x10A0, 0x1100, 'Geor'),
    (0x1100, 0x1200, 'Hang'),  # jamo
    (0x1200, 0x13A0, 'Ethi'),
    (0x13A0, 0x1400, 'Cher'),
    (0x1400, 0x1680, 'Cans'),
    (0x3040, 0x3100, 'Jpan'),
    (0x3400, 0xA000, 'Hani'),  # Han Ext A and Unified
    (0xAC00, 0xD800, 'Hang'),
    # Han Ext B, mostly archaic so assume traditional
    (0x20000, 0x2A6D7, 'Hant'),
]


# Language-specific overrides: a '<language code>-<script code>' combination
# that has its own particular template maps to that template's name.

Lsp = {
    'fa-Arab': 'fa-Arab',
    'ur-Arab': 'ur-Arab',
    'pa-Arab': 'pa-Arab',
    'ku-Arab': 'ku-Arab',
    'grc-Grek': 'polytonic',
    'ja-Hani': 'Jpan',
    'ja-Hant': 'Jpan',
}
# need some more ...

# all recognized script templates, including redirects, which we do not canonicalize
Scripts = set([
    # older "XXchar"-style templates and other non-ISO names
    'ARchar', 'KUchar', 'FAchar', 'THchar', 'URchar', 'RUchar', 'JAchar',
    'polytonic', 'Hayeren',
    # language-specific Arabic-script templates
    'fa-Arab', 'ur-Arab', 'pa-Arab', 'ku-Arab',
    # ISO script codes ('Syrc' was previously misspelled 'Sryc')
    'Arab', 'Hebr', 'Beng', 'Hant', 'Hani', 'Jpan', 'Grek',
    'Cyrl', 'Deva', 'Syrc', 'Hang',
])

# make sure every script code present in the range table is recognized as well
for low, high, scode in Scs:
    Scripts.add(scode)

def script(word, lc, report = False):
    """
    Return the script template name for word in language lc, or '' if none.

    Only the first character of word is examined.  Its code point is looked
    up in the Scs range table; if the combination '<lc>-<script>' has an
    entry in Lsp, that language-specific template name is returned instead
    of the plain script code.

    word    the word to classify (only its first character matters)
    lc      language code, used to build the '<lc>-<script>' key for Lsp
    report  if true, print a diagnostic when the character is above 0x0370
            and falls in none of the Scs ranges

    Returns '' when word is empty, when it starts with an incomplete or
    malformed UTF-16 surrogate pair, or when its first character is outside
    every range in Scs.
    """

    if not word:
        return ''

    a = ord(word[0:1])
    if 0xd800 <= a < 0xdc00:
        # High surrogate: the character was delivered as two UTF-16 code
        # units, so combine them into the real code point.
        if len(word) < 2:
            return ''
        b = ord(word[1:2])
        if not 0xdc00 <= b < 0xe000:
            # Not followed by a low surrogate.  Running the arithmetic below
            # anyway would yield a bogus code point, and possibly a bogus
            # script, so treat the word as unclassifiable.
            return ''
        a = (a - 0xd800) * 1024 + (b - 0xdc00) + 0x10000

    sc = ''
    for low, high, scode in Scs:
        if low <= a < high:
            sc = scode
            break

    if sc:
        # prefer the language-specific template when this pair has one
        sc = Lsp.get(lc + '-' + sc, sc)

    if report and not sc and a > 0x0370:
        # single parenthesized argument: same output under Python 2 and 3
        print("no match for script for char code %x" % a)

    return sc

def scriptp(sc):
    """Predicate: return True if sc is a member of Scripts, otherwise False."""

    return sc in Scripts