User:Erutuon/scripts/UnicodeScriptRecognition.js/documentation
Jump to navigation
Jump to search
Defines a function that returns the official Unicode script property for a code point, like lookup_script
in Module:Unicode data on Wikipedia. Compare to User:Erutuon/scripts/scriptRecognition.js and char_to_script
in Module:Unicode data, which returns the Wiktionary script code.
The data is for Unicode 11.0 and was generated with the following Lua 5.3 scripts (the second requires the LPeg library), placed in the same directory as Scripts.txt and PropertyValueAliases.txt from the Unicode Character Database:
Lua 5.3 scripts
File 1
--- Generates ./data.js from ./Scripts.txt.
-- Reads the code point -> script name assignments in ./Scripts.txt, merges
-- consecutive entries that belong to the same script into ranges, translates
-- each script name to a code using the table returned by ./name_to_code.lua,
-- and writes the ranges, sorted by first code point, as a JavaScript array
-- named `script_ranges`.

local infilehandle = assert(io.open([[./Scripts.txt]], 'rb'))
local script_data = assert(infilehandle:read 'a')
infilehandle:close()

local outfile = [[./data.js]]
local script_name_to_code = dofile [[./name_to_code.lua]]

local script_ranges = {}
local prev_codepoint, prev_script_name, prev_script_range

-- The frontier pattern only lets a match begin right after a newline (or at
-- the very start of the data), so only lines that start with a hexadecimal
-- code point are picked up, e.g. "0041..005A    ; Latin" or "00AA    ; Latin".
for codepoint1, codepoint2, script_name in script_data:gmatch '%f[^\n%z](%x+)%.?%.?(%x*)%s+;%s*([%w_]+)' do
	-- The second capture is empty for single code points; tonumber('', 16)
	-- is nil, so fall back to the first code point as the end of the range.
	codepoint1, codepoint2 = tonumber(codepoint1, 16), tonumber(codepoint2, 16)
	local last_codepoint = codepoint2 or codepoint1

	if prev_script_range and script_name == prev_script_name
		and codepoint1 - prev_codepoint == 1 then
		-- Directly continues the previous entry of the same script: extend it.
		prev_script_range[2] = last_codepoint
	else
		local script_code = script_name_to_code[script_name]
		if not script_code then
			-- Without this check the "%s" below would silently write the
			-- text "nil" into the generated JavaScript.
			error(('no script code found for script name %q (first code point U+%04X)')
				:format(script_name, codepoint1))
		end
		prev_script_range = { codepoint1, last_codepoint, script_code }
		script_ranges[#script_ranges + 1] = prev_script_range
	end

	prev_codepoint, prev_script_name = last_codepoint, script_name
end

-- The entries are merged in file order above; order the result by first
-- code point for the generated array.
table.sort(script_ranges,
	function (range1, range2)
		return range1[1] < range2[1]
	end)

local lines = {}
for i, range in ipairs(script_ranges) do
	lines[i] = ('\t[ 0x%05X, 0x%05X, "%s" ]'):format(range[1], range[2], range[3])
end

local outfilehandle = assert(io.open(outfile, 'w'))
assert(outfilehandle:write(
	'var script_ranges = [\n',
	table.concat(lines, ',\n'),
	'\n];\n'))
assert(outfilehandle:close())
File 2: name_to_code.lua
--- Builds and returns a table mapping script names to four-letter script
-- codes, parsed from the "sc" lines of ./PropertyValueAliases.txt
-- (e.g. a line of the form "sc ; Latn ; Latin" yields ["Latin"] = "Latn").
-- Requires the LPeg library.

local lpeg = require 'lpeg'

local property_value_aliases_filename = "./PropertyValueAliases.txt"
-- Keep the handle so it can be closed once the contents have been read.
local property_value_aliases_file = assert(io.open(property_value_aliases_filename, 'rb'))
local property_value_aliases = assert(property_value_aliases_file:read('a'))
property_value_aliases_file:close()

-- Expose every LPeg function whose name starts with an uppercase letter
-- (P, V, C, R, S, ...) through _ENV so the grammar below can use them
-- unqualified. NOTE: _ENV is the global table here unless the chunk is loaded
-- with a custom environment, so these names become visible to the caller too.
for k, v in pairs(lpeg) do
	local firstletter = k:sub(1, 1)
	if firstletter:upper() == firstletter then
		_ENV[k] = v
	end
end

local script_name_to_code = {}

-- Receives the two captures of a matched "sc" line and records name -> code.
local function add_to_table(code, name)
	script_name_to_code[name] = code
end

local patt = P {
	-- Try to match a script line at every position; otherwise skip one character.
	(V 'script_line' / add_to_table + 1)^1,
	-- A script line must directly follow a newline, so the very first line of
	-- the file can never match. The rest of the line after the name is consumed
	-- up to, but not including, the next newline.
	script_line = V 'nl' * P 'sc' * V 'sep' * C(V 'code') * V 'sep' * C(V 'name') * (P(1) - V 'nl')^0,
	-- One uppercase letter followed by three lowercase letters.
	code = R 'AZ' * V 'lower' * V 'lower' * V 'lower',
	-- Letters and underscores.
	name = R('AZ', 'az', '__')^1,
	lower = R 'az',
	-- A semicolon with optional spaces or tabs on either side.
	sep = V 'w' * P ';' * V 'w',
	w = S ' \t'^0,
	nl = P '\r'^-1 * P '\n'
}

patt:match(property_value_aliases)

-- print(require 'inspect' (script_name_to_code))

return script_name_to_code