Module:scripts/data: difference between revisions

From Wiktionary, the free dictionary
Jump to navigation Jump to search
Content deleted Content added
widen Syrc range
removing "unicode"; looks unused, RFDO discussion seemed to indicate intent to retire it. now waiting if anything breaks
Line 664: Line 664:
m["Ugar"] = {
m["Ugar"] = {
names = { "Ugaritic" },
names = { "Ugaritic" },
}

m["unicode"] = {
names = { "Latin" },
}
}



Revision as of 04:19, 7 May 2014

This module contains definitions and metadata for all script codes on Wiktionary. See Wiktionary:Scripts for more information.

This module must not be imported using require. Instead, it is imported like this:

local m_scripts = mw.loadData("Module:scripts/data")

This ensures that the data is only loaded once per page, rather than once for every module invocation like normal.

To access this data from templates, use Module:script utilities.

See Module:data consistency check to check for errors in this data module.

Required values

Every entry in the table must contain the following properties:

canonicalName
The "canonical" name of the script. This is the name that is used in Wiktionary entries and category names.

Optional values

aliases
A list of aliases/synonyms for the script, other than the canonical name.
varieties
A table of script varieties that are subsumed under the script. This should not in general include those varieties for which separate script codes have been assigned. If a given variety has several names, they can all be listed by including a sublist in the overall list, where the first element is the canonical name that you want the variety to be known by, and the remainder are aliases. For example, the Arabic script entry lists the following under varieties:
	{"Jawi", {"Nastaliq", "Nastaleeq"}}

Here, the Nastaliq variety has an alias "Nastaleeq" specified, while the Jawi variety has no aliases listed. Note that, as here (where Jawi is a particular Arabic script for writing the Malay language while Nastaliq is a calligraphic hand commonly used to write Kashmiri and Urdu), varieties at different levels of specificity can be given in the same list.

otherNames (deprecated)
A table of all non-canonical names that this script is known by, including both synonyms and varieties. This should not be used in new scripts, and existing scripts should have the entries in this list moved into either aliases or varieties.
characters
A Lua character class that matches on any character that belongs to this script. The character classes are the same as those in Lua search/replace patterns, but without the surrounding [ ] (these are implicitly added). They also resemble those found in regular expressions.
In its simplest form, the character class can just be a list of all the characters in the script (for example, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"). But it's easier to describe using character ranges, especially when the script contains many dozens or even hundreds of different characters. Character ranges are given with only the first and last character, and separated by a hyphen -. The set then implicitly includes all characters whose Unicode codepoints are in between the two given characters (for example, "A-Za-z"). This character set is used for automatic script recognition (see findBestScript in Module:scripts) if a language has this script's code in its scripts table.
character_category
Used by Module:character info. If false, prevents {{character info}} from adding a characters category, such as Category:Latin script characters.
direction
Describes the text direction. The default value is "ltr" (horizontal left-to-right), so this can be omitted. Horizontal right-to-left scripts should have "rtl", while vertical scripts should have "vertical-ltr" (vertical left-to-right) or "vertical-rtl" (vertical right-to-left).
systems
The categories of writing system to which the script belongs. See Module:writing_systems/data for a list of supported values. At the moment, this is only used by Module:category_tree/poscatboiler/data/scripts. The system will only be displayed if there is only one category. See Category:Latin script for an example.

-- When adding new scripts to this file, please don't forget to add
-- style definitions for the script in [[MediaWiki:Common.css]].

-- Table of script definitions, keyed by script code. Every entry has a
-- `names` list; some also have a `characters` class string.
local m = {}

m.Afak = { names = {"Afaka"} }

m.Aghb = { names = {"Caucasian Albanian"} }

m.Arab = {
    names = {"Arabic"},
    characters = "؀-ۿݐ-ݿﭐ-﷽ﹰ-ﻼ",
}

-- Prefixed "-Arab" codes: each one supplies names only, no `characters`.
m["fa-Arab"] = { names = {"Arabic"} }
m["kk-Arab"] = { names = {"Arabic"} }
m["ks-Arab"] = { names = {"Arabic"} }
m["ku-Arab"] = { names = {"Arabic"} }
m["mzn-Arab"] = { names = {"Arabic"} }
m["ota-Arab"] = { names = {"Arabic"} }
m["pa-Arab"] = { names = {"Arabic", "Shahmukhi"} }
m["ps-Arab"] = { names = {"Arabic"} }
m["sd-Arab"] = { names = {"Arabic"} }
m["tt-Arab"] = { names = {"Arabic"} }
m["ug-Arab"] = { names = {"Arabic"} }
m["ur-Arab"] = { names = {"Arabic"} }

-- Armi through Brah. Bass is the only entry in this run without a
-- `characters` class.
m.Armi = {
    names = {"Imperial Aramaic"},
    characters = "𐡀-𐡟",
}

m.Armn = {
    names = {"Armenian"},
    characters = "Ա-֏ﬓ-ﬗ",
}

m.Avst = {
    names = {"Avestan"},
    characters = "𐬀-𐬿",
}

m.Bali = {
    names = {"Balinese"},
    characters = "ᬀ-᭼",
}

m.Bamu = {
    names = {"Bamum"},
    characters = "ꚠ-꛷𖠀-𖨸",
}

m.Bass = { names = {"Bassa", "Bassa Vah", "Vah"} }

m.Batk = {
    names = {"Batak"},
    characters = "ᯀ-᯿",
}

m.Beng = {
    names = {"Bengali"},
    characters = "ঁ-৺",
}

m.Bopo = {
    names = {"Zhuyin", "Zhuyin Fuhao", "Bopomofo"},
    characters = "ㄅ-ㄭㆠ-ㆺ",
}

m.Brah = {
    names = {"Brahmi"},
    characters = "𑀀-𑁯",
}

-- Braille script: a single character range.
-- NOTE(review): the first character of the range renders as blank; it is
-- presumably the empty Braille cell (U+2800), not an ASCII space — confirm
-- before retyping this literal.
m["Brai"] = {
    names = { "Braille" },
    characters = "⠀-⣿",
}

-- Bugi through Egyp. Entries written on one line carry names only.
m.Bugi = {
    names = {"Buginese"},
    characters = "ᨀ-᨟",
}

m.Buhd = { names = {"Buhid"} }

m.Cakm = { names = {"Chakma"} }

m.Cans = {
    names = {"Canadian syllabics"},
    characters = "᐀-ᙿ",
}

m.Cari = { names = {"Carian"} }

m.Cham = {
    names = {"Cham"},
    characters = "ꨀ-꩟",
}

m.Cher = {
    names = {"Cherokee"},
    characters = "Ꭰ-Ᏼ",
}

-- This is the separate "Coptic" block, not the unified "Greek and Coptic".
m.Copt = {
    names = {"Coptic"},
    characters = "Ⲁ-⳿",
}

m.Cprt = { names = {"Cypriot"} }

m.Cyrl = {
    names = {"Cyrillic"},
    characters = "Ѐ-џҊ-ԧꚀ-ꚗ",
}

m.Cyrs = {
    names = {"Old Cyrillic"},
    characters = "Ѐ-ԧꙀ-ꚗ",
}

m.Deva = {
    names = {"Devanagari"},
    characters = "ऀ-ॿ꣠-ꣻ",
}

m.Dsrt = {
    names = {"Deseret"},
    characters = "𐐀-𐑏",
}

m.Dupl = { names = {"Duployan"} }

m.Egyd = { names = {"Demotic"} }

m.Egyp = {
    names = {"Egyptian hieroglyphic"},
    characters = "𓀀-𓐮",
}

-- Elba through Hang. Entries written on one line carry names only.
m.Elba = { names = {"Elbasan"} }

m.Ethi = {
    names = {"Ethiopic", "Ge'ez"},
    characters = "ሀ-᎙ⶀ-ⷞꬁ-ꬮ",
}

-- Ⴀ-Ⴭ is Asomtavruli, ⴀ-ⴭ is Nuskhuri.
m.Geok = {
    names = {"Nuskhuri", "Khutsuri", "Asomtavruli"},
    characters = "Ⴀ-Ⴭⴀ-ⴭ",
}

-- Technically only the range [ა-ჿ] is Mkhedruli.
m.Geor = {
    names = {"Georgian", "Mkhedruli"},
    characters = "Ⴀ-ჼ",
}

m.Glag = {
    names = {"Glagolitic"},
    characters = "Ⰰ-ⱞ",
}

m.Goth = {
    names = {"Gothic"},
    characters = "𐌰-𐍊",
}

-- Not in Unicode, so no `characters` class.
m.Gran = { names = {"Grantha"} }

m.Grek = {
    names = {"Greek"},
    characters = "ʹ-Ͽ",
}

-- Must come after Grek: its class is its own range followed by Grek's string.
m.polytonic = {
    names = {"Greek"},
    characters = "ἀ-῾" .. m.Grek.characters,
}

m.Gujr = {
    names = {"Gujarati"},
    characters = "ઁ-૱",
}

m.Guru = {
    names = {"Gurmukhi"},
    characters = "ਁ-ੵ",
}

m.Hang = {
    names = {"Hangul"},
    characters = "가-힣",
}

-- Han script. The class is several ranges written back to back.
-- m["Jpan"] and m["Kore"] concatenate this string into their own classes.
-- NOTE(review): the range that ends in 〿 begins with a space-like character,
-- presumably the ideographic space (U+3000), and the final range appears to
-- use fullwidth/halfwidth forms rather than ASCII "!" — confirm the exact
-- code points before retyping this literal.
m["Hani"] = {
    names = { "Han" },
    characters = "一-鿌㐀-䶵 -〿𠀀-𫠝!-○",
}

-- Hans and Hant carry names only; Hira and Kana also define a class.
m.Hans = { names = {"Simplified Han"} }

m.Hant = { names = {"Traditional Han"} }

m.Hira = {
    names = {"Hiragana"},
    characters = "ぁ-ゟ",
}

m.Kana = {
    names = {"Katakana"},
    characters = "゠-ヿㇰ-ㇿ",
}

-- Composite scripts. These must be defined after the scripts they are
-- composed of, because each `characters` string is built by concatenating
-- the strings already stored on those entries.

m["Jpan"] = {
    names = { "Japanese" },
    characters = m["Hira"].characters .. m["Kana"].characters .. m["Hani"].characters,
}

-- m["Hani"].characters already ends with the "!-○" range, so appending that
-- range a second time here would add nothing to the class.
m["Kore"] = {
    names = { "Korean" },
    characters = m["Hang"].characters .. m["Hani"].characters,
}

m["CGK"] = {
    names = { "Korean" },
}

-- Hano through Khar. Entries written on one line carry names only.
m.Hano = { names = {"Hanunoo"} }

m.Hebr = {
    names = {"Hebrew"},
    characters = "א-ת",
}

m.Hmng = { names = {"Hmong", "Pahawh Hmong"} }

m.Ibrn = { names = {"Iberian"} }

m.Inds = { names = {"Indus", "Harappan", "Indus Valley"} }

m.IPAchar = { names = {"International Phonetic Alphabet"} }

m.Ital = {
    names = {"Old Italic"},
    characters = "𐌀-𐌣",
}

m.Java = {
    names = {"Javanese"},
    characters = "ꦀ-꧟",
}

m.Jurc = { names = {"Jurchen"} }

m.Kali = {
    names = {"Kayah Li"},
    characters = "꤀-꤯",
}

m.Khar = {
    names = {"Kharoshthi"},
    characters = "𐨀-𐩘",
}

-- Khmr through pjt-Latn. Entries written on one line carry names only.
m.Khmr = {
    names = {"Khmer"},
    characters = "ក-៹",
}

m.Knda = {
    names = {"Kannada"},
    characters = "ಂ-ೲ",
}

m.Kthi = {
    names = {"Kaithi"},
    characters = "𑂀-𑃁",
}

m.Lana = { names = {"Lanna"} }

m.Laoo = {
    names = {"Lao"},
    characters = "ກ-ໝ",
}

m.Latf = { names = {"Fraktur", "Blackletter"} }

m.Latinx = { names = {"Latin"} }

m.Latn = {
    names = {"Latin", "Roman"},
    characters = "0-z¡-ɏḀ-ỿ",
}

m["nv-Latn"] = { names = {"Latin"} }

m["pjt-Latn"] = { names = {"Latin"} }

-- Lepc through Mani. Entries written on one line carry names only.
m.Lepc = { names = {"Lepcha"} }

m.Limb = { names = {"Limbu"} }

m.Lina = { names = {"Linear A"} }

m.Linb = {
    names = {"Linear B"},
    characters = "𐀀-𐃺",
}

m.Lisu = {
    names = {"Lisu", "Fraser"},
    characters = "ꓐ-꓿",
}

m.Lyci = { names = {"Lycian"} }

m.Lydi = { names = {"Lydian"} }

m.Mand = { names = {"Mandaic"} }

-- Not in Unicode; the class is the same string as m.Latn's.
m.Mani = {
    names = {"Manichaean"},
    characters = m.Latn.characters,
}

-- Maya through None. Entries written on one line carry names only.
m.Maya = { names = {"Maya", "Maya hieroglyphic", "Mayan", "Mayan hieroglyphic"} }

m.Mend = { names = {"Mende", "Mende Kikakui"} }

m.Merc = {
    names = {"Meroitic cursive"},
    characters = "𐦠-𐦿",
}

m.Mero = {
    names = {"Meroitic hieroglyphic"},
    characters = "𐦀-𐦟",
}

m.Mlym = {
    names = {"Malayalam"},
    characters = "ം-ൿ",
}

m.Mong = {
    names = {"Mongolian"},
    characters = "᠀-ᢪ",
}

m.Mtei = {
    names = {"Meitei Mayek"},
    characters = "ꯀ-꯿ꫠ-꫿",
}

m.musical = {
    names = {"musical notation"},
    characters = "𝄀-𝇝",
}

m.Mymr = {
    names = {"Burmese"},
    characters = "က-ၙ",
}

m.Nkoo = { names = {"N'Ko"} }

-- This should not have any characters listed.
m.None = { names = {"No script specified"} }

-- Ogham script: a single character range.
-- NOTE(review): the range starts with a space-like character, presumably the
-- Ogham space mark (U+1680) rather than an ASCII space — confirm before
-- retyping this literal.
m["Ogam"] = {
    names = { "Ogham" },
    characters = " -᚜",
}

-- Olck through Runr. Entries written on one line carry names only.
m.Olck = { names = {"Ol Chiki"} }

m.Orkh = {
    names = {"Orkhon runes"},
    characters = "𐰀-𐱈",
}

m.Orya = {
    names = {"Oriya"},
    characters = "ଁ-୷",
}

m.Osma = { names = {"Osmanya"} }

m.Phag = {
    names = {"Phags-pa"},
    characters = "ꡀ-꡷",
}

m.Phli = {
    names = {"Inscriptional Pahlavi"},
    characters = "𐭠-𐭿",
}

-- Not in Unicode; the class is the same string as m.Latn's.
m.Phlp = {
    names = {"Psalter Pahlavi"},
    characters = m.Latn.characters,
}

-- Not in Unicode; the class is the same string as m.Latn's.
m.Phlv = {
    names = {"Book Pahlavi"},
    characters = m.Latn.characters,
}

m.Phnx = {
    names = {"Phoenician"},
    characters = "𐤀-𐤟",
}

m.Plrd = {
    names = {"Pollard"},
    characters = "𖼀-𖾟",
}

m.Prti = {
    names = {"Parthian"},
    characters = "𐭀-𐭟",
}

m.Rjng = {
    names = {"Rejang"},
    characters = "ꤰ-꥟",
}

m.Ruminumerals = {
    names = {"Rumi numerals"},
    characters = "𐹠-𐹾",
}

m.Runr = {
    names = {"Runic"},
    characters = "ᚠ-ᛰ",
}

-- Samr through Syrc. Entries written on one line carry names only.
m.Samr = { names = {"Samaritan"} }

m.Sarb = { names = {"Old South Arabian"} }

m.Saur = {
    names = {"Saurashtra"},
    characters = "ꢀ-꣙",
}

m.Sgnw = { names = {"SignWriting"} }

m.Shaw = {
    names = {"Shavian"},
    characters = "𐑐-𐑿",
}

m.Shrd = {
    names = {"Sharada"},
    characters = "𑆀-𑇙",
}

m.Sinh = {
    names = {"Sinhalese"},
    characters = "ං-෴",
}

m.Sora = { names = {"Sorang Sompeng", "Sora Sompeng"} }

m.Sund = {
    names = {"Sundanese"},
    characters = "ᮀ-ᮿ",
}

m.Sylo = { names = {"Syloti Nagri", "Sylheti Nagari"} }

m.Syrc = {
    names = {"Syriac"},
    characters = "܀-ݏ",
}

-- Tagb through Thai. Entries written on one line carry names only.
m.Tagb = { names = {"Tagbanwa"} }

m.Tale = { names = {"Tai Le"} }

m.Talu = { names = {"New Tai Lue"} }

m.Taml = {
    names = {"Tamil"},
    characters = "ஂ-௺",
}

m.Tang = { names = {"Tangut"} }

m.Tavt = { names = {"Tai Viet"} }

m.Telu = {
    names = {"Telugu"},
    characters = "ఁ-౿",
}

m.Teng = { names = {"Tengwar"} }

m.Tfng = {
    names = {"Tifinagh"},
    characters = "ⴰ-⵿",
}

m.Tglg = {
    names = {"Tagalog"},
    characters = "ᜀ-᜔",
}

m.Thaa = {
    names = {"Thaana"},
    characters = "ހ-ޱ",
}

m.Thai = {
    names = {"Thai"},
    characters = "ก-ฺ",
}

-- Tibt through Zmth. Entries written on one line carry names only.
m.Tibt = {
    names = {"Tibetan"},
    characters = "ༀ-࿚",
}

m["xzh-Tibt"] = { names = {"Zhang-Zhung"} }

m.Ugar = { names = {"Ugaritic"} }

m.Vaii = { names = {"Vai"} }

m.Xpeo = {
    names = {"Old Persian"},
    characters = "𐎠-𐏕",
}

m.Xsux = {
    names = {"Cuneiform", "Sumero-Akkadian Cuneiform"},
    characters = "𒀀-𒍮𒐀-𒑳",
}

m.Yiii = {
    names = {"Yi"},
    characters = "ꀀ-꓆",
}

m.Zmth = {
    names = {"mathematical notation"},
    characters = "ℵ∀-⋿⟀-⟯⦀-⧿⨀-⫿𝐀-𝟿",
}

-- Symbols. The class is three ranges written back to back: ─-➿, ←-⇿ and ⌀-⏳.
-- The hyphen inside ←-⇿ is required: without it the class would match only
-- the two endpoint characters instead of everything between them.
m["Zsym"] = {
    names = { "symbol" },
    characters = "─-➿←-⇿⌀-⏳",
}

-- Placeholder codes; neither entry defines a `characters` class.

-- This should not have any characters listed, probably.
m.Zyyy = { names = {"undetermined"} }

-- This should not have any characters listed.
m.Zzzz = { names = {"uncoded"} }

-- Hand the finished table back to whoever loads this module.
return m