-- Table of public functions; it is returned at the end of this module.
local export = {}
-- Short names of the leading consonants used in Hangul syllable names.
-- Keyed from 0; index 11 is the empty string.
-- Source: http://www.unicode.org/Public/UNIDATA/Jamo.txt
local hangul_leads = {
    [0] = "G",
    "GG", "N", "D", "DD", "R", "M",  --  1 ..  6
    "B", "BB", "S", "SS", "", "J",   --  7 .. 12
    "JJ", "C", "K", "T", "P", "H",   -- 13 .. 18
}
-- Short names of the vowels used in Hangul syllable names, keyed from 0.
local hangul_vowels = {
    [0] = "A",
    "AE", "YA", "YAE", "EO", "E",    --  1 ..  5
    "YEO", "YE", "O", "WA", "WAE",   --  6 .. 10
    "OE", "YO", "U", "WEO", "WE",    -- 11 .. 15
    "WI", "YU", "EU", "YI", "I",     -- 16 .. 20
}
-- Short names of the trailing consonants used in Hangul syllable names.
-- Keyed from 0; index 0 is the empty string.
local hangul_trails = {
    [0] = "",
    "G", "GG", "GS", "N", "NJ", "NH", "D",     --  1 ..  7
    "L", "LG", "LM", "LB", "LS", "LT", "LP",   --  8 .. 14
    "LH", "M", "B", "BS", "S", "SS", "NG",     -- 15 .. 21
    "J", "C", "K", "T", "P", "H",              -- 22 .. 27
}
-- Algorithmic and placeholder names for whole ranges of codepoints.
-- Each entry is { first, last, namer } with inclusive bounds; `namer` is either
-- a format string applied to the codepoint or a function(codepoint) -> name.
-- Entries must be sorted by `first` and must not overlap, because
-- export.lookup_name stops scanning at the first entry that starts above the
-- codepoint it is looking for.
-- NOTE(review): U+007F (DEL) is not covered by the control ranges below;
-- confirm whether that is intentional.
local name_hooks = {
    { 0x00, 0x1f, "<control-%04X>" }, -- C0 control characters
    { 0x80, 0x9f, "<control-%04X>" }, -- C1 control characters
    { 0x3400, 0x4db5, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
    { 0x4e00, 0x9fcc, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
    { 0xac00, 0xd7a3, function (codepoint)
        -- A Hangul syllable name is "HANGUL SYLLABLE " followed by the short
        -- names of its lead, vowel and trail jamo.
        local m_hangul = require('Module:ko-hangul')
        local li, vi, ti = m_hangul.syllableIndex2JamoIndices(
            codepoint - 0xac00
        )
        -- Presumably the returned indices are zero-based, matching the
        -- [0]-keyed jamo tables; verify against Module:ko-hangul.
        return ("HANGUL SYLLABLE %s%s%s"):format(
            hangul_leads[li],
            hangul_vowels[vi],
            hangul_trails[ti]
        )
    end },
    { 0xd800, 0xdb7f, "<surrogate-%04X>" }, -- Non Private Use High Surrogate
    { 0xdb80, 0xdbff, "<surrogate-%04X>" }, -- Private Use High Surrogate
    { 0xdc00, 0xdfff, "<surrogate-%04X>" }, -- Low Surrogate
    { 0xe000, 0xf8ff, "<private-use-%04X>" }, -- Private Use
    { 0x20000, 0x2a6d6, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
    { 0x2a700, 0x2b734, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
    -- Extension D starts at U+2B740. The previous start of 0x2a740 overlapped
    -- Extension C and gave names to the unassigned U+2B735..U+2B73F.
    { 0x2b740, 0x2b81d, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
    { 0x2b820, 0x2ceaf, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
    { 0x2f800, 0x2fa1d, "CJK COMPATIBILITY IDEOGRAPH-%05X" }, -- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
    { 0xf0000, 0xffffd, "<private-use-%05X>" }, -- Plane 15 Private Use
    { 0x100000, 0x10fffd, "<private-use-%06X>" } -- Plane 16 Private Use
}
-- Set of noncharacter codepoints: U+FDD0..U+FDEF plus the last two
-- codepoints (xFFFE and xFFFF) of each of the 17 planes.
local noncharacters = {}
for codepoint = 0xfdd0, 0xfdef do
    noncharacters[codepoint] = true
end
for plane = 0x00, 0x10 do
    local plane_start = plane * 0x10000
    noncharacters[plane_start + 0xfffe] = true
    noncharacters[plane_start + 0xffff] = true
end
-- The name_hooks entry that matched most recently; tried first on the next
-- lookup, before the full scan.
local name_range_cache

-- Returns the name of a codepoint.
-- Tried in order: noncharacter label, range-based names from name_hooks,
-- the per-0x1000 names data page, and finally a "<U-XXXXXX>" placeholder.
function export.lookup_name(codepoint)
    if noncharacters[codepoint] then
        return ("<noncharacter-%.4X>"):format(codepoint)
    end

    -- A hook's third field is either a format string or a naming function.
    local function apply_hook(hook)
        local namer = hook[3]
        if type(namer) == "string" then
            return namer:format(codepoint)
        end
        return namer(codepoint)
    end

    local cached = name_range_cache
    if cached and (codepoint >= cached[1]) and (codepoint <= cached[2]) then
        return apply_hook(cached)
    end

    for _, hook in ipairs(name_hooks) do
        if codepoint < hook[1] then
            -- Hooks are sorted, so no later entry can contain the codepoint.
            break
        elseif codepoint <= hook[2] then
            name_range_cache = hook
            return apply_hook(hook)
        end
    end

    -- pcall: the data page for this 0x1000-wide chunk may fail to load.
    local page = ('Module:Unicode data/names/%03X'):format(
        math.floor(codepoint / 0x1000)
    )
    local success, data = pcall(mw.loadData, page)
    local name = success and data[codepoint]
    if name then
        return name
    end
    return ("<U-%06X>"):format(codepoint)
end
-- Template entry point: returns the name of the codepoint given as the first
-- argument of the #invoke call or, failing that, of the parent template.
-- "<" is escaped as "&lt;" so that placeholder names such as "<control-0000>"
-- are shown literally instead of being parsed as markup.
-- Raises an error when the argument is not a number.
function export.template_lookup_name(frame)
    local raw = frame.args[1] or frame:getParent().args[1]
    local codepoint = tonumber(raw)
    if not codepoint then
        error("template_lookup_name: expected a numeric codepoint, got "
            .. tostring(raw))
    end
    local name = export.lookup_name(codepoint)
    -- The parentheses discard gsub's second result (the replacement count);
    -- Scribunto would otherwise concatenate it onto the output.
    return (name:gsub("<", "&lt;"))
end
-- Names of the Unicode planes, keyed by plane number
-- (math.floor(codepoint / 0x10000), as computed by export.lookup_plane).
-- Planes without an entry are reported by number.
-- The special-purpose and private-use planes are 14, 15 and 16 (0x0E0000,
-- 0x0F0000 and 0x100000); they were previously keyed one too low.
local planes = {
    [ 0] = "Basic Multilingual Plane";
    [ 1] = "Supplementary Multilingual Plane";
    [ 2] = "Supplementary Ideographic Plane";
    [14] = "Supplementary Special-purpose Plane";
    [15] = "Supplementary Private Use Area-A";
    [16] = "Supplementary Private Use Area-B";
}
-- http://www.unicode.org/Public/UNIDATA/Blocks.txt
-- Unicode block ranges, keyed by block name.
-- Each value is { first, last }: the first and last codepoints of the block,
-- both inclusive.
-- Some keys are Arabic translations of the block names and others are still
-- in English; export.lookup_block returns these keys verbatim and
-- export.get_block_range expects them verbatim.
-- This is a hash table, so pairs() visits it in no particular order;
-- export.enum_blocks sorts the entries by start codepoint before iterating.
local blocks = {
["لاتينية أساسية" ] = { 0x000000, 0x00007f };
["Latin-1 Supplement" ] = { 0x000080, 0x0000ff };
["Latin Extended-A" ] = { 0x000100, 0x00017f };
["Latin Extended-B" ] = { 0x000180, 0x00024f };
["IPA Extensions" ] = { 0x000250, 0x0002af };
["Spacing Modifier Letters" ] = { 0x0002b0, 0x0002ff };
["Combining Diacritical Marks" ] = { 0x000300, 0x00036f };
["يونانية وقبطية" ] = { 0x000370, 0x0003ff };
["كيريلية" ] = { 0x000400, 0x0004ff };
["إضافات كيريلية" ] = { 0x000500, 0x00052f };
["أرمنية" ] = { 0x000530, 0x00058f };
["عبرية" ] = { 0x000590, 0x0005ff };
["عربية" ] = { 0x000600, 0x0006ff };
["سريانية" ] = { 0x000700, 0x00074f };
["إضافات عربية" ] = { 0x000750, 0x00077f };
["Thaana" ] = { 0x000780, 0x0007bf };
["نكو" ] = { 0x0007c0, 0x0007ff };
["Samaritan" ] = { 0x000800, 0x00083f };
["مندائية" ] = { 0x000840, 0x00085f };
["Arabic Extended-A" ] = { 0x0008a0, 0x0008ff };
["ديوناكري" ] = { 0x000900, 0x00097f };
["Bengali" ] = { 0x000980, 0x0009ff };
["Gurmukhi" ] = { 0x000a00, 0x000a7f };
["غوجاراتية" ] = { 0x000a80, 0x000aff };
["أوريا" ] = { 0x000b00, 0x000b7f };
["تاميلية" ] = { 0x000b80, 0x000bff };
["Telugu" ] = { 0x000c00, 0x000c7f };
["كنادية" ] = { 0x000c80, 0x000cff };
["Malayalam" ] = { 0x000d00, 0x000d7f };
["سنهالية" ] = { 0x000d80, 0x000dff };
["Thai" ] = { 0x000e00, 0x000e7f };
["لاو" ] = { 0x000e80, 0x000eff };
["Tibetan" ] = { 0x000f00, 0x000fff };
["Myanmar" ] = { 0x001000, 0x00109f };
["Georgian" ] = { 0x0010a0, 0x0010ff };
["هانغل جامو" ] = { 0x001100, 0x0011ff };
["Ethiopic" ] = { 0x001200, 0x00137f };
["Ethiopic Supplement" ] = { 0x001380, 0x00139f };
["شيروكية" ] = { 0x0013a0, 0x0013ff };
["Unified Canadian Aboriginal Syllabics" ] = { 0x001400, 0x00167f };
["Ogham" ] = { 0x001680, 0x00169f };
["Runic" ] = { 0x0016a0, 0x0016ff };
["Tagalog" ] = { 0x001700, 0x00171f };
["Hanunoo" ] = { 0x001720, 0x00173f };
["Buhid" ] = { 0x001740, 0x00175f };
["Tagbanwa" ] = { 0x001760, 0x00177f };
["خميرية" ] = { 0x001780, 0x0017ff };
["منغولية" ] = { 0x001800, 0x0018af };
["Unified Canadian Aboriginal Syllabics Extended" ] = { 0x0018b0, 0x0018ff };
["Limbu" ] = { 0x001900, 0x00194f };
["Tai Le" ] = { 0x001950, 0x00197f };
["New Tai Lue" ] = { 0x001980, 0x0019df };
["Khmer Symbols" ] = { 0x0019e0, 0x0019ff };
["بجينيزية" ] = { 0x001a00, 0x001a1f };
["Tai Tham" ] = { 0x001a20, 0x001aaf };
["Combining Diacritical Marks Extended" ] = { 0x001ab0, 0x001aff };
["بالية" ] = { 0x001b00, 0x001b7f };
["Sundanese" ] = { 0x001b80, 0x001bbf };
["Batak" ] = { 0x001bc0, 0x001bff };
["Lepcha" ] = { 0x001c00, 0x001c4f };
["Ol Chiki" ] = { 0x001c50, 0x001c7f };
["Sundanese Supplement" ] = { 0x001cc0, 0x001ccf };
["Vedic Extensions" ] = { 0x001cd0, 0x001cff };
["Phonetic Extensions" ] = { 0x001d00, 0x001d7f };
["Phonetic Extensions Supplement" ] = { 0x001d80, 0x001dbf };
["Combining Diacritical Marks Supplement" ] = { 0x001dc0, 0x001dff };
["Latin Extended Additional" ] = { 0x001e00, 0x001eff };
["Greek Extended" ] = { 0x001f00, 0x001fff };
["General Punctuation" ] = { 0x002000, 0x00206f };
["Superscripts and Subscripts" ] = { 0x002070, 0x00209f };
["Currency Symbols" ] = { 0x0020a0, 0x0020cf };
["Combining Diacritical Marks for Symbols" ] = { 0x0020d0, 0x0020ff };
["Letterlike Symbols" ] = { 0x002100, 0x00214f };
["أشكال عدد" ] = { 0x002150, 0x00218f };
["Arrows" ] = { 0x002190, 0x0021ff };
["Mathematical Operators" ] = { 0x002200, 0x0022ff };
["Miscellaneous Technical" ] = { 0x002300, 0x0023ff };
["Control Pictures" ] = { 0x002400, 0x00243f };
["Optical Character Recognition" ] = { 0x002440, 0x00245f };
["Enclosed Alphanumerics" ] = { 0x002460, 0x0024ff };
["Box Drawing" ] = { 0x002500, 0x00257f };
["Block Elements" ] = { 0x002580, 0x00259f };
["Geometric Shapes" ] = { 0x0025a0, 0x0025ff };
["Miscellaneous Symbols" ] = { 0x002600, 0x0026ff };
["Dingbats" ] = { 0x002700, 0x0027bf };
["Miscellaneous Mathematical Symbols-A" ] = { 0x0027c0, 0x0027ef };
["Supplemental Arrows-A" ] = { 0x0027f0, 0x0027ff };
["نمط بريل" ] = { 0x002800, 0x0028ff };
["Supplemental Arrows-B" ] = { 0x002900, 0x00297f };
["Miscellaneous Mathematical Symbols-B" ] = { 0x002980, 0x0029ff };
["Supplemental Mathematical Operators" ] = { 0x002a00, 0x002aff };
["Miscellaneous Symbols and Arrows" ] = { 0x002b00, 0x002bff };
["Glagolitic" ] = { 0x002c00, 0x002c5f };
["Latin Extended-C" ] = { 0x002c60, 0x002c7f };
["Coptic" ] = { 0x002c80, 0x002cff };
["إضافات جورجية" ] = { 0x002d00, 0x002d2f };
["Tifinagh" ] = { 0x002d30, 0x002d7f };
["Ethiopic Extended" ] = { 0x002d80, 0x002ddf };
["Cyrillic Extended-A" ] = { 0x002de0, 0x002dff };
["Supplemental Punctuation" ] = { 0x002e00, 0x002e7f };
["CJK Radicals Supplement" ] = { 0x002e80, 0x002eff };
["Kangxi Radicals" ] = { 0x002f00, 0x002fdf };
["Ideographic Description Characters" ] = { 0x002ff0, 0x002fff };
["CJK Symbols and Punctuation" ] = { 0x003000, 0x00303f };
["Hiragana" ] = { 0x003040, 0x00309f };
["Katakana" ] = { 0x0030a0, 0x0030ff };
["Bopomofo" ] = { 0x003100, 0x00312f };
["Hangul Compatibility Jamo" ] = { 0x003130, 0x00318f };
["Kanbun" ] = { 0x003190, 0x00319f };
["Bopomofo Extended" ] = { 0x0031a0, 0x0031bf };
["CJK Strokes" ] = { 0x0031c0, 0x0031ef };
["Katakana Phonetic Extensions" ] = { 0x0031f0, 0x0031ff };
["Enclosed CJK Letters and Months" ] = { 0x003200, 0x0032ff };
["CJK Compatibility" ] = { 0x003300, 0x0033ff };
["CJK Unified Ideographs Extension A" ] = { 0x003400, 0x004dbf };
["Yijing Hexagram Symbols" ] = { 0x004dc0, 0x004dff };
["CJK Unified Ideographs" ] = { 0x004e00, 0x009fff };
["Yi Syllables" ] = { 0x00a000, 0x00a48f };
["Yi Radicals" ] = { 0x00a490, 0x00a4cf };
["Lisu" ] = { 0x00a4d0, 0x00a4ff };
["Vai" ] = { 0x00a500, 0x00a63f };
["Cyrillic Extended-B" ] = { 0x00a640, 0x00a69f };
["Bamum" ] = { 0x00a6a0, 0x00a6ff };
["Modifier Tone Letters" ] = { 0x00a700, 0x00a71f };
["Latin Extended-D" ] = { 0x00a720, 0x00a7ff };
["Syloti Nagri" ] = { 0x00a800, 0x00a82f };
["Common Indic Number Forms" ] = { 0x00a830, 0x00a83f };
["Phags-pa" ] = { 0x00a840, 0x00a87f };
["Saurashtra" ] = { 0x00a880, 0x00a8df };
["Devanagari Extended" ] = { 0x00a8e0, 0x00a8ff };
["Kayah Li" ] = { 0x00a900, 0x00a92f };
["Rejang" ] = { 0x00a930, 0x00a95f };
["Hangul Jamo Extended-A" ] = { 0x00a960, 0x00a97f };
["جاوية" ] = { 0x00a980, 0x00a9df };
["Myanmar Extended-B" ] = { 0x00a9e0, 0x00a9ff };
["Cham" ] = { 0x00aa00, 0x00aa5f };
["Myanmar Extended-A" ] = { 0x00aa60, 0x00aa7f };
["Tai Viet" ] = { 0x00aa80, 0x00aadf };
["Meetei Mayek Extensions" ] = { 0x00aae0, 0x00aaff };
["Ethiopic Extended-A" ] = { 0x00ab00, 0x00ab2f };
["Latin Extended-E" ] = { 0x00ab30, 0x00ab6f };
["Cherokee Supplement" ] = { 0x00ab70, 0x00abbf };
["Meetei Mayek" ] = { 0x00abc0, 0x00abff };
["Hangul Syllables" ] = { 0x00ac00, 0x00d7af };
["Hangul Jamo Extended-B" ] = { 0x00d7b0, 0x00d7ff };
["High Surrogates" ] = { 0x00d800, 0x00db7f };
["High Private Use Surrogates" ] = { 0x00db80, 0x00dbff };
["Low Surrogates" ] = { 0x00dc00, 0x00dfff };
["Private Use Area" ] = { 0x00e000, 0x00f8ff };
["CJK Compatibility Ideographs" ] = { 0x00f900, 0x00faff };
["Alphabetic Presentation Forms" ] = { 0x00fb00, 0x00fb4f };
["Arabic Presentation Forms-A" ] = { 0x00fb50, 0x00fdff };
["Variation Selectors" ] = { 0x00fe00, 0x00fe0f };
["Vertical Forms" ] = { 0x00fe10, 0x00fe1f };
["Combining Half Marks" ] = { 0x00fe20, 0x00fe2f };
["CJK Compatibility Forms" ] = { 0x00fe30, 0x00fe4f };
["Small Form Variants" ] = { 0x00fe50, 0x00fe6f };
["Arabic Presentation Forms-B" ] = { 0x00fe70, 0x00feff };
["Halfwidth and Fullwidth Forms" ] = { 0x00ff00, 0x00ffef };
["خاصة" ] = { 0x00fff0, 0x00ffff };
["Linear B Syllabary" ] = { 0x010000, 0x01007f };
["Linear B Ideograms" ] = { 0x010080, 0x0100ff };
["Aegean Numbers" ] = { 0x010100, 0x01013f };
["Ancient Greek Numbers" ] = { 0x010140, 0x01018f };
["رموز قديمة" ] = { 0x010190, 0x0101cf };
["Phaistos Disc" ] = { 0x0101d0, 0x0101ff };
["Lycian" ] = { 0x010280, 0x01029f };
["Carian" ] = { 0x0102a0, 0x0102df };
["Coptic Epact Numbers" ] = { 0x0102e0, 0x0102ff };
["Old Italic" ] = { 0x010300, 0x01032f };
["قوطية" ] = { 0x010330, 0x01034f };
["Old Permic" ] = { 0x010350, 0x01037f };
["أوغاريتية" ] = { 0x010380, 0x01039f };
["فارسية قديمة" ] = { 0x0103a0, 0x0103df };
["Deseret" ] = { 0x010400, 0x01044f };
["Shavian" ] = { 0x010450, 0x01047f };
["Osmanya" ] = { 0x010480, 0x0104af };
["Elbasan" ] = { 0x010500, 0x01052f };
["Caucasian Albanian" ] = { 0x010530, 0x01056f };
["Linear A" ] = { 0x010600, 0x01077f };
["Cypriot Syllabary" ] = { 0x010800, 0x01083f };
["Imperial Aramaic" ] = { 0x010840, 0x01085f };
["Palmyrene" ] = { 0x010860, 0x01087f };
["Nabataean" ] = { 0x010880, 0x0108af };
["Hatran" ] = { 0x0108e0, 0x0108ff };
["فينيقية" ] = { 0x010900, 0x01091f };
["ليديونية" ] = { 0x010920, 0x01093f };
["Meroitic Hieroglyphs" ] = { 0x010980, 0x01099f };
["Meroitic Cursive" ] = { 0x0109a0, 0x0109ff };
["Kharoshthi" ] = { 0x010a00, 0x010a5f };
["عربية جنوبية قديمة" ] = { 0x010a60, 0x010a7f };
["عربية شمالية قديمة" ] = { 0x010a80, 0x010a9f };
["Manichaean" ] = { 0x010ac0, 0x010aff };
["Avestan" ] = { 0x010b00, 0x010b3f };
["Inscriptional Parthian" ] = { 0x010b40, 0x010b5f };
["Inscriptional Pahlavi" ] = { 0x010b60, 0x010b7f };
["Psalter Pahlavi" ] = { 0x010b80, 0x010baf };
["Old Turkic" ] = { 0x010c00, 0x010c4f };
["مجرية قديمة" ] = { 0x010c80, 0x010cff };
["Rumi Numeral Symbols" ] = { 0x010e60, 0x010e7f };
["Brahmi" ] = { 0x011000, 0x01107f };
["Kaithi" ] = { 0x011080, 0x0110cf };
["Sora Sompeng" ] = { 0x0110d0, 0x0110ff };
["Chakma" ] = { 0x011100, 0x01114f };
["Mahajani" ] = { 0x011150, 0x01117f };
["Sharada" ] = { 0x011180, 0x0111df };
["Sinhala Archaic Numbers" ] = { 0x0111e0, 0x0111ff };
["Khojki" ] = { 0x011200, 0x01124f };
["Multani" ] = { 0x011280, 0x0112af };
["Khudawadi" ] = { 0x0112b0, 0x0112ff };
["Grantha" ] = { 0x011300, 0x01137f };
["Tirhuta" ] = { 0x011480, 0x0114df };
["Siddham" ] = { 0x011580, 0x0115ff };
["Modi" ] = { 0x011600, 0x01165f };
["Takri" ] = { 0x011680, 0x0116cf };
["Ahom" ] = { 0x011700, 0x01173f };
["Warang Citi" ] = { 0x0118a0, 0x0118ff };
["Pau Cin Hau" ] = { 0x011ac0, 0x011aff };
["مسمارية" ] = { 0x012000, 0x0123ff };
["Cuneiform Numbers and Punctuation" ] = { 0x012400, 0x01247f };
["Early Dynastic Cuneiform" ] = { 0x012480, 0x01254f };
["هيرغليفية مصرية" ] = { 0x013000, 0x01342f };
["Anatolian Hieroglyphs" ] = { 0x014400, 0x01467f };
["Bamum Supplement" ] = { 0x016800, 0x016a3f };
["Mro" ] = { 0x016a40, 0x016a6f };
["Bassa Vah" ] = { 0x016ad0, 0x016aff };
["Pahawh Hmong" ] = { 0x016b00, 0x016b8f };
["Miao" ] = { 0x016f00, 0x016f9f };
["Kana Supplement" ] = { 0x01b000, 0x01b0ff };
["Duployan" ] = { 0x01bc00, 0x01bc9f };
["Shorthand Format Controls" ] = { 0x01bca0, 0x01bcaf };
["Byzantine Musical Symbols" ] = { 0x01d000, 0x01d0ff };
["Musical Symbols" ] = { 0x01d100, 0x01d1ff };
["Ancient Greek Musical Notation" ] = { 0x01d200, 0x01d24f };
["Tai Xuan Jing Symbols" ] = { 0x01d300, 0x01d35f };
["Counting Rod Numerals" ] = { 0x01d360, 0x01d37f };
["Mathematical Alphanumeric Symbols" ] = { 0x01d400, 0x01d7ff };
["Sutton SignWriting" ] = { 0x01d800, 0x01daaf };
["Mende Kikakui" ] = { 0x01e800, 0x01e8df };
["Arabic Mathematical Alphabetic Symbols" ] = { 0x01ee00, 0x01eeff };
["Mahjong Tiles" ] = { 0x01f000, 0x01f02f };
["Domino Tiles" ] = { 0x01f030, 0x01f09f };
["Playing Cards" ] = { 0x01f0a0, 0x01f0ff };
["Enclosed Alphanumeric Supplement" ] = { 0x01f100, 0x01f1ff };
["Enclosed Ideographic Supplement" ] = { 0x01f200, 0x01f2ff };
["Miscellaneous Symbols and Pictographs" ] = { 0x01f300, 0x01f5ff };
["تعبيرات" ] = { 0x01f600, 0x01f64f };
["Ornamental Dingbats" ] = { 0x01f650, 0x01f67f };
["رموز نقل وخرائط" ] = { 0x01f680, 0x01f6ff };
["رموز خيميائية" ] = { 0x01f700, 0x01f77f };
["Geometric Shapes Extended" ] = { 0x01f780, 0x01f7ff };
["Supplemental Arrows-C" ] = { 0x01f800, 0x01f8ff };
["Supplemental Symbols and Pictographs" ] = { 0x01f900, 0x01f9ff };
["CJK Unified Ideographs Extension B" ] = { 0x020000, 0x02a6df };
["CJK Unified Ideographs Extension C" ] = { 0x02a700, 0x02b73f };
["CJK Unified Ideographs Extension D" ] = { 0x02b740, 0x02b81f };
["CJK Unified Ideographs Extension E" ] = { 0x02b820, 0x02ceaf };
["CJK Compatibility Ideographs Supplement" ] = { 0x02f800, 0x02fa1f };
["Tags" ] = { 0x0e0000, 0x0e007f };
["Variation Selectors Supplement" ] = { 0x0e0100, 0x0e01ef };
["Supplementary Private Use Area-A" ] = { 0x0f0000, 0x0fffff };
["Supplementary Private Use Area-B" ] = { 0x100000, 0x10ffff };
}
-- Iterator factory over all blocks in ascending order of start codepoint.
-- Usage: for i, name, first, last in export.enum_blocks() do ... end
function export.enum_blocks()
    local sorted = {}
    for block_name, bounds in pairs(blocks) do
        sorted[#sorted + 1] = { block_name, bounds[1], bounds[2] }
    end
    table.sort(sorted, function (left, right)
        return left[2] < right[2]
    end)

    -- Stateless step function in the style of ipairs: takes the list and the
    -- previous index, and yields the next index plus that entry's fields.
    local function step(entries, previous)
        local index = previous + 1
        local entry = entries[index]
        if entry then
            return index, entry[1], entry[2], entry[3]
        end
        return nil
    end

    return step, sorted, 0
end
-- Returns the name of the plane containing the codepoint, or "Plane N" when
-- the planes table has no name for that plane number.
function export.lookup_plane(codepoint)
    local plane = math.floor(codepoint / 0x10000)
    local plane_name = planes[plane]
    if plane_name then
        return plane_name
    end
    return ("Plane %u"):format(plane)
end
-- Returns the name (key in `blocks`) of the block whose inclusive range
-- contains the codepoint; returns nothing when no block matches.
function export.lookup_block(codepoint)
    local block_name, bounds = next(blocks)
    while block_name do
        if (bounds[1] <= codepoint) and (codepoint <= bounds[2]) then
            return block_name
        end
        block_name, bounds = next(blocks, block_name)
    end
end
-- Returns the first and last codepoints of the named block, or nothing when
-- `name` is not a key of `blocks`.
function export.get_block_range(name)
    local bounds = blocks[name]
    if not bounds then
        return
    end
    return bounds[1], bounds[2]
end
-- Individual codepoints that make a page name invalid.
local pagename_forbidden = {
    [0x0023] = true, -- #
    [0x005b] = true, -- [
    [0x005d] = true, -- ]
    [0x007b] = true, -- {
    [0x007c] = true, -- |
    [0x007d] = true, -- }
    [0x180e] = true, -- MONGOLIAN VOWEL SEPARATOR
    [0xfffd] = true, -- REPLACEMENT CHARACTER
}

-- Returns true when every codepoint of `pagename` is printable and allowed,
-- and at least one of them is not a space separator; false otherwise.
function export.is_valid_pagename(pagename)
    local saw_non_space = false
    for cp in mw.ustring.gcodepoint(pagename) do
        -- U+2000..U+200A are rejected as a range, in addition to the
        -- individually listed codepoints.
        if pagename_forbidden[cp] or ((0x2000 <= cp) and (cp <= 0x200a)) then
            return false
        end
        local printable, category = export.is_printable(cp)
        if not printable then
            return false
        end
        if category ~= "space-separator" then
            saw_non_space = true
        end
    end
    return saw_non_space
end
-- Returns the elements of the sequence `what`, starting at index `from`
-- (default 1), as multiple values. The elements are first copied into a fresh
-- table through ipairs and that copy is then unpacked.
local function manual_unpack(what, from)
    local first = from or 1
    local copied, count = {}, 0
    for index, value in ipairs(what) do
        if index >= first then
            count = count + 1
            copied[count] = value
        end
    end
    return unpack(copied)
end
-- Builds a memoising per-codepoint lookup function.
--   loader      function returning (singles, ranges); called once, on the
--               first lookup. `singles` maps codepoint -> value. `ranges` is
--               a sequence of { first, last, ... } entries with inclusive
--               bounds, which must be sorted by `first` because the gap
--               detection below depends on that order.
--   match_func  called as match_func(codepoint, ...) with the value(s) found;
--               its result is the result of the lookup.
--   ...         default value(s) handed to match_func for codepoints that
--               fall outside every range.
-- Returns function(codepoint).
-- Matched ranges and the gaps between ranges are remembered in `cache`, so
-- later lookups nearby skip the scan of `ranges`.
local function memo_lookup(loader, match_func, ...)
    local dots = { ... }
    local cache = {}
    local singles, ranges
    return function (codepoint)
        if not singles then
            singles, ranges = loader()
        end
        if singles[codepoint] then
            return match_func(codepoint, singles[codepoint])
        end
        -- `cache` holds plain tables built here, so the built-in unpack is
        -- used on them.
        for _, range in ipairs(cache) do
            if (range[1] <= codepoint) and (codepoint <= range[2]) then
                return match_func(codepoint, unpack(range, 3))
            end
        end
        -- End of the previous range seen; -1 makes the first gap start at 0.
        local lastlast = -1
        -- ipairs, not pairs: the gap branch is only correct when the ranges
        -- are visited in ascending order, and pairs guarantees no order.
        for _, range in ipairs(ranges) do
            if (range[1] <= codepoint) and (codepoint <= range[2]) then
                table.insert(cache, { manual_unpack(range) })
                return match_func(codepoint, manual_unpack(range, 3))
            elseif codepoint < range[1] then
                -- The codepoint lies in the gap before this range: remember
                -- the whole gap together with the default values.
                table.insert(cache, { lastlast + 1, range[1] - 1, unpack(dots) })
                return match_func(codepoint, unpack(dots))
            else
                lastlast = range[2]
            end
        end
        -- Above the last range: apply the defaults here as well, so these
        -- codepoints are treated like those in gaps between ranges.
        return match_func(codepoint, unpack(dots))
    end
end
-- export.is_combining(codepoint): truthy when the combining data records a
-- non-zero value for the codepoint. The default for uncovered codepoints is 0.
do
    local function load_combining_data()
        local data = mw.loadData('Module:Unicode data/combining')
        return data.single, data.ranges
    end

    local function has_nonzero_class(_, cc)
        if not cc then
            return cc
        end
        return cc ~= 0
    end

    export.is_combining = memo_lookup(load_combining_data, has_nonzero_class, 0)
end
-- lookup_control(codepoint): returns the category string recorded for the
-- codepoint in the control data, or "assigned" when nothing is recorded.
local lookup_control = memo_lookup(
    function ()
        local control_data = mw.loadData('Module:Unicode data/control')
        return control_data.single, control_data.ranges
    end,
    function (_, category)
        if category then
            return category
        end
        return "assigned"
    end,
    "assigned"
)
-- Returns true unless the control data classes the codepoint as "unassigned".
function export.is_assigned(codepoint)
    local category = lookup_control(codepoint)
    return not (category == "unassigned")
end
-- Returns two values: whether the codepoint's category is "assigned" or
-- "space-separator", and the category string itself.
function export.is_printable(codepoint)
    local category = lookup_control(codepoint)
    local printable = (category == "assigned") or (category == "space-separator")
    return printable, category
end
-- Returns two values: whether the codepoint's category is "space-separator",
-- and the category string itself.
function export.is_whitespace(codepoint)
    local category = lookup_control(codepoint)
    local is_space = (category == "space-separator")
    return is_space, category
end
-- to be used in language-neutral context only (e.g. character lists)

-- Script codes that get_script never reports.
local script_blacklist = {
    Latf = true,
    Hans = true,
    Hant = true,
    Kore = true,
    Jpan = true,
}
-- script code -> character-class pattern; built on first use.
local script_pats
-- pattern -> script code, for the patterns that have matched before.
local script_cache = {}

-- Returns the code of a script whose character pattern matches the
-- codepoint, or "Zyyy" when none does.
function export.get_script(codepoint)
    local char = mw.ustring.char(codepoint)

    -- Try the patterns that have already matched something first.
    for pattern, code in pairs(script_cache) do
        if mw.ustring.match(char, pattern) then
            return code
        end
    end

    if not script_pats then
        local script_data = mw.loadData("Module:scripts/data")
        script_pats = {}
        for code, info in pairs(script_data) do
            if info.characters and not script_blacklist[code] then
                script_pats[code] = "[" .. info.characters .. "]"
            end
        end
    end

    for code, pattern in pairs(script_pats) do
        if mw.ustring.match(char, pattern) then
            script_cache[pattern] = code
            return code
        end
    end

    return "Zyyy"
end
-- Entry titles for codepoints that have a dedicated "Unsupported titles/..."
-- page name, keyed by codepoint.
local unsupported_title = {
    [0x0020] = "Unsupported titles/Space",
    [0x0023] = "Unsupported titles/Number sign",           -- #
    [0x002e] = "Unsupported titles/Full stop",             -- .
    [0x003a] = "Unsupported titles/Colon",                 -- :
    [0x003c] = "Unsupported titles/Less than sign",        -- <
    [0x003e] = "Unsupported titles/Greater than sign",     -- >
    [0x005b] = "Unsupported titles/Left square bracket",   -- [
    [0x005d] = "Unsupported titles/Right square bracket",  -- ]
    [0x005f] = "Unsupported titles/Low line",              -- _
    [0x007b] = "Unsupported titles/Left curly bracket",    -- {
    [0x007c] = "Unsupported titles/Vertical line",         -- |
    [0x007d] = "Unsupported titles/Right curly bracket",   -- }
    [0x1680] = "Unsupported titles/Ogham space",
    [0xfffd] = "Unsupported titles/Replacement character",
}
-- Returns the entry title for a codepoint: its "Unsupported titles/..." name
-- when one is listed, the character itself when its category is "assigned",
-- and nil otherwise.
function export.get_entry_title(codepoint)
    local special_title = unsupported_title[codepoint]
    if special_title then
        return special_title
    end
    if lookup_control(codepoint) == "assigned" then
        return mw.ustring.char(codepoint)
    end
    return nil
end
-- Hand the table of public functions to whoever loads this module.
return export