-- Module:ar-translit (وحدة:ar-translit)
-- The documentation page for this module can be created at وحدة:ar-translit/شرح
-- Authors: Benwing, ZxxZxxZ, Atitarev
local export = {}
local rsub = mw.ustring.gsub
local u = mw.ustring.char
local gcodepoint = mw.ustring.gcodepoint
local zwnj = u(0x200c) -- zero-width non-joiner
--local zwj = u(0x200d) -- zero-width joiner
--local lrm = u(0x200e) -- left-to-right mark
--local rlm = u(0x200f) -- right-to-left mark
-- A comment about notation like \216\169: We use this in place of directly
-- encoding diacritics to avoid difficulties with display in the editing window.
-- These are decimal (NOT octal) encodings of the UTF-8 equivalent of the
-- characters, e.g. \216\169 = D8 A9 in UTF-8 = U+0629 = ة = tāʾ marbūṭa.
-- Map from ONE Arabic-script code point to its Latin transliteration.
-- export.tr applies this table with rsub(text, ".", tt), i.e. one code point
-- at a time, so every key has to be exactly one code point long; a key that
-- spells a ligature out as several letters can never be looked up. The two
-- ligature entries are therefore keyed by the presentation-form code points
-- themselves, built with u() so that Unicode normalisation of this source
-- file cannot silently decompose them.
local tt = {
  -- consonants
  ["ب"]="b", ["ت"]="t", ["ث"]="ṯ", ["ج"]="j", ["ح"]="ḥ", ["خ"]="ḵ",
  ["د"]="d", ["ذ"]="ḏ", ["ر"]="r", ["ز"]="z", ["س"]="s", ["ش"]="š",
  ["ص"]="ṣ", ["ض"]="ḍ", ["ط"]="ṭ", ["ظ"]="ẓ", ["ع"]="ʿ", ["غ"]="ḡ",
  ["ف"]="f", ["ق"]="q", ["ك"]="k", ["ل"]="l", ["م"]="m", ["ن"]="n",
  ["ه"]="h",
  -- tāʾ marbūṭa (special) - always after a fatḥa (a), silent at the end of
  -- an utterance, "t" in ʾiḍāfa or with pronounced tanwīn. We catch
  -- most instances of tāʾ marbūṭa before we get to this stage.
  ["\216\169"]="t", -- tāʾ marbūṭa = ة
  -- control characters
  [zwnj]="-", -- ZWNJ (zero-width non-joiner)
  -- [zwj]="", -- ZWJ (zero-width joiner)
  -- rare letters
  ["پ"]="p", ["چ"]="č", ["ڤ"]="v", ["گ"]="g", ["ڨ"]="g", ["ڧ"]="q",
  -- semivowels or long vowels, alif, hamza, special letters
  ["ا"]="ā", -- ʾalif = \216\167
  -- hamzated letters
  ["أ"]="ʾ", -- hamza over alif = \216\163
  ["إ"]="ʾ", -- hamza under alif = \216\165
  ["ؤ"]="ʾ", -- hamza over wāw = \216\164
  ["ئ"]="ʾ", -- hamza over yā = \216\166
  ["ء"]="ʾ", -- hamza on the line = \216\161
  -- long vowels
  ["و"]="w", --"ū" after ḍamma (u) and not before diacritic = \217\136
  ["ي"]="y", --"ī" after kasra (i) and not before diacritic = \217\138
  ["ى"]="ā", -- ʾalif maqṣūra = \217\137
  ["آ"]="ʾā", -- ʾalif madda = \216\162
  ["ٱ"]="", -- hamzatu l-waṣl = \217\177
  ["\217\176"]="ā", -- ʾalif xanjariyya = dagger ʾalif (Koranic diacritic)
  -- short vowels, šadda and sukūn
  ["\217\139"]="an", -- fatḥatan
  ["\217\140"]="un", -- ḍammatan
  ["\217\141"]="in", -- kasratan
  ["\217\142"]="a", -- fatḥa
  ["\217\143"]="u", -- ḍamma
  ["\217\144"]="i", -- kasra
  -- \217\145 = šadda - doubled consonant
  ["\217\146"]="", --sukūn - no vowel
  -- ligatures (single presentation-form code points)
  [u(0xFEFB)]="lā", -- U+FEFB lām-alif ligature, isolated form
  [u(0xFDF2)]="llāh", -- U+FDF2 "Allah" ligature
  -- taṭwīl
  ["ـ"]="", -- taṭwīl, no sound
  -- numerals
  ["١"]="1", ["٢"]="2", ["٣"]="3", ["٤"]="4", ["٥"]="5",
  ["٦"]="6", ["٧"]="7", ["٨"]="8", ["٩"]="9", ["٠"]="0",
  -- punctuation (leave on separate lines)
  ["؟"]="?", -- question mark
  ["،"]=",", -- comma
  ["؛"]=";" -- semicolon
}
-- The "sun letters": consonants to which the l of the article al- assimilates.
local sun_letters = "تثدذرزسشصضطظلن"
-- For use in implementing sun-letter assimilation of ال (al-)
local ttsun1 = {} -- sun letter -> its transliteration (not referenced in the visible part of this file)
local ttsun2 = {} -- "l-" .. sun letter -> transliteration .. "-" .. sun letter
local ttsun3 = {} -- transliterations of the sun letters, in order
for cp in gcodepoint(sun_letters) do
  local ch = u(cp)
  ttsun1[ch] = tt[ch]
  ttsun2["l-" .. ch] = tt[ch] .. "-" .. ch
  table.insert(ttsun3, tt[ch])
end
-- For use in implementing elision of al-
local sun_letters_tr = table.concat(ttsun3, "")
-- This string is spliced into [...] character classes, so every code point in
-- it becomes a member of the class. The last member is the single-code-point
-- "Allah" ligature U+FDF2; it is built with u() because spelling it out as
-- the letters alif-lām-lām-hāʾ would wrongly add plain alif to the
-- consonant classes derived from this string.
local consonants_needing_vowels = "بتثجحخدذرزسشصضطظعغفقكلمنهپچڤگڨڧأإؤئءة" .. u(0xFDF2)
-- consonants on the right side; includes alif madda
local rconsonants = consonants_needing_vowels .. "ويآ"
-- consonants on the left side; does not include alif madda
local lconsonants = consonants_needing_vowels .. "وي"
-- Arabic semicolon, comma, question mark; taṭwīl; period, exclamation point,
-- single quote for bold/italic
local punctuation = "؟،؛" .. "ـ" .. ".!'"
local numbers = "١٢٣٤٥٦٧٨٩٠"
-- Ordered list of {pattern, replacement} pairs. export.tr runs every pair
-- through rsub, in this order, before it checks whether the text is
-- vocalized. The replacement is a string for every entry except the last,
-- whose replacement is the lookup table ttsun2. Order matters: several
-- entries rely on the output of earlier ones (see the individual notes).
local before_diacritic_checking_subs = {
------------ transformations prior to checking for diacritics --------------
-- convert llh for allāh into ll+shadda+dagger-alif+h
{"لله", "للّٰه"},
-- shadda+short-vowel (including tanwīn vowels, i.e. -an -in -un) gets
-- replaced with short-vowel+shadda during NFC normalisation, which
-- MediaWiki does for all Unicode strings; however, it makes the
-- transliteration process inconvenient, so undo it.
{"([\217\139\217\140\217\141\217\142\217\143\217\144\217\176])\217\145", "\217\145%1"},
-- ignore alif jamīla (otiose alif in 3pl verb forms)
-- #1: handle ḍamma + wāw + alif (final -ū)
{"\217\143\217\136\216\167", "\217\143\217\136"},
-- #2: handle wāw + sukūn + alif (final -w in -aw in defective verbs)
-- this must go before the generation of w, which removes the waw here.
{"\217\136\217\146\216\167", "\217\136\217\146"},
-- ignore final alif or alif maqṣūra following fatḥatan (e.g. in accusative
-- singular or words like عَصًا "stick" or هُذًى "guidance"; this is called
-- tanwin nasb)
{"\217\139[\216\167\217\137]", "\217\139"},
-- same but with the fatḥatan placed over the alif or alif maqṣūra
-- instead of over the previous letter (considered a misspelling but
-- common)
{"[\216\167\217\137]\217\139", "\217\139"},
-- tāʾ marbūṭa should always be preceded by fatḥa, alif, alif madda or
-- dagger alif; infer fatḥa if not
{"([^\217\142\216\167\216\162\217\176])\216\169", "%1\217\142\216\169"},
-- similarly for alif between consonants, possibly marked with shadda
-- (does not apply to initial alif, which is silent when not marked with
-- hamza, or final alif, which might be pronounced as -an)
{"([" .. lconsonants .. "]\217\145?)\216\167([" .. rconsonants .. "])",
"%1\217\142\216\167%2"},
-- infer fatḥa in case of non-fatḥa + alif/alif-maqṣūra + dagger alif
{"([^\217\142])([\216\167\217\137]\217\176)", "%1\217\142%2"},
-- infer kasra in case of hamza-under-alif not + kasra
{"\216\165([^\217\144])", "\216\165\217\144%1"},
-- ignore dagger alif placed over regular alif or alif maqṣūra
{"([\216\167\217\137])\217\176", "%1"},
----------- rest of these concern definite article alif-lām ----------
-- NOTE: \217\132 = lām = ل
-- in kasra/ḍamma + alif + lam, make alif into hamzatu l-waṣl, so we
-- handle cases like بِالتَّوْفِيق (bi-t-tawfīq) correctly
{"([\217\143\217\144])\216\167\217\132", "%1\217\177\217\132"},
-- al + consonant + shadda (only recognize word-initially if regular alif): remove shadda
{"^(\216\167\217\142?\217\132[" .. lconsonants .. "])\217\145", "%1"},
{"%s(\216\167\217\142?\217\132[" .. lconsonants .. "])\217\145", " %1"},
{"(\217\177\217\142?\217\132[" .. lconsonants .. "])\217\145", "%1"},
-- handle l- hamzatu l-waṣl or word-initial al-
{"^\216\167\217\142?\217\132", "al-"},
{"%s\216\167\217\142?\217\132", " al-"},
-- next one for bi-t-tawfīq
{"([\217\143\217\144])\217\177\217\142?\217\132", "%1-l-"},
-- next one for remaining hamzatu l-waṣl (at beginning of word)
{"\217\177\217\142?\217\132", "l-"},
-- special casing if the l in al- has a shadda on it (as in الَّذِي "that"),
-- so we don't mistakenly double the dash
{"l%-\217\145", "ll"},
-- implement assimilation of sun letters: the whole match ("l-" + sun letter)
-- is looked up in ttsun2
{"l%-[" .. sun_letters .. "]", ttsun2},
}
-- Transliterate any words or phrases from vocalized Arabic into Latin.
--
-- Parameters:
--   TEXT            Arabic text, or a table with an `args` field (template
--                   call); in the latter case args 1..5 are read as TEXT,
--                   LANG, SC, OMIT_I3RAAB and FORCE_TRANSLATE, with empty
--                   strings treated as nil. GRAY_I3RAAB is not read from args.
--   LANG, SC        accepted but not used anywhere in the body.
--   OMIT_I3RAAB     leave out final short vowels (ʾiʿrāb).
--   GRAY_I3RAAB     when ʾiʿrāb is not omitted, wrap it in gray <span> tags.
--   FORCE_TRANSLATE causes even non-vocalized text to be transliterated
--                   (normally the function checks for non-vocalized text and
--                   returns nil, since such text is ambiguous in
--                   transliteration).
-- Returns the transliteration, or nil when the text fails the has_diacritics
-- check and FORCE_TRANSLATE is not set.
function export.tr(text, lang, sc, omit_i3raab, gray_i3raab, force_translate)
-- make it possible to call this function from a template
if type(text) == "table" then
local function f(x) return (x ~= "") and x or nil end
text, lang, sc, omit_i3raab, force_translate =
f(text.args[1]), f(text.args[2]), f(text.args[3]), f(text.args[4]), f(text.args[5])
end
-- apply the ordered pre-check rewrites (normalisation, al- handling)
for _, sub in ipairs(before_diacritic_checking_subs) do
text = rsub(text, sub[1], sub[2])
end
-- has_diacritics is a global defined further down in this file
if not force_translate and not has_diacritics(text) then
return nil
end
------------ transformations after checking for diacritics --------------
-- Replace plain alif with hamzatu l-waṣl when followed by fatḥa/ḍamma/kasra.
-- Must go after handling of initial al-, which distinguishes alif-fatḥa
-- from alif w/hamzatu l-waṣl. Must go before generation of ū and ī, which
-- eliminate the ḍamma/kasra.
text = rsub(text, "\216\167([\217\142\217\143\217\144])", "\217\177%1")
-- ḍamma + waw not followed by a diacritic is ū, otherwise w
text = rsub(text, "\217\143\217\136([^\217\139\217\140\217\141\217\142\217\143\217\144\217\145\217\146\217\176])", "ū%1")
text = rsub(text, "\217\143\217\136$", "ū")
-- kasra + yaa not followed by a diacritic (or ū from prev step) is ī, otherwise y
text = rsub(text, "\217\144\217\138([^\217\139\217\140\217\141\217\142\217\143\217\144\217\145\217\146\217\176ū])", "ī%1")
text = rsub(text, "\217\144\217\138$", "ī")
-- convert shadda to double letter.
text = rsub(text, "(.)\217\145", "%1%1")
if not omit_i3raab and gray_i3raab then -- show ʾiʿrāb grayed in transliteration
-- decide whether to gray out the t in ة. If word begins with al- or l-, yes.
-- Otherwise, no if word ends in a/i/u, yes if ends in an/in/un.
text = rsub(text, "^(a?l%-[^%s]+)\216\169([\217\139\217\140\217\141\217\142\217\143\217\144])",
'%1<span style="color: #888888">t</span>%2')
text = rsub(text, "(%sa?l%-[^%s]+)\216\169([\217\139\217\140\217\141\217\142\217\143\217\144])",
'%1<span style="color: #888888">t</span>%2')
text = rsub(text, "\216\169([\217\142\217\143\217\144])", "t%1")
text = rsub(text, "\216\169([\217\139\217\140\217\141])",
'<span style="color: #888888">t</span>%1')
-- gray every tanwīn ending
text = rsub(text, ".", {
["\217\139"] = '<span style="color: #888888">an</span>',
["\217\141"] = '<span style="color: #888888">in</span>',
["\217\140"] = '<span style="color: #888888">un</span>'
})
-- gray a short vowel that is directly followed by whitespace
text = rsub(text, "([\217\142\217\143\217\144])%s", {
["\217\142"] = '<span style="color: #888888">a</span> ',
["\217\144"] = '<span style="color: #888888">i</span> ',
["\217\143"] = '<span style="color: #888888">u</span> '
})
-- gray a short vowel at the very end of the text
text = rsub(text, "[\217\142\217\143\217\144]$", {
["\217\142"] = '<span style="color: #888888">a</span>',
["\217\144"] = '<span style="color: #888888">i</span>',
["\217\143"] = '<span style="color: #888888">u</span>'
})
-- merge adjacent gray spans into one
text = rsub(text, '</span><span style="color: #888888">', "")
elseif omit_i3raab then -- omit ʾiʿrāb in transliteration
text = rsub(text, "[\217\139\217\140\217\141]", "")
text = rsub(text, "[\217\142\217\143\217\144]%s", " ")
text = rsub(text, "[\217\142\217\143\217\144]$", "")
end
-- tāʾ marbūṭa should not be rendered by -t if word-final even when
-- ʾiʿrāb (desinential inflection) is shown; instead, use (t) before
-- whitespace, nothing when final; but render final -اة and -آة as -āh,
-- consistent with Wehr's dictionary
text = rsub(text, "([\216\167\216\162])\216\169$", "%1h")
-- Need to do the following after graying or omitting word-final ʾiʿrāb.
text = rsub(text, "\216\169$", "")
if not omit_i3raab then -- show ʾiʿrāb in transliteration
text = rsub(text, "\216\169%s", "(t) ")
else
-- When omitting ʾiʿrāb, show all non-absolutely-final instances of
-- tāʾ marbūṭa as (t), with trailing ʾiʿrāb omitted.
text = rsub(text, "\216\169", "(t)")
end
-- map every remaining character through the main table; characters with no
-- entry in tt are left unchanged
text = rsub(text, ".", tt)
-- fatḥa + alif was rendered "a" + "ā"; collapse to a single long vowel
text = rsub(text, "aā", "ā")
-- Implement elision of al- after a final vowel. We do this
-- conservatively, only handling elision of the definite article rather
-- than elision in other cases of hamzat al-waṣl (e.g. form-I imperatives
-- or form-VII and above verbal nouns) partly because elision in
-- these cases isn't so common in MSA and partly to avoid excessive
-- elision in case of words written with initial bare alif instead of
-- properly with hamzated alif. Possibly we should reconsider.
-- At the very least we currently don't handle elision of الَّذِي (allaḏi)
-- correctly because we special-case it to appear without the hyphen;
-- perhaps we should reconsider that.
text = rsub(text, "([aiuāīū]) a([" .. sun_letters_tr .. "]%-)",
"%1 %2")
if gray_i3raab then
-- same elision when the preceding vowel ends a gray span
text = rsub(text, "([aiuāīū]</span>) a([" .. sun_letters_tr .. "]%-)",
"%1 %2")
end
-- Special-case the transliteration of allāh, without the hyphen
text = rsub(text, "^(a?)l%-lāh", "%1llāh")
text = rsub(text, "(%sa?)l%-lāh", "%1llāh")
return text
end
-- Ordered list of {pattern, replacement} pairs used by has_diacritics. Each
-- pair is applied with rsub, in this order; together they delete everything
-- that is acceptable in vocalized text. has_diacritics treats the text as
-- vocalized exactly when nothing is left afterwards.
local has_diacritics_subs = {
-- FIXME! What about lam-alif ligature?
-- remove punctuation and shadda
-- must go before removing final consonants
{"[" .. punctuation .. "\217\145]", ""},
-- Remove consonants at end of word or utterance, so that we're OK with
-- words lacking iʿrāb (must go before removing other consonants).
-- If you want to catch places without iʿrāb, comment out the next two lines.
{"[" .. lconsonants .. "]$", ""},
{"[" .. lconsonants .. "]%s", " "},
-- remove consonants (or alif) when followed by diacritics
-- must go after removing shadda
-- do not remove the diacritics yet because we need them to handle
-- long-vowel sequences of diacritic + pseudo-consonant
{"[" .. lconsonants .. "\216\167]([\217\139\217\140\217\141\217\142\217\143\217\144\217\146\217\176])", "%1"},
-- the following two must go after removing consonants w/diacritics because
-- we only want to treat vocalic wāw/yā' in them (we want to have removed
-- wāw/yā' followed by a diacritic)
-- remove ḍamma + wāw
{"\217\143\217\136", ""},
-- remove kasra + yā'
{"\217\144\217\138", ""},
-- remove fatḥa/fatḥatan + alif/alif-maqṣūra
{"[\217\139\217\142][\216\167\217\137]", ""},
-- remove diacritics
{"[\217\139\217\140\217\141\217\142\217\143\217\144\217\146\217\176]", ""},
-- remove numbers, hamzatu l-waṣl, alif madda
{"[" .. numbers .. "ٱ" .. "آ" .. "]", ""},
-- remove non-Arabic characters, i.e. everything outside the code point
-- ranges U+0600-06FF, U+0750-077F, U+08A0-08FF, U+FB50-FDFF and U+FE70-FEFF
{"[^" .. u(0x0600) .. "-" .. u(0x06FF) .. u(0x0750) .. "-" .. u(0x077F) ..
u(0x08A0) .. "-" .. u(0x08FF) .. u(0xFB50) .. "-" .. u(0xFDFF) ..
u(0xFE70) .. "-" .. u(0xFEFF) .. "]", ""}
}
-- Report whether TEXT counts as vocalized: run every rule of
-- has_diacritics_subs over it, in order, and return true exactly when the
-- rules have consumed the whole string.
-- Defined as a global (no `local`): export.tr, which appears earlier in the
-- file, resolves this name at call time.
function has_diacritics(text)
  local remaining = text
  for i = 1, #has_diacritics_subs do
    local rule = has_diacritics_subs[i]
    remaining = rsub(remaining, rule[1], rule[2])
  end
  return remaining == ""
end
----------------------------------------------------------------------------
-- Transliterate from Latin to Arabic --
----------------------------------------------------------------------------
--------- Transliterate with unvocalized Arabic to guide ---------
-- Placeholder code points that canonicalize_unvoc substitutes for an alif /
-- alif maqṣūra that is silent next to fatḥatan; canonicalize_arabic turns
-- them back into the real letters.
local silent_alif_subst = u(0xFFF1)
local silent_alif_maqsuura_subst = u(0xFFF2)

-- Latin spellings accepted for hamza, without and with the "nothing written"
-- alternative.
local hamza_match = {
  "ʾ",
  "’",
  "'",
  "`",
}
local hamza_match_or_empty = {
  "ʾ",
  "’",
  "'",
  "`",
  "",
}

-- Overrides used for the first character of a word. Plain alif normally
-- corresponds to nothing there, and a hamza seat may correspond to nothing
-- (hamza omitted at the beginning of the word). The empty alternative must
-- not be allowed mid-word: in a word like سألة "saʾala" the أ would then
-- match nothing right after the Latin "s", and the ʾ could never be matched,
-- so the whole word would fail.
local tt_to_arabic_matching_bow = { -- beginning of word
  ["ا"] = {""},
  ["أ"] = hamza_match_or_empty,
  ["إ"] = hamza_match_or_empty,
  -- ʾalif madda = \216\162
  ["آ"] = {"ʾaā", "’aā", "'aā", "`aā", "aā"},
}

-- Overrides used for the last character of a word. Some ʾiʿrāb endings may
-- be present in the Arabic but absent from the transliteration, so each of
-- these may also match the empty string.
local tt_to_arabic_matching_eow = { -- end of word
  ["\217\140"] = {"un", ""}, -- ḍammatan
  ["\217\142"] = {"a", ""}, -- fatḥa (in plurals)
  ["\217\143"] = {"u", ""}, -- ḍamma (in diptotes)
  ["\217\144"] = {"i", ""}, -- kasra (in duals)
}
-- This table maps Arabic characters to all the Latin characters that might
-- correspond to them. The entries can be a string (equivalent to a one-entry
-- array) or an array of strings. Each string might have multiple characters,
-- to handle things like خ=kh and ث=th.
--
-- export.tr_latin_matching indexes this table with ONE exploded code point
-- at a time, so every key must be a single code point. The ligature entries
-- are therefore keyed by the presentation-form code points (built with u()
-- so source normalisation cannot decompose them); a key that spells the
-- ligature out as separate letters would never be looked up.
-- NOTE(review): the ligature values "lā"/"llāh" are not in the form
-- canonicalize_latin produces ("aā" for long a, shadda for a doubled
-- consonant), so they probably cannot match canonicalized Latin - confirm.
local tt_to_arabic_matching = {
  -- consonants
  ["ب"]="b", ["ت"]="t", ["ث"]={"ṯ","ŧ","θ","th"}, ["ج"]="j",
  -- allow what would normally be capital H, but we lowercase all text
  -- before processing
  ["ح"]={"ḥ","ħ","h"}, ["خ"]={"ḵ","x","kh"},
  ["د"]="d", ["ذ"]={"ḏ","đ","ð","dh"}, ["ر"]="r", ["ز"]="z",
  ["س"]="s", ["ش"]={"š","sh"},
  -- allow non-emphatic to match so we can handle uppercase S, D, T, Z;
  -- we lowercase the text before processing to handle proper names and such
  ["ص"]={"ṣ","sʿ","s"}, ["ض"]={"ḍ","dʿ","d"}, ["ط"]={"ṭ","tʿ","ṫ","t"}, ["ظ"]={"ẓ","ðʿ","đ̣","z"},
  ["ع"]={"ʿ","ʕ","`","‘","ʻ","3"}, ["غ"]={"ḡ","ġ","ğ","gh"},
  ["ف"]="f", ["ق"]="q", ["ك"]="k", ["ل"]="l", ["م"]="m", ["ن"]="n",
  ["ه"]="h",
  ["ة"]={"h","t","(t)",""},
  -- control characters
  [zwnj]={"-",""}, -- ZWNJ (zero-width non-joiner)
  -- [zwj]="", -- ZWJ (zero-width joiner)
  -- rare letters
  ["پ"]="p", ["چ"]={"č","ch"}, ["ڤ"]="v", ["گ"]="g", ["ڨ"]="g", ["ڧ"]="q",
  -- semivowels or long vowels, alif, hamza, special letters
  ["ا"]="ā", -- ʾalif = \216\167
  [silent_alif_subst]="",
  [silent_alif_maqsuura_subst]="",
  -- hamzated letters
  ["أ"]=hamza_match, ["إ"]=hamza_match, ["ؤ"]=hamza_match,
  ["ئ"]=hamza_match, ["ء"]=hamza_match,
  ["و"]={"w","ū"},
  ["ي"]={"y","ī"},
  ["ى"]="ā", -- ʾalif maqṣūra = \217\137
  ["آ"]={"ʾaā","’aā","'aā","`aā"}, -- ʾalif madda = \216\162
  ["ٱ"]="", -- hamzatu l-waṣl = \217\177
  ["\217\176"]="aā", -- ʾalif xanjariyya = dagger ʾalif (Koranic diacritic)
  -- short vowels, šadda and sukūn
  ["\217\139"]="an", -- fatḥatan
  ["\217\140"]="un", -- ḍammatan
  ["\217\141"]="in", -- kasratan
  ["\217\142"]="a", -- fatḥa
  ["\217\143"]="u", -- ḍamma
  ["\217\144"]="i", -- kasra
  ["\217\145"]="\217\145", -- šadda - doubled consonant
  ["\217\146"]="", --sukūn - no vowel
  -- ligatures (single presentation-form code points)
  [u(0xFEFB)]="lā", -- U+FEFB lām-alif ligature, isolated form
  [u(0xFDF2)]="llāh", -- U+FDF2 "Allah" ligature
  -- taṭwīl
  ["ـ"]="", -- taṭwīl, no sound
  -- numerals
  ["١"]="1", ["٢"]="2", ["٣"]="3", ["٤"]="4", ["٥"]="5",
  ["٦"]="6", ["٧"]="7", ["٨"]="8", ["٩"]="9", ["٠"]="0",
  -- punctuation (leave on separate lines)
  ["؟"]="?", -- question mark
  ["،"]=",", -- comma
  ["؛"]=";", -- semicolon
  [" "]=" "
}
-- Latin characters that export.tr_latin_matching may consume without a
-- corresponding character in the unvocalized Arabic, mapped to the Arabic
-- mark inserted into the result for them.
local tt_to_arabic_unmatching = {
  a = "\217\142", -- fatḥa
  u = "\217\143", -- ḍamma
  i = "\217\144", -- kasra
  ["\217\145"] = "\217\145", -- šadda (already converted from a doubled consonant)
}
-- Bring a Latin transliteration into the canonical shape the matching code
-- works with: lowercase, stress accents removed, long vowels written with a
-- macron, digraph separators and hyphens removed, diphthongs written with
-- y/w, and finally every long vowel preceded by its short vowel (ā -> aā).
-- The steps are order-dependent and run exactly in the order written.
function canonicalize_latin(text)
  local macron_of = {a="ā", e="ē", i="ī", o="ō", u="ū"}

  text = mw.ustring.lower(text)

  -- eliminate accents (single-code-point letters)
  local unaccented = {
    ["á"]="a", ["é"]="e", ["í"]="i", ["ó"]="o", ["ú"]="u",
    ["ā́"]="ā", ["ḗ"]="ē", ["ī́"]="ī", ["ṓ"]="ō", ["ū́"]="ū",
  }
  text = rsub(text, ".", unaccented)

  -- some accented macron letters have the accent as a separate Unicode char
  local unaccented_combining = {
    ["ā́"]="ā", ["ḗ"]="ē", ["ī́"]="ī", ["ṓ"]="ō", ["ū́"]="ū",
  }
  text = rsub(text, ".́", unaccented_combining)

  -- doubled vowel = long vowel
  text = rsub(text, "([aeiou])%1", macron_of)
  -- vowel followed by colon = long vowel
  text = rsub(text, "([aeiou])[:ː]", macron_of)

  -- drop the - or ' that separates t-h, t'h, etc. in transliteration styles
  -- that use th to indicate ث
  text = rsub(text, "([dtgkcs])[-']h", "%1h")

  -- plain string rewrites, applied strictly in this order
  local rewrites = {
    {"ūw", "uww"},
    {"īy", "iyy"},
    {"ai", "ay"},
    {"au", "aw"},
    {"āi", "āy"},
    {"āu", "āw"},
    {"[-]", ""}, -- eliminate stray hyphens (e.g. in al-)
  }
  for i = 1, #rewrites do
    text = rsub(text, rewrites[i][1], rewrites[i][2])
  end

  -- add short vowel before long vowel since corresponding Arabic has it
  local with_short_vowel = {
    ["ā"]="aā", ["ē"]="iī", ["ī"]="iī", ["ō"]="uū", ["ū"]="uū",
  }
  text = rsub(text, ".", with_short_vowel)
  return text
end
-- Prepare the (possibly partly vocalized) Arabic guide text for matching
-- against canonicalized Latin. Returns the rewritten string.
function canonicalize_unvoc(unvoc)
  local fathatan = "\217\139"
  local alif = "\216\167"
  local alif_maqsuura = "\217\137"

  -- NFC normalisation, which MediaWiki applies to all Unicode strings, puts
  -- a short vowel (or tanwīn -an/-in/-un, or dagger alif) before a shadda;
  -- matching is easier with the shadda first, so swap them back.
  unvoc = rsub(unvoc,
    "([\217\139\217\140\217\141\217\142\217\143\217\144\217\176])\217\145",
    "\217\145%1")

  -- tāʾ marbūṭa should always be preceded by fatḥa, alif or dagger alif;
  -- insert a fatḥa if it is not. That fatḥa forces a match against an "a" in
  -- the Latin, so tāʾ marbūṭa itself can safely match "h", "t" or "": this
  -- keeps alif + tāʾ marbūṭa working (e.g. اة = ā) and still allows
  -- e.g. رة = ra while disallowing رة = r.
  unvoc = rsub(unvoc, "([^\217\142\216\167\217\176])\216\169",
    "%1\217\142\216\169")

  -- An alif or alif maqṣūra following fatḥatan is silent (e.g. in the
  -- accusative singular or in words like عَصًا "stick" or هُذًى "guidance";
  -- this is called tanwin nasb), so replace it with its silent placeholder.
  unvoc = rsub(unvoc, fathatan .. alif, fathatan .. silent_alif_subst)
  unvoc = rsub(unvoc, fathatan .. alif_maqsuura,
    fathatan .. silent_alif_maqsuura_subst)

  -- Same, for the spelling that puts the fatḥatan on the alif or alif
  -- maqṣūra instead of on the preceding letter (considered a misspelling
  -- but common).
  unvoc = rsub(unvoc, alif .. fathatan, silent_alif_subst .. fathatan)
  unvoc = rsub(unvoc, alif_maqsuura .. fathatan,
    silent_alif_maqsuura_subst .. fathatan)

  return unvoc
end
-- Tidy up generated Arabic: restore the silent-alif placeholders to real
-- letters, put a sukūn between two adjacent consonants, and take that sukūn
-- away again where the first "consonant" is really the wāw/yāʾ of a long
-- vowel (ḍamma + wāw, kasra + yāʾ).
function canonicalize_arabic(text)
  local sukun = "\217\146"
  local damma_waw = "\217\143\217\136"
  local kasra_ya = "\217\144\217\138"

  local result = rsub(text, silent_alif_subst, "ا")
  result = rsub(result, silent_alif_maqsuura_subst, "ى")
  -- add sukūn between adjacent consonants
  result = rsub(result,
    "([" .. lconsonants .. "])([" .. rconsonants .. "])",
    "%1" .. sukun .. "%2")
  -- remove sukūn after ḍamma + wāw
  result = rsub(result, damma_waw .. sukun, damma_waw)
  -- remove sukūn after kasra + yā'
  result = rsub(result, kasra_ya .. sukun, kasra_ya)
  return result
end
-- Transliterate any words or phrases from Latin into Arabic script.
-- UNVOC is the unvocalized equivalent in Arabic. This works by matching the
-- Latin to the unvocalized Arabic and inserting the appropriate diacritics
-- in the right places, so that ambiguities of Latin transliteration can be
-- correctly handled.
--
-- Parameters:
--   TEXT   Latin transliteration, or a table with an `args` field (template
--          call), in which case args 1..3 are read as TEXT, UNVOC and ERR,
--          with empty strings treated as nil.
--   UNVOC  the Arabic text to vocalize.
--   ERR    if true, raise an error describing the first character that
--          cannot be matched; otherwise return false on a failed match.
-- Returns the vocalized Arabic string, or false if the match fails and ERR
-- is not set. An Arabic character that has no entry in any matching table
-- always raises an error, regardless of ERR.
function export.tr_latin_matching(text, unvoc, err)
  -- make it possible to call this function from a template
  if type(text) == "table" then
    local function f(x) return (x ~= "") and x or nil end
    text, unvoc, err = f(text.args[1]), f(text.args[2]), f(text.args[3])
  end

  text = canonicalize_latin(text)
  -- convert double consonant to consonant + shadda
  text = rsub(text, "(.)%1", "%1\217\145")
  unvoc = canonicalize_unvoc(unvoc)

  local ar = {} -- exploded Arabic characters
  local la = {} -- exploded Latin characters
  local res = {} -- result Arabic characters
  for cp in gcodepoint(unvoc) do
    table.insert(ar, u(cp))
  end
  for cp in gcodepoint(text) do
    table.insert(la, u(cp))
  end
  local aind = 1 -- index of next Arabic character
  local alen = #ar
  local lind = 1 -- index of next Latin character
  local llen = #la

  -- Attempt to match the current Arabic character against the current
  -- Latin character(s). If no match, return false; else, increment the
  -- Arabic and Latin pointers over the matched characters, add the Arabic
  -- character to the result characters and return true.
  -- Local on purpose: it closes over this call's state (ar, la, res, aind,
  -- lind) and must not leak into, or be overwritten through, the globals.
  local function match()
    local ac = ar[aind]
    local bow = aind == 1 or ar[aind - 1] == " "
    local eow = aind == alen or ar[aind + 1] == " "
    -- word-boundary overrides take precedence over the general table
    local matches =
      bow and tt_to_arabic_matching_bow[ac] or
      eow and tt_to_arabic_matching_eow[ac] or
      tt_to_arabic_matching[ac]
    if matches == nil then
      error("Encountered non-Arabic (?) character " .. ac ..
        " at index " .. aind)
    end
    if type(matches) == "string" then
      matches = {matches}
    end
    -- try each candidate Latin spelling in order; the first one that fits
    -- at the current Latin position wins
    for _, m in ipairs(matches) do
      local l = lind
      local matched = true
      for cp in gcodepoint(m) do
        if la[l] == u(cp) then
          l = l + 1
        else
          matched = false
          break
        end
      end
      if matched then
        lind = l
        aind = aind + 1
        table.insert(res, ac)
        return true
      end
    end
    return false
  end

  -- Raise the error for a failed match. While Arabic characters remain
  -- (aind <= alen, INCLUDING the last one) the Arabic character is the one
  -- reported. Otherwise the main loop guarantees lind <= llen, so there is
  -- a left-over Latin character to report.
  local function cant_match()
    if aind <= alen then
      error("Unable to match Arabic character " .. ar[aind] ..
        " at index " .. aind)
    else
      error("Unable to match trailing Latin character " .. la[lind] ..
        " at index " .. lind)
    end
  end

  -- Here we go through the unvocalized Arabic letter for letter, matching
  -- up the consonants we encounter with the corresponding Latin consonants
  -- using the table in tt_to_arabic_matching and copying the Arabic
  -- consonants into a destination array. When we don't match, we check for
  -- allowed unmatching Latin characters in tt_to_arabic_unmatching, which
  -- handles short vowels and shadda. If this doesn't match either, and we
  -- have left-over Arabic or Latin characters, we reject the whole match,
  -- either returning false or signaling an error.
  while aind <= alen or lind <= llen do
    local matched = false
    if aind <= alen and match() then
      matched = true
    else
      -- la[lind] is nil once the Latin is exhausted; the lookup then
      -- simply yields nil
      local unmatched = tt_to_arabic_unmatching[la[lind]]
      if unmatched then
        table.insert(res, unmatched)
        lind = lind + 1
        matched = true
      end
    end
    if not matched then
      if err then
        cant_match()
      else
        return false
      end
    end
  end

  local arabic = table.concat(res, "")
  arabic = canonicalize_arabic(arabic)
  return arabic
end
--------- Transliterate directly, without unvocalized Arabic to guide ---------
--------- (NEEDS WORK) ---------
-- Map from ONE Latin character to the Arabic string it becomes when
-- transliterating without an Arabic guide text. export.tr_latin_direct
-- applies it with rsub(text, ".", ...), one character at a time, which is
-- why the multi-character spellings (th, kh, aa, an, ...) appear only as
-- commented-out reminders. The long vowels ā/ū/ī map to the short-vowel mark
-- followed by the letter. "3" maps to ع, so it is left out of the numerals.
local tt_to_arabic_direct = {
-- consonants
["b"]="ب", ["t"]="ت", ["ṯ"]="ث", ["θ"]="ث", -- ["th"]="ث",
["j"]="ج",
["ḥ"]="ح", ["ħ"]="ح", ["ḵ"]="خ", ["x"]="خ", -- ["kh"]="خ",
["d"]="د", ["ḏ"]="ذ", ["ð"]="ذ", ["đ"]="ذ", -- ["dh"]="ذ",
["r"]="ر", ["z"]="ز", ["s"]="س", ["š"]="ش", -- ["sh"]="ش",
["ṣ"]="ص", ["ḍ"]="ض", ["ṭ"]="ط", ["ẓ"]="ظ",
["ʿ"]="ع", ["ʕ"]="ع",
["`"]="ع",
["3"]="ع",
["ḡ"]="غ", ["ġ"]="غ", ["ğ"]="غ", -- ["gh"]="غ",
["f"]="ف", ["q"]="ق", ["k"]="ك", ["l"]="ل", ["m"]="م", ["n"]="ن",
["h"]="ه",
-- ["a"]="ة", ["ah"]="ة"
-- tāʾ marbūṭa (special) - always after a fátḥa (a), silent at the end of
-- an utterance, "t" in ʾiḍāfa or with pronounced tanwīn
-- \216\169 = tāʾ marbūṭa = ة
-- control characters
-- [zwj]="", -- ZWJ (zero-width joiner)
-- rare letters
["p"]="پ", ["č"]="چ", ["v"]="ڤ", ["g"]="گ",
-- semivowels or long vowels, alif, hamza, special letters
["ā"] = "\217\142ا", -- ʾalif = \216\167
-- ["aa"]="\217\142ا", ["a:"]="\217\142ا"
-- hamzated letters
["ʾ"]="ء",
["’"]="ء",
["'"]="ء",
["w"]="و",
["y"]="ي",
["ū"]="\217\143و", -- ["uu"]="\217\143و", ["u:"]="\217\143و"
["ī"]="\217\144ي", -- ["ii"]="\217\144ي", ["i:"]="\217\144ي"
-- ["ā"]="ى", -- ʾalif maqṣūra = \217\137
-- ["an"] = "\217\139" = fatḥatan
-- ["un"] = "\217\140" = ḍammatan
-- ["in"] = "\217\141" = kasratan
["a"] = "\217\142", -- fatḥa
["u"] = "\217\143", -- ḍamma
["i"] = "\217\144", -- kasra
-- \217\145 = šadda - doubled consonant
-- ["\217\146"]="", --sukūn - no vowel
-- ligatures
-- ["لا"]="lā",
-- ["الله"]="llāh",
-- taṭwīl
-- numerals
["1"]="١", ["2"]="٢",-- ["3"]="٣",
["4"]="٤", ["5"]="٥",
["6"]="٦", ["7"]="٧", ["8"]="٨", ["9"]="٩", ["0"]="٠",
-- punctuation (leave on separate lines)
["?"]="؟", -- question mark
[","]="،", -- comma
[";"]="؛" -- semicolon
}
-- Transliterate any words or phrases from Latin into Arabic script.
-- POS, if not nil, is e.g. "noun" or "verb", controlling how to handle
-- final -a.
--
-- FIXME: NEEDS WORK. Works but ignores POS. Doesn't yet generate the correct
-- seat for hamza (need to reuse code in Module:ar-verb to do this). Always
-- transliterates final -a as fatḥa, never as tāʾ marbūṭa (should make use of
-- POS for this). Doesn't (and can't) know about cases where sh, th, etc.
-- stand for single letters rather than combinations.
--
-- Parameters:
--   TEXT  Latin transliteration, or a table with an `args` field (template
--         call), in which case args 1..2 are read as TEXT and POS, with
--         empty strings treated as nil.
--   POS   currently unused.
-- Returns the Arabic string.
function export.tr_latin_direct(text, pos)
  -- make it possible to call this function from a template
  if type(text) == "table" then
    local function f(x) return (x ~= "") and x or nil end
    text, pos = f(text.args[1]), f(text.args[2])
  end

  text = canonicalize_latin(text)
  -- canonicalize_latin writes every long vowel as short + long ("aā", "iī",
  -- "uū"), which is what the guided matcher needs. tt_to_arabic_direct,
  -- however, already emits the short-vowel mark as part of ā/ī/ū, so left
  -- as is every long vowel would receive its diacritic twice. Collapse the
  -- pairs back to the bare long vowel first.
  text = rsub(text, "aā", "ā")
  text = rsub(text, "iī", "ī")
  text = rsub(text, "uū", "ū")
  -- final -ah / -āh is written with tāʾ marbūṭa
  text = rsub(text, "ah$", "\217\142ة")
  text = rsub(text, "āh$", "\217\142اة")
  -- map the remaining characters one by one; characters without an entry
  -- are left unchanged
  text = rsub(text, ".", tt_to_arabic_direct)
  -- convert double consonant to consonant + shadda
  text = rsub(text, "([" .. lconsonants .. "])%1", "%1\217\145")
  text = canonicalize_arabic(text)
  return text
end
return export