From 5433e2f89f2a8ba7bad8df77e9c8e0fdb48ab41d Mon Sep 17 00:00:00 2001 From: Philipp Gesang Date: Thu, 4 Mar 2010 21:57:25 +0100 Subject: Can we call this ready? --- .../third/transliterator/transliterator.tex | 171 ++++++++++++++++----- 1 file changed, 134 insertions(+), 37 deletions(-) (limited to 'doc/context') diff --git a/doc/context/third/transliterator/transliterator.tex b/doc/context/third/transliterator/transliterator.tex index 1a04a5f..10e9ad1 100644 --- a/doc/context/third/transliterator/transliterator.tex +++ b/doc/context/third/transliterator/transliterator.tex @@ -6,8 +6,6 @@ \setupindenting[yes,next,medium] -\setuppagenumbering[location=left] - \setuphead[chapter][style={\rm\bf\tfb},before={\blank[big,force]},after={\blank[2*big,force]}] \setuphead[section][style={\rm\bf\tfa},before={\blank[big]},after={\blank[small]}] \setuphead[subsection][style={\rm\bf},before={\blank[medium]},after={\blank[small]}] @@ -23,8 +21,9 @@ \definefontsynonym [SerifItalic] [name:Bukyvede-Italic] [features=default] \stoptypescript \usetypescript [bukyvede] -\definetypeface [hlaholice] [rm] [serif] [bukyvede] [default] [encoding=ec] -\definetypeface [cyrilice] [rm] [serif] [bukyvede] [default] [encoding=ec] +\definetypeface [hlaholice] [rm] [serif] [bukyvede] [default] [encoding=ec] +\definetypeface [cyrilice] [rm] [serif] [bukyvede] [default] [encoding=ec] +\definetypeface [lmstd] [rm] [serif] [latin-modern] [default] [encoding=texnansi] \usetypefile[cmu] \usetypescript[computer-modern-unicode] @@ -175,6 +174,8 @@ width=.47\textwidth,% ]{% \setupbodyfont[#4]% + %\setuptolerance[verytolerant, stretch] + \setuptolerance[verytolerant] \unskip\language[#2]#6\par }% \framed[% @@ -189,17 +190,49 @@ } } +\defineframedtext[CenteredText][width=fit,frame=off,align=middle] + \usemodule[int-load] \loadsetups[t-transliterator.xml] -\setupwhitespace[big] +\setupwhitespace[medium] \language[en] + \starttext + +\setuppagenumbering[state=stop] + +\blank[3cm,force] + + 
+\placefigure[force][none]{none}{ +\startCenteredText + +{\setupbodyfont[lmstd,19pt] +{\em The} +\blank [2*big] +{\tfc\sc Transliterator} +\blank [2*big] +{\em for \CONTEXT} +\blank [5*big] +{\tfc\sc Manual} +} +\stopCenteredText +} +\page + +\setuppagenumbering[% + location=middle, + state=start, + style=\tfc +] + +\setuppagenumber[number=1] \completecontent \chapter{Usage and Functionality} \section{Overview} -Basically the Transliterator provides two commands: \type{\setupTranslit} +The Transliterator provides two commands: \type{\setupTranslit} preferably goes into the preamble and allows for global configuration. The Transliterator is invoked locally by \type{\transliterate} which does the actual transliteration of text passages. @@ -235,8 +268,8 @@ By the time of this writing this can be one of the following set: \bTR\bTC \type{ru_transcript_de} \eTC\bTC German transcription for Russian \eTC\eTR \bTR\bTC \type{ru_transcript_en} \eTC\bTC English transcription for Russian \eTC\eTR \bTR\bTC \type{iso9_ocs} \eTC\bTC == \type{all} plus non-ISO additions for Old (Church) Slavonic \eTC\eTR - \bTR\bTC \type{ocs} \eTC\bTC so-called “scientific” transliteration for Old (Church) Slavonic\eTC\eTR - \bTR\bTC \type{ocs_gla} \eTC\bTC so-called “scientific” transliteration for Old (Church) Slavonic / Glagolitic alphabet\eTC\eTR + \bTR\bTC \type{ocs} \eTC\bTC “scientific” transliteration for Old (Church) Slavonic\eTC\eTR + \bTR\bTC \type{ocs_gla} \eTC\bTC “scientific” transliteration for Old (Church) Slavonic / Glagolitic alphabet\eTC\eTR \bTR\bTC \type{ru_cz} \eTC\bTC Czech transcription for Russian\eTC\eTR \bTR\bTC \type{ocs_cz} \eTC\bTC Czech transcription for Old (Church) Slavonic\eTC\eTR \bTR\bTC \type{gr} \eTC\bTC transliteration for Greek \eTC\eTR @@ -249,7 +282,7 @@ By the time of this writing this can be one of the following set: {\em Nota bene}: The description at this point only serves as a placeholder as the -transliteration modes are discussed in detail in later 
in this document. +transliteration modes are discussed in detail later in this document. Through the \type{hyphenate} argument it is possible to adjust the language that is used for hyphenation. @@ -258,14 +291,15 @@ part of the document be processed according to dutch rules, leaving the overall \type{\language[#1]} configuration unchanged for the rest of the content. The actual transliteration is done using the macro -\type{\transliterate[#1]{#2}}. +\type{\transliterate[#1]} \type{{#2}}. The second argument takes the raw string in the original language that we want to process, while the first, optional argument accepts local adjustments for \type{mode} and \type{hyphenate}. Thus, we would typeset one of Epicuros' sayings like this: {\setuptolerance[verytolerant] \starttyping -\transliterate[mode=gr]{κακὸν ἀνάγκη, ἀλλ' οὐδεμία ἀνάγκη ζῆν μετὰ ἀνάγκης} +\transliterate[mode=gr]{κακὸν ἀνάγκη, ἀλλ' οὐδεμία ἀνάγκη ζῆν + μετὰ ἀνάγκης} \stoptyping which yields \quotation{\transliterate[mode=gr]{κακὸν ἀνάγκη, ἀλλ' οὐδεμία ἀνάγκη ζῆν μετὰ ἀνάγκης}} in the pdf output. @@ -279,6 +313,7 @@ which yields \quotation{\transliterate[mode=gr]{κακὸν ἀνάγκη, ἀλ \blank[medium] {\sc Graham Chapman} } +\blank[2*big] \noindentation At the first glance, {\em transliteration} -- the accurate representation of letters from one alphabet in another -- seems obsolete after the advent of Unicode @@ -289,7 +324,7 @@ e.~g. some scholarly habits might prescribe it in the main text with citations i footnotes left in the original alphabet; or transliteration might alleviate comparison within one language that happens to be written in different scripts; finally, including text in a foreign script might be impossible if there is no -appropriate that fits the main text. +appropriate font which fits the main text. 
However, it is still most convenient for the writer to keep the untransliterated original in the document source as this allows for reusing it in another context where different transliterations rules might apply. @@ -299,7 +334,7 @@ source and a transliteration only in the final document. Another way of handling foreign languages is {\em transcription}. It is aims at producing some representation that does not rely on symbolisms alien to the language and thus to be at least \quotation{pronouncable} -without further knowledge. +without further know\-ledge. As transcription methods are language specific and highly idiosyncratic they complicate the restoration of the original phrase and information may be lost. The Transliterator provides means of transcription as well but in most cases @@ -316,13 +351,13 @@ its original form which was impossible with previous versions of ISO~9 because they contained several exceptions depending on the original language. Although fifteen years old it has not yet made its way into scholarly publications at large so it might not immediately look familiar. -The diacritics are not identical to the so-called \quotation{scientific} -transliteration used in Slavic studies but as long as your journal does not +The diacritics are not identical to the \quotation{scientific} +transliteration used in Slavic studies but as long as your editor does not enforce its traditional method you should always prefer ISO~9 (\type{[mode=ru]}, \type{[mode=ru_old]}, \type{[mode=all]}). But ISO~9, too, has its shortcomings. -It has no definitions for historical form of the cyrillic script like +It has no definitions for historical forms of the cyrillic script like pre-XVIII-century Russian and Old (Church) Slavonic while those are covered by the scholarly transliterations. To amend the situation the Transliterator provides an extension to ISO~9 for @@ -348,9 +383,9 @@ end \stopluacode \ taken from the scientific transliteration (\type{[mode=iso9_ocs]}). 
If you prefer more coherency you might want to use pure \quotation{scientific} -tranliteration (\type{[mode=ocs]}). +transliteration (\type{[mode=ocs]}). This method is complemented by \type{[mode=ocs_gla]}, the only option the -Transliterator offers for the Glagolitic alphabet; they can be use consistently +Transliterator offers for the Glagolitic alphabet; they can be used consistently along each other as they were taken from the same book.\footnote{\cite[authoryear][aks] p.~77 \cite[url][aks].} @@ -372,10 +407,74 @@ hyphenation.\footnote{% or locally because the default hyphenation is {\em not} the same as your documents'. } -However, as their is no hyphenation pattern I know of that closely resembles the +However, as there is no hyphenation pattern I know of that closely resembles the transliteration of Greek you might have to resort to putting \type{\discretionary} hyphens when line breaking does not satisfy. +To conclude this, let me have a word on the way the Transliterator works. +Basically, it is a bunch of dictionaries containing substitution rules for +elements that may occur in the text. +These elements may be single characters or strings of more than one character. +As there is no simple way to impose order onto those dictionaries the rules for +one transliteration method are, if needed, distributed over more than one table +which will be applied successively to ensure that multi-character rules +are processed first. 
+ +\setupfloats[spacebefore=small,spaceafter=small] +\placetable[left][none]{Processing speed according to GNU time(1) and the \CONTEXT\ stats.}{ + \setupTABLE[c][each] [frame=off] + \setupTABLE[r][first] [style=bold,topframe=on,bottomframe=on] + \setupTABLE[r][each] [frame=off,topframe=off,bottomframe=off] + \setupTABLE[r][last] [frame=off,topframe=off,bottomframe=on] + \setupTABLE[c][each] [align=middle] + \setupTABLE[c][first] [align=left] + \setupTABLE[c][2] [alignmentcharacter={.},aligncharacter=yes,align=middle] + \setupTABLE[c][3] [alignmentcharacter={.},aligncharacter=yes,align=middle] + \bTABLE[split=no,stretch=yes] + \bTABLEhead + \bTR + \bTH mode \eTH\bTH gnu \eTH\bTH \CONTEXT \eTH + + \eTR + \eTABLEhead + \bTABLEbody + \bTR + \bTC \eTC\bTC 8.89 \eTC\bTC 8.75 \eTC + \eTR\bTR + \bTC \type{ru} \eTC\bTC 9.66 \eTC\bTC 9.45 \eTC + \eTR\bTR + \bTC \type{all} \eTC\bTC 12.18 \eTC\bTC 12.04\eTC + \eTR\bTR + \bTC \type{ru_cz} \eTC\bTC 10.17 \eTC\bTC 10.02 \eTC + \eTR\bTR + \bTC \type{ru_transcript_en} \eTC\bTC 11.24 \eTC\bTC 11.11 \eTC + \eTR\bTR + \bTC \type{ru_transcript_de} \eTC\bTC 56.58 \eTC\bTC 56.42 \eTC + \eTR + \eTABLEbody + \eTABLE +} +\setuptolerance[tolerant] +The transliteration itself is, admittedly, extremely inefficient as it uses +global substitution iteratively on the whole string for every rule in the +dictionary. +(Maybe this could be replaced by a faster implementation using look ahead that +goes through the string only once, but for now it'll stay as it is until I find +time to care for speed.) +In ordinary use when transliterating single words or short phrases only the +Transliterator should have little impact on document processing time at large, +with the exception of the German transcription mode, perhaps. 
+For the sake of completeness, here are some numbers: +Transliterating (and typesetting in MKIV) \transliterate{Александр Пушкин}'s verse novel +\transliterate{Евгений Онегин}, a corpus of about 27000 words, took only +9.7~seconds in \type{[mode=ru]}, compared to 8.9~seconds without +transliteration.\footnote{% + On an IBM T43: \tt 2.6.32-ARCH \#1 SMP PREEMPT Tue Feb 9 14:46:08 UTC 2010 i686 + Intel(R) Pentium(R) M processor 1.60GHz GenuineIntel GNU/Linux. +} + + + \chapter[ex]{Examples} \section{Cyrillic scripts} @@ -386,7 +485,7 @@ or contain ISO~9 as a subset (\type{iso9_ocs}).\footnote{% excuse the inadequate hyphenation in these cases.% } -\trlex{ru}{ru}{cz}{computer-modern-unicode}{% +\trlex{ru}{ru}{cs}{computer-modern-unicode}{% Transliteration rules for the contemporary russian alphabet.% }{% В~ворота гостиницы губернского города NN въехала довольно красивая рессорная @@ -398,7 +497,7 @@ or contain ISO~9 as a subset (\type{iso9_ocs}).\footnote{% слишком молод. } -\trlex{ru_old}{ru}{cz}{computer-modern-unicode}{% +\trlex{ru_old}{ru}{cs}{computer-modern-unicode}{% With aditional characters for pre-1981 Russian orthography (100~per cent ISO~9).% }{% А~сведется віра, убьютъ сотцкого в~селѣ, ино тебѣ взяти полтіна, а~не @@ -414,29 +513,29 @@ or contain ISO~9 as a subset (\type{iso9_ocs}).\footnote{% с~волостмі. } -\trlex{all}{uk}{cz}{computer-modern-unicode}{% +\trlex{all}{ru}{cs}{computer-modern-unicode}{% The complete cyrillic mapping from ISO~9; transliterating Belarusian.% }{% - Беларуская мова, мова беларусаў, уваход\-зіць у~сям’ю індаеўрапейскіх моў, яе + Беларуская мова, мова беларусаў, уваходзіць у~сям’ю індаеўрапейскіх моў, яе славянскай групы і~ўсходнеславянскіх моваў падгрупы, на якой размаўляюць у~Беларусі і~па ўсім свеце, галоўным чынам у~Расіі, Украіне, Польшчы. - Б.~м. пад\-зяляе шмат граматычных і~лексічных уласцівасцяў з~іншымі + Б.~м. падзяляе шмат граматычных і~лексічных уласцівасцяў з~іншымі ўсходнеславянскімі мовамі (гл. 
таксама: Іншыя назвы беларускай мовы і~Узаемныя ўплывы усходнеславянскіх моваў). } -\trlex{all}{uk}{cz}{computer-modern-unicode}{% +\trlex{all}{uk}{cs}{computer-modern-unicode}{% The complete cyrillic mapping from ISO~9; transliterating Ukrainian.% }{% Украї́нська мова (застарілі назви -- руська мова, проста мова […]) -- - слов'янсь\-ка мова, державна в~Україні та одна з~трьох «офіційних мов на рівних - засадах» у~неви\-знаній Придністровській Молдавсь\-кій Республіці. - За різними оцінками загалом у~світі українською мовою гово\-рить від 41~млн. - до 45~млн. осіб, вона входить до третього десятка найпоши\-ре\-ні\-ших мов + слов'янська мова, державна в~Україні та одна з~трьох «офіційних мов на рівних + засадах» у~не\-ви\-зна\-ній Придністровській Молдавській Республіці. + За різними оцінками загалом у~світі українською мовою говорить від 41~млн. + до 45~млн. осіб, вона входить до третього десятка найпоширеніших мов світу. } -\trlex{all}{ru}{cz}{computer-modern-unicode}{% +\trlex{all}{ru}{cs}{computer-modern-unicode}{% The complete cyrillic mapping from ISO~9; transliterating Serbian.% }{% Српски језик је један од словенских језика из породице индоевропских језика. @@ -447,7 +546,7 @@ or contain ISO~9 as a subset (\type{iso9_ocs}).\footnote{% Срби, међу осталима и~у~Хрватској. } -\trlex{iso9_ocs}{ru}{cz}{cyrilice}{% +\trlex{iso9_ocs}{ru}{cs}{cyrilice}{% Transliteration rules according to ISO~9 with additions for Old (Church) Slavonic.% }{% @@ -507,12 +606,10 @@ different scripts; they are not, however, as easily reversible as ISO~9. At the moment there are tables for old school transcription into three languages: English (via \type{ru_transcript_en}), German (\type{ru_transcript_de}) and Czech (\type{ocs_cz}). 
-Only one of them (Czech) is recommendable as the others are to a~large extent -irreversible and lack efficiency; -at least the German one is almost unreadable if used with +At least the German one is almost unreadable if used with strings longer than two words. -As we have the bijective ISO~9 mapping at hand there should be reason at all to -use any of them unless when threatened by ignorants. +As we have the bijective ISO~9 mapping at hand there should be no reason at all to +use any of them unless physically threatened by barbarians. \trlex{ru_transcript_en}{ru}{en}{computer-modern-unicode}{% English transcription for contemporary Russian.% @@ -573,7 +670,7 @@ use any of them unless when threatened by ignorants. The Transliterator offers two modes for handling Greek: \type{gr} and \type{gr_en}. They differ only on one aspect. -\type{gr} basically transliterates the canonical Greek alphabet as well as the +\type{gr} transliterates the canonical Greek alphabet as well as the special glyphs Digamma, Quoppa and Sampi. \type{gr_n} behaves exactly the same way except that nasalization is observed such that \type{γ+[γ|κ]} yields \type{n+[g|k]}. -- cgit v1.2.3