summaryrefslogtreecommitdiff
path: root/tex/context/base/l-unicode.lua
blob: d0c05bb8629ea8cc6598f7e9dbd831847e395271 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
-- filename : l-unicode.lua
-- comment  : split off from luat-inp
-- author   : Hans Hagen, PRAGMA-ADE, Hasselt NL
-- copyright: PRAGMA ADE / ConTeXt Development Team
-- license  : see context related readme files

-- Register this module's version and make sure the shared 'unicode' namespace
-- table exists; tables that are already present are kept as they are.
versions = versions or { }
versions['l-unicode'] = 1.001

unicode = unicode or { }

-- Fallback for when no 'garbagecollector' table has been set up yet:
-- push stops the Lua garbage collector and pop restarts it.
do
    local function stop_collector()
        collectgarbage("stop")
    end

    local function restart_collector()
        collectgarbage("restart")
    end

    garbagecollector = garbagecollector or {
        push = stop_collector,
        pop  = restart_collector,
    }
end

-- Encoding names, indexed by encoding code. Each code is listed below with the
-- byte order mark (in hex) that identifies the encoding:
--
--   code  byte order mark  encoding
--   0     EF BB BF         UTF-8
--   1     FF FE            UTF-16, little endian
--   2     FE FF            UTF-16, big endian
--   3     FF FE 00 00      UTF-32, little endian
--   4     00 00 FE FF      UTF-32, big endian

unicode.utfname = {
    [0] = 'utf-8',
    'utf-16-le', -- index 1
    'utf-16-be', -- index 2
    'utf-32-le', -- index 3
    'utf-32-be', -- index 4
}

-- Determines the encoding of an open file from its byte order mark (BOM) and
-- leaves the read position just behind the BOM, or at the start of the file
-- when no BOM is recognized.
--
-- f       : file handle (anything that provides f:read(n) and f:seek(whence,offset))
-- returns : 0 for utf-8 (also used when the file is empty or has no BOM),
--           1 for utf-16-le, 2 for utf-16-be, 3 for utf-32-le, 4 for utf-32-be
--
-- The leading bytes are compared numerically instead of with string patterns:
-- a literal \000 cannot be used in a Lua 5.1 pattern and the '%z' class that
-- works around this is deprecated from Lua 5.2 on.
function unicode.utftype(f)
    local str = f:read(4)
    if not str then
        -- nothing could be read (empty file): rewind and assume utf-8
        f:seek('set')
        return 0
    end
    -- bytes that are missing because the file is shorter than 4 bytes are nil
    local b1, b2, b3, b4 = str:byte(1, 4)
    if b1 == 0 and b2 == 0 and b3 == 254 and b4 == 255 then
        -- 00 00 FE FF: the four bytes already read are exactly the BOM
        return 4
    elseif b1 == 255 and b2 == 254 and b3 == 0 and b4 == 0 then
        -- FF FE 00 00: must be tested before the shorter utf-16-le mark
        return 3
    elseif b1 == 254 and b2 == 255 then
        f:seek('set',2)
        return 2
    elseif b1 == 255 and b2 == 254 then
        f:seek('set',2)
        return 1
    elseif b1 == 239 and b2 == 187 and b3 == 191 then
        f:seek('set',3)
        return 0
    else
        -- no BOM: rewind so that no content is lost and assume utf-8
        f:seek('set')
        return 0
    end
end

-- Converts UTF-16 encoded data into a list of UTF-8 encoded lines.
--
-- str     : string holding the raw UTF-16 bytes; a byte order mark is not
--           treated specially here
-- endian  : true value  -> big endian (first byte of a pair is the high byte),
--           false / nil -> little endian
-- returns : array of strings, one entry per line and without line endings;
--           lf, cr and crlf each terminate exactly one line, and a last line
--           without terminator is only added when it is not empty
--
-- The garbage collector is paused (garbagecollector.push/pop) while the lines
-- are assembled. Depends on str:bytepairs() and unicode.utf8.char, which are
-- not defined in this function.
function unicode.utf16_to_utf8(str, endian)
    garbagecollector.push()
    local result = { }
    local tc, uc = table.concat, unicode.utf8.char
    -- tmp: characters of the current line, n: current code point,
    -- m: pending high surrogate (0 when none), p: 13 right after a cr, else 0
    local tmp, n, m, p = { }, 0, 0, 0
    -- lf | cr | crlf / (cr:13, lf:10)
    local function doit()
        if n == 10 then
            if p ~= 13 then
                result[#result+1] = tc(tmp,"")
                tmp = { }
            end
            -- always forget the cr: an lf directly after a cr only completes
            -- the crlf, but the next lf has to start a new (empty) line
            p = 0
        elseif n == 13 then
            result[#result+1] = tc(tmp,"")
            tmp = { }
            p = n
        else
            tmp[#tmp+1] = uc(n)
            p = 0
        end
    end
    for l,r in str:bytepairs() do
        if not r then
            -- odd number of bytes: presumably bytepairs hands over a nil second
            -- byte here (the utf-32 converter guards against the same case);
            -- ignore the dangling byte instead of failing on nil arithmetic
            break
        end
        if endian then
            n = l*256 + r
        else
            n = r*256 + l
        end
        if m > 0 then
            -- second half of a surrogate pair; NOTE: it is not verified that
            -- this unit really is a low surrogate (0xDC00-0xDFFF)
            n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000
            m = 0
            doit()
        elseif n >= 0xD800 and n <= 0xDBFF then
            -- high surrogate: remember it and wait for the next unit
            m = n
        else
            doit()
        end
    end
    if #tmp > 0 then
        result[#result+1] = tc(tmp,"")
    end
    garbagecollector.pop()
    return result
end

-- Converts UTF-32 encoded data into a list of UTF-8 encoded lines.
--
-- str     : string holding the raw UTF-32 bytes; a byte order mark is not
--           treated specially here
-- endian  : true value  -> big endian (most significant byte first),
--           false / nil -> little endian
-- returns : array of strings, one entry per line and without line endings;
--           lf, cr and crlf each terminate exactly one line, and a last line
--           without terminator is only added when it is not empty
--
-- The garbage collector is paused (garbagecollector.push/pop) while the lines
-- are assembled. Depends on str:bytepairs() and unicode.utf8.char, which are
-- not defined in this function.
function unicode.utf32_to_utf8(str, endian)
    garbagecollector.push()
    local result = { }
    local tc, uc = table.concat, unicode.utf8.char
    -- tmp: characters of the current line, n: current code point,
    -- m: value of the first byte pair of a character (-1 when none is pending),
    -- p: 13 right after a cr, else 0
    local tmp, n, m, p = { }, 0, -1, 0
    -- lf | cr | crlf / (cr:13, lf:10)
    local function doit()
        if n == 10 then
            if p ~= 13 then
                result[#result+1] = tc(tmp,"")
                tmp = { }
            end
            -- always forget the cr: an lf directly after a cr only completes
            -- the crlf, but the next lf has to start a new (empty) line
            p = 0
        elseif n == 13 then
            result[#result+1] = tc(tmp,"")
            tmp = { }
            p = n
        else
            tmp[#tmp+1] = uc(n)
            p = 0
        end
    end
    -- every character takes two iterations: the first byte pair is kept in m,
    -- the second pair completes the code point
    for a,b in str:bytepairs() do
        if a and b then
            if m < 0 then
                if endian then
                    -- big endian: the first pair holds the two high bytes
                    m = a*256*256*256 + b*256*256
                else
                    -- little endian: the first pair holds the two low bytes
                    m = b*256 + a
                end
            else
                if endian then
                    n = m + a*256 + b
                else
                    n = m + b*256*256*256 + a*256*256
                end
                m = -1
                doit()
            end
        else
            -- incomplete byte pair at the end of the data: stop converting
            break
        end
    end
    if #tmp > 0 then
        result[#result+1] = tc(tmp,"")
    end
    garbagecollector.pop()
    return result
end