This source file includes the following definitions.
- _zip_guess_encoding
- _zip_unicode_to_utf8_len
- _zip_unicode_to_utf8
- _zip_cp437_to_utf8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35 #include "zipint.h"
36
37 #include <stdlib.h>
38
39
40 static const zip_uint16_t _cp437_to_unicode[256] = {
41
42 0x2007, 0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022,
43 0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C,
44
45
46 0x25BA, 0x25C4, 0x2195, 0x203C, 0x00B6, 0x00A7, 0x25AC, 0x21A8,
47 0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC,
48
49
50 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
51 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
52
53
54 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
55 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
56
57
58 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
59 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
60
61
62 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
63 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
64
65
66 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
67 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
68
69
70 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
71 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x2302,
72
73
74 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
75 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
76
77
78 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
79 0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192,
80
81
82 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
83 0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
84
85
86 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
87 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
88
89
90 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F,
91 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
92
93
94 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B,
95 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
96
97
98 0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4,
99 0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x03B5, 0x2229,
100
101
102 0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248,
103 0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0
104 };
105
/*
 * Bit patterns used to classify UTF-8 bytes: a byte belongs to a class when
 * (byte & <CLASS>_MASK) == <CLASS>_MATCH.
 */

/* Lead byte of a 2-byte sequence: 110xxxxx */
#define UTF_8_LEN_2_MASK     0xE0
#define UTF_8_LEN_2_MATCH    0xC0

/* Lead byte of a 3-byte sequence: 1110xxxx */
#define UTF_8_LEN_3_MASK     0xF0
#define UTF_8_LEN_3_MATCH    0xE0

/* Lead byte of a 4-byte sequence: 11110xxx */
#define UTF_8_LEN_4_MASK     0xF8
#define UTF_8_LEN_4_MATCH    0xF0

/* Continuation byte: 10xxxxxx */
#define UTF_8_CONTINUE_MASK  0xC0
#define UTF_8_CONTINUE_MATCH 0x80
114
115
116 zip_encoding_type_t
117 _zip_guess_encoding(zip_string_t *str, zip_encoding_type_t expected_encoding)
118 {
119 zip_encoding_type_t enc;
120 const zip_uint8_t *name;
121 zip_uint32_t i, j, ulen;
122
123 if (str == NULL)
124 return ZIP_ENCODING_ASCII;
125
126 name = str->raw;
127
128 if (str->encoding != ZIP_ENCODING_UNKNOWN)
129 enc = str->encoding;
130 else {
131 enc = ZIP_ENCODING_ASCII;
132 for (i=0; i<str->length; i++) {
133 if ((name[i] > 31 && name[i] < 128) || name[i] == '\r' || name[i] == '\n' || name[i] == '\t')
134 continue;
135
136 enc = ZIP_ENCODING_UTF8_GUESSED;
137 if ((name[i] & UTF_8_LEN_2_MASK) == UTF_8_LEN_2_MATCH)
138 ulen = 1;
139 else if ((name[i] & UTF_8_LEN_3_MASK) == UTF_8_LEN_3_MATCH)
140 ulen = 2;
141 else if ((name[i] & UTF_8_LEN_4_MASK) == UTF_8_LEN_4_MATCH)
142 ulen = 3;
143 else {
144 enc = ZIP_ENCODING_CP437;
145 break;
146 }
147
148 if (i + ulen >= str->length) {
149 enc = ZIP_ENCODING_CP437;
150 break;
151 }
152
153 for (j=1; j<=ulen; j++) {
154 if ((name[i+j] & UTF_8_CONTINUE_MASK) != UTF_8_CONTINUE_MATCH) {
155 enc = ZIP_ENCODING_CP437;
156 goto done;
157 }
158 }
159 i += ulen;
160 }
161 }
162
163 done:
164 str->encoding = enc;
165
166 if (expected_encoding != ZIP_ENCODING_UNKNOWN) {
167 if (expected_encoding == ZIP_ENCODING_UTF8_KNOWN && enc == ZIP_ENCODING_UTF8_GUESSED)
168 str->encoding = enc = ZIP_ENCODING_UTF8_KNOWN;
169
170 if (expected_encoding != enc && enc != ZIP_ENCODING_ASCII)
171 return ZIP_ENCODING_ERROR;
172 }
173
174 return enc;
175 }
176
177
178 static zip_uint32_t
179 _zip_unicode_to_utf8_len(zip_uint32_t codepoint)
180 {
181 if (codepoint < 0x0080)
182 return 1;
183 if (codepoint < 0x0800)
184 return 2;
185 if (codepoint < 0x10000)
186 return 3;
187 return 4;
188 }
189
190
191 static zip_uint32_t
192 _zip_unicode_to_utf8(zip_uint32_t codepoint, zip_uint8_t *buf)
193 {
194 if (codepoint < 0x0080) {
195 buf[0] = codepoint & 0xff;
196 return 1;
197 }
198 if (codepoint < 0x0800) {
199 buf[0] = (zip_uint8_t)(UTF_8_LEN_2_MATCH | ((codepoint >> 6) & 0x1f));
200 buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
201 return 2;
202 }
203 if (codepoint < 0x10000) {
204 buf[0] = (zip_uint8_t)(UTF_8_LEN_3_MATCH | ((codepoint >> 12) & 0x0f));
205 buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f));
206 buf[2] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
207 return 3;
208 }
209 buf[0] = (zip_uint8_t)(UTF_8_LEN_4_MATCH | ((codepoint >> 18) & 0x07));
210 buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 12) & 0x3f));
211 buf[2] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f));
212 buf[3] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
213 return 4;
214 }
215
216
217 zip_uint8_t *
218 _zip_cp437_to_utf8(const zip_uint8_t * const _cp437buf, zip_uint32_t len,
219 zip_uint32_t *utf8_lenp, zip_error_t *error)
220 {
221 zip_uint8_t *cp437buf = (zip_uint8_t *)_cp437buf;
222 zip_uint8_t *utf8buf;
223 zip_uint32_t buflen, i, offset;
224
225 if (len == 0) {
226 if (utf8_lenp)
227 *utf8_lenp = 0;
228 return NULL;
229 }
230
231 buflen = 1;
232 for (i=0; i<len; i++)
233 buflen += _zip_unicode_to_utf8_len(_cp437_to_unicode[cp437buf[i]]);
234
235 if ((utf8buf=(zip_uint8_t*)malloc(buflen)) == NULL) {
236 zip_error_set(error, ZIP_ER_MEMORY, 0);
237 return NULL;
238 }
239
240 offset = 0;
241 for (i=0; i<len; i++)
242 offset += _zip_unicode_to_utf8(_cp437_to_unicode[cp437buf[i]],
243 utf8buf+offset);
244
245 utf8buf[buflen-1] = 0;
246 if (utf8_lenp)
247 *utf8_lenp = buflen-1;
248 return utf8buf;
249 }