root/ext/zip/lib/zip_utf-8.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. _zip_guess_encoding
  2. _zip_unicode_to_utf8_len
  3. _zip_unicode_to_utf8
  4. _zip_cp437_to_utf8

   1 /*
   2   zip_utf-8.c -- UTF-8 support functions for libzip
   3   Copyright (C) 2011-2014 Dieter Baron and Thomas Klausner
   4 
   5   This file is part of libzip, a library to manipulate ZIP archives.
   6   The authors can be contacted at <libzip@nih.at>
   7 
   8   Redistribution and use in source and binary forms, with or without
   9   modification, are permitted provided that the following conditions
  10   are met:
  11   1. Redistributions of source code must retain the above copyright
  12      notice, this list of conditions and the following disclaimer.
  13   2. Redistributions in binary form must reproduce the above copyright
  14      notice, this list of conditions and the following disclaimer in
  15      the documentation and/or other materials provided with the
  16      distribution.
  17   3. The names of the authors may not be used to endorse or promote
  18      products derived from this software without specific prior
  19      written permission.
  20  
  21   THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS
  22   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  23   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24   ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
  25   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  26   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
  27   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  28   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
  29   IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  30   OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
  31   IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32 */
  33 
  34 
  35 #include "zipint.h"
  36 
  37 #include <stdlib.h>
  38 
  39 
/*
 * Lookup table mapping each CP437 (IBM PC code page 437) byte value to a
 * Unicode code point.  The table is indexed directly by the byte value
 * (0x00 - 0xFF) and is consumed by _zip_cp437_to_utf8().
 *
 * Indices 0x20 - 0x7E map to themselves (printable ASCII); the remaining
 * entries map to the graphical symbols, accented letters, box-drawing
 * and mathematical characters listed below.
 *
 * NOTE(review): index 0x00 maps to 0x2007 rather than 0x0000 -- presumably
 * so that a NUL input byte can never produce an embedded terminator in the
 * converted string; confirm before changing.
 */
static const zip_uint16_t _cp437_to_unicode[256] = {
    /* 0x00 - 0x0F */
    0x2007, 0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022,
    0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C,

    /* 0x10 - 0x1F */
    0x25BA, 0x25C4, 0x2195, 0x203C, 0x00B6, 0x00A7, 0x25AC, 0x21A8,
    0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC,

    /* 0x20 - 0x2F */
    0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
    0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,

    /* 0x30 - 0x3F */
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,

    /* 0x40 - 0x4F */
    0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,

    /* 0x50 - 0x5F */
    0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
    0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,

    /* 0x60 - 0x6F */
    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,

    /* 0x70 - 0x7F */
    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
    0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x2302,

    /* 0x80 - 0x8F */
    0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
    0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,

    /* 0x90 - 0x9F */
    0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
    0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192,

    /* 0xA0 - 0xAF */
    0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
    0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,

    /* 0xB0 - 0xBF */
    0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
    0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,

    /* 0xC0 - 0xCF */
    0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F,
    0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,

    /* 0xD0 - 0xDF */
    0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B,
    0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,

    /* 0xE0 - 0xEF */
    0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4,
    0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x03B5, 0x2229,

    /* 0xF0 - 0xFF */
    0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248,
    0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0
};
 105 
/*
 * Bit patterns for classifying UTF-8 bytes.  A byte b belongs to a class
 * when (b & <class>_MASK) == <class>_MATCH.  The *_MATCH values are also
 * the fixed high bits OR-ed into the bytes written by
 * _zip_unicode_to_utf8().
 */
#define UTF_8_LEN_2_MASK     0xe0  /* lead byte of a 2-byte sequence: 110xxxxx */
#define UTF_8_LEN_2_MATCH    0xc0
#define UTF_8_LEN_3_MASK     0xf0  /* lead byte of a 3-byte sequence: 1110xxxx */
#define UTF_8_LEN_3_MATCH    0xe0
#define UTF_8_LEN_4_MASK     0xf8  /* lead byte of a 4-byte sequence: 11110xxx */
#define UTF_8_LEN_4_MATCH    0xf0
#define UTF_8_CONTINUE_MASK  0xc0  /* continuation byte: 10xxxxxx */
#define UTF_8_CONTINUE_MATCH 0x80
 114 
 115 
 116 zip_encoding_type_t
 117 _zip_guess_encoding(zip_string_t *str, zip_encoding_type_t expected_encoding)
 118 {
 119     zip_encoding_type_t enc;
 120     const zip_uint8_t *name;
 121     zip_uint32_t i, j, ulen;
 122 
 123     if (str == NULL)
 124         return ZIP_ENCODING_ASCII;
 125 
 126     name = str->raw;
 127 
 128     if (str->encoding != ZIP_ENCODING_UNKNOWN)
 129         enc = str->encoding;
 130     else {
 131         enc = ZIP_ENCODING_ASCII;
 132         for (i=0; i<str->length; i++) {
 133             if ((name[i] > 31 && name[i] < 128) || name[i] == '\r' || name[i] == '\n' || name[i] == '\t')
 134                 continue;
 135 
 136             enc = ZIP_ENCODING_UTF8_GUESSED;
 137             if ((name[i] & UTF_8_LEN_2_MASK) == UTF_8_LEN_2_MATCH)
 138                 ulen = 1;
 139             else if ((name[i] & UTF_8_LEN_3_MASK) == UTF_8_LEN_3_MATCH)
 140                 ulen = 2;
 141             else if ((name[i] & UTF_8_LEN_4_MASK) == UTF_8_LEN_4_MATCH)
 142                 ulen = 3;
 143             else {
 144                 enc = ZIP_ENCODING_CP437;
 145                 break;
 146             }
 147 
 148             if (i + ulen >= str->length) {
 149                 enc = ZIP_ENCODING_CP437;
 150                 break;
 151             }
 152 
 153             for (j=1; j<=ulen; j++) {
 154                 if ((name[i+j] & UTF_8_CONTINUE_MASK) != UTF_8_CONTINUE_MATCH) {
 155                     enc = ZIP_ENCODING_CP437;
 156                     goto done;
 157                 }
 158             }
 159             i += ulen;
 160         }
 161     }
 162 
 163 done:
 164     str->encoding = enc;
 165 
 166     if (expected_encoding != ZIP_ENCODING_UNKNOWN) {
 167         if (expected_encoding == ZIP_ENCODING_UTF8_KNOWN && enc == ZIP_ENCODING_UTF8_GUESSED)
 168             str->encoding = enc = ZIP_ENCODING_UTF8_KNOWN;
 169 
 170         if (expected_encoding != enc && enc != ZIP_ENCODING_ASCII)
 171             return ZIP_ENCODING_ERROR;
 172     }
 173     
 174     return enc;
 175 }
 176 
 177 
 178 static zip_uint32_t
 179 _zip_unicode_to_utf8_len(zip_uint32_t codepoint)
 180 {
 181     if (codepoint < 0x0080)
 182         return 1;
 183     if (codepoint < 0x0800)
 184         return 2;
 185     if (codepoint < 0x10000)
 186         return 3;
 187     return 4;
 188 }
 189 
 190 
 191 static zip_uint32_t
 192 _zip_unicode_to_utf8(zip_uint32_t codepoint, zip_uint8_t *buf)
 193 {
 194     if (codepoint < 0x0080) {
 195         buf[0] = codepoint & 0xff;
 196         return 1;
 197     }
 198     if (codepoint < 0x0800) {
 199         buf[0] = (zip_uint8_t)(UTF_8_LEN_2_MATCH | ((codepoint >> 6) & 0x1f));
 200         buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
 201         return 2;
 202     }
 203     if (codepoint < 0x10000) {
 204         buf[0] = (zip_uint8_t)(UTF_8_LEN_3_MATCH | ((codepoint >> 12) & 0x0f));
 205         buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f));
 206         buf[2] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
 207         return 3;
 208     }
 209     buf[0] = (zip_uint8_t)(UTF_8_LEN_4_MATCH | ((codepoint >> 18) & 0x07));
 210     buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 12) & 0x3f));
 211     buf[2] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f));
 212     buf[3] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
 213     return 4;
 214 }
 215 
 216 
 217 zip_uint8_t *
 218 _zip_cp437_to_utf8(const zip_uint8_t * const _cp437buf, zip_uint32_t len,
 219                    zip_uint32_t *utf8_lenp, zip_error_t *error)
 220 {
 221     zip_uint8_t *cp437buf = (zip_uint8_t *)_cp437buf;
 222     zip_uint8_t *utf8buf;
 223     zip_uint32_t buflen, i, offset;
 224 
 225     if (len == 0) {
 226         if (utf8_lenp)
 227             *utf8_lenp = 0;
 228         return NULL;
 229     }
 230 
 231     buflen = 1;
 232     for (i=0; i<len; i++)
 233         buflen += _zip_unicode_to_utf8_len(_cp437_to_unicode[cp437buf[i]]);
 234 
 235     if ((utf8buf=(zip_uint8_t*)malloc(buflen)) == NULL) {
 236         zip_error_set(error, ZIP_ER_MEMORY, 0);
 237         return NULL;
 238     }
 239 
 240     offset = 0;
 241     for (i=0; i<len; i++)
 242         offset += _zip_unicode_to_utf8(_cp437_to_unicode[cp437buf[i]],
 243                                        utf8buf+offset);
 244 
 245     utf8buf[buflen-1] = 0;
 246     if (utf8_lenp)
 247         *utf8_lenp = buflen-1;
 248     return utf8buf;
 249 }

/* [<][>][^][v][top][bottom][index][help] */