linux/fs/udf/unicode.c
<<
>>
Prefs
   1/*
   2 * unicode.c
   3 *
   4 * PURPOSE
   5 *      Routines for converting between UTF-8 and OSTA Compressed Unicode.
   6 *      Also handles filename mangling
   7 *
   8 * DESCRIPTION
   9 *      OSTA Compressed Unicode is explained in the OSTA UDF specification.
  10 *              http://www.osta.org/
   11 *      UTF-8 is explained in the IETF RFC 3629.
   12 *              https://www.rfc-editor.org/rfc/rfc3629.txt
  13 *
  14 * COPYRIGHT
  15 *      This file is distributed under the terms of the GNU General Public
  16 *      License (GPL). Copies of the GPL can be obtained from:
  17 *              ftp://prep.ai.mit.edu/pub/gnu/GPL
  18 *      Each contributing author retains all rights to their own work.
  19 */
  20
  21#include "udfdecl.h"
  22
  23#include <linux/kernel.h>
  24#include <linux/string.h>       /* for memset */
  25#include <linux/nls.h>
  26#include <linux/crc-itu-t.h>
  27#include <linux/slab.h>
  28
  29#include "udf_sb.h"
  30
  31static int udf_translate_to_linux(uint8_t *, int, uint8_t *, int, uint8_t *,
  32                                  int);
  33
  34static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
  35{
  36        if ((!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN - 2))
  37                return 0;
  38
  39        memset(dest, 0, sizeof(struct ustr));
  40        memcpy(dest->u_name, src, strlen);
  41        dest->u_cmpID = 0x08;
  42        dest->u_len = strlen;
  43
  44        return strlen;
  45}
  46
  47/*
  48 * udf_build_ustr
  49 */
  50int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
  51{
  52        int usesize;
  53
  54        if (!dest || !ptr || !size)
  55                return -1;
  56        BUG_ON(size < 2);
  57
  58        usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name));
  59        usesize = min(usesize, size - 2);
  60        dest->u_cmpID = ptr[0];
  61        dest->u_len = usesize;
  62        memcpy(dest->u_name, ptr + 1, usesize);
  63        memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize);
  64
  65        return 0;
  66}
  67
  68/*
  69 * udf_build_ustr_exact
  70 */
  71static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
  72{
  73        if ((!dest) || (!ptr) || (!exactsize))
  74                return -1;
  75
  76        memset(dest, 0, sizeof(struct ustr));
  77        dest->u_cmpID = ptr[0];
  78        dest->u_len = exactsize - 1;
  79        memcpy(dest->u_name, ptr + 1, exactsize - 1);
  80
  81        return 0;
  82}
  83
  84/*
  85 * udf_ocu_to_utf8
  86 *
  87 * PURPOSE
  88 *      Convert OSTA Compressed Unicode to the UTF-8 equivalent.
  89 *
  90 * PRE-CONDITIONS
  91 *      utf                     Pointer to UTF-8 output buffer.
  92 *      ocu                     Pointer to OSTA Compressed Unicode input buffer
  93 *                              of size UDF_NAME_LEN bytes.
  94 *                              both of type "struct ustr *"
  95 *
  96 * POST-CONDITIONS
  97 *      <return>                Zero on success.
  98 *
  99 * HISTORY
 100 *      November 12, 1997 - Andrew E. Mileski
 101 *      Written, tested, and released.
 102 */
 103int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
 104{
 105        const uint8_t *ocu;
 106        uint8_t cmp_id, ocu_len;
 107        int i;
 108
 109        ocu_len = ocu_i->u_len;
 110        if (ocu_len == 0) {
 111                memset(utf_o, 0, sizeof(struct ustr));
 112                return 0;
 113        }
 114
 115        cmp_id = ocu_i->u_cmpID;
 116        if (cmp_id != 8 && cmp_id != 16) {
 117                memset(utf_o, 0, sizeof(struct ustr));
 118                pr_err("unknown compression code (%d) stri=%s\n",
 119                       cmp_id, ocu_i->u_name);
 120                return 0;
 121        }
 122
 123        ocu = ocu_i->u_name;
 124        utf_o->u_len = 0;
 125        for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
 126
 127                /* Expand OSTA compressed Unicode to Unicode */
 128                uint32_t c = ocu[i++];
 129                if (cmp_id == 16)
 130                        c = (c << 8) | ocu[i++];
 131
 132                /* Compress Unicode to UTF-8 */
 133                if (c < 0x80U)
 134                        utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
 135                else if (c < 0x800U) {
 136                        utf_o->u_name[utf_o->u_len++] =
 137                                                (uint8_t)(0xc0 | (c >> 6));
 138                        utf_o->u_name[utf_o->u_len++] =
 139                                                (uint8_t)(0x80 | (c & 0x3f));
 140                } else {
 141                        utf_o->u_name[utf_o->u_len++] =
 142                                                (uint8_t)(0xe0 | (c >> 12));
 143                        utf_o->u_name[utf_o->u_len++] =
 144                                                (uint8_t)(0x80 |
 145                                                          ((c >> 6) & 0x3f));
 146                        utf_o->u_name[utf_o->u_len++] =
 147                                                (uint8_t)(0x80 | (c & 0x3f));
 148                }
 149        }
 150        utf_o->u_cmpID = 8;
 151
 152        return utf_o->u_len;
 153}
 154
 155/*
 156 *
 157 * udf_utf8_to_ocu
 158 *
 159 * PURPOSE
 160 *      Convert UTF-8 to the OSTA Compressed Unicode equivalent.
 161 *
 162 * DESCRIPTION
 163 *      This routine is only called by udf_lookup().
 164 *
 165 * PRE-CONDITIONS
 166 *      ocu                     Pointer to OSTA Compressed Unicode output
 167 *                              buffer of size UDF_NAME_LEN bytes.
 168 *      utf                     Pointer to UTF-8 input buffer.
 169 *      utf_len                 Length of UTF-8 input buffer in bytes.
 170 *
 171 * POST-CONDITIONS
 172 *      <return>                Zero on success.
 173 *
 174 * HISTORY
 175 *      November 12, 1997 - Andrew E. Mileski
 176 *      Written, tested, and released.
 177 */
 178static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
 179{
 180        unsigned c, i, max_val, utf_char;
 181        int utf_cnt, u_len;
 182
 183        memset(ocu, 0, sizeof(dstring) * length);
 184        ocu[0] = 8;
 185        max_val = 0xffU;
 186
 187try_again:
 188        u_len = 0U;
 189        utf_char = 0U;
 190        utf_cnt = 0U;
 191        for (i = 0U; i < utf->u_len; i++) {
 192                c = (uint8_t)utf->u_name[i];
 193
 194                /* Complete a multi-byte UTF-8 character */
 195                if (utf_cnt) {
 196                        utf_char = (utf_char << 6) | (c & 0x3fU);
 197                        if (--utf_cnt)
 198                                continue;
 199                } else {
 200                        /* Check for a multi-byte UTF-8 character */
 201                        if (c & 0x80U) {
 202                                /* Start a multi-byte UTF-8 character */
 203                                if ((c & 0xe0U) == 0xc0U) {
 204                                        utf_char = c & 0x1fU;
 205                                        utf_cnt = 1;
 206                                } else if ((c & 0xf0U) == 0xe0U) {
 207                                        utf_char = c & 0x0fU;
 208                                        utf_cnt = 2;
 209                                } else if ((c & 0xf8U) == 0xf0U) {
 210                                        utf_char = c & 0x07U;
 211                                        utf_cnt = 3;
 212                                } else if ((c & 0xfcU) == 0xf8U) {
 213                                        utf_char = c & 0x03U;
 214                                        utf_cnt = 4;
 215                                } else if ((c & 0xfeU) == 0xfcU) {
 216                                        utf_char = c & 0x01U;
 217                                        utf_cnt = 5;
 218                                } else {
 219                                        goto error_out;
 220                                }
 221                                continue;
 222                        } else {
 223                                /* Single byte UTF-8 character (most common) */
 224                                utf_char = c;
 225                        }
 226                }
 227
 228                /* Choose no compression if necessary */
 229                if (utf_char > max_val) {
 230                        if (max_val == 0xffU) {
 231                                max_val = 0xffffU;
 232                                ocu[0] = (uint8_t)0x10U;
 233                                goto try_again;
 234                        }
 235                        goto error_out;
 236                }
 237
 238                if (max_val == 0xffffU)
 239                        ocu[++u_len] = (uint8_t)(utf_char >> 8);
 240                ocu[++u_len] = (uint8_t)(utf_char & 0xffU);
 241        }
 242
 243        if (utf_cnt) {
 244error_out:
 245                ocu[++u_len] = '?';
 246                printk(KERN_DEBUG pr_fmt("bad UTF-8 character\n"));
 247        }
 248
 249        ocu[length - 1] = (uint8_t)u_len + 1;
 250
 251        return u_len + 1;
 252}
 253
 254static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
 255                        const struct ustr *ocu_i)
 256{
 257        const uint8_t *ocu;
 258        uint8_t cmp_id, ocu_len;
 259        int i, len;
 260
 261
 262        ocu_len = ocu_i->u_len;
 263        if (ocu_len == 0) {
 264                memset(utf_o, 0, sizeof(struct ustr));
 265                return 0;
 266        }
 267
 268        cmp_id = ocu_i->u_cmpID;
 269        if (cmp_id != 8 && cmp_id != 16) {
 270                memset(utf_o, 0, sizeof(struct ustr));
 271                pr_err("unknown compression code (%d) stri=%s\n",
 272                       cmp_id, ocu_i->u_name);
 273                return 0;
 274        }
 275
 276        ocu = ocu_i->u_name;
 277        utf_o->u_len = 0;
 278        for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
 279                /* Expand OSTA compressed Unicode to Unicode */
 280                uint32_t c = ocu[i++];
 281                if (cmp_id == 16)
 282                        c = (c << 8) | ocu[i++];
 283
 284                len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
 285                                    UDF_NAME_LEN - utf_o->u_len);
 286                /* Valid character? */
 287                if (len >= 0)
 288                        utf_o->u_len += len;
 289                else
 290                        utf_o->u_name[utf_o->u_len++] = '?';
 291        }
 292        utf_o->u_cmpID = 8;
 293
 294        return utf_o->u_len;
 295}
 296
 297static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
 298                        int length)
 299{
 300        int len;
 301        unsigned i, max_val;
 302        uint16_t uni_char;
 303        int u_len;
 304
 305        memset(ocu, 0, sizeof(dstring) * length);
 306        ocu[0] = 8;
 307        max_val = 0xffU;
 308
 309try_again:
 310        u_len = 0U;
 311        for (i = 0U; i < uni->u_len; i++) {
 312                len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
 313                if (!len)
 314                        continue;
 315                /* Invalid character, deal with it */
 316                if (len < 0) {
 317                        len = 1;
 318                        uni_char = '?';
 319                }
 320
 321                if (uni_char > max_val) {
 322                        max_val = 0xffffU;
 323                        ocu[0] = (uint8_t)0x10U;
 324                        goto try_again;
 325                }
 326
 327                if (max_val == 0xffffU)
 328                        ocu[++u_len] = (uint8_t)(uni_char >> 8);
 329                ocu[++u_len] = (uint8_t)(uni_char & 0xffU);
 330                i += len - 1;
 331        }
 332
 333        ocu[length - 1] = (uint8_t)u_len + 1;
 334        return u_len + 1;
 335}
 336
 337int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen,
 338                     uint8_t *dname, int dlen)
 339{
 340        struct ustr *filename, *unifilename;
 341        int len = 0;
 342
 343        filename = kmalloc(sizeof(struct ustr), GFP_NOFS);
 344        if (!filename)
 345                return 0;
 346
 347        unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
 348        if (!unifilename)
 349                goto out1;
 350
 351        if (udf_build_ustr_exact(unifilename, sname, slen))
 352                goto out2;
 353
 354        if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
 355                if (!udf_CS0toUTF8(filename, unifilename)) {
 356                        udf_debug("Failed in udf_get_filename: sname = %s\n",
 357                                  sname);
 358                        goto out2;
 359                }
 360        } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
 361                if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
 362                                  unifilename)) {
 363                        udf_debug("Failed in udf_get_filename: sname = %s\n",
 364                                  sname);
 365                        goto out2;
 366                }
 367        } else
 368                goto out2;
 369
 370        len = udf_translate_to_linux(dname, dlen,
 371                                     filename->u_name, filename->u_len,
 372                                     unifilename->u_name, unifilename->u_len);
 373out2:
 374        kfree(unifilename);
 375out1:
 376        kfree(filename);
 377        return len;
 378}
 379
 380int udf_put_filename(struct super_block *sb, const uint8_t *sname,
 381                     uint8_t *dname, int flen)
 382{
 383        struct ustr unifilename;
 384        int namelen;
 385
 386        if (!udf_char_to_ustr(&unifilename, sname, flen))
 387                return 0;
 388
 389        if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
 390                namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN);
 391                if (!namelen)
 392                        return 0;
 393        } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
 394                namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname,
 395                                        &unifilename, UDF_NAME_LEN);
 396                if (!namelen)
 397                        return 0;
 398        } else
 399                return 0;
 400
 401        return namelen;
 402}
 403
 404#define ILLEGAL_CHAR_MARK       '_'
 405#define EXT_MARK                '.'
 406#define CRC_MARK                '#'
 407#define EXT_SIZE                5
 408/* Number of chars we need to store generated CRC to make filename unique */
 409#define CRC_LEN                 5
 410
 411static int udf_translate_to_linux(uint8_t *newName, int newLen,
 412                                  uint8_t *udfName, int udfLen,
 413                                  uint8_t *fidName, int fidNameLen)
 414{
 415        int index, newIndex = 0, needsCRC = 0;
 416        int extIndex = 0, newExtIndex = 0, hasExt = 0;
 417        unsigned short valueCRC;
 418        uint8_t curr;
 419
 420        if (udfName[0] == '.' &&
 421            (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
 422                needsCRC = 1;
 423                newIndex = udfLen;
 424                memcpy(newName, udfName, udfLen);
 425        } else {
 426                for (index = 0; index < udfLen; index++) {
 427                        curr = udfName[index];
 428                        if (curr == '/' || curr == 0) {
 429                                needsCRC = 1;
 430                                curr = ILLEGAL_CHAR_MARK;
 431                                while (index + 1 < udfLen &&
 432                                                (udfName[index + 1] == '/' ||
 433                                                 udfName[index + 1] == 0))
 434                                        index++;
 435                        }
 436                        if (curr == EXT_MARK &&
 437                                        (udfLen - index - 1) <= EXT_SIZE) {
 438                                if (udfLen == index + 1)
 439                                        hasExt = 0;
 440                                else {
 441                                        hasExt = 1;
 442                                        extIndex = index;
 443                                        newExtIndex = newIndex;
 444                                }
 445                        }
 446                        if (newIndex < newLen)
 447                                newName[newIndex++] = curr;
 448                        else
 449                                needsCRC = 1;
 450                }
 451        }
 452        if (needsCRC) {
 453                uint8_t ext[EXT_SIZE];
 454                int localExtIndex = 0;
 455
 456                if (hasExt) {
 457                        int maxFilenameLen;
 458                        for (index = 0;
 459                             index < EXT_SIZE && extIndex + index + 1 < udfLen;
 460                             index++) {
 461                                curr = udfName[extIndex + index + 1];
 462
 463                                if (curr == '/' || curr == 0) {
 464                                        needsCRC = 1;
 465                                        curr = ILLEGAL_CHAR_MARK;
 466                                        while (extIndex + index + 2 < udfLen &&
 467                                              (index + 1 < EXT_SIZE &&
 468                                                (udfName[extIndex + index + 2] == '/' ||
 469                                                 udfName[extIndex + index + 2] == 0)))
 470                                                index++;
 471                                }
 472                                ext[localExtIndex++] = curr;
 473                        }
 474                        maxFilenameLen = newLen - CRC_LEN - localExtIndex;
 475                        if (newIndex > maxFilenameLen)
 476                                newIndex = maxFilenameLen;
 477                        else
 478                                newIndex = newExtIndex;
 479                } else if (newIndex > newLen - CRC_LEN)
 480                        newIndex = newLen - CRC_LEN;
 481                newName[newIndex++] = CRC_MARK;
 482                valueCRC = crc_itu_t(0, fidName, fidNameLen);
 483                newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8);
 484                newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8);
 485                newName[newIndex++] = hex_asc_upper_hi(valueCRC);
 486                newName[newIndex++] = hex_asc_upper_lo(valueCRC);
 487
 488                if (hasExt) {
 489                        newName[newIndex++] = EXT_MARK;
 490                        for (index = 0; index < localExtIndex; index++)
 491                                newName[newIndex++] = ext[index];
 492                }
 493        }
 494
 495        return newIndex;
 496}
 497