linux/fs/cifs/cifs_unicode.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 *   fs/cifs/cifs_unicode.c
   4 *
   5 *   Copyright (c) International Business Machines  Corp., 2000,2009
   6 *   Modified by Steve French (sfrench@us.ibm.com)
   7 */
   8#include <linux/fs.h>
   9#include <linux/slab.h>
  10#include "cifs_fs_sb.h"
  11#include "cifs_unicode.h"
  12#include "cifs_uniupr.h"
  13#include "cifspdu.h"
  14#include "cifsglob.h"
  15#include "cifs_debug.h"
  16
  17int cifs_remap(struct cifs_sb_info *cifs_sb)
  18{
  19        int map_type;
  20
  21        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR)
  22                map_type = SFM_MAP_UNI_RSVD;
  23        else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
  24                map_type = SFU_MAP_UNI_RSVD;
  25        else
  26                map_type = NO_MAP_UNI_RSVD;
  27
  28        return map_type;
  29}
  30
  31/* Convert character using the SFU - "Services for Unix" remapping range */
  32static bool
  33convert_sfu_char(const __u16 src_char, char *target)
  34{
  35        /*
  36         * BB: Cannot handle remapping UNI_SLASH until all the calls to
  37         *     build_path_from_dentry are modified, as they use slash as
  38         *     separator.
  39         */
  40        switch (src_char) {
  41        case UNI_COLON:
  42                *target = ':';
  43                break;
  44        case UNI_ASTERISK:
  45                *target = '*';
  46                break;
  47        case UNI_QUESTION:
  48                *target = '?';
  49                break;
  50        case UNI_PIPE:
  51                *target = '|';
  52                break;
  53        case UNI_GRTRTHAN:
  54                *target = '>';
  55                break;
  56        case UNI_LESSTHAN:
  57                *target = '<';
  58                break;
  59        default:
  60                return false;
  61        }
  62        return true;
  63}
  64
  65/* Convert character using the SFM - "Services for Mac" remapping range */
  66static bool
  67convert_sfm_char(const __u16 src_char, char *target)
  68{
  69        if (src_char >= 0xF001 && src_char <= 0xF01F) {
  70                *target = src_char - 0xF000;
  71                return true;
  72        }
  73        switch (src_char) {
  74        case SFM_COLON:
  75                *target = ':';
  76                break;
  77        case SFM_DOUBLEQUOTE:
  78                *target = '"';
  79                break;
  80        case SFM_ASTERISK:
  81                *target = '*';
  82                break;
  83        case SFM_QUESTION:
  84                *target = '?';
  85                break;
  86        case SFM_PIPE:
  87                *target = '|';
  88                break;
  89        case SFM_GRTRTHAN:
  90                *target = '>';
  91                break;
  92        case SFM_LESSTHAN:
  93                *target = '<';
  94                break;
  95        case SFM_SPACE:
  96                *target = ' ';
  97                break;
  98        case SFM_PERIOD:
  99                *target = '.';
 100                break;
 101        default:
 102                return false;
 103        }
 104        return true;
 105}
 106
 107
 108/*
 109 * cifs_mapchar - convert a host-endian char to proper char in codepage
 110 * @target - where converted character should be copied
 111 * @src_char - 2 byte host-endian source character
 112 * @cp - codepage to which character should be converted
 113 * @map_type - How should the 7 NTFS/SMB reserved characters be mapped to UCS2?
 114 *
 115 * This function handles the conversion of a single character. It is the
 116 * responsibility of the caller to ensure that the target buffer is large
 117 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
 118 */
 119static int
 120cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp,
 121             int maptype)
 122{
 123        int len = 1;
 124        __u16 src_char;
 125
 126        src_char = *from;
 127
 128        if ((maptype == SFM_MAP_UNI_RSVD) && convert_sfm_char(src_char, target))
 129                return len;
 130        else if ((maptype == SFU_MAP_UNI_RSVD) &&
 131                  convert_sfu_char(src_char, target))
 132                return len;
 133
 134        /* if character not one of seven in special remap set */
 135        len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
 136        if (len <= 0)
 137                goto surrogate_pair;
 138
 139        return len;
 140
 141surrogate_pair:
 142        /* convert SURROGATE_PAIR and IVS */
 143        if (strcmp(cp->charset, "utf8"))
 144                goto unknown;
 145        len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
 146        if (len <= 0)
 147                goto unknown;
 148        return len;
 149
 150unknown:
 151        *target = '?';
 152        len = 1;
 153        return len;
 154}
 155
 156/*
 157 * cifs_from_utf16 - convert utf16le string to local charset
 158 * @to - destination buffer
 159 * @from - source buffer
 160 * @tolen - destination buffer size (in bytes)
 161 * @fromlen - source buffer size (in bytes)
 162 * @codepage - codepage to which characters should be converted
 163 * @mapchar - should characters be remapped according to the mapchars option?
 164 *
 165 * Convert a little-endian utf16le string (as sent by the server) to a string
 166 * in the provided codepage. The tolen and fromlen parameters are to ensure
 167 * that the code doesn't walk off of the end of the buffer (which is always
 168 * a danger if the alignment of the source buffer is off). The destination
 169 * string is always properly null terminated and fits in the destination
 170 * buffer. Returns the length of the destination string in bytes (including
 171 * null terminator).
 172 *
 173 * Note that some windows versions actually send multiword UTF-16 characters
 174 * instead of straight UTF16-2. The linux nls routines however aren't able to
 175 * deal with those characters properly. In the event that we get some of
 176 * those characters, they won't be translated properly.
 177 */
 178int
 179cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
 180                const struct nls_table *codepage, int map_type)
 181{
 182        int i, charlen, safelen;
 183        int outlen = 0;
 184        int nullsize = nls_nullsize(codepage);
 185        int fromwords = fromlen / 2;
 186        char tmp[NLS_MAX_CHARSET_SIZE];
 187        __u16 ftmp[3];          /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */
 188
 189        /*
 190         * because the chars can be of varying widths, we need to take care
 191         * not to overflow the destination buffer when we get close to the
 192         * end of it. Until we get to this offset, we don't need to check
 193         * for overflow however.
 194         */
 195        safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
 196
 197        for (i = 0; i < fromwords; i++) {
 198                ftmp[0] = get_unaligned_le16(&from[i]);
 199                if (ftmp[0] == 0)
 200                        break;
 201                if (i + 1 < fromwords)
 202                        ftmp[1] = get_unaligned_le16(&from[i + 1]);
 203                else
 204                        ftmp[1] = 0;
 205                if (i + 2 < fromwords)
 206                        ftmp[2] = get_unaligned_le16(&from[i + 2]);
 207                else
 208                        ftmp[2] = 0;
 209
 210                /*
 211                 * check to see if converting this character might make the
 212                 * conversion bleed into the null terminator
 213                 */
 214                if (outlen >= safelen) {
 215                        charlen = cifs_mapchar(tmp, ftmp, codepage, map_type);
 216                        if ((outlen + charlen) > (tolen - nullsize))
 217                                break;
 218                }
 219
 220                /* put converted char into 'to' buffer */
 221                charlen = cifs_mapchar(&to[outlen], ftmp, codepage, map_type);
 222                outlen += charlen;
 223
 224                /* charlen (=bytes of UTF-8 for 1 character)
 225                 * 4bytes UTF-8(surrogate pair) is charlen=4
 226                 *   (4bytes UTF-16 code)
 227                 * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4
 228                 *   (2 UTF-8 pairs divided to 2 UTF-16 pairs) */
 229                if (charlen == 4)
 230                        i++;
 231                else if (charlen >= 5)
 232                        /* 5-6bytes UTF-8 */
 233                        i += 2;
 234        }
 235
 236        /* properly null-terminate string */
 237        for (i = 0; i < nullsize; i++)
 238                to[outlen++] = 0;
 239
 240        return outlen;
 241}
 242
 243/*
 244 * NAME:        cifs_strtoUTF16()
 245 *
 246 * FUNCTION:    Convert character string to unicode string
 247 *
 248 */
 249int
 250cifs_strtoUTF16(__le16 *to, const char *from, int len,
 251              const struct nls_table *codepage)
 252{
 253        int charlen;
 254        int i;
 255        wchar_t wchar_to; /* needed to quiet sparse */
 256
 257        /* special case for utf8 to handle no plane0 chars */
 258        if (!strcmp(codepage->charset, "utf8")) {
 259                /*
 260                 * convert utf8 -> utf16, we assume we have enough space
 261                 * as caller should have assumed conversion does not overflow
 262                 * in destination len is length in wchar_t units (16bits)
 263                 */
 264                i  = utf8s_to_utf16s(from, len, UTF16_LITTLE_ENDIAN,
 265                                       (wchar_t *) to, len);
 266
 267                /* if success terminate and exit */
 268                if (i >= 0)
 269                        goto success;
 270                /*
 271                 * if fails fall back to UCS encoding as this
 272                 * function should not return negative values
 273                 * currently can fail only if source contains
 274                 * invalid encoded characters
 275                 */
 276        }
 277
 278        for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
 279                charlen = codepage->char2uni(from, len, &wchar_to);
 280                if (charlen < 1) {
 281                        cifs_dbg(VFS, "strtoUTF16: char2uni of 0x%x returned %d\n",
 282                                 *from, charlen);
 283                        /* A question mark */
 284                        wchar_to = 0x003f;
 285                        charlen = 1;
 286                }
 287                put_unaligned_le16(wchar_to, &to[i]);
 288        }
 289
 290success:
 291        put_unaligned_le16(0, &to[i]);
 292        return i;
 293}
 294
 295/*
 296 * cifs_utf16_bytes - how long will a string be after conversion?
 297 * @utf16 - pointer to input string
 298 * @maxbytes - don't go past this many bytes of input string
 299 * @codepage - destination codepage
 300 *
 301 * Walk a utf16le string and return the number of bytes that the string will
 302 * be after being converted to the given charset, not including any null
 303 * termination required. Don't walk past maxbytes in the source buffer.
 304 */
 305int
 306cifs_utf16_bytes(const __le16 *from, int maxbytes,
 307                const struct nls_table *codepage)
 308{
 309        int i;
 310        int charlen, outlen = 0;
 311        int maxwords = maxbytes / 2;
 312        char tmp[NLS_MAX_CHARSET_SIZE];
 313        __u16 ftmp[3];
 314
 315        for (i = 0; i < maxwords; i++) {
 316                ftmp[0] = get_unaligned_le16(&from[i]);
 317                if (ftmp[0] == 0)
 318                        break;
 319                if (i + 1 < maxwords)
 320                        ftmp[1] = get_unaligned_le16(&from[i + 1]);
 321                else
 322                        ftmp[1] = 0;
 323                if (i + 2 < maxwords)
 324                        ftmp[2] = get_unaligned_le16(&from[i + 2]);
 325                else
 326                        ftmp[2] = 0;
 327
 328                charlen = cifs_mapchar(tmp, ftmp, codepage, NO_MAP_UNI_RSVD);
 329                outlen += charlen;
 330        }
 331
 332        return outlen;
 333}
 334
 335/*
 336 * cifs_strndup_from_utf16 - copy a string from wire format to the local
 337 * codepage
 338 * @src - source string
 339 * @maxlen - don't walk past this many bytes in the source string
 340 * @is_unicode - is this a unicode string?
 341 * @codepage - destination codepage
 342 *
 343 * Take a string given by the server, convert it to the local codepage and
 344 * put it in a new buffer. Returns a pointer to the new string or NULL on
 345 * error.
 346 */
 347char *
 348cifs_strndup_from_utf16(const char *src, const int maxlen,
 349                        const bool is_unicode, const struct nls_table *codepage)
 350{
 351        int len;
 352        char *dst;
 353
 354        if (is_unicode) {
 355                len = cifs_utf16_bytes((__le16 *) src, maxlen, codepage);
 356                len += nls_nullsize(codepage);
 357                dst = kmalloc(len, GFP_KERNEL);
 358                if (!dst)
 359                        return NULL;
 360                cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage,
 361                               NO_MAP_UNI_RSVD);
 362        } else {
 363                len = strnlen(src, maxlen);
 364                len++;
 365                dst = kmalloc(len, GFP_KERNEL);
 366                if (!dst)
 367                        return NULL;
 368                strlcpy(dst, src, len);
 369        }
 370
 371        return dst;
 372}
 373
 374static __le16 convert_to_sfu_char(char src_char)
 375{
 376        __le16 dest_char;
 377
 378        switch (src_char) {
 379        case ':':
 380                dest_char = cpu_to_le16(UNI_COLON);
 381                break;
 382        case '*':
 383                dest_char = cpu_to_le16(UNI_ASTERISK);
 384                break;
 385        case '?':
 386                dest_char = cpu_to_le16(UNI_QUESTION);
 387                break;
 388        case '<':
 389                dest_char = cpu_to_le16(UNI_LESSTHAN);
 390                break;
 391        case '>':
 392                dest_char = cpu_to_le16(UNI_GRTRTHAN);
 393                break;
 394        case '|':
 395                dest_char = cpu_to_le16(UNI_PIPE);
 396                break;
 397        default:
 398                dest_char = 0;
 399        }
 400
 401        return dest_char;
 402}
 403
 404static __le16 convert_to_sfm_char(char src_char, bool end_of_string)
 405{
 406        __le16 dest_char;
 407
 408        if (src_char >= 0x01 && src_char <= 0x1F) {
 409                dest_char = cpu_to_le16(src_char + 0xF000);
 410                return dest_char;
 411        }
 412        switch (src_char) {
 413        case ':':
 414                dest_char = cpu_to_le16(SFM_COLON);
 415                break;
 416        case '"':
 417                dest_char = cpu_to_le16(SFM_DOUBLEQUOTE);
 418                break;
 419        case '*':
 420                dest_char = cpu_to_le16(SFM_ASTERISK);
 421                break;
 422        case '?':
 423                dest_char = cpu_to_le16(SFM_QUESTION);
 424                break;
 425        case '<':
 426                dest_char = cpu_to_le16(SFM_LESSTHAN);
 427                break;
 428        case '>':
 429                dest_char = cpu_to_le16(SFM_GRTRTHAN);
 430                break;
 431        case '|':
 432                dest_char = cpu_to_le16(SFM_PIPE);
 433                break;
 434        case '.':
 435                if (end_of_string)
 436                        dest_char = cpu_to_le16(SFM_PERIOD);
 437                else
 438                        dest_char = 0;
 439                break;
 440        case ' ':
 441                if (end_of_string)
 442                        dest_char = cpu_to_le16(SFM_SPACE);
 443                else
 444                        dest_char = 0;
 445                break;
 446        default:
 447                dest_char = 0;
 448        }
 449
 450        return dest_char;
 451}
 452
 453/*
 454 * Convert 16 bit Unicode pathname to wire format from string in current code
 455 * page. Conversion may involve remapping up the six characters that are
 456 * only legal in POSIX-like OS (if they are present in the string). Path
 457 * names are little endian 16 bit Unicode on the wire
 458 */
 459int
 460cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
 461                 const struct nls_table *cp, int map_chars)
 462{
 463        int i, charlen;
 464        int j = 0;
 465        char src_char;
 466        __le16 dst_char;
 467        wchar_t tmp;
 468        wchar_t *wchar_to;      /* UTF-16 */
 469        int ret;
 470        unicode_t u;
 471
 472        if (map_chars == NO_MAP_UNI_RSVD)
 473                return cifs_strtoUTF16(target, source, PATH_MAX, cp);
 474
 475        wchar_to = kzalloc(6, GFP_KERNEL);
 476
 477        for (i = 0; i < srclen; j++) {
 478                src_char = source[i];
 479                charlen = 1;
 480
 481                /* check if end of string */
 482                if (src_char == 0)
 483                        goto ctoUTF16_out;
 484
 485                /* see if we must remap this char */
 486                if (map_chars == SFU_MAP_UNI_RSVD)
 487                        dst_char = convert_to_sfu_char(src_char);
 488                else if (map_chars == SFM_MAP_UNI_RSVD) {
 489                        bool end_of_string;
 490
 491                        /**
 492                         * Remap spaces and periods found at the end of every
 493                         * component of the path. The special cases of '.' and
 494                         * '..' do not need to be dealt with explicitly because
 495                         * they are addressed in namei.c:link_path_walk().
 496                         **/
 497                        if ((i == srclen - 1) || (source[i+1] == '\\'))
 498                                end_of_string = true;
 499                        else
 500                                end_of_string = false;
 501
 502                        dst_char = convert_to_sfm_char(src_char, end_of_string);
 503                } else
 504                        dst_char = 0;
 505                /*
 506                 * FIXME: We can not handle remapping backslash (UNI_SLASH)
 507                 * until all the calls to build_path_from_dentry are modified,
 508                 * as they use backslash as separator.
 509                 */
 510                if (dst_char == 0) {
 511                        charlen = cp->char2uni(source + i, srclen - i, &tmp);
 512                        dst_char = cpu_to_le16(tmp);
 513
 514                        /*
 515                         * if no match, use question mark, which at least in
 516                         * some cases serves as wild card
 517                         */
 518                        if (charlen > 0)
 519                                goto ctoUTF16;
 520
 521                        /* convert SURROGATE_PAIR */
 522                        if (strcmp(cp->charset, "utf8") || !wchar_to)
 523                                goto unknown;
 524                        if (*(source + i) & 0x80) {
 525                                charlen = utf8_to_utf32(source + i, 6, &u);
 526                                if (charlen < 0)
 527                                        goto unknown;
 528                        } else
 529                                goto unknown;
 530                        ret  = utf8s_to_utf16s(source + i, charlen,
 531                                               UTF16_LITTLE_ENDIAN,
 532                                               wchar_to, 6);
 533                        if (ret < 0)
 534                                goto unknown;
 535
 536                        i += charlen;
 537                        dst_char = cpu_to_le16(*wchar_to);
 538                        if (charlen <= 3)
 539                                /* 1-3bytes UTF-8 to 2bytes UTF-16 */
 540                                put_unaligned(dst_char, &target[j]);
 541                        else if (charlen == 4) {
 542                                /* 4bytes UTF-8(surrogate pair) to 4bytes UTF-16
 543                                 * 7-8bytes UTF-8(IVS) divided to 2 UTF-16
 544                                 *   (charlen=3+4 or 4+4) */
 545                                put_unaligned(dst_char, &target[j]);
 546                                dst_char = cpu_to_le16(*(wchar_to + 1));
 547                                j++;
 548                                put_unaligned(dst_char, &target[j]);
 549                        } else if (charlen >= 5) {
 550                                /* 5-6bytes UTF-8 to 6bytes UTF-16 */
 551                                put_unaligned(dst_char, &target[j]);
 552                                dst_char = cpu_to_le16(*(wchar_to + 1));
 553                                j++;
 554                                put_unaligned(dst_char, &target[j]);
 555                                dst_char = cpu_to_le16(*(wchar_to + 2));
 556                                j++;
 557                                put_unaligned(dst_char, &target[j]);
 558                        }
 559                        continue;
 560
 561unknown:
 562                        dst_char = cpu_to_le16(0x003f);
 563                        charlen = 1;
 564                }
 565
 566ctoUTF16:
 567                /*
 568                 * character may take more than one byte in the source string,
 569                 * but will take exactly two bytes in the target string
 570                 */
 571                i += charlen;
 572                put_unaligned(dst_char, &target[j]);
 573        }
 574
 575ctoUTF16_out:
 576        put_unaligned(0, &target[j]); /* Null terminate target unicode string */
 577        kfree(wchar_to);
 578        return j;
 579}
 580
 581/*
 582 * cifs_local_to_utf16_bytes - how long will a string be after conversion?
 583 * @from - pointer to input string
 584 * @maxbytes - don't go past this many bytes of input string
 585 * @codepage - source codepage
 586 *
 587 * Walk a string and return the number of bytes that the string will
 588 * be after being converted to the given charset, not including any null
 589 * termination required. Don't walk past maxbytes in the source buffer.
 590 */
 591
 592static int
 593cifs_local_to_utf16_bytes(const char *from, int len,
 594                          const struct nls_table *codepage)
 595{
 596        int charlen;
 597        int i;
 598        wchar_t wchar_to;
 599
 600        for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
 601                charlen = codepage->char2uni(from, len, &wchar_to);
 602                /* Failed conversion defaults to a question mark */
 603                if (charlen < 1)
 604                        charlen = 1;
 605        }
 606        return 2 * i; /* UTF16 characters are two bytes */
 607}
 608
 609/*
 610 * cifs_strndup_to_utf16 - copy a string to wire format from the local codepage
 611 * @src - source string
 612 * @maxlen - don't walk past this many bytes in the source string
 613 * @utf16_len - the length of the allocated string in bytes (including null)
 614 * @cp - source codepage
 615 * @remap - map special chars
 616 *
 617 * Take a string convert it from the local codepage to UTF16 and
 618 * put it in a new buffer. Returns a pointer to the new string or NULL on
 619 * error.
 620 */
 621__le16 *
 622cifs_strndup_to_utf16(const char *src, const int maxlen, int *utf16_len,
 623                      const struct nls_table *cp, int remap)
 624{
 625        int len;
 626        __le16 *dst;
 627
 628        len = cifs_local_to_utf16_bytes(src, maxlen, cp);
 629        len += 2; /* NULL */
 630        dst = kmalloc(len, GFP_KERNEL);
 631        if (!dst) {
 632                *utf16_len = 0;
 633                return NULL;
 634        }
 635        cifsConvertToUTF16(dst, src, strlen(src), cp, remap);
 636        *utf16_len = len;
 637        return dst;
 638}
 639