utf8lib.c

   1 #include "quakedef.h"
   2 #include "utf8lib.h"
   3
   4 /*
   5 ================================================================================
   6 Initialization of UTF-8 support and new cvars.
   7 ================================================================================
   8 */
   9 // for compatibility this defaults to 0
  10 cvar_t    utf8_enable = {CVAR_SAVE, "utf8_enable", "0", "Enable UTF-8 support. For compatibility, this is disabled by default in most games."};
  11
  12 void   u8_Init(void)
  13 {
  14         Cvar_RegisterVariable(&utf8_enable);
  15 }
  16
  17 /*
  18 ================================================================================
  19 UTF-8 encoding and decoding functions follow.
  20 ================================================================================
  21 */
  22
  23 /** Analyze the next character and return various information if requested.
  24  * @param _s      An utf-8 string.
  25  * @param _start  Filled with the start byte-offset of the next valid character
  26  * @param _len    Fileed with the length of the next valid character
  27  * @param _ch     Filled with the unicode value of the next character
  28  * @return        Whether or not another valid character is in the string
  29  */
  30 static qboolean u8_analyze(const char *_s, size_t *_start, size_t *_len, Uchar *_ch)
  31 {
  32         const unsigned char *s = (const unsigned char*)_s;
  33         unsigned char bt, bc;
  34         size_t i;
  35         size_t bits, j;
  36         Uchar ch;
  37
  38         i = 0;
  39 findchar:
  40
  41         // <0xC2 is always an overlong encoding, they're invalid, thus skipped
  42         while (s[i] && s[i] >= 0x80 && s[i] <= 0xC2) {
  43                 //fprintf(stderr, "skipping\n");
  44                 ++i;
  45         }
  46         //fprintf(stderr, "checking\n");
  47
  48         // If we hit the end, well, we're out and invalid
  49         if (!s[i])
  50                 return false;
  51         //fprintf(stderr, "checking ascii\n");
  52
  53         // ascii characters
  54         if (s[i] < 0x80)
  55         {
  56                 if (_start) *_start = i;
  57                 if (_len) *_len = 1;
  58                 if (_ch) *_ch = (Uchar)s[i];
  59                 //fprintf(stderr, "valid ascii\n");
  60                 return true;
  61         }
  62         //fprintf(stderr, "checking length\n");
  63
  64         // Figure out the next char's length
  65         bc = s[i];
  66         bits = 1;
  67         // count the 1 bits, they're the # of bytes
  68         for (bt = 0x40; bt && (bc & bt); bt >>= 1, ++bits);
  69         if (!bt)
  70         {
  71                 //fprintf(stderr, "superlong\n");
  72                 ++i;
  73                 goto findchar;
  74         }
  75         // turn bt into a mask and give ch a starting value
  76         --bt;
  77         ch = (s[i] & bt);
  78         // check the byte sequence for invalid bytes
  79         for (j = 1; j < bits; ++j)
  80         {
  81                 // valid bit value: 10xx xxxx
  82                 //if (s[i+j] < 0x80 || s[i+j] >= 0xC0)
  83                 if ( (s[i+j] & 0xC0) != 0x80 )
  84                 {
  85                         //fprintf(stderr, "sequence of %i f'd at %i by %x\n", bits, j, (unsigned int)s[i+j]);
  86                         // this byte sequence is invalid, skip it
  87                         i += j;
  88                         // find a character after it
  89                         goto findchar;
  90                 }
  91                 // at the same time, decode the character
  92                 ch = (ch << 6) | (s[i+j] & 0x3F);
  93         }
  94
  95         // Now check the decoded byte for an overlong encoding
  96         if ( (bits >= 2 && ch < 0x80) ||
  97              (bits >= 3 && ch < 0x800) ||
  98              (bits >= 4 && ch < 0x10000) ||
  99              ch >= 0x10FFFF // RFC 3629
 100                 )
 101         {
 102                 i += bits;
 103                 //fprintf(stderr, "overlong: %i bytes for %x\n", bits, ch);
 104                 goto findchar;
 105         }
 106
 107         if (_start)
 108                 *_start = i;
 109         if (_len)
 110                 *_len = bits;
 111         if (_ch)
 112                 *_ch = ch;
 113         //fprintf(stderr, "valid utf8\n");
 114         return true;
 115 }
 116
 117 /** Get the number of characters in an UTF-8 string.
 118  * @param _s    An utf-8 encoded null-terminated string.
 119  * @return      The number of unicode characters in the string.
 120  */
 121 size_t u8_strlen(const char *_s)
 122 {
 123         size_t st, ln;
 124         size_t len = 0;
 125         const unsigned char *s = (const unsigned char*)_s;
 126
 127         if (!utf8_enable.integer)
 128                 return strlen(_s);
 129
 130         while (*s)
 131         {
 132                 // ascii char, skip u8_analyze
 133                 if (*s < 0x80)
 134                 {
 135                         ++len;
 136                         ++s;
 137                         continue;
 138                 }
 139
 140                 // invalid, skip u8_analyze
 141                 if (*s <= 0xC2)
 142                 {
 143                         ++s;
 144                         continue;
 145                 }
 146
 147                 if (!u8_analyze((const char*)s, &st, &ln, NULL))
 148                         break;
 149                 // valid character, skip after it
 150                 s += st + ln;
 151                 ++len;
 152         }
 153         return len;
 154 }
 155
 156 /** Get the number of characters in a part of an UTF-8 string.
 157  * @param _s    An utf-8 encoded null-terminated string.
 158  * @param n     The maximum number of bytes.
 159  * @return      The number of unicode characters in the string.
 160  */
 161 size_t u8_strnlen(const char *_s, size_t n)
 162 {
 163         size_t st, ln;
 164         size_t len = 0;
 165         const unsigned char *s = (const unsigned char*)_s;
 166
 167         if (!utf8_enable.integer)
 168         {
 169                 len = strlen(_s);
 170                 return (len < n) ? len : n;
 171         }
 172
 173         while (*s && n)
 174         {
 175                 // ascii char, skip u8_analyze
 176                 if (*s < 0x80)
 177                 {
 178                         ++len;
 179                         ++s;
 180                         --n;
 181                         continue;
 182                 }
 183
 184                 // invalid, skip u8_analyze
 185                 if (*s <= 0xC2)
 186                 {
 187                         ++s;
 188                         --n;
 189                         continue;
 190                 }
 191
 192                 if (!u8_analyze((const char*)s, &st, &ln, NULL))
 193                         break;
 194                 // valid character, see if it's still inside the range specified by n:
 195                 if (n < st + ln)
 196                         return len;
 197                 ++len;
 198                 n -= st + ln;
 199                 s += st + ln;
 200         }
 201         return len;
 202 }
 203
 204 /** Get the number of bytes used in a string to represent an amount of characters.
 205  * @param _s    An utf-8 encoded null-terminated string.
 206  * @param n     The number of characters we want to know the byte-size for.
 207  * @return      The number of bytes used to represent n characters.
 208  */
 209 size_t u8_bytelen(const char *_s, size_t n)
 210 {
 211         size_t st, ln;
 212         size_t len = 0;
 213         const unsigned char *s = (const unsigned char*)_s;
 214
 215         if (!utf8_enable.integer)
 216                 return n;
 217
 218         while (*s && n)
 219         {
 220                 // ascii char, skip u8_analyze
 221                 if (*s < 0x80)
 222                 {
 223                         ++len;
 224                         ++s;
 225                         --n;
 226                         continue;
 227                 }
 228
 229                 // invalid, skip u8_analyze
 230                 if (*s <= 0xC2)
 231                 {
 232                         ++s;
 233                         ++len;
 234                         continue;
 235                 }
 236
 237                 if (!u8_analyze((const char*)s, &st, &ln, NULL))
 238                         break;
 239                 --n;
 240                 s += st + ln;
 241                 len += st + ln;
 242         }
 243         return len;
 244 }
 245
 246 /** Get the byte-index for a character-index.
 247  * @param _s      An utf-8 encoded string.
 248  * @param i       The character-index for which you want the byte offset.
 249  * @param len     If not null, character's length will be stored in there.
 250  * @return        The byte-index at which the character begins, or -1 if the string is too short.
 251  */
 252 int u8_byteofs(const char *_s, size_t i, size_t *len)
 253 {
 254         size_t st, ln;
 255         size_t ofs = 0;
 256         const unsigned char *s = (const unsigned char*)_s;
 257
 258         if (!utf8_enable.integer)
 259         {
 260                 if (len) *len = 1;
 261                 return i;
 262         }
 263
 264         st = ln = 0;
 265         do
 266         {
 267                 ofs += ln;
 268                 if (!u8_analyze((const char*)s + ofs, &st, &ln, NULL))
 269                         return -1;
 270                 ofs += st;
 271         } while(i-- > 0);
 272         if (len)
 273                 *len = ln;
 274         return ofs;
 275 }
 276
 277 /** Get the char-index for a byte-index.
 278  * @param _s      An utf-8 encoded string.
 279  * @param i       The byte offset for which you want the character index.
 280  * @param len     If not null, the offset within the character is stored here.
 281  * @return        The character-index, or -1 if the string is too short.
 282  */
 283 int u8_charidx(const char *_s, size_t i, size_t *len)
 284 {
 285         size_t st, ln;
 286         size_t ofs = 0;
 287         size_t pofs = 0;
 288         int idx = 0;
 289         const unsigned char *s = (const unsigned char*)_s;
 290
 291         if (!utf8_enable.integer)
 292         {
 293                 if (len) *len = 0;
 294                 return i;
 295         }
 296
 297         while (ofs < i && s[ofs])
 298         {
 299                 // ascii character, skip u8_analyze
 300                 if (s[ofs] < 0x80)
 301                 {
 302                         pofs = ofs;
 303                         ++idx;
 304                         ++ofs;
 305                         continue;
 306                 }
 307
 308                 // invalid, skip u8_analyze
 309                 if (s[ofs] <= 0xC2)
 310                 {
 311                         ++ofs;
 312                         continue;
 313                 }
 314
 315                 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL))
 316                         return -1;
 317                 // see if next char is after the bytemark
 318                 if (ofs + st > i)
 319                 {
 320                         if (len)
 321                                 *len = i - pofs;
 322                         return idx;
 323                 }
 324                 ++idx;
 325                 pofs = ofs + st;
 326                 ofs += st + ln;
 327                 // see if bytemark is within the char
 328                 if (ofs > i)
 329                 {
 330                         if (len)
 331                                 *len = i - pofs;
 332                         return idx;
 333                 }
 334         }
 335         if (len) *len = 0;
 336         return idx;
 337 }
 338
 339 /** Get the byte offset of the previous byte.
 340  * The result equals:
 341  * prevchar_pos = u8_byteofs(text, u8_charidx(text, thischar_pos, NULL) - 1, NULL)
 342  * @param _s      An utf-8 encoded string.
 343  * @param i       The current byte offset.
 344  * @return        The byte offset of the previous character
 345  */
 346 size_t u8_prevbyte(const char *_s, size_t i)
 347 {
 348         size_t st, ln;
 349         const unsigned char *s = (const unsigned char*)_s;
 350         size_t lastofs = 0;
 351         size_t ofs = 0;
 352
 353         if (!utf8_enable.integer)
 354         {
 355                 if (i > 0)
 356                         return i-1;
 357                 return 0;
 358         }
 359
 360         while (ofs < i && s[ofs])
 361         {
 362                 // ascii character, skip u8_analyze
 363                 if (s[ofs] < 0x80)
 364                 {
 365                         lastofs = ofs++;
 366                         continue;
 367                 }
 368
 369                 // invalid, skip u8_analyze
 370                 if (s[ofs] <= 0xC2)
 371                 {
 372                         ++ofs;
 373                         continue;
 374                 }
 375
 376                 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL))
 377                         return lastofs;
 378                 if (ofs + st > i)
 379                         return lastofs;
 380                 if (ofs + st + ln >= i)
 381                         return ofs + st;
 382
 383                 lastofs = ofs;
 384                 ofs += st + ln;
 385         }
 386         return lastofs;
 387 }
 388
 389 static int char_usefont[256] = {
 390         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // specials
 391         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // specials
 392         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // shift+digit line
 393         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // digits
 394         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // caps
 395         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // caps
 396         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // small
 397         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // small
 398         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // specials
 399         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // faces
 400         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 401         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 402         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 403         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 404         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 405         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 406 };
 407
 408
 409 /** Fetch a character from an utf-8 encoded string.
 410  * @param _s      The start of an utf-8 encoded multi-byte character.
 411  * @param _end    Will point to after the first multi-byte character.
 412  * @return        The 32-bit integer representation of the first multi-byte character or 0 for invalid characters.
 413  */
 414 Uchar u8_getchar(const char *_s, const char **_end)
 415 {
 416         size_t st, ln;
 417         Uchar ch;
 418
 419         if (!utf8_enable.integer)
 420         {
 421                 if (_end)
 422                         *_end = _s + 1;
 423                 /* Careful: if we disable utf8 but not freetype, we wish to see freetype chars
 424                  * for normal letters. So use E000+x for special chars, but leave the freetype stuff for the
 425                  * rest:
 426                  */
 427                 if (!char_usefont[(unsigned int)*(const unsigned char*)_s])
 428                         return 0xE000 + (Uchar)*(const unsigned char*)_s;
 429                 return (Uchar)*(const unsigned char*)_s;
 430         }
 431
 432         if (!u8_analyze(_s, &st, &ln, &ch))
 433                 return 0;
 434         if (_end)
 435                 *_end = _s + st + ln;
 436         return ch;
 437 }
 438
 439 /** Encode a wide-character into utf-8.
 440  * @param w        The wide character to encode.
 441  * @param to       The target buffer the utf-8 encoded string is stored to.
 442  * @param maxlen   The maximum number of bytes that fit into the target buffer.
 443  * @return         Number of bytes written to the buffer not including the terminating null.
 444  *                 Less or equal to 0 if the buffer is too small.
 445  */
 446 int u8_fromchar(Uchar w, char *to, size_t maxlen)
 447 {
 448         if (maxlen < 1)
 449                 return -2;
 450
 451         if (!w)
 452                 return -5;
 453
 454         if (w >= 0xE000 && !utf8_enable.integer)
 455                 w -= 0xE000;
 456
 457         if (w < 0x80 || !utf8_enable.integer)
 458         {
 459                 to[0] = (char)w;
 460                 if (maxlen < 2)
 461                         return -1;
 462                 to[1] = 0;
 463                 return 1;
 464         }
 465         // for a little speedup
 466         if (w < 0x800)
 467         {
 468                 if (maxlen < 3)
 469                 {
 470                         to[0] = 0;
 471                         return -1;
 472                 }
 473                 to[2] = 0;
 474                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 475                 to[0] = 0xC0 | w;
 476                 return 2;
 477         }
 478         if (w < 0x10000)
 479         {
 480                 if (maxlen < 4)
 481                 {
 482                         to[0] = 0;
 483                         return -1;
 484                 }
 485                 to[3] = 0;
 486                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 487                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 488                 to[0] = 0xE0 | w;
 489                 return 3;
 490         }
 491
 492         // RFC 3629
 493         if (w <= 0x10FFFF)
 494         {
 495                 if (maxlen < 5)
 496                 {
 497                         to[0] = 0;
 498                         return -1;
 499                 }
 500                 to[4] = 0;
 501                 to[3] = 0x80 | (w & 0x3F); w >>= 6;
 502                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 503                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 504                 to[0] = 0xE0 | w;
 505                 return 4;
 506         }
 507         return -1;
 508 }
 509
 510 /** uses u8_fromchar on a static buffer
 511  * @param ch        The unicode character to convert to encode
 512  * @param l         The number of bytes without the terminating null.
 513  * @return          A statically allocated buffer containing the character's utf8 representation, or NULL if it fails.
 514  */
 515 char *u8_encodech(Uchar ch, size_t *l)
 516 {
 517         static char buf[16];
 518         size_t len;
 519         len = u8_fromchar(ch, buf, sizeof(buf));
 520         if (len > 0)
 521         {
 522                 if (l) *l = len;
 523                 return buf;
 524         }
 525         return NULL;
 526 }
 527
 528 /** Convert a utf-8 multibyte string to a wide character string.
 529  * @param wcs       The target wide-character buffer.
 530  * @param mb        The utf-8 encoded multibyte string to convert.
 531  * @param maxlen    The maximum number of wide-characters that fit into the target buffer.
 532  * @return          The number of characters written to the target buffer.
 533  */
 534 size_t u8_mbstowcs(Uchar *wcs, const char *mb, size_t maxlen)
 535 {
 536         size_t i;
 537         Uchar ch;
 538         if (maxlen < 1)
 539                 return 0;
 540         for (i = 0; *mb && i < maxlen-1; ++i)
 541         {
 542                 ch = u8_getchar(mb, &mb);
 543                 if (!ch)
 544                         break;
 545                 wcs[i] = ch;
 546         }
 547         wcs[i] = 0;
 548         return i;
 549 }
 550
 551 /** Convert a wide-character string to a utf-8 multibyte string.
 552  * @param mb      The target buffer the utf-8 string is written to.
 553  * @param wcs     The wide-character string to convert.
 554  * @param maxlen  The number bytes that fit into the multibyte target buffer.
 555  * @return        The number of bytes written, not including the terminating \0
 556  */
 557 size_t u8_wcstombs(char *mb, const Uchar *wcs, size_t maxlen)
 558 {
 559         size_t i;
 560         const char *start = mb;
 561         if (maxlen < 2)
 562                 return 0;
 563         for (i = 0; wcs[i] && i < maxlen-1; ++i)
 564         {
 565                 int len;
 566                 if ( (len = u8_fromchar(wcs[i], mb, maxlen - i)) < 0)
 567                         return (mb - start);
 568                 mb += len;
 569         }
 570         *mb = 0;
 571         return (mb - start);
 572 }
 573
 574 /*
 575 ============
 576 UTF-8 aware COM_StringLengthNoColors
 577
 578 calculates the visible width of a color coded string.
 579
 580 *valid is filled with TRUE if the string is a valid colored string (that is, if
 581 it does not end with an unfinished color code). If it gets filled with FALSE, a
 582 fix would be adding a STRING_COLOR_TAG at the end of the string.
 583
 584 valid can be set to NULL if the caller doesn't care.
 585
 586 For size_s, specify the maximum number of characters from s to use, or 0 to use
 587 all characters until the zero terminator.
 588 ============
 589 */
 590 size_t
 591 COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid);
 592 size_t
 593 u8_COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid)
 594 {
 595         const char *end;
 596         size_t len = 0;
 597
 598         if (!utf8_enable.integer)
 599                 return COM_StringLengthNoColors(s, size_s, valid);
 600
 601         end = size_s ? (s + size_s) : NULL;
 602
 603         for(;;)
 604         {
 605                 switch((s == end) ? 0 : *s)
 606                 {
 607                         case 0:
 608                                 if(valid)
 609                                         *valid = TRUE;
 610                                 return len;
 611                         case STRING_COLOR_TAG:
 612                                 ++s;
 613                                 switch((s == end) ? 0 : *s)
 614                                 {
 615                                         case STRING_COLOR_RGB_TAG_CHAR:
 616                                                 if (s+1 != end && isxdigit(s[1]) &&
 617                                                         s+2 != end && isxdigit(s[2]) &&
 618                                                         s+3 != end && isxdigit(s[3]) )
 619                                                 {
 620                                                         s+=3;
 621                                                         break;
 622                                                 }
 623                                                 ++len; // STRING_COLOR_TAG
 624                                                 ++len; // STRING_COLOR_RGB_TAG_CHAR
 625                                                 break;
 626                                         case 0: // ends with unfinished color code!
 627                                                 ++len;
 628                                                 if(valid)
 629                                                         *valid = FALSE;
 630                                                 return len;
 631                                         case STRING_COLOR_TAG: // escaped ^
 632                                                 ++len;
 633                                                 break;
 634                                         case '0': case '1': case '2': case '3': case '4':
 635                                         case '5': case '6': case '7': case '8': case '9': // color code
 636                                                 break;
 637                                         default: // not a color code
 638                                                 ++len; // STRING_COLOR_TAG
 639                                                 ++len; // the character
 640                                                 break;
 641                                 }
 642                                 break;
 643                         default:
 644                                 ++len;
 645                                 break;
 646                 }
 647
 648                 // start of a wide character
 649                 if (*s & 0xC0)
 650                 {
 651                         for (++s; *s >= 0x80 && *s <= 0xC0; ++s);
 652                         continue;
 653                 }
 654                 // part of a wide character, we ignore that one
 655                 if (*s <= 0xBF)
 656                         --len;
 657                 ++s;
 658         }
 659         // never get here
 660 }