utf8lib.c

   1 #include "quakedef.h"
   2 #include "utf8lib.h"
   3
   4 /*
   5 ================================================================================
   6 Initialization of UTF-8 support and new cvars.
   7 ================================================================================
   8 */
   9 // TODO: should this default to 1? To enforce compatibilty
  10 //       TODO: If changed to 1, add 'utf8_disabled 0' to defaultNexuiz.cfg
  11 cvar_t    utf8_disabled = {CVAR_SAVE, "utf8_disabled", "1", "Disable UTF-8 support. For compatibility, this is disabled by default in most games."};
  12
  13 void   u8_Init(void)
  14 {
  15         Cvar_RegisterVariable(&utf8_disabled);
  16 }
  17
  18 /*
  19 ================================================================================
  20 UTF-8 encoding and decoding functions follow.
  21 ================================================================================
  22 */
  23
  24 /** Analyze the next character and return various information if requested.
  25  * @param _s      An utf-8 string.
  26  * @param _start  Filled with the start byte-offset of the next valid character
  27  * @param _len    Fileed with the length of the next valid character
  28  * @param _ch     Filled with the unicode value of the next character
  29  * @return        Whether or not another valid character is in the string
  30  */
  31 static qboolean u8_analyze(const char *_s, size_t *_start, size_t *_len, Uchar *_ch)
  32 {
  33         const unsigned char *s = (const unsigned char*)_s;
  34         unsigned char bt, bc;
  35         size_t i;
  36         size_t bits, j;
  37         Uchar ch;
  38
  39         i = 0;
  40 findchar:
  41
  42         // <0xC2 is always an overlong encoding, they're invalid, thus skipped
  43         while (s[i] && s[i] >= 0x80 && s[i] <= 0xC2) {
  44                 //fprintf(stderr, "skipping\n");
  45                 ++i;
  46         }
  47         //fprintf(stderr, "checking\n");
  48
  49         // If we hit the end, well, we're out and invalid
  50         if (!s[i])
  51                 return false;
  52         //fprintf(stderr, "checking ascii\n");
  53
  54         // ascii characters
  55         if (s[i] < 0x80)
  56         {
  57                 if (_start) *_start = i;
  58                 if (_len) *_len = 1;
  59                 if (_ch) *_ch = (Uchar)s[i];
  60                 //fprintf(stderr, "valid ascii\n");
  61                 return true;
  62         }
  63         //fprintf(stderr, "checking length\n");
  64
  65         // Figure out the next char's length
  66         bc = s[i];
  67         bits = 1;
  68         // count the 1 bits, they're the # of bytes
  69         for (bt = 0x40; bt && (bc & bt); bt >>= 1, ++bits);
  70         if (!bt)
  71         {
  72                 //fprintf(stderr, "superlong\n");
  73                 ++i;
  74                 goto findchar;
  75         }
  76         // turn bt into a mask and give ch a starting value
  77         --bt;
  78         ch = (s[i] & bt);
  79         // check the byte sequence for invalid bytes
  80         for (j = 1; j < bits; ++j)
  81         {
  82                 // valid bit value: 10xx xxxx
  83                 //if (s[i+j] < 0x80 || s[i+j] >= 0xC0)
  84                 if ( (s[i+j] & 0xC0) != 0x80 )
  85                 {
  86                         //fprintf(stderr, "sequence of %i f'd at %i by %x\n", bits, j, (unsigned int)s[i+j]);
  87                         // this byte sequence is invalid, skip it
  88                         i += j;
  89                         // find a character after it
  90                         goto findchar;
  91                 }
  92                 // at the same time, decode the character
  93                 ch = (ch << 6) | (s[i+j] & 0x3F);
  94         }
  95
  96         // Now check the decoded byte for an overlong encoding
  97         if ( (bits >= 2 && ch < 0x80) ||
  98              (bits >= 3 && ch < 0x800) ||
  99              (bits >= 4 && ch < 0x10000) ||
 100              ch >= 0x10FFFF // RFC 3629
 101                 )
 102         {
 103                 i += bits;
 104                 //fprintf(stderr, "overlong: %i bytes for %x\n", bits, ch);
 105                 goto findchar;
 106         }
 107
 108         if (_start)
 109                 *_start = i;
 110         if (_len)
 111                 *_len = bits;
 112         if (_ch)
 113                 *_ch = ch;
 114         //fprintf(stderr, "valid utf8\n");
 115         return true;
 116 }
 117
 118 /** Get the number of characters in an UTF-8 string.
 119  * @param _s    An utf-8 encoded null-terminated string.
 120  * @return      The number of unicode characters in the string.
 121  */
 122 size_t u8_strlen(const char *_s)
 123 {
 124         size_t st, ln;
 125         size_t len = 0;
 126         const unsigned char *s = (const unsigned char*)_s;
 127
 128         if (utf8_disabled.integer)
 129                 return strlen(_s);
 130
 131         while (*s)
 132         {
 133                 // ascii char, skip u8_analyze
 134                 if (*s < 0x80)
 135                 {
 136                         ++len;
 137                         ++s;
 138                         continue;
 139                 }
 140
 141                 // invalid, skip u8_analyze
 142                 if (*s <= 0xC2)
 143                 {
 144                         ++s;
 145                         continue;
 146                 }
 147
 148                 if (!u8_analyze((const char*)s, &st, &ln, NULL))
 149                         break;
 150                 // valid character, skip after it
 151                 s += st + ln;
 152                 ++len;
 153         }
 154         return len;
 155 }
 156
 157 /** Get the number of characters in a part of an UTF-8 string.
 158  * @param _s    An utf-8 encoded null-terminated string.
 159  * @param n     The maximum number of bytes.
 160  * @return      The number of unicode characters in the string.
 161  */
 162 size_t u8_strnlen(const char *_s, size_t n)
 163 {
 164         size_t st, ln;
 165         size_t len = 0;
 166         const unsigned char *s = (const unsigned char*)_s;
 167
 168         if (utf8_disabled.integer)
 169         {
 170                 len = strlen(_s);
 171                 return (len < n) ? len : n;
 172         }
 173
 174         while (*s && n)
 175         {
 176                 // ascii char, skip u8_analyze
 177                 if (*s < 0x80)
 178                 {
 179                         ++len;
 180                         ++s;
 181                         --n;
 182                         continue;
 183                 }
 184
 185                 // invalid, skip u8_analyze
 186                 if (*s <= 0xC2)
 187                 {
 188                         ++s;
 189                         --n;
 190                         continue;
 191                 }
 192
 193                 if (!u8_analyze((const char*)s, &st, &ln, NULL))
 194                         break;
 195                 // valid character, see if it's still inside the range specified by n:
 196                 if (n < st + ln)
 197                         return len;
 198                 ++len;
 199                 n -= st + ln;
 200                 s += st + ln;
 201         }
 202         return len;
 203 }
 204
 205 /** Get the number of bytes used in a string to represent an amount of characters.
 206  * @param _s    An utf-8 encoded null-terminated string.
 207  * @param n     The number of characters we want to know the byte-size for.
 208  * @return      The number of bytes used to represent n characters.
 209  */
 210 size_t u8_bytelen(const char *_s, size_t n)
 211 {
 212         size_t st, ln;
 213         size_t len = 0;
 214         const unsigned char *s = (const unsigned char*)_s;
 215
 216         if (utf8_disabled.integer)
 217                 return n;
 218
 219         while (*s && n)
 220         {
 221                 // ascii char, skip u8_analyze
 222                 if (*s < 0x80)
 223                 {
 224                         ++len;
 225                         ++s;
 226                         --n;
 227                         continue;
 228                 }
 229
 230                 // invalid, skip u8_analyze
 231                 if (*s <= 0xC2)
 232                 {
 233                         ++s;
 234                         ++len;
 235                         continue;
 236                 }
 237
 238                 if (!u8_analyze((const char*)s, &st, &ln, NULL))
 239                         break;
 240                 --n;
 241                 s += st + ln;
 242                 len += st + ln;
 243         }
 244         return len;
 245 }
 246
 247 /** Get the byte-index for a character-index.
 248  * @param _s      An utf-8 encoded string.
 249  * @param i       The character-index for which you want the byte offset.
 250  * @param len     If not null, character's length will be stored in there.
 251  * @return        The byte-index at which the character begins, or -1 if the string is too short.
 252  */
 253 int u8_byteofs(const char *_s, size_t i, size_t *len)
 254 {
 255         size_t st, ln;
 256         size_t ofs = 0;
 257         const unsigned char *s = (const unsigned char*)_s;
 258
 259         if (utf8_disabled.integer)
 260         {
 261                 if (len) *len = 1;
 262                 return i;
 263         }
 264
 265         st = ln = 0;
 266         do
 267         {
 268                 ofs += ln;
 269                 if (!u8_analyze((const char*)s + ofs, &st, &ln, NULL))
 270                         return -1;
 271                 ofs += st;
 272         } while(i-- > 0);
 273         if (len)
 274                 *len = ln;
 275         return ofs;
 276 }
 277
 278 /** Get the char-index for a byte-index.
 279  * @param _s      An utf-8 encoded string.
 280  * @param i       The byte offset for which you want the character index.
 281  * @param len     If not null, the offset within the character is stored here.
 282  * @return        The character-index, or -1 if the string is too short.
 283  */
 284 int u8_charidx(const char *_s, size_t i, size_t *len)
 285 {
 286         size_t st, ln;
 287         size_t ofs = 0;
 288         size_t pofs = 0;
 289         int idx = 0;
 290         const unsigned char *s = (const unsigned char*)_s;
 291
 292         if (utf8_disabled.integer)
 293         {
 294                 if (len) *len = 0;
 295                 return i;
 296         }
 297
 298         while (ofs < i && s[ofs])
 299         {
 300                 // ascii character, skip u8_analyze
 301                 if (s[ofs] < 0x80)
 302                 {
 303                         pofs = ofs;
 304                         ++idx;
 305                         ++ofs;
 306                         continue;
 307                 }
 308
 309                 // invalid, skip u8_analyze
 310                 if (s[ofs] <= 0xC2)
 311                 {
 312                         ++ofs;
 313                         continue;
 314                 }
 315
 316                 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL))
 317                         return -1;
 318                 // see if next char is after the bytemark
 319                 if (ofs + st > i)
 320                 {
 321                         if (len)
 322                                 *len = i - pofs;
 323                         return idx;
 324                 }
 325                 ++idx;
 326                 pofs = ofs + st;
 327                 ofs += st + ln;
 328                 // see if bytemark is within the char
 329                 if (ofs > i)
 330                 {
 331                         if (len)
 332                                 *len = i - pofs;
 333                         return idx;
 334                 }
 335         }
 336         if (len) *len = 0;
 337         return idx;
 338 }
 339
 340 /** Get the byte offset of the previous byte.
 341  * The result equals:
 342  * prevchar_pos = u8_byteofs(text, u8_charidx(text, thischar_pos, NULL) - 1, NULL)
 343  * @param _s      An utf-8 encoded string.
 344  * @param i       The current byte offset.
 345  * @return        The byte offset of the previous character
 346  */
 347 size_t u8_prevbyte(const char *_s, size_t i)
 348 {
 349         size_t st, ln;
 350         const unsigned char *s = (const unsigned char*)_s;
 351         size_t lastofs = 0;
 352         size_t ofs = 0;
 353
 354         if (utf8_disabled.integer)
 355         {
 356                 if (i > 0)
 357                         return i-1;
 358                 return 0;
 359         }
 360
 361         while (ofs < i && s[ofs])
 362         {
 363                 // ascii character, skip u8_analyze
 364                 if (s[ofs] < 0x80)
 365                 {
 366                         lastofs = ofs++;
 367                         continue;
 368                 }
 369
 370                 // invalid, skip u8_analyze
 371                 if (s[ofs] <= 0xC2)
 372                 {
 373                         ++ofs;
 374                         continue;
 375                 }
 376
 377                 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL))
 378                         return lastofs;
 379                 if (ofs + st > i)
 380                         return lastofs;
 381                 if (ofs + st + ln >= i)
 382                         return ofs + st;
 383
 384                 lastofs = ofs;
 385                 ofs += st + ln;
 386         }
 387         return lastofs;
 388 }
 389
 390 /** Fetch a character from an utf-8 encoded string.
 391  * @param _s      The start of an utf-8 encoded multi-byte character.
 392  * @param _end    Will point to after the first multi-byte character.
 393  * @return        The 32-bit integer representation of the first multi-byte character or 0 for invalid characters.
 394  */
 395 Uchar u8_getchar(const char *_s, const char **_end)
 396 {
 397         size_t st, ln;
 398         Uchar ch;
 399
 400         if (utf8_disabled.integer)
 401         {
 402                 if (_end)
 403                         *_end = _s + 1;
 404                 //return 0xE000 + (Uchar)*(const unsigned char*)_s;
 405                 return (Uchar)*(const unsigned char*)_s;
 406         }
 407
 408         if (!u8_analyze(_s, &st, &ln, &ch))
 409                 return 0;
 410         if (_end)
 411                 *_end = _s + st + ln;
 412         return ch;
 413 }
 414
 415 /** Encode a wide-character into utf-8.
 416  * @param w        The wide character to encode.
 417  * @param to       The target buffer the utf-8 encoded string is stored to.
 418  * @param maxlen   The maximum number of bytes that fit into the target buffer.
 419  * @return         Number of bytes written to the buffer not including the terminating null.
 420  *                 Less or equal to 0 if the buffer is too small.
 421  */
 422 int u8_fromchar(Uchar w, char *to, size_t maxlen)
 423 {
 424         if (maxlen < 1)
 425                 return -2;
 426
 427         if (!w)
 428                 return -5;
 429
 430         if (w >= 0xE000 && utf8_disabled.integer)
 431                 w -= 0xE000;
 432
 433         if (w < 0x80 || utf8_disabled.integer)
 434         {
 435                 to[0] = (char)w;
 436                 if (maxlen < 2)
 437                         return -1;
 438                 to[1] = 0;
 439                 return 1;
 440         }
 441         // for a little speedup
 442         if (w < 0x800)
 443         {
 444                 if (maxlen < 3)
 445                 {
 446                         to[0] = 0;
 447                         return -1;
 448                 }
 449                 to[2] = 0;
 450                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 451                 to[0] = 0xC0 | w;
 452                 return 2;
 453         }
 454         if (w < 0x10000)
 455         {
 456                 if (maxlen < 4)
 457                 {
 458                         to[0] = 0;
 459                         return -1;
 460                 }
 461                 to[3] = 0;
 462                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 463                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 464                 to[0] = 0xE0 | w;
 465                 return 3;
 466         }
 467
 468         // RFC 3629
 469         if (w <= 0x10FFFF)
 470         {
 471                 if (maxlen < 5)
 472                 {
 473                         to[0] = 0;
 474                         return -1;
 475                 }
 476                 to[4] = 0;
 477                 to[3] = 0x80 | (w & 0x3F); w >>= 6;
 478                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 479                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 480                 to[0] = 0xE0 | w;
 481                 return 4;
 482         }
 483         return -1;
 484 }
 485
 486 /** uses u8_fromchar on a static buffer
 487  * @param ch        The unicode character to convert to encode
 488  * @param l         The number of bytes without the terminating null.
 489  * @return          A statically allocated buffer containing the character's utf8 representation, or NULL if it fails.
 490  */
 491 char *u8_encodech(Uchar ch, size_t *l)
 492 {
 493         static char buf[16];
 494         size_t len;
 495         len = u8_fromchar(ch, buf, sizeof(buf));
 496         if (len > 0)
 497         {
 498                 if (l) *l = len;
 499                 return buf;
 500         }
 501         return NULL;
 502 }
 503
 504 /** Convert a utf-8 multibyte string to a wide character string.
 505  * @param wcs       The target wide-character buffer.
 506  * @param mb        The utf-8 encoded multibyte string to convert.
 507  * @param maxlen    The maximum number of wide-characters that fit into the target buffer.
 508  * @return          The number of characters written to the target buffer.
 509  */
 510 size_t u8_mbstowcs(Uchar *wcs, const char *mb, size_t maxlen)
 511 {
 512         size_t i;
 513         Uchar ch;
 514         if (maxlen < 1)
 515                 return 0;
 516         for (i = 0; *mb && i < maxlen-1; ++i)
 517         {
 518                 ch = u8_getchar(mb, &mb);
 519                 if (!ch)
 520                         break;
 521                 wcs[i] = ch;
 522         }
 523         wcs[i] = 0;
 524         return i;
 525 }
 526
 527 /** Convert a wide-character string to a utf-8 multibyte string.
 528  * @param mb      The target buffer the utf-8 string is written to.
 529  * @param wcs     The wide-character string to convert.
 530  * @param maxlen  The number bytes that fit into the multibyte target buffer.
 531  * @return        The number of bytes written, not including the terminating \0
 532  */
 533 size_t u8_wcstombs(char *mb, const Uchar *wcs, size_t maxlen)
 534 {
 535         size_t i;
 536         const char *start = mb;
 537         if (maxlen < 2)
 538                 return 0;
 539         for (i = 0; wcs[i] && i < maxlen-1; ++i)
 540         {
 541                 int len;
 542                 if ( (len = u8_fromchar(wcs[i], mb, maxlen - i)) < 0)
 543                         return (mb - start);
 544                 mb += len;
 545         }
 546         *mb = 0;
 547         return (mb - start);
 548 }
 549
 550 /*
 551 ============
 552 UTF-8 aware COM_StringLengthNoColors
 553
 554 calculates the visible width of a color coded string.
 555
 556 *valid is filled with TRUE if the string is a valid colored string (that is, if
 557 it does not end with an unfinished color code). If it gets filled with FALSE, a
 558 fix would be adding a STRING_COLOR_TAG at the end of the string.
 559
 560 valid can be set to NULL if the caller doesn't care.
 561
 562 For size_s, specify the maximum number of characters from s to use, or 0 to use
 563 all characters until the zero terminator.
 564 ============
 565 */
 566 size_t
 567 COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid);
 568 size_t
 569 u8_COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid)
 570 {
 571         const char *end;
 572         size_t len = 0;
 573
 574         if (utf8_disabled.integer)
 575                 return COM_StringLengthNoColors(s, size_s, valid);
 576
 577         end = size_s ? (s + size_s) : NULL;
 578
 579         for(;;)
 580         {
 581                 switch((s == end) ? 0 : *s)
 582                 {
 583                         case 0:
 584                                 if(valid)
 585                                         *valid = TRUE;
 586                                 return len;
 587                         case STRING_COLOR_TAG:
 588                                 ++s;
 589                                 switch((s == end) ? 0 : *s)
 590                                 {
 591                                         case STRING_COLOR_RGB_TAG_CHAR:
 592                                                 if (s+1 != end && isxdigit(s[1]) &&
 593                                                         s+2 != end && isxdigit(s[2]) &&
 594                                                         s+3 != end && isxdigit(s[3]) )
 595                                                 {
 596                                                         s+=3;
 597                                                         break;
 598                                                 }
 599                                                 ++len; // STRING_COLOR_TAG
 600                                                 ++len; // STRING_COLOR_RGB_TAG_CHAR
 601                                                 break;
 602                                         case 0: // ends with unfinished color code!
 603                                                 ++len;
 604                                                 if(valid)
 605                                                         *valid = FALSE;
 606                                                 return len;
 607                                         case STRING_COLOR_TAG: // escaped ^
 608                                                 ++len;
 609                                                 break;
 610                                         case '0': case '1': case '2': case '3': case '4':
 611                                         case '5': case '6': case '7': case '8': case '9': // color code
 612                                                 break;
 613                                         default: // not a color code
 614                                                 ++len; // STRING_COLOR_TAG
 615                                                 ++len; // the character
 616                                                 break;
 617                                 }
 618                                 break;
 619                         default:
 620                                 ++len;
 621                                 break;
 622                 }
 623
 624                 // start of a wide character
 625                 if (*s & 0xC0)
 626                 {
 627                         for (++s; *s >= 0x80 && *s <= 0xC0; ++s);
 628                         continue;
 629                 }
 630                 // part of a wide character, we ignore that one
 631                 if (*s <= 0xBF)
 632                         --len;
 633                 ++s;
 634         }
 635         // never get here
 636 }