utf8lib.c

   1 #include "quakedef.h"
   2 #include "utf8lib.h"
   3
   4 /*
   5 ================================================================================
   6 Initialization of UTF-8 support and new cvars.
   7 ================================================================================
   8 */
   9 // TODO: should this default to 1? To enforce compatibilty
  10 //       TODO: If changed to 1, add 'utf8_disabled 0' to defaultNexuiz.cfg
  11 cvar_t    utf8_disabled = {CVAR_SAVE, "utf8_disabled", "0", "Disable UTF-8 support."};
  12
  13 void   u8_Init(void)
  14 {
  15         Cvar_RegisterVariable(&utf8_disabled);
  16 }
  17
  18 /*
  19 ================================================================================
  20 UTF-8 encoding and decoding functions follow.
  21 ================================================================================
  22 */
  23
  24 /** Validate that this strings starts with a valid utf8 character.
  25  * @param _s    An utf-8 encoded null-terminated string.
  26  * @return
  27  */
  28 static inline qboolean u8_validate(const char *_s)
  29 {
  30         const unsigned char *s = (const unsigned char*)_s;
  31         if (*s < 0x80) // ascii
  32                 return true;
  33         if (*s < 0xC0) // in-between
  34                 return false;
  35         if (*s < 0xC2) // overlong encoding, not allowed
  36                 return false;
  37         if (*s < 0xF5) // valid start of a sequence
  38                 return true;
  39         // anything else is restricted since RFC 3629, November 2003
  40         return false;
  41 }
  42
  43 /** Get the number of characters in an UTF-8 string.
  44  * @param _s    An utf-8 encoded null-terminated string.
  45  * @return      The number of unicode characters in the string.
  46  */
  47 size_t u8_strlen(const char *_s)
  48 {
  49         size_t len = 0;
  50         const unsigned char *s = (const unsigned char*)_s;
  51
  52         if (utf8_disabled.integer)
  53                 return strlen(_s);
  54
  55         while (*s)
  56         {
  57                 // ascii char
  58                 if (*s < 0x80)
  59                 {
  60                         ++len;
  61                         ++s;
  62                         continue;
  63                 }
  64
  65                 // part of a wide character, we ignore that one
  66                 if (*s < 0xC0) // 10111111
  67                 {
  68                         ++s;
  69                         continue;
  70                 }
  71
  72                 // start of a wide character
  73                 if (u8_validate((const char*)s))
  74                         ++len;
  75                 for (++s; *s >= 0x80 && *s <= 0xC0; ++s);
  76         }
  77         return len;
  78 }
  79
  80 /** Get the number of bytes used in a string to represent an amount of characters.
  81  * @param _s    An utf-8 encoded null-terminated string.
  82  * @param n     The number of characters we want to know the byte-size for.
  83  * @return      The number of bytes used to represent n characters.
  84  */
  85 size_t u8_bytelen(const char *_s, size_t n)
  86 {
  87         size_t len = 0;
  88         const unsigned char *s = (const unsigned char*)_s;
  89
  90         if (utf8_disabled.integer)
  91                 return n;
  92
  93         while (*s && n)
  94         {
  95                 // ascii char
  96                 if (*s < 0x80)
  97                 {
  98                         ++len;
  99                         ++s;
 100                         --n;
 101                         continue;
 102                 }
 103
 104                 // part of a wide character, this time we cannot ignore it
 105                 if (*s < 0xC0) // 10111111
 106                 {
 107                         ++s;
 108                         ++len;
 109                         continue;
 110                 }
 111
 112                 // start of a wide character
 113                 for (++len, ++s; *s >= 0x80 && *s < 0xC0; ++s, ++len);
 114                 --n;
 115         }
 116         return len;
 117 }
 118
 119 /** Get the byte-index for a character-index.
 120  * @param _s      An utf-8 encoded string.
 121  * @param i       The character-index for which you want the byte offset.
 122  * @param len     If not null, character's length will be stored in there.
 123  * @return        The byte-index at which the character begins, or -1 if the string is too short.
 124  */
 125 int u8_byteofs(const char *_s, size_t i, size_t *len)
 126 {
 127         size_t ofs = 0;
 128         const unsigned char *s = (const unsigned char*)_s;
 129
 130         if (utf8_disabled.integer)
 131         {
 132                 if (len) *len = 0;
 133                 return i;
 134         }
 135
 136         while (i > 0 && s[ofs])
 137         {
 138                 // ascii character
 139                 if (s[ofs] < 0x80)
 140                 {
 141                         ++ofs;
 142                         --i;
 143                         continue;
 144                 }
 145
 146                 // part of a wide character, we ignore that one
 147                 if (s[ofs] < 0xC0)
 148                 {
 149                         ++ofs;
 150                         continue;
 151                 }
 152
 153                 // start of a wide character
 154                 if (u8_validate((const char*)s))
 155                         --i;
 156                 for (++ofs; s[ofs] >= 0x80 && s[ofs] <= 0xC0; ++ofs);
 157         }
 158         if (!s[ofs])
 159                 return -1;
 160         if (len) {
 161                 if (s[ofs] < 0x80)
 162                         *len = 1;
 163                 else if (s[ofs] & 0xC0)
 164                 {
 165                         size_t i;
 166                         for (i = 1; s[ofs+i] >= 0x80 && s[ofs+i] <= 0xC0; ++i);
 167                         *len = i;
 168                 }
 169                 else if (s[ofs] < 0xC0)
 170                         *len = 0;
 171         }
 172         return ofs;
 173 }
 174
 175 /** Get the char-index for a byte-index.
 176  * @param _s      An utf-8 encoded string.
 177  * @param i       The byte offset for which you want the character index.
 178  * @param len     If not null, the offset within the character is stored here.
 179  * @return        The character-index, or -1 if the string is too short.
 180  */
 181 int u8_charidx(const char *_s, size_t i, size_t *len)
 182 {
 183         size_t ofs = 0;
 184         int idx = 0;
 185         size_t start;
 186         const unsigned char *s = (const unsigned char*)_s;
 187
 188         if (utf8_disabled.integer)
 189         {
 190                 if (len) *len = 0;
 191                 return i;
 192         }
 193
 194         while (ofs < i && s[ofs])
 195         {
 196                 // ascii character
 197                 if (s[ofs] < 0x80)
 198                 {
 199                         ++idx;
 200                         ++ofs;
 201                         continue;
 202                 }
 203
 204                 // part of a wide character, weignore that one
 205                 if (s[ofs] < 0xC0)
 206                 {
 207                         ++ofs;
 208                         continue;
 209                 }
 210
 211                 // start of a wide character
 212                 start = ofs;
 213                 if (!u8_validate((const char*)s+ofs))
 214                 {
 215                         // invalid byte
 216                         ++ofs;
 217                         continue;
 218                 }
 219                 for (++ofs; s[ofs] >= 0x80 && s[ofs] < 0xC0 && ofs < i; ++ofs);
 220                 if (s[ofs] >= 0x80 && s[ofs] < 0xC0)
 221                 {
 222                         // it ends within this character
 223                         if (len)
 224                                 *len = ofs - start;
 225                         return idx;
 226                 }
 227                 ++idx;
 228                 continue;
 229         }
 230         if (len) *len = 0;
 231         return idx;
 232 }
 233
 234 /** Get the byte offset of the previous byte.
 235  * The result equals:
 236  * prevchar_pos = u8_byteofs(text, u8_charidx(text, thischar_pos, NULL) - 1, NULL)
 237  * @param _s      An utf-8 encoded string.
 238  * @param i       The current byte offset.
 239  * @return        The byte offset of the previous character
 240  */
 241 size_t u8_prevbyte(const char *_s, size_t i)
 242 {
 243         const unsigned char *s = (const unsigned char*)_s;
 244         size_t lastofs = 0;
 245         size_t ofs = 0;
 246
 247         if (utf8_disabled.integer)
 248         {
 249                 if (i > 0)
 250                         return i-1;
 251                 return 0;
 252         }
 253
 254         while (ofs < i && s[ofs])
 255         {
 256                 // ascii character
 257                 if (s[ofs] < 0x80)
 258                 {
 259                         lastofs = ofs++;
 260                         continue;
 261                 }
 262
 263                 // part of a wide character, we ignore that one
 264                 if (s[ofs] < 0xC0)
 265                 {
 266                         ++ofs;
 267                         continue;
 268                 }
 269
 270                 // start of a wide character
 271                 if (!u8_validate((const char*)s+ofs))
 272                 {
 273                         // invalid byte
 274                         ++ofs;
 275                         continue;
 276                 }
 277                 lastofs = ofs;
 278                 for (++ofs; s[ofs] >= 0x80 && s[ofs] < 0xC0 && ofs < i; ++ofs);
 279                 if (s[ofs] >= 0x80 && s[ofs] < 0xC0)
 280                 {
 281                         // it ends within this character
 282                         return lastofs;
 283                 }
 284         }
 285         return lastofs;
 286 }
 287
 288 /** Fetch a character from an utf-8 encoded string.
 289  * @param _s      The start of an utf-8 encoded multi-byte character.
 290  * @param _end    Will point to after the first multi-byte character.
 291  * @return        The 32-bit integer representation of the first multi-byte character or 0 for invalid characters.
 292  */
 293 Uchar u8_getchar(const char *_s, const char **_end)
 294 {
 295         const unsigned char *s = (const unsigned char*)_s;
 296         Uchar u;
 297         unsigned char mask;
 298         unsigned char v;
 299
 300         if (utf8_disabled.integer)
 301         {
 302                 if (_end)
 303                         *_end = _s + 1;
 304                 return 0xE000 + (Uchar)*s;
 305         }
 306
 307         if (*s < 0x80)
 308         {
 309                 if (_end)
 310                         *_end = _s + 1;
 311                 return (Uchar)*s;
 312         }
 313
 314         if (*s < 0xC0)
 315         {
 316                 // starting within a wide character - skip it and retrieve the one after it
 317                 for (++s; *s >= 0x80 && *s < 0xC0; ++s);
 318                 // or we could return '?' here?
 319         }
 320
 321         while (!u8_validate((const char*)s))
 322         {
 323                 // skip invalid characters
 324                 for (++s; *s >= 0x80 && *s < 0xC0; ++s);
 325                 if (!*s)
 326                         return 0;
 327         }
 328         // for a little speedup:
 329         if ( (*s & 0xE0) == 0xC0 )
 330         {
 331                 // 2-byte character
 332                 u = ( (s[0] & 0x1F) << 6 ) | (s[1] & 0x3F);
 333                 if (_end)
 334                         *_end = _s + 2;
 335                 return u;
 336         }
 337         if ( (*s & 0xF0) == 0xE0 )
 338         {
 339                 // 3-byte character
 340                 u = ( (s[0] & 0x0F) << 12 ) | ( (s[1] & 0x3F) << 6 ) | (s[2] & 0x3F);
 341                 if (_end)
 342                         *_end = _s + 3;
 343                 return u;
 344         }
 345
 346         u = 0;
 347         mask = 0x7F;
 348         v = *s & mask;
 349         for (mask >>= 1; v > (*s & mask); mask >>= 1)
 350                 v = (*s & mask);
 351         u = (Uchar)(*s & mask);
 352         for (++s; *s >= 0x80 && *s < 0xC0; ++s)
 353                 u = (u << 6) | (*s & 0x3F);
 354
 355         if (_end)
 356                 *_end = (const char*)s;
 357
 358         return u;
 359 }
 360
 361 /** Encode a wide-character into utf-8.
 362  * @param w        The wide character to encode.
 363  * @param to       The target buffer the utf-8 encoded string is stored to.
 364  * @param maxlen   The maximum number of bytes that fit into the target buffer.
 365  * @return         Number of bytes written to the buffer not including the terminating null.
 366  *                 Less or equal to 0 if the buffer is too small.
 367  */
 368 int u8_fromchar(Uchar w, char *to, size_t maxlen)
 369 {
 370         size_t i, j;
 371         char bt;
 372         char tmp[16];
 373
 374         if (maxlen < 1)
 375                 return -2;
 376
 377         if (!w)
 378                 return -5;
 379
 380         if (w >= 0xE000 && utf8_disabled.integer)
 381                 w -= 0xE000;
 382
 383         if (w < 0x80 || utf8_disabled.integer)
 384         {
 385                 to[0] = (char)w;
 386                 if (maxlen < 2)
 387                         return -1;
 388                 to[1] = 0;
 389                 return 1;
 390         }
 391         // for a little speedup
 392         if (w < 0x800)
 393         {
 394                 if (maxlen < 3)
 395                 {
 396                         to[0] = 0;
 397                         return -1;
 398                 }
 399                 to[2] = 0;
 400                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 401                 to[0] = 0xC0 | w;
 402                 return 2;
 403         }
 404         if (w < 0x10000)
 405         {
 406                 if (maxlen < 4)
 407                 {
 408                         to[0] = 0;
 409                         return -1;
 410                 }
 411                 to[3] = 0;
 412                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 413                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 414                 to[0] = 0xE0 | w;
 415                 return 3;
 416         }
 417
 418         // "more general" version:
 419
 420         // check how much space we need and store data into a
 421         // temp buffer - this is faster than recalculating again
 422         i = 0;
 423         bt = 0;
 424         while (w)
 425         {
 426                 tmp[i++] = 0x80 | (w & 0x3F);
 427                 bt = (bt >> 1) | 0x80;
 428                 w >>= 6;
 429                 // see if we still fit into the target buffer
 430                 if (i+1 >= maxlen) // +1 for the \0
 431                         return -i;
 432
 433                 // there are no characters which take up that much space yet
 434                 // and there won't be for the next many many years, still... let's be safe
 435                 if (i >= sizeof(tmp))
 436                         return -1;
 437         }
 438         tmp[i-1] |= bt;
 439         for (j = 0; j < i; ++j)
 440         {
 441                 to[i-j-1] = tmp[j];
 442         }
 443
 444         to[i] = 0;
 445         return i;
 446 }
 447
 448 /** uses u8_fromchar on a static buffer
 449  * @param ch        The unicode character to convert to encode
 450  * @param l         The number of bytes without the terminating null.
 451  * @return          A statically allocated buffer containing the character's utf8 representation, or NULL if it fails.
 452  */
 453 char *u8_encodech(Uchar ch, size_t *l)
 454 {
 455         static char buf[16];
 456         size_t len;
 457         len = u8_fromchar(ch, buf, sizeof(buf));
 458         if (len > 0)
 459         {
 460                 if (l) *l = len;
 461                 return buf;
 462         }
 463         return NULL;
 464 }
 465
 466 /** Convert a utf-8 multibyte string to a wide character string.
 467  * @param wcs       The target wide-character buffer.
 468  * @param mb        The utf-8 encoded multibyte string to convert.
 469  * @param maxlen    The maximum number of wide-characters that fit into the target buffer.
 470  * @return          The number of characters written to the target buffer.
 471  */
 472 size_t u8_mbstowcs(Uchar *wcs, const char *mb, size_t maxlen)
 473 {
 474         size_t i;
 475         Uchar ch;
 476         if (maxlen < 1)
 477                 return 0;
 478         for (i = 0; *mb && i < maxlen-1; ++i)
 479         {
 480                 ch = u8_getchar(mb, &mb);
 481                 if (!ch)
 482                         break;
 483                 wcs[i] = ch;
 484         }
 485         wcs[i] = 0;
 486         return i;
 487 }
 488
 489 /** Convert a wide-character string to a utf-8 multibyte string.
 490  * @param mb      The target buffer the utf-8 string is written to.
 491  * @param wcs     The wide-character string to convert.
 492  * @param maxlen  The number bytes that fit into the multibyte target buffer.
 493  * @return        The number of bytes written, not including the terminating \0
 494  */
 495 size_t u8_wcstombs(char *mb, const Uchar *wcs, size_t maxlen)
 496 {
 497         size_t i;
 498         const char *start = mb;
 499         if (maxlen < 2)
 500                 return 0;
 501         for (i = 0; wcs[i] && i < maxlen-1; ++i)
 502         {
 503                 int len;
 504                 if ( (len = u8_fromchar(wcs[i], mb, maxlen - i)) < 0)
 505                         return (mb - start);
 506                 mb += len;
 507         }
 508         *mb = 0;
 509         return (mb - start);
 510 }
 511
 512 /*
 513 ============
 514 UTF-8 aware COM_StringLengthNoColors
 515
 516 calculates the visible width of a color coded string.
 517
 518 *valid is filled with TRUE if the string is a valid colored string (that is, if
 519 it does not end with an unfinished color code). If it gets filled with FALSE, a
 520 fix would be adding a STRING_COLOR_TAG at the end of the string.
 521
 522 valid can be set to NULL if the caller doesn't care.
 523
 524 For size_s, specify the maximum number of characters from s to use, or 0 to use
 525 all characters until the zero terminator.
 526 ============
 527 */
 528 size_t
 529 COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid);
 530 size_t
 531 u8_COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid)
 532 {
 533         const char *end;
 534         size_t len = 0;
 535
 536         if (utf8_disabled.integer)
 537                 return COM_StringLengthNoColors(s, size_s, valid);
 538
 539         end = size_s ? (s + size_s) : NULL;
 540
 541         for(;;)
 542         {
 543                 switch((s == end) ? 0 : *s)
 544                 {
 545                         case 0:
 546                                 if(valid)
 547                                         *valid = TRUE;
 548                                 return len;
 549                         case STRING_COLOR_TAG:
 550                                 ++s;
 551                                 switch((s == end) ? 0 : *s)
 552                                 {
 553                                         case STRING_COLOR_RGB_TAG_CHAR:
 554                                                 if (s+1 != end && isxdigit(s[1]) &&
 555                                                         s+2 != end && isxdigit(s[2]) &&
 556                                                         s+3 != end && isxdigit(s[3]) )
 557                                                 {
 558                                                         s+=3;
 559                                                         break;
 560                                                 }
 561                                                 ++len; // STRING_COLOR_TAG
 562                                                 ++len; // STRING_COLOR_RGB_TAG_CHAR
 563                                                 break;
 564                                         case 0: // ends with unfinished color code!
 565                                                 ++len;
 566                                                 if(valid)
 567                                                         *valid = FALSE;
 568                                                 return len;
 569                                         case STRING_COLOR_TAG: // escaped ^
 570                                                 ++len;
 571                                                 break;
 572                                         case '0': case '1': case '2': case '3': case '4':
 573                                         case '5': case '6': case '7': case '8': case '9': // color code
 574                                                 break;
 575                                         default: // not a color code
 576                                                 ++len; // STRING_COLOR_TAG
 577                                                 ++len; // the character
 578                                                 break;
 579                                 }
 580                                 break;
 581                         default:
 582                                 ++len;
 583                                 break;
 584                 }
 585
 586                 // start of a wide character
 587                 if (*s & 0xC0)
 588                 {
 589                         for (++s; *s >= 0x80 && *s <= 0xC0; ++s);
 590                         continue;
 591                 }
 592                 // part of a wide character, we ignore that one
 593                 if (*s <= 0xBF)
 594                         --len;
 595                 ++s;
 596         }
 597         // never get here
 598 }