utf8lib.c

   1 #include "quakedef.h"
   2 #include "utf8lib.h"
   3
   4 /*
   5 ================================================================================
   6 Initialization of UTF-8 support and new cvars.
   7 ================================================================================
   8 */
   9 // for compatibility this defaults to 0
  10 cvar_t    utf8_enable = {CVAR_SAVE, "utf8_enable", "0", "Enable UTF-8 support. For compatibility, this is disabled by default in most games."};
  11
  12 void   u8_Init(void)
  13 {
  14         Cvar_RegisterVariable(&utf8_enable);
  15 }
  16
  17 /*
  18 ================================================================================
  19 UTF-8 encoding and decoding functions follow.
  20 ================================================================================
  21 */
  22
  23 unsigned char utf8_lengths[256] = { // 0 = invalid
  24         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // ascii characters
  25         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  26         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  27         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  28         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  29         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  30         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  31         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  32         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0xBF are within multibyte sequences
  33         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // they could be interpreted as 2-byte starts but
  34         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // the codepoint would be < 127
  35         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  36         0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 and C1 would also result in overlong encodings
  37         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  38         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  39         4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  40         // with F5 the codepoint is above 0x10FFFF,
  41         // F8-FB would start 5-byte sequences
  42         // FC-FD would start 6-byte sequences
  43         // ...
  44 };
  45 Uchar utf8_range[5] = {
  46         1,       // invalid - let's not allow the creation of 0-bytes :P
  47         1,       // ascii minimum
  48         0x80,    // 2-byte minimum
  49         0x800,   // 3-byte minimum
  50         0x10000, // 4-byte minimum
  51 };
  52
  53 /** Analyze the next character and return various information if requested.
  54  * @param _s      An utf-8 string.
  55  * @param _start  Filled with the start byte-offset of the next valid character
  56  * @param _len    Fileed with the length of the next valid character
  57  * @param _ch     Filled with the unicode value of the next character
  58  * @param _maxlen Maximum number of bytes to read from _s
  59  * @return        Whether or not another valid character is in the string
  60  */
  61 #define U8_ANALYZE_INFINITY 7
  62 static qboolean u8_analyze(const char *_s, size_t *_start, size_t *_len, Uchar *_ch, size_t _maxlen)
  63 {
  64         const unsigned char *s = (const unsigned char*)_s;
  65         size_t i, j;
  66         size_t bits = 0;
  67         Uchar ch;
  68
  69         i = 0;
  70 findchar:
  71         while (i < _maxlen && s[i] && (bits = utf8_lengths[s[i]]) == 0)
  72                 ++i;
  73
  74         if (i >= _maxlen || !s[i]) {
  75                 if (_start) *_start = i;
  76                 if (_len) *_len = 0;
  77                 return false;
  78         }
  79
  80         if (bits == 1) { // ascii
  81                 if (_start) *_start = i;
  82                 if (_len) *_len = 1;
  83                 if (_ch) *_ch = (Uchar)s[i];
  84                 return true;
  85         }
  86
  87         ch = (s[i] & (0xFF >> bits));
  88         for (j = 1; j < bits; ++j)
  89         {
  90                 if ( (s[i+j] & 0xC0) != 0x80 )
  91                 {
  92                         i += j;
  93                         goto findchar;
  94                 }
  95                 ch = (ch << 6) | (s[i+j] & 0x3F);
  96         }
  97         if (ch < utf8_range[bits] || ch >= 0x10FFFF)
  98         {
  99                 i += bits;
 100                 goto findchar;
 101         }
 102 #if 0
 103         // <0xC2 is always an overlong encoding, they're invalid, thus skipped
 104         while (i < _maxlen && s[i] && s[i] >= 0x80 && s[i] < 0xC2) {
 105                 //fprintf(stderr, "skipping\n");
 106                 ++i;
 107         }
 108
 109         // If we hit the end, well, we're out and invalid
 110         if(i >= _maxlen || !s[i]) {
 111                 if (_start) *_start = i;
 112                 if (_len) *_len = 0;
 113                 return false;
 114         }
 115
 116         // I'll leave that in - if you remove it, also change the part below
 117         // to support 1-byte chars correctly
 118         if (s[i] < 0x80)
 119         {
 120                 if (_start) *_start = i;
 121                 if (_len) *_len = 1;
 122                 if (_ch) *_ch = (Uchar)s[i];
 123                 //fprintf(stderr, "valid ascii\n");
 124                 return true;
 125         }
 126
 127         // Figure out the next char's length
 128         bc = s[i];
 129         bits = 1;
 130         // count the 1 bits, they're the # of bytes
 131         for (bt = 0x40; bt && (bc & bt); bt >>= 1, ++bits);
 132         if (!bt)
 133         {
 134                 //fprintf(stderr, "superlong\n");
 135                 ++i;
 136                 goto findchar;
 137         }
 138         if(i + bits > _maxlen) {
 139                 /*
 140                 if (_start) *_start = i;
 141                 if (_len) *_len = 0;
 142                 return false;
 143                 */
 144                 ++i;
 145                 goto findchar;
 146         }
 147         // turn bt into a mask and give ch a starting value
 148         --bt;
 149         ch = (s[i] & bt);
 150         // check the byte sequence for invalid bytes
 151         for (j = 1; j < bits; ++j)
 152         {
 153                 // valid bit value: 10xx xxxx
 154                 //if (s[i+j] < 0x80 || s[i+j] >= 0xC0)
 155                 if ( (s[i+j] & 0xC0) != 0x80 )
 156                 {
 157                         //fprintf(stderr, "sequence of %i f'd at %i by %x\n", bits, j, (unsigned int)s[i+j]);
 158                         // this byte sequence is invalid, skip it
 159                         i += j;
 160                         // find a character after it
 161                         goto findchar;
 162                 }
 163                 // at the same time, decode the character
 164                 ch = (ch << 6) | (s[i+j] & 0x3F);
 165         }
 166
 167         // Now check the decoded byte for an overlong encoding
 168         if ( (bits >= 2 && ch < 0x80) ||
 169              (bits >= 3 && ch < 0x800) ||
 170              (bits >= 4 && ch < 0x10000) ||
 171              ch >= 0x10FFFF // RFC 3629
 172                 )
 173         {
 174                 i += bits;
 175                 //fprintf(stderr, "overlong: %i bytes for %x\n", bits, ch);
 176                 goto findchar;
 177         }
 178 #endif
 179
 180         if (_start)
 181                 *_start = i;
 182         if (_len)
 183                 *_len = bits;
 184         if (_ch)
 185                 *_ch = ch;
 186         //fprintf(stderr, "valid utf8\n");
 187         return true;
 188 }
 189
 190 /** Get the number of characters in an UTF-8 string.
 191  * @param _s    An utf-8 encoded null-terminated string.
 192  * @return      The number of unicode characters in the string.
 193  */
 194 size_t u8_strlen(const char *_s)
 195 {
 196         size_t st, ln;
 197         size_t len = 0;
 198         const unsigned char *s = (const unsigned char*)_s;
 199
 200         if (!utf8_enable.integer)
 201                 return strlen(_s);
 202
 203         while (*s)
 204         {
 205                 // ascii char, skip u8_analyze
 206                 if (*s < 0x80)
 207                 {
 208                         ++len;
 209                         ++s;
 210                         continue;
 211                 }
 212
 213                 // invalid, skip u8_analyze
 214                 if (*s < 0xC2)
 215                 {
 216                         ++s;
 217                         continue;
 218                 }
 219
 220                 if (!u8_analyze((const char*)s, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 221                         break;
 222                 // valid character, skip after it
 223                 s += st + ln;
 224                 ++len;
 225         }
 226         return len;
 227 }
 228
 229 /** Get the number of characters in a part of an UTF-8 string.
 230  * @param _s    An utf-8 encoded null-terminated string.
 231  * @param n     The maximum number of bytes.
 232  * @return      The number of unicode characters in the string.
 233  */
 234 size_t u8_strnlen(const char *_s, size_t n)
 235 {
 236         size_t st, ln;
 237         size_t len = 0;
 238         const unsigned char *s = (const unsigned char*)_s;
 239
 240         if (!utf8_enable.integer)
 241         {
 242                 len = strlen(_s);
 243                 return (len < n) ? len : n;
 244         }
 245
 246         while (*s && n)
 247         {
 248                 // ascii char, skip u8_analyze
 249                 if (*s < 0x80)
 250                 {
 251                         ++len;
 252                         ++s;
 253                         --n;
 254                         continue;
 255                 }
 256
 257                 // invalid, skip u8_analyze
 258                 if (*s < 0xC2)
 259                 {
 260                         ++s;
 261                         --n;
 262                         continue;
 263                 }
 264
 265                 if (!u8_analyze((const char*)s, &st, &ln, NULL, n))
 266                         break;
 267                 // valid character, see if it's still inside the range specified by n:
 268                 if (n < st + ln)
 269                         return len;
 270                 ++len;
 271                 n -= st + ln;
 272                 s += st + ln;
 273         }
 274         return len;
 275 }
 276
 277 /** Get the number of bytes used in a string to represent an amount of characters.
 278  * @param _s    An utf-8 encoded null-terminated string.
 279  * @param n     The number of characters we want to know the byte-size for.
 280  * @return      The number of bytes used to represent n characters.
 281  */
 282 size_t u8_bytelen(const char *_s, size_t n)
 283 {
 284         size_t st, ln;
 285         size_t len = 0;
 286         const unsigned char *s = (const unsigned char*)_s;
 287
 288         if (!utf8_enable.integer) {
 289                 len = strlen(_s);
 290                 return (len < n) ? len : n;
 291         }
 292
 293         while (*s && n)
 294         {
 295                 // ascii char, skip u8_analyze
 296                 if (*s < 0x80)
 297                 {
 298                         ++len;
 299                         ++s;
 300                         --n;
 301                         continue;
 302                 }
 303
 304                 // invalid, skip u8_analyze
 305                 if (*s < 0xC2)
 306                 {
 307                         ++s;
 308                         ++len;
 309                         continue;
 310                 }
 311
 312                 if (!u8_analyze((const char*)s, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 313                         break;
 314                 --n;
 315                 s += st + ln;
 316                 len += st + ln;
 317         }
 318         return len;
 319 }
 320
 321 /** Get the byte-index for a character-index.
 322  * @param _s      An utf-8 encoded string.
 323  * @param i       The character-index for which you want the byte offset.
 324  * @param len     If not null, character's length will be stored in there.
 325  * @return        The byte-index at which the character begins, or -1 if the string is too short.
 326  */
 327 int u8_byteofs(const char *_s, size_t i, size_t *len)
 328 {
 329         size_t st, ln;
 330         size_t ofs = 0;
 331         const unsigned char *s = (const unsigned char*)_s;
 332
 333         if (!utf8_enable.integer)
 334         {
 335                 if (strlen(_s) < i)
 336                 {
 337                         if (len) *len = 0;
 338                         return -1;
 339                 }
 340
 341                 if (len) *len = 1;
 342                 return i;
 343         }
 344
 345         st = ln = 0;
 346         do
 347         {
 348                 ofs += ln;
 349                 if (!u8_analyze((const char*)s + ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 350                         return -1;
 351                 ofs += st;
 352         } while(i-- > 0);
 353         if (len)
 354                 *len = ln;
 355         return ofs;
 356 }
 357
 358 /** Get the char-index for a byte-index.
 359  * @param _s      An utf-8 encoded string.
 360  * @param i       The byte offset for which you want the character index.
 361  * @param len     If not null, the offset within the character is stored here.
 362  * @return        The character-index, or -1 if the string is too short.
 363  */
 364 int u8_charidx(const char *_s, size_t i, size_t *len)
 365 {
 366         size_t st, ln;
 367         size_t ofs = 0;
 368         size_t pofs = 0;
 369         int idx = 0;
 370         const unsigned char *s = (const unsigned char*)_s;
 371
 372         if (!utf8_enable.integer)
 373         {
 374                 if (len) *len = 0;
 375                 return i;
 376         }
 377
 378         while (ofs < i && s[ofs])
 379         {
 380                 // ascii character, skip u8_analyze
 381                 if (s[ofs] < 0x80)
 382                 {
 383                         pofs = ofs;
 384                         ++idx;
 385                         ++ofs;
 386                         continue;
 387                 }
 388
 389                 // invalid, skip u8_analyze
 390                 if (s[ofs] < 0xC2)
 391                 {
 392                         ++ofs;
 393                         continue;
 394                 }
 395
 396                 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 397                         return -1;
 398                 // see if next char is after the bytemark
 399                 if (ofs + st > i)
 400                 {
 401                         if (len)
 402                                 *len = i - pofs;
 403                         return idx;
 404                 }
 405                 ++idx;
 406                 pofs = ofs + st;
 407                 ofs += st + ln;
 408                 // see if bytemark is within the char
 409                 if (ofs > i)
 410                 {
 411                         if (len)
 412                                 *len = i - pofs;
 413                         return idx;
 414                 }
 415         }
 416         if (len) *len = 0;
 417         return idx;
 418 }
 419
 420 /** Get the byte offset of the previous byte.
 421  * The result equals:
 422  * prevchar_pos = u8_byteofs(text, u8_charidx(text, thischar_pos, NULL) - 1, NULL)
 423  * @param _s      An utf-8 encoded string.
 424  * @param i       The current byte offset.
 425  * @return        The byte offset of the previous character
 426  */
 427 size_t u8_prevbyte(const char *_s, size_t i)
 428 {
 429         size_t st, ln;
 430         const unsigned char *s = (const unsigned char*)_s;
 431         size_t lastofs = 0;
 432         size_t ofs = 0;
 433
 434         if (!utf8_enable.integer)
 435         {
 436                 if (i > 0)
 437                         return i-1;
 438                 return 0;
 439         }
 440
 441         while (ofs < i && s[ofs])
 442         {
 443                 // ascii character, skip u8_analyze
 444                 if (s[ofs] < 0x80)
 445                 {
 446                         lastofs = ofs++;
 447                         continue;
 448                 }
 449
 450                 // invalid, skip u8_analyze
 451                 if (s[ofs] < 0xC2)
 452                 {
 453                         ++ofs;
 454                         continue;
 455                 }
 456
 457                 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 458                         return lastofs;
 459                 if (ofs + st > i)
 460                         return lastofs;
 461                 if (ofs + st + ln >= i)
 462                         return ofs + st;
 463
 464                 lastofs = ofs;
 465                 ofs += st + ln;
 466         }
 467         return lastofs;
 468 }
 469
 470 Uchar u8_quake2utf8map[256] = {
 471         0xE000, 0xE001, 0xE002, 0xE003, 0xE004, 0xE005, 0xE006, 0xE007, 0xE008, 0xE009, 0xE00A, 0xE00B, 0xE00C, 0xE00D, 0xE00E, 0xE00F, // specials
 472         0xE010, 0xE011, 0xE012, 0xE013, 0xE014, 0xE015, 0xE016, 0xE017, 0xE018, 0xE019, 0xE01A, 0xE01B, 0xE01C, 0xE01D, 0xE01E, 0xE01F, // specials
 473         0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, // shift+digit line
 474         0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, // digits
 475         0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, // caps
 476         0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, // caps
 477         0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, // small
 478         0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, // small
 479         0xE080, 0xE081, 0xE082, 0xE083, 0xE084, 0xE085, 0xE086, 0xE087, 0xE088, 0xE089, 0xE08A, 0xE08B, 0xE08C, 0xE08D, 0xE08E, 0xE08F, // specials
 480         0xE090, 0xE091, 0xE092, 0xE093, 0xE094, 0xE095, 0xE096, 0xE097, 0xE098, 0xE099, 0xE09A, 0xE09B, 0xE09C, 0xE09D, 0xE09E, 0xE09F, // faces
 481         0xE0A0, 0xE0A1, 0xE0A2, 0xE0A3, 0xE0A4, 0xE0A5, 0xE0A6, 0xE0A7, 0xE0A8, 0xE0A9, 0xE0AA, 0xE0AB, 0xE0AC, 0xE0AD, 0xE0AE, 0xE0AF,
 482         0xE0B0, 0xE0B1, 0xE0B2, 0xE0B3, 0xE0B4, 0xE0B5, 0xE0B6, 0xE0B7, 0xE0B8, 0xE0B9, 0xE0BA, 0xE0BB, 0xE0BC, 0xE0BD, 0xE0BE, 0xE0BF,
 483         0xE0C0, 0xE0C1, 0xE0C2, 0xE0C3, 0xE0C4, 0xE0C5, 0xE0C6, 0xE0C7, 0xE0C8, 0xE0C9, 0xE0CA, 0xE0CB, 0xE0CC, 0xE0CD, 0xE0CE, 0xE0CF,
 484         0xE0D0, 0xE0D1, 0xE0D2, 0xE0D3, 0xE0D4, 0xE0D5, 0xE0D6, 0xE0D7, 0xE0D8, 0xE0D9, 0xE0DA, 0xE0DB, 0xE0DC, 0xE0DD, 0xE0DE, 0xE0DF,
 485         0xE0E0, 0xE0E1, 0xE0E2, 0xE0E3, 0xE0E4, 0xE0E5, 0xE0E6, 0xE0E7, 0xE0E8, 0xE0E9, 0xE0EA, 0xE0EB, 0xE0EC, 0xE0ED, 0xE0EE, 0xE0EF,
 486         0xE0F0, 0xE0F1, 0xE0F2, 0xE0F3, 0xE0F4, 0xE0F5, 0xE0F6, 0xE0F7, 0xE0F8, 0xE0F9, 0xE0FA, 0xE0FB, 0xE0FC, 0xE0FD, 0xE0FE, 0xE0FF,
 487 };
 488
 489 /** Fetch a character from an utf-8 encoded string.
 490  * @param _s      The start of an utf-8 encoded multi-byte character.
 491  * @param _end    Will point to after the first multi-byte character.
 492  * @return        The 32-bit integer representation of the first multi-byte character or 0 for invalid characters.
 493  */
 494 Uchar u8_getchar_utf8_enabled(const char *_s, const char **_end)
 495 {
 496         size_t st, ln;
 497         Uchar ch;
 498
 499         if (!u8_analyze(_s, &st, &ln, &ch, U8_ANALYZE_INFINITY))
 500                 ch = 0;
 501         if (_end)
 502                 *_end = _s + st + ln;
 503         return ch;
 504 }
 505
 506 /** Fetch a character from an utf-8 encoded string.
 507  * @param _s      The start of an utf-8 encoded multi-byte character.
 508  * @param _end    Will point to after the first multi-byte character.
 509  * @return        The 32-bit integer representation of the first multi-byte character or 0 for invalid characters.
 510  */
 511 Uchar u8_getnchar_utf8_enabled(const char *_s, const char **_end, size_t _maxlen)
 512 {
 513         size_t st, ln;
 514         Uchar ch;
 515
 516         if (!u8_analyze(_s, &st, &ln, &ch, _maxlen))
 517                 ch = 0;
 518         if (_end)
 519                 *_end = _s + st + ln;
 520         return ch;
 521 }
 522
 523 /** Encode a wide-character into utf-8.
 524  * @param w        The wide character to encode.
 525  * @param to       The target buffer the utf-8 encoded string is stored to.
 526  * @param maxlen   The maximum number of bytes that fit into the target buffer.
 527  * @return         Number of bytes written to the buffer not including the terminating null.
 528  *                 Less or equal to 0 if the buffer is too small.
 529  */
 530 int u8_fromchar(Uchar w, char *to, size_t maxlen)
 531 {
 532         if (maxlen < 1)
 533                 return 0;
 534
 535         if (!w)
 536                 return 0;
 537
 538         if (w >= 0xE000 && !utf8_enable.integer)
 539                 w -= 0xE000;
 540
 541         if (w < 0x80 || !utf8_enable.integer)
 542         {
 543                 to[0] = (char)w;
 544                 if (maxlen < 2)
 545                         return -1;
 546                 to[1] = 0;
 547                 return 1;
 548         }
 549         // for a little speedup
 550         if (w < 0x800)
 551         {
 552                 if (maxlen < 3)
 553                 {
 554                         to[0] = 0;
 555                         return -1;
 556                 }
 557                 to[2] = 0;
 558                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 559                 to[0] = 0xC0 | w;
 560                 return 2;
 561         }
 562         if (w < 0x10000)
 563         {
 564                 if (maxlen < 4)
 565                 {
 566                         to[0] = 0;
 567                         return -1;
 568                 }
 569                 to[3] = 0;
 570                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 571                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 572                 to[0] = 0xE0 | w;
 573                 return 3;
 574         }
 575
 576         // RFC 3629
 577         if (w <= 0x10FFFF)
 578         {
 579                 if (maxlen < 5)
 580                 {
 581                         to[0] = 0;
 582                         return -1;
 583                 }
 584                 to[4] = 0;
 585                 to[3] = 0x80 | (w & 0x3F); w >>= 6;
 586                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 587                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 588                 to[0] = 0xF0 | w;
 589                 return 4;
 590         }
 591         return 0;
 592 }
 593
 594 /** uses u8_fromchar on a static buffer
 595  * @param ch        The unicode character to convert to encode
 596  * @param l         The number of bytes without the terminating null.
 597  * @return          A statically allocated buffer containing the character's utf8 representation, or NULL if it fails.
 598  */
 599 char *u8_encodech(Uchar ch, size_t *l)
 600 {
 601         static char buf[16];
 602         size_t len;
 603         len = u8_fromchar(ch, buf, sizeof(buf));
 604         if (len > 0)
 605         {
 606                 if (l) *l = len;
 607                 return buf;
 608         }
 609         return NULL;
 610 }
 611
 612 /** Convert a utf-8 multibyte string to a wide character string.
 613  * @param wcs       The target wide-character buffer.
 614  * @param mb        The utf-8 encoded multibyte string to convert.
 615  * @param maxlen    The maximum number of wide-characters that fit into the target buffer.
 616  * @return          The number of characters written to the target buffer.
 617  */
 618 size_t u8_mbstowcs(Uchar *wcs, const char *mb, size_t maxlen)
 619 {
 620         size_t i;
 621         Uchar ch;
 622         if (maxlen < 1)
 623                 return 0;
 624         for (i = 0; *mb && i < maxlen-1; ++i)
 625         {
 626                 ch = u8_getchar(mb, &mb);
 627                 if (!ch)
 628                         break;
 629                 wcs[i] = ch;
 630         }
 631         wcs[i] = 0;
 632         return i;
 633 }
 634
 635 /** Convert a wide-character string to a utf-8 multibyte string.
 636  * @param mb      The target buffer the utf-8 string is written to.
 637  * @param wcs     The wide-character string to convert.
 638  * @param maxlen  The number bytes that fit into the multibyte target buffer.
 639  * @return        The number of bytes written, not including the terminating \0
 640  */
 641 size_t u8_wcstombs(char *mb, const Uchar *wcs, size_t maxlen)
 642 {
 643         size_t i;
 644         const char *start = mb;
 645         if (maxlen < 2)
 646                 return 0;
 647         for (i = 0; wcs[i] && i < maxlen-1; ++i)
 648         {
 649                 /*
 650                 int len;
 651                 if ( (len = u8_fromchar(wcs[i], mb, maxlen - i)) < 0)
 652                         return (mb - start);
 653                 mb += len;
 654                 */
 655                 mb += u8_fromchar(wcs[i], mb, maxlen - i);
 656         }
 657         *mb = 0;
 658         return (mb - start);
 659 }
 660
 661 /*
 662 ============
 663 UTF-8 aware COM_StringLengthNoColors
 664
 665 calculates the visible width of a color coded string.
 666
 667 *valid is filled with TRUE if the string is a valid colored string (that is, if
 668 it does not end with an unfinished color code). If it gets filled with FALSE, a
 669 fix would be adding a STRING_COLOR_TAG at the end of the string.
 670
 671 valid can be set to NULL if the caller doesn't care.
 672
 673 For size_s, specify the maximum number of characters from s to use, or 0 to use
 674 all characters until the zero terminator.
 675 ============
 676 */
 677 size_t
 678 COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid);
 679 size_t
 680 u8_COM_StringLengthNoColors(const char *_s, size_t size_s, qboolean *valid)
 681 {
 682         const unsigned char *s = (const unsigned char*)_s;
 683         const unsigned char *end;
 684         size_t len = 0;
 685         size_t st, ln;
 686
 687         if (!utf8_enable.integer)
 688                 return COM_StringLengthNoColors(_s, size_s, valid);
 689
 690         end = size_s ? (s + size_s) : NULL;
 691
 692         for(;;)
 693         {
 694                 switch((s == end) ? 0 : *s)
 695                 {
 696                         case 0:
 697                                 if(valid)
 698                                         *valid = TRUE;
 699                                 return len;
 700                         case STRING_COLOR_TAG:
 701                                 ++s;
 702                                 switch((s == end) ? 0 : *s)
 703                                 {
 704                                         case STRING_COLOR_RGB_TAG_CHAR:
 705                                                 if (s+1 != end && isxdigit(s[1]) &&
 706                                                         s+2 != end && isxdigit(s[2]) &&
 707                                                         s+3 != end && isxdigit(s[3]) )
 708                                                 {
 709                                                         s+=3;
 710                                                         break;
 711                                                 }
 712                                                 ++len; // STRING_COLOR_TAG
 713                                                 ++len; // STRING_COLOR_RGB_TAG_CHAR
 714                                                 break;
 715                                         case 0: // ends with unfinished color code!
 716                                                 ++len;
 717                                                 if(valid)
 718                                                         *valid = FALSE;
 719                                                 return len;
 720                                         case STRING_COLOR_TAG: // escaped ^
 721                                                 ++len;
 722                                                 break;
 723                                         case '0': case '1': case '2': case '3': case '4':
 724                                         case '5': case '6': case '7': case '8': case '9': // color code
 725                                                 break;
 726                                         default: // not a color code
 727                                                 ++len; // STRING_COLOR_TAG
 728                                                 ++len; // the character
 729                                                 break;
 730                                 }
 731                                 continue;
 732                         default:
 733                                 break;
 734                 }
 735
 736                 // ascii char, skip u8_analyze
 737                 if (*s < 0x80)
 738                 {
 739                         ++len;
 740                         ++s;
 741                         continue;
 742                 }
 743
 744                 // invalid, skip u8_analyze
 745                 if (*s < 0xC2)
 746                 {
 747                         ++s;
 748                         continue;
 749                 }
 750
 751                 if (!u8_analyze((const char*)s, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 752                 {
 753                         // we CAN end up here, if an invalid char is between this one and the end of the string
 754                         if(valid)
 755                                 *valid = TRUE;
 756                         return len;
 757                 }
 758
 759                 if(end && s + st + ln > end)
 760                 {
 761                         // string length exceeded by new character
 762                         if(valid)
 763                                 *valid = TRUE;
 764                         return len;
 765                 }
 766
 767                 // valid character, skip after it
 768                 s += st + ln;
 769                 ++len;
 770         }
 771         // never get here
 772 }
 773
 774 /** Pads a utf-8 string
 775  * @param out     The target buffer the utf-8 string is written to.
 776  * @param outsize The size of the target buffer, including the final NUL
 777  * @param in      The input utf-8 buffer
 778  * @param leftalign Left align the output string (by default right alignment is done)
 779  * @param minwidth The minimum output width
 780  * @param maxwidth The maximum output width
 781  * @return        The number of bytes written, not including the terminating \0
 782  */
 783 size_t u8_strpad(char *out, size_t outsize, const char *in, qboolean leftalign, size_t minwidth, size_t maxwidth)
 784 {
 785         if(!utf8_enable.integer)
 786         {
 787                 return dpsnprintf(out, outsize, "%*.*s", leftalign ? -(int) minwidth : (int) minwidth, (int) maxwidth, in);
 788         }
 789         else
 790         {
 791                 size_t l = u8_bytelen(in, maxwidth);
 792                 size_t actual_width = u8_strnlen(in, l);
 793                 int pad = (actual_width >= minwidth) ? 0 : (minwidth - actual_width);
 794                 int prec = l;
 795                 int lpad = leftalign ? 0 : pad;
 796                 int rpad = leftalign ? pad : 0;
 797                 return dpsnprintf(out, outsize, "%*s%.*s%*s", lpad, "", prec, in, rpad, "");
 798         }
 799 }