5 ================================================================================
6 Initialization of UTF-8 support and new cvars.
7 ================================================================================
9 // TODO: should this default to 1? To enforce compatibility
10 // TODO: If changed to 1, add 'utf8_disabled 0' to defaultNexuiz.cfg
11 cvar_t utf8_disabled = {CVAR_SAVE, "utf8_disabled", "1", "Disable UTF-8 support. For compatibility, this is disabled by default in most games."};
15 Cvar_RegisterVariable(&utf8_disabled);
19 ================================================================================
20 UTF-8 encoding and decoding functions follow.
21 ================================================================================
24 /** Analyze the next character and return various information if requested.
25 * @param _s A UTF-8 string.
26 * @param _start Filled with the start byte-offset of the next valid character
27 * @param _len Filled with the length in bytes of the next valid character
28 * @param _ch Filled with the unicode value of the next character
29 * @return Whether or not another valid character is in the string
31 static qboolean u8_analyze(const char *_s, size_t *_start, size_t *_len, Uchar *_ch)
33 const unsigned char *s = (const unsigned char*)_s;
42 // bytes 0x80..0xC1 can never start a character (continuation bytes, or 0xC0/0xC1 which are always overlong), so skip them; 0xC2 is the first valid lead byte
43 while (s[i] && s[i] >= 0x80 && s[i] < 0xC2) {
44 //fprintf(stderr, "skipping\n");
47 //fprintf(stderr, "checking\n");
49 // If we hit the end, well, we're out and invalid
52 //fprintf(stderr, "checking ascii\n");
57 if (_start) *_start = i;
59 if (_ch) *_ch = (Uchar)s[i];
60 //fprintf(stderr, "valid ascii\n");
63 //fprintf(stderr, "checking length\n");
65 // Figure out the next char's length
68 // count the 1 bits, they're the # of bytes
69 for (bt = 0x40; bt && (bc & bt); bt >>= 1, ++bits);
72 //fprintf(stderr, "superlong\n");
76 // turn bt into a mask and give ch a starting value
79 // check the byte sequence for invalid bytes
80 for (j = 1; j < bits; ++j)
82 // valid bit value: 10xx xxxx
83 //if (s[i+j] < 0x80 || s[i+j] >= 0xC0)
84 if ( (s[i+j] & 0xC0) != 0x80 )
86 //fprintf(stderr, "sequence of %i f'd at %i by %x\n", bits, j, (unsigned int)s[i+j]);
87 // this byte sequence is invalid, skip it
89 // find a character after it
92 // at the same time, decode the character
93 ch = (ch << 6) | (s[i+j] & 0x3F);
96 // Now check the decoded character for an overlong encoding or an out-of-range value
97 if ( (bits >= 2 && ch < 0x80) ||
98 (bits >= 3 && ch < 0x800) ||
99 (bits >= 4 && ch < 0x10000) ||
100 ch > 0x10FFFF // RFC 3629: U+10FFFF is the last valid code point
104 //fprintf(stderr, "overlong: %i bytes for %x\n", bits, ch);
114 //fprintf(stderr, "valid utf8\n");
118 /** Get the number of characters in an UTF-8 string.
119 * @param _s An utf-8 encoded null-terminated string.
120 * @return The number of unicode characters in the string.
122 size_t u8_strlen(const char *_s)
126 const unsigned char *s = (const unsigned char*)_s;
128 if (utf8_disabled.integer)
133 // ascii char, skip u8_analyze
141 // invalid, skip u8_analyze
148 if (!u8_analyze((const char*)s, &st, &ln, NULL))
150 // valid character, skip after it
157 /** Get the number of characters in a part of an UTF-8 string.
158 * @param _s An utf-8 encoded null-terminated string.
159 * @param n The maximum number of bytes.
160 * @return The number of unicode characters in the string.
162 size_t u8_strnlen(const char *_s, size_t n)
166 const unsigned char *s = (const unsigned char*)_s;
168 if (utf8_disabled.integer)
171 return (len < n) ? len : n;
176 // ascii char, skip u8_analyze
185 // invalid, skip u8_analyze
193 if (!u8_analyze((const char*)s, &st, &ln, NULL))
195 // valid character, see if it's still inside the range specified by n:
205 /** Get the number of bytes used in a string to represent an amount of characters.
206 * @param _s An utf-8 encoded null-terminated string.
207 * @param n The number of characters we want to know the byte-size for.
208 * @return The number of bytes used to represent n characters.
210 size_t u8_bytelen(const char *_s, size_t n)
214 const unsigned char *s = (const unsigned char*)_s;
216 if (utf8_disabled.integer)
221 // ascii char, skip u8_analyze
230 // invalid, skip u8_analyze
238 if (!u8_analyze((const char*)s, &st, &ln, NULL))
247 /** Get the byte-index for a character-index.
248 * @param _s An utf-8 encoded string.
249 * @param i The character-index for which you want the byte offset.
250 * @param len If not null, character's length will be stored in there.
251 * @return The byte-index at which the character begins, or -1 if the string is too short.
253 int u8_byteofs(const char *_s, size_t i, size_t *len)
257 const unsigned char *s = (const unsigned char*)_s;
259 if (utf8_disabled.integer)
269 if (!u8_analyze((const char*)s + ofs, &st, &ln, NULL))
278 /** Get the char-index for a byte-index.
279 * @param _s An utf-8 encoded string.
280 * @param i The byte offset for which you want the character index.
281 * @param len If not null, the offset within the character is stored here.
282 * @return The character-index, or -1 if the string is too short.
284 int u8_charidx(const char *_s, size_t i, size_t *len)
290 const unsigned char *s = (const unsigned char*)_s;
292 if (utf8_disabled.integer)
298 while (ofs < i && s[ofs])
300 // ascii character, skip u8_analyze
309 // invalid, skip u8_analyze
316 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL))
318 // see if next char is after the bytemark
328 // see if bytemark is within the char
340 /** Get the byte offset of the previous character.
342 * prevchar_pos = u8_byteofs(text, u8_charidx(text, thischar_pos, NULL) - 1, NULL)
343 * @param _s An utf-8 encoded string.
344 * @param i The current byte offset.
345 * @return The byte offset of the previous character
347 size_t u8_prevbyte(const char *_s, size_t i)
350 const unsigned char *s = (const unsigned char*)_s;
354 if (utf8_disabled.integer)
361 while (ofs < i && s[ofs])
363 // ascii character, skip u8_analyze
370 // invalid, skip u8_analyze
377 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL))
381 if (ofs + st + ln >= i)
390 /** Fetch a character from an utf-8 encoded string.
391 * @param _s The start of an utf-8 encoded multi-byte character.
392 * @param _end Will point to after the first multi-byte character.
393 * @return The 32-bit integer representation of the first multi-byte character or 0 for invalid characters.
395 Uchar u8_getchar(const char *_s, const char **_end)
400 if (utf8_disabled.integer)
404 //return 0xE000 + (Uchar)*(const unsigned char*)_s;
405 return (Uchar)*(const unsigned char*)_s;
408 if (!u8_analyze(_s, &st, &ln, &ch))
411 *_end = _s + st + ln;
415 /** Encode a wide-character into utf-8.
416 * @param w The wide character to encode.
417 * @param to The target buffer the utf-8 encoded string is stored to.
418 * @param maxlen The maximum number of bytes that fit into the target buffer.
419 * @return Number of bytes written to the buffer not including the terminating null.
420 * Less or equal to 0 if the buffer is too small.
422 int u8_fromchar(Uchar w, char *to, size_t maxlen)
430 if (w >= 0xE000 && utf8_disabled.integer)
433 if (w < 0x80 || utf8_disabled.integer)
441 // for a little speedup
450 to[1] = 0x80 | (w & 0x3F); w >>= 6;
462 to[2] = 0x80 | (w & 0x3F); w >>= 6;
463 to[1] = 0x80 | (w & 0x3F); w >>= 6;
477 to[3] = 0x80 | (w & 0x3F); w >>= 6;
478 to[2] = 0x80 | (w & 0x3F); w >>= 6;
479 to[1] = 0x80 | (w & 0x3F); w >>= 6;
486 /** uses u8_fromchar on a static buffer
487 * @param ch The unicode character to encode
488 * @param l The number of bytes without the terminating null.
489 * @return A statically allocated buffer containing the character's utf8 representation, or NULL if it fails.
491 char *u8_encodech(Uchar ch, size_t *l)
495 len = u8_fromchar(ch, buf, sizeof(buf));
504 /** Convert a utf-8 multibyte string to a wide character string.
505 * @param wcs The target wide-character buffer.
506 * @param mb The utf-8 encoded multibyte string to convert.
507 * @param maxlen The maximum number of wide-characters that fit into the target buffer.
508 * @return The number of characters written to the target buffer.
510 size_t u8_mbstowcs(Uchar *wcs, const char *mb, size_t maxlen)
516 for (i = 0; *mb && i < maxlen-1; ++i)
518 ch = u8_getchar(mb, &mb);
527 /** Convert a wide-character string to a utf-8 multibyte string.
528 * @param mb The target buffer the utf-8 string is written to.
529 * @param wcs The wide-character string to convert.
530 * @param maxlen The number bytes that fit into the multibyte target buffer.
531 * @return The number of bytes written, not including the terminating \0
533 size_t u8_wcstombs(char *mb, const Uchar *wcs, size_t maxlen)
536 const char *start = mb;
539 for (i = 0; wcs[i] && (size_t)(mb - start) + 1 < maxlen; ++i) // bound by bytes written so far, not by character count
542 if ( (len = u8_fromchar(wcs[i], mb, maxlen - (size_t)(mb - start))) <= 0) // pass the space actually left; <= 0 means the character did not fit
552 UTF-8 aware COM_StringLengthNoColors
554 calculates the visible width of a color coded string.
556 *valid is filled with TRUE if the string is a valid colored string (that is, if
557 it does not end with an unfinished color code). If it gets filled with FALSE, a
558 fix would be adding a STRING_COLOR_TAG at the end of the string.
560 valid can be set to NULL if the caller doesn't care.
562 For size_s, specify the maximum number of bytes from s to use, or 0 to use
563 all bytes until the zero terminator.
567 COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid);
569 u8_COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid)
574 if (utf8_disabled.integer)
575 return COM_StringLengthNoColors(s, size_s, valid);
577 end = size_s ? (s + size_s) : NULL;
581 switch((s == end) ? 0 : *s)
587 case STRING_COLOR_TAG:
589 switch((s == end) ? 0 : *s)
591 case STRING_COLOR_RGB_TAG_CHAR:
592 if (s+1 != end && isxdigit(s[1]) &&
593 s+2 != end && isxdigit(s[2]) &&
594 s+3 != end && isxdigit(s[3]) )
599 ++len; // STRING_COLOR_TAG
600 ++len; // STRING_COLOR_RGB_TAG_CHAR
602 case 0: // ends with unfinished color code!
607 case STRING_COLOR_TAG: // escaped ^
610 case '0': case '1': case '2': case '3': case '4':
611 case '5': case '6': case '7': case '8': case '9': // color code
613 default: // not a color code
614 ++len; // STRING_COLOR_TAG
615 ++len; // the character
624 // start of a wide character
627 for (++s; *s >= 0x80 && *s <= 0xC0; ++s);
630 // part of a wide character, we ignore that one