5 ================================================================================
6 Initialization of UTF-8 support and new cvars.
7 ================================================================================
9 // TODO: should this default to 1, to enforce compatibility?
10 // TODO: If changed to 1, add 'utf8_disabled 0' to defaultNexuiz.cfg
11 cvar_t utf8_disabled = {CVAR_SAVE, "utf8_disabled", "0", "Disable UTF-8 support."};
15 Cvar_RegisterVariable(&utf8_disabled);
19 ================================================================================
20 UTF-8 encoding and decoding functions follow.
21 ================================================================================
24 /** Validate that this string starts with a valid UTF-8 character.
25 * @param _s A UTF-8 encoded null-terminated string.
28 static inline qboolean u8_validate(const char *_s)
30 const unsigned char *s = (const unsigned char*)_s;
31 if (*s < 0x80) // ascii
33 if (*s < 0xC0) // in-between
35 if (*s < 0xC2) // overlong encoding, not allowed
37 if (*s < 0xF5) // valid start of a sequence
39 // anything else is restricted since RFC 3629, November 2003
43 /** Get the number of characters in a UTF-8 string.
44 * @param _s A UTF-8 encoded null-terminated string.
45 * @return The number of unicode characters in the string.
47 size_t u8_strlen(const char *_s)
50 const unsigned char *s = (const unsigned char*)_s;
52 if (utf8_disabled.integer)
65 // part of a wide character, we ignore that one
66 if (*s < 0xC0) // 10111111
72 // start of a wide character
73 if (u8_validate((const char*)s))
75 for (++s; *s >= 0x80 && *s <= 0xC0; ++s);
80 /** Get the number of bytes used in a string to represent an amount of characters.
81 * @param _s A UTF-8 encoded null-terminated string.
82 * @param n The number of characters we want to know the byte-size for.
83 * @return The number of bytes used to represent n characters.
85 size_t u8_bytelen(const char *_s, size_t n)
88 const unsigned char *s = (const unsigned char*)_s;
90 if (utf8_disabled.integer)
104 // part of a wide character, this time we cannot ignore it
105 if (*s < 0xC0) // 10111111
112 // start of a wide character
113 for (++len, ++s; *s >= 0x80 && *s < 0xC0; ++s, ++len);
119 /** Get the byte-index for a character-index.
120 * @param _s A UTF-8 encoded string.
121 * @param i The character-index for which you want the byte offset.
122 * @param len If not null, character's length will be stored in there.
123 * @return The byte-index at which the character begins, or -1 if the string is too short.
125 int u8_byteofs(const char *_s, size_t i, size_t *len)
128 const unsigned char *s = (const unsigned char*)_s;
130 if (utf8_disabled.integer)
136 while (i > 0 && s[ofs])
146 // part of a wide character, we ignore that one
153 // start of a wide character
154 if (u8_validate((const char*)s))
156 for (++ofs; s[ofs] >= 0x80 && s[ofs] <= 0xC0; ++ofs);
163 else if (s[ofs] & 0xC0)
166 for (i = 1; s[ofs+i] >= 0x80 && s[ofs+i] <= 0xC0; ++i);
169 else if (s[ofs] < 0xC0)
175 /** Get the char-index for a byte-index.
176 * @param _s A UTF-8 encoded string.
177 * @param i The byte offset for which you want the character index.
178 * @param len If not null, the offset within the character is stored here.
179 * @return The character-index, or -1 if the string is too short.
181 int u8_charidx(const char *_s, size_t i, size_t *len)
186 const unsigned char *s = (const unsigned char*)_s;
188 if (utf8_disabled.integer)
194 while (ofs < i && s[ofs])
204 // part of a wide character, we ignore that one
211 // start of a wide character
213 if (!u8_validate((const char*)s+ofs))
219 for (++ofs; s[ofs] >= 0x80 && s[ofs] < 0xC0 && ofs < i; ++ofs);
220 if (s[ofs] >= 0x80 && s[ofs] < 0xC0)
222 // it ends within this character
234 /** Get the byte offset of the previous character.
236 * prevchar_pos = u8_byteofs(text, u8_charidx(text, thischar_pos, NULL) - 1, NULL)
237 * @param _s A UTF-8 encoded string.
238 * @param i The current byte offset.
239 * @return The byte offset of the previous character
241 size_t u8_prevbyte(const char *_s, size_t i)
243 const unsigned char *s = (const unsigned char*)_s;
247 if (utf8_disabled.integer)
254 while (ofs < i && s[ofs])
263 // part of a wide character, we ignore that one
270 // start of a wide character
271 if (!u8_validate((const char*)s+ofs))
278 for (++ofs; s[ofs] >= 0x80 && s[ofs] < 0xC0 && ofs < i; ++ofs);
279 if (s[ofs] >= 0x80 && s[ofs] < 0xC0)
281 // it ends within this character
288 /** Fetch a character from a UTF-8 encoded string.
289 * @param _s The start of a UTF-8 encoded multi-byte character.
290 * @param _end Will point to after the first multi-byte character.
291 * @return The 32-bit integer representation of the first multi-byte character or 0 for invalid characters.
293 Uchar u8_getchar(const char *_s, const char **_end)
295 const unsigned char *s = (const unsigned char*)_s;
300 if (utf8_disabled.integer)
304 return 0xE000 + (Uchar)*s;
316 // starting within a wide character - skip it and retrieve the one after it
317 for (++s; *s >= 0x80 && *s < 0xC0; ++s);
318 // or we could return '?' here?
321 while (!u8_validate((const char*)s))
323 // skip invalid characters
324 for (++s; *s >= 0x80 && *s < 0xC0; ++s);
328 // for a little speedup:
329 if ( (*s & 0xE0) == 0xC0 )
332 u = ( (s[0] & 0x1F) << 6 ) | (s[1] & 0x3F);
337 if ( (*s & 0xF0) == 0xE0 )
340 u = ( (s[0] & 0x0F) << 12 ) | ( (s[1] & 0x3F) << 6 ) | (s[2] & 0x3F);
349 for (mask >>= 1; v > (*s & mask); mask >>= 1)
351 u = (Uchar)(*s & mask);
352 for (++s; *s >= 0x80 && *s < 0xC0; ++s)
353 u = (u << 6) | (*s & 0x3F);
356 *_end = (const char*)s;
361 /** Encode a wide-character into utf-8.
362 * @param w The wide character to encode.
363 * @param to The target buffer the utf-8 encoded string is stored to.
364 * @param maxlen The maximum number of bytes that fit into the target buffer.
365 * @return Number of bytes written to the buffer not including the terminating null.
366 * Less than or equal to 0 if the buffer is too small.
368 int u8_fromchar(Uchar w, char *to, size_t maxlen)
380 if (w >= 0xE000 && utf8_disabled.integer)
383 if (w < 0x80 || utf8_disabled.integer)
391 // for a little speedup
400 to[1] = 0x80 | (w & 0x3F); w >>= 6;
412 to[2] = 0x80 | (w & 0x3F); w >>= 6;
413 to[1] = 0x80 | (w & 0x3F); w >>= 6;
418 // "more general" version:
420 // check how much space we need and store data into a
421 // temp buffer - this is faster than recalculating again
426 tmp[i++] = 0x80 | (w & 0x3F);
427 bt = (bt >> 1) | 0x80;
429 // see if we still fit into the target buffer
430 if (i+1 >= maxlen) // +1 for the \0
433 // there are no characters which take up that much space yet
434 // and there won't be for the next many many years, still... let's be safe
435 if (i >= sizeof(tmp))
439 for (j = 0; j < i; ++j)
448 /** uses u8_fromchar on a static buffer
449 * @param ch The unicode character to encode.
450 * @param l The number of bytes without the terminating null.
451 * @return A statically allocated buffer containing the character's utf8 representation, or NULL if it fails.
453 char *u8_encodech(Uchar ch, size_t *l)
457 len = u8_fromchar(ch, buf, sizeof(buf));
466 /** Convert a utf-8 multibyte string to a wide character string.
467 * @param wcs The target wide-character buffer.
468 * @param mb The utf-8 encoded multibyte string to convert.
469 * @param maxlen The maximum number of wide-characters that fit into the target buffer.
470 * @return The number of characters written to the target buffer.
472 size_t u8_mbstowcs(Uchar *wcs, const char *mb, size_t maxlen)
478 for (i = 0; *mb && i < maxlen-1; ++i)
480 ch = u8_getchar(mb, &mb);
489 /** Convert a wide-character string to a utf-8 multibyte string.
490 * @param mb The target buffer the utf-8 string is written to.
491 * @param wcs The wide-character string to convert.
492 * @param maxlen The number of bytes that fit into the multibyte target buffer.
493 * @return The number of bytes written, not including the terminating \0
495 size_t u8_wcstombs(char *mb, const Uchar *wcs, size_t maxlen)
498 const char *start = mb;
501 for (i = 0; wcs[i] && i < maxlen-1; ++i)
504 if ( (len = u8_fromchar(wcs[i], mb, maxlen - i)) < 0)
514 UTF-8 aware COM_StringLengthNoColors
516 calculates the visible width of a color coded string.
518 *valid is filled with TRUE if the string is a valid colored string (that is, if
519 it does not end with an unfinished color code). If it gets filled with FALSE, a
520 fix would be adding a STRING_COLOR_TAG at the end of the string.
522 valid can be set to NULL if the caller doesn't care.
524 For size_s, specify the maximum number of characters from s to use, or 0 to use
525 all characters until the zero terminator.
529 COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid);
531 u8_COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid)
536 if (utf8_disabled.integer)
537 return COM_StringLengthNoColors(s, size_s, valid);
539 end = size_s ? (s + size_s) : NULL;
543 switch((s == end) ? 0 : *s)
549 case STRING_COLOR_TAG:
551 switch((s == end) ? 0 : *s)
553 case STRING_COLOR_RGB_TAG_CHAR:
554 if (s+1 != end && isxdigit(s[1]) &&
555 s+2 != end && isxdigit(s[2]) &&
556 s+3 != end && isxdigit(s[3]) )
561 ++len; // STRING_COLOR_TAG
562 ++len; // STRING_COLOR_RGB_TAG_CHAR
564 case 0: // ends with unfinished color code!
569 case STRING_COLOR_TAG: // escaped ^
572 case '0': case '1': case '2': case '3': case '4':
573 case '5': case '6': case '7': case '8': case '9': // color code
575 default: // not a color code
576 ++len; // STRING_COLOR_TAG
577 ++len; // the character
586 // start of a wide character
589 for (++s; *s >= 0x80 && *s <= 0xC0; ++s);
592 // part of a wide character, we ignore that one