pluginxUTF8.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537
  1. /*
  2. * This file uses some implementations of gutf8.c in glib.
  3. *
  4. * gutf8.c - Operations on UTF-8 strings.
  5. *
  6. * Copyright (C) 1999 Tom Tromey
  7. * Copyright (C) 2000 Red Hat, Inc.
  8. *
  9. * This library is free software; you can redistribute it and/or
  10. * modify it under the terms of the GNU Lesser General Public
  11. * License as published by the Free Software Foundation; either
  12. * version 2 of the License, or (at your option) any later version.
  13. *
  14. * This library is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. * Lesser General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU Lesser General Public
  20. * License along with this library; if not, write to the
  21. * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  22. * Boston, MA 02111-1307, USA.
  23. */
  24. #include "pluginxUTF8.h"
  25. namespace pluginx {
  26. #define CCLOGERROR(...) do {} while(0)
  27. #ifndef NULL
  28. #define NULL 0
  29. #endif
  30. int cc_wcslen(const unsigned short* str)
  31. {
  32. int i=0;
  33. while(*str++) i++;
  34. return i;
  35. }
  36. /* Code from GLIB gutf8.c starts here. */
  37. #define UTF8_COMPUTE(Char, Mask, Len) \
  38. if (Char < 128) \
  39. { \
  40. Len = 1; \
  41. Mask = 0x7f; \
  42. } \
  43. else if ((Char & 0xe0) == 0xc0) \
  44. { \
  45. Len = 2; \
  46. Mask = 0x1f; \
  47. } \
  48. else if ((Char & 0xf0) == 0xe0) \
  49. { \
  50. Len = 3; \
  51. Mask = 0x0f; \
  52. } \
  53. else if ((Char & 0xf8) == 0xf0) \
  54. { \
  55. Len = 4; \
  56. Mask = 0x07; \
  57. } \
  58. else if ((Char & 0xfc) == 0xf8) \
  59. { \
  60. Len = 5; \
  61. Mask = 0x03; \
  62. } \
  63. else if ((Char & 0xfe) == 0xfc) \
  64. { \
  65. Len = 6; \
  66. Mask = 0x01; \
  67. } \
  68. else \
  69. Len = -1;
  70. #define UTF8_LENGTH(Char) \
  71. ((Char) < 0x80 ? 1 : \
  72. ((Char) < 0x800 ? 2 : \
  73. ((Char) < 0x10000 ? 3 : \
  74. ((Char) < 0x200000 ? 4 : \
  75. ((Char) < 0x4000000 ? 5 : 6)))))
  76. #define UTF8_GET(Result, Chars, Count, Mask, Len) \
  77. (Result) = (Chars)[0] & (Mask); \
  78. for ((Count) = 1; (Count) < (Len); ++(Count)) \
  79. { \
  80. if (((Chars)[(Count)] & 0xc0) != 0x80) \
  81. { \
  82. (Result) = -1; \
  83. break; \
  84. } \
  85. (Result) <<= 6; \
  86. (Result) |= ((Chars)[(Count)] & 0x3f); \
  87. }
  88. #define UNICODE_VALID(Char) \
  89. ((Char) < 0x110000 && \
  90. (((Char) & 0xFFFFF800) != 0xD800) && \
  91. ((Char) < 0xFDD0 || (Char) > 0xFDEF) && \
  92. ((Char) & 0xFFFE) != 0xFFFE)
  93. static const char utf8_skip_data[256] = {
  94. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  95. 1, 1, 1, 1, 1, 1, 1,
  96. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  97. 1, 1, 1, 1, 1, 1, 1,
  98. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  99. 1, 1, 1, 1, 1, 1, 1,
  100. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  101. 1, 1, 1, 1, 1, 1, 1,
  102. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  103. 1, 1, 1, 1, 1, 1, 1,
  104. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  105. 1, 1, 1, 1, 1, 1, 1,
  106. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  107. 2, 2, 2, 2, 2, 2, 2,
  108. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
  109. 5, 5, 5, 6, 6, 1, 1
  110. };
  111. static const char *const g_utf8_skip = utf8_skip_data;
  112. #define cc_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(unsigned char *)(p)])
  113. /*
  114. * @str: the string to search through.
  115. * @c: the character to find.
  116. *
  117. * Returns the index of the first occurrence of the character, if found. Otherwise -1 is returned.
  118. *
  119. * Return value: the index of the first occurrence of the character if found or -1 otherwise.
  120. * */
  121. static unsigned int cc_utf8_find_char(std::vector<unsigned short> str, unsigned short c)
  122. {
  123. unsigned int len = str.size();
  124. for (unsigned int i = 0; i < len; ++i)
  125. if (str[i] == c) return i;
  126. return -1;
  127. }
  128. /*
  129. * @str: the string to search through.
  130. * @c: the character to not look for.
  131. *
  132. * Return value: the index of the last character that is not c.
  133. * */
  134. unsigned int cc_utf8_find_last_not_char(std::vector<unsigned short> str, unsigned short c)
  135. {
  136. int len = str.size();
  137. int i = len - 1;
  138. for (; i >= 0; --i)
  139. if (str[i] != c) return i;
  140. return i;
  141. }
  142. /*
  143. * @str: the string to trim
  144. * @index: the index to start trimming from.
  145. *
  146. * Trims str st str=[0, index) after the operation.
  147. *
  148. * Return value: the trimmed string.
  149. * */
  150. static void cc_utf8_trim_from(std::vector<unsigned short>* str, int index)
  151. {
  152. int size = str->size();
  153. if (index >= size || index < 0)
  154. return;
  155. str->erase(str->begin() + index, str->begin() + size);
  156. }
  157. /*
  158. * @ch is the unicode character whitespace?
  159. *
  160. * Reference: http://en.wikipedia.org/wiki/Whitespace_character#Unicode
  161. *
  162. * Return value: weather the character is a whitespace character.
  163. * */
  164. bool isspace_unicode(unsigned short ch)
  165. {
  166. return (ch >= 0x0009 && ch <= 0x000D) || ch == 0x0020 || ch == 0x0085 || ch == 0x00A0 || ch == 0x1680
  167. || (ch >= 0x2000 && ch <= 0x200A) || ch == 0x2028 || ch == 0x2029 || ch == 0x202F
  168. || ch == 0x205F || ch == 0x3000;
  169. }
  170. void cc_utf8_trim_ws(std::vector<unsigned short>* str)
  171. {
  172. int len = str->size();
  173. if ( len <= 0 )
  174. return;
  175. int last_index = len - 1;
  176. // Only start trimming if the last character is whitespace..
  177. if (isspace_unicode((*str)[last_index]))
  178. {
  179. for (int i = last_index - 1; i >= 0; --i)
  180. {
  181. if (isspace_unicode((*str)[i]))
  182. last_index = i;
  183. else
  184. break;
  185. }
  186. cc_utf8_trim_from(str, last_index);
  187. }
  188. }
  189. /*
  190. * cc_utf8_strlen:
  191. * @p: pointer to the start of a UTF-8 encoded string.
  192. * @max: the maximum number of bytes to examine. If @max
  193. * is less than 0, then the string is assumed to be
  194. * null-terminated. If @max is 0, @p will not be examined and
  195. * may be %NULL.
  196. *
  197. * Returns the length of the string in characters.
  198. *
  199. * Return value: the length of the string in characters
  200. **/
  201. long
  202. cc_utf8_strlen (const char * p, int max)
  203. {
  204. long len = 0;
  205. const char *start = p;
  206. if (!(p != NULL || max == 0))
  207. {
  208. return 0;
  209. }
  210. if (max < 0)
  211. {
  212. while (*p)
  213. {
  214. p = cc_utf8_next_char (p);
  215. ++len;
  216. }
  217. }
  218. else
  219. {
  220. if (max == 0 || !*p)
  221. return 0;
  222. p = cc_utf8_next_char (p);
  223. while (p - start < max && *p)
  224. {
  225. ++len;
  226. p = cc_utf8_next_char (p);
  227. }
  228. /* only do the last len increment if we got a complete
  229. * char (don't count partial chars)
  230. */
  231. if (p - start == max)
  232. ++len;
  233. }
  234. return len;
  235. }
  236. /*
  237. * g_utf8_get_char:
  238. * @p: a pointer to Unicode character encoded as UTF-8
  239. *
  240. * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
  241. * If @p does not point to a valid UTF-8 encoded character, results are
  242. * undefined. If you are not sure that the bytes are complete
  243. * valid Unicode characters, you should use g_utf8_get_char_validated()
  244. * instead.
  245. *
  246. * Return value: the resulting character
  247. **/
  248. static unsigned int
  249. cc_utf8_get_char (const char * p)
  250. {
  251. int i, mask = 0, len;
  252. unsigned int result;
  253. unsigned char c = (unsigned char) *p;
  254. UTF8_COMPUTE (c, mask, len);
  255. if (len == -1)
  256. return (unsigned int) - 1;
  257. UTF8_GET (result, p, i, mask, len);
  258. return result;
  259. }
  260. unsigned short* cc_utf8_to_utf16(const char* str_old, int length/* = -1 */, int* rUtf16Size/* = NULL */)
  261. {
  262. int len = cc_utf8_strlen(str_old, length);
  263. if (rUtf16Size != NULL) {
  264. *rUtf16Size = len;
  265. }
  266. unsigned short* str_new = new unsigned short[len + 1];
  267. str_new[len] = 0;
  268. for (int i = 0; i < len; ++i)
  269. {
  270. str_new[i] = cc_utf8_get_char(str_old);
  271. str_old = cc_utf8_next_char(str_old);
  272. }
  273. return str_new;
  274. }
  275. std::vector<unsigned short> cc_utf16_vec_from_utf16_str(const unsigned short* str)
  276. {
  277. int len = cc_wcslen(str);
  278. std::vector<unsigned short> str_new;
  279. for (int i = 0; i < len; ++i)
  280. {
  281. str_new.push_back(str[i]);
  282. }
  283. return str_new;
  284. }
  285. /**
  286. * cc_unichar_to_utf8:
  287. * @c: a ISO10646 character code
  288. * @outbuf: output buffer, must have at least 6 bytes of space.
  289. * If %NULL, the length will be computed and returned
  290. * and nothing will be written to @outbuf.
  291. *
  292. * Converts a single character to UTF-8.
  293. *
  294. * Return value: number of bytes written
  295. **/
  296. int
  297. cc_unichar_to_utf8 (unsigned short c,
  298. char *outbuf)
  299. {
  300. unsigned int len = 0;
  301. int first;
  302. int i;
  303. if (c < 0x80)
  304. {
  305. first = 0;
  306. len = 1;
  307. }
  308. else if (c < 0x800)
  309. {
  310. first = 0xc0;
  311. len = 2;
  312. }
  313. else if (c < 0x10000)
  314. {
  315. first = 0xe0;
  316. len = 3;
  317. }
  318. else if (c < 0x200000)
  319. {
  320. first = 0xf0;
  321. len = 4;
  322. }
  323. else if (c < 0x4000000)
  324. {
  325. first = 0xf8;
  326. len = 5;
  327. }
  328. else
  329. {
  330. first = 0xfc;
  331. len = 6;
  332. }
  333. if (outbuf)
  334. {
  335. for (i = len - 1; i > 0; --i)
  336. {
  337. outbuf[i] = (c & 0x3f) | 0x80;
  338. c >>= 6;
  339. }
  340. outbuf[0] = c | first;
  341. }
  342. return len;
  343. }
  344. #define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000)
  345. /**
  346. * cc_utf16_to_utf8:
  347. * @str: a UTF-16 encoded string
  348. * @len: the maximum length of @str to use. If @len < 0, then
  349. * the string is terminated with a 0 character.
  350. * @items_read: location to store number of words read, or %NULL.
  351. * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
  352. * returned in case @str contains a trailing partial
  353. * character. If an error occurs then the index of the
  354. * invalid input is stored here.
  355. * @items_written: location to store number of bytes written, or %NULL.
  356. * The value stored here does not include the trailing
  357. * 0 byte.
  358. * @error: location to store the error occuring, or %NULL to ignore
  359. * errors. Any of the errors in #GConvertError other than
  360. * %G_CONVERT_ERROR_NO_CONVERSION may occur.
  361. *
  362. * Convert a string from UTF-16 to UTF-8. The result will be
  363. * terminated with a 0 byte.
  364. *
  365. * Return value: a pointer to a newly allocated UTF-8 string.
  366. * This value must be freed with free(). If an
  367. * error occurs, %NULL will be returned and
  368. * @error set.
  369. **/
  370. char *
  371. cc_utf16_to_utf8 (const unsigned short *str,
  372. long len,
  373. long *items_read,
  374. long *items_written)
  375. {
  376. /* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ
  377. * are marked.
  378. */
  379. const unsigned short *in;
  380. char *out;
  381. char *result = NULL;
  382. int n_bytes;
  383. unsigned short high_surrogate;
  384. if (str == 0) return NULL;
  385. n_bytes = 0;
  386. in = str;
  387. high_surrogate = 0;
  388. while ((len < 0 || in - str < len) && *in)
  389. {
  390. unsigned short c = *in;
  391. unsigned short wc;
  392. if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
  393. {
  394. if (high_surrogate)
  395. {
  396. wc = SURROGATE_VALUE (high_surrogate, c);
  397. high_surrogate = 0;
  398. }
  399. else
  400. {
  401. CCLOGERROR("Invalid sequence in conversion input");
  402. goto err_out;
  403. }
  404. }
  405. else
  406. {
  407. if (high_surrogate)
  408. {
  409. CCLOGERROR("Invalid sequence in conversion input");
  410. goto err_out;
  411. }
  412. if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
  413. {
  414. high_surrogate = c;
  415. goto next1;
  416. }
  417. else
  418. wc = c;
  419. }
  420. /********** DIFFERENT for UTF8/UCS4 **********/
  421. n_bytes += UTF8_LENGTH (wc);
  422. next1:
  423. in++;
  424. }
  425. if (high_surrogate && !items_read)
  426. {
  427. CCLOGERROR("Partial character sequence at end of input");
  428. goto err_out;
  429. }
  430. /* At this point, everything is valid, and we just need to convert
  431. */
  432. /********** DIFFERENT for UTF8/UCS4 **********/
  433. result = new char[n_bytes + 1];
  434. high_surrogate = 0;
  435. out = result;
  436. in = str;
  437. while (out < result + n_bytes)
  438. {
  439. unsigned short c = *in;
  440. unsigned short wc;
  441. if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
  442. {
  443. wc = SURROGATE_VALUE (high_surrogate, c);
  444. high_surrogate = 0;
  445. }
  446. else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
  447. {
  448. high_surrogate = c;
  449. goto next2;
  450. }
  451. else
  452. wc = c;
  453. /********** DIFFERENT for UTF8/UCS4 **********/
  454. out += cc_unichar_to_utf8 (wc, out);
  455. next2:
  456. in++;
  457. }
  458. /********** DIFFERENT for UTF8/UCS4 **********/
  459. *out = '\0';
  460. if (items_written)
  461. /********** DIFFERENT for UTF8/UCS4 **********/
  462. *items_written = out - result;
  463. err_out:
  464. if (items_read)
  465. *items_read = in - str;
  466. return result;
  467. }
  468. }// namespace pluginx {