ConvertUTF.c 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589
  1. /*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
  2. *
  3. * The LLVM Compiler Infrastructure
  4. *
  5. * This file is distributed under the University of Illinois Open Source
  6. * License. See LICENSE.TXT for details.
  7. *
  8. *===------------------------------------------------------------------------=*/
  9. /*
  10. * Copyright 2001-2004 Unicode, Inc.
  11. *
  12. * Disclaimer
  13. *
  14. * This source code is provided as is by Unicode, Inc. No claims are
  15. * made as to fitness for any particular purpose. No warranties of any
  16. * kind are expressed or implied. The recipient agrees to determine
  17. * applicability of information provided. If this file has been
  18. * purchased on magnetic or optical media from Unicode, Inc., the
  19. * sole remedy for any claim will be exchange of defective media
  20. * within 90 days of receipt.
  21. *
  22. * Limitations on Rights to Redistribute This Code
  23. *
  24. * Unicode, Inc. hereby grants the right to freely use the information
  25. * supplied in this file in the creation of products supporting the
  26. * Unicode Standard, and to make copies of this file in any form
  27. * for internal or external distribution as long as this notice
  28. * remains attached.
  29. */
  30. /* ---------------------------------------------------------------------
  31. Conversions between UTF32, UTF-16, and UTF-8. Source code file.
  32. Author: Mark E. Davis, 1994.
  33. Rev History: Rick McGowan, fixes & updates May 2001.
  34. Sept 2001: fixed const & error conditions per
  35. mods suggested by S. Parent & A. Lillich.
  36. June 2002: Tim Dodd added detection and handling of incomplete
  37. source sequences, enhanced error detection, added casts
  38. to eliminate compiler warnings.
  39. July 2003: slight mods to back out aggressive FFFE detection.
  40. Jan 2004: updated switches in from-UTF8 conversions.
  41. Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
  42. See the header file "ConvertUTF.h" for complete documentation.
  43. ------------------------------------------------------------------------ */
  44. #include "ConvertUTF.h"
  45. #ifdef CVTUTF_DEBUG
  46. #include <stdio.h>
  47. #endif
  48. #include <string.h>
  49. static const int halfShift = 10; /* used for shifting by 10 bits */
  50. static const UTF32 halfBase = 0x0010000UL;
  51. static const UTF32 halfMask = 0x3FFUL;
  52. #define UNI_SUR_HIGH_START (UTF32)0xD800
  53. #define UNI_SUR_HIGH_END (UTF32)0xDBFF
  54. #define UNI_SUR_LOW_START (UTF32)0xDC00
  55. #define UNI_SUR_LOW_END (UTF32)0xDFFF
  56. #define false 0
  57. #define true 1
  58. /* --------------------------------------------------------------------- */
  59. /*
  60. * Index into the table below with the first byte of a UTF-8 sequence to
  61. * get the number of trailing bytes that are supposed to follow it.
  62. * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
  63. * left as-is for anyone who may want to do such conversion, which was
  64. * allowed in earlier algorithms.
  65. */
  66. static const char trailingBytesForUTF8[256] = {
  67. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  68. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  69. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  70. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  71. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  72. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  73. 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  74. 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
  75. };
  76. /*
  77. * Magic values subtracted from a buffer value during UTF8 conversion.
  78. * This table contains as many values as there might be trailing bytes
  79. * in a UTF-8 sequence.
  80. */
  81. static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
  82. 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
  83. /*
  84. * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
  85. * into the first byte, depending on how many bytes follow. There are
  86. * as many entries in this table as there are UTF-8 sequence types.
  87. * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
  88. * for *legal* UTF-8 will be 4 or fewer bytes total.
  89. */
  90. static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  91. /* --------------------------------------------------------------------- */
  92. /* The interface converts a whole buffer to avoid function-call overhead.
  93. * Constants have been gathered. Loops & conditionals have been removed as
  94. * much as possible for efficiency, in favor of drop-through switches.
  95. * (See "Note A" at the bottom of the file for equivalent code.)
  96. * If your compiler supports it, the "isLegalUTF8" call can be turned
  97. * into an inline function.
  98. */
  99. /* --------------------------------------------------------------------- */
  100. ConversionResult ConvertUTF32toUTF16 (
  101. const UTF32** sourceStart, const UTF32* sourceEnd,
  102. UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
  103. ConversionResult result = conversionOK;
  104. const UTF32* source = *sourceStart;
  105. UTF16* target = *targetStart;
  106. while (source < sourceEnd) {
  107. UTF32 ch;
  108. if (target >= targetEnd) {
  109. result = targetExhausted; break;
  110. }
  111. ch = *source++;
  112. if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
  113. /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
  114. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
  115. if (flags == strictConversion) {
  116. --source; /* return to the illegal value itself */
  117. result = sourceIllegal;
  118. break;
  119. } else {
  120. *target++ = UNI_REPLACEMENT_CHAR;
  121. }
  122. } else {
  123. *target++ = (UTF16)ch; /* normal case */
  124. }
  125. } else if (ch > UNI_MAX_LEGAL_UTF32) {
  126. if (flags == strictConversion) {
  127. result = sourceIllegal;
  128. } else {
  129. *target++ = UNI_REPLACEMENT_CHAR;
  130. }
  131. } else {
  132. /* target is a character in range 0xFFFF - 0x10FFFF. */
  133. if (target + 1 >= targetEnd) {
  134. --source; /* Back up source pointer! */
  135. result = targetExhausted; break;
  136. }
  137. ch -= halfBase;
  138. *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
  139. *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
  140. }
  141. }
  142. *sourceStart = source;
  143. *targetStart = target;
  144. return result;
  145. }
  146. /* --------------------------------------------------------------------- */
  147. ConversionResult ConvertUTF16toUTF32 (
  148. const UTF16** sourceStart, const UTF16* sourceEnd,
  149. UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
  150. ConversionResult result = conversionOK;
  151. const UTF16* source = *sourceStart;
  152. UTF32* target = *targetStart;
  153. UTF32 ch, ch2;
  154. while (source < sourceEnd) {
  155. const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
  156. ch = *source++;
  157. /* If we have a surrogate pair, convert to UTF32 first. */
  158. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
  159. /* If the 16 bits following the high surrogate are in the source buffer... */
  160. if (source < sourceEnd) {
  161. ch2 = *source;
  162. /* If it's a low surrogate, convert to UTF32. */
  163. if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
  164. ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
  165. + (ch2 - UNI_SUR_LOW_START) + halfBase;
  166. ++source;
  167. } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
  168. --source; /* return to the illegal value itself */
  169. result = sourceIllegal;
  170. break;
  171. }
  172. } else { /* We don't have the 16 bits following the high surrogate. */
  173. --source; /* return to the high surrogate */
  174. result = sourceExhausted;
  175. break;
  176. }
  177. } else if (flags == strictConversion) {
  178. /* UTF-16 surrogate values are illegal in UTF-32 */
  179. if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
  180. --source; /* return to the illegal value itself */
  181. result = sourceIllegal;
  182. break;
  183. }
  184. }
  185. if (target >= targetEnd) {
  186. source = oldSource; /* Back up source pointer! */
  187. result = targetExhausted; break;
  188. }
  189. *target++ = ch;
  190. }
  191. *sourceStart = source;
  192. *targetStart = target;
  193. #ifdef CVTUTF_DEBUG
  194. if (result == sourceIllegal) {
  195. fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
  196. fflush(stderr);
  197. }
  198. #endif
  199. return result;
  200. }
  201. ConversionResult ConvertUTF16toUTF8 (
  202. const UTF16** sourceStart, const UTF16* sourceEnd,
  203. UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
  204. ConversionResult result = conversionOK;
  205. const UTF16* source = *sourceStart;
  206. UTF8* target = *targetStart;
  207. while (source < sourceEnd) {
  208. UTF32 ch;
  209. unsigned short bytesToWrite = 0;
  210. const UTF32 byteMask = 0xBF;
  211. const UTF32 byteMark = 0x80;
  212. const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
  213. ch = *source++;
  214. /* If we have a surrogate pair, convert to UTF32 first. */
  215. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
  216. /* If the 16 bits following the high surrogate are in the source buffer... */
  217. if (source < sourceEnd) {
  218. UTF32 ch2 = *source;
  219. /* If it's a low surrogate, convert to UTF32. */
  220. if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
  221. ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
  222. + (ch2 - UNI_SUR_LOW_START) + halfBase;
  223. ++source;
  224. } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
  225. --source; /* return to the illegal value itself */
  226. result = sourceIllegal;
  227. break;
  228. }
  229. } else { /* We don't have the 16 bits following the high surrogate. */
  230. --source; /* return to the high surrogate */
  231. result = sourceExhausted;
  232. break;
  233. }
  234. } else if (flags == strictConversion) {
  235. /* UTF-16 surrogate values are illegal in UTF-32 */
  236. if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
  237. --source; /* return to the illegal value itself */
  238. result = sourceIllegal;
  239. break;
  240. }
  241. }
  242. /* Figure out how many bytes the result will require */
  243. if (ch < (UTF32)0x80) { bytesToWrite = 1;
  244. } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
  245. } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
  246. } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
  247. } else { bytesToWrite = 3;
  248. ch = UNI_REPLACEMENT_CHAR;
  249. }
  250. target += bytesToWrite;
  251. if (target > targetEnd) {
  252. source = oldSource; /* Back up source pointer! */
  253. target -= bytesToWrite; result = targetExhausted; break;
  254. }
  255. switch (bytesToWrite) { /* note: everything falls through. */
  256. case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  257. case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  258. case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  259. case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
  260. }
  261. target += bytesToWrite;
  262. }
  263. *sourceStart = source;
  264. *targetStart = target;
  265. return result;
  266. }
  267. /* --------------------------------------------------------------------- */
  268. ConversionResult ConvertUTF32toUTF8 (
  269. const UTF32** sourceStart, const UTF32* sourceEnd,
  270. UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
  271. ConversionResult result = conversionOK;
  272. const UTF32* source = *sourceStart;
  273. UTF8* target = *targetStart;
  274. while (source < sourceEnd) {
  275. UTF32 ch;
  276. unsigned short bytesToWrite = 0;
  277. const UTF32 byteMask = 0xBF;
  278. const UTF32 byteMark = 0x80;
  279. ch = *source++;
  280. if (flags == strictConversion ) {
  281. /* UTF-16 surrogate values are illegal in UTF-32 */
  282. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
  283. --source; /* return to the illegal value itself */
  284. result = sourceIllegal;
  285. break;
  286. }
  287. }
  288. /*
  289. * Figure out how many bytes the result will require. Turn any
  290. * illegally large UTF32 things (> Plane 17) into replacement chars.
  291. */
  292. if (ch < (UTF32)0x80) { bytesToWrite = 1;
  293. } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
  294. } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
  295. } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
  296. } else { bytesToWrite = 3;
  297. ch = UNI_REPLACEMENT_CHAR;
  298. result = sourceIllegal;
  299. }
  300. target += bytesToWrite;
  301. if (target > targetEnd) {
  302. --source; /* Back up source pointer! */
  303. target -= bytesToWrite; result = targetExhausted; break;
  304. }
  305. switch (bytesToWrite) { /* note: everything falls through. */
  306. case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  307. case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  308. case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  309. case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
  310. }
  311. target += bytesToWrite;
  312. }
  313. *sourceStart = source;
  314. *targetStart = target;
  315. return result;
  316. }
  317. /* --------------------------------------------------------------------- */
  318. /*
  319. * Utility routine to tell whether a sequence of bytes is legal UTF-8.
  320. * This must be called with the length pre-determined by the first byte.
  321. * If not calling this from ConvertUTF8to*, then the length can be set by:
  322. * length = trailingBytesForUTF8[*source]+1;
  323. * and the sequence is illegal right away if there aren't that many bytes
  324. * available.
  325. * If presented with a length > 4, this returns false. The Unicode
  326. * definition of UTF-8 goes up to 4-byte sequences.
  327. */
  328. static Boolean isLegalUTF8(const UTF8 *source, int length) {
  329. UTF8 a;
  330. const UTF8 *srcptr = source+length;
  331. switch (length) {
  332. default: return false;
  333. /* Everything else falls through when "true"... */
  334. case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
  335. case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
  336. case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
  337. switch (*source) {
  338. /* no fall-through in this inner switch */
  339. case 0xE0: if (a < 0xA0) return false; break;
  340. case 0xED: if (a > 0x9F) return false; break;
  341. case 0xF0: if (a < 0x90) return false; break;
  342. case 0xF4: if (a > 0x8F) return false; break;
  343. default: if (a < 0x80) return false;
  344. }
  345. case 1: if (*source >= 0x80 && *source < 0xC2) return false;
  346. }
  347. if (*source > 0xF4) return false;
  348. return true;
  349. }
  350. /* --------------------------------------------------------------------- */
  351. /*
  352. * Exported function to return whether a UTF-8 sequence is legal or not.
  353. * This is not used here; it's just exported.
  354. */
  355. Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
  356. int length = trailingBytesForUTF8[*source]+1;
  357. if (length > sourceEnd - source) {
  358. return false;
  359. }
  360. return isLegalUTF8(source, length);
  361. }
  362. /* --------------------------------------------------------------------- */
  363. /*
  364. * Exported function to return the total number of bytes in a codepoint
  365. * represented in UTF-8, given the value of the first byte.
  366. */
  367. unsigned getNumBytesForUTF8(UTF8 first) {
  368. return trailingBytesForUTF8[first] + 1;
  369. }
  370. int getUTF8StringLength(const UTF8* utf8)
  371. {
  372. const UTF8** source = &utf8;
  373. const UTF8* sourceEnd = utf8 + strlen((const char*)utf8);
  374. int ret = 0;
  375. while (*source != sourceEnd) {
  376. int length = trailingBytesForUTF8[**source] + 1;
  377. if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
  378. return 0;
  379. *source += length;
  380. ++ret;
  381. }
  382. return ret;
  383. }
  384. /* --------------------------------------------------------------------- */
  385. /*
  386. * Exported function to return whether a UTF-8 string is legal or not.
  387. * This is not used here; it's just exported.
  388. */
  389. Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
  390. while (*source != sourceEnd) {
  391. int length = trailingBytesForUTF8[**source] + 1;
  392. if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
  393. return false;
  394. *source += length;
  395. }
  396. return true;
  397. }
  398. /* --------------------------------------------------------------------- */
  399. ConversionResult ConvertUTF8toUTF16 (
  400. const UTF8** sourceStart, const UTF8* sourceEnd,
  401. UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
  402. ConversionResult result = conversionOK;
  403. const UTF8* source = *sourceStart;
  404. UTF16* target = *targetStart;
  405. while (source < sourceEnd) {
  406. UTF32 ch = 0;
  407. unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
  408. if (extraBytesToRead >= sourceEnd - source) {
  409. result = sourceExhausted; break;
  410. }
  411. /* Do this check whether lenient or strict */
  412. if (!isLegalUTF8(source, extraBytesToRead+1)) {
  413. result = sourceIllegal;
  414. break;
  415. }
  416. /*
  417. * The cases all fall through. See "Note A" below.
  418. */
  419. switch (extraBytesToRead) {
  420. case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
  421. case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
  422. case 3: ch += *source++; ch <<= 6;
  423. case 2: ch += *source++; ch <<= 6;
  424. case 1: ch += *source++; ch <<= 6;
  425. case 0: ch += *source++;
  426. }
  427. ch -= offsetsFromUTF8[extraBytesToRead];
  428. if (target >= targetEnd) {
  429. source -= (extraBytesToRead+1); /* Back up source pointer! */
  430. result = targetExhausted; break;
  431. }
  432. if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
  433. /* UTF-16 surrogate values are illegal in UTF-32 */
  434. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
  435. if (flags == strictConversion) {
  436. source -= (extraBytesToRead+1); /* return to the illegal value itself */
  437. result = sourceIllegal;
  438. break;
  439. } else {
  440. *target++ = UNI_REPLACEMENT_CHAR;
  441. }
  442. } else {
  443. *target++ = (UTF16)ch; /* normal case */
  444. }
  445. } else if (ch > UNI_MAX_UTF16) {
  446. if (flags == strictConversion) {
  447. result = sourceIllegal;
  448. source -= (extraBytesToRead+1); /* return to the start */
  449. break; /* Bail out; shouldn't continue */
  450. } else {
  451. *target++ = UNI_REPLACEMENT_CHAR;
  452. }
  453. } else {
  454. /* target is a character in range 0xFFFF - 0x10FFFF. */
  455. if (target + 1 >= targetEnd) {
  456. source -= (extraBytesToRead+1); /* Back up source pointer! */
  457. result = targetExhausted; break;
  458. }
  459. ch -= halfBase;
  460. *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
  461. *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
  462. }
  463. }
  464. *sourceStart = source;
  465. *targetStart = target;
  466. return result;
  467. }
  468. /* --------------------------------------------------------------------- */
  469. ConversionResult ConvertUTF8toUTF32 (
  470. const UTF8** sourceStart, const UTF8* sourceEnd,
  471. UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
  472. ConversionResult result = conversionOK;
  473. const UTF8* source = *sourceStart;
  474. UTF32* target = *targetStart;
  475. while (source < sourceEnd) {
  476. UTF32 ch = 0;
  477. unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
  478. if (extraBytesToRead >= sourceEnd - source) {
  479. result = sourceExhausted; break;
  480. }
  481. /* Do this check whether lenient or strict */
  482. if (!isLegalUTF8(source, extraBytesToRead+1)) {
  483. result = sourceIllegal;
  484. break;
  485. }
  486. /*
  487. * The cases all fall through. See "Note A" below.
  488. */
  489. switch (extraBytesToRead) {
  490. case 5: ch += *source++; ch <<= 6;
  491. case 4: ch += *source++; ch <<= 6;
  492. case 3: ch += *source++; ch <<= 6;
  493. case 2: ch += *source++; ch <<= 6;
  494. case 1: ch += *source++; ch <<= 6;
  495. case 0: ch += *source++;
  496. }
  497. ch -= offsetsFromUTF8[extraBytesToRead];
  498. if (target >= targetEnd) {
  499. source -= (extraBytesToRead+1); /* Back up the source pointer! */
  500. result = targetExhausted; break;
  501. }
  502. if (ch <= UNI_MAX_LEGAL_UTF32) {
  503. /*
  504. * UTF-16 surrogate values are illegal in UTF-32, and anything
  505. * over Plane 17 (> 0x10FFFF) is illegal.
  506. */
  507. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
  508. if (flags == strictConversion) {
  509. source -= (extraBytesToRead+1); /* return to the illegal value itself */
  510. result = sourceIllegal;
  511. break;
  512. } else {
  513. *target++ = UNI_REPLACEMENT_CHAR;
  514. }
  515. } else {
  516. *target++ = ch;
  517. }
  518. } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
  519. result = sourceIllegal;
  520. *target++ = UNI_REPLACEMENT_CHAR;
  521. }
  522. }
  523. *sourceStart = source;
  524. *targetStart = target;
  525. return result;
  526. }
  527. /* ---------------------------------------------------------------------
  528. Note A.
  529. The fall-through switches in UTF-8 reading code save a
  530. temp variable, some decrements & conditionals. The switches
  531. are equivalent to the following loop:
  532. {
  533. int tmpBytesToRead = extraBytesToRead+1;
  534. do {
  535. ch += *source++;
  536. --tmpBytesToRead;
  537. if (tmpBytesToRead) ch <<= 6;
  538. } while (tmpBytesToRead > 0);
  539. }
  540. In UTF-8 writing code, the switches on "bytesToWrite" are
  541. similarly unrolled loops.
  542. --------------------------------------------------------------------- */