// rapidxml_sax3.hpp
  1. #ifndef RAPIDXML_SAX3_HPP_INCLUDED
  2. #define RAPIDXML_SAX3_HPP_INCLUDED
  3. // Copyright (C) 2006, 2009 Marcin Kalicinski
  4. // Version 1.13
  5. // Revision $DateTime: 2009/05/13 01:46:17 $
  6. //! \file rapidxml_sax3.hpp This file contains rapidxml SAX parser implementation
  7. #include <vector>
  8. #include <utility>
  9. #include "rapidxml.hpp"
  10. // On MSVC, disable "conditional expression is constant" warning (level 4).
  11. // This warning is almost impossible to avoid with certain types of templated code
  12. #ifdef _MSC_VER
  13. #pragma warning(push)
  14. #pragma warning(disable:4127) // Conditional expression is constant
  15. #endif
  16. #if !defined(RAPIDXML_PARSE_ERROR)
  17. #define RAPIDXML_PARSE_ERROR(what, where) throw parse_error(what, where)
  18. #endif
  19. namespace rapidxml
  20. {
  21. const int parse_normal = parse_no_data_nodes;
  22. typedef std::pair<char*, size_t> tok_string;
  23. typedef std::pair<const char*, size_t> const_tok_string;
  //! Event sink for xml_sax3_parser.
  //! Every string is handed over as a pointer into the parsed buffer plus a length.
  class xml_sax3_handler
  {
  public:
      virtual ~xml_sax3_handler() = default;

      //! An element name was read; its attributes (if any) have not been parsed yet.
      //! The name pointer is non-const.
      virtual void xmlSAX3StartElement(char *name, size_t nameLen) = 0;

      //! Reports one attribute as a name/value pair.
      virtual void xmlSAX3Attr(const char *name, size_t nameLen,
                               const char *value, size_t valueLen) = 0;

      //! Sent after the attributes of the current element have been parsed.
      virtual void xmlSAX3EndAttr() = 0;

      //! The element is complete (either self-closed or its closing tag was consumed).
      virtual void xmlSAX3EndElement(const char *name, size_t nameLen) = 0;

      //! Reports a run of character data.
      virtual void xmlSAX3Text(const char *text, size_t len) = 0;
  };
  35. ///////////////////////////////////////////////////////////////////////////
  36. // XML sax parser
  37. class xml_sax2_handler : public xml_sax3_handler
  38. {
  39. public:
  40. xml_sax2_handler() { elementAttrs.reserve(64); }
  41. /**
  42. * @remark: The parameter 'name' without null terminator charactor
  43. */
  44. virtual void xmlSAX2StartElement(const char *name, size_t, const char **atts, size_t) = 0;
  45. /**
  46. * @remark: The parameter 'name' has null terminator charactor
  47. */
  48. virtual void xmlSAX2EndElement(const char *name, size_t) = 0;
  49. /**
  50. * @remark: The parameter 's' has null terminator charactor
  51. */
  52. virtual void xmlSAX2Text(const char *s, size_t) = 0;
  53. /// Implement SAX3 interfaces:
  54. virtual void xmlSAX3StartElement(char * name, size_t size) final
  55. {
  56. elementName.first = name;
  57. elementName.second = size;
  58. }
  59. virtual void xmlSAX3Attr(
  60. const char* name, size_t,
  61. const char* value, size_t) final
  62. {
  63. elementAttrs.push_back(name);
  64. elementAttrs.push_back(value);
  65. }
  66. void xmlSAX3EndAttr() final
  67. {
  68. auto chTemp = elementName.first[elementName.second];
  69. elementName.first[elementName.second] = '\0';
  70. if (!elementAttrs.empty()) {
  71. elementAttrs.push_back(nullptr);
  72. xmlSAX2StartElement(elementName.first, elementName.second, &elementAttrs[0], elementAttrs.size() - 1);
  73. elementAttrs.clear();
  74. }
  75. else {
  76. const char* attr = nullptr;
  77. const char** attrs = &attr;
  78. xmlSAX2StartElement(elementName.first, elementName.second, attrs, 0);
  79. }
  80. elementName.first[elementName.second] = chTemp;
  81. }
  82. virtual void xmlSAX3EndElement(const char *name, size_t len) final
  83. {
  84. xmlSAX2EndElement(name, len);
  85. }
  86. virtual void xmlSAX3Text(const char *s, size_t len) final
  87. {
  88. xmlSAX2Text(s, len);
  89. }
  90. private:
  91. tok_string elementName;
  92. std::vector<const char*> elementAttrs;
  93. };
  //! SAX-style (event driven) XML parser.
  //! No DOM tree is built: while parsing, element, attribute and text events are reported
  //! to the xml_sax3_handler that was passed to the constructor.
  //! Use parse() function to process XML text that is held in memory.
  //! \param Ch Character type to use.
  101. template<class Ch = char>
  102. class xml_sax3_parser
  103. {
  104. xml_sax3_handler* handler_;
  105. public:
  106. //! Constructs empty XML document
  107. xml_sax3_parser(xml_sax3_handler* handler)
  108. {
  109. handler_ = handler;
  110. endptr_ = nullptr;
  111. }
  112. Ch *endptr_;
  113. //! Parses zero-terminated XML string according to given flags.
  114. //! Passed string will be modified by the parser, unless rapidxml::parse_non_destructive flag is used.
  115. //! The string must persist for the lifetime of the document.
  116. //! In case of error, rapidxml::parse_error exception will be thrown.
  117. //! <br><br>
  118. //! If you want to parse contents of a file, you must first load the file into the memory, and pass pointer to its beginning.
  119. //! Make sure that data is zero-terminated.
  120. //! <br><br>
  121. //! Document can be parsed into multiple times.
  122. //! Each new call to parse removes previous nodes and attributes (if any), but does not clear memory pool.
  123. //! \param text XML data to parse; pointer is non-const to denote fact that this data may be modified by the parser.
  124. template<int Flags = parse_normal>
  125. void parse(Ch *text, int nLen)
  126. {
  127. assert(text);
  128. // Remove current contents
  129. //this->remove_all_nodes();
  130. //this->remove_all_attributes();
  131. endptr_ = nullptr;
  132. if (nLen > 0)
  133. {
  134. endptr_ = text + nLen;
  135. }
  136. // Parse BOM, if any
  137. parse_bom<Flags>(text);
  138. // Parse children
  139. while (1)
  140. {
  141. // Skip whitespace before node
  142. skip<whitespace_pred, Flags>(text, endptr_);
  143. if (*text == 0 || text >= endptr_)
  144. break;
  145. // Parse and append new child
  146. if (*text == Ch('<'))
  147. {
  148. ++text; // Skip '<'
  149. parse_node<Flags>(text);
  150. }
  151. else
  152. RAPIDXML_PARSE_ERROR("expected <", text);
  153. }
  154. }
  //! Currently a no-op.
  //! The node, attribute and memory pool cleanup calls of the DOM version are disabled here.
  void clear()
  {
      // Intentionally empty.
  }
  163. private:
  164. ///////////////////////////////////////////////////////////////////////
  165. // Internal character utility functions
  166. // Detect whitespace character
  167. struct whitespace_pred
  168. {
  169. static unsigned char test(Ch ch)
  170. {
  171. return internal::lookup_tables<0>::lookup_whitespace[static_cast<unsigned char>(ch)];
  172. }
  173. };
  174. // Detect node name character
  175. struct node_name_pred
  176. {
  177. static unsigned char test(Ch ch)
  178. {
  179. return internal::lookup_tables<0>::lookup_node_name[static_cast<unsigned char>(ch)];
  180. }
  181. };
  182. // Detect attribute name character
  183. struct attribute_name_pred
  184. {
  185. static unsigned char test(Ch ch)
  186. {
  187. return internal::lookup_tables<0>::lookup_attribute_name[static_cast<unsigned char>(ch)];
  188. }
  189. };
  190. // Detect text character (PCDATA)
  191. struct text_pred
  192. {
  193. static unsigned char test(Ch ch)
  194. {
  195. return internal::lookup_tables<0>::lookup_text[static_cast<unsigned char>(ch)];
  196. }
  197. };
  198. // Detect text character (PCDATA) that does not require processing
  199. struct text_pure_no_ws_pred
  200. {
  201. static unsigned char test(Ch ch)
  202. {
  203. return internal::lookup_tables<0>::lookup_text_pure_no_ws[static_cast<unsigned char>(ch)];
  204. }
  205. };
  206. // Detect text character (PCDATA) that does not require processing
  207. struct text_pure_with_ws_pred
  208. {
  209. static unsigned char test(Ch ch)
  210. {
  211. return internal::lookup_tables<0>::lookup_text_pure_with_ws[static_cast<unsigned char>(ch)];
  212. }
  213. };
  214. // Detect attribute value character
  215. template<Ch Quote>
  216. struct attribute_value_pred
  217. {
  218. static unsigned char test(Ch ch)
  219. {
  220. if (Quote == Ch('\''))
  221. return internal::lookup_tables<0>::lookup_attribute_data_1[static_cast<unsigned char>(ch)];
  222. if (Quote == Ch('\"'))
  223. return internal::lookup_tables<0>::lookup_attribute_data_2[static_cast<unsigned char>(ch)];
  224. return 0; // Should never be executed, to avoid warnings on Comeau
  225. }
  226. };
  227. // Detect attribute value character
  228. template<Ch Quote>
  229. struct attribute_value_pure_pred
  230. {
  231. static unsigned char test(Ch ch)
  232. {
  233. if (Quote == Ch('\''))
  234. return internal::lookup_tables<0>::lookup_attribute_data_1_pure[static_cast<unsigned char>(ch)];
  235. if (Quote == Ch('\"'))
  236. return internal::lookup_tables<0>::lookup_attribute_data_2_pure[static_cast<unsigned char>(ch)];
  237. return 0; // Should never be executed, to avoid warnings on Comeau
  238. }
  239. };
  240. // Insert coded character, using UTF8 or 8-bit ASCII
  241. template<int Flags>
  242. static void insert_coded_character(Ch *&text, unsigned long code)
  243. {
  244. if (Flags & parse_no_utf8)
  245. {
  246. // Insert 8-bit ASCII character
  247. // Todo: possibly verify that code is less than 256 and use replacement char otherwise?
  248. text[0] = static_cast<unsigned char>(code);
  249. text += 1;
  250. }
  251. else
  252. {
  253. // Insert UTF8 sequence
  254. if (code < 0x80) // 1 byte sequence
  255. {
  256. text[0] = static_cast<unsigned char>(code);
  257. text += 1;
  258. }
  259. else if (code < 0x800) // 2 byte sequence
  260. {
  261. text[1] = static_cast<unsigned char>((code | 0x80) & 0xBF); code >>= 6;
  262. text[0] = static_cast<unsigned char>(code | 0xC0);
  263. text += 2;
  264. }
  265. else if (code < 0x10000) // 3 byte sequence
  266. {
  267. text[2] = static_cast<unsigned char>((code | 0x80) & 0xBF); code >>= 6;
  268. text[1] = static_cast<unsigned char>((code | 0x80) & 0xBF); code >>= 6;
  269. text[0] = static_cast<unsigned char>(code | 0xE0);
  270. text += 3;
  271. }
  272. else if (code < 0x110000) // 4 byte sequence
  273. {
  274. text[3] = static_cast<unsigned char>((code | 0x80) & 0xBF); code >>= 6;
  275. text[2] = static_cast<unsigned char>((code | 0x80) & 0xBF); code >>= 6;
  276. text[1] = static_cast<unsigned char>((code | 0x80) & 0xBF); code >>= 6;
  277. text[0] = static_cast<unsigned char>(code | 0xF0);
  278. text += 4;
  279. }
  280. else // Invalid, only codes up to 0x10FFFF are allowed in Unicode
  281. {
  282. RAPIDXML_PARSE_ERROR("invalid numeric character entity", text);
  283. }
  284. }
  285. }
  286. // Skip characters until predicate evaluates to true
  287. template<class StopPred, int Flags>
  288. static void skip(Ch *&text, Ch *textEnd = NULL)
  289. {
  290. Ch *tmp = text;
  291. while ((textEnd == NULL || tmp < textEnd) && StopPred::test(*tmp))
  292. ++tmp;
  293. text = tmp;
  294. }
  295. // Skip characters until predicate evaluates to true while doing the following:
  296. // - replacing XML character entity references with proper characters (&apos; &amp; &quot; &lt; &gt; &#...;)
  297. // - condensing whitespace sequences to single space character
  298. template<class StopPred, class StopPredPure, int Flags>
  299. static Ch *skip_and_expand_character_refs(Ch *&text)
  300. {
  301. // If entity translation, whitespace condense and whitespace trimming is disabled, use plain skip
  302. if (Flags & parse_no_entity_translation &&
  303. !(Flags & parse_normalize_whitespace) &&
  304. !(Flags & parse_trim_whitespace))
  305. {
  306. skip<StopPred, Flags>(text);
  307. return text;
  308. }
  309. // Use simple skip until first modification is detected
  310. skip<StopPredPure, Flags>(text);
  311. // Use translation skip
  312. Ch *src = text;
  313. Ch *dest = src;
  314. while (StopPred::test(*src))
  315. {
  316. // If entity translation is enabled
  317. if (!(Flags & parse_no_entity_translation))
  318. {
  319. // Test if replacement is needed
  320. if (src[0] == Ch('&'))
  321. {
  322. switch (src[1])
  323. {
  324. // &amp; &apos;
  325. case Ch('a'):
  326. if (src[2] == Ch('m') && src[3] == Ch('p') && src[4] == Ch(';'))
  327. {
  328. *dest = Ch('&');
  329. ++dest;
  330. src += 5;
  331. continue;
  332. }
  333. if (src[2] == Ch('p') && src[3] == Ch('o') && src[4] == Ch('s') && src[5] == Ch(';'))
  334. {
  335. *dest = Ch('\'');
  336. ++dest;
  337. src += 6;
  338. continue;
  339. }
  340. break;
  341. // &quot;
  342. case Ch('q'):
  343. if (src[2] == Ch('u') && src[3] == Ch('o') && src[4] == Ch('t') && src[5] == Ch(';'))
  344. {
  345. *dest = Ch('"');
  346. ++dest;
  347. src += 6;
  348. continue;
  349. }
  350. break;
  351. // &gt;
  352. case Ch('g'):
  353. if (src[2] == Ch('t') && src[3] == Ch(';'))
  354. {
  355. *dest = Ch('>');
  356. ++dest;
  357. src += 4;
  358. continue;
  359. }
  360. break;
  361. // &lt;
  362. case Ch('l'):
  363. if (src[2] == Ch('t') && src[3] == Ch(';'))
  364. {
  365. *dest = Ch('<');
  366. ++dest;
  367. src += 4;
  368. continue;
  369. }
  370. break;
  371. // &#...; - assumes ASCII
  372. case Ch('#'):
  373. if (src[2] == Ch('x'))
  374. {
  375. unsigned long code = 0;
  376. src += 3; // Skip &#x
  377. while (1)
  378. {
  379. unsigned char digit = internal::lookup_tables<0>::lookup_digits[static_cast<unsigned char>(*src)];
  380. if (digit == 0xFF)
  381. break;
  382. code = code * 16 + digit;
  383. ++src;
  384. }
  385. insert_coded_character<Flags>(dest, code); // Put character in output
  386. }
  387. else
  388. {
  389. unsigned long code = 0;
  390. src += 2; // Skip &#
  391. while (1)
  392. {
  393. unsigned char digit = internal::lookup_tables<0>::lookup_digits[static_cast<unsigned char>(*src)];
  394. if (digit == 0xFF)
  395. break;
  396. code = code * 10 + digit;
  397. ++src;
  398. }
  399. insert_coded_character<Flags>(dest, code); // Put character in output
  400. }
  401. if (*src == Ch(';'))
  402. ++src;
  403. else
  404. RAPIDXML_PARSE_ERROR("expected ;", src);
  405. continue;
  406. // Something else
  407. default:
  408. // Ignore, just copy '&' verbatim
  409. break;
  410. }
  411. }
  412. }
  413. // If whitespace condensing is enabled
  414. if (Flags & parse_normalize_whitespace)
  415. {
  416. // Test if condensing is needed
  417. if (whitespace_pred::test(*src))
  418. {
  419. *dest = Ch(' '); ++dest; // Put single space in dest
  420. ++src; // Skip first whitespace char
  421. // Skip remaining whitespace chars
  422. while (whitespace_pred::test(*src))
  423. ++src;
  424. continue;
  425. }
  426. }
  427. // No replacement, only copy character
  428. *dest++ = *src++;
  429. }
  430. // Return new end
  431. text = src;
  432. return dest;
  433. }
  434. ///////////////////////////////////////////////////////////////////////
  435. // Internal parsing functions
  // Step over a UTF-8 byte order mark (EF BB BF) if text starts with one
  template<int Flags>
  void parse_bom(char *&text)
  {
      // Bytes are inspected one at a time, so later bytes are only read after a match
      if (static_cast<unsigned char>(text[0]) != 0xEF)
          return;
      if (static_cast<unsigned char>(text[1]) != 0xBB)
          return;
      if (static_cast<unsigned char>(text[2]) != 0xBF)
          return;
      text += 3;
  }
  // Step over a wide-character byte order mark (0xFEFF) if text starts with one
  template<int Flags>
  void parse_bom(wchar_t *&text)
  {
      const wchar_t byte_order_mark = 0xFEFF;
      if (*text != byte_order_mark)
          return;
      ++text;
  }
  457. // Parse XML declaration (<?xml...)
  458. template<int Flags>
  459. void parse_xml_declaration(Ch *&text)
  460. {
  461. // If parsing of declaration is disabled
  462. if (!(Flags & parse_declaration_node))
  463. {
  464. // Skip until end of declaration
  465. while (text[0] != Ch('?') || text[1] != Ch('>'))
  466. {
  467. if (!text[0])
  468. RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  469. ++text;
  470. }
  471. text += 2; // Skip '?>'
  472. return; // return 0;
  473. }
  474. // Create declaration
  475. // xml_node<Ch> *declaration = this->allocate_node(node_declaration);
  476. // Skip whitespace before attributes or ?>
  477. skip<whitespace_pred, Flags>(text, endptr_);
  478. // Parse declaration attributes
  479. parse_node_attributes<Flags>(text/*, declaration*/);
  480. // Skip ?>
  481. if (text[0] != Ch('?') || text[1] != Ch('>'))
  482. RAPIDXML_PARSE_ERROR("expected ?>", text);
  483. text += 2;
  484. // return declaration;
  485. }
  486. // Parse XML comment (<!--...)
  487. template<int Flags>
  488. void parse_comment(Ch *&text)
  489. {
  490. // If parsing of comments is disabled
  491. if (!(Flags & parse_comment_nodes))
  492. {
  493. // Skip until end of comment
  494. while (text[0] != Ch('-') || text[1] != Ch('-') || text[2] != Ch('>'))
  495. {
  496. if (!text[0])
  497. RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  498. ++text;
  499. }
  500. text += 3; // Skip '-->'
  501. return;// return 0; // Do not produce comment node
  502. }
  503. // Remember value start
  504. Ch *value = text;
  505. // Skip until end of comment
  506. while (text[0] != Ch('-') || text[1] != Ch('-') || text[2] != Ch('>'))
  507. {
  508. if (!text[0])
  509. RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  510. ++text;
  511. }
  512. // Create comment node
  513. // xml_node<Ch> *comment = this->allocate_node(node_comment);
  514. // comment->value(value, text - value); // TODO: DNT implement comment
  515. // Place zero terminator after comment value
  516. if (!(Flags & parse_no_string_terminators))
  517. *text = Ch('\0');
  518. text += 3; // Skip '-->'
  519. return;
  520. }
  521. // Parse DOCTYPE
  522. template<int Flags>
  523. void parse_doctype(Ch *&text)
  524. {
  525. // Remember value start
  526. Ch *value = text;
  527. // Skip to >
  528. while (*text != Ch('>'))
  529. {
  530. // Determine character type
  531. switch (*text)
  532. {
  533. // If '[' encountered, scan for matching ending ']' using naive algorithm with depth
  534. // This works for all W3C test files except for 2 most wicked
  535. case Ch('['):
  536. {
  537. ++text; // Skip '['
  538. int depth = 1;
  539. while (depth > 0)
  540. {
  541. switch (*text)
  542. {
  543. case Ch('['): ++depth; break;
  544. case Ch(']'): --depth; break;
  545. case 0: RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  546. default: break;
  547. }
  548. ++text;
  549. }
  550. break;
  551. }
  552. // Error on end of text
  553. case Ch('\0'):
  554. RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  555. // Other character, skip it
  556. default:
  557. ++text;
  558. }
  559. }
  560. // If DOCTYPE nodes enabled
  561. if (Flags & parse_doctype_node)
  562. {
  563. #if 0
  564. // Create a new doctype node
  565. xml_node<Ch> *doctype = this->allocate_node(node_doctype);
  566. doctype->value(value, text - value);
  567. #endif
  568. // Place zero terminator after value
  569. if (!(Flags & parse_no_string_terminators))
  570. *text = Ch('\0');
  571. text += 1; // skip '>'
  572. return;// return doctype;
  573. }
  574. else
  575. {
  576. text += 1; // skip '>'
  577. return;// return 0;
  578. }
  579. }
  580. // Parse PI
  581. template<int Flags>
  582. void parse_pi(Ch *&text)
  583. {
  584. // If creation of PI nodes is enabled
  585. if (Flags & parse_pi_nodes)
  586. {
  587. // Create pi node
  588. // xml_node<Ch> *pi = this->allocate_node(node_pi);
  589. // Extract PI target name
  590. Ch *name = text;
  591. skip<node_name_pred, Flags>(text, endptr_);
  592. if (text == name)
  593. RAPIDXML_PARSE_ERROR("expected PI target", text);
  594. // pi->name(name, text - name); // TODO: DNT notify for pi
  595. // Skip whitespace between pi target and pi
  596. skip<whitespace_pred, Flags>(text, endptr_);
  597. // Remember start of pi
  598. Ch *value = text;
  599. // Skip to '?>'
  600. while (text[0] != Ch('?') || text[1] != Ch('>'))
  601. {
  602. if (*text == Ch('\0'))
  603. RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  604. ++text;
  605. }
  606. #if 0 // TODO: DNT notify for pi
  607. // Set pi value (verbatim, no entity expansion or whitespace normalization)
  608. pi->value(value, text - value);
  609. // Place zero terminator after name and value
  610. if (!(Flags & parse_no_string_terminators))
  611. {
  612. pi->name()[pi->name_size()] = Ch('\0');
  613. pi->value()[pi->value_size()] = Ch('\0');
  614. }
  615. #endif
  616. text += 2; // Skip '?>'
  617. return; // return pi;
  618. }
  619. else
  620. {
  621. // Skip to '?>'
  622. while (text[0] != Ch('?') || text[1] != Ch('>'))
  623. {
  624. if (*text == Ch('\0'))
  625. RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  626. ++text;
  627. }
  628. text += 2; // Skip '?>'
  629. return; // return 0;
  630. }
  631. }
  632. // Parse and append data
  633. // Return character that ends data.
  634. // This is necessary because this character might have been overwritten by a terminating 0
  635. template<int Flags>
  636. Ch parse_and_append_data(/*const tok_string& elementName unused for SAX,*/ Ch *&text, Ch *contents_start)
  637. {
  638. // Backup to contents start if whitespace trimming is disabled
  639. if (!(Flags & parse_trim_whitespace))
  640. text = contents_start;
  641. // Skip until end of data
  642. Ch *value = text, *end;
  643. if (Flags & parse_normalize_whitespace)
  644. end = skip_and_expand_character_refs<text_pred, text_pure_with_ws_pred, Flags>(text);
  645. else
  646. end = skip_and_expand_character_refs<text_pred, text_pure_no_ws_pred, Flags>(text);
  647. // Trim trailing whitespace if flag is set; leading was already trimmed by whitespace skip after >
  648. if (Flags & parse_trim_whitespace)
  649. {
  650. if (Flags & parse_normalize_whitespace)
  651. {
  652. // Whitespace is already condensed to single space characters by skipping function, so just trim 1 char off the end
  653. if (*(end - 1) == Ch(' '))
  654. --end;
  655. }
  656. else
  657. {
  658. // Backup until non-whitespace character is found
  659. while (whitespace_pred::test(*(end - 1)))
  660. --end;
  661. }
  662. }
  663. #if 0 // disable data node
  664. // If characters are still left between end and value (this test is only necessary if normalization is enabled)
  665. // Create new data node
  666. if (!(Flags & parse_no_data_nodes))
  667. {
  668. xml_node<Ch> *data = this->allocate_node(node_data);
  669. data->value(value, end - value);
  670. node->append_node(data);
  671. }
  672. #endif
  673. // Add data to parent node if no data exists yet
  674. #if 0
  675. if (!(Flags & parse_no_element_values))
  676. if (*node->value() == Ch('\0'))
  677. ;// node->value(value, end - value);
  678. #endif
  679. Ch ch = *text;
  680. // Place zero terminator after value
  681. if (!(Flags & parse_no_string_terminators))
  682. {
  683. //Ch ch = *text;
  684. *end = Ch('\0');
  685. //return ch; // Return character that ends data; this is required because zero terminator overwritten it
  686. }
  687. handler_->xmlSAX3Text(value, end - value);
  688. // Return character that ends data
  689. return ch;
  690. }
  691. // Parse CDATA
  692. template<int Flags>
  693. void parse_cdata(Ch *&text)
  694. {
  695. // If CDATA is disabled
  696. if (Flags & parse_no_data_nodes)
  697. {
  698. // Skip until end of cdata
  699. while (text[0] != Ch(']') || text[1] != Ch(']') || text[2] != Ch('>'))
  700. {
  701. if (!text[0])
  702. RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  703. ++text;
  704. }
  705. text += 3; // Skip ]]>
  706. return; // return 0; // Do not produce CDATA node
  707. }
  708. // Skip until end of cdata
  709. Ch *value = text;
  710. while (text[0] != Ch(']') || text[1] != Ch(']') || text[2] != Ch('>'))
  711. {
  712. if (!text[0])
  713. RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  714. ++text;
  715. }
  716. #if 0 // TODO: disable CDATA
  717. // Create new cdata node
  718. xml_node<Ch> *cdata = this->allocate_node(node_cdata);
  719. cdata->value(value, text - value);
  720. #endif
  721. // Place zero terminator after value
  722. if (!(Flags & parse_no_string_terminators))
  723. *text = Ch('\0');
  724. text += 3; // Skip ]]>
  725. return;// return cdata;
  726. }
  727. // Parse element node
  728. template<int Flags>
  729. void parse_element(Ch *&text)
  730. {
  731. // Create element node
  732. // xml_node<Ch> *element = this->allocate_node(node_element);
  733. // Extract element name
  734. tok_string elementName(text, 0);
  735. skip<node_name_pred, Flags>(text, endptr_);
  736. elementName.second = text - elementName.first;
  737. if (0 == elementName.second)
  738. RAPIDXML_PARSE_ERROR("expected element name", text);
  739. handler_->xmlSAX3StartElement(elementName.first, elementName.second);
  740. // Skip whitespace between element name and attributes or >
  741. skip<whitespace_pred, Flags>(text, endptr_);
  742. // Parse attributes, if any
  743. parse_node_attributes<Flags>(text);
  744. handler_->xmlSAX3EndAttr();
  745. // Determine ending type
  746. if (*text == Ch('>'))
  747. {
  748. ++text;
  749. parse_node_contents<Flags>(text, elementName);
  750. }
  751. else if (*text == Ch('/'))
  752. {
  753. ++text;
  754. if (*text != Ch('>'))
  755. RAPIDXML_PARSE_ERROR("expected >", text);
  756. ++text;
  757. }
  758. else
  759. RAPIDXML_PARSE_ERROR("expected >", text);
  760. // Place zero terminator after name
  761. if (!(Flags & parse_no_string_terminators)) {
  762. elementName.first[elementName.second] = (Ch)'\0';
  763. }
  764. // Return parsed element
  765. handler_->xmlSAX3EndElement(elementName.first, elementName.second);
  766. // return element;
  767. }
  768. // Determine node type, and parse it
  769. template<int Flags>
  770. void parse_node(Ch *&text)
  771. {
  772. // Parse proper node type
  773. switch (text[0])
  774. {
  775. // <...
  776. default:
  777. // Parse and append element node
  778. return parse_element<Flags>(text);
  779. // <?...
  780. case Ch('?'):
  781. ++text; // Skip ?
  782. if ((text[0] == Ch('x') || text[0] == Ch('X')) &&
  783. (text[1] == Ch('m') || text[1] == Ch('M')) &&
  784. (text[2] == Ch('l') || text[2] == Ch('L')) &&
  785. whitespace_pred::test(text[3]))
  786. {
  787. // '<?xml ' - xml declaration
  788. text += 4; // Skip 'xml '
  789. return parse_xml_declaration<Flags>(text);
  790. }
  791. else
  792. {
  793. // Parse PI
  794. return parse_pi<Flags>(text);
  795. }
  796. // <!...
  797. case Ch('!'):
  798. // Parse proper subset of <! node
  799. switch (text[1])
  800. {
  801. // <!-
  802. case Ch('-'):
  803. if (text[2] == Ch('-'))
  804. {
  805. // '<!--' - xml comment
  806. text += 3; // Skip '!--'
  807. return parse_comment<Flags>(text);
  808. }
  809. break;
  810. // <![
  811. case Ch('['):
  812. if (text[2] == Ch('C') && text[3] == Ch('D') && text[4] == Ch('A') &&
  813. text[5] == Ch('T') && text[6] == Ch('A') && text[7] == Ch('['))
  814. {
  815. // '<![CDATA[' - cdata
  816. text += 8; // Skip '![CDATA['
  817. return parse_cdata<Flags>(text);
  818. }
  819. break;
  820. // <!D
  821. case Ch('D'):
  822. if (text[2] == Ch('O') && text[3] == Ch('C') && text[4] == Ch('T') &&
  823. text[5] == Ch('Y') && text[6] == Ch('P') && text[7] == Ch('E') &&
  824. whitespace_pred::test(text[8]))
  825. {
  826. // '<!DOCTYPE ' - doctype
  827. text += 9; // skip '!DOCTYPE '
  828. return parse_doctype<Flags>(text);
  829. }
  830. break;
  831. default: break;
  832. } // switch
  833. // Attempt to skip other, unrecognized node types starting with <!
  834. ++text; // Skip !
  835. while (*text != Ch('>'))
  836. {
  837. if (*text == 0)
  838. RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  839. ++text;
  840. }
  841. ++text; // Skip '>'
  842. return; // return 0; // No node recognized
  843. }
  844. }
  845. // Parse contents of the node - children, data etc.
  846. template<int Flags>
  847. void parse_node_contents(Ch *&text, const tok_string& elementName/*element name*/)
  848. {
  849. // For all children and text
  850. while (1)
  851. {
  852. // Skip whitespace between > and node contents
  853. Ch *contents_start = text; // Store start of node contents before whitespace is skipped
  854. skip<whitespace_pred, Flags>(text, endptr_);
  855. Ch next_char = *text;
  856. // After data nodes, instead of continuing the loop, control jumps here.
  857. // This is because zero termination inside parse_and_append_data() function
  858. // would wreak havoc with the above code.
  859. // Also, skipping whitespace after data nodes is unnecessary.
  860. after_data_node:
  861. // Determine what comes next: node closing, child node, data node, or 0?
  862. switch (next_char)
  863. {
  864. // Node closing or child node
  865. case Ch('<'):
  866. if (text[1] == Ch('/'))
  867. {
  868. // Node closing
  869. text += 2; // Skip '</'
  870. if (Flags & parse_validate_closing_tags)
  871. {
  872. // Skip and validate closing tag name
  873. Ch *closing_name = text;
  874. skip<node_name_pred, Flags>(text, endptr_);
  875. if (!internal::compare(elementName.first, elementName.second, closing_name, text - closing_name, true))
  876. RAPIDXML_PARSE_ERROR("invalid closing tag name", text);
  877. }
  878. else
  879. {
  880. // No validation, just skip name
  881. skip<node_name_pred, Flags>(text, endptr_);
  882. }
  883. // Skip remaining whitespace after node name
  884. skip<whitespace_pred, Flags>(text, endptr_);
  885. if (*text != Ch('>'))
  886. RAPIDXML_PARSE_ERROR("expected >", text);
  887. ++text; // Skip '>'
  888. return; // Node closed, finished parsing contents
  889. }
  890. else
  891. {
  892. // Child node
  893. ++text; // Skip '<'
  894. parse_node<Flags>(text);
  895. /*if (xml_node<Ch> *child = parse_node<Flags>(text))
  896. node->append_node(child);*/
  897. }
  898. break;
  899. // End of data - error
  900. case Ch('\0'):
  901. RAPIDXML_PARSE_ERROR("unexpected end of data", text);
  902. // Data node
  903. default:
  904. next_char = parse_and_append_data<Flags>(/*elementName, */text, contents_start);
  905. goto after_data_node; // Bypass regular processing after data nodes
  906. }
  907. }
  908. }
  909. // Parse XML attributes of the node
  910. template<int Flags>
  911. void parse_node_attributes(Ch *&text)
  912. {
  913. // For all attributes
  914. while (attribute_name_pred::test(*text))
  915. {
  916. // Extract attribute name
  917. Ch *name = text;
  918. ++text; // Skip first character of attribute name
  919. skip<attribute_name_pred, Flags>(text, endptr_);
  920. if (text == name)
  921. RAPIDXML_PARSE_ERROR("expected attribute name", name);
  922. // Create new attribute
  923. // xml_attribute<Ch> *attribute = this->allocate_attribute();
  924. // attribute->name(name, text - name);
  925. auto namesize = text - name;
  926. // node->append_attribute(attribute);
  927. // Skip whitespace after attribute name
  928. skip<whitespace_pred, Flags>(text, endptr_);
  929. // Skip =
  930. if (*text != Ch('='))
  931. RAPIDXML_PARSE_ERROR("expected =", text);
  932. ++text;
  933. // Add terminating zero after name
  934. if (!(Flags & parse_no_string_terminators))
  935. name[namesize] = 0;
  936. // Skip whitespace after =
  937. skip<whitespace_pred, Flags>(text, endptr_);
  938. // Skip quote and remember if it was ' or "
  939. Ch quote = *text;
  940. if (quote != Ch('\'') && quote != Ch('"'))
  941. RAPIDXML_PARSE_ERROR("expected ' or \"", text);
  942. ++text;
  943. // Extract attribute value and expand char refs in it
  944. Ch *value = text, *end;
  945. const int AttFlags = Flags & ~parse_normalize_whitespace; // No whitespace normalization in attributes
  946. if (quote == Ch('\''))
  947. end = skip_and_expand_character_refs<attribute_value_pred<Ch('\'')>, attribute_value_pure_pred<Ch('\'')>, AttFlags>(text);
  948. else
  949. end = skip_and_expand_character_refs<attribute_value_pred<Ch('"')>, attribute_value_pure_pred<Ch('"')>, AttFlags>(text);
  950. // Set attribute value
  951. // attribute->value(value, end - value);
  952. auto valuesize = end - value;
  953. // Make sure that end quote is present
  954. if (*text != quote)
  955. RAPIDXML_PARSE_ERROR("expected ' or \"", text);
  956. ++text; // Skip quote
  957. // Add terminating zero after value
  958. if (!(Flags & parse_no_string_terminators))
  959. value[valuesize] = 0;
  960. handler_->xmlSAX3Attr(name, namesize, value, valuesize);
  961. // Skip whitespace after attribute value
  962. skip<whitespace_pred, Flags>(text, endptr_);
  963. }
  964. }
  965. };
  966. }
  967. // Undefine internal macros
  968. #undef RAPIDXML_PARSE_ERROR
  969. // On MSVC, restore warnings state
  970. #ifdef _MSC_VER
  971. #pragma warning(pop)
  972. #endif
  973. #endif