Orcus
sax_parser.hpp
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6  */
7 
8 #ifndef ORCUS_SAX_PARSER_HPP
9 #define ORCUS_SAX_PARSER_HPP
10 
11 #include "sax_parser_base.hpp"
12 
13 namespace orcus {
14 
/**
 * Default configuration type used as the second template argument of
 * sax_parser when the caller does not supply one.
 */
struct sax_parser_default_config
{
    /**
     * Baseline XML version the parser enforces, expressed as the version
     * number multiplied by ten (10 means XML 1.0, 11 means XML 1.1).  The
     * parser requires a leading "<?xml ...?>" declaration only when this
     * value is 11 or greater.
     */
    static const uint8_t baseline_version = 10;
};
24 
29 template<typename _Handler, typename _Config = sax_parser_default_config>
31 {
32 public:
33  typedef _Handler handler_type;
34  typedef _Config config_type;
35 
36  sax_parser(const char* content, const size_t size, handler_type& handler);
37  ~sax_parser();
38 
39  void parse();
40 
41 private:
42 
47  void header();
48  void body();
49  void element();
50  void element_open(std::ptrdiff_t begin_pos);
51  void element_close(std::ptrdiff_t begin_pos);
52  void special_tag();
53  void declaration(const char* name_check);
54  void cdata();
55  void doctype();
56  void characters();
57  void attribute();
58 
59 private:
60  handler_type& m_handler;
61 };
62 
63 template<typename _Handler, typename _Config>
65  const char* content, const size_t size, handler_type& handler) :
66  sax::parser_base(content, size),
67  m_handler(handler)
68 {
69 }
70 
71 template<typename _Handler, typename _Config>
72 sax_parser<_Handler,_Config>::~sax_parser()
73 {
74 }
75 
76 template<typename _Handler, typename _Config>
77 void sax_parser<_Handler,_Config>::parse()
78 {
79  m_nest_level = 0;
80  mp_char = mp_begin;
81  header();
82  blank();
83  body();
84 
85  assert(m_buffer_pos == 0);
86 }
87 
88 template<typename _Handler, typename _Config>
89 void sax_parser<_Handler,_Config>::header()
90 {
91  // we don't handle multi byte encodings so we can just skip bom entry if exists.
92  skip_bom();
93  blank();
94  if (!has_char() || cur_char() != '<')
95  throw sax::malformed_xml_error("xml file must begin with '<'.", offset());
96 
97  if (config_type::baseline_version >= 11)
98  {
99  // XML version 1.1 requires a header declaration whereas in 1.0 it's
100  // optional.
101  if (next_char_checked() != '?')
102  throw sax::malformed_xml_error("xml file must begin with '<?'.", offset());
103 
104  declaration("xml");
105  }
106 }
107 
108 template<typename _Handler, typename _Config>
109 void sax_parser<_Handler,_Config>::body()
110 {
111  while (has_char())
112  {
113  if (cur_char() == '<')
114  {
115  element();
116  if (!m_root_elem_open)
117  // Root element closed. Stop parsing.
118  return;
119  }
120  else if (m_nest_level)
121  // Call characters only when in xml hierarchy.
122  characters();
123  else
124  next();
125  }
126 }
127 
128 template<typename _Handler, typename _Config>
129 void sax_parser<_Handler,_Config>::element()
130 {
131  assert(cur_char() == '<');
132  std::ptrdiff_t pos = offset();
133  char c = next_char_checked();
134  switch (c)
135  {
136  case '/':
137  element_close(pos);
138  break;
139  case '!':
140  special_tag();
141  break;
142  case '?':
143  declaration(nullptr);
144  break;
145  default:
146  if (!is_alpha(c) && c != '_')
147  throw sax::malformed_xml_error("expected an alphabet.", offset());
148  element_open(pos);
149  }
150 }
151 
152 template<typename _Handler, typename _Config>
153 void sax_parser<_Handler,_Config>::element_open(std::ptrdiff_t begin_pos)
154 {
155  assert(is_alpha(cur_char()) || cur_char() == '_');
156 
157  sax::parser_element elem;
158  element_name(elem, begin_pos);
159 
160  while (true)
161  {
162  blank();
163  char c = cur_char();
164  if (c == '/')
165  {
166  // Self-closing element: <element/>
167  if (next_and_char() != '>')
168  throw sax::malformed_xml_error("expected '/>' to self-close the element.", offset());
169  next();
170  elem.end_pos = offset();
171  m_handler.start_element(elem);
172  reset_buffer_pos();
173  m_handler.end_element(elem);
174  if (!m_nest_level)
175  m_root_elem_open = false;
176 #if ORCUS_DEBUG_SAX_PARSER
177  cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "' (self-closing)" << endl;
178 #endif
179  return;
180  }
181  else if (c == '>')
182  {
183  // End of opening element: <element>
184  next();
185  elem.end_pos = offset();
186  nest_up();
187  m_handler.start_element(elem);
188  reset_buffer_pos();
189 #if ORCUS_DEBUG_SAX_PARSER
190  cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
191 #endif
192  return;
193  }
194  else
195  attribute();
196  }
197 }
198 
199 template<typename _Handler, typename _Config>
200 void sax_parser<_Handler,_Config>::element_close(std::ptrdiff_t begin_pos)
201 {
202  assert(cur_char() == '/');
203  nest_down();
204  next_check();
205  sax::parser_element elem;
206  element_name(elem, begin_pos);
207 
208  if (cur_char() != '>')
209  throw sax::malformed_xml_error("expected '>' to close the element.", offset());
210  next();
211  elem.end_pos = offset();
212 
213  m_handler.end_element(elem);
214 #if ORCUS_DEBUG_SAX_PARSER
215  cout << "element_close: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
216 #endif
217  if (!m_nest_level)
218  m_root_elem_open = false;
219 }
220 
221 template<typename _Handler, typename _Config>
222 void sax_parser<_Handler,_Config>::special_tag()
223 {
224  assert(cur_char() == '!');
225  // This can be either <![CDATA, <!--, or <!DOCTYPE.
226  size_t len = remains();
227  if (len < 2)
228  throw sax::malformed_xml_error("special tag too short.", offset());
229 
230  switch (next_and_char())
231  {
232  case '-':
233  {
234  // Possibly comment.
235  if (next_and_char() != '-')
236  throw sax::malformed_xml_error("comment expected.", offset());
237 
238  len -= 2;
239  if (len < 3)
240  throw sax::malformed_xml_error("malformed comment.", offset());
241 
242  next();
243  comment();
244  }
245  break;
246  case '[':
247  {
248  // Possibly a CDATA.
249  expects_next("CDATA[", 6);
250  if (has_char())
251  cdata();
252  }
253  break;
254  case 'D':
255  {
256  // check if this is a DOCTYPE.
257  expects_next("OCTYPE", 6);
258  blank();
259  if (has_char())
260  doctype();
261  }
262  break;
263  default:
264  throw sax::malformed_xml_error("failed to parse special tag.", offset());
265  }
266 }
267 
268 template<typename _Handler, typename _Config>
269 void sax_parser<_Handler,_Config>::declaration(const char* name_check)
270 {
271  assert(cur_char() == '?');
272  next_check();
273 
274  // Get the declaration name first.
275  pstring decl_name;
276  name(decl_name);
277 #if ORCUS_DEBUG_SAX_PARSER
278  cout << "sax_parser::declaration: start name='" << decl_name << "'" << endl;
279 #endif
280 
281  if (name_check && decl_name != name_check)
282  {
283  std::ostringstream os;
284  os << "declaration name of '" << name_check << "' was expected, but '" << decl_name << "' was found instead.";
285  throw sax::malformed_xml_error(os.str(), offset());
286  }
287 
288  m_handler.start_declaration(decl_name);
289  blank();
290 
291  // Parse the attributes.
292  while (cur_char_checked() != '?')
293  {
294  attribute();
295  blank();
296  }
297  if (next_char_checked() != '>')
298  throw sax::malformed_xml_error("declaration must end with '?>'.", offset());
299 
300  m_handler.end_declaration(decl_name);
301  reset_buffer_pos();
302  next();
303 #if ORCUS_DEBUG_SAX_PARSER
304  cout << "sax_parser::declaration: end name='" << decl_name << "'" << endl;
305 #endif
306 }
307 
308 template<typename _Handler, typename _Config>
309 void sax_parser<_Handler,_Config>::cdata()
310 {
311  size_t len = remains();
312  assert(len > 3);
313 
314  // Parse until we reach ']]>'.
315  const char* p0 = mp_char;
316  size_t i = 0, match = 0;
317  for (char c = cur_char(); i < len; ++i, c = next_and_char())
318  {
319  if (c == ']')
320  {
321  // Be aware that we may encounter a series of more than two ']'
322  // characters, in which case we'll only count the last two.
323 
324  if (match == 0)
325  // First ']'
326  ++match;
327  else if (match == 1)
328  // Second ']'
329  ++match;
330  }
331  else if (c == '>' && match == 2)
332  {
333  // Found ']]>'.
334  size_t cdata_len = i - 2;
335  m_handler.characters(pstring(p0, cdata_len), false);
336  next();
337  return;
338  }
339  else
340  match = 0;
341  }
342  throw sax::malformed_xml_error("malformed CDATA section.", offset());
343 }
344 
345 template<typename _Handler, typename _Config>
346 void sax_parser<_Handler,_Config>::doctype()
347 {
348  // Parse the root element first.
349  sax::doctype_declaration param;
350  name(param.root_element);
351  blank();
352 
353  // Either PUBLIC or SYSTEM.
354  size_t len = remains();
355  if (len < 6)
356  throw sax::malformed_xml_error("DOCTYPE section too short.", offset());
357 
358  param.keyword = sax::doctype_declaration::keyword_type::dtd_private;
359  char c = cur_char();
360  if (c == 'P')
361  {
362  if (next_and_char() != 'U' || next_and_char() != 'B' || next_and_char() != 'L' || next_and_char() != 'I' || next_and_char() != 'C')
363  throw sax::malformed_xml_error("malformed DOCTYPE section.", offset());
364 
365  param.keyword = sax::doctype_declaration::keyword_type::dtd_public;
366  }
367  else if (c == 'S')
368  {
369  if (next_and_char() != 'Y' || next_and_char() != 'S' || next_and_char() != 'T' || next_and_char() != 'E' || next_and_char() != 'M')
370  throw sax::malformed_xml_error("malformed DOCTYPE section.", offset());
371  }
372 
373  next_check();
374  blank();
375  has_char_throw("DOCTYPE section too short.");
376 
377  // Parse FPI.
378  value(param.fpi, false);
379 
380  has_char_throw("DOCTYPE section too short.");
381  blank();
382  has_char_throw("DOCTYPE section too short.");
383 
384  if (cur_char() == '>')
385  {
386  // Optional URI not given. Exit.
387 #if ORCUS_DEBUG_SAX_PARSER
388  cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "'" << endl;
389 #endif
390  m_handler.doctype(param);
391  next();
392  return;
393  }
394 
395  // Parse optional URI.
396  value(param.uri, false);
397 
398  has_char_throw("DOCTYPE section too short.");
399  blank();
400  has_char_throw("DOCTYPE section too short.");
401 
402  if (cur_char() != '>')
403  throw sax::malformed_xml_error("malformed DOCTYPE section - closing '>' expected but not found.", offset());
404 
405 #if ORCUS_DEBUG_SAX_PARSER
406  cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "' uri='" << param.uri << "'" << endl;
407 #endif
408  m_handler.doctype(param);
409  next();
410 }
411 
412 template<typename _Handler, typename _Config>
413 void sax_parser<_Handler,_Config>::characters()
414 {
415  const char* p0 = mp_char;
416  for (; has_char(); next())
417  {
418  if (cur_char() == '<')
419  break;
420 
421  if (cur_char() == '&')
422  {
423  // Text span with one or more encoded characters. Parse using cell buffer.
424  cell_buffer& buf = get_cell_buffer();
425  buf.reset();
426  buf.append(p0, mp_char-p0);
427  characters_with_encoded_char(buf);
428  if (buf.empty())
429  m_handler.characters(pstring(), false);
430  else
431  m_handler.characters(pstring(buf.get(), buf.size()), true);
432  return;
433  }
434  }
435 
436  if (mp_char > p0)
437  {
438  pstring val(p0, mp_char-p0);
439  m_handler.characters(val, false);
440  }
441 }
442 
443 template<typename _Handler, typename _Config>
444 void sax_parser<_Handler,_Config>::attribute()
445 {
446  sax::parser_attribute attr;
447  pstring attr_ns_name, attr_name, attr_value;
448  attribute_name(attr.ns, attr.name);
449 
450 #if ORCUS_DEBUG_SAX_PARSER
451  std::ostringstream os;
452  os << "sax_parser::attribute: ns='" << attr.ns << "', name='" << attr.name << "'";
453 #endif
454 
455  char c = cur_char();
456  if (c != '=')
457  {
458  std::ostringstream os;
459  os << "Attribute must begin with 'name=..'. (ns='" << attr.ns << "', name='" << attr.name << "')";
460  throw sax::malformed_xml_error(os.str(), offset());
461  }
462 
463  next_check();
464  attr.transient = value(attr.value, true);
465  if (attr.transient)
466  // Value is stored in a temporary buffer. Push a new buffer.
467  inc_buffer_pos();
468 
469 #if ORCUS_DEBUG_SAX_PARSER
470  os << " value='" << attr.value << "'" << endl;
471  cout << os.str();
472 #endif
473 
474  m_handler.attribute(attr);
475 }
476 
477 }
478 
479 #endif
480 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Definition: sax_parser.hpp:15
static const uint8_t baseline_version
Definition: sax_parser.hpp:22
Definition: parser_base.hpp:35
Definition: base64.hpp:15
Definition: sax_parser.hpp:30
Definition: sax_parser_base.hpp:108