6 #include <wordring/whatwg/html/parsing/atom_tbl.hpp>
7 #include <wordring/whatwg/html/parsing/input_stream.hpp>
8 #include <wordring/whatwg/html/parsing/parser_defs.hpp>
9 #include <wordring/whatwg/html/parsing/token.hpp>
11 #include <wordring/whatwg/infra/infra.hpp>
15 #include <type_traits>
28 template <
typename T,
typename NodeTraits>
37 using traits = NodeTraits;
45 using base_type::begin;
57 state_type m_return_state;
58 std::u32string m_temporary_buffer;
75 std::u32string m_last_start_tag_name;
77 char32_t m_character_reference_code;
84 , m_return_state(nullptr)
86 , m_character_reference_code(0)
97 m_return_state =
nullptr;
98 m_temporary_buffer.clear();
109 m_last_start_tag_name.clear();
111 m_character_reference_code = 0;
119 m_start_tag_token.clear();
120 return m_start_tag_token;
123 tag_token& create_end_tag_token()
126 m_end_tag_token.clear();
127 return m_start_tag_token;
130 void create_comment_token(char32_t
const* data = U
"")
132 m_comment_token.m_data = data;
135 DOCTYPE_token& create_DOCTYPE_token()
137 m_DOCTYPE_token.clear();
138 return m_DOCTYPE_token;
149 return m_start_tag_token;
152 bool is_appropriate_end_tag_token(
tag_token const& token)
156 if (m_last_start_tag_name.empty() || (m_last_start_tag_name != token.m_tag_name))
return false;
176 return m_comment_token;
179 DOCTYPE_token& current_DOCTYPE_token()
181 return m_DOCTYPE_token;
196 this_type
const* P =
static_cast<this_type const*
>(
this);
197 return traits::get_namespace_name(P->adjusted_current_node().m_it) == ns_name::HTML;
207 auto it1 = al.begin();
208 auto it2 = std::prev(al.end(), 1);
209 while (it1 != it2)
if (it1++->m_name == al.current().m_name) al.current().m_omitted =
true;
214 template <
typename Token>
215 void emit_token(Token& token)
217 this_type* P =
static_cast<this_type*
>(
this);
219 if constexpr (std::is_base_of_v<tag_token, Token>)
224 auto tag_it = tag_atom_tbl.find(t.m_tag_name);
225 t.m_tag_name_id = (tag_it == tag_atom_tbl.end()) ?
static_cast<tag_name
>(0) : tag_it->second;
229 m_last_start_tag_name = m_start_tag_token.m_tag_name;
230 P->on_emit_token(m_start_tag_token);
234 else P->on_emit_token(token);
237 if constexpr (std::is_same_v<Token, start_tag_token>)
239 if (token.m_self_closing_flag && !token.m_acknowledged_self_closing_flag)
241 report_error(error_name::non_void_html_element_start_tag_with_trailing_solidus);
246 void emit_token(char32_t cp)
248 this_type* P =
static_cast<this_type*
>(
this);
250 m_character_token.m_data = cp;
251 P->on_emit_token(m_character_token);
254 void emit_token(end_of_file_token)
256 this_type* P =
static_cast<this_type*
>(
this);
257 P->on_emit_token(m_end_of_file_token);
262 void change_state(state_type st) { m_state = st; }
264 void return_state(state_type st) { m_return_state = st; }
266 state_type return_state()
const {
return m_return_state; }
274 on_emit_code_point();
278 bool consumed_as_part_of_attribute()
280 if (m_return_state == attribute_value_double_quoted_state
281 || m_return_state == attribute_value_single_quoted_state
282 || m_return_state == attribute_value_unquoted_state)
return true;
287 void flush_code_points_consumed_as_character_reference()
289 if (consumed_as_part_of_attribute())
for (char32_t cp : m_temporary_buffer)
current_attribute().m_value.push_back(cp);
290 else for (char32_t cp : m_temporary_buffer) emit_token(cp);
295 void on_emit_code_point()
316 return_state(data_state);
317 change_state(character_reference_state);
320 change_state(tag_open_state);
346 return_state(RCDATA_state);
347 change_state(character_reference_state);
350 change_state(RCDATA_less_than_sign_state);
354 emit_token(U
'\xFFFD');
376 change_state(RAWTEXT_less_than_sign_state);
380 emit_token(U
'\xFFFD');
402 change_state(script_data_less_than_sign_state);
406 emit_token(U
'\xFFFD');
428 emit_token(U
'\xFFFD');
451 change_state(markup_declaration_open_state);
454 change_state(end_tag_open_state);
457 report_error(error_name::unexpected_question_mark_instead_of_tag_name);
458 create_comment_token();
463 if (is_ascii_alpha(cp))
465 create_start_tag_token();
470 report_error(error_name::invalid_first_character_of_tag_name);
489 if (is_ascii_alpha(cp))
491 create_end_tag_token();
499 change_state(data_state);
503 report_error(error_name::invalid_first_character_of_tag_name);
504 create_comment_token();
526 change_state(before_attribute_name_state);
529 change_state(self_closing_start_tag_state);
532 change_state(data_state);
541 if (is_ascii_upper_alpha(cp))
555 if (!
eof() && cp == U
'/')
557 m_temporary_buffer.clear();
558 change_state(RCDATA_end_tag_open_state);
571 if (!
eof() && is_ascii_alpha(cp))
573 create_end_tag_token();
596 if (!is_appropriate_end_tag_token(m_end_tag_token))
goto AnythingElse;
597 change_state(before_attribute_name_state);
600 if (!is_appropriate_end_tag_token(m_end_tag_token))
goto AnythingElse;
601 change_state(self_closing_start_tag_state);
604 if (!is_appropriate_end_tag_token(m_end_tag_token))
goto AnythingElse;
605 change_state(data_state);
610 if (is_ascii_upper_alpha(cp))
613 m_temporary_buffer.push_back(cp);
617 if (is_ascii_lower_alpha(cp))
620 m_temporary_buffer.push_back(cp);
628 for (char32_t c : m_temporary_buffer) emit_token(c);
637 if (!
eof() && cp == U
'/')
639 m_temporary_buffer.clear();
640 change_state(RAWTEXT_end_tag_open_state);
653 if (!
eof() && is_ascii_alpha(cp))
655 create_end_tag_token();
678 if (!is_appropriate_end_tag_token(m_end_tag_token))
goto AnythingElse;
679 change_state(before_attribute_name_state);
682 if (!is_appropriate_end_tag_token(m_end_tag_token))
goto AnythingElse;
683 change_state(self_closing_start_tag_state);
686 if (!is_appropriate_end_tag_token(m_end_tag_token))
goto AnythingElse;
687 change_state(data_state);
692 if (is_ascii_upper_alpha(cp))
695 m_temporary_buffer.push_back(cp);
699 if (is_ascii_lower_alpha(cp))
702 m_temporary_buffer.push_back(cp);
710 for (char32_t c : m_temporary_buffer) emit_token(c);
724 m_temporary_buffer.clear();
725 change_state(script_data_end_tag_open_state);
728 change_state(script_data_escape_start_state);
746 if (is_ascii_alpha(cp))
748 create_end_tag_token();
749 reconsume(script_data_end_tag_name_state);
772 if (!is_appropriate_end_tag_token(m_end_tag_token))
goto AnythingElse;
773 change_state(before_attribute_name_state);
776 if (!is_appropriate_end_tag_token(m_end_tag_token))
goto AnythingElse;
777 change_state(self_closing_start_tag_state);
780 if (!is_appropriate_end_tag_token(m_end_tag_token))
goto AnythingElse;
781 change_state(data_state);
786 if (is_ascii_upper_alpha(cp))
789 m_temporary_buffer.push_back(cp);
793 if (is_ascii_lower_alpha(cp))
796 m_temporary_buffer.push_back(cp);
804 for (char32_t c : m_temporary_buffer) emit_token(c);
813 if (!
eof() && cp == U
'-')
815 change_state(script_data_escape_start_dash_state);
828 if (!
eof() && cp == U
'-')
830 change_state(script_data_escaped_dash_dash_state);
845 report_error(error_name::eof_in_script_html_comment_like_text);
853 change_state(script_data_escaped_dash_state);
857 change_state(script_data_escaped_less_than_sign_state);
861 emit_token(U
'\xFFFD');
876 report_error(error_name::eof_in_script_html_comment_like_text);
884 change_state(script_data_escaped_dash_dash_state);
888 change_state(script_data_escaped_less_than_sign_state);
892 change_state(script_data_escaped_state);
893 emit_token(U
'\xFFFD');
896 change_state(script_data_escaped_state);
909 report_error(error_name::eof_in_script_html_comment_like_text);
920 change_state(script_data_escaped_less_than_sign_state);
923 change_state(script_data_state);
928 change_state(script_data_escaped_state);
929 emit_token(U
'\xFFFD');
932 change_state(script_data_escaped_state);
947 m_temporary_buffer.clear();
948 change_state(script_data_escaped_end_tag_open_state);
952 if (is_ascii_alpha(cp))
954 m_temporary_buffer.clear();
956 reconsume(script_data_double_escape_start_state);
972 if (is_ascii_alpha(cp))
974 create_end_tag_token();
975 reconsume(script_data_escaped_end_tag_name_state);
998 if (!is_appropriate_end_tag_token(m_end_tag_token))
goto AnythingElse;
999 change_state(before_attribute_name_state);
1002 if (!is_appropriate_end_tag_token(m_end_tag_token))
goto AnythingElse;
1003 change_state(self_closing_start_tag_state);
1006 if (!is_appropriate_end_tag_token(m_end_tag_token))
goto AnythingElse;
1007 change_state(data_state);
1012 if (is_ascii_upper_alpha(cp))
1015 m_temporary_buffer.push_back(cp);
1019 if (is_ascii_lower_alpha(cp))
1022 m_temporary_buffer.push_back(cp);
1030 for (char32_t c : m_temporary_buffer) emit_token(c);
1049 if (m_temporary_buffer == U
"script") change_state(script_data_double_escaped_state);
1050 else change_state(before_attribute_name_state);
1055 if (is_ascii_upper_alpha(cp))
1057 m_temporary_buffer.push_back(cp + 0x20);
1062 if (is_ascii_lower_alpha(cp))
1064 m_temporary_buffer.push_back(cp);
1080 report_error(error_name::eof_in_script_html_comment_like_text);
1088 change_state(script_data_double_escaped_dash_state);
1092 change_state(script_data_double_escaped_less_than_sign_state);
1097 emit_token(U
'\xFFFD');
1111 report_error(error_name::eof_in_script_html_comment_like_text);
1119 change_state(script_data_double_escaped_dash_dash_state);
1123 change_state(script_data_double_escaped_less_than_sign_state);
1128 change_state(script_data_double_escaped_state);
1129 emit_token(U
'\xFFFD');
1133 change_state(script_data_double_escaped_state);
1144 report_error(error_name::eof_in_script_html_comment_like_text);
1155 change_state(script_data_double_escaped_less_than_sign_state);
1159 change_state(script_data_state);
1164 change_state(script_data_double_escaped_state);
1165 emit_token(U
'\xFFFD');
1169 change_state(script_data_double_escaped_state);
1178 if (!
eof() && cp == U
'/')
1180 m_temporary_buffer.clear();
1181 change_state(script_data_double_escape_end_state);
1186 reconsume(script_data_double_escaped_state);
1204 if (m_temporary_buffer == U
"script") change_state(script_data_escaped_state);
1205 else change_state(script_data_double_escaped_state);
1210 if (is_ascii_upper_alpha(cp))
1212 m_temporary_buffer.push_back(cp + 0x20);
1217 if (is_ascii_lower_alpha(cp))
1219 m_temporary_buffer.push_back(cp);
1225 reconsume(script_data_double_escaped_state);
1251 report_error(error_name::unexpected_equals_sign_before_attribute_name);
1254 change_state(attribute_name_state);
1286 change_state(before_attribute_value_state);
1295 report_error(error_name::unexpected_character_in_attribute_name);
1299 if (is_ascii_upper_alpha(cp))
1329 change_state(self_closing_start_tag_state);
1332 change_state(before_attribute_value_state);
1335 change_state(data_state);
1359 change_state(attribute_value_double_quoted_state);
1362 change_state(attribute_value_single_quoted_state);
1366 change_state(data_state);
1372 reconsume(attribute_value_unquoted_state);
1390 change_state(after_attribute_value_quoted_state);
1393 return_state(attribute_value_double_quoted_state);
1394 change_state(character_reference_state);
1420 change_state(after_attribute_value_quoted_state);
1423 return_state(attribute_value_single_quoted_state);
1424 change_state(character_reference_state);
1453 change_state(before_attribute_name_state);
1456 return_state(attribute_value_unquoted_state);
1457 change_state(character_reference_state);
1460 change_state(data_state);
1472 report_error(error_name::unexpected_character_in_unquoted_attribute_value);
1498 change_state(before_attribute_name_state);
1501 change_state(self_closing_start_tag_state);
1504 change_state(data_state);
1509 report_error(error_name::missing_whitespace_between_attributes);
1528 change_state(data_state);
1544 emit_token(current_comment_token());
1552 change_state(data_state);
1553 emit_token(current_comment_token());
1557 current_comment_token().m_data.push_back(U
'\xFFFD');
1561 current_comment_token().m_data.push_back(cp);
1567 std::size_t constexpr n = std::max({ std::size(U
"--") - 1, std::size(U
"doctype") - 1, std::size(U
"[CDATA[") - 1 });
1568 if (!
fill(n))
return;
1570 if (
match(U
"--",
false,
false))
1572 consume(std::size(U
"--") - 1);
1573 create_comment_token();
1574 change_state(comment_start_state);
1579 if (
match(U
"doctype",
false,
true))
1581 consume(std::size(U
"doctype") - 1);
1582 change_state(DOCTYPE_state);
1587 if (
match(U
"[CDATA[",
false,
false))
1589 consume(std::size(U
"[CDATA[") - 1);
1591 this_type
const* P =
static_cast<this_type const*
>(
this);
1593 if (!P->m_stack.empty())
1597 change_state(CDATA_section_state);
1604 create_comment_token(U
"[CDATA[");
1605 change_state(bogus_comment_state);
1611 create_comment_token();
1612 change_state(bogus_comment_state);
1626 change_state(comment_start_dash_state);
1629 report_error(error_name::abrupt_closing_of_empty_comment);
1630 change_state(data_state);
1631 emit_token(current_comment_token());
1647 emit_token(current_comment_token());
1655 change_state(comment_end_state);
1658 report_error(error_name::abrupt_closing_of_empty_comment);
1659 change_state(data_state);
1660 emit_token(current_comment_token());
1664 current_comment_token().m_data.push_back(U
'-');
1676 emit_token(current_comment_token());
1684 current_comment_token().m_data.push_back(cp);
1685 change_state(comment_less_than_sign_state);
1688 change_state(comment_end_dash_state);
1692 emit_token(U
'\xFFFD');
1696 current_comment_token().m_data.push_back(cp);
1709 current_comment_token().m_data.push_back(cp);
1710 change_state(comment_less_than_sign_bang_state);
1713 current_comment_token().m_data.push_back(cp);
1726 if (!
eof() && cp == U
'-')
1728 change_state(comment_less_than_sign_bang_dash_state);
1740 if (!
eof() && cp == U
'-')
1742 change_state(comment_less_than_sign_bang_dash_dash_state);
1778 emit_token(current_comment_token());
1785 change_state(comment_end_state);
1789 current_comment_token().m_data.push_back(U
'-');
1801 emit_token(current_comment_token());
1809 change_state(data_state);
1810 emit_token(current_comment_token());
1813 change_state(comment_end_bang_state);
1816 current_comment_token().m_data.push_back(U
'-');
1820 current_comment_token().m_data.push_back(U
'-');
1821 current_comment_token().m_data.push_back(U
'-');
1833 emit_token(current_comment_token());
1841 current_comment_token().m_data.push_back(U
'-');
1842 current_comment_token().m_data.push_back(U
'-');
1843 current_comment_token().m_data.push_back(U
'!');
1844 change_state(comment_end_dash_state);
1848 change_state(data_state);
1849 emit_token(current_comment_token());
1853 current_comment_token().m_data.push_back(U
'-');
1854 current_comment_token().m_data.push_back(U
'-');
1855 current_comment_token().m_data.push_back(U
'!');
1868 d.m_force_quirks_flag =
true;
1880 change_state(before_DOCTYPE_name_state);
1887 report_error(error_name::missing_whitespace_before_doctype_name);
1900 d.m_force_quirks_flag =
true;
1915 create_DOCTYPE_token();
1916 current_DOCTYPE_token().m_name = U
'\xFFFD';
1917 change_state(DOCTYPE_name_state);
1921 create_DOCTYPE_token();
1922 current_DOCTYPE_token().m_force_quirks_flag =
true;
1923 change_state(data_state);
1924 emit_token(current_DOCTYPE_token());
1928 if (is_ascii_upper_alpha(cp))
1930 create_DOCTYPE_token();
1931 current_DOCTYPE_token().m_name = cp + 0x20;
1932 change_state(DOCTYPE_name_state);
1938 change_state(DOCTYPE_name_state);
1949 current_DOCTYPE_token().m_force_quirks_flag =
true;
1950 emit_token(current_DOCTYPE_token());
1961 change_state(after_DOCTYPE_name_state);
1964 change_state(data_state);
1965 emit_token(current_DOCTYPE_token());
1969 current_DOCTYPE_token().m_name.push_back(U
'\xFFFD');
1973 if (is_ascii_upper_alpha(cp))
1975 current_DOCTYPE_token().m_name.push_back(cp + 0x20);
1979 current_DOCTYPE_token().m_name.push_back(cp);
1985 std::size_t constexpr n = std::max(std::size(U
"public") - 1, std::size(U
"system") - 1);
1986 if (!
fill(n))
return;
1993 current_DOCTYPE_token().m_force_quirks_flag =
true;
1994 emit_token(current_DOCTYPE_token());
2008 change_state(data_state);
2009 emit_token(current_DOCTYPE_token());
2014 if (
match(U
"public",
true,
true))
2016 consume(std::size(U
"public") - 2);
2017 change_state(after_DOCTYPE_public_keyword_state);
2022 if (
match(U
"system",
true,
true))
2024 consume(std::size(U
"system") - 2);
2025 change_state(after_DOCTYPE_system_keyword_state);
2030 report_error(error_name::invalid_character_sequence_after_doctype_name);
2031 current_DOCTYPE_token().m_force_quirks_flag =
true;
2044 current_DOCTYPE_token().m_force_quirks_flag =
true;
2045 emit_token(current_DOCTYPE_token());
2056 change_state(before_DOCTYPE_public_identifier_state);
2059 report_error(error_name::missing_whitespace_after_doctype_public_keyword);
2060 current_DOCTYPE_token().m_public_identifier.clear();
2061 change_state(DOCTYPE_public_identifier_double_quoted_state);
2064 report_error(error_name::missing_whitespace_after_doctype_public_keyword);
2065 current_DOCTYPE_token().m_public_identifier.clear();
2066 change_state(DOCTYPE_public_identifier_single_quoted_state);
2069 report_error(error_name::missing_doctype_public_identifier);
2070 current_DOCTYPE_token().m_force_quirks_flag =
true;
2071 change_state(data_state);
2072 emit_token(current_DOCTYPE_token());
2076 report_error(error_name::missing_quote_before_doctype_public_identifier);
2077 current_DOCTYPE_token().m_force_quirks_flag =
true;
2089 current_DOCTYPE_token().m_force_quirks_flag =
true;
2090 emit_token(current_DOCTYPE_token());
2103 current_DOCTYPE_token().m_public_identifier.clear();
2104 change_state(DOCTYPE_public_identifier_double_quoted_state);
2107 current_DOCTYPE_token().m_public_identifier.clear();
2108 change_state(DOCTYPE_public_identifier_single_quoted_state);
2111 report_error(error_name::missing_doctype_public_identifier);
2112 current_DOCTYPE_token().m_force_quirks_flag =
true;
2113 change_state(data_state);
2114 emit_token(current_DOCTYPE_token());
2118 report_error(error_name::missing_quote_before_doctype_public_identifier);
2119 current_DOCTYPE_token().m_force_quirks_flag =
true;
2131 current_DOCTYPE_token().m_force_quirks_flag =
true;
2132 emit_token(current_DOCTYPE_token());
2140 change_state(after_DOCTYPE_public_identifier_state);
2144 current_DOCTYPE_token().m_public_identifier.push_back(U
'\xFFFD');
2147 report_error(error_name::abrupt_doctype_public_identifier);
2148 current_DOCTYPE_token().m_force_quirks_flag =
true;
2149 change_state(data_state);
2150 emit_token(current_DOCTYPE_token());
2154 current_DOCTYPE_token().m_public_identifier.push_back(cp);
2165 current_DOCTYPE_token().m_force_quirks_flag =
true;
2166 emit_token(current_DOCTYPE_token());
2174 change_state(after_DOCTYPE_public_identifier_state);
2178 current_DOCTYPE_token().m_public_identifier.push_back(U
'\xFFFD');
2181 report_error(error_name::abrupt_doctype_public_identifier);
2182 current_DOCTYPE_token().m_force_quirks_flag =
true;
2183 change_state(data_state);
2184 emit_token(current_DOCTYPE_token());
2188 current_DOCTYPE_token().m_public_identifier.push_back(cp);
2199 current_DOCTYPE_token().m_force_quirks_flag =
true;
2200 emit_token(current_DOCTYPE_token());
2211 change_state(between_DOCTYPE_public_and_system_identifiers_state);
2214 change_state(data_state);
2215 emit_token(current_DOCTYPE_token());
2218 report_error(error_name::missing_whitespace_between_doctype_public_and_system_identifiers);
2219 current_DOCTYPE_token().m_system_identifier.clear();
2220 change_state(DOCTYPE_system_identifier_double_quoted_state);
2223 report_error(error_name::missing_whitespace_between_doctype_public_and_system_identifiers);
2224 current_DOCTYPE_token().m_system_identifier.clear();
2225 change_state(DOCTYPE_system_identifier_single_quoted_state);
2229 report_error(error_name::missing_quote_before_doctype_system_identifier);
2230 current_DOCTYPE_token().m_force_quirks_flag =
true;
2242 current_DOCTYPE_token().m_force_quirks_flag =
true;
2243 emit_token(current_DOCTYPE_token());
2256 change_state(data_state);
2257 emit_token(current_DOCTYPE_token());
2260 current_DOCTYPE_token().m_system_identifier.clear();
2261 change_state(DOCTYPE_system_identifier_double_quoted_state);
2264 current_DOCTYPE_token().m_system_identifier.clear();
2265 change_state(DOCTYPE_system_identifier_single_quoted_state);
2269 report_error(error_name::missing_quote_before_doctype_system_identifier);
2270 current_DOCTYPE_token().m_force_quirks_flag =
true;
2282 current_DOCTYPE_token().m_force_quirks_flag =
true;
2283 emit_token(current_DOCTYPE_token());
2294 change_state(before_DOCTYPE_system_identifier_state);
2297 report_error(error_name::missing_whitespace_after_doctype_system_keyword);
2298 current_DOCTYPE_token().m_system_identifier.clear();
2299 change_state(DOCTYPE_system_identifier_double_quoted_state);
2302 report_error(error_name::missing_whitespace_after_doctype_system_keyword);
2303 current_DOCTYPE_token().m_system_identifier.clear();
2304 change_state(DOCTYPE_system_identifier_single_quoted_state);
2307 report_error(error_name::missing_doctype_system_identifier);
2308 current_DOCTYPE_token().m_force_quirks_flag =
true;
2309 change_state(data_state);
2310 emit_token(current_DOCTYPE_token());
2314 report_error(error_name::missing_quote_before_doctype_system_identifier);
2315 current_DOCTYPE_token().m_force_quirks_flag =
true;
2327 current_DOCTYPE_token().m_force_quirks_flag =
true;
2328 emit_token(current_DOCTYPE_token());
2341 current_DOCTYPE_token().m_system_identifier.clear();
2342 change_state(DOCTYPE_system_identifier_double_quoted_state);
2345 current_DOCTYPE_token().m_system_identifier.clear();
2346 change_state(DOCTYPE_system_identifier_single_quoted_state);
2349 report_error(error_name::missing_doctype_system_identifier);
2350 current_DOCTYPE_token().m_force_quirks_flag =
true;
2351 change_state(data_state);
2352 emit_token(current_DOCTYPE_token());
2356 report_error(error_name::missing_quote_before_doctype_system_identifier);
2357 current_DOCTYPE_token().m_force_quirks_flag =
true;
2369 current_DOCTYPE_token().m_force_quirks_flag =
true;
2370 emit_token(current_DOCTYPE_token());
2378 change_state(after_DOCTYPE_system_identifier_state);
2382 current_DOCTYPE_token().m_system_identifier.push_back(U
'\xFFFD');
2385 report_error(error_name::abrupt_doctype_system_identifier);
2386 current_DOCTYPE_token().m_force_quirks_flag =
true;
2387 change_state(data_state);
2388 emit_token(current_DOCTYPE_token());
2392 current_DOCTYPE_token().m_system_identifier.push_back(cp);
2403 current_DOCTYPE_token().m_force_quirks_flag =
true;
2404 emit_token(current_DOCTYPE_token());
2412 change_state(after_DOCTYPE_system_identifier_state);
2416 current_DOCTYPE_token().m_system_identifier.push_back(U
'\xFFFD');
2419 report_error(error_name::abrupt_doctype_system_identifier);
2420 current_DOCTYPE_token().m_force_quirks_flag =
true;
2421 change_state(data_state);
2422 emit_token(current_DOCTYPE_token());
2426 current_DOCTYPE_token().m_system_identifier.push_back(cp);
2437 current_DOCTYPE_token().m_force_quirks_flag =
true;
2438 emit_token(current_DOCTYPE_token());
2451 change_state(data_state);
2452 emit_token(current_DOCTYPE_token());
2456 report_error(error_name::unexpected_character_after_doctype_system_identifier);
2467 emit_token(current_DOCTYPE_token());
2475 change_state(data_state);
2476 emit_token(current_DOCTYPE_token());
2498 change_state(CDATA_section_bracket_state);
2510 if (!
eof() && cp == U
']')
2512 change_state(CDATA_section_end_state);
2533 change_state(data_state);
2546 m_temporary_buffer.clear();
2547 m_temporary_buffer.push_back(U
'&');
2553 if (is_ascii_alphanumeric(cp))
2555 reconsume(named_character_reference_state);
2561 m_temporary_buffer.push_back(cp);
2562 change_state(numeric_character_reference_state);
2567 flush_code_points_consumed_as_character_reference();
2574 if (!
fill(named_character_reference_max_length + 1))
return;
2576 std::uint32_t len = 0;
2581 char32_t tail = *(begin() + len - 1);
2584 auto it2 = begin() + len;
2585 while (it1 != it2) m_temporary_buffer.push_back(*it1++);
2588 if (begin() != end())
2591 if (consumed_as_part_of_attribute() && tail != U
';' && (cp == U
'=' || is_ascii_alphanumeric(cp)))
2593 flush_code_points_consumed_as_character_reference();
2594 change_state(return_state());
2600 if (tail != U
';')
report_error(error_name::missing_semicolon_after_character_reference);
2602 m_temporary_buffer.clear();
2603 m_temporary_buffer.push_back(a[0]);
2604 if (a[1] != 0) m_temporary_buffer.push_back(a[1]);
2606 flush_code_points_consumed_as_character_reference();
2607 change_state(return_state());
2612 flush_code_points_consumed_as_character_reference();
2613 change_state(ambiguous_ampersand_state);
2625 if (is_ascii_alphanumeric(cp))
2628 else emit_token(cp);
2634 report_error(error_name::unknown_named_character_reference);
2646 m_character_reference_code = 0;
2656 m_temporary_buffer.push_back(cp);
2657 change_state(hexadecimal_character_reference_start_state);
2662 reconsume(decimal_character_reference_start_state);
2670 if (!
eof() && is_ascii_hex_digit(cp))
2672 reconsume(hexadecimal_character_reference_state);
2676 report_error(error_name::absence_of_digits_in_numeric_character_reference);
2677 flush_code_points_consumed_as_character_reference();
2686 if (!
eof() && is_ascii_digit(cp))
2688 reconsume(decimal_character_reference_state);
2692 report_error(error_name::absence_of_digits_in_numeric_character_reference);
2693 flush_code_points_consumed_as_character_reference();
2704 if (is_ascii_hex_digit(cp))
2708 if (is_ascii_digit(cp)) c -= 0x30;
2709 else if (is_ascii_upper_hex_digit(cp)) c -= 0x37;
2712 m_character_reference_code = (m_character_reference_code * 16) + c;
2719 change_state(numeric_character_reference_end_state);
2724 report_error(error_name::missing_semicolon_after_character_reference);
2725 reconsume(numeric_character_reference_end_state);
2735 if (is_ascii_digit(cp))
2737 m_character_reference_code = (m_character_reference_code * 10) + (cp - 0x30);
2743 change_state(numeric_character_reference_end_state);
2748 report_error(error_name::missing_semicolon_after_character_reference);
2749 reconsume(numeric_character_reference_end_state);
2755 char32_t c = m_character_reference_code;
2760 m_character_reference_code = U
'\xFFFD';
2765 report_error(error_name::character_reference_outside_unicode_range);
2766 m_character_reference_code = U
'\xFFFD';
2769 if (is_surrogate(c))
2771 report_error(error_name::surrogate_character_reference);
2772 m_character_reference_code = U
'\xFFFD';
2775 if (is_noncharacter(c))
2777 report_error(error_name::noncharacter_character_reference);
2780 if (c == 0xD || (is_control(c) && !is_ascii_white_space(c)))
2785 auto it = character_reference_code_tbl.find(c);
2786 if (it != character_reference_code_tbl.end()) m_character_reference_code = it->second;
2788 m_temporary_buffer.assign(1, m_character_reference_code);
2789 flush_code_points_consumed_as_character_reference();
2790 change_state(return_state());