@@ -33,7 +33,64 @@ const int kUtf8ReplacementChar = 0xFFFD;
33
33
// Wikipedia: http://en.wikipedia.org/wiki/UTF-8#Description
34
34
// RFC 3629: http://tools.ietf.org/html/rfc3629
35
35
// HTML5 Unicode handling:
36
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#decoded-as-utf-8,-with-error-handling
36
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#preprocessing-the-input-stream
37
+ //
38
+ // This implementation is based on a DFA-based decoder by Bjoern Hoehrmann
39
+ // <bjoern@hoehrmann.de>. We wrap the inner table-based decoder routine in our
40
+ // own handling for newlines, tabs, invalid continuation bytes, and other
41
+ // conditions that the HTML5 spec fully specifies but normal UTF8 decoders do
42
+ // not handle.
43
+ // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. Full text of
44
+ // the license agreement and code follows.
45
+
46
+ // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
47
+
48
+ // Permission is hereby granted, free of charge, to any person obtaining a copy
49
+ // of this software and associated documentation files (the "Software"), to deal
50
+ // in the Software without restriction, including without limitation the rights to
51
+ // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
52
+ // of the Software, and to permit persons to whom the Software is furnished to do
53
+ // so, subject to the following conditions:
54
+
55
+ // The above copyright notice and this permission notice shall be included in
56
+ // all copies or substantial portions of the Software.
57
+
58
+ #define UTF8_ACCEPT 0
59
+ #define UTF8_REJECT 12
60
+
61
+ static const uint8_t utf8d [] = {
62
+ // The first part of the table maps bytes to character classes
63
+ // to reduce the size of the transition table and create bitmasks.
64
+ 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,
65
+ 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,
66
+ 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,
67
+ 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,
68
+ 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 , 9 ,9 ,9 ,9 ,9 ,9 ,9 ,9 ,9 ,9 ,9 ,9 ,9 ,9 ,9 ,9 ,
69
+ 7 ,7 ,7 ,7 ,7 ,7 ,7 ,7 ,7 ,7 ,7 ,7 ,7 ,7 ,7 ,7 , 7 ,7 ,7 ,7 ,7 ,7 ,7 ,7 ,7 ,7 ,7 ,7 ,7 ,7 ,7 ,7 ,
70
+ 8 ,8 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 , 2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,
71
+ 10 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,4 ,3 ,3 , 11 ,6 ,6 ,6 ,5 ,8 ,8 ,8 ,8 ,8 ,8 ,8 ,8 ,8 ,8 ,8 ,
72
+
73
+ // The second part is a transition table that maps a combination
74
+ // of a state of the automaton and a character class to a state.
75
+ 0 ,12 ,24 ,36 ,60 ,96 ,84 ,12 ,12 ,12 ,48 ,72 , 12 ,12 ,12 ,12 ,12 ,12 ,12 ,12 ,12 ,12 ,12 ,12 ,
76
+ 12 , 0 ,12 ,12 ,12 ,12 ,12 , 0 ,12 , 0 ,12 ,12 , 12 ,24 ,12 ,12 ,12 ,12 ,12 ,24 ,12 ,24 ,12 ,12 ,
77
+ 12 ,12 ,12 ,12 ,12 ,12 ,12 ,24 ,12 ,12 ,12 ,12 , 12 ,24 ,12 ,12 ,12 ,12 ,12 ,12 ,12 ,24 ,12 ,12 ,
78
+ 12 ,12 ,12 ,12 ,12 ,12 ,12 ,36 ,12 ,36 ,12 ,12 , 12 ,36 ,12 ,12 ,12 ,12 ,12 ,36 ,12 ,36 ,12 ,12 ,
79
+ 12 ,36 ,12 ,12 ,12 ,12 ,12 ,12 ,12 ,12 ,12 ,12 ,
80
+ };
81
+
82
+ uint32_t static inline decode (uint32_t * state , uint32_t * codep , uint32_t byte ) {
83
+ uint32_t type = utf8d [byte ];
84
+
85
+ * codep = (* state != UTF8_ACCEPT ) ?
86
+ (byte & 0x3fu ) | (* codep << 6 ) :
87
+ (0xff >> type ) & (byte );
88
+
89
+ * state = utf8d [256 + * state + type ];
90
+ return * state ;
91
+ }
92
+
93
+ // END COPIED CODE.
37
94
38
95
// Adds a decoding error to the parser's error list, based on the current state
39
96
// of the Utf8Iterator.
@@ -63,110 +120,58 @@ static void add_error(Utf8Iterator* iter, GumboErrorType type) {
63
120
// When this method returns, iter->_width and iter->_current will be set
64
121
// appropriately, as well as any error flags.
65
122
static void read_char (Utf8Iterator * iter ) {
66
- unsigned char c ;
67
- unsigned char mask = '\0' ;
68
- int is_bad_char = false;
69
-
70
- c = (unsigned char ) * iter -> _start ;
71
- if (c < 0x80 ) {
72
- // Valid one-byte sequence.
73
- iter -> _width = 1 ;
74
- mask = 0xFF ;
75
- } else if (c < 0xC0 ) {
76
- // Continuation character not following a multibyte sequence.
77
- // The HTML5 spec here says to consume the byte and output a replacement
78
- // character.
79
- iter -> _width = 1 ;
80
- is_bad_char = true;
81
- } else if (c < 0xE0 ) {
82
- iter -> _width = 2 ;
83
- mask = 0x1F ; // 00011111 in binary.
84
- if (c < 0xC2 ) {
85
- // Overlong encoding; error according to UTF8/HTML5 spec.
86
- is_bad_char = true;
87
- }
88
- } else if (c < 0xF0 ) {
89
- iter -> _width = 3 ;
90
- mask = 0xF ; // 00001111 in binary.
91
- } else if (c < 0xF5 ) {
92
- iter -> _width = 4 ;
93
- mask = 0x7 ; // 00000111 in binary.
94
- } else if (c < 0xF8 ) {
95
- // The following cases are all errors, but we need to handle them separately
96
- // so that we consume the proper number of bytes from the input stream
97
- // before replacing them with the replacement char. The HTML5 spec
98
- // specifies that we should consume the shorter of the length specified by
99
- // the first bit or the run leading up to the first non-continuation
100
- // character.
101
- iter -> _width = 5 ;
102
- is_bad_char = true;
103
- } else if (c < 0xFC ) {
104
- iter -> _width = 6 ;
105
- is_bad_char = true;
106
- } else if (c < 0xFE ) {
107
- iter -> _width = 7 ;
108
- is_bad_char = true;
109
- } else {
110
- iter -> _width = 1 ;
111
- is_bad_char = true;
112
- }
113
-
114
- // Check to make sure we have enough bytes left in the iter to read all that
115
- // we want. If not, we set the iter_truncated flag, mark this as a bad
116
- // character, and adjust the current width so that it consumes the rest of the
117
- // iter.
118
- uint64_t code_point = c & mask ;
119
- if (iter -> _start + iter -> _width > iter -> _end ) {
120
- iter -> _width = iter -> _end - iter -> _start ;
121
- add_error (iter , GUMBO_ERR_UTF8_TRUNCATED );
122
- is_bad_char = true;
123
- }
124
-
125
- // Now we decode continuation bytes, shift them appropriately, and build up
126
- // the appropriate code point.
127
- assert (iter -> _width < 8 );
128
- for (int i = 1 ; i < iter -> _width ; ++ i ) {
129
- c = (unsigned char ) iter -> _start [i ];
130
- if (c < 0x80 || c > 0xBF ) {
131
- // Per HTML5 spec, we don't include the invalid continuation char in the
132
- // run that we consume here.
133
- iter -> _width = i ;
134
- is_bad_char = true;
135
- break ;
136
- }
137
- code_point = (code_point << 6 ) | (c & ~0x80 );
138
- }
139
- if (code_point > 0x10FFFF ) is_bad_char = true;
140
-
141
- // If we had a decode error, set the current code point to the replacement
142
- // character and flip the flag indicating that a decode error occurred.
143
- // Ditto if we have a code point that is explicitly on the list of characters
144
- // prohibited by the HTML5 spec, such as control characters.
145
- if (is_bad_char || utf8_is_invalid_code_point (code_point )) {
146
- add_error (iter , GUMBO_ERR_UTF8_INVALID );
147
- code_point = kUtf8ReplacementChar ;
123
+ if (iter -> _start >= iter -> _end ) {
124
+ // No input left to consume; emit an EOF and set width = 0.
125
+ iter -> _current = -1 ;
126
+ iter -> _width = 0 ;
127
+ return ;
148
128
}
149
129
150
- // This is the special handling for carriage returns that is mandated by the
151
- // HTML5 spec. Since we're looking for particular 7-bit literal characters,
152
- // we operate in terms of chars and only need a check for iter overrun,
153
- // instead of having to read in a full next code point.
154
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
155
- if (code_point == '\r' ) {
156
- const char * next = iter -> _start + iter -> _width ;
157
- if (next < iter -> _end && * next == '\n' ) {
158
- // Advance the iter, as if the carriage return didn't exist.
159
- ++ iter -> _start ;
160
- // Preserve the true offset, since other tools that look at it may be
161
- // unaware of HTML5's rules for converting \r into \n.
162
- ++ iter -> _pos .offset ;
130
+ uint32_t code_point = 0 ;
131
+ uint32_t state = UTF8_ACCEPT ;
132
+ for (const char * c = iter -> _start ; c < iter -> _end ; ++ c ) {
133
+ decode (& state , & code_point , (uint32_t ) (unsigned char ) (* c ));
134
+ if (state == UTF8_ACCEPT ) {
135
+ iter -> _width = c - iter -> _start + 1 ;
136
+ // This is the special handling for carriage returns that is mandated by the
137
+ // HTML5 spec. Since we're looking for particular 7-bit literal characters,
138
+ // we operate in terms of chars and only need a check for iter overrun,
139
+ // instead of having to read in a full next code point.
140
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
141
+ if (code_point == '\r' ) {
142
+ assert (iter -> _width == 1 );
143
+ const char * next = c + 1 ;
144
+ if (next < iter -> _end && * next == '\n' ) {
145
+ // Advance the iter, as if the carriage return didn't exist.
146
+ ++ iter -> _start ;
147
+ // Preserve the true offset, since other tools that look at it may be
148
+ // unaware of HTML5's rules for converting \r into \n.
149
+ ++ iter -> _pos .offset ;
150
+ }
151
+ code_point = '\n' ;
152
+ }
153
+ if (utf8_is_invalid_code_point (code_point )) {
154
+ add_error (iter , GUMBO_ERR_UTF8_INVALID );
155
+ code_point = kUtf8ReplacementChar ;
156
+ }
157
+ iter -> _current = code_point ;
158
+ return ;
159
+ } else if (state == UTF8_REJECT ) {
160
+ // We don't want to consume the invalid continuation byte of a multi-byte
161
+ // run, but we do want to skip past an invalid first byte.
162
+ iter -> _width = c - iter -> _start + (c == iter -> _start );
163
+ iter -> _current = kUtf8ReplacementChar ;
164
+ add_error (iter , GUMBO_ERR_UTF8_INVALID );
165
+ return ;
163
166
}
164
- code_point = '\n' ;
165
167
}
166
-
167
- // At this point, we know we have a valid character as the code point, so we
168
- // set it, and we're done.
169
- iter -> _current = code_point ;
168
+ // If we got here without exiting early, then we've reached the end of the iterator.
169
+ // Add an error for truncated input, set the width to consume the rest of the
170
+ // iterator, and emit a replacement character. The next time we enter this method,
171
+ // it will detect that there's no input to consume and emit an EOF.
172
+ iter -> _current = kUtf8ReplacementChar ;
173
+ iter -> _width = iter -> _end - iter -> _start ;
174
+ add_error (iter , GUMBO_ERR_UTF8_TRUNCATED );
170
175
}
171
176
172
177
static void update_position (Utf8Iterator * iter ) {
@@ -177,7 +182,7 @@ static void update_position(Utf8Iterator* iter) {
177
182
} else if (iter -> _current == '\t' ) {
178
183
int tab_stop = iter -> _parser -> _options -> tab_stop ;
179
184
iter -> _pos .column = ((iter -> _pos .column / tab_stop ) + 1 ) * tab_stop ;
180
- } else {
185
+ } else if ( iter -> _current != -1 ) {
181
186
++ iter -> _pos .column ;
182
187
}
183
188
}
@@ -195,34 +200,19 @@ void utf8iterator_init(
195
200
Utf8Iterator * iter ) {
196
201
iter -> _start = source ;
197
202
iter -> _end = source + source_length ;
198
- iter -> _width = 0 ;
199
203
iter -> _pos .line = 1 ;
200
204
iter -> _pos .column = 1 ;
201
205
iter -> _pos .offset = 0 ;
202
206
iter -> _parser = parser ;
203
- if (source_length ) {
204
- read_char (iter );
205
- } else {
206
- iter -> _current = -1 ;
207
- }
207
+ read_char (iter );
208
208
}
209
209
210
210
void utf8iterator_next (Utf8Iterator * iter ) {
211
- if (iter -> _current == -1 ) {
212
- // If we're already at EOF, bail out before advancing anything to avoid
213
- // reading past the end of the buffer. It's easier to catch this case here
214
- // than litter the code with lots of individual checks for EOF.
215
- return ;
216
- }
217
- iter -> _start += iter -> _width ;
218
211
// We update positions based on the *last* character read, so that the first
219
212
// character following a newline is at column 1 in the next line.
220
213
update_position (iter );
221
- if (iter -> _start < iter -> _end ) {
222
- read_char (iter );
223
- } else { // EOF
224
- iter -> _current = -1 ;
225
- }
214
+ iter -> _start += iter -> _width ;
215
+ read_char (iter );
226
216
}
227
217
228
218
int utf8iterator_current (const Utf8Iterator * iter ) {
0 commit comments