Skip to content
This repository was archived by the owner on Feb 15, 2023. It is now read-only.

Commit 0fd07ca

Browse files
committed
Merge pull request #253 from nostrademons/utf8_decoder
Utf8 decoder performance improvements
2 parents bd15a1d + b6b3fb0 commit 0fd07ca

File tree

2 files changed

+139
-126
lines changed

2 files changed

+139
-126
lines changed

src/utf8.c

Lines changed: 110 additions & 120 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,64 @@ const int kUtf8ReplacementChar = 0xFFFD;
3333
// Wikipedia: http://en.wikipedia.org/wiki/UTF-8#Description
3434
// RFC 3629: http://tools.ietf.org/html/rfc3629
3535
// HTML5 Unicode handling:
36-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#decoded-as-utf-8,-with-error-handling
36+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#preprocessing-the-input-stream
37+
//
38+
// This implementation is based on a DFA-based decoder by Bjoern Hoehrmann
39+
// <bjoern@hoehrmann.de>. We wrap the inner table-based decoder routine in our
40+
// own handling for newlines, tabs, invalid continuation bytes, and other
41+
// conditions that the HTML5 spec fully specifies but normal UTF8 decoders do
42+
// not handle.
43+
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. Full text of
44+
// the license agreement and code follows.
45+
46+
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
47+
48+
// Permission is hereby granted, free of charge, to any person obtaining a copy
49+
// of this software and associated documentation files (the "Software"), to deal
50+
// in the Software without restriction, including without limitation the rights to
51+
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
52+
// of the Software, and to permit persons to whom the Software is furnished to do
53+
// so, subject to the following conditions:
54+
55+
// The above copyright notice and this permission notice shall be included in
56+
// all copies or substantial portions of the Software.
57+
58+
#define UTF8_ACCEPT 0
59+
#define UTF8_REJECT 12
60+
61+
static const uint8_t utf8d[] = {
62+
// The first part of the table maps bytes to character classes, in order
63+
// to reduce the size of the transition table and create bitmasks.
64+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
65+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
66+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
67+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
68+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
69+
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
70+
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
71+
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
72+
73+
// The second part is a transition table that maps a combination
74+
// of a state of the automaton and a character class to a state.
75+
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
76+
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
77+
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
78+
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
79+
12,36,12,12,12,12,12,12,12,12,12,12,
80+
};
81+
82+
uint32_t static inline decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
83+
uint32_t type = utf8d[byte];
84+
85+
*codep = (*state != UTF8_ACCEPT) ?
86+
(byte & 0x3fu) | (*codep << 6) :
87+
(0xff >> type) & (byte);
88+
89+
*state = utf8d[256 + *state + type];
90+
return *state;
91+
}
92+
93+
// END COPIED CODE.
3794

3895
// Adds a decoding error to the parser's error list, based on the current state
3996
// of the Utf8Iterator.
@@ -63,110 +120,58 @@ static void add_error(Utf8Iterator* iter, GumboErrorType type) {
63120
// When this method returns, iter->_width and iter->_current will be set
64121
// appropriately, as well as any error flags.
65122
static void read_char(Utf8Iterator* iter) {
66-
unsigned char c;
67-
unsigned char mask = '\0';
68-
int is_bad_char = false;
69-
70-
c = (unsigned char) *iter->_start;
71-
if (c < 0x80) {
72-
// Valid one-byte sequence.
73-
iter->_width = 1;
74-
mask = 0xFF;
75-
} else if (c < 0xC0) {
76-
// Continuation character not following a multibyte sequence.
77-
// The HTML5 spec here says to consume the byte and output a replacement
78-
// character.
79-
iter->_width = 1;
80-
is_bad_char = true;
81-
} else if (c < 0xE0) {
82-
iter->_width = 2;
83-
mask = 0x1F; // 00011111 in binary.
84-
if (c < 0xC2) {
85-
// Overlong encoding; error according to UTF8/HTML5 spec.
86-
is_bad_char = true;
87-
}
88-
} else if (c < 0xF0) {
89-
iter->_width = 3;
90-
mask = 0xF; // 00001111 in binary.
91-
} else if (c < 0xF5) {
92-
iter->_width = 4;
93-
mask = 0x7; // 00000111 in binary.
94-
} else if (c < 0xF8) {
95-
// The following cases are all errors, but we need to handle them separately
96-
// so that we consume the proper number of bytes from the input stream
97-
// before replacing them with the replacement char. The HTML5 spec
98-
// specifies that we should consume the shorter of the length specified by
99-
// the first bit or the run leading up to the first non-continuation
100-
// character.
101-
iter->_width = 5;
102-
is_bad_char = true;
103-
} else if (c < 0xFC) {
104-
iter->_width = 6;
105-
is_bad_char = true;
106-
} else if (c < 0xFE) {
107-
iter->_width = 7;
108-
is_bad_char = true;
109-
} else {
110-
iter->_width = 1;
111-
is_bad_char = true;
112-
}
113-
114-
// Check to make sure we have enough bytes left in the iter to read all that
115-
// we want. If not, we set the iter_truncated flag, mark this as a bad
116-
// character, and adjust the current width so that it consumes the rest of the
117-
// iter.
118-
uint64_t code_point = c & mask;
119-
if (iter->_start + iter->_width > iter->_end) {
120-
iter->_width = iter->_end - iter->_start;
121-
add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
122-
is_bad_char = true;
123-
}
124-
125-
// Now we decode continuation bytes, shift them appropriately, and build up
126-
// the appropriate code point.
127-
assert(iter->_width < 8);
128-
for (int i = 1; i < iter->_width; ++i) {
129-
c = (unsigned char) iter->_start[i];
130-
if (c < 0x80 || c > 0xBF) {
131-
// Per HTML5 spec, we don't include the invalid continuation char in the
132-
// run that we consume here.
133-
iter->_width = i;
134-
is_bad_char = true;
135-
break;
136-
}
137-
code_point = (code_point << 6) | (c & ~0x80);
138-
}
139-
if (code_point > 0x10FFFF) is_bad_char = true;
140-
141-
// If we had a decode error, set the current code point to the replacement
142-
// character and flip the flag indicating that a decode error occurred.
143-
// Ditto if we have a code point that is explicitly on the list of characters
144-
// prohibited by the HTML5 spec, such as control characters.
145-
if (is_bad_char || utf8_is_invalid_code_point(code_point)) {
146-
add_error(iter, GUMBO_ERR_UTF8_INVALID);
147-
code_point = kUtf8ReplacementChar;
123+
if (iter->_start >= iter->_end) {
124+
// No input left to consume; emit an EOF and set width = 0.
125+
iter->_current = -1;
126+
iter->_width = 0;
127+
return;
148128
}
149129

150-
// This is the special handling for carriage returns that is mandated by the
151-
// HTML5 spec. Since we're looking for particular 7-bit literal characters,
152-
// we operate in terms of chars and only need a check for iter overrun,
153-
// instead of having to read in a full next code point.
154-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
155-
if (code_point == '\r') {
156-
const char* next = iter->_start + iter->_width;
157-
if (next < iter->_end && *next == '\n') {
158-
// Advance the iter, as if the carriage return didn't exist.
159-
++iter->_start;
160-
// Preserve the true offset, since other tools that look at it may be
161-
// unaware of HTML5's rules for converting \r into \n.
162-
++iter->_pos.offset;
130+
uint32_t code_point = 0;
131+
uint32_t state = UTF8_ACCEPT;
132+
for (const char* c = iter->_start; c < iter->_end; ++c) {
133+
decode(&state, &code_point, (uint32_t) (unsigned char) (*c));
134+
if (state == UTF8_ACCEPT) {
135+
iter->_width = c - iter->_start + 1;
136+
// This is the special handling for carriage returns that is mandated by the
137+
// HTML5 spec. Since we're looking for particular 7-bit literal characters,
138+
// we operate in terms of chars and only need a check for iter overrun,
139+
// instead of having to read in a full next code point.
140+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
141+
if (code_point == '\r') {
142+
assert(iter->_width == 1);
143+
const char* next = c + 1;
144+
if (next < iter->_end && *next == '\n') {
145+
// Advance the iter, as if the carriage return didn't exist.
146+
++iter->_start;
147+
// Preserve the true offset, since other tools that look at it may be
148+
// unaware of HTML5's rules for converting \r into \n.
149+
++iter->_pos.offset;
150+
}
151+
code_point = '\n';
152+
}
153+
if (utf8_is_invalid_code_point(code_point)) {
154+
add_error(iter, GUMBO_ERR_UTF8_INVALID);
155+
code_point = kUtf8ReplacementChar;
156+
}
157+
iter->_current = code_point;
158+
return;
159+
} else if (state == UTF8_REJECT) {
160+
// We don't want to consume the invalid continuation byte of a multi-byte
161+
// run, but we do want to skip past an invalid first byte.
162+
iter->_width = c - iter->_start + (c == iter->_start);
163+
iter->_current = kUtf8ReplacementChar;
164+
add_error(iter, GUMBO_ERR_UTF8_INVALID);
165+
return;
163166
}
164-
code_point = '\n';
165167
}
166-
167-
// At this point, we know we have a valid character as the code point, so we
168-
// set it, and we're done.
169-
iter->_current = code_point;
168+
// If we got here without exiting early, then we've reached the end of the iterator.
169+
// Add an error for truncated input, set the width to consume the rest of the
170+
// iterator, and emit a replacement character. The next time we enter this method,
171+
// it will detect that there's no input to consume and emit an EOF (-1).
172+
iter->_current = kUtf8ReplacementChar;
173+
iter->_width = iter->_end - iter->_start;
174+
add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
170175
}
171176

172177
static void update_position(Utf8Iterator* iter) {
@@ -177,7 +182,7 @@ static void update_position(Utf8Iterator* iter) {
177182
} else if(iter->_current == '\t') {
178183
int tab_stop = iter->_parser->_options->tab_stop;
179184
iter->_pos.column = ((iter->_pos.column / tab_stop) + 1) * tab_stop;
180-
} else {
185+
} else if(iter->_current != -1) {
181186
++iter->_pos.column;
182187
}
183188
}
@@ -195,34 +200,19 @@ void utf8iterator_init(
195200
Utf8Iterator* iter) {
196201
iter->_start = source;
197202
iter->_end = source + source_length;
198-
iter->_width = 0;
199203
iter->_pos.line = 1;
200204
iter->_pos.column = 1;
201205
iter->_pos.offset = 0;
202206
iter->_parser = parser;
203-
if (source_length) {
204-
read_char(iter);
205-
} else {
206-
iter->_current = -1;
207-
}
207+
read_char(iter);
208208
}
209209

210210
void utf8iterator_next(Utf8Iterator* iter) {
211-
if (iter->_current == -1) {
212-
// If we're already at EOF, bail out before advancing anything to avoid
213-
// reading past the end of the buffer. It's easier to catch this case here
214-
// than litter the code with lots of individual checks for EOF.
215-
return;
216-
}
217-
iter->_start += iter->_width;
218211
// We update positions based on the *last* character read, so that the first
219212
// character following a newline is at column 1 in the next line.
220213
update_position(iter);
221-
if (iter->_start < iter->_end) {
222-
read_char(iter);
223-
} else { // EOF
224-
iter->_current = -1;
225-
}
214+
iter->_start += iter->_width;
215+
read_char(iter);
226216
}
227217

228218
int utf8iterator_current(const Utf8Iterator* iter) {

tests/utf8.cc

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -171,14 +171,17 @@ TEST_F(Utf8Test, OverlongEncodingWithContinuationByte) {
171171
EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
172172
EXPECT_EQ('\xC0', *utf8iterator_get_char_pointer(&input_));
173173

174+
utf8iterator_next(&input_);
175+
EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
176+
174177
errors_are_expected_ = true;
175178
GumboError* error = GetFirstError();
176179
EXPECT_EQ(GUMBO_ERR_UTF8_INVALID, error->type);
177180
EXPECT_EQ(1, error->position.line);
178181
EXPECT_EQ(1, error->position.column);
179182
EXPECT_EQ(0, error->position.offset);
180183
EXPECT_EQ('\xC0', *error->original_text);
181-
EXPECT_EQ(0xC085, error->v.codepoint);
184+
EXPECT_EQ(0xC0, error->v.codepoint);
182185

183186
utf8iterator_next(&input_);
184187
EXPECT_EQ(-1, utf8iterator_current(&input_));
@@ -284,6 +287,11 @@ TEST_F(Utf8Test, FiveByteCharIsError) {
284287
EXPECT_EQ(1, GetNumErrors());
285288
EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
286289

290+
utf8iterator_next(&input_);
291+
utf8iterator_next(&input_);
292+
utf8iterator_next(&input_);
293+
utf8iterator_next(&input_);
294+
EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
287295
utf8iterator_next(&input_);
288296
EXPECT_EQ('x', utf8iterator_current(&input_));
289297
}
@@ -294,6 +302,12 @@ TEST_F(Utf8Test, SixByteCharIsError) {
294302
EXPECT_EQ(1, GetNumErrors());
295303
EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
296304

305+
utf8iterator_next(&input_);
306+
utf8iterator_next(&input_);
307+
utf8iterator_next(&input_);
308+
utf8iterator_next(&input_);
309+
utf8iterator_next(&input_);
310+
EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
297311
utf8iterator_next(&input_);
298312
EXPECT_EQ('x', utf8iterator_current(&input_));
299313
}
@@ -304,6 +318,13 @@ TEST_F(Utf8Test, SevenByteCharIsError) {
304318
EXPECT_EQ(1, GetNumErrors());
305319
EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
306320

321+
utf8iterator_next(&input_);
322+
utf8iterator_next(&input_);
323+
utf8iterator_next(&input_);
324+
utf8iterator_next(&input_);
325+
utf8iterator_next(&input_);
326+
utf8iterator_next(&input_);
327+
EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
307328
utf8iterator_next(&input_);
308329
EXPECT_EQ('x', utf8iterator_current(&input_));
309330
}
@@ -329,9 +350,9 @@ TEST_F(Utf8Test, InvalidControlCharIsError) {
329350
}
330351

331352
TEST_F(Utf8Test, TruncatedInput) {
332-
ResetText("\xF8\xA7");
353+
ResetText("\xF1\xA7");
333354

334-
EXPECT_EQ(2, GetNumErrors());
355+
EXPECT_EQ(1, GetNumErrors());
335356
EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
336357

337358
errors_are_expected_ = true;
@@ -340,15 +361,17 @@ TEST_F(Utf8Test, TruncatedInput) {
340361
EXPECT_EQ(1, error->position.line);
341362
EXPECT_EQ(1, error->position.column);
342363
EXPECT_EQ(0, error->position.offset);
343-
EXPECT_EQ('\xF8', *error->original_text);
344-
EXPECT_EQ(0xF8A7, error->v.codepoint);
364+
EXPECT_EQ('\xF1', *error->original_text);
365+
EXPECT_EQ(0xF1A7, error->v.codepoint);
345366

346367
utf8iterator_next(&input_);
347368
EXPECT_EQ(-1, utf8iterator_current(&input_));
348369
}
349370

350371
TEST_F(Utf8Test, Html5SpecExample) {
351-
// http://www.whatwg.org/specs/web-apps/current-work/epub.html#utf-8
372+
// This example has since been removed from the spec, and the spec has been
373+
// changed to reference the Unicode Standard 6.2, 5.22 "Best practices for
374+
// U+FFFD substitution."
352375
ResetText("\x41\x98\xBA\x42\xE2\x98\x43\xE2\x98\xBA\xE2\x98");
353376

354377
EXPECT_EQ('A', utf8iterator_current(&input_));

0 commit comments

Comments
 (0)