Merge pull request #253 from nostrademons/utf8_decoder

nostrademons · nostrademons · commit 0fd07ca67348 · 2014-09-21T20:36:38.000-07:00
Utf8 decoder performance improvements
diff --git a/src/utf8.c b/src/utf8.c
@@ -33,7 +33,64 @@ const int kUtf8ReplacementChar = 0xFFFD;
 // Wikipedia: http://en.wikipedia.org/wiki/UTF-8#Description
 // RFC 3629: http://tools.ietf.org/html/rfc3629
 // HTML5 Unicode handling:
-// http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#decoded-as-utf-8,-with-error-handling
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#preprocessing-the-input-stream
+//
+// This implementation is based on a DFA-based decoder by Bjoern Hoehrmann
+// <bjoern@hoehrmann.de>.  We wrap the inner table-based decoder routine in our
+// own handling for newlines, tabs, invalid continuation bytes, and other
+// conditions that the HTML5 spec fully specifies but normal UTF8 decoders do
+// not handle.
+// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.  Full text of
+// the license agreement and code follows.
+
+// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights to
+// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+// of the Software, and to permit persons to whom the Software is furnished to do
+// so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 12
+
+static const uint8_t utf8d[] = {
+  // The first part of the table maps bytes to character classes
+  // to reduce the size of the transition table and create bitmasks.
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+  // The second part is a transition table that maps a combination
+  // of a state of the automaton and a character class to a state.
+   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+  12,36,12,12,12,12,12,12,12,12,12,12, 
+};
+
+uint32_t static inline decode(uint32_t* state, uint32_t* codep, uint32_t byte) {  // One DFA step; returns the new *state.
+  uint32_t type = utf8d[byte];  // Character class, from the first 256 table entries.
+  // Mid-sequence: append the low 6 bits.  From ACCEPT: restart with the masked byte.
+  *codep = (*state != UTF8_ACCEPT) ?
+    (byte & 0x3fu) | (*codep << 6) :
+    (0xff >> type) & (byte);  // The mask width is chosen by the character class.
+
+  *state = utf8d[256 + *state + type];  // The transition table follows the class map.
+  return *state;
+}
+
+// END COPIED CODE.
 
 // Adds a decoding error to the parser's error list, based on the current state
 // of the Utf8Iterator.
@@ -63,110 +120,58 @@ static void add_error(Utf8Iterator* iter, GumboErrorType type) {
 // When this method returns, iter->_width and iter->_current will be set
 // appropriately, as well as any error flags.
 static void read_char(Utf8Iterator* iter) {
-  unsigned char c;
-  unsigned char mask = '\0';
-  int is_bad_char = false;
-
-  c = (unsigned char) *iter->_start;
-  if (c < 0x80) {
-    // Valid one-byte sequence.
-    iter->_width = 1;
-    mask = 0xFF;
-  } else if (c < 0xC0) {
-    // Continuation character not following a multibyte sequence.
-    // The HTML5 spec here says to consume the byte and output a replacement
-    // character.
-    iter->_width = 1;
-    is_bad_char = true;
-  } else if (c < 0xE0) {
-    iter->_width = 2;
-    mask = 0x1F;                // 00011111 in binary.
-    if (c < 0xC2) {
-      // Overlong encoding; error according to UTF8/HTML5 spec.
-      is_bad_char = true;
-    }
-  } else if (c < 0xF0) {
-    iter->_width = 3;
-    mask = 0xF;                 // 00001111 in binary.
-  } else if (c < 0xF5) {
-    iter->_width = 4;
-    mask = 0x7;                 // 00000111 in binary.
-  } else if (c < 0xF8) {
-    // The following cases are all errors, but we need to handle them separately
-    // so that we consume the proper number of bytes from the input stream
-    // before replacing them with the replacement char.  The HTML5 spec
-    // specifies that we should consume the shorter of the length specified by
-    // the first bit or the run leading up to the first non-continuation
-    // character.
-    iter->_width = 5;
-    is_bad_char = true;
-  } else if (c < 0xFC) {
-    iter->_width = 6;
-    is_bad_char = true;
-  } else if (c < 0xFE) {
-    iter->_width = 7;
-    is_bad_char = true;
-  } else {
-    iter->_width = 1;
-    is_bad_char = true;
-  }
-
-  // Check to make sure we have enough bytes left in the iter to read all that
-  // we want.  If not, we set the iter_truncated flag, mark this as a bad
-  // character, and adjust the current width so that it consumes the rest of the
-  // iter.
-  uint64_t code_point = c & mask;
-  if (iter->_start + iter->_width > iter->_end) {
-    iter->_width = iter->_end - iter->_start;
-    add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
-    is_bad_char = true;
-  }
-
-  // Now we decode continuation bytes, shift them appropriately, and build up
-  // the appropriate code point.
-  assert(iter->_width < 8);
-  for (int i = 1; i < iter->_width; ++i) {
-    c = (unsigned char) iter->_start[i];
-    if (c < 0x80 || c > 0xBF) {
-      // Per HTML5 spec, we don't include the invalid continuation char in the
-      // run that we consume here.
-      iter->_width = i;
-      is_bad_char = true;
-      break;
-    }
-    code_point = (code_point << 6) | (c & ~0x80);
-  }
-  if (code_point > 0x10FFFF) is_bad_char = true;
-
-  // If we had a decode error, set the current code point to the replacement
-  // character and flip the flag indicating that a decode error occurred.
-  // Ditto if we have a code point that is explicitly on the list of characters
-  // prohibited by the HTML5 spec, such as control characters.
-  if (is_bad_char || utf8_is_invalid_code_point(code_point)) {
-    add_error(iter, GUMBO_ERR_UTF8_INVALID);
-    code_point = kUtf8ReplacementChar;
+  if (iter->_start >= iter->_end) {
+    // No input left to consume; emit an EOF and set width = 0.
+    iter->_current = -1;
+    iter->_width = 0;
+    return;
   }
 
-  // This is the special handling for carriage returns that is mandated by the
-  // HTML5 spec.  Since we're looking for particular 7-bit literal characters,
-  // we operate in terms of chars and only need a check for iter overrun,
-  // instead of having to read in a full next code point.
-  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
-  if (code_point == '\r') {
-    const char* next = iter->_start + iter->_width;
-    if (next < iter->_end && *next == '\n') {
-      // Advance the iter, as if the carriage return didn't exist.
-      ++iter->_start;
-      // Preserve the true offset, since other tools that look at it may be
-      // unaware of HTML5's rules for converting \r into \n.
-      ++iter->_pos.offset;
+  uint32_t code_point = 0;
+  uint32_t state = UTF8_ACCEPT;
+  for (const char* c = iter->_start; c < iter->_end; ++c) {
+    decode(&state, &code_point, (uint32_t) (unsigned char) (*c));
+    if (state == UTF8_ACCEPT) {
+      iter->_width = c - iter->_start + 1;
+      // This is the special handling for carriage returns that is mandated by the
+      // HTML5 spec.  Since we're looking for particular 7-bit literal characters,
+      // we operate in terms of chars and only need a check for iter overrun,
+      // instead of having to read in a full next code point.
+      // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
+      if (code_point == '\r') {
+        assert(iter->_width == 1);
+        const char* next = c + 1;
+        if (next < iter->_end && *next == '\n') {
+          // Advance the iter, as if the carriage return didn't exist.
+          ++iter->_start;
+          // Preserve the true offset, since other tools that look at it may be
+          // unaware of HTML5's rules for converting \r into \n.
+          ++iter->_pos.offset;
+        }
+        code_point = '\n';
+      }
+      if (utf8_is_invalid_code_point(code_point)) {
+        add_error(iter, GUMBO_ERR_UTF8_INVALID);
+        code_point = kUtf8ReplacementChar;
+      }
+      iter->_current = code_point;
+      return;
+    } else if (state == UTF8_REJECT) {
+      // We don't want to consume the invalid continuation byte of a multi-byte
+      // run, but we do want to skip past an invalid first byte.
+      iter->_width = c - iter->_start + (c == iter->_start);
+      iter->_current = kUtf8ReplacementChar;
+      add_error(iter, GUMBO_ERR_UTF8_INVALID);
+      return;
     }
-    code_point = '\n';
   }
-
-  // At this point, we know we have a valid character as the code point, so we
-  // set it, and we're done.
-  iter->_current = code_point;
+  // If we got here without exiting early, then we've reached the end of the iterator.
+  // Add an error for truncated input, set the width to consume the rest of the
+  // iterator, and emit a replacement character.  The next time we enter this method,
+  // it will detect that there's no input left to consume and emit an EOF.
+  iter->_current = kUtf8ReplacementChar;
+  iter->_width = iter->_end - iter->_start;
+  add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
 }
 
 static void update_position(Utf8Iterator* iter) {
@@ -177,7 +182,7 @@ static void update_position(Utf8Iterator* iter) {
   } else if(iter->_current == '\t') {
     int tab_stop = iter->_parser->_options->tab_stop;
     iter->_pos.column = ((iter->_pos.column / tab_stop) + 1) * tab_stop;
-  } else {
+  } else if(iter->_current != -1) {
     ++iter->_pos.column;
   }
 }
@@ -195,34 +200,19 @@ void utf8iterator_init(
     Utf8Iterator* iter) {
   iter->_start = source;
   iter->_end = source + source_length;
-  iter->_width = 0;
   iter->_pos.line = 1;
   iter->_pos.column = 1;
   iter->_pos.offset = 0;
   iter->_parser = parser;
-  if (source_length) {
-    read_char(iter);
-  } else {
-    iter->_current = -1;
-  }
+  read_char(iter);
 }
 
 void utf8iterator_next(Utf8Iterator* iter) {
-  if (iter->_current == -1) {
-    // If we're already at EOF, bail out before advancing anything to avoid
-    // reading past the end of the buffer.  It's easier to catch this case here
-    // than litter the code with lots of individual checks for EOF.
-    return;
-  }
-  iter->_start += iter->_width;
   // We update positions based on the *last* character read, so that the first
   // character following a newline is at column 1 in the next line.
   update_position(iter);
-  if (iter->_start < iter->_end) {
-    read_char(iter);
-  } else {  // EOF
-    iter->_current = -1;
-  }
+  iter->_start += iter->_width;
+  read_char(iter);
 }
 
 int utf8iterator_current(const Utf8Iterator* iter) {
diff --git a/tests/utf8.cc b/tests/utf8.cc
@@ -171,14 +171,17 @@ TEST_F(Utf8Test, OverlongEncodingWithContinuationByte) {
   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
   EXPECT_EQ('\xC0', *utf8iterator_get_char_pointer(&input_));
 
+  utf8iterator_next(&input_);
+  EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
+
   errors_are_expected_ = true;
   GumboError* error = GetFirstError();
   EXPECT_EQ(GUMBO_ERR_UTF8_INVALID, error->type);
   EXPECT_EQ(1, error->position.line);
   EXPECT_EQ(1, error->position.column);
   EXPECT_EQ(0, error->position.offset);
   EXPECT_EQ('\xC0', *error->original_text);
-  EXPECT_EQ(0xC085, error->v.codepoint);
+  EXPECT_EQ(0xC0, error->v.codepoint);
 
   utf8iterator_next(&input_);
   EXPECT_EQ(-1, utf8iterator_current(&input_));
@@ -284,6 +287,11 @@ TEST_F(Utf8Test, FiveByteCharIsError) {
   EXPECT_EQ(1, GetNumErrors());
   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
 
+  utf8iterator_next(&input_);
+  utf8iterator_next(&input_);
+  utf8iterator_next(&input_);
+  utf8iterator_next(&input_);
+  EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
   utf8iterator_next(&input_);
   EXPECT_EQ('x', utf8iterator_current(&input_));
 }
@@ -294,6 +302,12 @@ TEST_F(Utf8Test, SixByteCharIsError) {
   EXPECT_EQ(1, GetNumErrors());
   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
 
+  utf8iterator_next(&input_);
+  utf8iterator_next(&input_);
+  utf8iterator_next(&input_);
+  utf8iterator_next(&input_);
+  utf8iterator_next(&input_);
+  EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
   utf8iterator_next(&input_);
   EXPECT_EQ('x', utf8iterator_current(&input_));
 }
@@ -304,6 +318,13 @@ TEST_F(Utf8Test, SevenByteCharIsError) {
   EXPECT_EQ(1, GetNumErrors());
   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
 
+  utf8iterator_next(&input_);
+  utf8iterator_next(&input_);
+  utf8iterator_next(&input_);
+  utf8iterator_next(&input_);
+  utf8iterator_next(&input_);
+  utf8iterator_next(&input_);
+  EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
   utf8iterator_next(&input_);
   EXPECT_EQ('x', utf8iterator_current(&input_));
 }
@@ -329,9 +350,9 @@ TEST_F(Utf8Test, InvalidControlCharIsError) {
 }
 
 TEST_F(Utf8Test, TruncatedInput) {
-  ResetText("\xF8\xA7");
+  ResetText("\xF1\xA7");
 
-  EXPECT_EQ(2, GetNumErrors());
+  EXPECT_EQ(1, GetNumErrors());
   EXPECT_EQ(0xFFFD, utf8iterator_current(&input_));
 
   errors_are_expected_ = true;
@@ -340,15 +361,17 @@ TEST_F(Utf8Test, TruncatedInput) {
   EXPECT_EQ(1, error->position.line);
   EXPECT_EQ(1, error->position.column);
   EXPECT_EQ(0, error->position.offset);
-  EXPECT_EQ('\xF8', *error->original_text);
-  EXPECT_EQ(0xF8A7, error->v.codepoint);
+  EXPECT_EQ('\xF1', *error->original_text);
+  EXPECT_EQ(0xF1A7, error->v.codepoint);
 
   utf8iterator_next(&input_);
   EXPECT_EQ(-1, utf8iterator_current(&input_));
 }
 
 TEST_F(Utf8Test, Html5SpecExample) {
-  // http://www.whatwg.org/specs/web-apps/current-work/epub.html#utf-8
+  // This example has since been removed from the spec, and the spec has been
+  // changed to reference the Unicode Standard 6.2, 5.22 "Best practices for
+  // U+FFFD substitution."
   ResetText("\x41\x98\xBA\x42\xE2\x98\x43\xE2\x98\xBA\xE2\x98");
 
   EXPECT_EQ('A', utf8iterator_current(&input_));