From b4aee06110099f686f3d57259ed2fc1674328c5b Mon Sep 17 00:00:00 2001 From: "J. King" Date: Fri, 11 Sep 2020 12:16:31 -0400 Subject: [PATCH 01/11] Correctly trim nbsp without using mbstring --- Mf2/Parser.php | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Mf2/Parser.php b/Mf2/Parser.php index e8e6dda..b0db775 100644 --- a/Mf2/Parser.php +++ b/Mf2/Parser.php @@ -111,10 +111,9 @@ function collapseWhitespace($str) { } function unicodeTrim($str) { - // this is cheating. TODO: find a better way if this causes any problems - $str = str_replace(mb_convert_encoding(' ', 'UTF-8', 'HTML-ENTITIES'), ' ', $str); - $str = preg_replace('/^\s+/', '', $str); - return preg_replace('/\s+$/', '', $str); + // the binary sequence C2A0 is a UTF-8 non-breaking space character + $str = preg_replace('/^(?:\s|\x{C2}\x{A0})+/', '', $str); + return preg_replace('/(?:\s|\x{C2}\x{A0})+$/', '', $str); } /** From 5c7c883d3db5d0b7a064405f20e6f9d442eb41dd Mon Sep 17 00:00:00 2001 From: "J. King" Date: Fri, 11 Sep 2020 14:56:14 -0400 Subject: [PATCH 02/11] Remove unnecessary use of mbstring The classname prefixes are always ASCII, so mb_ functions are not needed --- Mf2/Parser.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Mf2/Parser.php b/Mf2/Parser.php index b0db775..8d27b13 100644 --- a/Mf2/Parser.php +++ b/Mf2/Parser.php @@ -160,8 +160,8 @@ function nestedMfPropertyNamesFromClass($class) { foreach (explode(' ', $class) as $classname) { foreach ($prefixes as $prefix) { // Check if $classname is a valid property classname for $prefix. - if (mb_substr($classname, 0, mb_strlen($prefix)) == $prefix && $classname != $prefix) { - $propertyName = mb_substr($classname, mb_strlen($prefix)); + if (substr($classname, 0, strlen($prefix)) == $prefix && $classname != $prefix) { + $propertyName = substr($classname, strlen($prefix)); $propertyNames[$propertyName][] = $prefix; } } From 9c3a5a362cda5cd51baf0111d13566800ee844e5 Mon Sep 17 00:00:00 2001 From: "J. 
King" Date: Fri, 11 Sep 2020 19:06:38 -0400 Subject: [PATCH 03/11] Make mbstring in unicodeToHtmlEntities optional It's unclear what problems the function was originally meant to solve; it is equally unclear the problems still exist --- Mf2/Parser.php | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Mf2/Parser.php b/Mf2/Parser.php index 8d27b13..cda6380 100644 --- a/Mf2/Parser.php +++ b/Mf2/Parser.php @@ -93,7 +93,12 @@ function fetch($url, $convertClassic = true, &$curlInfo=null) { * @return string */ function unicodeToHtmlEntities($input) { - return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input)); + // FIXME: This function purportedly works around UTF-8 decoding problems + // in PHP's DOMDocument class circa 2012. Is this still necessary? + if (extension_loaded("mbstring")) { + return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input)); + } + return $input; } /** From 1965f9afac66d2e9af9e95acca60aede5048fa02 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Wed, 7 Apr 2021 12:56:02 -0400 Subject: [PATCH 04/11] Perform charset detection instead of guessing --- Mf2/Parser.php | 45 +++++++++++++++++++-------- tests/Mf2/ClassicMicroformatsTest.php | 2 +- tests/Mf2/ParseImpliedTest.php | 2 +- tests/Mf2/ParserTest.php | 6 ++-- 4 files changed, 36 insertions(+), 19 deletions(-) diff --git a/Mf2/Parser.php b/Mf2/Parser.php index cda6380..a32cdbe 100644 --- a/Mf2/Parser.php +++ b/Mf2/Parser.php @@ -87,18 +87,38 @@ function fetch($url, $convertClassic = true, &$curlInfo=null) { return parse($html, $url, $convertClassic); } -/** - * Unicode to HTML Entities - * @param string $input String containing characters to convert into HTML entities - * @return string - */ -function unicodeToHtmlEntities($input) { - // FIXME: This function purportedly works around UTF-8 decoding problems - // in PHP's DOMDocument class circa 2012. Is this still necessary? 
- if (extension_loaded("mbstring")) { - return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input)); +function load_dom_document($input) { + $d = new \DOMDocument(); + @$d->loadHTML($input, \LIBXML_NOWARNING); + if (substr($input, 0, 3) === "\xEF\xBB\xBF") { + // the document has a byte order mark and is unambiguously UTF-8 + return $d; + } + /* Examine each element in the document to determine whether + there is a charset attribute that signals UTF-8 encoding. + + If there is we add a UTF-8 byte order mark to the input and parse + the document a second time, yielding a correct document. Otherwise + we return the document as-is rather than jump through hoops to + correct it (if indeed it is incorrect). + + The truly correct thing to do is to use a modern HTML parser which + implements the encoding pre-scan algorithm; this should suffice when + such a modern parser is not available, however. + */ + $charset = false; + $labelPattern = '/^\s*(?:unicode-1-1-utf-8|unicode11utf8|unicode20utf8|utf-8|utf8|x-unicode20utf8)\s*$/i'; // See https://encoding.spec.whatwg.org/#names-and-labels + foreach ($d->getElementsByTagName("meta") as $e) { + if (preg_match($labelPattern, $e->getAttribute("charset"))) { + $charset = true; + break; + } + } + if ($charset) { + $d = new \DOMDocument(); + @$d->loadHTML("\xEF\xBB\xBF".$input, \LIBXML_NOWARNING); } - return $input; + return $d; } /** @@ -365,8 +385,7 @@ public function __construct($input, $url = null, $jsonMode = false) { $doc = new \Masterminds\HTML5(array('disable_html_ns' => true)); $doc = $doc->loadHTML($input); } else { - $doc = new DOMDocument(); - @$doc->loadHTML(unicodeToHtmlEntities($input), \LIBXML_NOWARNING); + $doc = load_dom_document($input); } } elseif (is_a($input, 'DOMDocument')) { $doc = clone $input; diff --git a/tests/Mf2/ClassicMicroformatsTest.php b/tests/Mf2/ClassicMicroformatsTest.php index 0150e08..514800a 100644 --- a/tests/Mf2/ClassicMicroformatsTest.php +++ 
b/tests/Mf2/ClassicMicroformatsTest.php @@ -911,7 +911,7 @@ public function testHEntryRelTagInContent() { * @see source: http://jg.typepad.com/ciel/2006/02/daniel_bouluds_.html */ public function testHReviewRelTag() { - $input = '
+ $input = '

Divine Brunch! diff --git a/tests/Mf2/ParseImpliedTest.php b/tests/Mf2/ParseImpliedTest.php index cea18fd..990ca0c 100644 --- a/tests/Mf2/ParseImpliedTest.php +++ b/tests/Mf2/ParseImpliedTest.php @@ -167,7 +167,7 @@ public function testParsesImpliedNameWithExplicitURL() { } public function testMultipleImpliedHCards() { - $input = 'Frances Berriman + $input = 'Frances Berriman Ben Ward diff --git a/tests/Mf2/ParserTest.php b/tests/Mf2/ParserTest.php index e0f96cc..fb588af 100644 --- a/tests/Mf2/ParserTest.php +++ b/tests/Mf2/ParserTest.php @@ -816,10 +816,8 @@ public function testNotMutatingPassedInDOM() { // Use same parsing as Parser::__construct(), twice to have a comparison object. libxml_use_internal_errors(true); - $refDoc = new \DOMDocument(); - @$refDoc->loadHTML(Mf2\unicodeToHtmlEntities($input), \LIBXML_NOWARNING); - $inputDoc = new \DOMDocument(); - @$inputDoc->loadHTML(Mf2\unicodeToHtmlEntities($input), \LIBXML_NOWARNING); + $refDoc = Mf2\load_dom_document($input); + $inputDoc = Mf2\load_dom_document($input); // For completion sake, test PHP itself. $this->assertEquals($refDoc, $inputDoc, 'PHP could not create identical DOMDocument instances.'); From 25c8ff46505b627e1b37a99230c66b50269c39c5 Mon Sep 17 00:00:00 2001 From: "J. 
King" Date: Wed, 7 Apr 2021 13:03:02 -0400 Subject: [PATCH 05/11] Fix standard test harness --- tests/Mf2/MicroformatsTestSuiteTest.php | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/Mf2/MicroformatsTestSuiteTest.php b/tests/Mf2/MicroformatsTestSuiteTest.php index 3f0392f..745d2c6 100644 --- a/tests/Mf2/MicroformatsTestSuiteTest.php +++ b/tests/Mf2/MicroformatsTestSuiteTest.php @@ -74,6 +74,8 @@ public function testMf2FromTestSuite($input, $expectedOutput, $test) $this->markTestIncomplete('This test does not match because we implement a proposed spec: https://github.com/microformats/microformats2-parsing/issues/4#issuecomment-373457720.'); } + // Add a byte order mark to the input to ensure encoding is correctly detected + $input = "\xEF\xBB\xBF".$input; $parser = new TestSuiteParser($input, 'http://example.com/'); $this->assertEquals( $this->makeComparible(json_decode($expectedOutput, true)), From adf2eb8775fdaeb4779905dfce55ab7cac5f102d Mon Sep 17 00:00:00 2001 From: "J. King" Date: Thu, 6 Apr 2023 21:53:13 -0400 Subject: [PATCH 06/11] Perform encoding detection as a browser would This is the result of extension testing against PHP 5.6 through PHP 8.2 in Linux and Windows to determine how encoding detection actually works in DOMDocument --- Mf2/Parser.php | 503 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 477 insertions(+), 26 deletions(-) diff --git a/Mf2/Parser.php b/Mf2/Parser.php index 49ecd13..c0398c0 100644 --- a/Mf2/Parser.php +++ b/Mf2/Parser.php @@ -7,7 +7,6 @@ use DOMXPath; use DOMNode; use DOMNodeList; -use Exception; use SplObjectStorage; use stdClass; @@ -87,36 +86,112 @@ function fetch($url, $convertClassic = true, &$curlInfo=null) { return parse($html, $url, $convertClassic); } -function load_dom_document($input) { - $d = new \DOMDocument(); - @$d->loadHTML($input, \LIBXML_NOWARNING); - if (substr($input, 0, 3) === "\xEF\xBB\xBF") { - // the document has a byte order mark and is unambiguously UTF-8 - return $d; - } - 
/* Examine each element in the document to determine whether - there is a charset attribute that signals UTF-8 encoding. - - If there is we add a UTF-8 byte order mark to the input and parse - the document a second time, yielding a correct document. Otherwise - we return the document as-is rather than jump through hoops to - correct it (if indeed it is incorrect). - - The truly correct thing to do is to use a modern HTML parser which - implements the encoding pre-scan algorithm; this should suffice when - such a modern parser is not available, however. +function load_dom_document($input, $contentType = "") { + /* + The work of this function concerns detecting encoding and making sure + DOMDocument honours the correct document encoding. There are + multiple complications to this, chiefly that DOMDocument's + priorities differ from the modern HTML standard's. HTML mandates the + following from most to least authoritative: + + 1. Byte order mark + 2. Protocol-specified encoding + 3. Meta element in document + 4. Heuristics (optional) + 5. Default encoding (usually windows-1252) + + DOMDocument, on the other hand, takes encoding from the following + sources from most to least authoritative: + + 1. Meta element in document + 2. Byte order mark + 3. Default encoding (ISO 8859-1) + + Thus this function finds the most authoritative encoding in the order + required by the specification, and then inserts a meta element where + necessary i.e. when the document doesn't already have the correct encoding + in a meta element. + + Finally, in the case of UTF-16 input the only reliable indicator is a + byte order mark, so we insert one where needed instead of a meta element. 
*/ - $charset = false; - $labelPattern = '/^\s*(?:unicode-1-1-utf-8|unicode11utf8|unicode20utf8|utf-8|utf8|x-unicode20utf8)\s*$/i'; // See https://encoding.spec.whatwg.org/#names-and-labels + $load = function($input) { + $d = new DOMDocument(); + @$d->loadHTML($input, \LIBXML_NOWARNING); + return $d; + }; + // parse the document and find the first valid meta element with encoding information + // even if this initial parsing uses the wrong encoding, it's more reliable than trying to do this with the string directly + $d = $load($input); + $meta = ""; + $effective = ""; foreach ($d->getElementsByTagName("meta") as $e) { - if (preg_match($labelPattern, $e->getAttribute("charset"))) { - $charset = true; + $meta = Encoding::matchEncodingLabel($e->getAttribute("charset")); + if (strlen($meta)) { + $effective = Encoding::matchEncodingLabel($e->getAttribute("charset"), true); break; } + if (strtolower(trim($e->getAttribute("http-equiv"))) === "content-type") { + $candidate = Encoding::parseContentType($e->getAttribute("content")); + $meta = Encoding::matchEncodingLabel($candidate['charset']); + if (strlen($meta)) { + $effective = Encoding::matchEncodingLabel($candidate['charset'], true); + break; + } + } } - if ($charset) { - $d = new \DOMDocument(); - @$d->loadHTML("\xEF\xBB\xBF".$input, \LIBXML_NOWARNING); + // now find the authoritative encoding according to the standard HTML order + $uthoritative = ""; + // start by looking for BOMs + if (substr($input, 0, 3) === "\xEF\xBB\xBF") { + $authoritative = "UTF-8"; + } elseif (substr($input, 0, 2) === "\xFE\xFF") { + $authoritative = "UTF-16BE"; + } elseif (substr($input, 0, 2) === "\xFF\xFE") { + $authoritative = "UTF-16LE"; + } else { + // now parse the HTTP Content-Type looking for an encoding + $candidate = Encoding::parseContentType($contentType); + $candidate = Encoding::matchEncodingLabel($candidate['charset']); + if ($candidate) { + $authoritative = $candidate; + } elseif ($meta) { + // if the document has a valid 
element, use that + $authoritative = $meta; + } elseif (preg_match(Encoding::UTF8_PATTERN, $input)) { + // if the string is valid UTF-8, we can use this + $authoritative = "UTF-8"; + } else { + // otherwise use the default encoding + $authoritative = Encoding::DEFAULT_ENCODING; + } + } + // if the authoritative encoding does not match the encoding currently in + // the document, add an appropriate meta tag or (for UTF-16) a BOM + if ($authoritative !== $effective) { + // some canonical names are not understood by some environments, so we use aliases where needed + if (array_key_exists($authoritative, Encoding::ENCODING_ALIAS_MAP)) { + $authoritative = Encoding::ENCODING_ALIAS_MAP[$authoritative]; + } + if ($authoritative === "UTF-16BE") { + if (!substr($input, 0, 2) === "\xFE\xFF") { + // add a BOM and reparse + $d = $load("\xFE\xFF".$input); + } + } elseif ($authoritative === "UTF-16BLE") { + if (!substr($input, 0, 2) === "\xFF\xFE") { + // add a BOM and reparse + $d = $load("\xFF\xFE".$input); + } + } else { + $offset = 0; + // look for a DOCTYPE, possibly preceded by a UTF-8 BOM and/or whitespace; we'll insert our meta tag after this to avoid any quirks mode the parser may have + if (preg_match('/^\x{EF}\x{BB}\x{BF}\s*/is', $input, $match)) { + $offset = strlen($match[0]); + } + $tag = ''; + $d = $load(substr($input, 0, $offset) . $tag . 
substr($input, $offset)); + } } return $d; } @@ -2452,3 +2527,379 @@ function removeDotSegments($path) { return $output; } + +/** Tools for determining the encoding of documents */ +class Encoding { + /** @var string The default encoding to use when none is detected in the document */ + const DEFAULT_ENCODING = "windows-1252"; + /** @var array The list of known encodings; see https://encoding.spec.whatwg.org/#names-and-labels */ + const ENCODINGS = [ + 'unicode-1-1-utf-8' => "UTF-8", + 'unicode11utf8' => "UTF-8", + 'unicode20utf8' => "UTF-8", + 'utf-8' => "UTF-8", + 'utf8' => "UTF-8", + 'x-unicode20utf8' => "UTF-8", + '866' => "IBM866", + 'cp866' => "IBM866", + 'csibm866' => "IBM866", + 'ibm866' => "IBM866", + 'csisolatin2' => "ISO-8859-2", + 'iso-8859-2' => "ISO-8859-2", + 'iso-ir-101' => "ISO-8859-2", + 'iso8859-2' => "ISO-8859-2", + 'iso88592' => "ISO-8859-2", + 'iso_8859-2' => "ISO-8859-2", + 'iso_8859-2:1987' => "ISO-8859-2", + 'l2' => "ISO-8859-2", + 'latin2' => "ISO-8859-2", + 'csisolatin3' => "ISO-8859-3", + 'iso-8859-3' => "ISO-8859-3", + 'iso-ir-109' => "ISO-8859-3", + 'iso8859-3' => "ISO-8859-3", + 'iso88593' => "ISO-8859-3", + 'iso_8859-3' => "ISO-8859-3", + 'iso_8859-3:1988' => "ISO-8859-3", + 'l3' => "ISO-8859-3", + 'latin3' => "ISO-8859-3", + 'csisolatin4' => "ISO-8859-4", + 'iso-8859-4' => "ISO-8859-4", + 'iso-ir-110' => "ISO-8859-4", + 'iso8859-4' => "ISO-8859-4", + 'iso88594' => "ISO-8859-4", + 'iso_8859-4' => "ISO-8859-4", + 'iso_8859-4:1988' => "ISO-8859-4", + 'l4' => "ISO-8859-4", + 'latin4' => "ISO-8859-4", + 'csisolatincyrillic' => "ISO-8859-5", + 'cyrillic' => "ISO-8859-5", + 'iso-8859-5' => "ISO-8859-5", + 'iso-ir-144' => "ISO-8859-5", + 'iso8859-5' => "ISO-8859-5", + 'iso88595' => "ISO-8859-5", + 'iso_8859-5' => "ISO-8859-5", + 'iso_8859-5:1988' => "ISO-8859-5", + 'arabic' => "ISO-8859-6", + 'asmo-708' => "ISO-8859-6", + 'csiso88596e' => "ISO-8859-6", + 'csiso88596i' => "ISO-8859-6", + 'csisolatinarabic' => "ISO-8859-6", + 'ecma-114' => 
"ISO-8859-6", + 'iso-8859-6' => "ISO-8859-6", + 'iso-8859-6-e' => "ISO-8859-6", + 'iso-8859-6-i' => "ISO-8859-6", + 'iso-ir-127' => "ISO-8859-6", + 'iso8859-6' => "ISO-8859-6", + 'iso88596' => "ISO-8859-6", + 'iso_8859-6' => "ISO-8859-6", + 'iso_8859-6:1987' => "ISO-8859-6", + 'csisolatingreek' => "ISO-8859-7", + 'ecma-118' => "ISO-8859-7", + 'elot_928' => "ISO-8859-7", + 'greek' => "ISO-8859-7", + 'greek8' => "ISO-8859-7", + 'iso-8859-7' => "ISO-8859-7", + 'iso-ir-126' => "ISO-8859-7", + 'iso8859-7' => "ISO-8859-7", + 'iso88597' => "ISO-8859-7", + 'iso_8859-7' => "ISO-8859-7", + 'iso_8859-7:1987' => "ISO-8859-7", + 'sun_eu_greek' => "ISO-8859-7", + 'csiso88598e' => "ISO-8859-8", + 'csisolatinhebrew' => "ISO-8859-8", + 'hebrew' => "ISO-8859-8", + 'iso-8859-8' => "ISO-8859-8", + 'iso-8859-8-e' => "ISO-8859-8", + 'iso-ir-138' => "ISO-8859-8", + 'iso8859-8' => "ISO-8859-8", + 'iso88598' => "ISO-8859-8", + 'iso_8859-8' => "ISO-8859-8", + 'iso_8859-8:1988' => "ISO-8859-8", + 'visual' => "ISO-8859-8", + 'csiso88598i' => "ISO-8859-8-I", + 'iso-8859-8-i' => "ISO-8859-8-I", + 'logical' => "ISO-8859-8-I", + 'csisolatin6' => "ISO-8859-10", + 'iso-8859-10' => "ISO-8859-10", + 'iso-ir-157' => "ISO-8859-10", + 'iso8859-10' => "ISO-8859-10", + 'iso885910' => "ISO-8859-10", + 'l6' => "ISO-8859-10", + 'latin6' => "ISO-8859-10", + 'iso-8859-13' => "ISO-8859-13", + 'iso8859-13' => "ISO-8859-13", + 'iso885913' => "ISO-8859-13", + 'iso-8859-14' => "ISO-8859-14", + 'iso8859-14' => "ISO-8859-14", + 'iso885914' => "ISO-8859-14", + 'csisolatin9' => "ISO-8859-15", + 'iso-8859-15' => "ISO-8859-15", + 'iso8859-15' => "ISO-8859-15", + 'iso885915' => "ISO-8859-15", + 'iso_8859-15' => "ISO-8859-15", + 'l9' => "ISO-8859-15", + 'iso-8859-16' => "ISO-8859-16", + 'cskoi8r' => "KOI8-R", + 'koi' => "KOI8-R", + 'koi8' => "KOI8-R", + 'koi8-r' => "KOI8-R", + 'koi8_r' => "KOI8-R", + 'koi8-ru' => "KOI8-U", + 'koi8-u' => "KOI8-U", + 'csmacintosh' => "macintosh", + 'mac' => "macintosh", + 'macintosh' => 
"macintosh", + 'x-mac-roman' => "macintosh", + 'dos-874' => "windows-874", + 'iso-8859-11' => "windows-874", + 'iso8859-11' => "windows-874", + 'iso885911' => "windows-874", + 'tis-620' => "windows-874", + 'windows-874' => "windows-874", + 'cp1250' => "windows-1250", + 'windows-1250' => "windows-1250", + 'x-cp1250' => "windows-1250", + 'cp1251' => "windows-1251", + 'windows-1251' => "windows-1251", + 'x-cp1251' => "windows-1251", + 'ansi_x3.4-1968' => "windows-1252", + 'ascii' => "windows-1252", + 'cp1252' => "windows-1252", + 'cp819' => "windows-1252", + 'csisolatin1' => "windows-1252", + 'ibm819' => "windows-1252", + 'iso-8859-1' => "windows-1252", + 'iso-ir-100' => "windows-1252", + 'iso8859-1' => "windows-1252", + 'iso88591' => "windows-1252", + 'iso_8859-1' => "windows-1252", + 'iso_8859-1:1987' => "windows-1252", + 'l1' => "windows-1252", + 'latin1' => "windows-1252", + 'us-ascii' => "windows-1252", + 'windows-1252' => "windows-1252", + 'x-cp1252' => "windows-1252", + 'cp1253' => "windows-1253", + 'windows-1253' => "windows-1253", + 'x-cp1253' => "windows-1253", + 'cp1254' => "windows-1254", + 'csisolatin5' => "windows-1254", + 'iso-8859-9' => "windows-1254", + 'iso-ir-148' => "windows-1254", + 'iso8859-9' => "windows-1254", + 'iso88599' => "windows-1254", + 'iso_8859-9' => "windows-1254", + 'iso_8859-9:1989' => "windows-1254", + 'l5' => "windows-1254", + 'latin5' => "windows-1254", + 'windows-1254' => "windows-1254", + 'x-cp1254' => "windows-1254", + 'cp1255' => "windows-1255", + 'windows-1255' => "windows-1255", + 'x-cp1255' => "windows-1255", + 'cp1256' => "windows-1256", + 'windows-1256' => "windows-1256", + 'x-cp1256' => "windows-1256", + 'cp1257' => "windows-1257", + 'windows-1257' => "windows-1257", + 'x-cp1257' => "windows-1257", + 'cp1258' => "windows-1258", + 'windows-1258' => "windows-1258", + 'x-cp1258' => "windows-1258", + 'x-mac-cyrillic' => "x-mac-cyrillic", + 'x-mac-ukrainian' => "x-mac-cyrillic", + 'chinese' => "GBK", + 'csgb2312' => "GBK", + 
'csiso58gb231280' => "GBK", + 'gb2312' => "GBK", + 'gb_2312' => "GBK", + 'gb_2312-80' => "GBK", + 'gbk' => "GBK", + 'iso-ir-58' => "GBK", + 'x-gbk' => "GBK", + 'gb18030' => "gb18030", + 'big5' => "Big5", + 'big5-hkscs' => "Big5", + 'cn-big5' => "Big5", + 'csbig5' => "Big5", + 'x-x-big5' => "Big5", + 'cseucpkdfmtjapanese' => "EUC-JP", + 'euc-jp' => "EUC-JP", + 'x-euc-jp' => "EUC-JP", + 'csiso2022jp' => "ISO-2022-JP", + 'iso-2022-jp' => "ISO-2022-JP", + 'csshiftjis' => "Shift_JIS", + 'ms932' => "Shift_JIS", + 'ms_kanji' => "Shift_JIS", + 'shift-jis' => "Shift_JIS", + 'shift_jis' => "Shift_JIS", + 'sjis' => "Shift_JIS", + 'windows-31j' => "Shift_JIS", + 'x-sjis' => "Shift_JIS", + 'cseuckr' => "EUC-KR", + 'csksc56011987' => "EUC-KR", + 'euc-kr' => "EUC-KR", + 'iso-ir-149' => "EUC-KR", + 'korean' => "EUC-KR", + 'ks_c_5601-1987' => "EUC-KR", + 'ks_c_5601-1989' => "EUC-KR", + 'ksc5601' => "EUC-KR", + 'ksc_5601' => "EUC-KR", + 'windows-949' => "EUC-KR", + 'csiso2022kr' => "replacement", + 'hz-gb-2312' => "replacement", + 'iso-2022-cn' => "replacement", + 'iso-2022-cn-ext' => "replacement", + 'iso-2022-kr' => "replacement", + 'replacement' => "replacement", + 'unicodefffe' => "UTF-16BE", + 'utf-16be' => "UTF-16BE", + 'csunicode' => "UTF-16LE", + 'iso-10646-ucs-2' => "UTF-16LE", + 'ucs-2' => "UTF-16LE", + 'unicode' => "UTF-16LE", + 'unicodefeff' => "UTF-16LE", + 'utf-16' => "UTF-16LE", + 'utf-16le' => "UTF-16LE", + 'x-user-defined' => "x-user-defined", + ]; + /** @var string The PCRE pattern for a Content-Type HTTP header-field, chopping it up into three main sections */ + const TYPE_PATTERN = <<<'PATTERN' + /^ + [\t\r\n ]* # optional leading whitespace + ([^\/]+) # type + \/ # type-subtype delimiter + ([^;]+) # subtype (possibly with trailing whitespace) + (;.*)? 
# optional parameters, to be parsed separately + [\t\r\n ]* # optional trailing whitespace + $/sx +PATTERN; + /** @var string The PCRE pattern for an HTTP Content-Type header-field's parameters, splitting them into a series of names and values*/ + const PARAM_PATTERN = <<<'PATTERN' + / + [;\t\r\n ]* # parameter delimiter and leading whitespace, all optional + ([^=;]*) # parameter name; may be empty + (?:= # parameter name-value delimiter + ( + "(?:\\"|[^"])*(?:"|$)[^;]* # quoted parameter value and optional garbage + |[^;]* # unquoted parameter value (possibly with trailing whitespace) + ) + )? + ;? # optional trailing parameter delimiter + [\t\r\n ]* # optional trailing whitespace + /sx +PATTERN; + /** @var string The PCRE pattern for an HTTP token, used to validate the nameof a parameter */ + const TOKEN_PATTERN = '/^[A-Za-z0-9!#$%&\'*+\-\.\^_`|~]+$/s'; + /** @var string The PCRE pattern for a "bare" (unquoted) parameter value, used for validation */ + const BARE_VALUE_PATTERN = '/^[\t\x{20}-\x{7E}\x{80}-\x{FF}]+$/s'; + /** @var string The PCRE pattern for a quoted parameter value, used for validation */ + const QUOTED_VALUE_PATTERN = '/^"((?:\\\"|[\t !\x{23}-\x{7E}\x{80}-\x{FF}])*)(?:"|$)/s'; + /** @var string The PCRE pattern for an escaped character in a quoted parameter value, used to unescape via replacement */ + const ESCAPE_PATTERN = '/\\\(.)/s'; + /** @var string The PCRE pattern for all valid UTF-8 characters */ + const UTF8_PATTERN = <<<'PATTERN' + /^(?: + # Single-byte + [\x{00}-\x{7F}]+ + # Two-byte + |[\x{C2}-\x{DF}][\x{80}-\x{BF}] + # Three-byte excluding surrogates + |\x{E0}[\x{A0}-\x{BF}][\x{80}-\x{BF}] + |\x{ED}[\x{80}-\x{9F}][\x{80}-\x{BF}] + |[\x{E1}-\x{EC}\x{EE}\x{EF}][\x{80}-\x{BF}]{2} + # Four-byte + |\x{F0}[\x{90}-\x{BF}][\x{80}-\x{BF}]{2} + |\x{F4}[\x{80}-\x{8F}][\x{80}-\x{BF}]{2} + |[\x{F1}-\x{F3}][\x{80}-\x{BF}]{3} + )*$/sx +PATTERN; + /** @var array A list of standard encoding labels which DOMDocument either does not know or does not 
map to the correct encoding; this is a worst-case list taken from PHP 5.6 on Windows with some exclusions for encodings which are completely unsupported */ + const ENCODING_NAUGHTY_LIST = [ + "unicode-1-1-utf-8", "unicode11utf8", "unicode20utf8", "x-unicode20utf8", + "iso88592", "iso88593", "iso88594", "iso88595", "csiso88596e", + "csiso88596i", "iso-8859-6-e", "iso-8859-6-i", "iso88596", "iso88597", + "sun_eu_greek", "csiso88598e", "iso-8859-8-e", "iso88598", "visual", + "csiso88598i", "iso-8859-8-i", "logical", "iso885910", "iso885913", + "iso885914", "csisolatin9", "iso885915", "l9", "koi", "koi8", "koi8_r", + "x-mac-roman", "dos-874", "iso-8859-11", "iso8859-11", "iso885911", + "tis-620", "x-cp1250", "x-cp1251", "ansi_x3.4-1968", "ascii", "cp819", + "csisolatin1", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", + "iso88591", "iso_8859-1", "iso_8859-1:1987", "l1", "latin1", + "us-ascii", "x-cp1252", "x-cp1253", "iso88599", "x-cp1254", + "x-cp1255", "x-cp1256", "x-cp1257", "cp1258", "windows-1258", + "x-mac-ukrainian", "chinese", "csgb2312", "csiso58gb231280", "gb2312", + "gb_2312", "gb_2312-80", "gbk", "iso-ir-58", "big5", "cn-big5", + "csbig5", "x-x-big5", "x-euc-jp", "ms932", "windows-31j", "x-sjis", + "cseuckr", "euc-kr" + ]; + /** @var array A List of canonical encoding names DOMDocument does not understand, with liases to labels it does understand */ + const ENCODING_ALIAS_MAP = [ + 'windows-1258' => "x-cp1258", + 'GBK' => "x-gbk", + 'Big5' => "big5-hkscs", + 'EUC-KR' => "korean", + ]; + + /** Matches an encoding label to a known encoding name in the WHATWG encoding list + * + * If the label does not match any known encoding, and empty string is returned. + * + * @param string $label The label to match e.g. "utf8", " FOO_bar " + * @param bool $excludeNaughty Whether or not to exclude labels DOMDocument does not understand + * @return string The canonical name of the encoding if matched (e.g. 
"UTF-8"), or an empty string in case of failure + */ + public static function matchEncodingLabel($label, $excludeNaughty = false) { + $label = strtolower(trim($label)); + if ($excludeNaughty && in_array($label, self::ENCODING_NAUGHTY_LIST)) { + return ""; + } + if (array_key_exists($label, self::ENCODINGS)) { + return self::ENCODINGS[$label]; + } + return ""; + } + + /** Parses a Content-Type header-field to extract the type and charset, if any + * + * @param string $type + * @return array An array containing keys for "type" and "charset" + */ + public static function parseContentType($type) { + $out = [ + 'type' => "", + 'charset' => "", + ]; + if (preg_match(self::TYPE_PATTERN, $type, $match)) { + list($mimeType, $type, $subtype, $params) = array_pad($match, 4, ""); + $type = preg_match(self::TOKEN_PATTERN, $type) ? $type : ""; + $subtype = rtrim($subtype, "\t\r\n "); + $subtype = preg_match(self::TOKEN_PATTERN, $subtype) ? $subtype : ""; + if (strlen($type) && strlen($subtype)) { + $out['type'] = strtolower($type) . "/" . strtolower($subtype); + // parse parameters looking for a charset one + if (preg_match_all(self::PARAM_PATTERN, $params, $matches, \PREG_SET_ORDER)) { + foreach ($matches as $match) { + list($param, $name, $value) = array_pad($match, 3, ""); + $name = strtolower(preg_match(self::TOKEN_PATTERN, $name) ? $name : ""); + if ($name === "charset" && strlen($value)) { + if ($value[0] === '"') { + if (preg_match(self::QUOTED_VALUE_PATTERN, $value, $match)) { + $out['charset'] = preg_replace(self::ESCAPE_PATTERN, '$1', $match[1]); + return $out; + } + } else { + $value = rtrim($value, "\t\r\n "); + if (preg_match(self::BARE_VALUE_PATTERN, $value)) { + $out['charset'] = $value; + return $out; + } + } + } + } + } + } + } + return $out; + } +} From 04b947f5d2248117e7092ac419065ebc448efb20 Mon Sep 17 00:00:00 2001 From: "J. 
King" Date: Thu, 6 Apr 2023 21:54:32 -0400 Subject: [PATCH 07/11] Revert changes to tests which are not needed As UTF-8 is now heuristically detected, no explicit encoding declaration is required --- tests/Mf2/ClassicMicroformatsTest.php | 2 +- tests/Mf2/MicroformatsTestSuiteTest.php | 2 -- tests/Mf2/ParseImpliedTest.php | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/Mf2/ClassicMicroformatsTest.php b/tests/Mf2/ClassicMicroformatsTest.php index 89579e7..5b192c3 100644 --- a/tests/Mf2/ClassicMicroformatsTest.php +++ b/tests/Mf2/ClassicMicroformatsTest.php @@ -911,7 +911,7 @@ public function testHEntryRelTagInContent() { * @see source: http://jg.typepad.com/ciel/2006/02/daniel_bouluds_.html */ public function testHReviewRelTag() { - $input = '
+ $input = '

Divine Brunch! diff --git a/tests/Mf2/MicroformatsTestSuiteTest.php b/tests/Mf2/MicroformatsTestSuiteTest.php index 6126f66..855c1fd 100644 --- a/tests/Mf2/MicroformatsTestSuiteTest.php +++ b/tests/Mf2/MicroformatsTestSuiteTest.php @@ -76,8 +76,6 @@ public function testMf2FromTestSuite($input, $expectedOutput, $test) $this->markTestIncomplete('This test does not match because we implement a proposed spec: https://github.com/microformats/microformats2-parsing/issues/4#issuecomment-373457720.'); } - // Add a byte order mark to the input to ensure encoding is correctly detected - $input = "\xEF\xBB\xBF".$input; $parser = new TestSuiteParser($input, 'http://example.com/'); $this->assertEquals( $this->makeComparible(json_decode($expectedOutput, true)), diff --git a/tests/Mf2/ParseImpliedTest.php b/tests/Mf2/ParseImpliedTest.php index 7195581..d806c1a 100644 --- a/tests/Mf2/ParseImpliedTest.php +++ b/tests/Mf2/ParseImpliedTest.php @@ -166,7 +166,7 @@ public function testParsesImpliedNameWithExplicitURL() { } public function testMultipleImpliedHCards() { - $input = 'Frances Berriman + $input = 'Frances Berriman Ben Ward From 69825b6e5e6f2fb0b43bb2e03aef2f6a9e91ab8b Mon Sep 17 00:00:00 2001 From: "J. King" Date: Fri, 7 Apr 2023 16:52:39 -0400 Subject: [PATCH 08/11] Consider BOMs correctly --- Mf2/Parser.php | 71 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 24 deletions(-) diff --git a/Mf2/Parser.php b/Mf2/Parser.php index c0398c0..9322c82 100644 --- a/Mf2/Parser.php +++ b/Mf2/Parser.php @@ -103,9 +103,10 @@ function load_dom_document($input, $contentType = "") { DOMDocument, on the other hand, takes encoding from the following sources from most to least authoritative: - 1. Meta element in document - 2. Byte order mark - 3. Default encoding (ISO 8859-1) + 1. UTF-16 byte order mark + 2. Meta element in document + 3. UTF-8 byte order mark + 4. 
Default encoding (ISO 8859-1) Thus this function finds the most authoritative encoding in the order required by the specification, and then inserts a meta element where @@ -120,34 +121,50 @@ function load_dom_document($input, $contentType = "") { @$d->loadHTML($input, \LIBXML_NOWARNING); return $d; }; - // parse the document and find the first valid meta element with encoding information - // even if this initial parsing uses the wrong encoding, it's more reliable than trying to do this with the string directly - $d = $load($input); $meta = ""; $effective = ""; - foreach ($d->getElementsByTagName("meta") as $e) { - $meta = Encoding::matchEncodingLabel($e->getAttribute("charset")); - if (strlen($meta)) { - $effective = Encoding::matchEncodingLabel($e->getAttribute("charset"), true); - break; - } - if (strtolower(trim($e->getAttribute("http-equiv"))) === "content-type") { - $candidate = Encoding::parseContentType($e->getAttribute("content")); - $meta = Encoding::matchEncodingLabel($candidate['charset']); + // determine the encoding which DOMDocument has detected from the document, starting by looking for a UTF-16 BOM + if (substr($input, 0, 2) === Encoding::BOM_UTF16BE) { + $effective = "UTF-16BE"; + } elseif (substr($input, 0, 2) === Encoding::BOM_UTF16LE) { + $effective = "UTF-16LE"; + } else { + // parse the document and find the first valid meta element with + // encoding information; even if this initial parsing uses the + // wrong encoding, it's more reliable than trying to do this + // with the string directly + // we keep note of both what DOMDocument understands (the $effective + // encoding), as well as what the standard says is valid (the $meta + // encoding) as the two can differ + $d = $load($input); + foreach ($d->getElementsByTagName("meta") as $e) { + $meta = Encoding::matchEncodingLabel($e->getAttribute("charset")); if (strlen($meta)) { - $effective = Encoding::matchEncodingLabel($candidate['charset'], true); + $effective = 
Encoding::matchEncodingLabel($e->getAttribute("charset"), true); break; } + if (strtolower(trim($e->getAttribute("http-equiv"))) === "content-type") { + $candidate = Encoding::parseContentType($e->getAttribute("content")); + $meta = Encoding::matchEncodingLabel($candidate['charset']); + if (strlen($meta)) { + $effective = Encoding::matchEncodingLabel($candidate['charset'], true); + break; + } + } + } + // if no effective encoding was found, check for a UTF-8 BOM + if (!$effective && substr($input, 0, 3) === Encoding::BOM_UTF8) { + $effective = "UTF-8"; } } // now find the authoritative encoding according to the standard HTML order $uthoritative = ""; // start by looking for BOMs - if (substr($input, 0, 3) === "\xEF\xBB\xBF") { + if (substr($input, 0, 3) === Encoding::BOM_UTF8) { $authoritative = "UTF-8"; - } elseif (substr($input, 0, 2) === "\xFE\xFF") { + } elseif (substr($input, 0, 2) === Encoding::BOM_UTF16BE) { $authoritative = "UTF-16BE"; - } elseif (substr($input, 0, 2) === "\xFF\xFE") { + } elseif (substr($input, 0, 2) === Encoding::BOM_UTF16LE) { $authoritative = "UTF-16LE"; } else { // now parse the HTTP Content-Type looking for an encoding @@ -174,14 +191,14 @@ function load_dom_document($input, $contentType = "") { $authoritative = Encoding::ENCODING_ALIAS_MAP[$authoritative]; } if ($authoritative === "UTF-16BE") { - if (!substr($input, 0, 2) === "\xFE\xFF") { + if (!substr($input, 0, 2) === Encoding::BOM_UTF16BE) { // add a BOM and reparse - $d = $load("\xFE\xFF".$input); + $d = $load(Encoding::BOM_UTF16BE.$input); } - } elseif ($authoritative === "UTF-16BLE") { - if (!substr($input, 0, 2) === "\xFF\xFE") { + } elseif ($authoritative === "UTF-16LE") { + if (!substr($input, 0, 2) === Encoding::BOM_UTF16LE) { // add a BOM and reparse - $d = $load("\xFF\xFE".$input); + $d = $load(Encoding::BOM_UTF16LE.$input); } } else { $offset = 0; @@ -2840,6 +2857,12 @@ class Encoding { 'Big5' => "big5-hkscs", 'EUC-KR' => "korean", ]; + /** @var string A UTF-8 byte 
order mark */ + const BOM_UTF8 = "\xEF\xBB\xBF"; + /** @var string A UTF-16 (big-endian) byte order mark */ + const BOM_UTF16BE = "\xFE\xFF"; + /** @var string A UTF-16 (little-endian) byte order mark */ + const BOM_UTF16LE = "\xFF\xFE"; /** Matches an encoding label to a known encoding name in the WHATWG encoding list * From 9e93f7f940c5ba43e451069a2452136812190242 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Sat, 8 Apr 2023 11:16:14 -0400 Subject: [PATCH 09/11] Add after rather than after DOCTYPE --- Mf2/Parser.php | 52 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/Mf2/Parser.php b/Mf2/Parser.php index 9322c82..92209bc 100644 --- a/Mf2/Parser.php +++ b/Mf2/Parser.php @@ -202,8 +202,11 @@ function load_dom_document($input, $contentType = "") { } } else { $offset = 0; - // look for a DOCTYPE, possibly preceded by a UTF-8 BOM and/or whitespace; we'll insert our meta tag after this to avoid any quirks mode the parser may have - if (preg_match('/^\x{EF}\x{BB}\x{BF}\s*/is', $input, $match)) { + // look for an HTML element and insert the meta tag within + // it, or at the nearest appropriate point; if the tag is + // inserted too soon, attributes on or (which + // may hold mf data) can be stripped + if (preg_match(Encoding::HTML_HEADER_PATTERN, $input, $match)) { $offset = strlen($match[0]); } $tag = ''; @@ -2863,6 +2866,51 @@ class Encoding { const BOM_UTF16BE = "\xFE\xFF"; /** @var string A UTF-16 (little-endian) byte order mark */ const BOM_UTF16LE = "\xFF\xFE"; + /** @var string A PCRE pattern which finds the insertion point inside the HTML element of a document; the pattern takes some shortcuts, but nothing which is likely to affect documents which actually have microformat metadata */ + const HTML_HEADER_PATTERN = <<]*> # DOCTYPE or + |) # Comment or + |<\?[^>]*> # Processing instruction or XML declaration + )* # ... zero or more times + (?: start tag + (?:\s+ # ... 
with possible attributes + (?:\s*[^\s=>]* + (?:= + (?: + "[^"]*" + |'[^']*' + |[^ >]* + )? + )? + )* + )? + \/?\s*> + )? + (?: # Followed by... + \s+ # Whitespace or + |]*> # DOCTYPE or + |) # Comment or + |<\?[^>]*> # Processing instruction or XML declaration + )* # ... zero or more times + (?: start tag + (?:\s+ # ... with possible attributes + (?:\s*[^\s=>]* + (?:= + (?: + "[^"]*" + |'[^']*' + |[^ >]* + )? + )? + )* + )? + \/?\s*> + )? + /six +PATTERN; /** Matches an encoding label to a known encoding name in the WHATWG encoding list * From f3cbf76944dd6ea48f95650c9853b4063710bb01 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Sat, 8 Apr 2023 13:32:42 -0400 Subject: [PATCH 10/11] Handle x-user-defined and replacement Neither is likely to appear in a real HTML document; x-user-defined is not actually supported properly, but it's unlikely to be used for HTML --- Mf2/Parser.php | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/Mf2/Parser.php b/Mf2/Parser.php index 92209bc..a6ad4fa 100644 --- a/Mf2/Parser.php +++ b/Mf2/Parser.php @@ -191,15 +191,18 @@ function load_dom_document($input, $contentType = "") { $authoritative = Encoding::ENCODING_ALIAS_MAP[$authoritative]; } if ($authoritative === "UTF-16BE") { - if (!substr($input, 0, 2) === Encoding::BOM_UTF16BE) { + if (substr($input, 0, 2) !== Encoding::BOM_UTF16BE) { // add a BOM and reparse $d = $load(Encoding::BOM_UTF16BE.$input); } } elseif ($authoritative === "UTF-16LE") { - if (!substr($input, 0, 2) === Encoding::BOM_UTF16LE) { + if (substr($input, 0, 2) !== Encoding::BOM_UTF16LE) { // add a BOM and reparse $d = $load(Encoding::BOM_UTF16LE.$input); } + } elseif ($authoritative === "replacement") { + // the replacement encoding is actually a denylist of problematic encodings; the document is reduced to a single replacement character + $d = $load(Encoding::BOM_UTF8."\xEF\xBF\xBD"); } else { $offset = 0; // look for an HTML element and insert the meta tag within @@ -2851,14 
+2854,15 @@ class Encoding { "x-mac-ukrainian", "chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312", "gb_2312-80", "gbk", "iso-ir-58", "big5", "cn-big5", "csbig5", "x-x-big5", "x-euc-jp", "ms932", "windows-31j", "x-sjis", - "cseuckr", "euc-kr" + "cseuckr", "euc-kr", "x-user-defined", "replacement", ]; /** @var array A List of canonical encoding names DOMDocument does not understand, with liases to labels it does understand */ const ENCODING_ALIAS_MAP = [ 'windows-1258' => "x-cp1258", - 'GBK' => "x-gbk", - 'Big5' => "big5-hkscs", - 'EUC-KR' => "korean", + 'GBK' => "x-gbk", + 'Big5' => "big5-hkscs", + 'EUC-KR' => "korean", + 'x-user-defined' => "windows-1256", // this is technically not correct, but x-user-defined is not likely to be used in HTML documents; it is used to represent binary data in JavaScript; windows-1256 is used as a substitute as every byte is assigned to a character, so text can be converted back into bytes with no loss ]; /** @var string A UTF-8 byte order mark */ const BOM_UTF8 = "\xEF\xBB\xBF"; From 10dda4659e2c873a046161e604393ae2d37917a7 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Sun, 16 Apr 2023 19:13:51 -0400 Subject: [PATCH 11/11] Add tests and correct two bugs All new lines are covered, and an effort was made to cover as many permutations of PCRE matches as practical --- Mf2/Parser.php | 8 ++- tests/Mf2/EncodingTest.php | 123 +++++++++++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+), 3 deletions(-) create mode 100644 tests/Mf2/EncodingTest.php diff --git a/Mf2/Parser.php b/Mf2/Parser.php index a6ad4fa..b3ba637 100644 --- a/Mf2/Parser.php +++ b/Mf2/Parser.php @@ -121,6 +121,9 @@ function load_dom_document($input, $contentType = "") { @$d->loadHTML($input, \LIBXML_NOWARNING); return $d; }; + // perform an initial parsing of the document. 
We may parse it again + // if the encoding is wrong, or use the result if everything is correct + $d = $load($input); $meta = ""; $effective = ""; // determine the encoding which DOMDocument has detected from the document, starting by looking for a UTF-16 BOM @@ -129,14 +132,13 @@ function load_dom_document($input, $contentType = "") { } elseif (substr($input, 0, 2) === Encoding::BOM_UTF16LE) { $effective = "UTF-16LE"; } else { - // parse the document and find the first valid meta element with + // find the first valid meta element in the document with // encoding information; even if this initial parsing uses the // wrong encoding, it's more reliable than trying to do this // with the string directly // we keep note of both what DOMDocument understands (the $effective // encoding), as well as what the standard says is valid (the $meta // encoding) as the two can differ - $d = $load($input); foreach ($d->getElementsByTagName("meta") as $e) { $meta = Encoding::matchEncodingLabel($e->getAttribute("charset")); if (strlen($meta)) { @@ -2925,7 +2927,7 @@ class Encoding { * @return string The canonical name of the encoding if matched (e.g. 
"UTF-8"), or an empty string in case of failure */ public static function matchEncodingLabel($label, $excludeNaughty = false) { - $label = strtolower(trim($label)); + $label = strtolower(trim($label, " \t\r\n\x0C")); // see https://infra.spec.whatwg.org/#ascii-whitespace if ($excludeNaughty && in_array($label, self::ENCODING_NAUGHTY_LIST)) { return ""; } diff --git a/tests/Mf2/EncodingTest.php b/tests/Mf2/EncodingTest.php new file mode 100644 index 0000000..5e13ddf --- /dev/null +++ b/tests/Mf2/EncodingTest.php @@ -0,0 +1,123 @@ +assertIsArray($act); + $this->assertArrayHasKey("type", $act); + $this->assertArrayHasKey("charset", $act); + $this->assertSame($type, $act['type'], "Content-Type did not match"); + $this->assertSame($charset, $act['charset'], "Charset did not match"); + } + + public function provideContentTypes() { + return [ + ["a/b", "a/b", ""], + ["A/B", "a/b", ""], + [" \t\r\na/b \t\r\n", "a/b", ""], + ["a/b ; \t\r\n;", "a/b", ""], + ["a/b;a=b ; \t\r\n;", "a/b", ""], + ["a/b;a=b;charset=foo" , "a/b", "foo"], + ["a/b;a=b;charset=foo;charset=bar", "a/b", "foo"], + ["a/ b;charset=foo", "", ""], + ["a/b;charset=FOO", "a/b", "FOO"], + ['a/b;charset="foo"', "a/b", "foo"], + ['a/b;charset="foo bar"', "a/b", "foo bar"], + ['a/b;charset="foo\\"bar"', "a/b", 'foo"bar'], + ['a/b;a="foo"bar;charset=baz', "a/b", 'baz'], + ["text/html;charset=UTF-8", "text/html", "UTF-8"], + ["", "", ""], + ]; + } + + /** + * @dataProvider provideEncodingLabels + * @covers Mf2\Encoding::matchEncodingLabel + */ + public function testMatchEncodingLabels($in, $excludeNaugty, $exp) { + $this->assertSame($exp, Encoding::matchEncodingLabel($in, $excludeNaugty)); + } + + public function provideEncodingLabels() { + return [ + ["", false, ""], + ["foo", false, ""], + ["utf8", false, "UTF-8"], + ["UTF8", false, "UTF-8"], + ["Unicode", false, "UTF-16LE"], + [" \t\r\n\x0Cbig5 \t\r\n\x0C", false, "Big5"], + ["\v\x00big5\v\x00", false, ""], + ["iso88591", false, "windows-1252"], + 
["Windows-1252", false, "windows-1252"], + ["", true, ""], + ["foo", true, ""], + ["utf8", true, "UTF-8"], + ["UTF8", true, "UTF-8"], + ["Unicode", true, "UTF-16LE"], + [" \t\r\n\x0Cbig5 \t\r\n\x0C", true, ""], + ["\v\x00big5\v\x00", true, ""], + ["iso88591", true, ""], + ["Windows-1252", true, "windows-1252"], + ]; + } + + /** + * @dataProvider provideDocuments + * @covers Mf2\load_dom_document + */ + public function testDetectEncoding($doc, $type, $body, $htmlClass, $headClass) { + $d = load_dom_document($doc, $type); + $this->assertInstanceOf("DOMDocument", $d); + $this->assertNotNull($d->documentElement); + $this->assertSame($body, $d->documentElement->textContent); + $this->assertSame($htmlClass, $d->documentElement->getAttribute("class")); + if ($head = $d->getElementsByTagName("head")->item(0)) { + $this->assertSame($headClass, $head->getAttribute("class")); + } + } + + public function provideDocuments() { + return [ + ["\xEF\xBB\xBF\xC3\xA9", "", "\xC3\xA9", "", ""], + ["\xC3\xA9", "text/html;charset=utf-8", "\xC3\xA9", "", ""], + ["\xC3\xA9", "text/html;charset=utf-8", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "text/html;charset=utf-8", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "text/html;charset=utf-8", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "text/html;charset=utf-8", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "text/html;charset=utf-8", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "text/html;charset=utf-8", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "", "\xC3\xA9", "a", "b"], + ["\xA7", "", "\xE2\x95\x96", "", "b"], + ["\xA7", "", "\xE2\x95\x96", "", "b"], + ["\xA7", "", "\xE2\x95\x96", "", "b"], + ["\xA7", "", "\xE2\x95\x96", "", "b"], + ["\xFE\xFF\xD8\x34\xDD\x1E", "", "\xF0\x9D\x84\x9E", "", ""], + ["\xFF\xFE\x34\xD8\x1E\xDD", "", "\xF0\x9D\x84\x9E", "", ""], + ["\xD8\x34\xDD\x1E", "text/html;charset=utf-16be", "\xF0\x9D\x84\x9E", "", ""], + ["\x34\xD8\x1E\xDD", "text/html;charset=utf-16le", "\xF0\x9D\x84\x9E", "", ""], + ["\x9F", "", "\xC5\xB8", "a", "b"], + ["\xC3\xA9", 
"text/html;charset=iso-2022-cn", "\xEF\xBF\xBD", "", ""], + ["\x9F", "", "\xC5\xB8", "a", "b"], + ["\xC3\xA9", "", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "", "\xC3\xA9", "a", "b"], + [" \n\t\xC3\xA9", "", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "", "\xC3\xA9", "a", "b"], + ["\xDD", "text/html;charset=windows-1258", "\xC6\xAF", "", ""], + ]; + } +} \ No newline at end of file