diff --git a/Mf2/Parser.php b/Mf2/Parser.php index 8b6485b..b3ba637 100644 --- a/Mf2/Parser.php +++ b/Mf2/Parser.php @@ -7,7 +7,6 @@ use DOMXPath; use DOMNode; use DOMNodeList; -use Exception; use SplObjectStorage; use stdClass; @@ -87,13 +86,139 @@ function fetch($url, $convertClassic = true, &$curlInfo=null) { return parse($html, $url, $convertClassic); } -/** - * Unicode to HTML Entities - * @param string $input String containing characters to convert into HTML entities - * @return string - */ -function unicodeToHtmlEntities($input) { - return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input)); +function load_dom_document($input, $contentType = "") { + /* + The work of this function concerns detecting encoding and making sure + DOMDocument honours the correct document encoding. There are + multiple complications to this, chiefly that DOMDocument's + priorities differ from the modern HTML standard's. HTML mandates the + following from most to least authoritative: + + 1. Byte order mark + 2. Protocol-specified encoding + 3. Meta element in document + 4. Heuristics (optional) + 5. Default encoding (usually windows-1252) + + DOMDocument, on the other hand, takes encoding from the following + sources from most to least authoritative: + + 1. UTF-16 byte order mark + 2. Meta element in document + 3. UTF-8 byte order mark + 4. Default encoding (ISO 8859-1) + + Thus this function finds the most authoritative encoding in the order + required by the specification, and then inserts a meta element where + necessary i.e. when the document doesn't already have the correct encoding + in a meta element. + + Finally, in the case of UTF-16 input the only reliable indicator is a + byte order mark, so we insert one where needed instead of a meta element. + */ + $load = function($input) { + $d = new DOMDocument(); + @$d->loadHTML($input, \LIBXML_NOWARNING); + return $d; + }; + // perform an initial parsing of the document. We may parse it again + // if the encoding is wrong, or use the result if everything is correct + $d = $load($input); + $meta = ""; + $effective = ""; + // determine the encoding which DOMDocument has detected from the document, starting by looking for a UTF-16 BOM + if (substr($input, 0, 2) === Encoding::BOM_UTF16BE) { + $effective = "UTF-16BE"; + } elseif (substr($input, 0, 2) === Encoding::BOM_UTF16LE) { + $effective = "UTF-16LE"; + } else { + // find the first valid meta element in the document with + // encoding information; even if this initial parsing uses the + // wrong encoding, it's more reliable than trying to do this + // with the string directly + // we keep note of both what DOMDocument understands (the $effective + // encoding), as well as what the standard says is valid (the $meta + // encoding) as the two can differ + foreach ($d->getElementsByTagName("meta") as $e) { + $meta = Encoding::matchEncodingLabel($e->getAttribute("charset")); + if (strlen($meta)) { + $effective = Encoding::matchEncodingLabel($e->getAttribute("charset"), true); + break; + } + if (strtolower(trim($e->getAttribute("http-equiv"))) === "content-type") { + $candidate = Encoding::parseContentType($e->getAttribute("content")); + $meta = Encoding::matchEncodingLabel($candidate['charset']); + if (strlen($meta)) { + $effective = Encoding::matchEncodingLabel($candidate['charset'], true); + break; + } + } + } + // if no effective encoding was found, check for a UTF-8 BOM + if (!$effective && substr($input, 0, 3) === Encoding::BOM_UTF8) { + $effective = "UTF-8"; + } + } + // now find the authoritative encoding according to the standard HTML order + $uthoritative = ""; + // start by looking for BOMs + if (substr($input, 0, 3) === Encoding::BOM_UTF8) { + $authoritative = "UTF-8"; + } elseif (substr($input, 0, 2) === Encoding::BOM_UTF16BE) { + $authoritative = "UTF-16BE"; + } elseif (substr($input, 0, 2) === Encoding::BOM_UTF16LE) { + $authoritative = "UTF-16LE"; + } else { + // now parse the HTTP Content-Type looking for an encoding + $candidate = Encoding::parseContentType($contentType); + $candidate = Encoding::matchEncodingLabel($candidate['charset']); + if ($candidate) { + $authoritative = $candidate; + } elseif ($meta) { + // if the document has a valid element, use that + $authoritative = $meta; + } elseif (preg_match(Encoding::UTF8_PATTERN, $input)) { + // if the string is valid UTF-8, we can use this + $authoritative = "UTF-8"; + } else { + // otherwise use the default encoding + $authoritative = Encoding::DEFAULT_ENCODING; + } + } + // if the authoritative encoding does not match the encoding currently in + // the document, add an appropriate meta tag or (for UTF-16) a BOM + if ($authoritative !== $effective) { + // some canonical names are not understood by some environments, so we use aliases where needed + if (array_key_exists($authoritative, Encoding::ENCODING_ALIAS_MAP)) { + $authoritative = Encoding::ENCODING_ALIAS_MAP[$authoritative]; + } + if ($authoritative === "UTF-16BE") { + if (substr($input, 0, 2) !== Encoding::BOM_UTF16BE) { + // add a BOM and reparse + $d = $load(Encoding::BOM_UTF16BE.$input); + } + } elseif ($authoritative === "UTF-16LE") { + if (substr($input, 0, 2) !== Encoding::BOM_UTF16LE) { + // add a BOM and reparse + $d = $load(Encoding::BOM_UTF16LE.$input); + } + } elseif ($authoritative === "replacement") { + // the replacement encoding is actually a denylist of problematic encodings; the document is reduced to a single replacement character + $d = $load(Encoding::BOM_UTF8."\xEF\xBF\xBD"); + } else { + $offset = 0; + // look for an HTML element and insert the meta tag within + // it, or at the nearest appropriate point; if the tag is + // inserted too soon, attributes on or (which + // may hold mf data) can be stripped + if (preg_match(Encoding::HTML_HEADER_PATTERN, $input, $match)) { + $offset = strlen($match[0]); + } + $tag = ''; + $d = $load(substr($input, 0, $offset) . $tag . substr($input, $offset)); + } + } + return $d; } /** @@ -111,10 +236,9 @@ function collapseWhitespace($str) { } function unicodeTrim($str) { - // this is cheating. TODO: find a better way if this causes any problems - $str = str_replace(mb_convert_encoding(' ', 'UTF-8', 'HTML-ENTITIES'), ' ', $str); - $str = preg_replace('/^\s+/', '', $str); - return preg_replace('/\s+$/', '', $str); + // the binary sequence C2A0 is a UTF-8 non-breaking space character + $str = preg_replace('/^(?:\s|\x{C2}\x{A0})+/', '', $str); + return preg_replace('/(?:\s|\x{C2}\x{A0})+$/', '', $str); } /** @@ -170,8 +294,8 @@ function nestedMfPropertyNamesFromClass($class) { foreach (explode(' ', $class) as $classname) { foreach ($prefixes as $prefix) { // Check if $classname is a valid property classname for $prefix. - if (mb_substr($classname, 0, mb_strlen($prefix)) == $prefix && $classname != $prefix) { - $propertyName = mb_substr($classname, mb_strlen($prefix)); + if (substr($classname, 0, strlen($prefix)) == $prefix && $classname != $prefix) { + $propertyName = substr($classname, strlen($prefix)); $propertyNames[$propertyName][] = $prefix; } } @@ -375,8 +499,7 @@ public function __construct($input, $url = null, $jsonMode = false) { $doc = new \Masterminds\HTML5(array('disable_html_ns' => true)); $doc = $doc->loadHTML($input); } else { - $doc = new DOMDocument(); - @$doc->loadHTML(unicodeToHtmlEntities($input), \LIBXML_NOWARNING); + $doc = load_dom_document($input); } } elseif (is_a($input, 'DOMDocument')) { $doc = clone $input; @@ -2429,3 +2552,431 @@ function removeDotSegments($path) { return $output; } + +/** Tools for determining the encoding of documents */ +class Encoding { + /** @var string The default encoding to use when none is detected in the document */ + const DEFAULT_ENCODING = "windows-1252"; + /** @var array The list of known encodings; see https://encoding.spec.whatwg.org/#names-and-labels */ + const ENCODINGS = [ + 'unicode-1-1-utf-8' => "UTF-8", + 'unicode11utf8' => "UTF-8", + 'unicode20utf8' => "UTF-8", + 'utf-8' => "UTF-8", + 'utf8' => "UTF-8", + 'x-unicode20utf8' => "UTF-8", + '866' => "IBM866", + 'cp866' => "IBM866", + 'csibm866' => "IBM866", + 'ibm866' => "IBM866", + 'csisolatin2' => "ISO-8859-2", + 'iso-8859-2' => "ISO-8859-2", + 'iso-ir-101' => "ISO-8859-2", + 'iso8859-2' => "ISO-8859-2", + 'iso88592' => "ISO-8859-2", + 'iso_8859-2' => "ISO-8859-2", + 'iso_8859-2:1987' => "ISO-8859-2", + 'l2' => "ISO-8859-2", + 'latin2' => "ISO-8859-2", + 'csisolatin3' => "ISO-8859-3", + 'iso-8859-3' => "ISO-8859-3", + 'iso-ir-109' => "ISO-8859-3", + 'iso8859-3' => "ISO-8859-3", + 'iso88593' => "ISO-8859-3", + 'iso_8859-3' => "ISO-8859-3", + 'iso_8859-3:1988' => "ISO-8859-3", + 'l3' => "ISO-8859-3", + 'latin3' => "ISO-8859-3", + 'csisolatin4' => "ISO-8859-4", + 'iso-8859-4' => "ISO-8859-4", + 'iso-ir-110' => "ISO-8859-4", + 'iso8859-4' => "ISO-8859-4", + 'iso88594' => "ISO-8859-4", + 'iso_8859-4' => "ISO-8859-4", + 'iso_8859-4:1988' => "ISO-8859-4", + 'l4' => "ISO-8859-4", + 'latin4' => "ISO-8859-4", + 'csisolatincyrillic' => "ISO-8859-5", + 'cyrillic' => "ISO-8859-5", + 'iso-8859-5' => "ISO-8859-5", + 'iso-ir-144' => "ISO-8859-5", + 'iso8859-5' => "ISO-8859-5", + 'iso88595' => "ISO-8859-5", + 'iso_8859-5' => "ISO-8859-5", + 'iso_8859-5:1988' => "ISO-8859-5", + 'arabic' => "ISO-8859-6", + 'asmo-708' => "ISO-8859-6", + 'csiso88596e' => "ISO-8859-6", + 'csiso88596i' => "ISO-8859-6", + 'csisolatinarabic' => "ISO-8859-6", + 'ecma-114' => "ISO-8859-6", + 'iso-8859-6' => "ISO-8859-6", + 'iso-8859-6-e' => "ISO-8859-6", + 'iso-8859-6-i' => "ISO-8859-6", + 'iso-ir-127' => "ISO-8859-6", + 'iso8859-6' => "ISO-8859-6", + 'iso88596' => "ISO-8859-6", + 'iso_8859-6' => "ISO-8859-6", + 'iso_8859-6:1987' => "ISO-8859-6", + 'csisolatingreek' => "ISO-8859-7", + 'ecma-118' => "ISO-8859-7", + 'elot_928' => "ISO-8859-7", + 'greek' => "ISO-8859-7", + 'greek8' => "ISO-8859-7", + 'iso-8859-7' => "ISO-8859-7", + 'iso-ir-126' => "ISO-8859-7", + 'iso8859-7' => "ISO-8859-7", + 'iso88597' => "ISO-8859-7", + 'iso_8859-7' => "ISO-8859-7", + 'iso_8859-7:1987' => "ISO-8859-7", + 'sun_eu_greek' => "ISO-8859-7", + 'csiso88598e' => "ISO-8859-8", + 'csisolatinhebrew' => "ISO-8859-8", + 'hebrew' => "ISO-8859-8", + 'iso-8859-8' => "ISO-8859-8", + 'iso-8859-8-e' => "ISO-8859-8", + 'iso-ir-138' => "ISO-8859-8", + 'iso8859-8' => "ISO-8859-8", + 'iso88598' => "ISO-8859-8", + 'iso_8859-8' => "ISO-8859-8", + 'iso_8859-8:1988' => "ISO-8859-8", + 'visual' => "ISO-8859-8", + 'csiso88598i' => "ISO-8859-8-I", + 'iso-8859-8-i' => "ISO-8859-8-I", + 'logical' => "ISO-8859-8-I", + 'csisolatin6' => "ISO-8859-10", + 'iso-8859-10' => "ISO-8859-10", + 'iso-ir-157' => "ISO-8859-10", + 'iso8859-10' => "ISO-8859-10", + 'iso885910' => "ISO-8859-10", + 'l6' => "ISO-8859-10", + 'latin6' => "ISO-8859-10", + 'iso-8859-13' => "ISO-8859-13", + 'iso8859-13' => "ISO-8859-13", + 'iso885913' => "ISO-8859-13", + 'iso-8859-14' => "ISO-8859-14", + 'iso8859-14' => "ISO-8859-14", + 'iso885914' => "ISO-8859-14", + 'csisolatin9' => "ISO-8859-15", + 'iso-8859-15' => "ISO-8859-15", + 'iso8859-15' => "ISO-8859-15", + 'iso885915' => "ISO-8859-15", + 'iso_8859-15' => "ISO-8859-15", + 'l9' => "ISO-8859-15", + 'iso-8859-16' => "ISO-8859-16", + 'cskoi8r' => "KOI8-R", + 'koi' => "KOI8-R", + 'koi8' => "KOI8-R", + 'koi8-r' => "KOI8-R", + 'koi8_r' => "KOI8-R", + 'koi8-ru' => "KOI8-U", + 'koi8-u' => "KOI8-U", + 'csmacintosh' => "macintosh", + 'mac' => "macintosh", + 'macintosh' => "macintosh", + 'x-mac-roman' => "macintosh", + 'dos-874' => "windows-874", + 'iso-8859-11' => "windows-874", + 'iso8859-11' => "windows-874", + 'iso885911' => "windows-874", + 'tis-620' => "windows-874", + 'windows-874' => "windows-874", + 'cp1250' => "windows-1250", + 'windows-1250' => "windows-1250", + 'x-cp1250' => "windows-1250", + 'cp1251' => "windows-1251", + 'windows-1251' => "windows-1251", + 'x-cp1251' => "windows-1251", + 'ansi_x3.4-1968' => "windows-1252", + 'ascii' => "windows-1252", + 'cp1252' => "windows-1252", + 'cp819' => "windows-1252", + 'csisolatin1' => "windows-1252", + 'ibm819' => "windows-1252", + 'iso-8859-1' => "windows-1252", + 'iso-ir-100' => "windows-1252", + 'iso8859-1' => "windows-1252", + 'iso88591' => "windows-1252", + 'iso_8859-1' => "windows-1252", + 'iso_8859-1:1987' => "windows-1252", + 'l1' => "windows-1252", + 'latin1' => "windows-1252", + 'us-ascii' => "windows-1252", + 'windows-1252' => "windows-1252", + 'x-cp1252' => "windows-1252", + 'cp1253' => "windows-1253", + 'windows-1253' => "windows-1253", + 'x-cp1253' => "windows-1253", + 'cp1254' => "windows-1254", + 'csisolatin5' => "windows-1254", + 'iso-8859-9' => "windows-1254", + 'iso-ir-148' => "windows-1254", + 'iso8859-9' => "windows-1254", + 'iso88599' => "windows-1254", + 'iso_8859-9' => "windows-1254", + 'iso_8859-9:1989' => "windows-1254", + 'l5' => "windows-1254", + 'latin5' => "windows-1254", + 'windows-1254' => "windows-1254", + 'x-cp1254' => "windows-1254", + 'cp1255' => "windows-1255", + 'windows-1255' => "windows-1255", + 'x-cp1255' => "windows-1255", + 'cp1256' => "windows-1256", + 'windows-1256' => "windows-1256", + 'x-cp1256' => "windows-1256", + 'cp1257' => "windows-1257", + 'windows-1257' => "windows-1257", + 'x-cp1257' => "windows-1257", + 'cp1258' => "windows-1258", + 'windows-1258' => "windows-1258", + 'x-cp1258' => "windows-1258", + 'x-mac-cyrillic' => "x-mac-cyrillic", + 'x-mac-ukrainian' => "x-mac-cyrillic", + 'chinese' => "GBK", + 'csgb2312' => "GBK", + 'csiso58gb231280' => "GBK", + 'gb2312' => "GBK", + 'gb_2312' => "GBK", + 'gb_2312-80' => "GBK", + 'gbk' => "GBK", + 'iso-ir-58' => "GBK", + 'x-gbk' => "GBK", + 'gb18030' => "gb18030", + 'big5' => "Big5", + 'big5-hkscs' => "Big5", + 'cn-big5' => "Big5", + 'csbig5' => "Big5", + 'x-x-big5' => "Big5", + 'cseucpkdfmtjapanese' => "EUC-JP", + 'euc-jp' => "EUC-JP", + 'x-euc-jp' => "EUC-JP", + 'csiso2022jp' => "ISO-2022-JP", + 'iso-2022-jp' => "ISO-2022-JP", + 'csshiftjis' => "Shift_JIS", + 'ms932' => "Shift_JIS", + 'ms_kanji' => "Shift_JIS", + 'shift-jis' => "Shift_JIS", + 'shift_jis' => "Shift_JIS", + 'sjis' => "Shift_JIS", + 'windows-31j' => "Shift_JIS", + 'x-sjis' => "Shift_JIS", + 'cseuckr' => "EUC-KR", + 'csksc56011987' => "EUC-KR", + 'euc-kr' => "EUC-KR", + 'iso-ir-149' => "EUC-KR", + 'korean' => "EUC-KR", + 'ks_c_5601-1987' => "EUC-KR", + 'ks_c_5601-1989' => "EUC-KR", + 'ksc5601' => "EUC-KR", + 'ksc_5601' => "EUC-KR", + 'windows-949' => "EUC-KR", + 'csiso2022kr' => "replacement", + 'hz-gb-2312' => "replacement", + 'iso-2022-cn' => "replacement", + 'iso-2022-cn-ext' => "replacement", + 'iso-2022-kr' => "replacement", + 'replacement' => "replacement", + 'unicodefffe' => "UTF-16BE", + 'utf-16be' => "UTF-16BE", + 'csunicode' => "UTF-16LE", + 'iso-10646-ucs-2' => "UTF-16LE", + 'ucs-2' => "UTF-16LE", + 'unicode' => "UTF-16LE", + 'unicodefeff' => "UTF-16LE", + 'utf-16' => "UTF-16LE", + 'utf-16le' => "UTF-16LE", + 'x-user-defined' => "x-user-defined", + ]; + /** @var string The PCRE pattern for a Content-Type HTTP header-field, chopping it up into three main sections */ + const TYPE_PATTERN = <<<'PATTERN' + /^ + [\t\r\n ]* # optional leading whitespace + ([^\/]+) # type + \/ # type-subtype delimiter + ([^;]+) # subtype (possibly with trailing whitespace) + (;.*)? # optional parameters, to be parsed separately + [\t\r\n ]* # optional trailing whitespace + $/sx +PATTERN; + /** @var string The PCRE pattern for an HTTP Content-Type header-field's parameters, splitting them into a series of names and values*/ + const PARAM_PATTERN = <<<'PATTERN' + / + [;\t\r\n ]* # parameter delimiter and leading whitespace, all optional + ([^=;]*) # parameter name; may be empty + (?:= # parameter name-value delimiter + ( + "(?:\\"|[^"])*(?:"|$)[^;]* # quoted parameter value and optional garbage + |[^;]* # unquoted parameter value (possibly with trailing whitespace) + ) + )? + ;? # optional trailing parameter delimiter + [\t\r\n ]* # optional trailing whitespace + /sx +PATTERN; + /** @var string The PCRE pattern for an HTTP token, used to validate the nameof a parameter */ + const TOKEN_PATTERN = '/^[A-Za-z0-9!#$%&\'*+\-\.\^_`|~]+$/s'; + /** @var string The PCRE pattern for a "bare" (unquoted) parameter value, used for validation */ + const BARE_VALUE_PATTERN = '/^[\t\x{20}-\x{7E}\x{80}-\x{FF}]+$/s'; + /** @var string The PCRE pattern for a quoted parameter value, used for validation */ + const QUOTED_VALUE_PATTERN = '/^"((?:\\\"|[\t !\x{23}-\x{7E}\x{80}-\x{FF}])*)(?:"|$)/s'; + /** @var string The PCRE pattern for an escaped character in a quoted parameter value, used to unescape via replacement */ + const ESCAPE_PATTERN = '/\\\(.)/s'; + /** @var string The PCRE pattern for all valid UTF-8 characters */ + const UTF8_PATTERN = <<<'PATTERN' + /^(?: + # Single-byte + [\x{00}-\x{7F}]+ + # Two-byte + |[\x{C2}-\x{DF}][\x{80}-\x{BF}] + # Three-byte excluding surrogates + |\x{E0}[\x{A0}-\x{BF}][\x{80}-\x{BF}] + |\x{ED}[\x{80}-\x{9F}][\x{80}-\x{BF}] + |[\x{E1}-\x{EC}\x{EE}\x{EF}][\x{80}-\x{BF}]{2} + # Four-byte + |\x{F0}[\x{90}-\x{BF}][\x{80}-\x{BF}]{2} + |\x{F4}[\x{80}-\x{8F}][\x{80}-\x{BF}]{2} + |[\x{F1}-\x{F3}][\x{80}-\x{BF}]{3} + )*$/sx +PATTERN; + /** @var array A list of standard encoding labels which DOMDocument either does not know or does not map to the correct encoding; this is a worst-case list taken from PHP 5.6 on Windows with some exclusions for encodings which are completely unsupported */ + const ENCODING_NAUGHTY_LIST = [ + "unicode-1-1-utf-8", "unicode11utf8", "unicode20utf8", "x-unicode20utf8", + "iso88592", "iso88593", "iso88594", "iso88595", "csiso88596e", + "csiso88596i", "iso-8859-6-e", "iso-8859-6-i", "iso88596", "iso88597", + "sun_eu_greek", "csiso88598e", "iso-8859-8-e", "iso88598", "visual", + "csiso88598i", "iso-8859-8-i", "logical", "iso885910", "iso885913", + "iso885914", "csisolatin9", "iso885915", "l9", "koi", "koi8", "koi8_r", + "x-mac-roman", "dos-874", "iso-8859-11", "iso8859-11", "iso885911", + "tis-620", "x-cp1250", "x-cp1251", "ansi_x3.4-1968", "ascii", "cp819", + "csisolatin1", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", + "iso88591", "iso_8859-1", "iso_8859-1:1987", "l1", "latin1", + "us-ascii", "x-cp1252", "x-cp1253", "iso88599", "x-cp1254", + "x-cp1255", "x-cp1256", "x-cp1257", "cp1258", "windows-1258", + "x-mac-ukrainian", "chinese", "csgb2312", "csiso58gb231280", "gb2312", + "gb_2312", "gb_2312-80", "gbk", "iso-ir-58", "big5", "cn-big5", + "csbig5", "x-x-big5", "x-euc-jp", "ms932", "windows-31j", "x-sjis", + "cseuckr", "euc-kr", "x-user-defined", "replacement", + ]; + /** @var array A List of canonical encoding names DOMDocument does not understand, with liases to labels it does understand */ + const ENCODING_ALIAS_MAP = [ + 'windows-1258' => "x-cp1258", + 'GBK' => "x-gbk", + 'Big5' => "big5-hkscs", + 'EUC-KR' => "korean", + 'x-user-defined' => "windows-1256", // this is technically not correct, but x-user-defined is not likely to be used in HTML documents; it is used to represent binary data in JavaScript; windows-1256 is used as a substitute as every byte is assigned to a character, so text can be converted back into bytes with no loss + ]; + /** @var string A UTF-8 byte order mark */ + const BOM_UTF8 = "\xEF\xBB\xBF"; + /** @var string A UTF-16 (big-endian) byte order mark */ + const BOM_UTF16BE = "\xFE\xFF"; + /** @var string A UTF-16 (little-endian) byte order mark */ + const BOM_UTF16LE = "\xFF\xFE"; + /** @var string A PCRE pattern which finds the insertion point inside the HTML element of a document; the pattern takes some shortcuts, but nothing which is likely to affect documents which actually have microformat metadata */ + const HTML_HEADER_PATTERN = <<]*> # DOCTYPE or + |) # Comment or + |<\?[^>]*> # Processing instruction or XML declaration + )* # ... zero or more times + (?: start tag + (?:\s+ # ... with possible attributes + (?:\s*[^\s=>]* + (?:= + (?: + "[^"]*" + |'[^']*' + |[^ >]* + )? + )? + )* + )? + \/?\s*> + )? + (?: # Followed by... + \s+ # Whitespace or + |]*> # DOCTYPE or + |) # Comment or + |<\?[^>]*> # Processing instruction or XML declaration + )* # ... zero or more times + (?: start tag + (?:\s+ # ... with possible attributes + (?:\s*[^\s=>]* + (?:= + (?: + "[^"]*" + |'[^']*' + |[^ >]* + )? + )? + )* + )? + \/?\s*> + )? + /six +PATTERN; + + /** Matches an encoding label to a known encoding name in the WHATWG encoding list + * + * If the label does not match any known encoding, and empty string is returned. + * + * @param string $label The label to match e.g. "utf8", " FOO_bar " + * @param bool $excludeNaughty Whether or not to exclude labels DOMDocument does not understand + * @return string The canonical name of the encoding if matched (e.g. "UTF-8"), or an empty string in case of failure + */ + public static function matchEncodingLabel($label, $excludeNaughty = false) { + $label = strtolower(trim($label, " \t\r\n\x0C")); // see https://infra.spec.whatwg.org/#ascii-whitespace + if ($excludeNaughty && in_array($label, self::ENCODING_NAUGHTY_LIST)) { + return ""; + } + if (array_key_exists($label, self::ENCODINGS)) { + return self::ENCODINGS[$label]; + } + return ""; + } + + /** Parses a Content-Type header-field to extract the type and charset, if any + * + * @param string $type + * @return array An array containing keys for "type" and "charset" + */ + public static function parseContentType($type) { + $out = [ + 'type' => "", + 'charset' => "", + ]; + if (preg_match(self::TYPE_PATTERN, $type, $match)) { + list($mimeType, $type, $subtype, $params) = array_pad($match, 4, ""); + $type = preg_match(self::TOKEN_PATTERN, $type) ? $type : ""; + $subtype = rtrim($subtype, "\t\r\n "); + $subtype = preg_match(self::TOKEN_PATTERN, $subtype) ? $subtype : ""; + if (strlen($type) && strlen($subtype)) { + $out['type'] = strtolower($type) . "/" . strtolower($subtype); + // parse parameters looking for a charset one + if (preg_match_all(self::PARAM_PATTERN, $params, $matches, \PREG_SET_ORDER)) { + foreach ($matches as $match) { + list($param, $name, $value) = array_pad($match, 3, ""); + $name = strtolower(preg_match(self::TOKEN_PATTERN, $name) ? $name : ""); + if ($name === "charset" && strlen($value)) { + if ($value[0] === '"') { + if (preg_match(self::QUOTED_VALUE_PATTERN, $value, $match)) { + $out['charset'] = preg_replace(self::ESCAPE_PATTERN, '$1', $match[1]); + return $out; + } + } else { + $value = rtrim($value, "\t\r\n "); + if (preg_match(self::BARE_VALUE_PATTERN, $value)) { + $out['charset'] = $value; + return $out; + } + } + } + } + } + } + } + return $out; + } +} diff --git a/tests/Mf2/EncodingTest.php b/tests/Mf2/EncodingTest.php new file mode 100644 index 0000000..5e13ddf --- /dev/null +++ b/tests/Mf2/EncodingTest.php @@ -0,0 +1,123 @@ +assertIsArray($act); + $this->assertArrayHasKey("type", $act); + $this->assertArrayHasKey("charset", $act); + $this->assertSame($type, $act['type'], "Content-Type did not match"); + $this->assertSame($charset, $act['charset'], "Charset did not match"); + } + + public function provideContentTypes() { + return [ + ["a/b", "a/b", ""], + ["A/B", "a/b", ""], + [" \t\r\na/b \t\r\n", "a/b", ""], + ["a/b ; \t\r\n;", "a/b", ""], + ["a/b;a=b ; \t\r\n;", "a/b", ""], + ["a/b;a=b;charset=foo" , "a/b", "foo"], + ["a/b;a=b;charset=foo;charset=bar", "a/b", "foo"], + ["a/ b;charset=foo", "", ""], + ["a/b;charset=FOO", "a/b", "FOO"], + ['a/b;charset="foo"', "a/b", "foo"], + ['a/b;charset="foo bar"', "a/b", "foo bar"], + ['a/b;charset="foo\\"bar"', "a/b", 'foo"bar'], + ['a/b;a="foo"bar;charset=baz', "a/b", 'baz'], + ["text/html;charset=UTF-8", "text/html", "UTF-8"], + ["", "", ""], + ]; + } + + /** + * @dataProvider provideEncodingLabels + * @covers Mf2\Encoding::matchEncodingLabel + */ + public function testMatchEncodingLabels($in, $excludeNaugty, $exp) { + $this->assertSame($exp, Encoding::matchEncodingLabel($in, $excludeNaugty)); + } + + public function provideEncodingLabels() { + return [ + ["", false, ""], + ["foo", false, ""], + ["utf8", false, "UTF-8"], + ["UTF8", false, "UTF-8"], + ["Unicode", false, "UTF-16LE"], + [" \t\r\n\x0Cbig5 \t\r\n\x0C", false, "Big5"], + ["\v\x00big5\v\x00", false, ""], + ["iso88591", false, "windows-1252"], + ["Windows-1252", false, "windows-1252"], + ["", true, ""], + ["foo", true, ""], + ["utf8", true, "UTF-8"], + ["UTF8", true, "UTF-8"], + ["Unicode", true, "UTF-16LE"], + [" \t\r\n\x0Cbig5 \t\r\n\x0C", true, ""], + ["\v\x00big5\v\x00", true, ""], + ["iso88591", true, ""], + ["Windows-1252", true, "windows-1252"], + ]; + } + + /** + * @dataProvider provideDocuments + * @covers Mf2\load_dom_document + */ + public function testDetectEncoding($doc, $type, $body, $htmlClass, $headClass) { + $d = load_dom_document($doc, $type); + $this->assertInstanceOf("DOMDocument", $d); + $this->assertNotNull($d->documentElement); + $this->assertSame($body, $d->documentElement->textContent); + $this->assertSame($htmlClass, $d->documentElement->getAttribute("class")); + if ($head = $d->getElementsByTagName("head")->item(0)) { + $this->assertSame($headClass, $head->getAttribute("class")); + } + } + + public function provideDocuments() { + return [ + ["\xEF\xBB\xBF\xC3\xA9", "", "\xC3\xA9", "", ""], + ["\xC3\xA9", "text/html;charset=utf-8", "\xC3\xA9", "", ""], + ["\xC3\xA9", "text/html;charset=utf-8", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "text/html;charset=utf-8", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "text/html;charset=utf-8", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "text/html;charset=utf-8", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "text/html;charset=utf-8", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "text/html;charset=utf-8", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "", "\xC3\xA9", "a", "b"], + ["\xA7", "", "\xE2\x95\x96", "", "b"], + ["\xA7", "", "\xE2\x95\x96", "", "b"], + ["\xA7", "", "\xE2\x95\x96", "", "b"], + ["\xA7", "", "\xE2\x95\x96", "", "b"], + ["\xFE\xFF\xD8\x34\xDD\x1E", "", "\xF0\x9D\x84\x9E", "", ""], + ["\xFF\xFE\x34\xD8\x1E\xDD", "", "\xF0\x9D\x84\x9E", "", ""], + ["\xD8\x34\xDD\x1E", "text/html;charset=utf-16be", "\xF0\x9D\x84\x9E", "", ""], + ["\x34\xD8\x1E\xDD", "text/html;charset=utf-16le", "\xF0\x9D\x84\x9E", "", ""], + ["\x9F", "", "\xC5\xB8", "a", "b"], + ["\xC3\xA9", "text/html;charset=iso-2022-cn", "\xEF\xBF\xBD", "", ""], + ["\x9F", "", "\xC5\xB8", "a", "b"], + ["\xC3\xA9", "", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "", "\xC3\xA9", "a", "b"], + [" \n\t\xC3\xA9", "", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "", "\xC3\xA9", "a", "b"], + ["\xC3\xA9", "", "\xC3\xA9", "a", "b"], + ["\xDD", "text/html;charset=windows-1258", "\xC6\xAF", "", ""], + ]; + } +} \ No newline at end of file diff --git a/tests/Mf2/ParserTest.php b/tests/Mf2/ParserTest.php index a364557..946a693 100644 --- a/tests/Mf2/ParserTest.php +++ b/tests/Mf2/ParserTest.php @@ -823,10 +823,8 @@ public function testNotMutatingPassedInDOM() { // Use same parsing as Parser::__construct(), twice to have a comparison object. libxml_use_internal_errors(true); - $refDoc = new \DOMDocument(); - @$refDoc->loadHTML(Mf2\unicodeToHtmlEntities($input), \LIBXML_NOWARNING); - $inputDoc = new \DOMDocument(); - @$inputDoc->loadHTML(Mf2\unicodeToHtmlEntities($input), \LIBXML_NOWARNING); + $refDoc = Mf2\load_dom_document($input); + $inputDoc = Mf2\load_dom_document($input); // For completion sake, test PHP itself. $this->assertEquals($refDoc, $inputDoc, 'PHP could not create identical DOMDocument instances.');