diff --git a/Mf2/Parser.php b/Mf2/Parser.php index 8b6485b..b3ba637 100644 --- a/Mf2/Parser.php +++ b/Mf2/Parser.php @@ -7,7 +7,6 @@ use DOMXPath; use DOMNode; use DOMNodeList; -use Exception; use SplObjectStorage; use stdClass; @@ -87,13 +86,139 @@ function fetch($url, $convertClassic = true, &$curlInfo=null) { return parse($html, $url, $convertClassic); } -/** - * Unicode to HTML Entities - * @param string $input String containing characters to convert into HTML entities - * @return string - */ -function unicodeToHtmlEntities($input) { - return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input)); +function load_dom_document($input, $contentType = "") { + /* + The work of this function concerns detecting encoding and making sure + DOMDocument honours the correct document encoding. There are + multiple complications to this, chiefly that DOMDocument's + priorities differ from the modern HTML standard's. HTML mandates the + following from most to least authoritative: + + 1. Byte order mark + 2. Protocol-specified encoding + 3. Meta element in document + 4. Heuristics (optional) + 5. Default encoding (usually windows-1252) + + DOMDocument, on the other hand, takes encoding from the following + sources from most to least authoritative: + + 1. UTF-16 byte order mark + 2. Meta element in document + 3. UTF-8 byte order mark + 4. Default encoding (ISO 8859-1) + + Thus this function finds the most authoritative encoding in the order + required by the specification, and then inserts a meta element where + necessary i.e. when the document doesn't already have the correct encoding + in a meta element. + + Finally, in the case of UTF-16 input the only reliable indicator is a + byte order mark, so we insert one where needed instead of a meta element. + */ + $load = function($input) { + $d = new DOMDocument(); + @$d->loadHTML($input, \LIBXML_NOWARNING); + return $d; + }; + // perform an initial parsing of the document. We may parse it again + // if the encoding is wrong, or use the result if everything is correct + $d = $load($input); + $meta = ""; + $effective = ""; + // determine the encoding which DOMDocument has detected from the document, starting by looking for a UTF-16 BOM + if (substr($input, 0, 2) === Encoding::BOM_UTF16BE) { + $effective = "UTF-16BE"; + } elseif (substr($input, 0, 2) === Encoding::BOM_UTF16LE) { + $effective = "UTF-16LE"; + } else { + // find the first valid meta element in the document with + // encoding information; even if this initial parsing uses the + // wrong encoding, it's more reliable than trying to do this + // with the string directly + // we keep note of both what DOMDocument understands (the $effective + // encoding), as well as what the standard says is valid (the $meta + // encoding) as the two can differ + foreach ($d->getElementsByTagName("meta") as $e) { + $meta = Encoding::matchEncodingLabel($e->getAttribute("charset")); + if (strlen($meta)) { + $effective = Encoding::matchEncodingLabel($e->getAttribute("charset"), true); + break; + } + if (strtolower(trim($e->getAttribute("http-equiv"))) === "content-type") { + $candidate = Encoding::parseContentType($e->getAttribute("content")); + $meta = Encoding::matchEncodingLabel($candidate['charset']); + if (strlen($meta)) { + $effective = Encoding::matchEncodingLabel($candidate['charset'], true); + break; + } + } + } + // if no effective encoding was found, check for a UTF-8 BOM + if (!$effective && substr($input, 0, 3) === Encoding::BOM_UTF8) { + $effective = "UTF-8"; + } + } + // now find the authoritative encoding according to the standard HTML order + $uthoritative = ""; + // start by looking for BOMs + if (substr($input, 0, 3) === Encoding::BOM_UTF8) { + $authoritative = "UTF-8"; + } elseif (substr($input, 0, 2) === Encoding::BOM_UTF16BE) { + $authoritative = "UTF-16BE"; + } elseif (substr($input, 0, 2) === Encoding::BOM_UTF16LE) { + $authoritative = "UTF-16LE"; + } else { + // now parse the HTTP Content-Type looking for an encoding + $candidate = Encoding::parseContentType($contentType); + $candidate = Encoding::matchEncodingLabel($candidate['charset']); + if ($candidate) { + $authoritative = $candidate; + } elseif ($meta) { + // if the document has a valid element, use that + $authoritative = $meta; + } elseif (preg_match(Encoding::UTF8_PATTERN, $input)) { + // if the string is valid UTF-8, we can use this + $authoritative = "UTF-8"; + } else { + // otherwise use the default encoding + $authoritative = Encoding::DEFAULT_ENCODING; + } + } + // if the authoritative encoding does not match the encoding currently in + // the document, add an appropriate meta tag or (for UTF-16) a BOM + if ($authoritative !== $effective) { + // some canonical names are not understood by some environments, so we use aliases where needed + if (array_key_exists($authoritative, Encoding::ENCODING_ALIAS_MAP)) { + $authoritative = Encoding::ENCODING_ALIAS_MAP[$authoritative]; + } + if ($authoritative === "UTF-16BE") { + if (substr($input, 0, 2) !== Encoding::BOM_UTF16BE) { + // add a BOM and reparse + $d = $load(Encoding::BOM_UTF16BE.$input); + } + } elseif ($authoritative === "UTF-16LE") { + if (substr($input, 0, 2) !== Encoding::BOM_UTF16LE) { + // add a BOM and reparse + $d = $load(Encoding::BOM_UTF16LE.$input); + } + } elseif ($authoritative === "replacement") { + // the replacement encoding is actually a denylist of problematic encodings; the document is reduced to a single replacement character + $d = $load(Encoding::BOM_UTF8."\xEF\xBF\xBD"); + } else { + $offset = 0; + // look for an HTML
element and insert the meta tag within + // it, or at the nearest appropriate point; if the tag is + // inserted too soon, attributes on or (which + // may hold mf data) can be stripped + if (preg_match(Encoding::HTML_HEADER_PATTERN, $input, $match)) { + $offset = strlen($match[0]); + } + $tag = ''; + $d = $load(substr($input, 0, $offset) . $tag . substr($input, $offset)); + } + } + return $d; } /** @@ -111,10 +236,9 @@ function collapseWhitespace($str) { } function unicodeTrim($str) { - // this is cheating. TODO: find a better way if this causes any problems - $str = str_replace(mb_convert_encoding(' ', 'UTF-8', 'HTML-ENTITIES'), ' ', $str); - $str = preg_replace('/^\s+/', '', $str); - return preg_replace('/\s+$/', '', $str); + // the binary sequence C2A0 is a UTF-8 non-breaking space character + $str = preg_replace('/^(?:\s|\x{C2}\x{A0})+/', '', $str); + return preg_replace('/(?:\s|\x{C2}\x{A0})+$/', '', $str); } /** @@ -170,8 +294,8 @@ function nestedMfPropertyNamesFromClass($class) { foreach (explode(' ', $class) as $classname) { foreach ($prefixes as $prefix) { // Check if $classname is a valid property classname for $prefix. - if (mb_substr($classname, 0, mb_strlen($prefix)) == $prefix && $classname != $prefix) { - $propertyName = mb_substr($classname, mb_strlen($prefix)); + if (substr($classname, 0, strlen($prefix)) == $prefix && $classname != $prefix) { + $propertyName = substr($classname, strlen($prefix)); $propertyNames[$propertyName][] = $prefix; } } @@ -375,8 +499,7 @@ public function __construct($input, $url = null, $jsonMode = false) { $doc = new \Masterminds\HTML5(array('disable_html_ns' => true)); $doc = $doc->loadHTML($input); } else { - $doc = new DOMDocument(); - @$doc->loadHTML(unicodeToHtmlEntities($input), \LIBXML_NOWARNING); + $doc = load_dom_document($input); } } elseif (is_a($input, 'DOMDocument')) { $doc = clone $input; @@ -2429,3 +2552,431 @@ function removeDotSegments($path) { return $output; } + +/** Tools for determining the encoding of documents */ +class Encoding { + /** @var string The default encoding to use when none is detected in the document */ + const DEFAULT_ENCODING = "windows-1252"; + /** @var array The list of known encodings; see https://encoding.spec.whatwg.org/#names-and-labels */ + const ENCODINGS = [ + 'unicode-1-1-utf-8' => "UTF-8", + 'unicode11utf8' => "UTF-8", + 'unicode20utf8' => "UTF-8", + 'utf-8' => "UTF-8", + 'utf8' => "UTF-8", + 'x-unicode20utf8' => "UTF-8", + '866' => "IBM866", + 'cp866' => "IBM866", + 'csibm866' => "IBM866", + 'ibm866' => "IBM866", + 'csisolatin2' => "ISO-8859-2", + 'iso-8859-2' => "ISO-8859-2", + 'iso-ir-101' => "ISO-8859-2", + 'iso8859-2' => "ISO-8859-2", + 'iso88592' => "ISO-8859-2", + 'iso_8859-2' => "ISO-8859-2", + 'iso_8859-2:1987' => "ISO-8859-2", + 'l2' => "ISO-8859-2", + 'latin2' => "ISO-8859-2", + 'csisolatin3' => "ISO-8859-3", + 'iso-8859-3' => "ISO-8859-3", + 'iso-ir-109' => "ISO-8859-3", + 'iso8859-3' => "ISO-8859-3", + 'iso88593' => "ISO-8859-3", + 'iso_8859-3' => "ISO-8859-3", + 'iso_8859-3:1988' => "ISO-8859-3", + 'l3' => "ISO-8859-3", + 'latin3' => "ISO-8859-3", + 'csisolatin4' => "ISO-8859-4", + 'iso-8859-4' => "ISO-8859-4", + 'iso-ir-110' => "ISO-8859-4", + 'iso8859-4' => "ISO-8859-4", + 'iso88594' => "ISO-8859-4", + 'iso_8859-4' => "ISO-8859-4", + 'iso_8859-4:1988' => "ISO-8859-4", + 'l4' => "ISO-8859-4", + 'latin4' => "ISO-8859-4", + 'csisolatincyrillic' => "ISO-8859-5", + 'cyrillic' => "ISO-8859-5", + 'iso-8859-5' => "ISO-8859-5", + 'iso-ir-144' => "ISO-8859-5", + 'iso8859-5' => "ISO-8859-5", + 'iso88595' => "ISO-8859-5", + 'iso_8859-5' => "ISO-8859-5", + 'iso_8859-5:1988' => "ISO-8859-5", + 'arabic' => "ISO-8859-6", + 'asmo-708' => "ISO-8859-6", + 'csiso88596e' => "ISO-8859-6", + 'csiso88596i' => "ISO-8859-6", + 'csisolatinarabic' => "ISO-8859-6", + 'ecma-114' => "ISO-8859-6", + 'iso-8859-6' => "ISO-8859-6", + 'iso-8859-6-e' => "ISO-8859-6", + 'iso-8859-6-i' => "ISO-8859-6", + 'iso-ir-127' => "ISO-8859-6", + 'iso8859-6' => "ISO-8859-6", + 'iso88596' => "ISO-8859-6", + 'iso_8859-6' => "ISO-8859-6", + 'iso_8859-6:1987' => "ISO-8859-6", + 'csisolatingreek' => "ISO-8859-7", + 'ecma-118' => "ISO-8859-7", + 'elot_928' => "ISO-8859-7", + 'greek' => "ISO-8859-7", + 'greek8' => "ISO-8859-7", + 'iso-8859-7' => "ISO-8859-7", + 'iso-ir-126' => "ISO-8859-7", + 'iso8859-7' => "ISO-8859-7", + 'iso88597' => "ISO-8859-7", + 'iso_8859-7' => "ISO-8859-7", + 'iso_8859-7:1987' => "ISO-8859-7", + 'sun_eu_greek' => "ISO-8859-7", + 'csiso88598e' => "ISO-8859-8", + 'csisolatinhebrew' => "ISO-8859-8", + 'hebrew' => "ISO-8859-8", + 'iso-8859-8' => "ISO-8859-8", + 'iso-8859-8-e' => "ISO-8859-8", + 'iso-ir-138' => "ISO-8859-8", + 'iso8859-8' => "ISO-8859-8", + 'iso88598' => "ISO-8859-8", + 'iso_8859-8' => "ISO-8859-8", + 'iso_8859-8:1988' => "ISO-8859-8", + 'visual' => "ISO-8859-8", + 'csiso88598i' => "ISO-8859-8-I", + 'iso-8859-8-i' => "ISO-8859-8-I", + 'logical' => "ISO-8859-8-I", + 'csisolatin6' => "ISO-8859-10", + 'iso-8859-10' => "ISO-8859-10", + 'iso-ir-157' => "ISO-8859-10", + 'iso8859-10' => "ISO-8859-10", + 'iso885910' => "ISO-8859-10", + 'l6' => "ISO-8859-10", + 'latin6' => "ISO-8859-10", + 'iso-8859-13' => "ISO-8859-13", + 'iso8859-13' => "ISO-8859-13", + 'iso885913' => "ISO-8859-13", + 'iso-8859-14' => "ISO-8859-14", + 'iso8859-14' => "ISO-8859-14", + 'iso885914' => "ISO-8859-14", + 'csisolatin9' => "ISO-8859-15", + 'iso-8859-15' => "ISO-8859-15", + 'iso8859-15' => "ISO-8859-15", + 'iso885915' => "ISO-8859-15", + 'iso_8859-15' => "ISO-8859-15", + 'l9' => "ISO-8859-15", + 'iso-8859-16' => "ISO-8859-16", + 'cskoi8r' => "KOI8-R", + 'koi' => "KOI8-R", + 'koi8' => "KOI8-R", + 'koi8-r' => "KOI8-R", + 'koi8_r' => "KOI8-R", + 'koi8-ru' => "KOI8-U", + 'koi8-u' => "KOI8-U", + 'csmacintosh' => "macintosh", + 'mac' => "macintosh", + 'macintosh' => "macintosh", + 'x-mac-roman' => "macintosh", + 'dos-874' => "windows-874", + 'iso-8859-11' => "windows-874", + 'iso8859-11' => "windows-874", + 'iso885911' => "windows-874", + 'tis-620' => "windows-874", + 'windows-874' => "windows-874", + 'cp1250' => "windows-1250", + 'windows-1250' => "windows-1250", + 'x-cp1250' => "windows-1250", + 'cp1251' => "windows-1251", + 'windows-1251' => "windows-1251", + 'x-cp1251' => "windows-1251", + 'ansi_x3.4-1968' => "windows-1252", + 'ascii' => "windows-1252", + 'cp1252' => "windows-1252", + 'cp819' => "windows-1252", + 'csisolatin1' => "windows-1252", + 'ibm819' => "windows-1252", + 'iso-8859-1' => "windows-1252", + 'iso-ir-100' => "windows-1252", + 'iso8859-1' => "windows-1252", + 'iso88591' => "windows-1252", + 'iso_8859-1' => "windows-1252", + 'iso_8859-1:1987' => "windows-1252", + 'l1' => "windows-1252", + 'latin1' => "windows-1252", + 'us-ascii' => "windows-1252", + 'windows-1252' => "windows-1252", + 'x-cp1252' => "windows-1252", + 'cp1253' => "windows-1253", + 'windows-1253' => "windows-1253", + 'x-cp1253' => "windows-1253", + 'cp1254' => "windows-1254", + 'csisolatin5' => "windows-1254", + 'iso-8859-9' => "windows-1254", + 'iso-ir-148' => "windows-1254", + 'iso8859-9' => "windows-1254", + 'iso88599' => "windows-1254", + 'iso_8859-9' => "windows-1254", + 'iso_8859-9:1989' => "windows-1254", + 'l5' => "windows-1254", + 'latin5' => "windows-1254", + 'windows-1254' => "windows-1254", + 'x-cp1254' => "windows-1254", + 'cp1255' => "windows-1255", + 'windows-1255' => "windows-1255", + 'x-cp1255' => "windows-1255", + 'cp1256' => "windows-1256", + 'windows-1256' => "windows-1256", + 'x-cp1256' => "windows-1256", + 'cp1257' => "windows-1257", + 'windows-1257' => "windows-1257", + 'x-cp1257' => "windows-1257", + 'cp1258' => "windows-1258", + 'windows-1258' => "windows-1258", + 'x-cp1258' => "windows-1258", + 'x-mac-cyrillic' => "x-mac-cyrillic", + 'x-mac-ukrainian' => "x-mac-cyrillic", + 'chinese' => "GBK", + 'csgb2312' => "GBK", + 'csiso58gb231280' => "GBK", + 'gb2312' => "GBK", + 'gb_2312' => "GBK", + 'gb_2312-80' => "GBK", + 'gbk' => "GBK", + 'iso-ir-58' => "GBK", + 'x-gbk' => "GBK", + 'gb18030' => "gb18030", + 'big5' => "Big5", + 'big5-hkscs' => "Big5", + 'cn-big5' => "Big5", + 'csbig5' => "Big5", + 'x-x-big5' => "Big5", + 'cseucpkdfmtjapanese' => "EUC-JP", + 'euc-jp' => "EUC-JP", + 'x-euc-jp' => "EUC-JP", + 'csiso2022jp' => "ISO-2022-JP", + 'iso-2022-jp' => "ISO-2022-JP", + 'csshiftjis' => "Shift_JIS", + 'ms932' => "Shift_JIS", + 'ms_kanji' => "Shift_JIS", + 'shift-jis' => "Shift_JIS", + 'shift_jis' => "Shift_JIS", + 'sjis' => "Shift_JIS", + 'windows-31j' => "Shift_JIS", + 'x-sjis' => "Shift_JIS", + 'cseuckr' => "EUC-KR", + 'csksc56011987' => "EUC-KR", + 'euc-kr' => "EUC-KR", + 'iso-ir-149' => "EUC-KR", + 'korean' => "EUC-KR", + 'ks_c_5601-1987' => "EUC-KR", + 'ks_c_5601-1989' => "EUC-KR", + 'ksc5601' => "EUC-KR", + 'ksc_5601' => "EUC-KR", + 'windows-949' => "EUC-KR", + 'csiso2022kr' => "replacement", + 'hz-gb-2312' => "replacement", + 'iso-2022-cn' => "replacement", + 'iso-2022-cn-ext' => "replacement", + 'iso-2022-kr' => "replacement", + 'replacement' => "replacement", + 'unicodefffe' => "UTF-16BE", + 'utf-16be' => "UTF-16BE", + 'csunicode' => "UTF-16LE", + 'iso-10646-ucs-2' => "UTF-16LE", + 'ucs-2' => "UTF-16LE", + 'unicode' => "UTF-16LE", + 'unicodefeff' => "UTF-16LE", + 'utf-16' => "UTF-16LE", + 'utf-16le' => "UTF-16LE", + 'x-user-defined' => "x-user-defined", + ]; + /** @var string The PCRE pattern for a Content-Type HTTP header-field, chopping it up into three main sections */ + const TYPE_PATTERN = <<<'PATTERN' + /^ + [\t\r\n ]* # optional leading whitespace + ([^\/]+) # type + \/ # type-subtype delimiter + ([^;]+) # subtype (possibly with trailing whitespace) + (;.*)? # optional parameters, to be parsed separately + [\t\r\n ]* # optional trailing whitespace + $/sx +PATTERN; + /** @var string The PCRE pattern for an HTTP Content-Type header-field's parameters, splitting them into a series of names and values*/ + const PARAM_PATTERN = <<<'PATTERN' + / + [;\t\r\n ]* # parameter delimiter and leading whitespace, all optional + ([^=;]*) # parameter name; may be empty + (?:= # parameter name-value delimiter + ( + "(?:\\"|[^"])*(?:"|$)[^;]* # quoted parameter value and optional garbage + |[^;]* # unquoted parameter value (possibly with trailing whitespace) + ) + )? + ;? # optional trailing parameter delimiter + [\t\r\n ]* # optional trailing whitespace + /sx +PATTERN; + /** @var string The PCRE pattern for an HTTP token, used to validate the nameof a parameter */ + const TOKEN_PATTERN = '/^[A-Za-z0-9!#$%&\'*+\-\.\^_`|~]+$/s'; + /** @var string The PCRE pattern for a "bare" (unquoted) parameter value, used for validation */ + const BARE_VALUE_PATTERN = '/^[\t\x{20}-\x{7E}\x{80}-\x{FF}]+$/s'; + /** @var string The PCRE pattern for a quoted parameter value, used for validation */ + const QUOTED_VALUE_PATTERN = '/^"((?:\\\"|[\t !\x{23}-\x{7E}\x{80}-\x{FF}])*)(?:"|$)/s'; + /** @var string The PCRE pattern for an escaped character in a quoted parameter value, used to unescape via replacement */ + const ESCAPE_PATTERN = '/\\\(.)/s'; + /** @var string The PCRE pattern for all valid UTF-8 characters */ + const UTF8_PATTERN = <<<'PATTERN' + /^(?: + # Single-byte + [\x{00}-\x{7F}]+ + # Two-byte + |[\x{C2}-\x{DF}][\x{80}-\x{BF}] + # Three-byte excluding surrogates + |\x{E0}[\x{A0}-\x{BF}][\x{80}-\x{BF}] + |\x{ED}[\x{80}-\x{9F}][\x{80}-\x{BF}] + |[\x{E1}-\x{EC}\x{EE}\x{EF}][\x{80}-\x{BF}]{2} + # Four-byte + |\x{F0}[\x{90}-\x{BF}][\x{80}-\x{BF}]{2} + |\x{F4}[\x{80}-\x{8F}][\x{80}-\x{BF}]{2} + |[\x{F1}-\x{F3}][\x{80}-\x{BF}]{3} + )*$/sx +PATTERN; + /** @var array A list of standard encoding labels which DOMDocument either does not know or does not map to the correct encoding; this is a worst-case list taken from PHP 5.6 on Windows with some exclusions for encodings which are completely unsupported */ + const ENCODING_NAUGHTY_LIST = [ + "unicode-1-1-utf-8", "unicode11utf8", "unicode20utf8", "x-unicode20utf8", + "iso88592", "iso88593", "iso88594", "iso88595", "csiso88596e", + "csiso88596i", "iso-8859-6-e", "iso-8859-6-i", "iso88596", "iso88597", + "sun_eu_greek", "csiso88598e", "iso-8859-8-e", "iso88598", "visual", + "csiso88598i", "iso-8859-8-i", "logical", "iso885910", "iso885913", + "iso885914", "csisolatin9", "iso885915", "l9", "koi", "koi8", "koi8_r", + "x-mac-roman", "dos-874", "iso-8859-11", "iso8859-11", "iso885911", + "tis-620", "x-cp1250", "x-cp1251", "ansi_x3.4-1968", "ascii", "cp819", + "csisolatin1", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", + "iso88591", "iso_8859-1", "iso_8859-1:1987", "l1", "latin1", + "us-ascii", "x-cp1252", "x-cp1253", "iso88599", "x-cp1254", + "x-cp1255", "x-cp1256", "x-cp1257", "cp1258", "windows-1258", + "x-mac-ukrainian", "chinese", "csgb2312", "csiso58gb231280", "gb2312", + "gb_2312", "gb_2312-80", "gbk", "iso-ir-58", "big5", "cn-big5", + "csbig5", "x-x-big5", "x-euc-jp", "ms932", "windows-31j", "x-sjis", + "cseuckr", "euc-kr", "x-user-defined", "replacement", + ]; + /** @var array A List of canonical encoding names DOMDocument does not understand, with liases to labels it does understand */ + const ENCODING_ALIAS_MAP = [ + 'windows-1258' => "x-cp1258", + 'GBK' => "x-gbk", + 'Big5' => "big5-hkscs", + 'EUC-KR' => "korean", + 'x-user-defined' => "windows-1256", // this is technically not correct, but x-user-defined is not likely to be used in HTML documents; it is used to represent binary data in JavaScript; windows-1256 is used as a substitute as every byte is assigned to a character, so text can be converted back into bytes with no loss + ]; + /** @var string A UTF-8 byte order mark */ + const BOM_UTF8 = "\xEF\xBB\xBF"; + /** @var string A UTF-16 (big-endian) byte order mark */ + const BOM_UTF16BE = "\xFE\xFF"; + /** @var string A UTF-16 (little-endian) byte order mark */ + const BOM_UTF16LE = "\xFF\xFE"; + /** @var string A PCRE pattern which finds the insertion point inside the HTML element of a document; the pattern takes some shortcuts, but nothing which is likely to affect documents which actually have microformat metadata */ + const HTML_HEADER_PATTERN = <<