From b4aee06110099f686f3d57259ed2fc1674328c5b Mon Sep 17 00:00:00 2001
From: "J. King" <jking@jkingweb.ca>
Date: Fri, 11 Sep 2020 12:16:31 -0400
Subject: [PATCH 01/11] Correctly trim nbsp without using mbstring

---
 Mf2/Parser.php | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/Mf2/Parser.php b/Mf2/Parser.php
index e8e6dda..b0db775 100644
--- a/Mf2/Parser.php
+++ b/Mf2/Parser.php
@@ -111,10 +111,9 @@ function collapseWhitespace($str) {
 }
 
 function unicodeTrim($str) {
-	// this is cheating. TODO: find a better way if this causes any problems
-	$str = str_replace(mb_convert_encoding('&nbsp;', 'UTF-8', 'HTML-ENTITIES'), ' ', $str);
-	$str = preg_replace('/^\s+/', '', $str);
-	return preg_replace('/\s+$/', '', $str);
+	// the binary sequence C2A0 is a UTF-8 non-breaking space character
+	$str = preg_replace('/^(?:\s|\x{C2}\x{A0})+/', '', $str);
+	return preg_replace('/(?:\s|\x{C2}\x{A0})+$/', '', $str);
 }
 
 /**

From 5c7c883d3db5d0b7a064405f20e6f9d442eb41dd Mon Sep 17 00:00:00 2001
From: "J. King" <jking@jkingweb.ca>
Date: Fri, 11 Sep 2020 14:56:14 -0400
Subject: [PATCH 02/11] Remove unnecessary use of mbstring

The classname prefixes are always ASCII, so mb_ functions are not needed
---
 Mf2/Parser.php | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Mf2/Parser.php b/Mf2/Parser.php
index b0db775..8d27b13 100644
--- a/Mf2/Parser.php
+++ b/Mf2/Parser.php
@@ -160,8 +160,8 @@ function nestedMfPropertyNamesFromClass($class) {
 	foreach (explode(' ', $class) as $classname) {
 		foreach ($prefixes as $prefix) {
 			// Check if $classname is a valid property classname for $prefix.
-			if (mb_substr($classname, 0, mb_strlen($prefix)) == $prefix && $classname != $prefix) {
-				$propertyName = mb_substr($classname, mb_strlen($prefix));
+			if (substr($classname, 0, strlen($prefix)) == $prefix && $classname != $prefix) {
+				$propertyName = substr($classname, strlen($prefix));
 				$propertyNames[$propertyName][] = $prefix;
 			}
 		}

From 9c3a5a362cda5cd51baf0111d13566800ee844e5 Mon Sep 17 00:00:00 2001
From: "J. King" <jking@jkingweb.ca>
Date: Fri, 11 Sep 2020 19:06:38 -0400
Subject: [PATCH 03/11] Make mbstring in unicodeToHtmlEntities optional

It's unclear what problems the function was originally meant to solve;
it is equally unclear whether the problems still exist
---
 Mf2/Parser.php | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/Mf2/Parser.php b/Mf2/Parser.php
index 8d27b13..cda6380 100644
--- a/Mf2/Parser.php
+++ b/Mf2/Parser.php
@@ -93,7 +93,12 @@ function fetch($url, $convertClassic = true, &$curlInfo=null) {
  * @return string
  */
 function unicodeToHtmlEntities($input) {
-	return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input));
+	// FIXME: This function purportedly works around UTF-8 decoding problems 
+	// in PHP's DOMDocument class circa 2012. Is this still necessary?
+	if (extension_loaded("mbstring")) {
+		return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input));	
+	}
+	return $input;
 }
 
 /**

From 1965f9afac66d2e9af9e95acca60aede5048fa02 Mon Sep 17 00:00:00 2001
From: "J. King" <jking@jkingweb.ca>
Date: Wed, 7 Apr 2021 12:56:02 -0400
Subject: [PATCH 04/11] Perform charset detection instead of guessing

---
 Mf2/Parser.php                        | 45 +++++++++++++++++++--------
 tests/Mf2/ClassicMicroformatsTest.php |  2 +-
 tests/Mf2/ParseImpliedTest.php        |  2 +-
 tests/Mf2/ParserTest.php              |  6 ++--
 4 files changed, 36 insertions(+), 19 deletions(-)

diff --git a/Mf2/Parser.php b/Mf2/Parser.php
index cda6380..a32cdbe 100644
--- a/Mf2/Parser.php
+++ b/Mf2/Parser.php
@@ -87,18 +87,38 @@ function fetch($url, $convertClassic = true, &$curlInfo=null) {
 	return parse($html, $url, $convertClassic);
 }
 
-/**
- * Unicode to HTML Entities
- * @param string $input String containing characters to convert into HTML entities
- * @return string
- */
-function unicodeToHtmlEntities($input) {
-	// FIXME: This function purportedly works around UTF-8 decoding problems 
-	// in PHP's DOMDocument class circa 2012. Is this still necessary?
-	if (extension_loaded("mbstring")) {
-		return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input));	
+function load_dom_document($input) {
+	$d = new \DOMDocument();
+	@$d->loadHTML($input, \LIBXML_NOWARNING);
+	if (substr($input, 0, 3) === "\xEF\xBB\xBF") {
+		// the document has a byte order mark and is unambiguously UTF-8
+		return $d;
+	}
+	/* Examine each <meta> element in the document to determine whether
+		there is a charset attribute that signals UTF-8 encoding.
+
+		If there is we add a UTF-8 byte order mark to the input and parse
+		the document a second time, yielding a correct document. Otherwise
+		we return the document as-is rather than jump through hoops to
+		correct it (if indeed it is incorrect).
+
+		The truly correct thing to do is to use a modern HTML parser which
+		implements the encoding pre-scan algorithm; this should suffice when
+		such a modern parser is not available, however.
+	*/
+	$charset = false;
+	$labelPattern = '/^\s*(?:unicode-1-1-utf-8|unicode11utf8|unicode20utf8|utf-8|utf8|x-unicode20utf8)\s*$/i'; // See https://encoding.spec.whatwg.org/#names-and-labels
+	foreach ($d->getElementsByTagName("meta") as $e) {
+		if (preg_match($labelPattern, $e->getAttribute("charset"))) {
+			$charset = true;
+			break;
+		}
+	}
+	if ($charset) {
+		$d = new \DOMDocument();
+		@$d->loadHTML("\xEF\xBB\xBF".$input, \LIBXML_NOWARNING);
 	}
-	return $input;
+	return $d;
 }
 
 /**
@@ -365,8 +385,7 @@ public function __construct($input, $url = null, $jsonMode = false) {
 					$doc = new \Masterminds\HTML5(array('disable_html_ns' => true));
 					$doc = $doc->loadHTML($input);
 			} else {
-				$doc = new DOMDocument();
-				@$doc->loadHTML(unicodeToHtmlEntities($input), \LIBXML_NOWARNING);
+				$doc = load_dom_document($input);
 			}
 		} elseif (is_a($input, 'DOMDocument')) {
 			$doc = clone $input;
diff --git a/tests/Mf2/ClassicMicroformatsTest.php b/tests/Mf2/ClassicMicroformatsTest.php
index 0150e08..514800a 100644
--- a/tests/Mf2/ClassicMicroformatsTest.php
+++ b/tests/Mf2/ClassicMicroformatsTest.php
@@ -911,7 +911,7 @@ public function testHEntryRelTagInContent() {
 	 * @see source: http://jg.typepad.com/ciel/2006/02/daniel_bouluds_.html
 	 */
 	public function testHReviewRelTag() {
-		$input = '<div class="hreview">
+		$input = '<meta charset="UTF-8"><div class="hreview">
 <span class="version" style="display:none">0.2</span>
   <h2 class="summary">
     Divine Brunch!
diff --git a/tests/Mf2/ParseImpliedTest.php b/tests/Mf2/ParseImpliedTest.php
index cea18fd..990ca0c 100644
--- a/tests/Mf2/ParseImpliedTest.php
+++ b/tests/Mf2/ParseImpliedTest.php
@@ -167,7 +167,7 @@ public function testParsesImpliedNameWithExplicitURL() {
   }
 
 	public function testMultipleImpliedHCards() {
-		$input = '<span class="h-card">Frances Berriman</span>
+		$input = '<meta charset="UTF-8"><span class="h-card">Frances Berriman</span>
 
 <a class="h-card" href="http://benward.me">Ben Ward</a>
 
diff --git a/tests/Mf2/ParserTest.php b/tests/Mf2/ParserTest.php
index e0f96cc..fb588af 100644
--- a/tests/Mf2/ParserTest.php
+++ b/tests/Mf2/ParserTest.php
@@ -816,10 +816,8 @@ public function testNotMutatingPassedInDOM() {
 
 		// Use same parsing as Parser::__construct(), twice to have a comparison object.
 		libxml_use_internal_errors(true);
-		$refDoc = new \DOMDocument();
-		@$refDoc->loadHTML(Mf2\unicodeToHtmlEntities($input), \LIBXML_NOWARNING);
-		$inputDoc = new \DOMDocument();
-		@$inputDoc->loadHTML(Mf2\unicodeToHtmlEntities($input), \LIBXML_NOWARNING);
+		$refDoc = Mf2\load_dom_document($input);
+		$inputDoc = Mf2\load_dom_document($input);
 
 		// For completion sake, test PHP itself.
 		$this->assertEquals($refDoc, $inputDoc, 'PHP could not create identical DOMDocument instances.');

From 25c8ff46505b627e1b37a99230c66b50269c39c5 Mon Sep 17 00:00:00 2001
From: "J. King" <jking@jkingweb.ca>
Date: Wed, 7 Apr 2021 13:03:02 -0400
Subject: [PATCH 05/11] Fix standard test harness

---
 tests/Mf2/MicroformatsTestSuiteTest.php | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/Mf2/MicroformatsTestSuiteTest.php b/tests/Mf2/MicroformatsTestSuiteTest.php
index 3f0392f..745d2c6 100644
--- a/tests/Mf2/MicroformatsTestSuiteTest.php
+++ b/tests/Mf2/MicroformatsTestSuiteTest.php
@@ -74,6 +74,8 @@ public function testMf2FromTestSuite($input, $expectedOutput, $test)
             $this->markTestIncomplete('This test does not match because we implement a proposed spec: https://github.com/microformats/microformats2-parsing/issues/4#issuecomment-373457720.');
         }
 
+        // Add a byte order mark to the input to ensure encoding is correctly detected
+        $input = "\xEF\xBB\xBF".$input;
         $parser = new TestSuiteParser($input, 'http://example.com/');
         $this->assertEquals(
             $this->makeComparible(json_decode($expectedOutput, true)),

From adf2eb8775fdaeb4779905dfce55ab7cac5f102d Mon Sep 17 00:00:00 2001
From: "J. King" <jking@jkingweb.ca>
Date: Thu, 6 Apr 2023 21:53:13 -0400
Subject: [PATCH 06/11] Perform encoding detection as a browser would

This is the result of extensive testing against PHP 5.6 through
PHP 8.2 in Linux and Windows to determine how encoding detection
actually works in DOMDocument
---
 Mf2/Parser.php | 503 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 477 insertions(+), 26 deletions(-)

diff --git a/Mf2/Parser.php b/Mf2/Parser.php
index 49ecd13..c0398c0 100644
--- a/Mf2/Parser.php
+++ b/Mf2/Parser.php
@@ -7,7 +7,6 @@
 use DOMXPath;
 use DOMNode;
 use DOMNodeList;
-use Exception;
 use SplObjectStorage;
 use stdClass;
 
@@ -87,36 +86,112 @@ function fetch($url, $convertClassic = true, &$curlInfo=null) {
 	return parse($html, $url, $convertClassic);
 }
 
-function load_dom_document($input) {
-	$d = new \DOMDocument();
-	@$d->loadHTML($input, \LIBXML_NOWARNING);
-	if (substr($input, 0, 3) === "\xEF\xBB\xBF") {
-		// the document has a byte order mark and is unambiguously UTF-8
-		return $d;
-	}
-	/* Examine each <meta> element in the document to determine whether
-		there is a charset attribute that signals UTF-8 encoding.
-
-		If there is we add a UTF-8 byte order mark to the input and parse
-		the document a second time, yielding a correct document. Otherwise
-		we return the document as-is rather than jump through hoops to
-		correct it (if indeed it is incorrect).
-
-		The truly correct thing to do is to use a modern HTML parser which
-		implements the encoding pre-scan algorithm; this should suffice when
-		such a modern parser is not available, however.
+function load_dom_document($input, $contentType = "") {
+	/* 
+	The work of this function concerns detecting encoding and making sure
+	DOMDocument honours the correct document encoding. There are
+	multiple complications to this, chiefly that DOMDocument's
+	priorities differ from the modern HTML standard's. HTML mandates the
+	following from most to least authoritative:
+
+	1. Byte order mark
+	2. Protocol-specified encoding
+	3. Meta element in document
+	4. Heuristics (optional)
+	5. Default encoding (usually windows-1252)
+
+	DOMDocument, on the other hand, takes encoding from the following
+	sources from most to least authoritative:
+
+	1. Meta element in document
+	2. Byte order mark
+	3. Default encoding (ISO 8859-1)
+
+	Thus this function finds the most authoritative encoding in the order
+	required by the specification, and then inserts a meta element where 
+	necessary i.e. when the document doesn't already have the correct encoding
+	in a meta element.
+
+	Finally, in the case of UTF-16 input the only reliable indicator is a
+	byte order mark, so we insert one where needed instead of a meta element.
 	*/
-	$charset = false;
-	$labelPattern = '/^\s*(?:unicode-1-1-utf-8|unicode11utf8|unicode20utf8|utf-8|utf8|x-unicode20utf8)\s*$/i'; // See https://encoding.spec.whatwg.org/#names-and-labels
+	$load = function($input) {
+		$d = new DOMDocument();
+		@$d->loadHTML($input, \LIBXML_NOWARNING);
+		return $d;
+	};
+	// parse the document and find the first valid meta element with encoding information
+	// even if this initial parsing uses the wrong encoding, it's more reliable than trying to do this with the string directly
+	$d = $load($input);
+	$meta = "";
+	$effective = "";
 	foreach ($d->getElementsByTagName("meta") as $e) {
-		if (preg_match($labelPattern, $e->getAttribute("charset"))) {
-			$charset = true;
+		$meta = Encoding::matchEncodingLabel($e->getAttribute("charset"));
+		if (strlen($meta)) {
+			$effective = Encoding::matchEncodingLabel($e->getAttribute("charset"), true);
 			break;
 		}
+		if (strtolower(trim($e->getAttribute("http-equiv"))) === "content-type") {
+			$candidate = Encoding::parseContentType($e->getAttribute("content"));
+			$meta = Encoding::matchEncodingLabel($candidate['charset']);
+			if (strlen($meta)) {
+				$effective = Encoding::matchEncodingLabel($candidate['charset'], true);
+				break;
+			}
+		}
 	}
-	if ($charset) {
-		$d = new \DOMDocument();
-		@$d->loadHTML("\xEF\xBB\xBF".$input, \LIBXML_NOWARNING);
+	// now find the authoritative encoding according to the standard HTML order
+	$uthoritative = "";
+	// start by looking for BOMs
+	if (substr($input, 0, 3) === "\xEF\xBB\xBF") {
+		$authoritative = "UTF-8";
+	} elseif (substr($input, 0, 2) === "\xFE\xFF") {
+		$authoritative = "UTF-16BE";
+	} elseif (substr($input, 0, 2) === "\xFF\xFE") {
+		$authoritative = "UTF-16LE";
+	} else {
+		// now parse the HTTP Content-Type looking for an encoding
+		$candidate = Encoding::parseContentType($contentType);
+		$candidate = Encoding::matchEncodingLabel($candidate['charset']);
+		if ($candidate) {
+			$authoritative = $candidate;
+		} elseif ($meta) {
+			// if the document has a valid <meta> element, use that
+			$authoritative = $meta;
+		} elseif (preg_match(Encoding::UTF8_PATTERN, $input)) {
+			// if the string is valid UTF-8, we can use this
+			$authoritative = "UTF-8";
+		} else {
+			// otherwise use the default encoding
+			$authoritative = Encoding::DEFAULT_ENCODING;
+		}
+	}
+	// if the authoritative encoding does not match the encoding currently in
+	//   the document, add an appropriate meta tag or (for UTF-16) a BOM
+	if ($authoritative !== $effective) {
+		// some canonical names are not understood by some environments, so we use aliases where needed
+		if (array_key_exists($authoritative, Encoding::ENCODING_ALIAS_MAP)) {
+			$authoritative = Encoding::ENCODING_ALIAS_MAP[$authoritative];
+		}
+		if ($authoritative === "UTF-16BE") {
+			if (!substr($input, 0, 2) === "\xFE\xFF") {
+				// add a BOM and reparse
+				$d = $load("\xFE\xFF".$input);
+			}
+		} elseif ($authoritative === "UTF-16BLE") {
+			if (!substr($input, 0, 2) === "\xFF\xFE") {
+				// add a BOM and reparse
+				$d = $load("\xFF\xFE".$input);
+			}
+		} else {
+			$offset = 0;
+			// look for a DOCTYPE, possibly preceded by a UTF-8 BOM and/or whitespace; we'll insert our meta tag after this to avoid any quirks mode the parser may have
+			if (preg_match('/^\x{EF}\x{BB}\x{BF}\s*<!DOCTYPE(?:\s+html(?:\s+PUBLIC(?:\s+"[^"]*"|\s+\'[^\']*\'(?:\s+"[^"]*"|\s+\'[^\']*\')?)?)?)?\s*>/is', $input, $match)) {
+				$offset = strlen($match[0]);
+			}
+			$tag = '<meta http-equiv="Content-Type" content="text/html;charset=' . $authoritative . '">';
+			$d = $load(substr($input, 0, $offset) . $tag . substr($input, $offset));
+		}
 	}
 	return $d;
 }
@@ -2452,3 +2527,379 @@ function removeDotSegments($path) {
 
 	return $output;
 }
+
+/** Tools for determining the encoding of documents */
+class Encoding {
+	/** @var string The default encoding to use when none is detected in the document */
+	const DEFAULT_ENCODING = "windows-1252";
+	/** @var array The list of known encodings; see https://encoding.spec.whatwg.org/#names-and-labels */
+	const ENCODINGS = [
+		'unicode-1-1-utf-8' => "UTF-8",
+		'unicode11utf8' => "UTF-8",
+		'unicode20utf8' => "UTF-8",
+		'utf-8' => "UTF-8",
+		'utf8' => "UTF-8",
+		'x-unicode20utf8' => "UTF-8",
+		'866' => "IBM866",
+		'cp866' => "IBM866",
+		'csibm866' => "IBM866",
+		'ibm866' => "IBM866",
+		'csisolatin2' => "ISO-8859-2",
+		'iso-8859-2' => "ISO-8859-2",
+		'iso-ir-101' => "ISO-8859-2",
+		'iso8859-2' => "ISO-8859-2",
+		'iso88592' => "ISO-8859-2",
+		'iso_8859-2' => "ISO-8859-2",
+		'iso_8859-2:1987' => "ISO-8859-2",
+		'l2' => "ISO-8859-2",
+		'latin2' => "ISO-8859-2",
+		'csisolatin3' => "ISO-8859-3",
+		'iso-8859-3' => "ISO-8859-3",
+		'iso-ir-109' => "ISO-8859-3",
+		'iso8859-3' => "ISO-8859-3",
+		'iso88593' => "ISO-8859-3",
+		'iso_8859-3' => "ISO-8859-3",
+		'iso_8859-3:1988' => "ISO-8859-3",
+		'l3' => "ISO-8859-3",
+		'latin3' => "ISO-8859-3",
+		'csisolatin4' => "ISO-8859-4",
+		'iso-8859-4' => "ISO-8859-4",
+		'iso-ir-110' => "ISO-8859-4",
+		'iso8859-4' => "ISO-8859-4",
+		'iso88594' => "ISO-8859-4",
+		'iso_8859-4' => "ISO-8859-4",
+		'iso_8859-4:1988' => "ISO-8859-4",
+		'l4' => "ISO-8859-4",
+		'latin4' => "ISO-8859-4",
+		'csisolatincyrillic' => "ISO-8859-5",
+		'cyrillic' => "ISO-8859-5",
+		'iso-8859-5' => "ISO-8859-5",
+		'iso-ir-144' => "ISO-8859-5",
+		'iso8859-5' => "ISO-8859-5",
+		'iso88595' => "ISO-8859-5",
+		'iso_8859-5' => "ISO-8859-5",
+		'iso_8859-5:1988' => "ISO-8859-5",
+		'arabic' => "ISO-8859-6",
+		'asmo-708' => "ISO-8859-6",
+		'csiso88596e' => "ISO-8859-6",
+		'csiso88596i' => "ISO-8859-6",
+		'csisolatinarabic' => "ISO-8859-6",
+		'ecma-114' => "ISO-8859-6",
+		'iso-8859-6' => "ISO-8859-6",
+		'iso-8859-6-e' => "ISO-8859-6",
+		'iso-8859-6-i' => "ISO-8859-6",
+		'iso-ir-127' => "ISO-8859-6",
+		'iso8859-6' => "ISO-8859-6",
+		'iso88596' => "ISO-8859-6",
+		'iso_8859-6' => "ISO-8859-6",
+		'iso_8859-6:1987' => "ISO-8859-6",
+		'csisolatingreek' => "ISO-8859-7",
+		'ecma-118' => "ISO-8859-7",
+		'elot_928' => "ISO-8859-7",
+		'greek' => "ISO-8859-7",
+		'greek8' => "ISO-8859-7",
+		'iso-8859-7' => "ISO-8859-7",
+		'iso-ir-126' => "ISO-8859-7",
+		'iso8859-7' => "ISO-8859-7",
+		'iso88597' => "ISO-8859-7",
+		'iso_8859-7' => "ISO-8859-7",
+		'iso_8859-7:1987' => "ISO-8859-7",
+		'sun_eu_greek' => "ISO-8859-7",
+		'csiso88598e' => "ISO-8859-8",
+		'csisolatinhebrew' => "ISO-8859-8",
+		'hebrew' => "ISO-8859-8",
+		'iso-8859-8' => "ISO-8859-8",
+		'iso-8859-8-e' => "ISO-8859-8",
+		'iso-ir-138' => "ISO-8859-8",
+		'iso8859-8' => "ISO-8859-8",
+		'iso88598' => "ISO-8859-8",
+		'iso_8859-8' => "ISO-8859-8",
+		'iso_8859-8:1988' => "ISO-8859-8",
+		'visual' => "ISO-8859-8",
+		'csiso88598i' => "ISO-8859-8-I",
+		'iso-8859-8-i' => "ISO-8859-8-I",
+		'logical' => "ISO-8859-8-I",
+		'csisolatin6' => "ISO-8859-10",
+		'iso-8859-10' => "ISO-8859-10",
+		'iso-ir-157' => "ISO-8859-10",
+		'iso8859-10' => "ISO-8859-10",
+		'iso885910' => "ISO-8859-10",
+		'l6' => "ISO-8859-10",
+		'latin6' => "ISO-8859-10",
+		'iso-8859-13' => "ISO-8859-13",
+		'iso8859-13' => "ISO-8859-13",
+		'iso885913' => "ISO-8859-13",
+		'iso-8859-14' => "ISO-8859-14",
+		'iso8859-14' => "ISO-8859-14",
+		'iso885914' => "ISO-8859-14",
+		'csisolatin9' => "ISO-8859-15",
+		'iso-8859-15' => "ISO-8859-15",
+		'iso8859-15' => "ISO-8859-15",
+		'iso885915' => "ISO-8859-15",
+		'iso_8859-15' => "ISO-8859-15",
+		'l9' => "ISO-8859-15",
+		'iso-8859-16' => "ISO-8859-16",
+		'cskoi8r' => "KOI8-R",
+		'koi' => "KOI8-R",
+		'koi8' => "KOI8-R",
+		'koi8-r' => "KOI8-R",
+		'koi8_r' => "KOI8-R",
+		'koi8-ru' => "KOI8-U",
+		'koi8-u' => "KOI8-U",
+		'csmacintosh' => "macintosh",
+		'mac' => "macintosh",
+		'macintosh' => "macintosh",
+		'x-mac-roman' => "macintosh",
+		'dos-874' => "windows-874",
+		'iso-8859-11' => "windows-874",
+		'iso8859-11' => "windows-874",
+		'iso885911' => "windows-874",
+		'tis-620' => "windows-874",
+		'windows-874' => "windows-874",
+		'cp1250' => "windows-1250",
+		'windows-1250' => "windows-1250",
+		'x-cp1250' => "windows-1250",
+		'cp1251' => "windows-1251",
+		'windows-1251' => "windows-1251",
+		'x-cp1251' => "windows-1251",
+		'ansi_x3.4-1968' => "windows-1252",
+		'ascii' => "windows-1252",
+		'cp1252' => "windows-1252",
+		'cp819' => "windows-1252",
+		'csisolatin1' => "windows-1252",
+		'ibm819' => "windows-1252",
+		'iso-8859-1' => "windows-1252",
+		'iso-ir-100' => "windows-1252",
+		'iso8859-1' => "windows-1252",
+		'iso88591' => "windows-1252",
+		'iso_8859-1' => "windows-1252",
+		'iso_8859-1:1987' => "windows-1252",
+		'l1' => "windows-1252",
+		'latin1' => "windows-1252",
+		'us-ascii' => "windows-1252",
+		'windows-1252' => "windows-1252",
+		'x-cp1252' => "windows-1252",
+		'cp1253' => "windows-1253",
+		'windows-1253' => "windows-1253",
+		'x-cp1253' => "windows-1253",
+		'cp1254' => "windows-1254",
+		'csisolatin5' => "windows-1254",
+		'iso-8859-9' => "windows-1254",
+		'iso-ir-148' => "windows-1254",
+		'iso8859-9' => "windows-1254",
+		'iso88599' => "windows-1254",
+		'iso_8859-9' => "windows-1254",
+		'iso_8859-9:1989' => "windows-1254",
+		'l5' => "windows-1254",
+		'latin5' => "windows-1254",
+		'windows-1254' => "windows-1254",
+		'x-cp1254' => "windows-1254",
+		'cp1255' => "windows-1255",
+		'windows-1255' => "windows-1255",
+		'x-cp1255' => "windows-1255",
+		'cp1256' => "windows-1256",
+		'windows-1256' => "windows-1256",
+		'x-cp1256' => "windows-1256",
+		'cp1257' => "windows-1257",
+		'windows-1257' => "windows-1257",
+		'x-cp1257' => "windows-1257",
+		'cp1258' => "windows-1258",
+		'windows-1258' => "windows-1258",
+		'x-cp1258' => "windows-1258",
+		'x-mac-cyrillic' => "x-mac-cyrillic",
+		'x-mac-ukrainian' => "x-mac-cyrillic",
+		'chinese' => "GBK",
+		'csgb2312' => "GBK",
+		'csiso58gb231280' => "GBK",
+		'gb2312' => "GBK",
+		'gb_2312' => "GBK",
+		'gb_2312-80' => "GBK",
+		'gbk' => "GBK",
+		'iso-ir-58' => "GBK",
+		'x-gbk' => "GBK",
+		'gb18030' => "gb18030",
+		'big5' => "Big5",
+		'big5-hkscs' => "Big5",
+		'cn-big5' => "Big5",
+		'csbig5' => "Big5",
+		'x-x-big5' => "Big5",
+		'cseucpkdfmtjapanese' => "EUC-JP",
+		'euc-jp' => "EUC-JP",
+		'x-euc-jp' => "EUC-JP",
+		'csiso2022jp' => "ISO-2022-JP",
+		'iso-2022-jp' => "ISO-2022-JP",
+		'csshiftjis' => "Shift_JIS",
+		'ms932' => "Shift_JIS",
+		'ms_kanji' => "Shift_JIS",
+		'shift-jis' => "Shift_JIS",
+		'shift_jis' => "Shift_JIS",
+		'sjis' => "Shift_JIS",
+		'windows-31j' => "Shift_JIS",
+		'x-sjis' => "Shift_JIS",
+		'cseuckr' => "EUC-KR",
+		'csksc56011987' => "EUC-KR",
+		'euc-kr' => "EUC-KR",
+		'iso-ir-149' => "EUC-KR",
+		'korean' => "EUC-KR",
+		'ks_c_5601-1987' => "EUC-KR",
+		'ks_c_5601-1989' => "EUC-KR",
+		'ksc5601' => "EUC-KR",
+		'ksc_5601' => "EUC-KR",
+		'windows-949' => "EUC-KR",
+		'csiso2022kr' => "replacement",
+		'hz-gb-2312' => "replacement",
+		'iso-2022-cn' => "replacement",
+		'iso-2022-cn-ext' => "replacement",
+		'iso-2022-kr' => "replacement",
+		'replacement' => "replacement",
+		'unicodefffe' => "UTF-16BE",
+		'utf-16be' => "UTF-16BE",
+		'csunicode' => "UTF-16LE",
+		'iso-10646-ucs-2' => "UTF-16LE",
+		'ucs-2' => "UTF-16LE",
+		'unicode' => "UTF-16LE",
+		'unicodefeff' => "UTF-16LE",
+		'utf-16' => "UTF-16LE",
+		'utf-16le' => "UTF-16LE",
+		'x-user-defined' => "x-user-defined",
+	];
+	/** @var string The PCRE pattern for a Content-Type HTTP header-field, chopping it up into three main sections */
+	const TYPE_PATTERN = <<<'PATTERN'
+	/^
+		[\t\r\n ]*                              # optional leading whitespace
+		([^\/]+)                                # type  
+		\/                                      # type-subtype delimiter
+		([^;]+)                                 # subtype (possibly with trailing whitespace)
+		(;.*)?                                  # optional parameters, to be parsed separately
+		[\t\r\n ]*                              # optional trailing whitespace
+	$/sx
+PATTERN;
+	/** @var string  The PCRE pattern for an HTTP Content-Type header-field's parameters, splitting them into a series of names and values*/
+	const PARAM_PATTERN = <<<'PATTERN'
+	/
+		[;\t\r\n ]*                             # parameter delimiter and leading whitespace, all optional
+		([^=;]*)                                # parameter name; may be empty
+		(?:=                                    # parameter name-value delimiter
+			(
+				"(?:\\"|[^"])*(?:"|$)[^;]*      # quoted parameter value and optional garbage
+				|[^;]*                          # unquoted parameter value (possibly with trailing whitespace)
+			)
+		)?
+		;?                                      # optional trailing parameter delimiter
+		[\t\r\n ]*                              # optional trailing whitespace
+	/sx
+PATTERN;
+	/** @var string The PCRE pattern for an HTTP token, used to validate the nameof a parameter */
+	const TOKEN_PATTERN = '/^[A-Za-z0-9!#$%&\'*+\-\.\^_`|~]+$/s';
+	/** @var string The PCRE pattern for a "bare" (unquoted) parameter value, used for validation */
+	const BARE_VALUE_PATTERN = '/^[\t\x{20}-\x{7E}\x{80}-\x{FF}]+$/s';
+	/** @var string The PCRE pattern for a quoted parameter value, used for validation */
+	const QUOTED_VALUE_PATTERN = '/^"((?:\\\"|[\t !\x{23}-\x{7E}\x{80}-\x{FF}])*)(?:"|$)/s';
+	/** @var string The PCRE pattern for an escaped character in a quoted parameter value, used to unescape via replacement */
+	const ESCAPE_PATTERN = '/\\\(.)/s';
+	/** @var string The PCRE pattern for all valid UTF-8 characters */
+	const UTF8_PATTERN = <<<'PATTERN'
+	/^(?:
+		# Single-byte
+		[\x{00}-\x{7F}]+
+		# Two-byte
+		|[\x{C2}-\x{DF}][\x{80}-\x{BF}]
+		# Three-byte excluding surrogates
+		|\x{E0}[\x{A0}-\x{BF}][\x{80}-\x{BF}]
+		|\x{ED}[\x{80}-\x{9F}][\x{80}-\x{BF}]
+		|[\x{E1}-\x{EC}\x{EE}\x{EF}][\x{80}-\x{BF}]{2}
+		# Four-byte
+		|\x{F0}[\x{90}-\x{BF}][\x{80}-\x{BF}]{2}
+		|\x{F4}[\x{80}-\x{8F}][\x{80}-\x{BF}]{2}
+		|[\x{F1}-\x{F3}][\x{80}-\x{BF}]{3}
+	)*$/sx
+PATTERN;
+	/** @var array A list of standard encoding labels which DOMDocument either does not know or does not map to the correct encoding; this is a worst-case list taken from PHP 5.6 on Windows with some exclusions for encodings which are completely unsupported */
+	const ENCODING_NAUGHTY_LIST = [
+		"unicode-1-1-utf-8", "unicode11utf8", "unicode20utf8", "x-unicode20utf8",
+		"iso88592", "iso88593", "iso88594", "iso88595", "csiso88596e",
+		"csiso88596i", "iso-8859-6-e", "iso-8859-6-i", "iso88596", "iso88597",
+		"sun_eu_greek", "csiso88598e", "iso-8859-8-e", "iso88598", "visual",
+		"csiso88598i", "iso-8859-8-i", "logical", "iso885910", "iso885913",
+		"iso885914", "csisolatin9", "iso885915", "l9", "koi", "koi8", "koi8_r",
+		"x-mac-roman", "dos-874", "iso-8859-11", "iso8859-11", "iso885911",
+		"tis-620", "x-cp1250", "x-cp1251", "ansi_x3.4-1968", "ascii", "cp819",
+		"csisolatin1", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1",
+		"iso88591", "iso_8859-1", "iso_8859-1:1987", "l1", "latin1",
+		"us-ascii", "x-cp1252", "x-cp1253", "iso88599", "x-cp1254",
+		"x-cp1255", "x-cp1256", "x-cp1257", "cp1258", "windows-1258",
+		"x-mac-ukrainian", "chinese", "csgb2312", "csiso58gb231280", "gb2312",
+		"gb_2312", "gb_2312-80", "gbk", "iso-ir-58", "big5", "cn-big5",
+		"csbig5", "x-x-big5", "x-euc-jp", "ms932", "windows-31j", "x-sjis",
+		"cseuckr", "euc-kr"
+	];
+	/** @var array A List of canonical encoding names DOMDocument does not understand, with liases to labels it does understand */
+	const ENCODING_ALIAS_MAP = [
+		'windows-1258' => "x-cp1258",
+		'GBK'          => "x-gbk",
+		'Big5'         => "big5-hkscs",
+		'EUC-KR'       => "korean",
+	];
+
+	/** Matches an encoding label to a known encoding name in the WHATWG encoding list
+	 * 
+	 * If the label does not match any known encoding, and empty string is returned.
+	 * 
+	 * @param string $label The label to match e.g. "utf8", " FOO_bar "
+	 * @param bool $excludeNaughty Whether or not to exclude labels DOMDocument does not understand
+	 * @return string The canonical name of the encoding if matched (e.g. "UTF-8"), or an empty string in case of failure 
+	 */
+	public static function matchEncodingLabel($label, $excludeNaughty = false) {
+		$label = strtolower(trim($label));
+		if ($excludeNaughty && in_array($label, self::ENCODING_NAUGHTY_LIST)) {
+			return "";
+		}
+		if (array_key_exists($label, self::ENCODINGS)) {
+			return self::ENCODINGS[$label];
+		}
+		return "";
+	}
+
+	/** Parses a Content-Type header-field to extract the type and charset, if any
+	 * 
+	 * @param string $type
+	 * @return array An array containing keys for "type" and "charset"
+	 */
+	public static function parseContentType($type) {
+		$out = [
+			'type' => "",
+			'charset' => "",
+		];
+		if (preg_match(self::TYPE_PATTERN, $type, $match)) {
+			list($mimeType, $type, $subtype, $params) = array_pad($match, 4, "");
+			$type = preg_match(self::TOKEN_PATTERN, $type) ? $type : "";
+			$subtype = rtrim($subtype, "\t\r\n ");
+			$subtype = preg_match(self::TOKEN_PATTERN, $subtype) ? $subtype : ""; 
+			if (strlen($type) && strlen($subtype)) {
+				$out['type'] = strtolower($type) . "/" . strtolower($subtype);
+				// parse parameters looking for a charset one
+				if (preg_match_all(self::PARAM_PATTERN, $params, $matches, \PREG_SET_ORDER)) {
+					foreach ($matches as $match) {
+						list($param, $name, $value) = array_pad($match, 3, "");
+						$name = strtolower(preg_match(self::TOKEN_PATTERN, $name) ? $name : "");
+						if ($name === "charset" && strlen($value)) {
+							if ($value[0] === '"') {
+								if (preg_match(self::QUOTED_VALUE_PATTERN, $value, $match)) {
+									$out['charset'] = preg_replace(self::ESCAPE_PATTERN, '$1', $match[1]);
+									return $out;
+								}
+							} else {
+								$value = rtrim($value, "\t\r\n ");
+								if (preg_match(self::BARE_VALUE_PATTERN, $value)) {
+									$out['charset'] = $value;
+									return $out;
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+		return $out;
+	}
+}

From 04b947f5d2248117e7092ac419065ebc448efb20 Mon Sep 17 00:00:00 2001
From: "J. King" <jking@jkingweb.ca>
Date: Thu, 6 Apr 2023 21:54:32 -0400
Subject: [PATCH 07/11] Revert changes to tests which are not needed

As UTF-8 is now heuristically detected, no explicit encoding declaration
is required
---
 tests/Mf2/ClassicMicroformatsTest.php   | 2 +-
 tests/Mf2/MicroformatsTestSuiteTest.php | 2 --
 tests/Mf2/ParseImpliedTest.php          | 2 +-
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/Mf2/ClassicMicroformatsTest.php b/tests/Mf2/ClassicMicroformatsTest.php
index 89579e7..5b192c3 100644
--- a/tests/Mf2/ClassicMicroformatsTest.php
+++ b/tests/Mf2/ClassicMicroformatsTest.php
@@ -911,7 +911,7 @@ public function testHEntryRelTagInContent() {
 	 * @see source: http://jg.typepad.com/ciel/2006/02/daniel_bouluds_.html
 	 */
 	public function testHReviewRelTag() {
-		$input = '<meta charset="UTF-8"><div class="hreview">
+		$input = '<div class="hreview">
 <span class="version" style="display:none">0.2</span>
   <h2 class="summary">
     Divine Brunch!
diff --git a/tests/Mf2/MicroformatsTestSuiteTest.php b/tests/Mf2/MicroformatsTestSuiteTest.php
index 6126f66..855c1fd 100644
--- a/tests/Mf2/MicroformatsTestSuiteTest.php
+++ b/tests/Mf2/MicroformatsTestSuiteTest.php
@@ -76,8 +76,6 @@ public function testMf2FromTestSuite($input, $expectedOutput, $test)
             $this->markTestIncomplete('This test does not match because we implement a proposed spec: https://github.com/microformats/microformats2-parsing/issues/4#issuecomment-373457720.');
         }
 
-        // Add a byte order mark to the input to ensure encoding is correctly detected
-        $input = "\xEF\xBB\xBF".$input;
         $parser = new TestSuiteParser($input, 'http://example.com/');
         $this->assertEquals(
             $this->makeComparible(json_decode($expectedOutput, true)),
diff --git a/tests/Mf2/ParseImpliedTest.php b/tests/Mf2/ParseImpliedTest.php
index 7195581..d806c1a 100644
--- a/tests/Mf2/ParseImpliedTest.php
+++ b/tests/Mf2/ParseImpliedTest.php
@@ -166,7 +166,7 @@ public function testParsesImpliedNameWithExplicitURL() {
   }
 
 	public function testMultipleImpliedHCards() {
-		$input = '<meta charset="UTF-8"><span class="h-card">Frances Berriman</span>
+		$input = '<span class="h-card">Frances Berriman</span>
 
 <a class="h-card" href="http://benward.me">Ben Ward</a>
 

From 69825b6e5e6f2fb0b43bb2e03aef2f6a9e91ab8b Mon Sep 17 00:00:00 2001
From: "J. King" <jking@jkingweb.ca>
Date: Fri, 7 Apr 2023 16:52:39 -0400
Subject: [PATCH 08/11] Consider BOMs correctly

---
 Mf2/Parser.php | 71 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 47 insertions(+), 24 deletions(-)

diff --git a/Mf2/Parser.php b/Mf2/Parser.php
index c0398c0..9322c82 100644
--- a/Mf2/Parser.php
+++ b/Mf2/Parser.php
@@ -103,9 +103,10 @@ function load_dom_document($input, $contentType = "") {
 	DOMDocument, on the other hand, takes encoding from the following
 	sources from most to least authoritative:
 
-	1. Meta element in document
-	2. Byte order mark
-	3. Default encoding (ISO 8859-1)
+	1. UTF-16 byte order mark
+	2. Meta element in document
+	3. UTF-8 byte order mark
+	4. Default encoding (ISO 8859-1)
 
 	Thus this function finds the most authoritative encoding in the order
 	required by the specification, and then inserts a meta element where 
@@ -120,34 +121,50 @@ function load_dom_document($input, $contentType = "") {
 		@$d->loadHTML($input, \LIBXML_NOWARNING);
 		return $d;
 	};
-	// parse the document and find the first valid meta element with encoding information
-	// even if this initial parsing uses the wrong encoding, it's more reliable than trying to do this with the string directly
-	$d = $load($input);
 	$meta = "";
 	$effective = "";
-	foreach ($d->getElementsByTagName("meta") as $e) {
-		$meta = Encoding::matchEncodingLabel($e->getAttribute("charset"));
-		if (strlen($meta)) {
-			$effective = Encoding::matchEncodingLabel($e->getAttribute("charset"), true);
-			break;
-		}
-		if (strtolower(trim($e->getAttribute("http-equiv"))) === "content-type") {
-			$candidate = Encoding::parseContentType($e->getAttribute("content"));
-			$meta = Encoding::matchEncodingLabel($candidate['charset']);
+	// determine the encoding which DOMDocument has detected from the document, starting by looking for a UTF-16 BOM
+	if (substr($input, 0, 2) === Encoding::BOM_UTF16BE) {
+		$effective = "UTF-16BE";
+	} elseif (substr($input, 0, 2) === Encoding::BOM_UTF16LE) {
+		$effective = "UTF-16LE";
+	} else {
+		// parse the document and find the first valid meta element with 
+		//   encoding information; even if this initial parsing uses the
+		//   wrong encoding, it's more reliable than trying to do this
+		//   with the string directly
+		// we keep note of both what DOMDocument understands (the $effective
+		//   encoding), as well as what the standard says is valid (the $meta
+		//   encoding) as the two can differ
+		$d = $load($input);
+		foreach ($d->getElementsByTagName("meta") as $e) {
+			$meta = Encoding::matchEncodingLabel($e->getAttribute("charset"));
 			if (strlen($meta)) {
-				$effective = Encoding::matchEncodingLabel($candidate['charset'], true);
+				$effective = Encoding::matchEncodingLabel($e->getAttribute("charset"), true);
 				break;
 			}
+			if (strtolower(trim($e->getAttribute("http-equiv"))) === "content-type") {
+				$candidate = Encoding::parseContentType($e->getAttribute("content"));
+				$meta = Encoding::matchEncodingLabel($candidate['charset']);
+				if (strlen($meta)) {
+					$effective = Encoding::matchEncodingLabel($candidate['charset'], true);
+					break;
+				}
+			}
+		}
+		// if no effective encoding was found, check for a UTF-8 BOM
+		if (!$effective && substr($input, 0, 3) === Encoding::BOM_UTF8) {
+			$effective = "UTF-8";
 		}
 	}
 	// now find the authoritative encoding according to the standard HTML order
 	$uthoritative = "";
 	// start by looking for BOMs
-	if (substr($input, 0, 3) === "\xEF\xBB\xBF") {
+	if (substr($input, 0, 3) === Encoding::BOM_UTF8) {
 		$authoritative = "UTF-8";
-	} elseif (substr($input, 0, 2) === "\xFE\xFF") {
+	} elseif (substr($input, 0, 2) === Encoding::BOM_UTF16BE) {
 		$authoritative = "UTF-16BE";
-	} elseif (substr($input, 0, 2) === "\xFF\xFE") {
+	} elseif (substr($input, 0, 2) === Encoding::BOM_UTF16LE) {
 		$authoritative = "UTF-16LE";
 	} else {
 		// now parse the HTTP Content-Type looking for an encoding
@@ -174,14 +191,14 @@ function load_dom_document($input, $contentType = "") {
 			$authoritative = Encoding::ENCODING_ALIAS_MAP[$authoritative];
 		}
 		if ($authoritative === "UTF-16BE") {
-			if (!substr($input, 0, 2) === "\xFE\xFF") {
+			if (!substr($input, 0, 2) === Encoding::BOM_UTF16BE) {
 				// add a BOM and reparse
-				$d = $load("\xFE\xFF".$input);
+				$d = $load(Encoding::BOM_UTF16BE.$input);
 			}
-		} elseif ($authoritative === "UTF-16BLE") {
-			if (!substr($input, 0, 2) === "\xFF\xFE") {
+		} elseif ($authoritative === "UTF-16LE") {
+			if (!substr($input, 0, 2) === Encoding::BOM_UTF16LE) {
 				// add a BOM and reparse
-				$d = $load("\xFF\xFE".$input);
+				$d = $load(Encoding::BOM_UTF16LE.$input);
 			}
 		} else {
 			$offset = 0;
@@ -2840,6 +2857,12 @@ class Encoding {
 		'Big5'         => "big5-hkscs",
 		'EUC-KR'       => "korean",
 	];
+	/** @var string A UTF-8 byte order mark */
+	const BOM_UTF8 = "\xEF\xBB\xBF";
+	/** @var string A UTF-16 (big-endian) byte order mark */
+	const BOM_UTF16BE = "\xFE\xFF";
+	/** @var string A UTF-16 (little-endian) byte order mark */
+	const BOM_UTF16LE = "\xFF\xFE";
 
 	/** Matches an encoding label to a known encoding name in the WHATWG encoding list
 	 * 

From 9e93f7f940c5ba43e451069a2452136812190242 Mon Sep 17 00:00:00 2001
From: "J. King" <jking@jkingweb.ca>
Date: Sat, 8 Apr 2023 11:16:14 -0400
Subject: [PATCH 09/11] Add <meta> after <head> rather than after DOCTYPE

---
 Mf2/Parser.php | 52 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 50 insertions(+), 2 deletions(-)

diff --git a/Mf2/Parser.php b/Mf2/Parser.php
index 9322c82..92209bc 100644
--- a/Mf2/Parser.php
+++ b/Mf2/Parser.php
@@ -202,8 +202,11 @@ function load_dom_document($input, $contentType = "") {
 			}
 		} else {
 			$offset = 0;
-			// look for a DOCTYPE, possibly preceded by a UTF-8 BOM and/or whitespace; we'll insert our meta tag after this to avoid any quirks mode the parser may have
-			if (preg_match('/^\x{EF}\x{BB}\x{BF}\s*<!DOCTYPE(?:\s+html(?:\s+PUBLIC(?:\s+"[^"]*"|\s+\'[^\']*\'(?:\s+"[^"]*"|\s+\'[^\']*\')?)?)?)?\s*>/is', $input, $match)) {
+			// look for an HTML <head> element and insert the meta tag within
+			//   it, or at the nearest appropriate point; if the tag is 
+			//   inserted too soon, attributes on <head> or <html> (which
+			//   may hold mf data) can be stripped
+			if (preg_match(Encoding::HTML_HEADER_PATTERN, $input, $match)) {
 				$offset = strlen($match[0]);
 			}
 			$tag = '<meta http-equiv="Content-Type" content="text/html;charset=' . $authoritative . '">';
@@ -2863,6 +2866,51 @@ class Encoding {
 	const BOM_UTF16BE = "\xFE\xFF";
 	/** @var string A UTF-16 (little-endian) byte order mark */
 	const BOM_UTF16LE = "\xFF\xFE";
+	/** @var string A PCRE pattern which finds the insertion point inside the HTML <head> element of a document; the pattern takes some shortcuts, but nothing which is likely to affect documents which actually have microformat metadata */
+	const HTML_HEADER_PATTERN = <<<PATTERN
+	/^(?:\x{EF}\x{BB}\x{BF})?							# Optional UTF-8 BOM at start of string
+	(?:													# Followed by...
+		\s+												# Whitespace or
+		|<!DOCTYPE[^>]*>								# DOCTYPE or
+		|<!--(?:-?>|(?:[^-]|-(?!->))*-->)				# Comment or
+		|<\?[^>]*>										# Processing instruction or XML declaration
+	)*													# ... zero or more times
+	(?:<html											# Followed by a possible <html> start tag
+		(?:\s+											# ... with possible attributes
+			(?:\s*[^\s=>]*
+				(?:=
+					(?:
+						"[^"]*"
+						|'[^']*'
+						|[^ >]*
+					)?
+				)?
+			)*
+		)?
+		\/?\s*>
+	)?
+	(?:													# Followed by...
+		\s+												# Whitespace or
+		|<!DOCTYPE[^>]*>								# DOCTYPE or
+		|<!--(?:-?>|(?:[^-]|-(?!->))*-->)				# Comment or
+		|<\?[^>]*>										# Processing instruction or XML declaration
+	)*													# ... zero or more times
+	(?:<head											# Followed by a possible <head> start tag
+		(?:\s+											# ... with possible attributes
+			(?:\s*[^\s=>]*
+				(?:=
+					(?:
+						"[^"]*"
+						|'[^']*'
+						|[^ >]*
+					)?
+				)?
+			)*
+		)?
+		\/?\s*>
+	)?
+	/six
+PATTERN;
 
 	/** Matches an encoding label to a known encoding name in the WHATWG encoding list
 	 * 

From f3cbf76944dd6ea48f95650c9853b4063710bb01 Mon Sep 17 00:00:00 2001
From: "J. King" <jking@jkingweb.ca>
Date: Sat, 8 Apr 2023 13:32:42 -0400
Subject: [PATCH 10/11] Handle x-user-defined and replacement

Neither is likely to appear in a real HTML document; x-user-defined is
not actually supported properly, but it's unlikely to be used for HTML
---
 Mf2/Parser.php | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/Mf2/Parser.php b/Mf2/Parser.php
index 92209bc..a6ad4fa 100644
--- a/Mf2/Parser.php
+++ b/Mf2/Parser.php
@@ -191,15 +191,18 @@ function load_dom_document($input, $contentType = "") {
 			$authoritative = Encoding::ENCODING_ALIAS_MAP[$authoritative];
 		}
 		if ($authoritative === "UTF-16BE") {
-			if (!substr($input, 0, 2) === Encoding::BOM_UTF16BE) {
+			if (substr($input, 0, 2) !== Encoding::BOM_UTF16BE) {
 				// add a BOM and reparse
 				$d = $load(Encoding::BOM_UTF16BE.$input);
 			}
 		} elseif ($authoritative === "UTF-16LE") {
-			if (!substr($input, 0, 2) === Encoding::BOM_UTF16LE) {
+			if (substr($input, 0, 2) !== Encoding::BOM_UTF16LE) {
 				// add a BOM and reparse
 				$d = $load(Encoding::BOM_UTF16LE.$input);
 			}
+		} elseif ($authoritative === "replacement") {
+			// the replacement encoding is actually a denylist of problematic encodings; the document is reduced to a single replacement character
+			$d = $load(Encoding::BOM_UTF8."\xEF\xBF\xBD");
 		} else {
 			$offset = 0;
 			// look for an HTML <head> element and insert the meta tag within
@@ -2851,14 +2854,15 @@ class Encoding {
 		"x-mac-ukrainian", "chinese", "csgb2312", "csiso58gb231280", "gb2312",
 		"gb_2312", "gb_2312-80", "gbk", "iso-ir-58", "big5", "cn-big5",
 		"csbig5", "x-x-big5", "x-euc-jp", "ms932", "windows-31j", "x-sjis",
-		"cseuckr", "euc-kr"
+		"cseuckr", "euc-kr", "x-user-defined", "replacement",
 	];
 	/** @var array A List of canonical encoding names DOMDocument does not understand, with liases to labels it does understand */
 	const ENCODING_ALIAS_MAP = [
 		'windows-1258' => "x-cp1258",
-		'GBK'          => "x-gbk",
-		'Big5'         => "big5-hkscs",
-		'EUC-KR'       => "korean",
+		'GBK' => "x-gbk",
+		'Big5' => "big5-hkscs",
+		'EUC-KR' => "korean",
+		'x-user-defined' => "windows-1256", // this is technically not correct, but x-user-defined is not likely to be used in HTML documents; it is used to represent binary data in JavaScript; windows-1256 is used as a substitute as every byte is assigned to a character, so text can be converted back into bytes with no loss
 	];
 	/** @var string A UTF-8 byte order mark */
 	const BOM_UTF8 = "\xEF\xBB\xBF";

From 10dda4659e2c873a046161e604393ae2d37917a7 Mon Sep 17 00:00:00 2001
From: "J. King" <jking@jkingweb.ca>
Date: Sun, 16 Apr 2023 19:13:51 -0400
Subject: [PATCH 11/11] Add tests and correct two bugs

All new lines are covered, and an effort was made to cover as many
permutations of PCRE matches as practical
---
 Mf2/Parser.php             |   8 ++-
 tests/Mf2/EncodingTest.php | 123 +++++++++++++++++++++++++++++++++++++
 2 files changed, 128 insertions(+), 3 deletions(-)
 create mode 100644 tests/Mf2/EncodingTest.php

diff --git a/Mf2/Parser.php b/Mf2/Parser.php
index a6ad4fa..b3ba637 100644
--- a/Mf2/Parser.php
+++ b/Mf2/Parser.php
@@ -121,6 +121,9 @@ function load_dom_document($input, $contentType = "") {
 		@$d->loadHTML($input, \LIBXML_NOWARNING);
 		return $d;
 	};
+	// perform an initial parsing of the document. We may parse it again
+	//   if the encoding is wrong, or use the result if everything is correct
+	$d = $load($input);
 	$meta = "";
 	$effective = "";
 	// determine the encoding which DOMDocument has detected from the document, starting by looking for a UTF-16 BOM
@@ -129,14 +132,13 @@ function load_dom_document($input, $contentType = "") {
 	} elseif (substr($input, 0, 2) === Encoding::BOM_UTF16LE) {
 		$effective = "UTF-16LE";
 	} else {
-		// parse the document and find the first valid meta element with 
+		// find the first valid meta element in the document with 
 		//   encoding information; even if this initial parsing uses the
 		//   wrong encoding, it's more reliable than trying to do this
 		//   with the string directly
 		// we keep note of both what DOMDocument understands (the $effective
 		//   encoding), as well as what the standard says is valid (the $meta
 		//   encoding) as the two can differ
-		$d = $load($input);
 		foreach ($d->getElementsByTagName("meta") as $e) {
 			$meta = Encoding::matchEncodingLabel($e->getAttribute("charset"));
 			if (strlen($meta)) {
@@ -2925,7 +2927,7 @@ class Encoding {
 	 * @return string The canonical name of the encoding if matched (e.g. "UTF-8"), or an empty string in case of failure 
 	 */
 	public static function matchEncodingLabel($label, $excludeNaughty = false) {
-		$label = strtolower(trim($label));
+		$label = strtolower(trim($label, " \t\r\n\x0C")); // see https://infra.spec.whatwg.org/#ascii-whitespace
 		if ($excludeNaughty && in_array($label, self::ENCODING_NAUGHTY_LIST)) {
 			return "";
 		}
diff --git a/tests/Mf2/EncodingTest.php b/tests/Mf2/EncodingTest.php
new file mode 100644
index 0000000..5e13ddf
--- /dev/null
+++ b/tests/Mf2/EncodingTest.php
@@ -0,0 +1,123 @@
+<?php
+
+namespace Mf2\Parser\Test;
+
+use Mf2\Encoding;
+use Yoast\PHPUnitPolyfills\TestCases\TestCase;
+use function Mf2\load_dom_document;
+
+class EncodingTest extends TestCase {
+	/** Checks that parseContentType() returns an array with the expected "type" and "charset" entries
+	 * @dataProvider provideContentTypes
+	 * @covers Mf2\Encoding::parseContentType
+	 */
+	public function testParseContentType($in, $type, $charset) {
+		$act = Encoding::parseContentType($in);
+		$this->assertIsArray($act);
+		$this->assertArrayHasKey("type", $act);
+		$this->assertArrayHasKey("charset", $act);
+		$this->assertSame($type, $act['type'], "Content-Type did not match");
+		$this->assertSame($charset, $act['charset'], "Charset did not match");
+	}
+
+	public static function provideContentTypes() {
+		return [ // each row: Content-Type input, expected type, expected charset
+			["a/b",								"a/b",			""],
+			["A/B",								"a/b",			""],
+			[" \t\r\na/b \t\r\n",				"a/b",			""],
+			["a/b ; \t\r\n;",					"a/b",			""],
+			["a/b;a=b ; \t\r\n;",				"a/b",			""],
+			["a/b;a=b;charset=foo"	,			"a/b",			"foo"],
+			["a/b;a=b;charset=foo;charset=bar",	"a/b",			"foo"],
+			["a/ b;charset=foo",				"",				""],
+			["a/b;charset=FOO",					"a/b",			"FOO"],
+			['a/b;charset="foo"',				"a/b",			"foo"],
+			['a/b;charset="foo bar"',			"a/b",			"foo bar"],
+			['a/b;charset="foo\\"bar"',			"a/b",			'foo"bar'],
+			['a/b;a="foo"bar;charset=baz',		"a/b",			'baz'],
+			["text/html;charset=UTF-8",         "text/html",	"UTF-8"],
+			["",								"",				""],
+		];
+	}
+
+	/** Checks that matchEncodingLabel() returns the expected encoding name, or an empty string when there is no match
+	 * @dataProvider provideEncodingLabels
+	 * @covers Mf2\Encoding::matchEncodingLabel
+	 */
+	public function testMatchEncodingLabels($in, $excludeNaughty, $exp) {
+		$this->assertSame($exp, Encoding::matchEncodingLabel($in, $excludeNaughty));
+	}
+
+	public static function provideEncodingLabels() {
+		return [ // each row: label, value passed as $excludeNaughty, expected result
+			["",							false, ""],
+			["foo",							false, ""],
+			["utf8",						false, "UTF-8"],
+			["UTF8",						false, "UTF-8"],
+			["Unicode",						false, "UTF-16LE"],
+			[" \t\r\n\x0Cbig5 \t\r\n\x0C",	false, "Big5"],
+			["\v\x00big5\v\x00",			false, ""],
+			["iso88591",					false, "windows-1252"],
+			["Windows-1252",                false, "windows-1252"],
+			["",							true,  ""],
+			["foo",							true,  ""],
+			["utf8",						true,  "UTF-8"],
+			["UTF8",						true,  "UTF-8"],
+			["Unicode",						true,  "UTF-16LE"],
+			[" \t\r\n\x0Cbig5 \t\r\n\x0C",	true,  ""],
+			["\v\x00big5\v\x00",			true,  ""],
+			["iso88591",					true,  ""],
+			["Windows-1252",                true,  "windows-1252"],
+		];
+	}
+
+	/** Checks the text content and the <html>/<head> class attributes of a document loaded by load_dom_document()
+	 * @dataProvider provideDocuments
+	 * @covers Mf2\load_dom_document
+	 */
+	public function testDetectEncoding($doc, $type, $body, $htmlClass, $headClass) {
+		$d = load_dom_document($doc, $type);
+		$this->assertInstanceOf("DOMDocument", $d);
+		$this->assertNotNull($d->documentElement);
+		$this->assertSame($body, $d->documentElement->textContent);
+		$this->assertSame($htmlClass, $d->documentElement->getAttribute("class"));
+		if ($head = $d->getElementsByTagName("head")->item(0)) { // the <head> class is only checked when a <head> element exists
+			$this->assertSame($headClass, $head->getAttribute("class"));
+		}
+	}
+
+	public static function provideDocuments() {
+		return [ // each row: document bytes, Content-Type, expected text, expected <html> class, expected <head> class
+			["\xEF\xBB\xBF\xC3\xA9",																												"",									"\xC3\xA9",			"",		""],
+			["\xC3\xA9",																															"text/html;charset=utf-8",			"\xC3\xA9",			"",		""],
+			["<html class=a><head class=b>\xC3\xA9",																								"text/html;charset=utf-8",			"\xC3\xA9",			"a",	"b"],
+			["<html class=a foo><head class=b bar=>\xC3\xA9",																						"text/html;charset=utf-8",			"\xC3\xA9",			"a",	"b"],
+			["<html class=a><head class=b>\xC3\xA9",																								"text/html;charset=utf-8",			"\xC3\xA9",			"a",	"b"],
+			["<html class='a'><head class='b'>\xC3\xA9",																							"text/html;charset=utf-8",			"\xC3\xA9",			"a",	"b"],
+			["<html class=\"a\"><head class=\"b\">\xC3\xA9",																						"text/html;charset=utf-8",			"\xC3\xA9",			"a",	"b"],
+			["<html class='a'id=c><head class='b'/>\xC3\xA9",																						"text/html;charset=utf-8",			"\xC3\xA9",			"a",	"b"],
+			["<html class=a><head class=b>\xC3\xA9",																								"",									"\xC3\xA9",			"a",	"b"],
+			["<head class=b><meta charset=koi>\xA7",																								"",									"\xE2\x95\x96",		"",		"b"],
+			["<head class=b><meta http-equiv=content-type content=text/html;charset=koi>\xA7",														"",									"\xE2\x95\x96",		"",		"b"],
+			["<head class=b><meta content=text/html;charset=koi http-equiv=content-type>\xA7",														"",									"\xE2\x95\x96",		"",		"b"],
+			["<head class=b><META HTTP-EQUIV=CONTENT-TYPE CONTENT=text/html;charset=koi>\xA7",														"",									"\xE2\x95\x96",		"",		"b"],
+			["\xFE\xFF\xD8\x34\xDD\x1E",																											"",									"\xF0\x9D\x84\x9E",	"",		""],
+			["\xFF\xFE\x34\xD8\x1E\xDD",																											"",									"\xF0\x9D\x84\x9E",	"",		""],
+			["\xD8\x34\xDD\x1E",																													"text/html;charset=utf-16be",		"\xF0\x9D\x84\x9E",	"",		""],
+			["\x34\xD8\x1E\xDD",																													"text/html;charset=utf-16le",		"\xF0\x9D\x84\x9E",	"",		""],
+			["<html class=a><head class=b>\x9F",																									"",									"\xC5\xB8",			"a",	"b"],
+			["\xC3\xA9",																															"text/html;charset=iso-2022-cn",	"\xEF\xBF\xBD",		"",		""],
+			["<html class=a><head class=b>\x9F",																									"",									"\xC5\xB8",			"a",	"b"],
+			["<!DOCTYPE><html class=a><head class=b>\xC3\xA9",																						"",									"\xC3\xA9",			"a",	"b"],
+			["<!DOCTYPE  ><html class=a><head class=b>\xC3\xA9",																					"",									"\xC3\xA9",			"a",	"b"],
+			["<!DOCTYPE html><html class=a><head class=b>\xC3\xA9",																					"",									"\xC3\xA9",			"a",	"b"],
+			["<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\"><html class=a><head class=b>\xC3\xA9",	"",									"\xC3\xA9",			"a",	"b"],
+			[" \n\t<!--><html class=a><head class=b>\xC3\xA9",																						"",									"\xC3\xA9",			"a",	"b"],
+			["<html class=a><!-- This is --- a comment! --><head class=b>\xC3\xA9",																	"",									"\xC3\xA9",			"a",	"b"],
+			["<html class=a><!-- Pen > Sword --><head class=b>\xC3\xA9",																			"",									"\xC3\xA9",			"a",	"b"],
+			["<html class=a><!-- This -> Way --><head class=b>\xC3\xA9",																			"",									"\xC3\xA9",			"a",	"b"],
+			["<?xml-stylesheet type=\"text/css\" href=\"style.css\"?><html class=a><head class=b>\xC3\xA9",											"",									"\xC3\xA9",			"a",	"b"],
+			["\xDD",																																"text/html;charset=windows-1258",	"\xC6\xAF",			"",		""],
+		];
+	}
+}
\ No newline at end of file