diff --git a/Mf2/Parser.php b/Mf2/Parser.php
index 8b6485b..b3ba637 100644
--- a/Mf2/Parser.php
+++ b/Mf2/Parser.php
@@ -7,7 +7,6 @@
 use DOMXPath;
 use DOMNode;
 use DOMNodeList;
-use Exception;
 use SplObjectStorage;
 use stdClass;
 
@@ -87,13 +86,139 @@ function fetch($url, $convertClassic = true, &$curlInfo=null) {
 	return parse($html, $url, $convertClassic);
 }
 
-/**
- * Unicode to HTML Entities
- * @param string $input String containing characters to convert into HTML entities
- * @return string
- */
-function unicodeToHtmlEntities($input) {
-	return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input));
+function load_dom_document($input, $contentType = "") {
+	/* 
+	The work of this function concerns detecting encoding and making sure
+	DOMDocument honours the correct document encoding. There are
+	multiple complications to this, chiefly that DOMDocument's
+	priorities differ from the modern HTML standard's. HTML mandates the
+	following from most to least authoritative:
+
+	1. Byte order mark
+	2. Protocol-specified encoding
+	3. Meta element in document
+	4. Heuristics (optional)
+	5. Default encoding (usually windows-1252)
+
+	DOMDocument, on the other hand, takes encoding from the following
+	sources from most to least authoritative:
+
+	1. UTF-16 byte order mark
+	2. Meta element in document
+	3. UTF-8 byte order mark
+	4. Default encoding (ISO 8859-1)
+
+	Thus this function finds the most authoritative encoding in the order
+	required by the specification, and then inserts a meta element where 
+	necessary i.e. when the document doesn't already have the correct encoding
+	in a meta element.
+
+	Finally, in the case of UTF-16 input the only reliable indicator is a
+	byte order mark, so we insert one where needed instead of a meta element.
+	*/
+	$load = function($input) {
+		$d = new DOMDocument();
+		@$d->loadHTML($input, \LIBXML_NOWARNING);
+		return $d;
+	};
+	// perform an initial parsing of the document. We may parse it again
+	//   if the encoding is wrong, or use the result if everything is correct
+	$d = $load($input);
+	$meta = "";
+	$effective = "";
+	// determine the encoding which DOMDocument has detected from the document, starting by looking for a UTF-16 BOM
+	if (substr($input, 0, 2) === Encoding::BOM_UTF16BE) {
+		$effective = "UTF-16BE";
+	} elseif (substr($input, 0, 2) === Encoding::BOM_UTF16LE) {
+		$effective = "UTF-16LE";
+	} else {
+		// find the first valid meta element in the document with 
+		//   encoding information; even if this initial parsing uses the
+		//   wrong encoding, it's more reliable than trying to do this
+		//   with the string directly
+		// we keep note of both what DOMDocument understands (the $effective
+		//   encoding), as well as what the standard says is valid (the $meta
+		//   encoding) as the two can differ
+		foreach ($d->getElementsByTagName("meta") as $e) {
+			$meta = Encoding::matchEncodingLabel($e->getAttribute("charset"));
+			if (strlen($meta)) {
+				$effective = Encoding::matchEncodingLabel($e->getAttribute("charset"), true);
+				break;
+			}
+			if (strtolower(trim($e->getAttribute("http-equiv"))) === "content-type") {
+				$candidate = Encoding::parseContentType($e->getAttribute("content"));
+				$meta = Encoding::matchEncodingLabel($candidate['charset']);
+				if (strlen($meta)) {
+					$effective = Encoding::matchEncodingLabel($candidate['charset'], true);
+					break;
+				}
+			}
+		}
+		// if no effective encoding was found, check for a UTF-8 BOM
+		if (!$effective && substr($input, 0, 3) === Encoding::BOM_UTF8) {
+			$effective = "UTF-8";
+		}
+	}
+	// now find the authoritative encoding according to the standard HTML order
+	$uthoritative = "";
+	// start by looking for BOMs
+	if (substr($input, 0, 3) === Encoding::BOM_UTF8) {
+		$authoritative = "UTF-8";
+	} elseif (substr($input, 0, 2) === Encoding::BOM_UTF16BE) {
+		$authoritative = "UTF-16BE";
+	} elseif (substr($input, 0, 2) === Encoding::BOM_UTF16LE) {
+		$authoritative = "UTF-16LE";
+	} else {
+		// now parse the HTTP Content-Type looking for an encoding
+		$candidate = Encoding::parseContentType($contentType);
+		$candidate = Encoding::matchEncodingLabel($candidate['charset']);
+		if ($candidate) {
+			$authoritative = $candidate;
+		} elseif ($meta) {
+			// if the document has a valid <meta> element, use that
+			$authoritative = $meta;
+		} elseif (preg_match(Encoding::UTF8_PATTERN, $input)) {
+			// if the string is valid UTF-8, we can use this
+			$authoritative = "UTF-8";
+		} else {
+			// otherwise use the default encoding
+			$authoritative = Encoding::DEFAULT_ENCODING;
+		}
+	}
+	// if the authoritative encoding does not match the encoding currently in
+	//   the document, add an appropriate meta tag or (for UTF-16) a BOM
+	if ($authoritative !== $effective) {
+		// some canonical names are not understood by some environments, so we use aliases where needed
+		if (array_key_exists($authoritative, Encoding::ENCODING_ALIAS_MAP)) {
+			$authoritative = Encoding::ENCODING_ALIAS_MAP[$authoritative];
+		}
+		if ($authoritative === "UTF-16BE") {
+			if (substr($input, 0, 2) !== Encoding::BOM_UTF16BE) {
+				// add a BOM and reparse
+				$d = $load(Encoding::BOM_UTF16BE.$input);
+			}
+		} elseif ($authoritative === "UTF-16LE") {
+			if (substr($input, 0, 2) !== Encoding::BOM_UTF16LE) {
+				// add a BOM and reparse
+				$d = $load(Encoding::BOM_UTF16LE.$input);
+			}
+		} elseif ($authoritative === "replacement") {
+			// the replacement encoding is actually a denylist of problematic encodings; the document is reduced to a single replacement character
+			$d = $load(Encoding::BOM_UTF8."\xEF\xBF\xBD");
+		} else {
+			$offset = 0;
+			// look for an HTML <head> element and insert the meta tag within
+			//   it, or at the nearest appropriate point; if the tag is 
+			//   inserted too soon, attributes on <head> or <html> (which
+			//   may hold mf data) can be stripped
+			if (preg_match(Encoding::HTML_HEADER_PATTERN, $input, $match)) {
+				$offset = strlen($match[0]);
+			}
+			$tag = '<meta http-equiv="Content-Type" content="text/html;charset=' . $authoritative . '">';
+			$d = $load(substr($input, 0, $offset) . $tag . substr($input, $offset));
+		}
+	}
+	return $d;
 }
 
 /**
@@ -111,10 +236,9 @@ function collapseWhitespace($str) {
 }
 
 function unicodeTrim($str) {
-	// this is cheating. TODO: find a better way if this causes any problems
-	$str = str_replace(mb_convert_encoding('&nbsp;', 'UTF-8', 'HTML-ENTITIES'), ' ', $str);
-	$str = preg_replace('/^\s+/', '', $str);
-	return preg_replace('/\s+$/', '', $str);
+	// the binary sequence C2A0 is a UTF-8 non-breaking space character
+	$str = preg_replace('/^(?:\s|\x{C2}\x{A0})+/', '', $str);
+	return preg_replace('/(?:\s|\x{C2}\x{A0})+$/', '', $str);
 }
 
 /**
@@ -170,8 +294,8 @@ function nestedMfPropertyNamesFromClass($class) {
 	foreach (explode(' ', $class) as $classname) {
 		foreach ($prefixes as $prefix) {
 			// Check if $classname is a valid property classname for $prefix.
-			if (mb_substr($classname, 0, mb_strlen($prefix)) == $prefix && $classname != $prefix) {
-				$propertyName = mb_substr($classname, mb_strlen($prefix));
+			if (substr($classname, 0, strlen($prefix)) == $prefix && $classname != $prefix) {
+				$propertyName = substr($classname, strlen($prefix));
 				$propertyNames[$propertyName][] = $prefix;
 			}
 		}
@@ -375,8 +499,7 @@ public function __construct($input, $url = null, $jsonMode = false) {
 					$doc = new \Masterminds\HTML5(array('disable_html_ns' => true));
 					$doc = $doc->loadHTML($input);
 			} else {
-				$doc = new DOMDocument();
-				@$doc->loadHTML(unicodeToHtmlEntities($input), \LIBXML_NOWARNING);
+				$doc = load_dom_document($input);
 			}
 		} elseif (is_a($input, 'DOMDocument')) {
 			$doc = clone $input;
@@ -2429,3 +2552,431 @@ function removeDotSegments($path) {
 
 	return $output;
 }
+
+/** Tools for determining the encoding of documents */
+class Encoding {
+	/** @var string The default encoding to use when none is detected in the document */
+	const DEFAULT_ENCODING = "windows-1252";
+	/** @var array The list of known encodings; see https://encoding.spec.whatwg.org/#names-and-labels */
+	const ENCODINGS = [
+		'unicode-1-1-utf-8' => "UTF-8",
+		'unicode11utf8' => "UTF-8",
+		'unicode20utf8' => "UTF-8",
+		'utf-8' => "UTF-8",
+		'utf8' => "UTF-8",
+		'x-unicode20utf8' => "UTF-8",
+		'866' => "IBM866",
+		'cp866' => "IBM866",
+		'csibm866' => "IBM866",
+		'ibm866' => "IBM866",
+		'csisolatin2' => "ISO-8859-2",
+		'iso-8859-2' => "ISO-8859-2",
+		'iso-ir-101' => "ISO-8859-2",
+		'iso8859-2' => "ISO-8859-2",
+		'iso88592' => "ISO-8859-2",
+		'iso_8859-2' => "ISO-8859-2",
+		'iso_8859-2:1987' => "ISO-8859-2",
+		'l2' => "ISO-8859-2",
+		'latin2' => "ISO-8859-2",
+		'csisolatin3' => "ISO-8859-3",
+		'iso-8859-3' => "ISO-8859-3",
+		'iso-ir-109' => "ISO-8859-3",
+		'iso8859-3' => "ISO-8859-3",
+		'iso88593' => "ISO-8859-3",
+		'iso_8859-3' => "ISO-8859-3",
+		'iso_8859-3:1988' => "ISO-8859-3",
+		'l3' => "ISO-8859-3",
+		'latin3' => "ISO-8859-3",
+		'csisolatin4' => "ISO-8859-4",
+		'iso-8859-4' => "ISO-8859-4",
+		'iso-ir-110' => "ISO-8859-4",
+		'iso8859-4' => "ISO-8859-4",
+		'iso88594' => "ISO-8859-4",
+		'iso_8859-4' => "ISO-8859-4",
+		'iso_8859-4:1988' => "ISO-8859-4",
+		'l4' => "ISO-8859-4",
+		'latin4' => "ISO-8859-4",
+		'csisolatincyrillic' => "ISO-8859-5",
+		'cyrillic' => "ISO-8859-5",
+		'iso-8859-5' => "ISO-8859-5",
+		'iso-ir-144' => "ISO-8859-5",
+		'iso8859-5' => "ISO-8859-5",
+		'iso88595' => "ISO-8859-5",
+		'iso_8859-5' => "ISO-8859-5",
+		'iso_8859-5:1988' => "ISO-8859-5",
+		'arabic' => "ISO-8859-6",
+		'asmo-708' => "ISO-8859-6",
+		'csiso88596e' => "ISO-8859-6",
+		'csiso88596i' => "ISO-8859-6",
+		'csisolatinarabic' => "ISO-8859-6",
+		'ecma-114' => "ISO-8859-6",
+		'iso-8859-6' => "ISO-8859-6",
+		'iso-8859-6-e' => "ISO-8859-6",
+		'iso-8859-6-i' => "ISO-8859-6",
+		'iso-ir-127' => "ISO-8859-6",
+		'iso8859-6' => "ISO-8859-6",
+		'iso88596' => "ISO-8859-6",
+		'iso_8859-6' => "ISO-8859-6",
+		'iso_8859-6:1987' => "ISO-8859-6",
+		'csisolatingreek' => "ISO-8859-7",
+		'ecma-118' => "ISO-8859-7",
+		'elot_928' => "ISO-8859-7",
+		'greek' => "ISO-8859-7",
+		'greek8' => "ISO-8859-7",
+		'iso-8859-7' => "ISO-8859-7",
+		'iso-ir-126' => "ISO-8859-7",
+		'iso8859-7' => "ISO-8859-7",
+		'iso88597' => "ISO-8859-7",
+		'iso_8859-7' => "ISO-8859-7",
+		'iso_8859-7:1987' => "ISO-8859-7",
+		'sun_eu_greek' => "ISO-8859-7",
+		'csiso88598e' => "ISO-8859-8",
+		'csisolatinhebrew' => "ISO-8859-8",
+		'hebrew' => "ISO-8859-8",
+		'iso-8859-8' => "ISO-8859-8",
+		'iso-8859-8-e' => "ISO-8859-8",
+		'iso-ir-138' => "ISO-8859-8",
+		'iso8859-8' => "ISO-8859-8",
+		'iso88598' => "ISO-8859-8",
+		'iso_8859-8' => "ISO-8859-8",
+		'iso_8859-8:1988' => "ISO-8859-8",
+		'visual' => "ISO-8859-8",
+		'csiso88598i' => "ISO-8859-8-I",
+		'iso-8859-8-i' => "ISO-8859-8-I",
+		'logical' => "ISO-8859-8-I",
+		'csisolatin6' => "ISO-8859-10",
+		'iso-8859-10' => "ISO-8859-10",
+		'iso-ir-157' => "ISO-8859-10",
+		'iso8859-10' => "ISO-8859-10",
+		'iso885910' => "ISO-8859-10",
+		'l6' => "ISO-8859-10",
+		'latin6' => "ISO-8859-10",
+		'iso-8859-13' => "ISO-8859-13",
+		'iso8859-13' => "ISO-8859-13",
+		'iso885913' => "ISO-8859-13",
+		'iso-8859-14' => "ISO-8859-14",
+		'iso8859-14' => "ISO-8859-14",
+		'iso885914' => "ISO-8859-14",
+		'csisolatin9' => "ISO-8859-15",
+		'iso-8859-15' => "ISO-8859-15",
+		'iso8859-15' => "ISO-8859-15",
+		'iso885915' => "ISO-8859-15",
+		'iso_8859-15' => "ISO-8859-15",
+		'l9' => "ISO-8859-15",
+		'iso-8859-16' => "ISO-8859-16",
+		'cskoi8r' => "KOI8-R",
+		'koi' => "KOI8-R",
+		'koi8' => "KOI8-R",
+		'koi8-r' => "KOI8-R",
+		'koi8_r' => "KOI8-R",
+		'koi8-ru' => "KOI8-U",
+		'koi8-u' => "KOI8-U",
+		'csmacintosh' => "macintosh",
+		'mac' => "macintosh",
+		'macintosh' => "macintosh",
+		'x-mac-roman' => "macintosh",
+		'dos-874' => "windows-874",
+		'iso-8859-11' => "windows-874",
+		'iso8859-11' => "windows-874",
+		'iso885911' => "windows-874",
+		'tis-620' => "windows-874",
+		'windows-874' => "windows-874",
+		'cp1250' => "windows-1250",
+		'windows-1250' => "windows-1250",
+		'x-cp1250' => "windows-1250",
+		'cp1251' => "windows-1251",
+		'windows-1251' => "windows-1251",
+		'x-cp1251' => "windows-1251",
+		'ansi_x3.4-1968' => "windows-1252",
+		'ascii' => "windows-1252",
+		'cp1252' => "windows-1252",
+		'cp819' => "windows-1252",
+		'csisolatin1' => "windows-1252",
+		'ibm819' => "windows-1252",
+		'iso-8859-1' => "windows-1252",
+		'iso-ir-100' => "windows-1252",
+		'iso8859-1' => "windows-1252",
+		'iso88591' => "windows-1252",
+		'iso_8859-1' => "windows-1252",
+		'iso_8859-1:1987' => "windows-1252",
+		'l1' => "windows-1252",
+		'latin1' => "windows-1252",
+		'us-ascii' => "windows-1252",
+		'windows-1252' => "windows-1252",
+		'x-cp1252' => "windows-1252",
+		'cp1253' => "windows-1253",
+		'windows-1253' => "windows-1253",
+		'x-cp1253' => "windows-1253",
+		'cp1254' => "windows-1254",
+		'csisolatin5' => "windows-1254",
+		'iso-8859-9' => "windows-1254",
+		'iso-ir-148' => "windows-1254",
+		'iso8859-9' => "windows-1254",
+		'iso88599' => "windows-1254",
+		'iso_8859-9' => "windows-1254",
+		'iso_8859-9:1989' => "windows-1254",
+		'l5' => "windows-1254",
+		'latin5' => "windows-1254",
+		'windows-1254' => "windows-1254",
+		'x-cp1254' => "windows-1254",
+		'cp1255' => "windows-1255",
+		'windows-1255' => "windows-1255",
+		'x-cp1255' => "windows-1255",
+		'cp1256' => "windows-1256",
+		'windows-1256' => "windows-1256",
+		'x-cp1256' => "windows-1256",
+		'cp1257' => "windows-1257",
+		'windows-1257' => "windows-1257",
+		'x-cp1257' => "windows-1257",
+		'cp1258' => "windows-1258",
+		'windows-1258' => "windows-1258",
+		'x-cp1258' => "windows-1258",
+		'x-mac-cyrillic' => "x-mac-cyrillic",
+		'x-mac-ukrainian' => "x-mac-cyrillic",
+		'chinese' => "GBK",
+		'csgb2312' => "GBK",
+		'csiso58gb231280' => "GBK",
+		'gb2312' => "GBK",
+		'gb_2312' => "GBK",
+		'gb_2312-80' => "GBK",
+		'gbk' => "GBK",
+		'iso-ir-58' => "GBK",
+		'x-gbk' => "GBK",
+		'gb18030' => "gb18030",
+		'big5' => "Big5",
+		'big5-hkscs' => "Big5",
+		'cn-big5' => "Big5",
+		'csbig5' => "Big5",
+		'x-x-big5' => "Big5",
+		'cseucpkdfmtjapanese' => "EUC-JP",
+		'euc-jp' => "EUC-JP",
+		'x-euc-jp' => "EUC-JP",
+		'csiso2022jp' => "ISO-2022-JP",
+		'iso-2022-jp' => "ISO-2022-JP",
+		'csshiftjis' => "Shift_JIS",
+		'ms932' => "Shift_JIS",
+		'ms_kanji' => "Shift_JIS",
+		'shift-jis' => "Shift_JIS",
+		'shift_jis' => "Shift_JIS",
+		'sjis' => "Shift_JIS",
+		'windows-31j' => "Shift_JIS",
+		'x-sjis' => "Shift_JIS",
+		'cseuckr' => "EUC-KR",
+		'csksc56011987' => "EUC-KR",
+		'euc-kr' => "EUC-KR",
+		'iso-ir-149' => "EUC-KR",
+		'korean' => "EUC-KR",
+		'ks_c_5601-1987' => "EUC-KR",
+		'ks_c_5601-1989' => "EUC-KR",
+		'ksc5601' => "EUC-KR",
+		'ksc_5601' => "EUC-KR",
+		'windows-949' => "EUC-KR",
+		'csiso2022kr' => "replacement",
+		'hz-gb-2312' => "replacement",
+		'iso-2022-cn' => "replacement",
+		'iso-2022-cn-ext' => "replacement",
+		'iso-2022-kr' => "replacement",
+		'replacement' => "replacement",
+		'unicodefffe' => "UTF-16BE",
+		'utf-16be' => "UTF-16BE",
+		'csunicode' => "UTF-16LE",
+		'iso-10646-ucs-2' => "UTF-16LE",
+		'ucs-2' => "UTF-16LE",
+		'unicode' => "UTF-16LE",
+		'unicodefeff' => "UTF-16LE",
+		'utf-16' => "UTF-16LE",
+		'utf-16le' => "UTF-16LE",
+		'x-user-defined' => "x-user-defined",
+	];
+	/** @var string The PCRE pattern for a Content-Type HTTP header-field, chopping it up into three main sections */
+	const TYPE_PATTERN = <<<'PATTERN'
+	/^
+		[\t\r\n ]*                              # optional leading whitespace
+		([^\/]+)                                # type  
+		\/                                      # type-subtype delimiter
+		([^;]+)                                 # subtype (possibly with trailing whitespace)
+		(;.*)?                                  # optional parameters, to be parsed separately
+		[\t\r\n ]*                              # optional trailing whitespace
+	$/sx
+PATTERN;
+	/** @var string  The PCRE pattern for an HTTP Content-Type header-field's parameters, splitting them into a series of names and values*/
+	const PARAM_PATTERN = <<<'PATTERN'
+	/
+		[;\t\r\n ]*                             # parameter delimiter and leading whitespace, all optional
+		([^=;]*)                                # parameter name; may be empty
+		(?:=                                    # parameter name-value delimiter
+			(
+				"(?:\\"|[^"])*(?:"|$)[^;]*      # quoted parameter value and optional garbage
+				|[^;]*                          # unquoted parameter value (possibly with trailing whitespace)
+			)
+		)?
+		;?                                      # optional trailing parameter delimiter
+		[\t\r\n ]*                              # optional trailing whitespace
+	/sx
+PATTERN;
+	/** @var string The PCRE pattern for an HTTP token, used to validate the nameof a parameter */
+	const TOKEN_PATTERN = '/^[A-Za-z0-9!#$%&\'*+\-\.\^_`|~]+$/s';
+	/** @var string The PCRE pattern for a "bare" (unquoted) parameter value, used for validation */
+	const BARE_VALUE_PATTERN = '/^[\t\x{20}-\x{7E}\x{80}-\x{FF}]+$/s';
+	/** @var string The PCRE pattern for a quoted parameter value, used for validation */
+	const QUOTED_VALUE_PATTERN = '/^"((?:\\\"|[\t !\x{23}-\x{7E}\x{80}-\x{FF}])*)(?:"|$)/s';
+	/** @var string The PCRE pattern for an escaped character in a quoted parameter value, used to unescape via replacement */
+	const ESCAPE_PATTERN = '/\\\(.)/s';
+	/** @var string The PCRE pattern for all valid UTF-8 characters */
+	const UTF8_PATTERN = <<<'PATTERN'
+	/^(?:
+		# Single-byte
+		[\x{00}-\x{7F}]+
+		# Two-byte
+		|[\x{C2}-\x{DF}][\x{80}-\x{BF}]
+		# Three-byte excluding surrogates
+		|\x{E0}[\x{A0}-\x{BF}][\x{80}-\x{BF}]
+		|\x{ED}[\x{80}-\x{9F}][\x{80}-\x{BF}]
+		|[\x{E1}-\x{EC}\x{EE}\x{EF}][\x{80}-\x{BF}]{2}
+		# Four-byte
+		|\x{F0}[\x{90}-\x{BF}][\x{80}-\x{BF}]{2}
+		|\x{F4}[\x{80}-\x{8F}][\x{80}-\x{BF}]{2}
+		|[\x{F1}-\x{F3}][\x{80}-\x{BF}]{3}
+	)*$/sx
+PATTERN;
+	/** @var array A list of standard encoding labels which DOMDocument either does not know or does not map to the correct encoding; this is a worst-case list taken from PHP 5.6 on Windows with some exclusions for encodings which are completely unsupported */
+	const ENCODING_NAUGHTY_LIST = [
+		"unicode-1-1-utf-8", "unicode11utf8", "unicode20utf8", "x-unicode20utf8",
+		"iso88592", "iso88593", "iso88594", "iso88595", "csiso88596e",
+		"csiso88596i", "iso-8859-6-e", "iso-8859-6-i", "iso88596", "iso88597",
+		"sun_eu_greek", "csiso88598e", "iso-8859-8-e", "iso88598", "visual",
+		"csiso88598i", "iso-8859-8-i", "logical", "iso885910", "iso885913",
+		"iso885914", "csisolatin9", "iso885915", "l9", "koi", "koi8", "koi8_r",
+		"x-mac-roman", "dos-874", "iso-8859-11", "iso8859-11", "iso885911",
+		"tis-620", "x-cp1250", "x-cp1251", "ansi_x3.4-1968", "ascii", "cp819",
+		"csisolatin1", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1",
+		"iso88591", "iso_8859-1", "iso_8859-1:1987", "l1", "latin1",
+		"us-ascii", "x-cp1252", "x-cp1253", "iso88599", "x-cp1254",
+		"x-cp1255", "x-cp1256", "x-cp1257", "cp1258", "windows-1258",
+		"x-mac-ukrainian", "chinese", "csgb2312", "csiso58gb231280", "gb2312",
+		"gb_2312", "gb_2312-80", "gbk", "iso-ir-58", "big5", "cn-big5",
+		"csbig5", "x-x-big5", "x-euc-jp", "ms932", "windows-31j", "x-sjis",
+		"cseuckr", "euc-kr", "x-user-defined", "replacement",
+	];
+	/** @var array A List of canonical encoding names DOMDocument does not understand, with liases to labels it does understand */
+	const ENCODING_ALIAS_MAP = [
+		'windows-1258' => "x-cp1258",
+		'GBK' => "x-gbk",
+		'Big5' => "big5-hkscs",
+		'EUC-KR' => "korean",
+		'x-user-defined' => "windows-1256", // this is technically not correct, but x-user-defined is not likely to be used in HTML documents; it is used to represent binary data in JavaScript; windows-1256 is used as a substitute as every byte is assigned to a character, so text can be converted back into bytes with no loss
+	];
+	/** @var string A UTF-8 byte order mark */
+	const BOM_UTF8 = "\xEF\xBB\xBF";
+	/** @var string A UTF-16 (big-endian) byte order mark */
+	const BOM_UTF16BE = "\xFE\xFF";
+	/** @var string A UTF-16 (little-endian) byte order mark */
+	const BOM_UTF16LE = "\xFF\xFE";
+	/** @var string A PCRE pattern which finds the insertion point inside the HTML <head> element of a document; the pattern takes some shortcuts, but nothing which is likely to affect documents which actually have microformat metadata */
+	const HTML_HEADER_PATTERN = <<<PATTERN
+	/^(?:\x{EF}\x{BB}\x{BF})?							# Optional UTF-8 BOM at start of string
+	(?:													# Followed by...
+		\s+												# Whitespace or
+		|<!DOCTYPE[^>]*>								# DOCTYPE or
+		|<!--(?:-?>|(?:[^-]|-(?!->))*-->)				# Comment or
+		|<\?[^>]*>										# Processing instruction or XML declaration
+	)*													# ... zero or more times
+	(?:<html											# Followed by a possible <html> start tag
+		(?:\s+											# ... with possible attributes
+			(?:\s*[^\s=>]*
+				(?:=
+					(?:
+						"[^"]*"
+						|'[^']*'
+						|[^ >]*
+					)?
+				)?
+			)*
+		)?
+		\/?\s*>
+	)?
+	(?:													# Followed by...
+		\s+												# Whitespace or
+		|<!DOCTYPE[^>]*>								# DOCTYPE or
+		|<!--(?:-?>|(?:[^-]|-(?!->))*-->)				# Comment or
+		|<\?[^>]*>										# Processing instruction or XML declaration
+	)*													# ... zero or more times
+	(?:<head											# Followed by a possible <head> start tag
+		(?:\s+											# ... with possible attributes
+			(?:\s*[^\s=>]*
+				(?:=
+					(?:
+						"[^"]*"
+						|'[^']*'
+						|[^ >]*
+					)?
+				)?
+			)*
+		)?
+		\/?\s*>
+	)?
+	/six
+PATTERN;
+
+	/** Matches an encoding label to a known encoding name in the WHATWG encoding list
+	 * 
+	 * If the label does not match any known encoding, and empty string is returned.
+	 * 
+	 * @param string $label The label to match e.g. "utf8", " FOO_bar "
+	 * @param bool $excludeNaughty Whether or not to exclude labels DOMDocument does not understand
+	 * @return string The canonical name of the encoding if matched (e.g. "UTF-8"), or an empty string in case of failure 
+	 */
+	public static function matchEncodingLabel($label, $excludeNaughty = false) {
+		$label = strtolower(trim($label, " \t\r\n\x0C")); // see https://infra.spec.whatwg.org/#ascii-whitespace
+		if ($excludeNaughty && in_array($label, self::ENCODING_NAUGHTY_LIST)) {
+			return "";
+		}
+		if (array_key_exists($label, self::ENCODINGS)) {
+			return self::ENCODINGS[$label];
+		}
+		return "";
+	}
+
+	/** Parses a Content-Type header-field to extract the type and charset, if any
+	 * 
+	 * @param string $type
+	 * @return array An array containing keys for "type" and "charset"
+	 */
+	public static function parseContentType($type) {
+		$out = [
+			'type' => "",
+			'charset' => "",
+		];
+		if (preg_match(self::TYPE_PATTERN, $type, $match)) {
+			list($mimeType, $type, $subtype, $params) = array_pad($match, 4, "");
+			$type = preg_match(self::TOKEN_PATTERN, $type) ? $type : "";
+			$subtype = rtrim($subtype, "\t\r\n ");
+			$subtype = preg_match(self::TOKEN_PATTERN, $subtype) ? $subtype : ""; 
+			if (strlen($type) && strlen($subtype)) {
+				$out['type'] = strtolower($type) . "/" . strtolower($subtype);
+				// parse parameters looking for a charset one
+				if (preg_match_all(self::PARAM_PATTERN, $params, $matches, \PREG_SET_ORDER)) {
+					foreach ($matches as $match) {
+						list($param, $name, $value) = array_pad($match, 3, "");
+						$name = strtolower(preg_match(self::TOKEN_PATTERN, $name) ? $name : "");
+						if ($name === "charset" && strlen($value)) {
+							if ($value[0] === '"') {
+								if (preg_match(self::QUOTED_VALUE_PATTERN, $value, $match)) {
+									$out['charset'] = preg_replace(self::ESCAPE_PATTERN, '$1', $match[1]);
+									return $out;
+								}
+							} else {
+								$value = rtrim($value, "\t\r\n ");
+								if (preg_match(self::BARE_VALUE_PATTERN, $value)) {
+									$out['charset'] = $value;
+									return $out;
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+		return $out;
+	}
+}
diff --git a/tests/Mf2/EncodingTest.php b/tests/Mf2/EncodingTest.php
new file mode 100644
index 0000000..5e13ddf
--- /dev/null
+++ b/tests/Mf2/EncodingTest.php
@@ -0,0 +1,123 @@
+<?php
+
+namespace Mf2\Parser\Test;
+
+use Mf2\Encoding;
+use Yoast\PHPUnitPolyfills\TestCases\TestCase;
+use function Mf2\load_dom_document;
+
+class EncodingTest extends TestCase {
+	/** 
+	 * @dataProvider provideContentTypes
+	 * @covers Mf2\Encoding::parseContentType
+	 */
+	public function testParseContentType($in, $type, $charset) {
+		$act = Encoding::parseContentType($in);
+		$this->assertIsArray($act);
+		$this->assertArrayHasKey("type", $act);
+		$this->assertArrayHasKey("charset", $act);
+		$this->assertSame($type, $act['type'], "Content-Type did not match");
+		$this->assertSame($charset, $act['charset'], "Charset did not match");
+	}
+
+	public function provideContentTypes() {
+		return [
+			["a/b",								"a/b",			""],
+			["A/B",								"a/b",			""],
+			[" \t\r\na/b \t\r\n",				"a/b",			""],
+			["a/b ; \t\r\n;",					"a/b",			""],
+			["a/b;a=b ; \t\r\n;",				"a/b",			""],
+			["a/b;a=b;charset=foo"	,			"a/b",			"foo"],
+			["a/b;a=b;charset=foo;charset=bar",	"a/b",			"foo"],
+			["a/ b;charset=foo",				"",				""],
+			["a/b;charset=FOO",					"a/b",			"FOO"],
+			['a/b;charset="foo"',				"a/b",			"foo"],
+			['a/b;charset="foo bar"',			"a/b",			"foo bar"],
+			['a/b;charset="foo\\"bar"',			"a/b",			'foo"bar'],
+			['a/b;a="foo"bar;charset=baz',		"a/b",			'baz'],
+			["text/html;charset=UTF-8",         "text/html",	"UTF-8"],
+			["",								"",				""],
+		];
+	}
+
+	/**
+	 * @dataProvider provideEncodingLabels
+	 * @covers Mf2\Encoding::matchEncodingLabel
+	 */
+	public function testMatchEncodingLabels($in, $excludeNaugty, $exp) {
+		$this->assertSame($exp, Encoding::matchEncodingLabel($in, $excludeNaugty));
+	}
+
+	public function provideEncodingLabels() {
+		return [
+			["",							false, ""],
+			["foo",							false, ""],
+			["utf8",						false, "UTF-8"],
+			["UTF8",						false, "UTF-8"],
+			["Unicode",						false, "UTF-16LE"],
+			[" \t\r\n\x0Cbig5 \t\r\n\x0C",	false, "Big5"],
+			["\v\x00big5\v\x00",			false, ""],
+			["iso88591",					false, "windows-1252"],
+			["Windows-1252",                false, "windows-1252"],
+			["",							true,  ""],
+			["foo",							true,  ""],
+			["utf8",						true,  "UTF-8"],
+			["UTF8",						true,  "UTF-8"],
+			["Unicode",						true,  "UTF-16LE"],
+			[" \t\r\n\x0Cbig5 \t\r\n\x0C",	true,  ""],
+			["\v\x00big5\v\x00",			true,  ""],
+			["iso88591",					true,  ""],
+			["Windows-1252",                true,  "windows-1252"],
+		];
+	}
+
+	/**
+	 * @dataProvider provideDocuments
+	 * @covers Mf2\load_dom_document
+	 */
+	public function testDetectEncoding($doc, $type, $body, $htmlClass, $headClass) {
+		$d = load_dom_document($doc, $type);
+		$this->assertInstanceOf("DOMDocument", $d);
+		$this->assertNotNull($d->documentElement);
+		$this->assertSame($body, $d->documentElement->textContent);
+		$this->assertSame($htmlClass, $d->documentElement->getAttribute("class"));
+		if ($head = $d->getElementsByTagName("head")->item(0)) {
+			$this->assertSame($headClass, $head->getAttribute("class"));
+		}
+	}
+
+	public function provideDocuments() {
+		return [
+			["\xEF\xBB\xBF\xC3\xA9",																												"",									"\xC3\xA9",			"",		""],
+			["\xC3\xA9",																															"text/html;charset=utf-8",			"\xC3\xA9",			"",		""],
+			["<html class=a><head class=b>\xC3\xA9",																								"text/html;charset=utf-8",			"\xC3\xA9",			"a",	"b"],
+			["<html class=a foo><head class=b bar=>\xC3\xA9",																						"text/html;charset=utf-8",			"\xC3\xA9",			"a",	"b"],
+			["<html class=a><head class=b>\xC3\xA9",																								"text/html;charset=utf-8",			"\xC3\xA9",			"a",	"b"],
+			["<html class='a'><head class='b'>\xC3\xA9",																							"text/html;charset=utf-8",			"\xC3\xA9",			"a",	"b"],
+			["<html class=\"a\"><head class=\"b\">\xC3\xA9",																						"text/html;charset=utf-8",			"\xC3\xA9",			"a",	"b"],
+			["<html class='a'id=c><head class='b'/>\xC3\xA9",																						"text/html;charset=utf-8",			"\xC3\xA9",			"a",	"b"],
+			["<html class=a><head class=b>\xC3\xA9",																								"",									"\xC3\xA9",			"a",	"b"],
+			["<head class=b><meta charset=koi>\xA7",																								"",									"\xE2\x95\x96",		"",		"b"],
+			["<head class=b><meta http-equiv=content-type content=text/html;charset=koi>\xA7",														"",									"\xE2\x95\x96",		"",		"b"],
+			["<head class=b><meta content=text/html;charset=koi http-equiv=content-type>\xA7",														"",									"\xE2\x95\x96",		"",		"b"],
+			["<head class=b><META HTTP-EQUIV=CONTENT-TYPE CONTENT=text/html;charset=koi>\xA7",														"",									"\xE2\x95\x96",		"",		"b"],
+			["\xFE\xFF\xD8\x34\xDD\x1E",																											"",									"\xF0\x9D\x84\x9E",	"",		""],
+			["\xFF\xFE\x34\xD8\x1E\xDD",																											"",									"\xF0\x9D\x84\x9E",	"",		""],
+			["\xD8\x34\xDD\x1E",																													"text/html;charset=utf-16be",		"\xF0\x9D\x84\x9E",	"",		""],
+			["\x34\xD8\x1E\xDD",																													"text/html;charset=utf-16le",		"\xF0\x9D\x84\x9E",	"",		""],
+			["<html class=a><head class=b>\x9F",																									"",									"\xC5\xB8",			"a",	"b"],
+			["\xC3\xA9",																															"text/html;charset=iso-2022-cn",	"\xEF\xBF\xBD",		"",		""],
+			["<html class=a><head class=b>\x9F",																									"",									"\xC5\xB8",			"a",	"b"],
+			["<!DOCTYPE><html class=a><head class=b>\xC3\xA9",																						"",									"\xC3\xA9",			"a",	"b"],
+			["<!DOCTYPE  ><html class=a><head class=b>\xC3\xA9",																					"",									"\xC3\xA9",			"a",	"b"],
+			["<!DOCTYPE html><html class=a><head class=b>\xC3\xA9",																					"",									"\xC3\xA9",			"a",	"b"],
+			["<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\"><html class=a><head class=b>\xC3\xA9",	"",									"\xC3\xA9",			"a",	"b"],
+			[" \n\t<!--><html class=a><head class=b>\xC3\xA9",																						"",									"\xC3\xA9",			"a",	"b"],
+			["<html class=a><!-- This is --- a comment! --><head class=b>\xC3\xA9",																	"",									"\xC3\xA9",			"a",	"b"],
+			["<html class=a><!-- Pen > Sword --><head class=b>\xC3\xA9",																			"",									"\xC3\xA9",			"a",	"b"],
+			["<html class=a><!-- This -> Way --><head class=b>\xC3\xA9",																			"",									"\xC3\xA9",			"a",	"b"],
+			["<?xml-stylesheet type=\"text/css\" href=\"style.css\"?><html class=a><head class=b>\xC3\xA9",											"",									"\xC3\xA9",			"a",	"b"],
+			["\xDD",																																"text/html;charset=windows-1258",	"\xC6\xAF",			"",		""],
+		];
+	}
+}
\ No newline at end of file
diff --git a/tests/Mf2/ParserTest.php b/tests/Mf2/ParserTest.php
index a364557..946a693 100644
--- a/tests/Mf2/ParserTest.php
+++ b/tests/Mf2/ParserTest.php
@@ -823,10 +823,8 @@ public function testNotMutatingPassedInDOM() {
 
 		// Use same parsing as Parser::__construct(), twice to have a comparison object.
 		libxml_use_internal_errors(true);
-		$refDoc = new \DOMDocument();
-		@$refDoc->loadHTML(Mf2\unicodeToHtmlEntities($input), \LIBXML_NOWARNING);
-		$inputDoc = new \DOMDocument();
-		@$inputDoc->loadHTML(Mf2\unicodeToHtmlEntities($input), \LIBXML_NOWARNING);
+		$refDoc = Mf2\load_dom_document($input);
+		$inputDoc = Mf2\load_dom_document($input);
 
 		// For completion sake, test PHP itself.
 		$this->assertEquals($refDoc, $inputDoc, 'PHP could not create identical DOMDocument instances.');