-
Notifications
You must be signed in to change notification settings - Fork 0
Home
wakaba edited this page Mar 27, 2021
·
1 revision
<?php
/**
 * Detects the Unicode encoding of a file from its leading byte order mark (BOM).
 *
 * Only the BOM is inspected; a file without one of the BOMs listed in
 * REPRESENTATION_MAP is reported as undetected (false), whatever its content.
 */
class DetectUtfEncoding
{
    // Encoding names returned by detect() and used as values/keys in the maps below.
    public const ENCODING_NAME_UTF_1 = 'UTF-1';
    public const ENCODING_NAME_UTF_7 = 'UTF-7';
    public const ENCODING_NAME_UTF_8 = 'UTF-8';
    public const ENCODING_NAME_UTF_16 = 'UTF-16';
    public const ENCODING_NAME_UTF_16BE = 'UTF-16BE';
    public const ENCODING_NAME_UTF_16LE = 'UTF-16LE';
    public const ENCODING_NAME_UTF_32 = 'UTF-32';
    public const ENCODING_NAME_UTF_32BE = 'UTF-32BE';
    public const ENCODING_NAME_UTF_32LE = 'UTF-32LE';
    public const ENCODING_NAME_UTF_EBCDIC = 'UTF-EBCDIC';
    public const ENCODING_NAME_SCSU = 'SCSU';
    public const ENCODING_NAME_BOCU_1 = 'BOCU-1';
    public const ENCODING_NAME_GB18030 = 'GB18030';

    // BOM byte sequences. These must be byte strings ("\xNN" escapes): concatenating
    // integer literals (0xEF . 0xBB) yields their decimal digits ("239187"), which
    // can never equal bytes read from a file.
    public const BOM_UTF_8 = "\xEF\xBB\xBF";
    public const BOM_UTF_16BE = "\xFE\xFF";
    public const BOM_UTF_16LE = "\xFF\xFE";
    public const BOM_UTF_32BE = "\x00\x00\xFE\xFF";
    public const BOM_UTF_32LE = "\xFF\xFE\x00\x00";
    public const BOM_UTF_7_38 = "\x2B\x2F\x76\x38";
    public const BOM_UTF_7_39 = "\x2B\x2F\x76\x39";
    public const BOM_UTF_7_2B = "\x2B\x2F\x76\x2B";
    public const BOM_UTF_7_2F = "\x2B\x2F\x76\x2F";
    public const BOM_UTF_1 = "\xF7\x64\x4C";
    public const BOM_UTF_EBCDIC = "\xDD\x73\x66\x73";
    public const BOM_SCSU = "\x0E\xFE\xFF";
    public const BOM_BOCU_1 = "\xFB\xEE\x28";
    public const BOM_GB18030 = "\x84\x31\x95\x33";

    // First two bytes of each BOM. detect() no longer dispatches on these; they are
    // kept (with corrected byte values) so subclasses referencing them keep working.
    protected const BOM_2BYTE_0xFE_0xFF = "\xFE\xFF"; // 2 BOM_UTF_16BE
    protected const BOM_2BYTE_0xFF_0xFE = "\xFF\xFE"; // 2 or 4 BOM_UTF_16LE BOM_UTF_32LE
    protected const BOM_2BYTE_0x0E_0xFE = "\x0E\xFE"; // 3 BOM_SCSU
    protected const BOM_2BYTE_0xEF_0xBB = "\xEF\xBB"; // 3 BOM_UTF_8
    protected const BOM_2BYTE_0xF7_0x64 = "\xF7\x64"; // 3 BOM_UTF_1
    protected const BOM_2BYTE_0xFB_0xEE = "\xFB\xEE"; // 3 BOM_BOCU_1
    protected const BOM_2BYTE_0x00_0x00 = "\x00\x00"; // 4 BOM_UTF_32BE
    protected const BOM_2BYTE_0x2B_0x2F = "\x2B\x2F"; // 4 BOM_UTF_7_38 BOM_UTF_7_39 BOM_UTF_7_2B BOM_UTF_7_2F
    protected const BOM_2BYTE_0x84_0x31 = "\x84\x31"; // 4 BOM_GB18030
    protected const BOM_2BYTE_0xDD_0x73 = "\xDD\x73"; // 4 BOM_UTF_EBCDIC

    /** BOM byte sequence => name of the encoding it announces. */
    public const REPRESENTATION_MAP = [
        self::BOM_UTF_8 => self::ENCODING_NAME_UTF_8,
        self::BOM_UTF_16BE => self::ENCODING_NAME_UTF_16BE,
        self::BOM_UTF_16LE => self::ENCODING_NAME_UTF_16LE,
        self::BOM_UTF_32BE => self::ENCODING_NAME_UTF_32BE,
        self::BOM_UTF_32LE => self::ENCODING_NAME_UTF_32LE,
        self::BOM_UTF_7_38 => self::ENCODING_NAME_UTF_7,
        self::BOM_UTF_7_39 => self::ENCODING_NAME_UTF_7,
        self::BOM_UTF_7_2B => self::ENCODING_NAME_UTF_7,
        self::BOM_UTF_7_2F => self::ENCODING_NAME_UTF_7,
        self::BOM_UTF_1 => self::ENCODING_NAME_UTF_1,
        self::BOM_UTF_EBCDIC => self::ENCODING_NAME_UTF_EBCDIC,
        self::BOM_SCSU => self::ENCODING_NAME_SCSU,
        self::BOM_BOCU_1 => self::ENCODING_NAME_BOCU_1,
        self::BOM_GB18030 => self::ENCODING_NAME_GB18030,
    ];

    /** Endianness-specific encoding name => its endianness-neutral family name. */
    public const ENCODING_NAME_MAP = [
        self::ENCODING_NAME_UTF_8 => self::ENCODING_NAME_UTF_8,
        self::ENCODING_NAME_UTF_16BE => self::ENCODING_NAME_UTF_16,
        self::ENCODING_NAME_UTF_16LE => self::ENCODING_NAME_UTF_16,
        self::ENCODING_NAME_UTF_32BE => self::ENCODING_NAME_UTF_32,
        self::ENCODING_NAME_UTF_32LE => self::ENCODING_NAME_UTF_32,
        self::ENCODING_NAME_UTF_7 => self::ENCODING_NAME_UTF_7,
        self::ENCODING_NAME_UTF_1 => self::ENCODING_NAME_UTF_1,
        self::ENCODING_NAME_UTF_EBCDIC => self::ENCODING_NAME_UTF_EBCDIC,
        self::ENCODING_NAME_SCSU => self::ENCODING_NAME_SCSU,
        self::ENCODING_NAME_BOCU_1 => self::ENCODING_NAME_BOCU_1,
        self::ENCODING_NAME_GB18030 => self::ENCODING_NAME_GB18030,
    ];

    /**
     * BOMs in the order detect() tries them: longest first, so that the UTF-32LE
     * BOM (FF FE 00 00) is recognised before its 2-byte prefix, the UTF-16LE BOM.
     */
    private const BOMS_LONGEST_FIRST = [
        // 4 bytes
        self::BOM_UTF_32BE,
        self::BOM_UTF_32LE,
        self::BOM_UTF_7_38,
        self::BOM_UTF_7_39,
        self::BOM_UTF_7_2B,
        self::BOM_UTF_7_2F,
        self::BOM_GB18030,
        self::BOM_UTF_EBCDIC,
        // 3 bytes
        self::BOM_UTF_8,
        self::BOM_SCSU,
        self::BOM_UTF_1,
        self::BOM_BOCU_1,
        // 2 bytes
        self::BOM_UTF_16BE,
        self::BOM_UTF_16LE,
    ];

    /** Length in bytes of the longest BOM this class knows about. */
    private const MAX_BOM_LENGTH = 4;

    /**
     * Detects a file's encoding from the BOM at its start.
     *
     * @param string $file_path Path (or stream URL) of the file to inspect.
     *
     * @return string|false One of the ENCODING_NAME_* constants, or false when the
     *                      file cannot be opened/read or starts with no known BOM.
     */
    public static function detect($file_path)
    {
        $fp = \fopen($file_path, 'rb');
        if ($fp === false) {
            return false;
        }

        try {
            // One read of the longest possible BOM is enough; shorter BOMs are
            // matched as prefixes of these bytes.
            $head = \fread($fp, self::MAX_BOM_LENGTH);
        } finally {
            // Always release the handle, including on the "no BOM" path.
            \fclose($fp);
        }

        if ($head === false || $head === '') {
            return false;
        }

        foreach (self::BOMS_LONGEST_FIRST as $bom) {
            // strncmp is binary-safe; a head shorter than the BOM never compares equal.
            if (\strncmp($head, $bom, \strlen($bom)) === 0) {
                return self::REPRESENTATION_MAP[$bom];
            }
        }

        return false;
    }
}