Skip to content
wakaba edited this page Mar 27, 2021 · 1 revision
<?php

/**
 * Detects the Unicode encoding of a file from its leading byte order mark (BOM).
 *
 * Only the first few bytes of the file are inspected; a file without a
 * recognised BOM is reported as "not detected" (false), not guessed at.
 */
class DetectUtfEncoding
{
    // Encoding names returned by detect().
    public const ENCODING_NAME_UTF_1        = 'UTF-1';
    public const ENCODING_NAME_UTF_7        = 'UTF-7';
    public const ENCODING_NAME_UTF_8        = 'UTF-8';
    public const ENCODING_NAME_UTF_16       = 'UTF-16';
    public const ENCODING_NAME_UTF_16BE     = 'UTF-16BE';
    public const ENCODING_NAME_UTF_16LE     = 'UTF-16LE';
    public const ENCODING_NAME_UTF_32       = 'UTF-32';
    public const ENCODING_NAME_UTF_32BE     = 'UTF-32BE';
    public const ENCODING_NAME_UTF_32LE     = 'UTF-32LE';
    public const ENCODING_NAME_UTF_EBCDIC   = 'UTF-EBCDIC';
    public const ENCODING_NAME_SCSU         = 'SCSU';
    public const ENCODING_NAME_BOCU_1       = 'BOCU-1';
    public const ENCODING_NAME_GB18030      = 'GB18030';

    // Raw BOM byte sequences.
    // These must be written as "\x.." escapes: concatenating integer literals
    // (0xEF . 0xBB) yields their decimal digits ("239187"), not the bytes.
    public const BOM_UTF_8      = "\xEF\xBB\xBF";
    public const BOM_UTF_16BE   = "\xFE\xFF";
    public const BOM_UTF_16LE   = "\xFF\xFE";
    public const BOM_UTF_32BE   = "\x00\x00\xFE\xFF";
    public const BOM_UTF_32LE   = "\xFF\xFE\x00\x00";
    public const BOM_UTF_7_38   = "\x2B\x2F\x76\x38";
    public const BOM_UTF_7_39   = "\x2B\x2F\x76\x39";
    public const BOM_UTF_7_2B   = "\x2B\x2F\x76\x2B";
    public const BOM_UTF_7_2F   = "\x2B\x2F\x76\x2F";
    public const BOM_UTF_1      = "\xF7\x64\x4C";
    public const BOM_UTF_EBCDIC = "\xDD\x73\x66\x73";
    public const BOM_SCSU       = "\x0E\xFE\xFF";
    public const BOM_BOCU_1     = "\xFB\xEE\x28";
    public const BOM_GB18030    = "\x84\x31\x95\x33";

    // Two-byte BOM prefixes. detect() no longer dispatches on these, but they
    // are kept (with corrected byte values) for subclasses that reference them.
    // The trailing comment gives the full BOM length and the BOM(s) sharing the prefix.
    protected const BOM_2BYTE_0xFE_0xFF = "\xFE\xFF";   // 2        BOM_UTF_16BE
    protected const BOM_2BYTE_0xFF_0xFE = "\xFF\xFE";   // 2 or 4   BOM_UTF_16LE    BOM_UTF_32LE

    protected const BOM_2BYTE_0x0E_0xFE = "\x0E\xFE";   // 3        BOM_SCSU
    protected const BOM_2BYTE_0xEF_0xBB = "\xEF\xBB";   // 3        BOM_UTF_8
    protected const BOM_2BYTE_0xF7_0x64 = "\xF7\x64";   // 3        BOM_UTF_1
    protected const BOM_2BYTE_0xFB_0xEE = "\xFB\xEE";   // 3        BOM_BOCU_1

    protected const BOM_2BYTE_0x00_0x00 = "\x00\x00";   // 4        BOM_UTF_32BE
    protected const BOM_2BYTE_0x2B_0x2F = "\x2B\x2F";   // 4        BOM_UTF_7_38    BOM_UTF_7_39    BOM_UTF_7_2B    BOM_UTF_7_2F
    protected const BOM_2BYTE_0x84_0x31 = "\x84\x31";   // 4        BOM_GB18030
    protected const BOM_2BYTE_0xDD_0x73 = "\xDD\x73";   // 4        BOM_UTF_EBCDIC

    /** Length in bytes of the longest BOM listed above. */
    private const MAX_BOM_LENGTH = 4;

    /**
     * Order in which BOMs are tried against the start of the file.
     *
     * UTF-32LE (FF FE 00 00) must be tried before UTF-16LE (FF FE) because the
     * shorter BOM is a prefix of the longer one. No other pair overlaps.
     */
    private const DETECTION_ORDER = [
        self::BOM_UTF_32LE,
        self::BOM_UTF_32BE,
        self::BOM_UTF_7_38,
        self::BOM_UTF_7_39,
        self::BOM_UTF_7_2B,
        self::BOM_UTF_7_2F,
        self::BOM_GB18030,
        self::BOM_UTF_EBCDIC,
        self::BOM_UTF_8,
        self::BOM_SCSU,
        self::BOM_UTF_1,
        self::BOM_BOCU_1,
        self::BOM_UTF_16BE,
        self::BOM_UTF_16LE,
    ];

    /** Maps each BOM byte sequence to the name of the encoding it announces. */
    public const REPRESENTATION_MAP = [
        self::BOM_UTF_8         => self::ENCODING_NAME_UTF_8,
        self::BOM_UTF_16BE      => self::ENCODING_NAME_UTF_16BE,
        self::BOM_UTF_16LE      => self::ENCODING_NAME_UTF_16LE,
        self::BOM_UTF_32BE      => self::ENCODING_NAME_UTF_32BE,
        self::BOM_UTF_32LE      => self::ENCODING_NAME_UTF_32LE,
        self::BOM_UTF_7_38      => self::ENCODING_NAME_UTF_7,
        self::BOM_UTF_7_39      => self::ENCODING_NAME_UTF_7,
        self::BOM_UTF_7_2B      => self::ENCODING_NAME_UTF_7,
        self::BOM_UTF_7_2F      => self::ENCODING_NAME_UTF_7,
        self::BOM_UTF_1         => self::ENCODING_NAME_UTF_1,
        self::BOM_UTF_EBCDIC    => self::ENCODING_NAME_UTF_EBCDIC,
        self::BOM_SCSU          => self::ENCODING_NAME_SCSU,
        self::BOM_BOCU_1        => self::ENCODING_NAME_BOCU_1,
        self::BOM_GB18030       => self::ENCODING_NAME_GB18030,
    ];

    /** Maps an endian-specific encoding name to its family name (e.g. UTF-16BE => UTF-16). */
    public const ENCODING_NAME_MAP  = [
        self::ENCODING_NAME_UTF_8       => self::ENCODING_NAME_UTF_8,
        self::ENCODING_NAME_UTF_16BE    => self::ENCODING_NAME_UTF_16,
        self::ENCODING_NAME_UTF_16LE    => self::ENCODING_NAME_UTF_16,
        self::ENCODING_NAME_UTF_32BE    => self::ENCODING_NAME_UTF_32,
        self::ENCODING_NAME_UTF_32LE    => self::ENCODING_NAME_UTF_32,
        self::ENCODING_NAME_UTF_7       => self::ENCODING_NAME_UTF_7,
        self::ENCODING_NAME_UTF_1       => self::ENCODING_NAME_UTF_1,
        self::ENCODING_NAME_UTF_EBCDIC  => self::ENCODING_NAME_UTF_EBCDIC,
        self::ENCODING_NAME_SCSU        => self::ENCODING_NAME_SCSU,
        self::ENCODING_NAME_BOCU_1      => self::ENCODING_NAME_BOCU_1,
        self::ENCODING_NAME_GB18030     => self::ENCODING_NAME_GB18030,
    ];

    /**
     * Detects a file's encoding from the BOM at its start.
     *
     * @param string $file_path Path (or stream URL) of the file to inspect.
     *
     * @return string|false One of the ENCODING_NAME_* values, or false when the
     *                      file cannot be opened/read or starts with no known BOM.
     */
    public static function detect($file_path)
    {
        $fp = \fopen($file_path, 'rb');
        if ($fp === false) {
            return false;
        }

        // Read the longest possible BOM once and close immediately, so every
        // return path releases the handle and no rewind()/re-read is needed.
        $head = \fread($fp, self::MAX_BOM_LENGTH);
        \fclose($fp);

        if ($head === false || $head === '') {
            return false;
        }

        foreach (self::DETECTION_ORDER as $bom) {
            // strncmp() is binary-safe; it yields non-zero when $head is
            // shorter than $bom, so truncated BOMs never match.
            if (\strncmp($head, $bom, \strlen($bom)) === 0) {
                return self::REPRESENTATION_MAP[$bom];
            }
        }

        return false;
    }
}
Clone this wiki locally