分享

判断字符串的编码类型(GBK / UTF8)

 9loong 2009-03-03
 
GBK:

/*
 * Report whether the buffer starts with a GBK double-byte character.
 *
 * s  - buffer to inspect; only the first two bytes are examined.
 * ns - number of readable bytes in s.
 *
 * Returns 1 when at least two bytes are available, the first byte is a GBK
 * lead byte (0x81-0xfe) and the second is a GBK trail byte (0x40-0x7e or
 * 0x80-0xfe); returns 0 otherwise, including for a null pointer.
 */
int isgbk(char *s, size_t ns)
{
        const uint8_t *p = (const uint8_t *)s;

        /* A double-byte character needs exactly two bytes, not more. */
        if (!s || ns < 2)
                return 0;

        /* Lead byte range. */
        if (p[0] < 0x81 || p[0] > 0xfe)
                return 0;

        /* Trail byte: 0x40-0xfe with the two holes 0x7f and 0xff. */
        if (p[1] < 0x40 || p[1] == 0x7f || p[1] == 0xff)
                return 0;

        return 1;
}

 bool isGBKCode(const string& strIn)
{
    unsigned char ch1;
    unsigned char ch2;

    if (strIn.size() >= 2)
    {
        ch1 = (unsigned char)strIn.at(0);
        ch2 = (unsigned char)strIn.at(1);
        if (ch1>=129 && ch1<=254 && ch2>=64 && ch2<=254)
            return true;
        else return false;
    }
    else return false;
}

 
UTF8:

/*
 * Measure the multi-byte UTF-8 sequence at the start of a buffer.
 *
 * s  - buffer to inspect.
 * ns - number of readable bytes in s.
 *
 * Returns the length in bytes (2-6) of the multi-byte sequence that starts at
 * s[0] when the lead byte is followed by the required number of continuation
 * bytes (10xxxxxx) within the first ns bytes. Returns 0 for a null pointer,
 * an empty buffer, an ASCII byte, a stray continuation byte, the bytes
 * 0xfe/0xff, a truncated sequence, or a malformed continuation byte.
 *
 * The legacy 5- and 6-byte lead forms are still accepted. Overlong encodings
 * and surrogate code points are not rejected.
 */
int isutf8(char *s, size_t ns)
{
        const uint8_t *p = (const uint8_t *)s;
        size_t nbytes;
        size_t j;

        if (!s || ns == 0)
                return 0;

        /* The count of leading 1 bits in the first byte gives the length. */
        if ((p[0] & 0xe0) == 0xc0)
                nbytes = 2;             /* 110xxxxx */
        else if ((p[0] & 0xf0) == 0xe0)
                nbytes = 3;             /* 1110xxxx */
        else if ((p[0] & 0xf8) == 0xf0)
                nbytes = 4;             /* 11110xxx */
        else if ((p[0] & 0xfc) == 0xf8)
                nbytes = 5;             /* 111110xx */
        else if ((p[0] & 0xfe) == 0xfc)
                nbytes = 6;             /* 1111110x */
        else
                return 0;               /* ASCII, continuation byte, 0xfe or 0xff */

        /* A sequence cut short by the end of the buffer is not valid. */
        if (ns < nbytes)
                return 0;

        /* Every byte after the lead must look like 10xxxxxx. */
        for (j = 1; j < nbytes; j++) {
                if ((p[j] & 0xc0) != 0x80)
                        return 0;
        }

        return (int)nbytes;
}

1
            2
            3
            4
            5
            6
            7
            8
            9
            10
            11
            12
            13
            14
            15
            16
            17
            18
            19
            20
            21
            22
            23
            24
            25
            26
            27
            28
            29
            30
            31
            32
            33
            34
            35
            36
            37
            38
            39
            40
            41
            42
            43
            44
            45
            46
            47
            48
            49
            50
            51
            52
            53
            54
            55
            56
            57
            58
            59
            60
            61
            62
            63
            64
            65
            66
            67
            68
            69
            70
            71
            72
            73
            74
            75
            76
            77
            78
            79
            80
            81
            
/*
 * Decide whether a NUL-terminated string looks like UTF-8 text.
 *
 * buf - NUL-terminated byte string to scan.
 *
 * Returns true only when at least one complete multi-byte sequence was seen
 * and nothing before the end of the string disqualified it. Returns false
 * when:
 *   - a 7-bit byte is not classified as plain ASCII text in text_chars,
 *   - a continuation byte (10xxxxxx) appears where a lead byte is expected,
 *   - a lead byte is 0xfe or 0xff,
 *   - a byte following a lead byte is not a continuation byte,
 *   - the string is pure 7-bit (no multi-byte sequence at all).
 * If the terminator is reached in the middle of a multi-byte sequence, the
 * result reflects only the sequences completed before that point.
 */
bool isUtf8(const char *buf)
{
#define F 0  /* character never appears in text */
#define T 1  /* character appears in plain ASCII text */
#define I 2  /* character appears in ISO-8859 text */
#define X 3  /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

    /* Per-byte classification; only the 7-bit half is consulted below. */
    static const unsigned char text_chars[256] = {
        /*                  BEL BS HT LF    FF CR    */
        F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */
        /*                              ESC          */
        F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */
        /*            NEL                            */
        X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */
        X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I  /* 0xfX */
    };

    bool saw_multibyte = false;
    unsigned char byte;
    int pos = 0;

    while ((byte = buf[pos]) != 0) {
        if (byte < 0x80) {
            /*
             * 7-bit byte: valid UTF-8 by itself, but odd control
             * characters still disqualify the string as text.
             */
            if (text_chars[byte] != T) {
                return false;
            }
            pos++;
            continue;
        }

        if (byte < 0xc0) {
            /* 10xxxxxx can never start a sequence. */
            return false;
        }

        /* Lead byte: its value range gives the number of trailing bytes. */
        int trailing;
        if (byte < 0xe0) {
            trailing = 1;               /* 110xxxxx */
        } else if (byte < 0xf0) {
            trailing = 2;               /* 1110xxxx */
        } else if (byte < 0xf8) {
            trailing = 3;               /* 11110xxx */
        } else if (byte < 0xfc) {
            trailing = 4;               /* 111110xx */
        } else if (byte < 0xfe) {
            trailing = 5;               /* 1111110x */
        } else {
            return false;               /* 0xfe and 0xff */
        }

        for (int k = 0; k < trailing; k++) {
            pos++;
            byte = buf[pos];
            if (byte == 0) {
                /* String ended inside a sequence: report what was seen so far. */
                return saw_multibyte;
            }
            if ((byte & 0xc0) != 0x80) {
                return false;
            }
        }

        saw_multibyte = true;
        pos++;
    }

    /* Don't claim it's UTF-8 if it's all 7-bit. */
    return saw_multibyte;
}

#undef F
#undef T
#undef I
#undef X

    本站是提供个人知识管理的网络存储空间,所有内容均由用户发布,不代表本站观点。请注意甄别内容中的联系方式、诱导购买等信息,谨防诈骗。如发现有害或侵权内容,请点击一键举报。
    转藏 分享 献花(0)

    0条评论

    发表

    请遵守用户 评论公约

    类似文章 更多