c++ - utf-16でエンコードされたテキストを含むstd::stringをutf-16wstringに変換するにはどうすればよいですか？

Question

したがって РќРѕРІР°СЏ РїР°РїРєР°、utf-16でエンコードされた行（utf-16）のutf-8表現であるような文字列を取得しますНовая папка。この文字列をエンコードを変更せずにwstringに変換します。つまり、文字列からすべてのデータを変換せずに文字列からwstringに変換します。。したがって、コンテンツを含むwstringを取得しНовая папкаます。そのようなことをどのように行うのですか？

更新： 私が言いたかったこと-文字列内に正しいutf-16文字列のすべてのデータがあります。必要なのは、そのデータをwstringに入れることだけです...つまり、wstringにwcharが含まれている場合、00002つの文字列文字00を00まとめて配置する必要があります。それは私がどうしたらいいかわからないことです。

Update2 どうやってここにたどり着いたのか-サーバーで使用する義務があるC++ライブラリはCスタイルのパーサーです。そしてそれは私にstd::stringとしてユーザーリクエストアドレスを返します。クライアントにそのような形式でリクエストを送信させている間。

url_encode(UTF16toUTF8(wstring)) //pseudocode.

どこ

string UTF16toUTF8(const wstring & in)
{
    string out;
    unsigned int codepoint;
    bool completecode = false;
    for (wstring::const_iterator p = in.begin();  p != in.end();  ++p)
    {
        if (*p >= 0xd800 && *p <= 0xdbff)
        {
            codepoint = ((*p - 0xd800) << 10) + 0x10000;
            completecode = false;
        }
        else if (!completecode && *p >= 0xdc00 && *p <= 0xdfff)
        {
            codepoint |= *p - 0xdc00;
            completecode = true;
        }
        else
        {
            codepoint = *p;
            completecode = true;
        }
        if (completecode)
        {
            if (codepoint <= 0x7f)
                out.push_back(codepoint);
            else if (codepoint <= 0x7ff)
            {
                out.push_back(0xc0 | ((codepoint >> 6) & 0x1f));
                out.push_back(0x80 | (codepoint & 0x3f));
            }
            else if (codepoint <= 0xffff)
            {
                out.push_back(0xe0 | ((codepoint >> 12) & 0x0f));
                out.push_back(0x80 | ((codepoint >> 6) & 0x3f));
                out.push_back(0x80 | (codepoint & 0x3f));
            }
            else
            {
                out.push_back(0xf0 | ((codepoint >> 18) & 0x07));
                out.push_back(0x80 | ((codepoint >> 12) & 0x3f));
                out.push_back(0x80 | ((codepoint >> 6) & 0x3f));
                out.push_back(0x80 | (codepoint & 0x3f));
            }
        }
    }
    return out;
}

std::string url_encode( std::string sSrc )
{
    const char SAFE[256] =
    {
        /*      0 1 2 3  4 5 6 7  8 9 A B  C D E F */
        /* 0 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* 1 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* 2 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* 3 */ 1,1,1,1, 1,1,1,1, 1,1,0,0, 0,0,0,0,

        /* 4 */ 0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        /* 5 */ 1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,
        /* 6 */ 0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        /* 7 */ 1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,

        /* 8 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* 9 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* A */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* B */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,

        /* C */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* D */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* E */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0
    };
    const char DEC2HEX[16 + 1] = "0123456789ABCDEF";
    const unsigned char * pSrc = (const unsigned char *)sSrc.c_str();
    const int SRC_LEN = sSrc.length();
    unsigned char * const pStart = new unsigned char[SRC_LEN * 3];
    unsigned char * pEnd = pStart;
    const unsigned char * const SRC_END = pSrc + SRC_LEN;

    for (; pSrc < SRC_END; ++pSrc)
    {
        if (SAFE[*pSrc]) 
            *pEnd++ = *pSrc;
        else
        {
            // escape this char
            *pEnd++ = '%';
            *pEnd++ = DEC2HEX[*pSrc >> 4];
            *pEnd++ = DEC2HEX[*pSrc & 0x0F];
        }
    }

    std::string sResult((char *)pStart, (char *)pEnd);
    delete [] pStart;
    return sResult;
}

std::string url_decode( std::string sSrc )
{
    // Note from RFC1630:  "Sequences which start with a percent sign
    // but are not followed by two hexadecimal characters (0-9, A-F) are reserved
    // for future extension"

    const char HEX2DEC[256] = 
    {
        /*       0  1  2  3   4  5  6  7   8  9  A  B   C  D  E  F */
        /* 0 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 1 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 2 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 3 */  0, 1, 2, 3,  4, 5, 6, 7,  8, 9,-1,-1, -1,-1,-1,-1,

        /* 4 */ -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 5 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 6 */ -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 7 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,

        /* 8 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 9 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* A */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* B */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,

        /* C */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* D */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* E */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* F */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1
    };

    const unsigned char * pSrc = (const unsigned char *)sSrc.c_str();
    const int SRC_LEN = sSrc.length();
    const unsigned char * const SRC_END = pSrc + SRC_LEN;
    const unsigned char * const SRC_LAST_DEC = SRC_END - 2;   // last decodable '%' 

    char * const pStart = new char[SRC_LEN];
    char * pEnd = pStart;

    while (pSrc < SRC_LAST_DEC)
    {
        if (*pSrc == '%')
        {
            char dec1, dec2;
            if (-1 != (dec1 = HEX2DEC[*(pSrc + 1)])
                && -1 != (dec2 = HEX2DEC[*(pSrc + 2)]))
            {
                *pEnd++ = (dec1 << 4) + dec2;
                pSrc += 3;
                continue;
            }
        }

        *pEnd++ = *pSrc++;
    }

    // the last 2- chars
    while (pSrc < SRC_END)
        *pEnd++ = *pSrc++;

    std::string sResult(pStart, pEnd);
    delete [] pStart;
    return sResult;
}

もちろん、私はurl_decodeを呼び出しますが、文字列を取得します。

score 2 · Accepted Answer

あなたの問題を解決するために私がいじくり回しているのは次のとおりです。

std::string wrong("РќРѕРІР°СЏ РїР°РїРєР°");
std::wstring correct( (wchar_t*)wrong.data() );

http://www.cplusplus.com/reference/string/string/data/によると、data() メンバー関数は生の char* を提供する必要があり、単純に (wchar_t*) にキャストすると 00 が固定され、あなたの例で説明しているように、00を一緒にして0000にします。

私は個人的にこのようなキャスティングは好きではありませんが、今のところ思いついたのはこれだけです。

編集 - どのライブラリを使用していますか? それが行ったことを元に戻すための他の機能が付属していますか?

人気がある場合は、他の誰かが以前にこの問題を抱えていたことは確かです。彼らはどのようにそれを解決しましたか?

編集 2 - これは、malloc、元の文字列に半分のコードポイントが存在しないといういくつかの仮定、および別のひどいキャストを使用した嫌な方法です。:(

std::string wrong("РќРѕРІР°СЏ РїР°РїРєР°");
wchar_t *lesswrong = (wchar_t*) malloc (wrong.size()/sizeof(wchar_t) + sizeof(wchar_t));
lesswrong = (wchar_t*)wrong.data();
lesswrong[wrong.size()] = '\0';
std::wstring correct( lesswrong );

これが正解なんてありえない。それが機能していても、それはとても醜いです。

編集 3 - Kerrick sadi のように、これはより良い方法です。

std::string wrong("РќРѕРІР°СЏ РїР°РїРєР°");
std::wstring correct( (wchar_t*)wrong.data(), wrong.size()/2 );

score 1 · Accepted Answer

私があなたを正しく理解していれば、エンコードされた文字列std::stringを含むオブジェクトがあり、UTF-16エンコードを変更せずにそれをに変換したいと考えていstd::wstringます。私が正しければ、エンコーディングや表現の変換を行う必要はなく、ストレージのみを変換する必要があります。

また、文字列が誤ってにエンコードされた可能性があると考えていますUTF-8。ただし、UTF-8は可変長エンコーディングですが、誤って解釈されたデータ ( РќРsiРІР°СЏ РїР°РїРєР°は 22 文字) の長さは、元のデータの長さ ( Новая папкаは 11 文字) のちょうど 2 倍です。これが、間違ったエンコーディングではなく、間違ったストレージのケースである可能性があると私が疑う理由です。

次のコードはそれを行います。

std::wstring convert_utf16_string_to_wstring(const std::string& input) {
    assert((input.size() & 1) == 0);
    size_t len = input.size() / 2;
    std::wstring output;
    output.resize(len);

    for (size_t i = 0; i < len; ++i) {
        unsigned char chr1 = (unsigned char)input[2 * i];
        unsigned char chr2 = (unsigned char)input[2 * i + 1];

        // Note: this line suppose that you use `UTF-16-BE` both for
        // the std::string and the std::wstring. You'll have to swap
        // chr1 & chr2 if this is not the case.
        unsigned short val = (chr2 << 8)|(chr1);
        output[i] = (wchar_t)(val);
    }

    return output;
}

すべてのプラットフォームでsizeof(wchar_t)等しい 2 をターゲットにすることがわかっている場合 (たとえば、64 ビットプログラムの 1 つの Mac OS ではsizeof(wchar_t)4 に等しい場合は当てはまりません)、単純なキャストを使用できます。

std::wstring convert_utf16_string_to_wstring(const std::string& input) {
    assert(sizeof(wchar_t) == 2); // A static assert would be better here
    assert((input.size() & 1) == 0);
    return input.empty()
        ? std::wstring()
        : std::wstring((wchar_t*)input[0], input.size() / 2);
}

c++ - utf-16でエンコードされたテキストを含むstd::stringをutf-16wstringに変換するにはどうすればよいですか？

2 に答える 2

Related

Reference