c++ - boost :: property_tree::json_parserおよび2バイト幅の文字

Question

序章

std::string text = "á";

「á」は2バイト文字です（UTF-8エンコーディングを想定）。
したがって、次の行は2を出力します。

std::cout << text.size() << "\n";

ただし、std::cout はこのテキストを正しく出力します。

std::cout << text << "\n";

私の問題

この text を boost::property_tree::ptree に格納してから、write_json に渡しています。

// Store the string under the key "text" in a property tree.
boost::property_tree::ptree root;
root.put<std::string>("text", text);

// Serialize the tree as JSON into an in-memory stream, then print the result.
std::stringstream ss;
boost::property_tree::json_parser::write_json(ss, root);
std::cout << ss.str() << "\n";

結果は

{
    "text": "\u00C3\u00A1"
}

出力されたテキストは「Ã¡」に相当し、元の「á」とは異なります。

std::wstring に切り替えずにこの問題を解決することは可能ですか？ライブラリ（boost::property_tree::ptree）を変更することでこの問題を解決できる可能性はありますか？

score 11 · Accepted Answer

解決策を見つけました。基本的には、boost::property_tree::json_parser::create_escapes テンプレートを [Ch=char] について特殊化し、自分の用途に合った「バグのないエスケープ処理」を提供する必要があります。

JSON標準では、すべての文字列が「\uXXXX」エスケープでUTF-16エンコードされていると想定していますが、一部のライブラリは「\xXX」エスケープによるUTF-8エンコードをサポートしています。JSONファイルをUTF-8でエンコードできる場合は、0x7Fより大きいすべての文字をそのまま通すことができます。これは元の関数が本来意図していた動作です。

boost::property_tree::json_parser::write_json を使用する前に、次のコードを配置しました。これは boost_1_49_0/boost/property_tree/detail/json_parser_write.hpp に由来するものです：

namespace boost { namespace property_tree { namespace json_parser
{
    // Explicit specialization of create_escapes for narrow (char) strings.
    //
    // It has to be seen by the compiler before the first use of write_json in
    // the translation unit; otherwise the primary template is implicitly
    // instantiated for char and this specialization is rejected or ignored.
    //
    // Behavior:
    //   - '"', '\\', '/', '\b', '\f', '\n', '\r' become two-character escapes.
    //   - Any other control character below 0x20 becomes "\u00XX".
    //   - Everything else, including every byte >= 0x80, is copied unchanged,
    //     so UTF-8 multi-byte sequences survive intact instead of being
    //     escaped one byte at a time. The produced JSON is therefore UTF-8.
    //
    // Declared inline: an explicit specialization of a function template is
    // an ordinary non-inline function unless marked so, which would produce
    // multiple-definition link errors if this snippet is placed in a header
    // that is included from more than one translation unit.
    template<>
    inline std::basic_string<char> create_escapes(const std::basic_string<char> &s)
    {
        const char *hexdigits = "0123456789ABCDEF";

        std::basic_string<char> result;
        result.reserve(s.size());

        for (std::basic_string<char>::const_iterator it = s.begin(), last = s.end();
             it != last; ++it)
        {
            const char c = *it;
            switch (c)
            {
            case '\b': result += '\\'; result += 'b';  break;
            case '\f': result += '\\'; result += 'f';  break;
            case '\n': result += '\\'; result += 'n';  break;
            case '\r': result += '\\'; result += 'r';  break;
            case '/':  result += '\\'; result += '/';  break;
            case '"':  result += '\\'; result += '"';  break;
            case '\\': result += '\\'; result += '\\'; break;
            default:
            {
                // Classify on the unsigned byte value so the result does not
                // depend on whether plain char is signed on this platform.
                const unsigned char byte = static_cast<unsigned char>(c);
                if (byte >= 0x20)
                {
                    // Printable ASCII, DEL, or part of a UTF-8 sequence.
                    result += c;
                }
                else
                {
                    // Remaining control characters: always of the form \u00XX.
                    result += '\\'; result += 'u';
                    result += '0';  result += '0';
                    result += hexdigits[byte >> 4];
                    result += hexdigits[byte & 0x0F];
                }
                break;
            }
            }
        }
        return result;
    }
} } }

そして私が得る出力：

{
    "text": "aáb"
}

また、関数 boost::property_tree::json_parser::a_unicode にも、エスケープされたUnicode文字を符号付き文字に読み取る際に同様の問題があります。

score -1 · Accepted Answer

基本多言語面（BMP）より上のコードポイントのサポート：

    // Appends the six-character JSON escape "\uXXXX" (upper-case hex digits)
    // for one UTF-16 code unit to `out`. Only the low 16 bits of `unit` are used.
    template<class Ch>
    void append_utf16_escape(std::basic_string<Ch> &out, unsigned long unit)
    {
        const char *hexdigits = "0123456789ABCDEF";
        out += Ch('\\'); out += Ch('u');
        for (int shift = 12; shift >= 0; shift -= 4)
            out += Ch(hexdigits[(unit >> shift) & 0xF]);
    }

    // Returns a copy of `s` with everything that is not plain ASCII text
    // replaced by JSON escape sequences, so the result is pure 7-bit ASCII.
    //
    //   - '"', '\\', '/', '\b', '\f', '\n', '\r' become two-character escapes.
    //   - Other values in 0x20..0x7F are copied unchanged.
    //   - Values up to 0xFFFF (including control characters) become "\uXXXX".
    //   - Values in 0x10000..0x10FFFF become a UTF-16 surrogate pair
    //     "\uD8xx\uDCxx".
    //   - Values above 0x10FFFF are not Unicode code points and are written as
    //     the replacement character "\uFFFD".
    //
    // Each element of `s` is treated as one code point (after conversion to the
    // unsigned counterpart of Ch). For narrow strings this means every byte
    // >= 0x80 is escaped on its own; UTF-8 sequences are NOT decoded.
    template<class Ch>
    std::basic_string<Ch> create_escapes(const std::basic_string<Ch> &s)
    {
        typedef typename make_unsigned<Ch>::type UCh;

        std::basic_string<Ch> result;
        typename std::basic_string<Ch>::const_iterator b = s.begin();
        typename std::basic_string<Ch>::const_iterator e = s.end();
        for (; b != e; ++b)
        {
            // Work on the unsigned value so that negative signed characters
            // are seen as 0x80 and above rather than as negative numbers.
            unsigned long u = static_cast<unsigned long>(static_cast<UCh>(*b));

            if (*b == Ch('\b')) result += Ch('\\'), result += Ch('b');
            else if (*b == Ch('\f')) result += Ch('\\'), result += Ch('f');
            else if (*b == Ch('\n')) result += Ch('\\'), result += Ch('n');
            else if (*b == Ch('\r')) result += Ch('\\'), result += Ch('r');
            else if (*b == Ch('/')) result += Ch('\\'), result += Ch('/');
            else if (*b == Ch('"'))  result += Ch('\\'), result += Ch('"');
            else if (*b == Ch('\\')) result += Ch('\\'), result += Ch('\\');
            else if (u >= 0x20 && u <= 0x7F)
            {
                // Remaining ASCII range; 0x80 and above is always escaped.
                result += *b;
            }
            else
            {
                if (u > 0x10FFFFul)
                    u = 0xFFFDul;   // out of Unicode range: replacement character

                if (u <= 0xFFFFul)
                {
                    append_utf16_escape(result, u);
                }
                else
                {
                    // Split the supplementary code point into a surrogate pair.
                    const unsigned long offset = u - 0x10000ul;
                    append_utf16_escape(result, 0xD800ul + ((offset >> 10) & 0x3FFul));
                    append_utf16_escape(result, 0xDC00ul + (offset & 0x3FFul));
                }
            }
        }
        return result;
    }

c++ - boost :: property_tree::json_parserおよび2バイト幅の文字

序章

私の問題

3 に答える 3

Related

Reference