サンプル データには、「double qoute」(sic) の直後に単一の二重引用符があり、二重引用符で囲まれたフィールドの開始が終了しますが、次のコンマまたは行末まで読み続ける必要があります。これは不正な複数行フィールドですが、Excel がそれに対して何を行うかを確認できます。これについての説明は、C での CSV 解析ライブラリと C++ での再実装を含み、このレベルについて説明している (優れた) 本The Practice of Programming (デッド リンク—インターネット アーカイブ コピー;プリンストン大学のライブ リンク) で見つけることができます。詳細の。RFC 4180「Common Format and MIME Type for Comma-Separated Values」には CSV の標準もあり、Wikipediaを調べることもできます。件名に。
もう1つの回答には、まだテスト中のサンプルコードがあります。制限内ではかなり回復力があります。ここでは、SSCCE ( Short, Self-Contained, Correct Example ) に変更されています。
#include <stdbool.h>
#include <wchar.h>
#include <wctype.h>
extern const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline);
// Returns a pointer to the start of the next field,
// or zero if this is the last field in the CSV
// p is the start position of the field
// sep is the separator used, i.e. comma or semicolon
// newline says whether the field ends with a newline or with a comma
const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline)
{
// Parse quoted sequences
if ('"' == p[0]) {
p++;
while (1) {
// Find next double-quote
p = wcschr(p, L'"');
// If we don't find it or it's the last symbol
// then this is the last field
if (!p || !p[1])
return 0;
// Check for "", it is an escaped double-quote
if (p[1] != '"')
break;
// Skip the escaped double-quote
p += 2;
}
}
// Find next newline or comma.
wchar_t newline_or_sep[4] = L"\n\r ";
newline_or_sep[2] = sep;
p = wcspbrk(p, newline_or_sep);
// If no newline or separator, this is the last field.
if (!p)
return 0;
// Check if we had newline.
*newline = (p[0] == '\r' || p[0] == '\n');
// Handle "\r\n", otherwise just increment
if (p[0] == '\r' && p[1] == '\n')
p += 2;
else
p++;
return p;
}
static void dissect(const wchar_t *line)
{
const wchar_t *start = line;
const wchar_t *next;
bool eol;
wprintf(L"Input: %d [%.*ls]\n", wcslen(line), wcslen(line)-1, line);
while ((next = nextCsvField(start, L',', &eol)) != 0)
{
wprintf(L"Field: [%.*ls] (eol = %d)\n", (next - start - eol), start, eol);
start = next;
}
}
static const wchar_t multiline[] =
L"First field of first row,\"This field is multiline\n"
"\n"
"but that's OK because it's enclosed in double quotes, and this\n"
"is an escaped \"\" double quote\" but this one \"\" is not\n"
" \"This is second field of second row, but it is not multiline\n"
" because it doesn't start \n"
" with an immediate double quote\"\n"
;
int main(void)
{
wchar_t line[1024];
while (fgetws(line, sizeof(line)/sizeof(line[0]), stdin))
dissect(line);
dissect(multiline);
return 0;
}
出力例
$ cat csv.data
a,bb, c ,d""e,f
1,"2","",,"""",4
$ ./wcsv < csv.data
Input: 16 [a,bb, c ,d""e,f]
Field: [a,] (eol = 0)
Field: [bb,] (eol = 0)
Field: [ c ,] (eol = 0)
Field: [d""e,] (eol = 0)
Field: [f] (eol = 1)
Input: 17 [1,"2","",,"""",4]
Field: [1,] (eol = 0)
Field: ["2",] (eol = 0)
Field: ["",] (eol = 0)
Field: [,] (eol = 0)
Field: ["""",] (eol = 0)
Field: [4] (eol = 1)
Input: 296 [First field of first row,"This field is multiline
but that's OK because it's enclosed in double quotes, and this
is an escaped "" double quote" but this one "" is not
"This is second field of second row, but it is not multiline
because it doesn't start
with an immediate double quote"]
Field: [First field of first row,] (eol = 0)
Field: ["This field is multiline
but that's OK because it's enclosed in double quotes, and this
is an escaped "" double quote" but this one "" is not] (eol = 1)
Field: [ "This is second field of second row,] (eol = 0)
Field: [ but it is not multiline] (eol = 1)
Field: [ because it doesn't start ] (eol = 1)
Field: [ with an immediate double quote"] (eol = 1)
$
私は「その制限内で」と言った。その制限は何ですか?
主に、変換されたフィールドではなく、生のフィールドを分離します。したがって、それが分離するフィールドは、「実際の」値を生成するように変更する必要があります。囲んでいる二重引用符は取り除かれ、内部の二重二重引用符は単一引用符に置き換えられます。生のフィールドを実際の値に変換すると、関数内の多くのコードが模倣されnextCsvField()
ます。入力は、フィールドの開始とフィールドの終了 (区切り文字) です。これは追加の関数を持つ 2 番目の SSCCE であり、上記csvFieldData()
のdissect()
関数はそれを呼び出すように修正されています。分析された出力の形式はわずかに異なるため、見栄えが良くなります。
#include <stdbool.h>
#include <wchar.h>
#include <wctype.h>
extern const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline);
// Returns a pointer to the start of the next field,
// or zero if this is the last field in the CSV
// p is the start position of the field
// sep is the separator used, i.e. comma or semicolon
// newline says whether the field ends with a newline or with a comma
const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline)
{
// Parse quoted sequences
if ('"' == p[0]) {
p++;
while (1) {
// Find next double-quote
p = wcschr(p, L'"');
// If we don't find it or it's the last symbol
// then this is the last field
if (!p || !p[1])
return 0;
// Check for "", it is an escaped double-quote
if (p[1] != '"')
break;
// Skip the escaped double-quote
p += 2;
}
}
// Find next newline or comma.
wchar_t newline_or_sep[4] = L"\n\r ";
newline_or_sep[2] = sep;
p = wcspbrk(p, newline_or_sep);
// If no newline or separator, this is the last field.
if (!p)
return 0;
// Check if we had newline.
*newline = (p[0] == '\r' || p[0] == '\n');
// Handle "\r\n", otherwise just increment
if (p[0] == '\r' && p[1] == '\n')
p += 2;
else
p++;
return p;
}
static wchar_t *csvFieldData(const wchar_t *fld_s, const wchar_t *fld_e, wchar_t *buffer, size_t buflen)
{
wchar_t *dst = buffer;
wchar_t *end = buffer + buflen - 1;
const wchar_t *src = fld_s;
if (*src == L'"')
{
const wchar_t *p = src + 1;
while (p < fld_e && dst < end)
{
if (p[0] == L'"' && p+1 < fld_s && p[1] == L'"')
{
*dst++ = p[0];
p += 2;
}
else if (p[0] == L'"')
{
p++;
break;
}
else
*dst++ = *p++;
}
src = p;
}
while (src < fld_e && dst < end)
*dst++ = *src++;
if (dst >= end)
return 0;
*dst = L'\0';
return(buffer);
}
static void dissect(const wchar_t *line)
{
const wchar_t *start = line;
const wchar_t *next;
bool eol;
wprintf(L"Input %3zd: [%.*ls]\n", wcslen(line), wcslen(line)-1, line);
while ((next = nextCsvField(start, L',', &eol)) != 0)
{
wchar_t buffer[1024];
wprintf(L"Raw Field: [%.*ls] (eol = %d)\n", (next - start - eol), start, eol);
if (csvFieldData(start, next-1, buffer, sizeof(buffer)/sizeof(buffer[0])) != 0)
wprintf(L"Field %3zd: [%ls]\n", wcslen(buffer), buffer);
start = next;
}
}
static const wchar_t multiline[] =
L"First field of first row,\"This field is multiline\n"
"\n"
"but that's OK because it's enclosed in double quotes, and this\n"
"is an escaped \"\" double quote\" but this one \"\" is not\n"
" \"This is second field of second row, but it is not multiline\n"
" because it doesn't start \n"
" with an immediate double quote\"\n"
;
int main(void)
{
wchar_t line[1024];
while (fgetws(line, sizeof(line)/sizeof(line[0]), stdin))
dissect(line);
dissect(multiline);
return 0;
}
出力例
$ ./wcsv < csv.data
Input 16: [a,bb, c ,d""e,f]
Raw Field: [a,] (eol = 0)
Field 1: [a]
Raw Field: [bb,] (eol = 0)
Field 2: [bb]
Raw Field: [ c ,] (eol = 0)
Field 3: [ c ]
Raw Field: [d""e,] (eol = 0)
Field 4: [d""e]
Raw Field: [f] (eol = 1)
Field 1: [f]
Input 17: [1,"2","",,"""",4]
Raw Field: [1,] (eol = 0)
Field 1: [1]
Raw Field: ["2",] (eol = 0)
Field 1: [2]
Raw Field: ["",] (eol = 0)
Field 0: []
Raw Field: [,] (eol = 0)
Field 0: []
Raw Field: ["""",] (eol = 0)
Field 2: [""]
Raw Field: [4] (eol = 1)
Field 1: [4]
Input 296: [First field of first row,"This field is multiline
but that's OK because it's enclosed in double quotes, and this
is an escaped "" double quote" but this one "" is not
"This is second field of second row, but it is not multiline
because it doesn't start
with an immediate double quote"]
Raw Field: [First field of first row,] (eol = 0)
Field 24: [First field of first row]
Raw Field: ["This field is multiline
but that's OK because it's enclosed in double quotes, and this
is an escaped "" double quote" but this one "" is not] (eol = 1)
Field 140: [This field is multiline
but that's OK because it's enclosed in double quotes, and this
is an escaped " double quote" but this one "" is not]
Raw Field: [ "This is second field of second row,] (eol = 0)
Field 38: [ "This is second field of second row]
Raw Field: [ but it is not multiline] (eol = 1)
Field 24: [ but it is not multiline]
Raw Field: [ because it doesn't start ] (eol = 1)
Field 28: [ because it doesn't start ]
Raw Field: [ with an immediate double quote"] (eol = 1)
Field 34: [ with an immediate double quote"]
$
\r\n
(またはプレーン\r
)行末でテストしていません。