postgresql - Postgres COPYコマンド-コンマ付きのフィールド、二重引用符で引用

Question

postgres csvのインポートに関連するいくつかの投稿を検索して見つけましたが、現在の問題を解決するものは何もありません。

私は常にpostgrescopyコマンドを使用して、異種のデータソースからシステムにデータを取り込みます。現在、1億行の.csvファイルで苦労しており、カンマ引用符で区切られています。問題は次のような行にあります：

009098,0981098094,"something","something else",""this one, well, is a problem"", "another thing"

カンマが埋め込まれた二重引用符で囲まれたフィールド。フィールドが正しく解析されず、エラーが発生します。

"ERROR:  extra data after last expected column"

通常、これが発生した場合、問題のある行をその場で処理しますが、このファイルは非常に大きいので、それを防ぐためのより一般的な方法を望んでいます。改訂されたデータ形式を要求することは可能ではありません。

copy mytable from '/path/to/file.csv' csv header quote '"'

score 4 · Accepted Answer

それは不正な形式の CSV です。引用フィールド内に二重引用符を埋め込むには、二重引用符を二重にします。例えば：

"where","is ""pancakes""","house?"

次の 3 つの値があります。

where
is "pancakes"
house?

問題が発生している行には、二重引用符が二重になっています。

009098,0981098094,"something","something else",""this one, well, is a problem"", "another thing"
                                               ^^                            ^^

正しいバージョンがあいまいであるため、 COPYがこれについてできることは何もないと思い"this one, well, is a problem"ます"""this one, well, is a problem"""。

手で直せばいいと思います。sed壊れた行を一意に識別できる場合は、簡単なワンライナーで作業を行うことができます。

参考までに、私が見た CSV 標準に最も近いものはRFC 4180であり、セクション 2 には次のように書かれています。

5.  Each field may or may not be enclosed in double quotes (however
    some programs, such as Microsoft Excel, do not use double quotes
    at all).  If fields are not enclosed with double quotes, then
    double quotes may not appear inside the fields.  For example:

    "aaa","bbb","ccc" CRLF
     zzz,yyy,xxx
[...]
7.  If double-quotes are used to enclose fields, then a double-quote
    appearing inside a field must be escaped by preceding it with
    another double quote.  For example:

    "aaa","b""bb","ccc"

score 2 · Accepted Answer

これは、Kernighan と Plauger によるThe Practice of Programmingの CSV コードに基づいたコードで、奇妙で不正な形式の CSV データを処理するように適合されています。（それほど難しいことではありませんでした。すでにメインコードが機能してパッケージ化されているため、CSV出力関数を追加しadvquoted()、この質問の奇妙な形式を処理するように関数を変更する必要がありました.

csv2.h

/*
@(#)File:           $RCSfile: csv2.h,v $
@(#)Version:        $Revision: 2.1 $
@(#)Last changed:   $Date: 2012/11/01 22:23:07 $
@(#)Purpose:        Scanner for Comma Separated Variable (CSV) Data
@(#)Author:         J Leffler
@(#)Origin:         Kernighan & Pike, 'The Practice of Programming'
*/

/*TABSTOP=4*/

#ifndef CSV2_H
#define CSV2_H

#ifdef  __cplusplus
extern "C" {
#endif

#ifdef MAIN_PROGRAM
#ifndef lint
/* Prevent over-aggressive optimizers from eliminating ID string */
const char jlss_id_csv2_h[] = "@(#)$Id: csv2.h,v 2.1 2012/11/01 22:23:07 jleffler Exp $";
#endif /* lint */
#endif /* MAIN_PROGRAM */

#include <stdio.h>

extern char  *csvgetline(FILE *ifp);    /* Read next input line */
extern char  *csvgetfield(size_t n);    /* Return field n */
extern size_t csvnfield(void);          /* Return number of fields */
extern void   csvreset(void);           /* Release space used by CSV */

extern int    csvputfield(FILE *ofp, const char *field);
extern int    csvputline(FILE *ofp, char **fields, int nfields);
extern void   csvseteol(const char *eol);

#ifdef  __cplusplus
}
#endif

#endif /* CSV2_H */

csv2.c

/*
@(#)File:           $RCSfile: csv2.c,v $
@(#)Version:        $Revision: 2.1 $
@(#)Last changed:   $Date: 2012/11/01 22:23:07 $
@(#)Purpose:        Scanner for Comma Separated Variable (CSV) Data
@(#)Modification:   Deal with specific malformed CSV
@(#)Author:         J Leffler
@(#)Origin:         Kernighan & Pike, 'The Practice of Programming'
*/

/*TABSTOP=4*/

#ifndef lint
/* Prevent over-aggressive optimizers from eliminating ID string */
const char jlss_id_csv2_c[] = "@(#)$Id: csv2.c,v 2.1 2012/11/01 22:23:07 jleffler Exp $";
#endif /* lint */

/*
** See RFC 4180 (http://www.ietf.org/rfc/rfc4180.txt).
**
** Specific malformed CSV - see SO 13183644 (http://stackoverflow.com/questions/13183644).
** Data contains malformed CSV fields like: OK,""this is a problem"",OK
** Two (but not three) field quotes at the start extract as "this is a problem" (with the quotes).
*/

#include "csv2.h"
#include <stdlib.h>
#include <string.h>

enum { NOMEM = -2 };

static char *line = 0;      /* Input line */
static char *sline = 0;     /* Split line */
static size_t maxline = 0;  /* Size of line[] and sline[] */
static char **field = 0;    /* Field pointers */
static size_t maxfield = 0; /* Size of field[] */
static size_t nfield = 0;   /* Number of fields */

static char fieldsep[]= ",";    /* Field separator characters */
static char fieldquote = '"';   /* Quote character */

static char eolstr[8] = "\n";

void csvreset(void)
{
    free(line);
    free(sline);
    free(field);
    line = 0;
    sline = 0;
    field = 0;
    maxline = maxfield = nfield = 0;
}

static int endofline(FILE *ifp, int c)
{
    int eol = (c == '\r' || c == '\n');
    if (c == '\r')
    {
        c = getc(ifp);
        if (c != '\n' && c != EOF)
            ungetc(c, ifp);
    }
    return(eol);
}

/* Modified to deal with specific malformed CSV */
static char *advquoted(char *p)
{
    size_t i;
    size_t j;
    if (p[0] == fieldquote && (p[1] != *fieldsep && p[1] != fieldquote))
    {
        /* Malformed CSV: ""some stuff"" --> "some stuff" */
        /* Find "\"\"," or "\"\"\0" to mark end of field */
        /* If we don't find it, drop through to 'regular' case */
        char *eof = strstr(&p[2], "\"\"");
        if (eof != 0 && (eof[2] == *fieldsep || eof[2] == '\0'))
        {
            p[eof + 1 - p] = '\0';
            return(eof + 2);
        }
    }
    for (i = j = 0; p[j] != '\0'; i++, j++)
    {
        if (p[j] == fieldquote && p[++j] != fieldquote)
        {
            size_t k = strcspn(p+j, fieldsep);
            memmove(p+i, p+j, k);  // 1 -> i fixing transcription error
            i += k;
            j += k;
            break;
        }
        p[i] = p[j];
    }
    p[i] = '\0';
    return(p + j);
}

static int split(void)
{
    char *p;
    char **newf;
    char *sepp;
    int sepc;

    nfield = 0;
    if (line[0] == '\0')
        return(0);
    strcpy(sline, line);
    p = sline;

    do
    {
        if (nfield >= maxfield)
        {
            maxfield *= 2;
            newf = (char **)realloc(field, maxfield * sizeof(field[0]));
            if (newf == 0)
                return NOMEM;
            field = newf;
        }
        if (*p == fieldquote)
            sepp = advquoted(++p);
        else
            sepp = p + strcspn(p, fieldsep);
        sepc = sepp[0];
        sepp[0] = '\0';
        field[nfield++] = p;
        p = sepp + 1;
    } while (sepc == ',');

    return(nfield);
}

char *csvgetline(FILE *ifp)
{
    size_t i;
    int    c;

    if (line == NULL)
    {
        /* Allocate on first call */
        maxline = maxfield = 1;
        line = (char *)malloc(maxline);     /*=C++=*/
        sline = (char *)malloc(maxline);    /*=C++-*/
        field = (char **)malloc(maxfield*sizeof(field[0])); /*=C++=*/
        if (line == NULL || sline == NULL || field == NULL)
        {
            csvreset();
            return(NULL);   /* out of memory */
        }
    }
    for (i = 0; (c = getc(ifp)) != EOF && !endofline(ifp, c); i++)
    {
        if (i >= maxline - 1)
        {
            char  *newl;
            char  *news;
            maxline *= 2;
            newl = (char *)realloc(line, maxline);  /*=C++=*/
            news = (char *)realloc(sline, maxline); /*=C++-*/
            if (newl == NULL || news == NULL)
            {
                csvreset();
                return(NULL);   /* out of memory */
            }
            line = newl;
            sline = news;
        }
        line[i] = c;
    }
    line[i] = '\0';
    if (split() == NOMEM)
    {
        csvreset();
        return(NULL);
    }
    return((c == EOF && i == 0) ? NULL : line);
}


char *csvgetfield(size_t n)
{
    if (n >= nfield)
        return(0);
    return(field[n]);
}

size_t csvnfield(void)
{
    return(nfield);
}

int csvputfield(FILE *ofp, const char *ofield)
{
    const char escapes[] = "\",\r\n";
    if (strpbrk(ofield, escapes) != 0)
    {
        size_t len = strlen(ofield) + 2;
        const char *pos = ofield;
        while ((pos = strchr(pos, '"')) != 0)
        {
            len++;
            pos++;
        }
        char *space = malloc(len+1);
        if (space == 0)
            return EOF;
        char *cpy = space;
        pos = ofield;
        *cpy++ = '"';
        char c;
        while ((c = *pos++) != '\0')
        {
            if (c == '"')
                *cpy++ = c;
            *cpy++ = c;
        }
        *cpy++ = '"';
        *cpy = '\0';
        int rc = fputs(space, ofp);
        free(space);
        return rc;
    }
    else
        return fputs(ofield, ofp);
}

int csvputline(FILE *ofp, char **fields, int nfields)
{
    for (int i = 0; i < nfields; i++)
    {
        if (i > 0)
            putc(',', ofp);
        if (csvputfield(ofp, fields[i]) == EOF)
            return EOF;
    }
    return(fputs(eolstr, ofp));
}

void csvseteol(const char *eol)
{
    size_t nbytes = strlen(eol);
    if (nbytes >= sizeof(eolstr))
        nbytes = sizeof(eolstr) - 1;
    memmove(eolstr, eol, nbytes);
    eolstr[nbytes] = '\0';
}

#ifdef TEST

int main(void)
{
    char *in_line;

    while ((in_line = csvgetline(stdin)) != 0)
    {
        size_t n = csvnfield();
        char *fields[n];        /* C99 VLA */
        printf("line = '%s'\n", in_line);
        for (size_t i = 0; i < n; i++)
        {
            printf("field[%zu] = '%s'\n", i, csvgetfield(i));
            printf("field[%zu] = [", i);
            csvputfield(stdout, csvgetfield(i));
            fputs("]\n", stdout);
            fields[i] = csvgetfield(i);
        }
        printf("fields[0..%zu] = ", n-1);
        csvputline(stdout, fields, n);
    }

    return(0);
}

#endif /* TEST */

でコードをコンパイルして、サンプル関数-DTESTを含むプログラムを作成します。main()C99 コンパイラが必要です。のコードはmain()VLA (可変長配列) を使用しています。動的なメモリ割り当てまたは悲観的な (過剰な) メモリ割り当てを使用して、これを回避できます (最近では、数千のポインターの配列がほとんどのシステムを殺すことはありませんが、1 行に数千のフィールドを持つ CSV ファイルはほとんどありません)。

サンプルデータ

質問のデータに厳密に基づいています。

009098,0981098094,"something","something else",""this one, well, is a problem"", "another thing"
123458,1234561007,"anything","nothing else",""this one, well, is a problem"","dohicky
503458,1234598094,"nothing","everything else","""this one, well, it isn't a problem""","abelone"
610078,1236100794,"everything","anything else","this ""isn't a problem"", he said.","Orcas Rule"

出力例

line = '009098,0981098094,"something","something else",""this one, well, is a problem"", "another thing"'
field[0] = '009098'
field[0] = [009098]
field[1] = '0981098094'
field[1] = [0981098094]
field[2] = 'something'
field[2] = [something]
field[3] = 'something else'
field[3] = [something else]
field[4] = '"this one, well, is a problem"'
field[4] = ["""this one, well, is a problem"""]
field[5] = ' "another thing"'
field[5] = [" ""another thing"""]
fields[0..5] = 009098,0981098094,something,something else,"""this one, well, is a problem"""," ""another thing"""
line = '123458,1234561007,"anything","nothing else",""this one, well, is a problem"","dohicky'
field[0] = '123458'
field[0] = [123458]
field[1] = '1234561007'
field[1] = [1234561007]
field[2] = 'anything'
field[2] = [anything]
field[3] = 'nothing else'
field[3] = [nothing else]
field[4] = '"this one, well, is a problem"'
field[4] = ["""this one, well, is a problem"""]
field[5] = 'dohicky'
field[5] = [dohicky]
fields[0..5] = 123458,1234561007,anything,nothing else,"""this one, well, is a problem""",dohicky
line = '503458,1234598094,"nothing","everything else","""this one, well, it isn't a problem""","abelone"'
field[0] = '503458'
field[0] = [503458]
field[1] = '1234598094'
field[1] = [1234598094]
field[2] = 'nothing'
field[2] = [nothing]
field[3] = 'everything else'
field[3] = [everything else]
field[4] = '"this one, well, it isn't a problem"'
field[4] = ["""this one, well, it isn't a problem"""]
field[5] = 'abelone'
field[5] = [abelone]
fields[0..5] = 503458,1234598094,nothing,everything else,"""this one, well, it isn't a problem""",abelone
line = '610078,1236100794,"everything","anything else","this ""isn't a problem"", he said.","Orcas Rule"'
field[0] = '610078'
field[0] = [610078]
field[1] = '1236100794'
field[1] = [1236100794]
field[2] = 'everything'
field[2] = [everything]
field[3] = 'anything else'
field[3] = [anything else]
field[4] = 'this "isn't a problem", he said.'
field[4] = ["this ""isn't a problem"", he said."]
field[5] = 'Orcas Rule'
field[5] = [Orcas Rule]
fields[0..5] = 610078,1236100794,everything,anything else,"this ""isn't a problem"", he said.",Orcas Rule

フィールドは 2 回印刷されます。1 回はフィールド抽出をテストするため、もう 1 回はフィールド印刷をテストするためです。csvputline()ファイルを不正な形式の CSV から適切な形式の CSV に変換する以外は、印刷を削除して出力を簡素化します。

postgresql - Postgres COPYコマンド-コンマ付きのフィールド、二重引用符で引用

2 に答える 2

csv2.h

csv2.c

サンプルデータ

出力例

Related

Reference