linux - 2 つのテキストファイル間で一致するパターンを見つけて、別のファイルに出力する方法は?

Question

テキストの構成が異なる 2 つのテキストファイルがあります。両方のファイルのテキストには、同一のパターン (数字) がいくつか含まれています。両方のファイルに存在するパターン (数字) を見つけて、出力ファイルに書き込みたいと思います。

file1.txt:

blablabla_25947.bkwjcnwelkcnwelckme

blablabla_111.bkwjcnwelkcnwelckme

blablabla_65155.bkwjcnwelkcnwelckme

blablabla_56412.bkwjcnwelkcnwelckme

file2.txt:

blablabla_647728.bkwjcnwelkcnwelck
kjwdhcwkejcwmekcjwhemckwejhcmwekch

blablabla_6387.bkwjcnwelkcnwelckme
wexkwhenqlciwuehnqweiugfnwekfiugew
wedhwnejchwenckhwqecmwequhcnkwjehc
owichjwmelcwqhemclekcelmkjcelkwejc

blablabla_59148.bkwjcnwelkcnwelckme
ecmwequhcnkwjehcowichjwmelcwqhemcle
kcelmkjcelkwejcwecawecwacewwAWWAXEG

blablabla_111.bkwjcnwelkcnwelckm
WESETRBRVSSCQEsfdveradassefwaefawecc

出力ファイル.txt:

score 1 · Accepted Answer

どうですか：

$ egrep -o '_[0-9]+\.' file1 | grep -of - file2 | tr -d '_.'
111

# Redirect to new file
$ egrep -o '_[0-9]+\.' file1 | grep -of - file2 | tr -d '_.' > file3

最初の grep で、file1 からすべての数字文字列 (直前に _ があり、直後に . が続くもの) を取得します。次に、このリストを使って file2 内の一致を grep で検索します。最後に、tr を使って _ と . を取り除きます。

score 0 · Accepted Answer

実際、あなたが提起していると思っていた「難しい問題」を解決しようとしました。次のコードは、file1 と file2 の両方で見つかった最長の文字列を探します。「最も長い」文字列が複数ある場合は、最初に見つかった文字列のみが報告されます。ある時点で誰かに役立つかもしれません（ただし、ここで探している解決策ではないかもしれません）：

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>

/*
 * Return the size in bytes of the file named file_name, as reported by
 * stat().
 *
 * The process is terminated with EXIT_FAILURE (after a message on stderr)
 * if stat() fails, or if the reported size cannot be represented in the
 * 'unsigned' return type.
 */

static unsigned
get_file_size (const char * file_name)
{
    struct stat sb;
    if (stat (file_name, & sb) != 0) {
        fprintf (stderr, "'stat' failed for '%s': %s.\n",
                 file_name, strerror (errno));
        exit (EXIT_FAILURE);
    }
    /* st_size is an off_t, which may be wider than unsigned. Returning it
       unchecked would silently wrap for very large files, so refuse them
       instead of reporting a wrong size. */
    if (sb.st_size < 0 || (unsigned long long) sb.st_size > UINT_MAX) {
        fprintf (stderr, "'%s' is too large to be handled.\n", file_name);
        exit (EXIT_FAILURE);
    }
    return (unsigned) sb.st_size;
}

/*
 * Read the entire file named file_name into a newly allocated buffer and
 * return it.
 *
 * The buffer holds the file's bytes followed by a terminating NUL byte, so
 * it can be handed to the string functions (which will stop early if the
 * file itself contains a NUL byte).  The caller owns the buffer and must
 * release it with free().
 *
 * Any failure (allocation, open, short read, close) prints a message on
 * stderr and terminates the process with EXIT_FAILURE; the function never
 * returns NULL.
 */

static unsigned char *
read_whole_file (const char * file_name)
{
    unsigned s;
    unsigned char * contents;
    FILE * f;
    size_t bytes_read;
    int status;

    s = get_file_size (file_name);
    /* One extra byte for the terminating NUL; the cast keeps s + 1 from
       wrapping in unsigned arithmetic where size_t is wider. */
    contents = malloc ((size_t) s + 1);
    if (! contents) {
        fprintf (stderr, "Not enough memory.\n");
        exit (EXIT_FAILURE);
    }

    /* Binary mode: the expected byte count comes from the on-disk size, so
       no newline translation may happen while reading. */
    f = fopen (file_name, "rb");
    if (! f) {
        fprintf (stderr, "Could not open '%s': %s.\n", file_name,
                 strerror (errno));
        exit (EXIT_FAILURE);
    }
    bytes_read = fread (contents, sizeof (unsigned char), s, f);
    if (bytes_read != s) {
        /* %u / %zu match the unsigned and size_t arguments. */
        fprintf (stderr, "Short read of '%s': expected %u bytes "
                 "but got %zu: %s.\n", file_name, s, bytes_read,
                 strerror (errno));
        exit (EXIT_FAILURE);
    }
    /* Terminate the buffer; without this, strlen() on the result would
       read past the end of the allocation. */
    contents[bytes_read] = '\0';

    status = fclose (f);
    if (status != 0) {
        fprintf (stderr, "Error closing '%s': %s.\n", file_name,
                 strerror (errno));
        exit (EXIT_FAILURE);
    }
    return contents;
}

/* Size of the buffer that holds the reported match, terminating NUL
   included; matches are therefore capped at MAX_MATCH - 1 bytes. */
enum { MAX_MATCH = 1000 };

/*
 * Find the longest run of bytes that occurs in both a[0..a_len) and
 * b[0..b_len), considering runs of at most max_len bytes.
 *
 * Returns the length of the run (0 if the inputs share no byte) and stores
 * the offset of the run within a in *start.  When several runs share the
 * maximal length, the first one encountered (lowest offset in a) wins.
 */
static size_t
longest_common_run (const unsigned char * a, size_t a_len,
                    const unsigned char * b, size_t b_len,
                    size_t max_len, size_t * start)
{
    size_t best = 0;
    size_t i, j, len;

    *start = 0;
    for (i = 0; i < a_len; i++) {
        /* Every offset of b has to be tried for every offset of a;
           starting j at i would miss any match that sits earlier in b
           than in a. */
        for (j = 0; j < b_len; j++) {
            len = 0;
            /* Bounds are tested before the bytes are compared so that no
               index at or beyond either length is ever read. */
            while (len < max_len && i + len < a_len && j + len < b_len
                   && a[i + len] == b[j + len]) {
                len++;
            }
            if (len > best) {
                best = len;
                *start = i;
            }
        }
    }
    return best;
}

/*
 * Usage: PROGRAM FILE1 FILE2
 *
 * Prints the longest string found in both files (capped at MAX_MATCH - 1
 * bytes).  Returns 0 on success and EXIT_FAILURE when fewer than two file
 * names are given.
 */
int main(int argc, char* argv[]){
    unsigned char longestString[MAX_MATCH] = {0}; /* empty if nothing matches */
    size_t longestFound, start, l1, l2;
    unsigned char *f1, *f2; /* buffers with entire file contents */

    if (argc < 3) {
        fprintf(stderr, "Usage: %s FILE1 FILE2\n",
                argc > 0 ? argv[0] : "longest-match");
        return EXIT_FAILURE;
    }

    f1  = read_whole_file (argv[1]);
    f2  = read_whole_file (argv[2]);

    /* strlen needs NUL-terminated buffers.
       NOTE(review): confirm read_whole_file terminates what it returns. */
    l1 = strlen((const char *) f1);
    l2 = strlen((const char *) f2);

    longestFound = longest_common_run(f1, l1, f2, l2, MAX_MATCH - 1, &start);
    memcpy(longestString, f1 + start, longestFound);
    longestString[longestFound] = 0; /* terminate string */

    printf("longest string found is %d characters:\n", (int) longestFound);
    printf("%s\n", (const char *) longestString);
    free(f1);
    free(f2);
    return 0;
}

ファイルの内容全体を読み取るためのコードは、http://www.lemoda.net/c/read-whole-file/index.htmlにあります。

linux - 2 つのテキスト ファイル間で一致するパターンを見つけて、別のファイルに出力する方法は?

回答 2 件

Related

Reference

linux - 2 つのテキストファイル間で一致するパターンを見つけて、別のファイルに出力する方法は?