c# - .NETを使用して2つのファイルをすばやく比較するにはどうすればよいですか？

Question

一般的なアプローチでは、FileStreamを介してバイナリを読み取り、バイトごとに比較することをお勧めします。

CRCなどのチェックサム比較はより高速になりますか？
ファイルのチェックサムを生成できる.NETライブラリはありますか？

score 148 · Accepted Answer

最も遅い方法は、2 つのファイルをバイトごとに比較することです。私が思いついた最速の方法は同様の比較ですが、一度に 1 バイトずつではなく、Int64 のサイズのバイト配列を使用して、結果の数値を比較します。

これが私が思いついたものです：

    const int BYTES_TO_READ = sizeof(Int64);

    /// <summary>
    /// Determines whether two files have identical contents by reading both in
    /// BYTES_TO_READ-byte chunks and comparing each pair of chunks as a single
    /// Int64 value.
    /// </summary>
    /// <param name="first">The first file to compare.</param>
    /// <param name="second">The second file to compare.</param>
    /// <returns><c>true</c> when both files hold the same bytes; otherwise <c>false</c>.</returns>
    static bool FilesAreEqual(FileInfo first, FileInfo second)
    {
        if (first.Length != second.Length)
            return false;

        // Only an exact path match proves that both arguments name the same file.
        // A case-insensitive match would wrongly equate "a.bin" and "A.BIN" on a
        // case-sensitive file system; such pairs fall through to the content
        // comparison below, which gives the right answer either way.
        if (string.Equals(first.FullName, second.FullName, StringComparison.Ordinal))
            return true;

        using (FileStream fs1 = first.OpenRead())
        using (FileStream fs2 = second.OpenRead())
        {
            byte[] one = new byte[BYTES_TO_READ];
            byte[] two = new byte[BYTES_TO_READ];

            // Loop until end of stream rather than pre-computing an int iteration
            // count, which overflows for very large files.
            while (true)
            {
                int count1 = FillChunk(fs1, one);
                int count2 = FillChunk(fs2, two);

                if (count1 != count2)
                    return false;

                if (count1 == 0)
                    return true;

                if (count1 < BYTES_TO_READ)
                {
                    // Final partial chunk: zero the unread tail of both buffers so
                    // the Int64 comparison only reflects bytes that were read.
                    Array.Clear(one, count1, BYTES_TO_READ - count1);
                    Array.Clear(two, count1, BYTES_TO_READ - count1);
                }

                if (BitConverter.ToInt64(one, 0) != BitConverter.ToInt64(two, 0))
                    return false;
            }
        }
    }

    // Reads from the stream until the buffer is full or the stream ends, because a
    // single Stream.Read call may return fewer bytes than requested. Returns the
    // number of bytes placed in the buffer.
    static int FillChunk(Stream stream, byte[] buffer)
    {
        int total = 0;
        while (total < buffer.Length)
        {
            int read = stream.Read(buffer, total, buffer.Length - total);
            if (read == 0)
                break;

            total += read;
        }

        return total;
    }

私のテストでは、この方法は単純な ReadByte() による方法よりもほぼ 3 倍高速でした。1000 回以上の実行の平均で、このメソッドは 1063 ミリ秒、以下のメソッド (単純なバイトごとの比較) は 3031 ミリ秒かかりました。ハッシュによる比較は常に 1 秒未満で完了し、平均は約 865 ミリ秒でした。このテストは、約 100MB のビデオファイルで行いました。

比較のために、私が使用した ReadByte メソッドとハッシュメソッドを次に示します。

    /// <summary>
    /// Baseline comparison: determines whether two files have identical contents
    /// by reading and comparing them one byte at a time.
    /// </summary>
    /// <param name="first">The first file to compare.</param>
    /// <param name="second">The second file to compare.</param>
    /// <returns><c>true</c> when both files hold the same bytes; otherwise <c>false</c>.</returns>
    static bool FilesAreEqual_OneByte(FileInfo first, FileInfo second)
    {
        long length = first.Length;
        if (length != second.Length)
            return false;

        // Only an exact path match proves both arguments name the same file; a
        // case-insensitive match is wrong on case-sensitive file systems.
        if (string.Equals(first.FullName, second.FullName, StringComparison.Ordinal))
            return true;

        using (FileStream fs1 = first.OpenRead())
        using (FileStream fs2 = second.OpenRead())
        {
            // The counter is a long because file lengths can exceed int.MaxValue.
            for (long i = 0; i < length; i++)
            {
                if (fs1.ReadByte() != fs2.ReadByte())
                    return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Determines whether two files have the same MD5 hash.
    /// </summary>
    /// <param name="first">The first file to compare.</param>
    /// <param name="second">The second file to compare.</param>
    /// <returns><c>true</c> when both MD5 hashes match byte for byte; otherwise <c>false</c>.</returns>
    static bool FilesAreEqual_Hash(FileInfo first, FileInfo second)
    {
        byte[] firstHash = ComputeMd5(first);
        byte[] secondHash = ComputeMd5(second);

        for (int i = 0; i < firstHash.Length; i++)
        {
            if (firstHash[i] != secondHash[i])
                return false;
        }
        return true;
    }

    // Hashes one file, disposing both the hash algorithm and the file stream so
    // that no file handle stays open after the comparison.
    static byte[] ComputeMd5(FileInfo file)
    {
        using (MD5 md5 = MD5.Create())
        using (FileStream stream = file.OpenRead())
        {
            return md5.ComputeHash(stream);
        }
    }

score 127 · Accepted Answer

チェックサムの比較は、バイトごとの比較よりも遅くなる可能性があります。

チェックサムを生成するには、ファイルの各バイトをロードし、それに対して処理を実行する必要があります。次に、2番目のファイルでこれを行う必要があります。処理はほぼ間違いなく比較チェックよりも遅くなります。

チェックサムの生成について：暗号化クラスを使用すると、これを簡単に行うことができます。これは、C＃でMD5チェックサムを生成する簡単な例です。

ただし、「テスト」または「ベース」ケースのチェックサムを事前に計算できる場合は、チェックサムの方が高速で意味があります。既存のファイルがあり、新しいファイルが既存のファイルと同じであるかどうかを確認する場合、「既存の」ファイルのチェックサムを事前に計算しておけば、ディスク I/O は新しいファイルに対して 1 回実行するだけで済みます。これは、バイトごとの比較よりも高速になる可能性があります。

score 57 · Accepted Answer

完全なバイトごとの比較が本当に必要だと判断した場合（ハッシュの議論については他の回答を参照）、最も簡単な解決策は次のとおりです。

• `System.String` パス名の場合:

/// <summary>
/// Loads both files fully into memory and compares their bytes in order.
/// </summary>
/// <param name="path1">Path of the first file.</param>
/// <param name="path2">Path of the second file.</param>
/// <returns><c>true</c> when both byte sequences are equal; otherwise <c>false</c>.</returns>
public static bool AreFileContentsEqual(String path1, String path2)
{
    byte[] firstContents = File.ReadAllBytes(path1);
    byte[] secondContents = File.ReadAllBytes(path2);
    return firstContents.SequenceEqual(secondContents);
}

• `System.IO.FileInfo` インスタンスの場合:

/// <summary>
/// Compares two files by length first and, when the lengths match and are
/// non-zero, by loading both fully into memory and comparing their bytes.
/// </summary>
/// <param name="fi1">The first file.</param>
/// <param name="fi2">The second file.</param>
/// <returns><c>true</c> when both files hold the same bytes; otherwise <c>false</c>.</returns>
public static bool AreFileContentsEqual(FileInfo fi1, FileInfo fi2)
{
    if (fi1.Length != fi2.Length)
    {
        return false;
    }

    // Two empty files are equal without reading anything.
    if (fi1.Length == 0L)
    {
        return true;
    }

    byte[] firstContents = File.ReadAllBytes(fi1.FullName);
    byte[] secondContents = File.ReadAllBytes(fi2.FullName);
    return firstContents.SequenceEqual(secondContents);
}

他の投稿された回答とは異なり、これはバイナリ、テキスト、メディア、実行可能ファイルなど、あらゆる種類のファイルに対して確実に正しい結果を返します。ただし、完全なバイナリ比較であるため、「重要でない」点 (BOM、行末コード、文字エンコーディング、メディアメタデータ、空白、パディング、ソースコードのコメントなど) だけが異なるファイルも、常に「等しくない」と見なされます (注 1)。

このコードは両方のファイルを完全にメモリにロードするため、本当に巨大なファイルの比較には使用しないでください。その重要な注意点を除けば、.NET GC の設計 (小さくて短命な割り当てを非常に安価に保つよう根本的に最適化されています) を考えると、ファイル全体の読み込みは実際にはペナルティになりません。むしろ、ファイルサイズが 85K 未満と見込まれる場合には最適でさえあり得ます。ここに示すように最小限のユーザーコードを使用することは、ファイルパフォーマンスの問題を CLR、BCL、JIT に最大限に委ね、(たとえば) 最新の設計技術、システムコード、適応的なランタイム最適化の恩恵を受けられることを意味するからです。

さらに、このような日常的なシナリオでは、LINQ 列挙子を介した (ここに示すような) バイト単位の比較のパフォーマンスに関する懸念は意味がありません。ファイル I/O のためにそもそもディスクにアクセスすること自体のコストが、さまざまなメモリ比較手法の違いによるメリットを数桁上回るからです。たとえば、SequenceEqual は実際には最初の不一致で打ち切るという「最適化」を行ってくれますが、ファイルの内容をすでに取得した後では、これはほとんど意味を持ちません。

注 1. 特殊な例外: NTFS 代替データストリームは、このページで説明されているどの回答でも調べられないため、「同じ」と判定されたファイルであっても、代替データストリームの内容は異なっている場合があります。

score 33 · Accepted Answer

Reed Copseyの回答に加えて:

最悪のケースは、2 つのファイルが同一である場合です。この場合、ファイルをバイト単位で比較するのが最善です。
2 つのファイルが同一でない場合は、それらが同一でないことをより早く検出することで、処理を少し高速化できます。

たとえば、2 つのファイルの長さが異なる場合、それらを同一にすることはできないことがわかり、実際の内容を比較する必要さえありません。

score 20 · Accepted Answer

小さな 8 バイトのチャンクを読み取るのではなく、ループを配置して大きなチャンクを読み取ると、さらに高速になります。平均比較時間を 1/4 に短縮しました。

    /// <summary>
    /// Determines whether two files have identical contents. Files of different
    /// lengths are reported as different without being opened; otherwise both
    /// are opened for reading and their streams are compared.
    /// </summary>
    /// <param name="fileInfo1">The first file.</param>
    /// <param name="fileInfo2">The second file.</param>
    /// <returns><c>true</c> when the stream comparison reports equal contents; otherwise <c>false</c>.</returns>
    public static bool FilesContentsAreEqual(FileInfo fileInfo1, FileInfo fileInfo2)
    {
        // Different sizes can never match, so skip opening the files.
        if (fileInfo1.Length != fileInfo2.Length)
        {
            return false;
        }

        using (var firstStream = fileInfo1.OpenRead())
        using (var secondStream = fileInfo2.OpenRead())
        {
            return StreamsContentsAreEqual(firstStream, secondStream);
        }
    }

    /// <summary>
    /// Compares two streams from their current positions to the end, reading
    /// them in 8 KB chunks and comparing each chunk eight bytes at a time.
    /// </summary>
    /// <param name="stream1">The first stream.</param>
    /// <param name="stream2">The second stream.</param>
    /// <returns><c>true</c> when both streams yield the same bytes; otherwise <c>false</c>.</returns>
    private static bool StreamsContentsAreEqual(Stream stream1, Stream stream2)
    {
        const int bufferSize = 1024 * sizeof(Int64);
        var buffer1 = new byte[bufferSize];
        var buffer2 = new byte[bufferSize];

        while (true)
        {
            // Stream.Read may return fewer bytes than requested without being at
            // the end, so each buffer is filled completely (or to end of stream)
            // before the counts are compared.
            int count1 = ReadFullBuffer(stream1, buffer1);
            int count2 = ReadFullBuffer(stream2, buffer2);

            if (count1 != count2)
            {
                return false;
            }

            if (count1 == 0)
            {
                return true;
            }

            // Compare all complete 8-byte words first...
            int wholeWords = count1 / sizeof(Int64);
            for (int i = 0; i < wholeWords; i++)
            {
                if (BitConverter.ToInt64(buffer1, i * sizeof(Int64)) != BitConverter.ToInt64(buffer2, i * sizeof(Int64)))
                {
                    return false;
                }
            }

            // ...then the remaining tail bytes of a final partial chunk, so bytes
            // beyond count1 never take part in the comparison.
            for (int i = wholeWords * sizeof(Int64); i < count1; i++)
            {
                if (buffer1[i] != buffer2[i])
                {
                    return false;
                }
            }
        }
    }

    // Reads until the buffer is full or the stream ends and returns the number
    // of bytes placed in the buffer.
    private static int ReadFullBuffer(Stream stream, byte[] buffer)
    {
        int bytesRead = 0;
        while (bytesRead < buffer.Length)
        {
            int read = stream.Read(buffer, bytesRead, buffer.Length - bytesRead);
            if (read == 0)
            {
                break;
            }

            bytesRead += read;
        }

        return bytesRead;
    }
}

score 14 · Accepted Answer

チェックサムの比較をバイトごとの比較よりもわずかに速くする可能性がある唯一のことは、一度に1つのファイルを読み取っており、ディスクヘッドのシーク時間をいくらか短縮しているという事実です。ただし、そのわずかな増加は、ハッシュを計算するための追加の時間によって非常によく食い尽くされる可能性があります。

Also, a checksum comparison of course only has any chance of being faster if the files are identical. If they are not, a byte-by-byte comparison would end at the first difference, making it a lot faster.

You should also consider that a hash code comparison only tells you that it's very likely that the files are identical. To be 100% certain you need to do a byte-by-byte comparison.

If the hash code for example is 32 bits, you are about 99.99999998% certain that the files are identical if the hash codes match. That is close to 100%, but if you truly need 100% certainty, that's not it.

score 14 · Accepted Answer

編集:この方法は、バイナリファイルの比較には使用できません!

.NET 4.0 では、Fileクラスに次の 2 つの新しいメソッドがあります。

public static IEnumerable<string> ReadLines(string path)
public static IEnumerable<string> ReadLines(string path, Encoding encoding)

つまり、次を使用できます。

// Text comparison, line by line: File.ReadLines yields each file's lines with the
// line terminators removed, and SequenceEqual compares the two line sequences in order.
// Not a byte-for-byte check, so it is unsuitable for binary files.
bool same = File.ReadLines(path1).SequenceEqual(File.ReadLines(path2));

score 7 · Accepted Answer

正直なところ、検索ツリーを可能な限り削減する必要があると思います。

バイトごとに進む前に確認すること:

サイズは同じですか？
ファイル A の最後のバイトはファイル B の最後のバイトと異なりますか？

また、ドライブは順次バイトをより速く読み取るため、一度に大きなブロックを読み取る方が効率的です。バイトごとに移動すると、システムコールがはるかに多くなるだけでなく、両方のファイルが同じドライブ上にある場合、従来のハードドライブの読み取りヘッドがより頻繁に前後にシークします。

チャンク A とチャンク B をバイトバッファーに読み込み、それらを比較します (Array.Equals は使用しないでください。コメントを参照してください)。メモリとパフォーマンスの間の適切なトレードオフであると感じるサイズになるまで、ブロックのサイズを調整します。比較をマルチスレッド化することもできますが、ディスクの読み取りをマルチスレッド化しないでください。

score 6 · Accepted Answer

私の答えは @lars の回答から派生したものですが、Stream.Read の呼び出しに関するバグを修正しています。また、他の回答に含まれていた高速パスのチェックと入力検証も追加しました。要するに、これが答えになるはずです：

using System;
using System.IO;

namespace ConsoleApp4
{
    /// <summary>
    /// Console tool that prints whether the two files named on the command line
    /// have identical contents.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            // Report a usage error instead of failing with IndexOutOfRangeException.
            if (args == null || args.Length < 2)
            {
                Console.Error.WriteLine("Usage: ConsoleApp4 <file1> <file2>");
                Environment.ExitCode = 1;
                return;
            }

            var fi1 = new FileInfo(args[0]);
            var fi2 = new FileInfo(args[1]);
            Console.WriteLine(FilesContentsAreEqual(fi1, fi2));
        }

        /// <summary>
        /// Determines whether two files have identical contents.
        /// </summary>
        /// <param name="fileInfo1">The first file.</param>
        /// <param name="fileInfo2">The second file.</param>
        /// <returns><c>true</c> when both files hold the same bytes; otherwise <c>false</c>.</returns>
        /// <exception cref="ArgumentNullException">Either argument is null.</exception>
        public static bool FilesContentsAreEqual(FileInfo fileInfo1, FileInfo fileInfo2)
        {
            if (fileInfo1 == null)
            {
                throw new ArgumentNullException(nameof(fileInfo1));
            }

            if (fileInfo2 == null)
            {
                throw new ArgumentNullException(nameof(fileInfo2));
            }

            // Fast path: only an exact path match proves both arguments name the
            // same file. A case-insensitive match would wrongly report "a.bin" and
            // "A.BIN" as equal on case-sensitive file systems; such pairs fall
            // through to the content comparison, which is correct either way.
            if (string.Equals(fileInfo1.FullName, fileInfo2.FullName, StringComparison.Ordinal))
            {
                return true;
            }

            if (fileInfo1.Length != fileInfo2.Length)
            {
                return false;
            }

            using (var file1 = fileInfo1.OpenRead())
            using (var file2 = fileInfo2.OpenRead())
            {
                return StreamsContentsAreEqual(file1, file2);
            }
        }

        // Reads until the buffer is full or the stream ends, because a single
        // Stream.Read call may return fewer bytes than requested. Returns the
        // number of bytes placed in the buffer.
        private static int ReadFullBuffer(Stream stream, byte[] buffer)
        {
            int bytesRead = 0;
            while (bytesRead < buffer.Length)
            {
                int read = stream.Read(buffer, bytesRead, buffer.Length - bytesRead);
                if (read == 0)
                {
                    // Reached end of stream.
                    return bytesRead;
                }

                bytesRead += read;
            }

            return bytesRead;
        }

        // Compares two streams chunk by chunk, eight bytes at a time.
        private static bool StreamsContentsAreEqual(Stream stream1, Stream stream2)
        {
            const int bufferSize = 1024 * sizeof(Int64);
            var buffer1 = new byte[bufferSize];
            var buffer2 = new byte[bufferSize];

            while (true)
            {
                int count1 = ReadFullBuffer(stream1, buffer1);
                int count2 = ReadFullBuffer(stream2, buffer2);

                if (count1 != count2)
                {
                    return false;
                }

                if (count1 == 0)
                {
                    return true;
                }

                // Complete 8-byte words first...
                int wholeWords = count1 / sizeof(Int64);
                for (int i = 0; i < wholeWords; i++)
                {
                    if (BitConverter.ToInt64(buffer1, i * sizeof(Int64)) != BitConverter.ToInt64(buffer2, i * sizeof(Int64)))
                    {
                        return false;
                    }
                }

                // ...then the tail bytes of a final partial chunk, so bytes beyond
                // count1 never take part in the comparison.
                for (int i = wholeWords * sizeof(Int64); i < count1; i++)
                {
                    if (buffer1[i] != buffer2[i])
                    {
                        return false;
                    }
                }
            }
        }
    }
}

または、非常に素晴らしいものにしたい場合は、非同期バリアントを使用できます。

using System;
using System.IO;
using System.Threading.Tasks;

namespace ConsoleApp4
{
    /// <summary>
    /// Console tool that prints whether the two files named on the command line
    /// have identical contents, using asynchronous reads.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            // Report a usage error instead of failing with IndexOutOfRangeException.
            if (args == null || args.Length < 2)
            {
                Console.Error.WriteLine("Usage: ConsoleApp4 <file1> <file2>");
                Environment.ExitCode = 1;
                return;
            }

            var fi1 = new FileInfo(args[0]);
            var fi2 = new FileInfo(args[1]);
            Console.WriteLine(FilesContentsAreEqualAsync(fi1, fi2).GetAwaiter().GetResult());
        }

        /// <summary>
        /// Asynchronously determines whether two files have identical contents.
        /// </summary>
        /// <param name="fileInfo1">The first file.</param>
        /// <param name="fileInfo2">The second file.</param>
        /// <returns>A task whose result is <c>true</c> when both files hold the same bytes.</returns>
        /// <exception cref="ArgumentNullException">Either argument is null.</exception>
        public static async Task<bool> FilesContentsAreEqualAsync(FileInfo fileInfo1, FileInfo fileInfo2)
        {
            if (fileInfo1 == null)
            {
                throw new ArgumentNullException(nameof(fileInfo1));
            }

            if (fileInfo2 == null)
            {
                throw new ArgumentNullException(nameof(fileInfo2));
            }

            // Fast path: only an exact path match proves both arguments name the
            // same file. A case-insensitive match would wrongly report "a.bin" and
            // "A.BIN" as equal on case-sensitive file systems; such pairs fall
            // through to the content comparison, which is correct either way.
            if (string.Equals(fileInfo1.FullName, fileInfo2.FullName, StringComparison.Ordinal))
            {
                return true;
            }

            if (fileInfo1.Length != fileInfo2.Length)
            {
                return false;
            }

            using (var file1 = fileInfo1.OpenRead())
            using (var file2 = fileInfo2.OpenRead())
            {
                return await StreamsContentsAreEqualAsync(file1, file2).ConfigureAwait(false);
            }
        }

        // Reads until the buffer is full or the stream ends, because a single
        // ReadAsync call may return fewer bytes than requested. Returns the
        // number of bytes placed in the buffer.
        private static async Task<int> ReadFullBufferAsync(Stream stream, byte[] buffer)
        {
            int bytesRead = 0;
            while (bytesRead < buffer.Length)
            {
                int read = await stream.ReadAsync(buffer, bytesRead, buffer.Length - bytesRead).ConfigureAwait(false);
                if (read == 0)
                {
                    // Reached end of stream.
                    return bytesRead;
                }

                bytesRead += read;
            }

            return bytesRead;
        }

        // Compares two streams chunk by chunk, eight bytes at a time.
        private static async Task<bool> StreamsContentsAreEqualAsync(Stream stream1, Stream stream2)
        {
            const int bufferSize = 1024 * sizeof(Int64);
            var buffer1 = new byte[bufferSize];
            var buffer2 = new byte[bufferSize];

            while (true)
            {
                int count1 = await ReadFullBufferAsync(stream1, buffer1).ConfigureAwait(false);
                int count2 = await ReadFullBufferAsync(stream2, buffer2).ConfigureAwait(false);

                if (count1 != count2)
                {
                    return false;
                }

                if (count1 == 0)
                {
                    return true;
                }

                // Complete 8-byte words first...
                int wholeWords = count1 / sizeof(Int64);
                for (int i = 0; i < wholeWords; i++)
                {
                    if (BitConverter.ToInt64(buffer1, i * sizeof(Int64)) != BitConverter.ToInt64(buffer2, i * sizeof(Int64)))
                    {
                        return false;
                    }
                }

                // ...then the tail bytes of a final partial chunk, so bytes beyond
                // count1 never take part in the comparison.
                for (int i = wholeWords * sizeof(Int64); i < count1; i++)
                {
                    if (buffer1[i] != buffer2[i])
                    {
                        return false;
                    }
                }
            }
        }
    }
}

score 2 · Accepted Answer

My experiments show that it definitely helps to call Stream.ReadByte() fewer times, but using BitConverter to package bytes does not make much difference against comparing bytes in a byte array.

So it is possible to replace that "Math.Ceiling and iterations" loop in the comment above with the simplest one:

            // Plain byte-wise scan over the first count1 bytes of both buffers.
            for (int offset = 0; offset < count1; offset++)
            {
                if (buffer1[offset] != buffer2[offset])
                {
                    return false;
                }
            }

I guess it has to do with the fact that BitConverter.ToInt64 needs to do a bit of work (check arguments and then perform the bit shifting) before you compare and that ends up being the same amount of work as compare 8 bytes in two arrays.

score 2 · Accepted Answer

ファイルが大きすぎない場合は、次を使用できます。

/// <summary>
/// Computes the MD5 hash of a file's contents.
/// </summary>
/// <param name="fileName">Path of the file to hash.</param>
/// <returns>The 16-byte MD5 hash of the file.</returns>
public static byte[] ComputeFileHash(string fileName)
{
    using (var stream = File.OpenRead(fileName))
    // The hash algorithm is IDisposable as well, so release it with the stream.
    using (var md5 = System.Security.Cryptography.MD5.Create())
    {
        return md5.ComputeHash(stream);
    }
}

ハッシュを保存しておくことが役に立つ場合にのみ、ハッシュで比較してください。

(コードをよりクリーンなものに編集しました。)

score 1 · Accepted Answer

2つのファイルを比較するだけでよい場合は、最速の方法だと思います（Cでは、.NETに適用できるかどうかはわかりません）。

両方のファイルf1、f2を開きます
それぞれのファイルの長さl1、l2を取得します
l1！= l2の場合、ファイルは異なります。止まる
mmap（）両方のファイル
mmap（）されたファイルでmemcmp（）を使用する

OTOH、N個のファイルのセットに重複ファイルがあるかどうかを確認する必要がある場合、N方向のビットごとの比較を回避するために、間違いなくハッシュを使用するのが最速の方法です。

score 1 · Accepted Answer

2 つのファイル (または 2 つのストリーム) に同一のデータが含まれているかどうかを判断できるユーティリティ関数を次に示します。

タスクを使用して異なるスレッドでバイト配列 (各ファイルで読み取られたものから満たされた各バッファー) を比較するため、マルチスレッド化された「高速」バージョンを提供しました。

予想どおり、はるかに高速 (約 3 倍) ですが、より多くの CPU (マルチスレッドであるため) とより多くのメモリ (比較スレッドごとに 2 バイト配列バッファーが必要であるため) を消費します。

    /// <summary>
    /// Compares two files using the task-based stream comparison
    /// (AreStreamsIdenticalFast).
    /// </summary>
    /// <param name="path1">Path of the first file.</param>
    /// <param name="path2">Path of the second file.</param>
    /// <returns>The result of the three-argument AreFilesIdentical overload.</returns>
    public static bool AreFilesIdenticalFast(string path1, string path2)
        => AreFilesIdentical(path1, path2, AreStreamsIdenticalFast);

    /// <summary>
    /// Compares two files using the sequential stream comparison
    /// (AreStreamsIdentical).
    /// </summary>
    /// <param name="path1">Path of the first file.</param>
    /// <param name="path2">Path of the second file.</param>
    /// <returns>The result of the three-argument AreFilesIdentical overload.</returns>
    public static bool AreFilesIdentical(string path1, string path2)
        => AreFilesIdentical(path1, path2, AreStreamsIdentical);

    /// <summary>
    /// Determines whether two files are identical by opening both for shared
    /// reading and delegating the content check to the supplied comparison.
    /// </summary>
    /// <param name="path1">Path of the first file.</param>
    /// <param name="path2">Path of the second file.</param>
    /// <param name="areStreamsIdentical">Comparison applied to the two opened streams.</param>
    /// <returns>
    /// <c>false</c> when either file does not exist, when the lengths differ, or
    /// when the comparison reports a difference; otherwise <c>true</c>.
    /// </returns>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    public static bool AreFilesIdentical(string path1, string path2, Func<Stream, Stream, bool> areStreamsIdentical)
    {
        if (path1 == null)
            throw new ArgumentNullException(nameof(path1));

        if (path2 == null)
            throw new ArgumentNullException(nameof(path2));

        // Report the parameter that is actually null (previously named path2).
        if (areStreamsIdentical == null)
            throw new ArgumentNullException(nameof(areStreamsIdentical));

        if (!File.Exists(path1) || !File.Exists(path2))
            return false;

        using (var thisFile = new FileStream(path1, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
        using (var valueFile = new FileStream(path2, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
        {
            // Cheap length check before any content is read.
            if (valueFile.Length != thisFile.Length)
                return false;

            return areStreamsIdentical(thisFile, valueFile);
        }
    }

    /// <summary>
    /// Compares two streams chunk by chunk, handing each pair of chunk buffers
    /// to its own task (via IsSame) while the calling thread keeps reading.
    /// </summary>
    /// <param name="stream1">The first stream.</param>
    /// <param name="stream2">The second stream.</param>
    /// <returns><c>true</c> when every chunk comparison succeeds and both streams end together; otherwise <c>false</c>.</returns>
    /// <exception cref="ArgumentNullException">Either stream is null.</exception>
    public static bool AreStreamsIdenticalFast(Stream stream1, Stream stream2)
    {
        if (stream1 == null)
        {
            throw new ArgumentNullException(nameof(stream1));
        }

        if (stream2 == null)
        {
            throw new ArgumentNullException(nameof(stream2));
        }

        const int chunkSize = 80000; // 80000 is below LOH (85000)

        var comparisons = new List<Task<bool>>();
        while (true)
        {
            // A fresh buffer pair per chunk: each comparison task owns its buffers.
            var chunk1 = new byte[chunkSize];
            var chunk2 = new byte[chunkSize];

            int filled1 = stream1.Read(chunk1, 0, chunk1.Length);
            if (filled1 == 0)
            {
                // First stream is exhausted; the second must be exhausted too.
                if (stream2.Read(chunk2, 0, 1) != 0)
                {
                    return false;
                }

                break;
            }

            // The second stream may return different counts per read, so keep
            // reading until it has supplied exactly as many bytes as the first.
            for (int filled2 = 0; filled2 < filled1;)
            {
                int received = stream2.Read(chunk2, filled2, filled1 - filled2);
                if (received == 0)
                {
                    return false;
                }

                filled2 += received;
            }

            comparisons.Add(Task.Run(() => IsSame(chunk1, chunk2)));
        }

        Task.WaitAll(comparisons.ToArray());
        return comparisons.All(comparison => comparison.Result);
    }

    /// <summary>
    /// Compares two streams sequentially, chunk by chunk, on the calling thread.
    /// </summary>
    /// <param name="stream1">The first stream.</param>
    /// <param name="stream2">The second stream.</param>
    /// <returns><c>true</c> when both streams yield the same bytes and end together; otherwise <c>false</c>.</returns>
    /// <exception cref="ArgumentNullException">Either stream is null.</exception>
    public static bool AreStreamsIdentical(Stream stream1, Stream stream2)
    {
        if (stream1 == null)
            throw new ArgumentNullException(nameof(stream1));

        if (stream2 == null)
            throw new ArgumentNullException(nameof(stream2));

        const int bufsize = 80000; // 80000 is below LOH (85000)
        var buffer1 = new byte[bufsize];
        var buffer2 = new byte[bufsize];

        while (true)
        {
            int read1 = stream1.Read(buffer1, 0, buffer1.Length);
            if (read1 == 0)
                return stream2.Read(buffer2, 0, 1) == 0; // second stream must be at eof too

            // both stream read could return different counts
            int read2 = 0;
            do
            {
                int read3 = stream2.Read(buffer2, read2, read1 - read2);
                if (read3 == 0)
                    return false;

                read2 += read3;
            }
            while (read2 < read1);

            // Compare only the bytes read in this pass instead of the whole
            // 80000-byte buffers.
            for (int i = 0; i < read1; i++)
            {
                if (buffer1[i] != buffer2[i])
                    return false;
            }
        }
    }

    /// <summary>
    /// Determines whether two byte arrays have the same length and the same
    /// value at every index.
    /// </summary>
    /// <param name="bytes1">The first array.</param>
    /// <param name="bytes2">The second array.</param>
    /// <returns><c>true</c> when the arrays match element for element; otherwise <c>false</c>.</returns>
    /// <exception cref="ArgumentNullException">Either array is null.</exception>
    public static bool IsSame(byte[] bytes1, byte[] bytes2)
    {
        if (bytes1 == null)
        {
            throw new ArgumentNullException(nameof(bytes1));
        }

        if (bytes2 == null)
        {
            throw new ArgumentNullException(nameof(bytes2));
        }

        int length = bytes1.Length;
        if (length != bytes2.Length)
        {
            return false;
        }

        // Advance while the elements agree; reaching the end means no mismatch.
        int index = 0;
        while (index < length && bytes1[index] == bytes2[index])
        {
            index++;
        }

        return index == length;
    }

score 1 · Accepted Answer

同じ長さの大きなファイルのもう 1 つの改善点は、ファイルを順番に読み取るのではなく、多かれ少なかれランダムなブロックを比較することです。

複数のスレッドを使用して、ファイル内の異なる位置から開始し、順方向または逆方向に比較できます。

このようにして、シーケンシャルなアプローチを使用してそこに到達するよりも速く、ファイルの途中/最後で変更を検出できます。

score 0 · Accepted Answer

バイトごとに比較するよりも「ハッシュ」の方が速いアプリケーションがあると思います。ファイルを他のファイルと比較する必要がある場合、または変更可能な写真のサムネイルが必要な場合。どこでどのように使用するかによって異なります。

/// <summary>
/// Determines whether two files have identical contents by comparing them one
/// byte at a time.
/// </summary>
/// <param name="file1">Path of the first file.</param>
/// <param name="file2">Path of the second file.</param>
/// <returns><c>true</c> when both files hold the same bytes; otherwise <c>false</c>.</returns>
private bool CompareFilesByte(string file1, string file2)
{
    // Open read-only with shared reading: comparing never needs write access,
    // and it allows read-only files and the same file passed twice.
    using (var fs1 = new FileStream(file1, FileMode.Open, FileAccess.Read, FileShare.Read))
    using (var fs2 = new FileStream(file2, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        if (fs1.Length != fs2.Length) return false;

        while (true)
        {
            int b1 = fs1.ReadByte();
            int b2 = fs2.ReadByte();

            if (b1 != b2) return false;

            // ReadByte returns -1 at end of stream. Both streams reaching the end
            // together without a mismatch means the files are identical; this
            // case previously returned false.
            if (b1 < 0) return true;
        }
    }
}

/// <summary>
/// Computes the SHA-512 hash of a file's entire contents.
/// </summary>
/// <param name="file">Path of the file to hash.</param>
/// <returns>The hash encoded as a Base64 string.</returns>
private string HashFile(string file)
{
    using (var fs = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.Read))
    using (var hash = SHA512.Create())
    {
        // Hash the stream itself. The earlier code read (int)file.Length bytes,
        // but file is the path string, so only the first few bytes were hashed.
        return Convert.ToBase64String(hash.ComputeHash(fs));
    }
}

/// <summary>
/// Compares two files by the strings HashFile returns for each of them.
/// </summary>
/// <param name="file1">Path of the first file.</param>
/// <param name="file2">Path of the second file.</param>
/// <returns><c>true</c> when both hash strings are equal; otherwise <c>false</c>.</returns>
private bool CompareFilesWithHash(string file1, string file2)
{
    string firstHash = HashFile(file1);
    string secondHash = HashFile(file2);
    return string.Equals(firstHash, secondHash);
}

ここでは、最速のものを取得できます。

// Time the hash-based comparison.
var sw = Stopwatch.StartNew();
var compare1 = CompareFilesWithHash(receiveLogPath, logPath);
sw.Stop();
Debug.WriteLine(string.Format("Compare using Hash {0}", sw.ElapsedTicks));

// Time the byte-by-byte comparison, starting again from zero.
sw.Restart();
var compare2 = CompareFilesByte(receiveLogPath, logPath);
sw.Stop();
Debug.WriteLine(string.Format("Compare byte-byte {0}", sw.ElapsedTicks));

オプションで、ハッシュをデータベースに保存できます。

これが役立つことを願っています

c# - .NETを使用して2つのファイルをすばやく比較するにはどうすればよいですか？

18 に答える 18

Related

Reference