非常に大きな文字列を使用すると、メモリが問題になることがあります。以下のコードは、最も長い共通部分文字列を見つけて、より小さな共通部分文字列を含む変数を上書きしますが、インデックスと長さをリストにプッシュして文字列の配列として返すように簡単に変更できます。
これは、 https: //iq.opengenus.org/longest-common-substring-using-rolling-hash/ にある Ashutosh Singh のリファクタリングされた C++ コードです。これにより、O(N * log(N)^2) 時間で部分文字列が検出されます。および O(N) スペース
using System;
using System.Collections.Generic;
public class RollingHash
{
private class RollingHashPowers
{
// _mod = prime modulus of polynomial hashing
// any prime number over a billion should suffice
internal const int _mod = (int)1e9 + 123;
// _hashBase = base (point of hashing)
// this should be a prime number larger than the number of characters used
// in my use case I am only interested in ASCII (256) characters
// for strings in languages using non-latin characters, this should be much larger
internal const long _hashBase = 257;
// _pow1 = powers of base modulo mod
internal readonly List<int> _pow1 = new List<int> { 1 };
// _pow2 = powers of base modulo 2^64
internal readonly List<long> _pow2 = new List<long> { 1L };
internal void EnsureLength(int length)
{
if (_pow1.Capacity < length)
{
_pow1.Capacity = _pow2.Capacity = length;
}
for (int currentIndx = _pow1.Count - 1; currentIndx < length; ++currentIndx)
{
_pow1.Add((int)(_pow1[currentIndx] * _hashBase % _mod));
_pow2.Add(_pow2[currentIndx] * _hashBase);
}
}
}
private class RollingHashedString
{
readonly RollingHashPowers _pows;
readonly int[] _pref1; // Hash on prefix modulo mod
readonly long[] _pref2; // Hash on prefix modulo 2^64
// Constructor from string:
internal RollingHashedString(RollingHashPowers pows, string s, bool caseInsensitive = false)
{
_pows = pows;
_pref1 = new int[s.Length + 1];
_pref2 = new long[s.Length + 1];
const long capAVal = 'A';
const long capZVal = 'Z';
const long aADif = 'a' - 'A';
unsafe
{
fixed (char* c = s)
{
// Fill arrays with polynomial hashes on prefix
for (int i = 0; i < s.Length; ++i)
{
long v = c[i];
if (caseInsensitive && capAVal <= v && v <= capZVal)
{
v += aADif;
}
_pref1[i + 1] = (int)((_pref1[i] + v * _pows._pow1[i]) % RollingHashPowers._mod);
_pref2[i + 1] = _pref2[i] + v * _pows._pow2[i];
}
}
}
}
// Rollingnomial hash of subsequence [pos, pos+len)
// If mxPow != 0, value automatically multiply on base in needed power.
// Finally base ^ mxPow
internal Tuple<int, long> Apply(int pos, int len, int mxPow = 0)
{
int hash1 = _pref1[pos + len] - _pref1[pos];
long hash2 = _pref2[pos + len] - _pref2[pos];
if (hash1 < 0)
{
hash1 += RollingHashPowers._mod;
}
if (mxPow != 0)
{
hash1 = (int)((long)hash1 * _pows._pow1[mxPow - (pos + len - 1)] % RollingHashPowers._mod);
hash2 *= _pows._pow2[mxPow - (pos + len - 1)];
}
return Tuple.Create(hash1, hash2);
}
}
private readonly RollingHashPowers _rhp;
public RollingHash(int longestLength = 0)
{
_rhp = new RollingHashPowers();
if (longestLength > 0)
{
_rhp.EnsureLength(longestLength);
}
}
public string FindCommonSubstring(string a, string b, bool caseInsensitive = false)
{
// Calculate max neede power of base:
int mxPow = Math.Max(a.Length, b.Length);
_rhp.EnsureLength(mxPow);
// Create hashing objects from strings:
RollingHashedString hash_a = new RollingHashedString(_rhp, a, caseInsensitive);
RollingHashedString hash_b = new RollingHashedString(_rhp, b, caseInsensitive);
// Binary search by length of same subsequence:
int pos = -1;
int low = 0;
int minLen = Math.Min(a.Length, b.Length);
int high = minLen + 1;
var tupleCompare = Comparer<Tuple<int, long>>.Default;
while (high - low > 1)
{
int mid = (low + high) / 2;
List<Tuple<int, long>> hashes = new List<Tuple<int, long>>(a.Length - mid + 1);
for (int i = 0; i + mid <= a.Length; ++i)
{
hashes.Add(hash_a.Apply(i, mid, mxPow));
}
hashes.Sort(tupleCompare);
int p = -1;
for (int i = 0; i + mid <= b.Length; ++i)
{
if (hashes.BinarySearch(hash_b.Apply(i, mid, mxPow), tupleCompare) >= 0)
{
p = i;
break;
}
}
if (p >= 0)
{
low = mid;
pos = p;
}
else
{
high = mid;
}
}
// Output answer:
return pos >= 0
? b.Substring(pos, low)
: string.Empty;
}
}