SQLServer Soundex でエンコードされた姓 + 名の列を持つデータベース テーブルがあります。私の C# プログラムでは、クエリで使用するために soundex を使用して文字列を変換したいと考えています。
dotnet ライブラリに soundex 用の標準の文字列関数はありますか? それとも、それを実装しているオープン ソース ライブラリ (おそらく文字列の拡張メソッドとして) はありますか?
遅い回答であることは承知していますが、私も同様のものが必要になりました (データベースとは関係ありませんが)。既存の唯一の回答は正確ではありません (「Tymczak」と「Pfister」で失敗します)。
これは私が思いついたものです:
/// <summary>
/// Smoke test for <c>Soundex.Generate</c>: checks a set of well-known names,
/// punctuation handling, and the inputs that must yield <c>Soundex.Empty</c>.
/// </summary>
class Program
{
    /// <summary>
    /// Runs every assertion in sequence.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    public static void Main(string[] args)
    {
        // NOTE(review): assumes an NUnit/MSTest-style Assert.AreEqual(expected, actual);
        // the expected value is passed first so failure messages read correctly.

        // A single letter is padded with zeros.
        Assert.AreEqual("H000", Soundex.Generate("H"));

        // Similar-sounding names share a code.
        Assert.AreEqual("R163", Soundex.Generate("Robert"));
        Assert.AreEqual("R163", Soundex.Generate("Rupert"));
        Assert.AreEqual("R150", Soundex.Generate("Rubin"));

        // Same-class consonants separated by H collapse into one digit.
        Assert.AreEqual("A261", Soundex.Generate("Ashcraft"));
        Assert.AreEqual("A261", Soundex.Generate("Ashcroft"));

        // Same-class consonants separated by a vowel are both kept.
        Assert.AreEqual("T522", Soundex.Generate("Tymczak"));

        // The first letter's class suppresses an immediately following letter of that class.
        Assert.AreEqual("P236", Soundex.Generate("Pfister"));

        Assert.AreEqual("G362", Soundex.Generate("Gutierrez"));
        Assert.AreEqual("J250", Soundex.Generate("Jackson"));
        Assert.AreEqual("V532", Soundex.Generate("VanDeusen"));
        Assert.AreEqual("D250", Soundex.Generate("Deusen"));

        // W is ignored.
        Assert.AreEqual("S630", Soundex.Generate("Sword"));
        Assert.AreEqual("S630", Soundex.Generate("Sord"));

        // Non-letters are stripped before encoding.
        Assert.AreEqual("L230", Soundex.Generate("Log-out"));
        Assert.AreEqual("L230", Soundex.Generate("Logout"));

        // Inputs with no letters at all produce the empty code.
        Assert.AreEqual(Soundex.Empty, Soundex.Generate("123"));
        Assert.AreEqual(Soundex.Empty, Soundex.Generate(""));
        Assert.AreEqual(Soundex.Empty, Soundex.Generate(null));
    }
}
/// <summary>
/// Generates four-character Soundex codes (one letter followed by three digits)
/// for strings, considering only the letters A-Z.
/// </summary>
public static class Soundex
{
    /// <summary>Code returned when the input contains no letters A-Z.</summary>
    public const string Empty = "0000";

    // Everything that is not an upper-case ASCII letter is stripped before encoding.
    private static readonly Regex Sanitiser = new Regex(@"[^A-Z]", RegexOptions.Compiled);

    // First alternative: a digit followed by any number of repeats of that same digit,
    // each optionally preceded by H/W, collapses to the single captured digit.
    // Second alternative: any remaining H/W run is dropped (group 1 is unset, so "$1" is empty).
    private static readonly Regex CollapseRepeatedNumbers = new Regex(@"(\d)(?:[WH]*\1)*|[WH]+", RegexOptions.Compiled);

    // Vowels (and Y) carry no digit; they are removed after collapsing so that they
    // still separate repeated digits.
    private static readonly Regex RemoveVowelSounds = new Regex(@"[AEIOUY]", RegexOptions.Compiled);

    /// <summary>
    /// Computes the Soundex code of <paramref name="Phrase"/>.
    /// </summary>
    /// <param name="Phrase">Text to encode; may be null or empty.</param>
    /// <returns>
    /// A four-character code: the first letter A-Z of the input (upper-cased) followed by
    /// three digits, zero-padded; or <see cref="Empty"/> when the input has no letters A-Z.
    /// </returns>
    public static string Generate(string Phrase)
    {
        // Upper-case with the invariant culture so that e.g. a Turkish locale does not
        // turn 'i' into a dotted capital that the sanitiser would then strip.
        Phrase = Sanitiser.Replace((Phrase ?? string.Empty).ToUpperInvariant(), string.Empty);

        // Nothing left to encode.
        if (string.IsNullOrEmpty(Phrase))
        {
            return Empty;
        }

        // Replace consonants by their sound-class digit; vowels, H and W stay as letters.
        var Numified = Numify(Phrase);

        // Collapse repeated digits (even across H/W) and drop H/W.
        Numified = CollapseRepeatedNumbers.Replace(Numified, @"$1");

        // The first letter is emitted verbatim, so its own entry must not also be
        // emitted: for a coded consonant this drops its digit (which has already
        // absorbed any same-class neighbours), for a vowel it drops the vowel itself.
        if (Numified.Length > 0 && Numified[0] == Numify(Phrase[0]))
        {
            Numified = Numified.Substring(1);
        }

        // Vowels have done their job as separators; remove them.
        Numified = RemoveVowelSounds.Replace(Numified, string.Empty);

        // Letter + digits, padded with zeros and cut to exactly four characters.
        return (Phrase[0] + Numified).PadRight(4, '0').Substring(0, 4);
    }

    /// <summary>Maps every character of <paramref name="Phrase"/> through <see cref="Numify(char)"/>.</summary>
    private static string Numify(string Phrase)
    {
        var mapped = new char[Phrase.Length];
        for (int i = 0; i < mapped.Length; i++)
        {
            mapped[i] = Numify(Phrase[i]);
        }
        return new string(mapped);
    }

    /// <summary>
    /// Returns the Soundex digit for an upper-case consonant, or the character itself
    /// when it has no digit (vowels, H, W, Y).
    /// </summary>
    private static char Numify(char Character)
    {
        switch (Character)
        {
            case 'B': case 'F': case 'P': case 'V':
                return '1';
            case 'C': case 'G': case 'J': case 'K': case 'Q': case 'S': case 'X': case 'Z':
                return '2';
            case 'D': case 'T':
                return '3';
            case 'L':
                return '4';
            case 'M': case 'N':
                return '5';
            case 'R':
                return '6';
            default:
                return Character;
        }
    }
}
SQL の SOUNDEX に準じて、C# では次のようなものを使用できます。
/// <summary>
/// Computes a four-character Soundex code for <paramref name="data"/>.
/// </summary>
/// <remarks>
/// The first character of the input is kept as-is (upper-cased), whatever it is.
/// Subsequent consonants are mapped to digits 1-6; a digit is skipped when it equals
/// the code of the previously coded letter, unless a vowel (A, E, I, O, U, Y) came
/// in between. H, W and non-letter characters are ignored and do not separate
/// repeated codes.
/// </remarks>
/// <param name="data">Text to encode; may be null or empty.</param>
/// <returns>
/// The first character followed by up to three digits, right-padded with '0' to
/// four characters; "0000" when <paramref name="data"/> is null or empty.
/// </returns>
public static string Soundex(string data)
{
    StringBuilder result = new StringBuilder(4);
    if (!string.IsNullOrEmpty(data))
    {
        // Invariant casing keeps the result independent of the current culture.
        char first = char.ToUpperInvariant(data[0]);
        result.Append(first);

        // Seed with the first letter's own code so that an immediately following
        // letter of the same class (e.g. the F in "Pfister") is not emitted.
        char previousCode = SoundexDigit(first);

        for (int i = 1; i < data.Length && result.Length < 4; i++)
        {
            char letter = char.ToUpperInvariant(data[i]);
            char code = SoundexDigit(letter);
            if (code != '\0')
            {
                if (code != previousCode)
                {
                    result.Append(code);
                }
                previousCode = code;
            }
            else if ("AEIOUY".IndexOf(letter) >= 0)
            {
                // A vowel separates equal codes: the next consonant is emitted
                // even if it has the same code as the one before the vowel.
                previousCode = '\0';
            }
            // H, W and non-letters fall through: they neither emit nor separate.
        }
    }

    // Pad with zeros up to the fixed four-character width.
    if (result.Length < 4)
    {
        result.Append('0', 4 - result.Length);
    }
    return result.ToString();
}

/// <summary>
/// Returns the Soundex digit for an upper-case consonant, or '\0' when the
/// character carries no digit (vowels, H, W, Y and anything that is not A-Z).
/// </summary>
private static char SoundexDigit(char letter)
{
    switch (letter)
    {
        case 'B': case 'F': case 'P': case 'V':
            return '1';
        case 'C': case 'G': case 'J': case 'K': case 'Q': case 'S': case 'X': case 'Z':
            return '2';
        case 'D': case 'T':
            return '3';
        case 'L':
            return '4';
        case 'M': case 'N':
            return '5';
        case 'R':
            return '6';
        default:
            return '\0';
    }
}