// Sample document whose word frequencies are tallied below.
string doc = "This is a test sentence with some words with some words repeating like: is a test";

// Tokenise on single spaces and bucket identical tokens together.
var wordGroups = doc.Split(' ').GroupBy(token => token);

// Order the buckets from most to least frequent and keep at most 1000 of them.
var mostFrequent = wordGroups
    .OrderByDescending(bucket => bucket.Count())
    .Take(1000);

// Map every retained word to its number of occurrences.
var result = mostFrequent.ToDictionary(bucket => bucket.Key, bucket => bucket.Count());
編集:
単語をキー、その最終的な出現回数を値とする辞書を、文字列の配列から取得しようとしているのだと思います。辞書には重複するキーを含めることができないため、Distinct を使用する必要はありません。
メソッドを次のように書き直す必要があります。
/// <summary>
/// Merges the word counts of every document in <paramref name="docs"/> into one dictionary.
/// </summary>
/// <param name="docs">Documents to analyse; each one is passed to <c>ProcessDocument</c>.</param>
/// <returns>
/// A dictionary whose keys are words and whose values are the sum of the counts
/// reported for that word across all documents.
/// </returns>
private Dictionary<string,int> GenerateTerms(string[] docs)
{
    Dictionary<string, int> totals = new Dictionary<string, int>();

    foreach (string document in docs)
    {
        // Fold this document's counts straight into the running totals.
        foreach (var entry in ProcessDocument(document))
        {
            int running;
            // A missing key leaves 'running' at 0, so a first occurrence starts from zero.
            totals.TryGetValue(entry.Key, out running);
            // checked: overflow raises OverflowException, as Enumerable.Sum does.
            totals[entry.Key] = checked(running + entry.Value);
        }
    }

    return totals;
}
/// <summary>
/// Counts how often each space-separated word occurs in <paramref name="doc"/>.
/// </summary>
/// <param name="doc">Text to split into words on the space character.</param>
/// <returns>
/// A dictionary mapping each word to its occurrence count, limited to the 1000
/// most frequent words. A document with no words yields an empty dictionary.
/// </returns>
private Dictionary<string,int> ProcessDocument(string doc)
{
    // RemoveEmptyEntries: consecutive, leading or trailing spaces (and an empty
    // document) would otherwise produce "" tokens that get counted as a word.
    return doc.Split(new[] { ' ' }, System.StringSplitOptions.RemoveEmptyEntries)
        .GroupBy(word => word)
        // Count each group once and reuse the number for both ordering and the value.
        .Select(g => new { Word = g.Key, Count = g.Count() })
        .OrderByDescending(entry => entry.Count)
        .Take(1000)
        .ToDictionary(entry => entry.Word, entry => entry.Count);
}
次に、次のように呼び出すことができます。
// Two distinct sample texts; each one is listed twice in the input array below.
const string longSample = "This is a test sentence with some words with some words repeating like: is a test";
const string shortSample = "This is a test sentence with some words";

string[] docs = { longSample, longSample, shortSample, shortSample };

// Merge the per-document word counts into a single dictionary.
Dictionary<string, int> finalDictionary = GenerateTerms(docs);