0

入力文字列と以前に入力した文字列の間であいまい一致テストを行っています。テストは入力中にライブで実行されます。

私はすでに多くの言語に翻訳されているStrikeAMatchと呼ばれる驚くほど正確なアルゴリズムを実装しています。私が見つけた最速の Ruby 実装はamatchです。ただし、 Ruby (MRI) 用の C インタープリターを必要とするC 拡張でデータを処理するため、私の JRuby 環境とは互換性がありません。ただし、かなり高速です。

a = "Lorem ipsum dolor"
b = "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nam cursus. Morbi ut mi. Nullam enim leo, egestas id, condimentum at, laoreet mattis, massa. Sed eleifend nonummy diam. Praesent mauris ante,"
puts Benchmark.measure {
  10000.times { a.pair_distance_similar(b) }
}
# => 0.130000   0.000000   0.130000 (  0.146347)

代替環境の設定を回避できれば幸いです。別のアプローチとして、JRuby Wiki で提案されているように、元の Java コードを試して移植することもできます。しかし、それを行う方法がわかりません。

これにアプローチする方法についてのアイデアはありますか?

4

2 に答える 2

0

アルゴリズムは簡単に実装できます。たとえば、これは私が Java で書いた簡単な実装です。

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class StrikeAMatch {

    protected int numPairs(final String string) {
        return Math.max(0, string.length() - 1);
    }

    protected Map<String, Integer> pairs(final String string) {
        if (string.isEmpty()) {
            return new HashMap<>(0);
        }

        final Map<String, Integer> pairs = new HashMap(2 * numPairs(string));

        for (int i = 1, j = string.length(); i < j; i += 1) {
            final char a = string.charAt(i - 1);
            final char b = string.charAt(i);
            final String pair = String.format("%c%c", a, b);
            if (pairs.containsKey(pair)) {
                pairs.put(pair, 1 + pairs.get(pair));
            }
            else {
                pairs.put(pair, 1);
            }
        }

        return pairs;
    }

    protected int intersectionSize(
            final Map<String, Integer> lhsPairs,
            final Map<String, Integer> rhsPairs) {
        final Set<String> lhsKeys = lhsPairs.keySet();
        final Set<String> rhsKeys = rhsPairs.keySet();
        final Set<String> pairIntersection = new HashSet<>(lhsKeys);
        pairIntersection.retainAll(rhsKeys);
        int sharedPairs = 0;
        for (final String pair : pairIntersection) {
            sharedPairs += Math.min(lhsPairs.get(pair), rhsPairs.get(pair));
        }
        return sharedPairs;
    }

    public double between(final String lhs, final String rhs) {
        if (lhs.isEmpty() && rhs.isEmpty()) {
            return 1.0;
        }
        final Map<String, Integer> lhsPairs = pairs(lhs.toUpperCase());
        final Map<String, Integer> rhsPairs = pairs(rhs.toUpperCase());
        return (2.0 * intersectionSize(lhsPairs, rhsPairs))
             / (numPairs(lhs) + numPairs(rhs));
    }

    public static void main(final String[] args) {
        final StrikeAMatch distance = new StrikeAMatch();
        for (final String lhs : args) {
            for (final String rhs : args) {
                System.out.printf("d(\"%s\", \"%s\") = [%.7f]%n",
                    lhs, rhs, distance.between(lhs, rhs));
            }
        }
    }
}

最初と最後の文字をパディングして、1 文字または 0 文字の用語に拡張することもできます。

import java.util.HashMap;
import java.util.Map;

public class PaddedStrikeAMatch extends StrikeAMatch {

    protected int numPairs(final String string) {
        return string.length() + 1;
    }

    protected Map<String, Integer> pairs(final String string) {
        if (string.isEmpty()) {
            final Map<String, Integer> pairs = new HashMap<>(1);
            pairs.put("  ", 1);
            return pairs;
        }

        final Map<String, Integer> pairs = new HashMap(2 * numPairs(string));
        pairs.put(String.format(" %c", string.charAt(0)), 1);
        pairs.put(String.format("%c ", string.charAt(string.length() - 1)), 1);

        for (int i = 1, j = string.length(); i < j; i += 1) {
            final char a = string.charAt(i - 1);
            final char b = string.charAt(i);
            final String pair = String.format("%c%c", a, b);
            if (pairs.containsKey(pair)) {
                pairs.put(pair, 1 + pairs.get(pair));
            }
            else {
                pairs.put(pair, 1);
            }
        }

        return pairs;
    }

    public static void main(final String[] args) {
        final StrikeAMatch distance = new PaddedStrikeAMatch();
        for (final String lhs : args) {
            for (final String rhs : args) {
                System.out.printf("d(\"%s\", \"%s\") = [%.7f]%n",
                    lhs, rhs, distance.between(lhs, rhs));
            }
        }
    }
}

それを検証するために、提供したリンクで提案されている例を次に示します。

% java StrikeAMatch France French
d("France", "France") = [1.0000000]
d("France", "French") = [0.4000000]
d("French", "France") = [0.4000000]
d("French", "French") = [1.0000000]

% java StrikeAMatch Healed Sealed Healthy Heard Herded Help Sold
d("Healed", "Healed") = [1.0000000]
d("Healed", "Sealed") = [0.8000000]
d("Healed", "Healthy") = [0.5454545]
d("Healed", "Heard") = [0.4444444]
d("Healed", "Herded") = [0.4000000]
d("Healed", "Help") = [0.2500000]
d("Healed", "Sold") = [0.0000000]
d("Sealed", "Healed") = [0.8000000]
d("Sealed", "Sealed") = [1.0000000]
d("Sealed", "Healthy") = [0.3636364]
d("Sealed", "Heard") = [0.2222222]
d("Sealed", "Herded") = [0.2000000]
d("Sealed", "Help") = [0.0000000]
d("Sealed", "Sold") = [0.0000000]
d("Healthy", "Healed") = [0.5454545]
d("Healthy", "Sealed") = [0.3636364]
d("Healthy", "Healthy") = [1.0000000]
d("Healthy", "Heard") = [0.4000000]
d("Healthy", "Herded") = [0.1818182]
d("Healthy", "Help") = [0.2222222]
d("Healthy", "Sold") = [0.0000000]
d("Heard", "Healed") = [0.4444444]
d("Heard", "Sealed") = [0.2222222]
d("Heard", "Healthy") = [0.4000000]
d("Heard", "Heard") = [1.0000000]
d("Heard", "Herded") = [0.4444444]
d("Heard", "Help") = [0.2857143]
d("Heard", "Sold") = [0.0000000]
d("Herded", "Healed") = [0.4000000]
d("Herded", "Sealed") = [0.2000000]
d("Herded", "Healthy") = [0.1818182]
d("Herded", "Heard") = [0.4444444]
d("Herded", "Herded") = [1.0000000]
d("Herded", "Help") = [0.2500000]
d("Herded", "Sold") = [0.0000000]
d("Help", "Healed") = [0.2500000]
d("Help", "Sealed") = [0.0000000]
d("Help", "Healthy") = [0.2222222]
d("Help", "Heard") = [0.2857143]
d("Help", "Herded") = [0.2500000]
d("Help", "Help") = [1.0000000]
d("Help", "Sold") = [0.0000000]
d("Sold", "Healed") = [0.0000000]
d("Sold", "Sealed") = [0.0000000]
d("Sold", "Healthy") = [0.0000000]
d("Sold", "Heard") = [0.0000000]
d("Sold", "Herded") = [0.0000000]
d("Sold", "Help") = [0.0000000]
d("Sold", "Sold") = [1.0000000]

...そして、パディングされたバージョンは次のとおりです。

% java PaddedStrikeAMatch France French
d("France", "France") = [1.0000000]
d("France", "French") = [0.4285714]
d("French", "France") = [0.4285714]
d("French", "French") = [1.0000000]

% java PaddedStrikeAMatch Healed Sealed Healthy Heard Herded Help Sold
d("Healed", "Healed") = [1.0000000]
d("Healed", "Sealed") = [0.7142857]
d("Healed", "Healthy") = [0.5333333]
d("Healed", "Heard") = [0.6153846]
d("Healed", "Herded") = [0.5714286]
d("Healed", "Help") = [0.3333333]
d("Healed", "Sold") = [0.1666667]
d("Sealed", "Healed") = [0.7142857]
d("Sealed", "Sealed") = [1.0000000]
d("Sealed", "Healthy") = [0.2666667]
d("Sealed", "Heard") = [0.3076923]
d("Sealed", "Herded") = [0.2857143]
d("Sealed", "Help") = [0.0000000]
d("Sealed", "Sold") = [0.3333333]
d("Healthy", "Healed") = [0.5333333]
d("Healthy", "Sealed") = [0.2666667]
d("Healthy", "Healthy") = [1.0000000]
d("Healthy", "Heard") = [0.4285714]
d("Healthy", "Herded") = [0.2666667]
d("Healthy", "Help") = [0.3076923]
d("Healthy", "Sold") = [0.0000000]
d("Heard", "Healed") = [0.6153846]
d("Heard", "Sealed") = [0.3076923]
d("Heard", "Healthy") = [0.4285714]
d("Heard", "Heard") = [1.0000000]
d("Heard", "Herded") = [0.6153846]
d("Heard", "Help") = [0.3636364]
d("Heard", "Sold") = [0.1818182]
d("Herded", "Healed") = [0.5714286]
d("Herded", "Sealed") = [0.2857143]
d("Herded", "Healthy") = [0.2666667]
d("Herded", "Heard") = [0.6153846]
d("Herded", "Herded") = [1.0000000]
d("Herded", "Help") = [0.3333333]
d("Herded", "Sold") = [0.1666667]
d("Help", "Healed") = [0.3333333]
d("Help", "Sealed") = [0.0000000]
d("Help", "Healthy") = [0.3076923]
d("Help", "Heard") = [0.3636364]
d("Help", "Herded") = [0.3333333]
d("Help", "Help") = [1.0000000]
d("Help", "Sold") = [0.0000000]
d("Sold", "Healed") = [0.1666667]
d("Sold", "Sealed") = [0.3333333]
d("Sold", "Healthy") = [0.0000000]
d("Sold", "Heard") = [0.1818182]
d("Sold", "Herded") = [0.1666667]
d("Sold", "Help") = [0.0000000]
d("Sold", "Sold") = [1.0000000]

JRuby から直接使用する場合は、 に追加するだけで済み、スクリプト内にを追加StrikeAMatch.classするだけで済みます。$CLASSPATHrequire 'java'java_import 'StrikeAMatch

require 'java'

java_import 'StrikeAMatch'

distance = StrikeAMatch.new
ARGV.each do |lhs|
  ARGV.each do |rhs|
    puts "d(\"#{lhs}\", \"#{rhs}\") = [#{distance.between(lhs, rhs)}]"
  end
end

それを呼び出すには:

% jruby strike_a_match.rb France French
d("France", "France") = [1.0]
d("France", "French") = [0.4]
d("French", "France") = [0.4]
d("French", "French") = [1.0]
于 2016-05-10T20:45:38.063 に答える