在 JRuby 中使用带有 c 扩展的 gem,或者将它们移植到 Java?

Use gems with c-extensions in JRuby, or port them to Java?

我正在对一个输入字符串和一些之前输入的字符串进行模糊匹配测试。测试是在打字时实时进行的。

我已经有了一个非常准确的算法,称为 StrikeAMatch, which has been translated into many languages. The fastest Ruby implementation I've found is amatch. However, it is incompatible with my JRuby environment because it crunches data in a C extension,它需要 Ruby (MRI) 的 C 解释器。不过还是挺快的:

a = "Lorem ipsum dolor"
b = "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nam cursus. Morbi ut mi. Nullam enim leo, egestas id, condimentum at, laoreet mattis, massa. Sed eleifend nonummy diam. Praesent mauris ante,"
puts Benchmark.measure {
  10000.times { a.pair_distance_similar(b) }
}
# => 0.130000   0.000000   0.130000 (  0.146347)

我希望我可以避免设置替代环境。另一种方法可能是尝试移植原始 Java 代码 as suggested in the JRuby Wiki。不过不知道该怎么做。

关于如何解决这个问题有什么想法吗?

有 2 种方法可以解决此问题:

  1. 从现有的 Ruby gem 中提取 C ext 代码并将其移植到 Java

...有许多 gem 的示例,它们具有 C-Ruby/JRuby 的特定扩展名,例如 json/nokogiri

  1. 使用 Java 端口,加载 Java 类 并使用 JRuby 的 Java 集成

该算法易于实现。例如,这是我在 Java:

中编写的快速实现
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class StrikeAMatch {

    protected int numPairs(final String string) {
        return Math.max(0, string.length() - 1);
    }

    protected Map<String, Integer> pairs(final String string) {
        if (string.isEmpty()) {
            return new HashMap<>(0);
        }

        final Map<String, Integer> pairs = new HashMap(2 * numPairs(string));

        for (int i = 1, j = string.length(); i < j; i += 1) {
            final char a = string.charAt(i - 1);
            final char b = string.charAt(i);
            final String pair = String.format("%c%c", a, b);
            if (pairs.containsKey(pair)) {
                pairs.put(pair, 1 + pairs.get(pair));
            }
            else {
                pairs.put(pair, 1);
            }
        }

        return pairs;
    }

    protected int intersectionSize(
            final Map<String, Integer> lhsPairs,
            final Map<String, Integer> rhsPairs) {
        final Set<String> lhsKeys = lhsPairs.keySet();
        final Set<String> rhsKeys = rhsPairs.keySet();
        final Set<String> pairIntersection = new HashSet<>(lhsKeys);
        pairIntersection.retainAll(rhsKeys);
        int sharedPairs = 0;
        for (final String pair : pairIntersection) {
            sharedPairs += Math.min(lhsPairs.get(pair), rhsPairs.get(pair));
        }
        return sharedPairs;
    }

    public double between(final String lhs, final String rhs) {
        if (lhs.isEmpty() && rhs.isEmpty()) {
            return 1.0;
        }
        final Map<String, Integer> lhsPairs = pairs(lhs.toUpperCase());
        final Map<String, Integer> rhsPairs = pairs(rhs.toUpperCase());
        return (2.0 * intersectionSize(lhsPairs, rhsPairs))
             / (numPairs(lhs) + numPairs(rhs));
    }

    public static void main(final String[] args) {
        final StrikeAMatch distance = new StrikeAMatch();
        for (final String lhs : args) {
            for (final String rhs : args) {
                System.out.printf("d(\"%s\", \"%s\") = [%.7f]%n",
                    lhs, rhs, distance.between(lhs, rhs));
            }
        }
    }
}

您甚至可以填充第一个和最后一个字符,以将其扩展为一个字符或零个字符的术语:

import java.util.HashMap;
import java.util.Map;

public class PaddedStrikeAMatch extends StrikeAMatch {

    protected int numPairs(final String string) {
        return string.length() + 1;
    }

    protected Map<String, Integer> pairs(final String string) {
        if (string.isEmpty()) {
            final Map<String, Integer> pairs = new HashMap<>(1);
            pairs.put("  ", 1);
            return pairs;
        }

        final Map<String, Integer> pairs = new HashMap(2 * numPairs(string));
        pairs.put(String.format(" %c", string.charAt(0)), 1);
        pairs.put(String.format("%c ", string.charAt(string.length() - 1)), 1);

        for (int i = 1, j = string.length(); i < j; i += 1) {
            final char a = string.charAt(i - 1);
            final char b = string.charAt(i);
            final String pair = String.format("%c%c", a, b);
            if (pairs.containsKey(pair)) {
                pairs.put(pair, 1 + pairs.get(pair));
            }
            else {
                pairs.put(pair, 1);
            }
        }

        return pairs;
    }

    public static void main(final String[] args) {
        final StrikeAMatch distance = new PaddedStrikeAMatch();
        for (final String lhs : args) {
            for (final String rhs : args) {
                System.out.printf("d(\"%s\", \"%s\") = [%.7f]%n",
                    lhs, rhs, distance.between(lhs, rhs));
            }
        }
    }
}

为了验证它,以下是您提供的 link 中提出的示例:

% java StrikeAMatch France French
d("France", "France") = [1.0000000]
d("France", "French") = [0.4000000]
d("French", "France") = [0.4000000]
d("French", "French") = [1.0000000]

% java StrikeAMatch Healed Sealed Healthy Heard Herded Help Sold
d("Healed", "Healed") = [1.0000000]
d("Healed", "Sealed") = [0.8000000]
d("Healed", "Healthy") = [0.5454545]
d("Healed", "Heard") = [0.4444444]
d("Healed", "Herded") = [0.4000000]
d("Healed", "Help") = [0.2500000]
d("Healed", "Sold") = [0.0000000]
d("Sealed", "Healed") = [0.8000000]
d("Sealed", "Sealed") = [1.0000000]
d("Sealed", "Healthy") = [0.3636364]
d("Sealed", "Heard") = [0.2222222]
d("Sealed", "Herded") = [0.2000000]
d("Sealed", "Help") = [0.0000000]
d("Sealed", "Sold") = [0.0000000]
d("Healthy", "Healed") = [0.5454545]
d("Healthy", "Sealed") = [0.3636364]
d("Healthy", "Healthy") = [1.0000000]
d("Healthy", "Heard") = [0.4000000]
d("Healthy", "Herded") = [0.1818182]
d("Healthy", "Help") = [0.2222222]
d("Healthy", "Sold") = [0.0000000]
d("Heard", "Healed") = [0.4444444]
d("Heard", "Sealed") = [0.2222222]
d("Heard", "Healthy") = [0.4000000]
d("Heard", "Heard") = [1.0000000]
d("Heard", "Herded") = [0.4444444]
d("Heard", "Help") = [0.2857143]
d("Heard", "Sold") = [0.0000000]
d("Herded", "Healed") = [0.4000000]
d("Herded", "Sealed") = [0.2000000]
d("Herded", "Healthy") = [0.1818182]
d("Herded", "Heard") = [0.4444444]
d("Herded", "Herded") = [1.0000000]
d("Herded", "Help") = [0.2500000]
d("Herded", "Sold") = [0.0000000]
d("Help", "Healed") = [0.2500000]
d("Help", "Sealed") = [0.0000000]
d("Help", "Healthy") = [0.2222222]
d("Help", "Heard") = [0.2857143]
d("Help", "Herded") = [0.2500000]
d("Help", "Help") = [1.0000000]
d("Help", "Sold") = [0.0000000]
d("Sold", "Healed") = [0.0000000]
d("Sold", "Sealed") = [0.0000000]
d("Sold", "Healthy") = [0.0000000]
d("Sold", "Heard") = [0.0000000]
d("Sold", "Herded") = [0.0000000]
d("Sold", "Help") = [0.0000000]
d("Sold", "Sold") = [1.0000000]

...这是填充版本:

% java PaddedStrikeAMatch France French
d("France", "France") = [1.0000000]
d("France", "French") = [0.4285714]
d("French", "France") = [0.4285714]
d("French", "French") = [1.0000000]

% java PaddedStrikeAMatch Healed Sealed Healthy Heard Herded Help Sold
d("Healed", "Healed") = [1.0000000]
d("Healed", "Sealed") = [0.7142857]
d("Healed", "Healthy") = [0.5333333]
d("Healed", "Heard") = [0.6153846]
d("Healed", "Herded") = [0.5714286]
d("Healed", "Help") = [0.3333333]
d("Healed", "Sold") = [0.1666667]
d("Sealed", "Healed") = [0.7142857]
d("Sealed", "Sealed") = [1.0000000]
d("Sealed", "Healthy") = [0.2666667]
d("Sealed", "Heard") = [0.3076923]
d("Sealed", "Herded") = [0.2857143]
d("Sealed", "Help") = [0.0000000]
d("Sealed", "Sold") = [0.3333333]
d("Healthy", "Healed") = [0.5333333]
d("Healthy", "Sealed") = [0.2666667]
d("Healthy", "Healthy") = [1.0000000]
d("Healthy", "Heard") = [0.4285714]
d("Healthy", "Herded") = [0.2666667]
d("Healthy", "Help") = [0.3076923]
d("Healthy", "Sold") = [0.0000000]
d("Heard", "Healed") = [0.6153846]
d("Heard", "Sealed") = [0.3076923]
d("Heard", "Healthy") = [0.4285714]
d("Heard", "Heard") = [1.0000000]
d("Heard", "Herded") = [0.6153846]
d("Heard", "Help") = [0.3636364]
d("Heard", "Sold") = [0.1818182]
d("Herded", "Healed") = [0.5714286]
d("Herded", "Sealed") = [0.2857143]
d("Herded", "Healthy") = [0.2666667]
d("Herded", "Heard") = [0.6153846]
d("Herded", "Herded") = [1.0000000]
d("Herded", "Help") = [0.3333333]
d("Herded", "Sold") = [0.1666667]
d("Help", "Healed") = [0.3333333]
d("Help", "Sealed") = [0.0000000]
d("Help", "Healthy") = [0.3076923]
d("Help", "Heard") = [0.3636364]
d("Help", "Herded") = [0.3333333]
d("Help", "Help") = [1.0000000]
d("Help", "Sold") = [0.0000000]
d("Sold", "Healed") = [0.1666667]
d("Sold", "Sealed") = [0.3333333]
d("Sold", "Healthy") = [0.0000000]
d("Sold", "Heard") = [0.1818182]
d("Sold", "Herded") = [0.1666667]
d("Sold", "Help") = [0.0000000]
d("Sold", "Sold") = [1.0000000]

如果您想直接从 JRuby 使用它,您只需将 StrikeAMatch.class 添加到您的 $CLASSPATH,并在您的脚本中添加 require 'java',然后添加 java_import 'StrikeAMatch

require 'java'

java_import 'StrikeAMatch'

distance = StrikeAMatch.new
ARGV.each do |lhs|
  ARGV.each do |rhs|
    puts "d(\"#{lhs}\", \"#{rhs}\") = [#{distance.between(lhs, rhs)}]"
  end
end

要调用它:

% jruby strike_a_match.rb France French
d("France", "France") = [1.0]
d("France", "French") = [0.4]
d("French", "France") = [0.4]
d("French", "French") = [1.0]