Hadoop MapReduce: unordered tuple as map key
Based on the wordcount example from Hadoop: The Definitive Guide, I developed a MapReduce job to count the occurrences of unordered tuples of strings. The input looks like this (only much bigger):
a b
c c
d d
b a
a d
d d
Running the MapReduce job, I expect the output for this example to be:
c c 1
d d 2
a b 2
a d 1
That is, I want the tuples a,b and b,a to be treated as the same. This question has already been asked here: Hadoop MapReduce: Two values as key in Mapper-Reducer, and has probably been solved here: https://developer.yahoo.com/hadoop/tutorial/module5.html#keytypes.
For large input files I get output like this, with the first column being the hash code of the respective key:
151757761 a a 62822
153322274 a b 62516
154886787 a c 62248
156451300 a d 62495
153322274 b a 62334
154902916 b b 62232
158064200 b d 62759
154886787 c a 62200
156483558 c b 124966
158080329 c c 62347
159677100 d c 125047
156451300 d a 62653
158064200 d b 62603
161290000 d d 62778
As can be seen, some keys are duplicated, such as 153322274 for a,b and b,a. For others, like c,b (and b,c) and c,d (and d,c), the counts are correct; they are roughly twice as large as the others because the test data was drawn at random. (The duplicated keys share a hash because hashCode() multiplies the two per-element values, and multiplication is commutative.)
I have been digging into this problem for a while now and cannot figure out why there are still duplicate keys after the reduce phase.
Below is the code I use.
First, the code of my custom WritableComparable:
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableUtils;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.math.BigInteger;

public class Pair implements WritableComparable<Pair> {

    private String first;
    private String second;

    public Pair(String first, String second) {
        this.first = first;
        this.second = second;
    }

    public Pair() {
        this("", "");
    }

    @Override
    public String toString() {
        return this.hashCode() + "\t" + first + "\t" + second;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        WritableUtils.writeString(out, first);
        WritableUtils.writeString(out, second);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        first = WritableUtils.readString(in);
        second = WritableUtils.readString(in);
    }

    @Override
    public int hashCode() {
        BigInteger bA = BigInteger.ZERO;
        BigInteger bB = BigInteger.ZERO;
        for (int i = 0; i < first.length(); i++) {
            bA = bA.add(BigInteger.valueOf(127L).pow(i + 1).multiply(BigInteger.valueOf(first.codePointAt(i))));
        }
        for (int i = 0; i < second.length(); i++) {
            bB = bB.add(BigInteger.valueOf(127L).pow(i + 1).multiply(BigInteger.valueOf(second.codePointAt(i))));
        }
        return bA.multiply(bB).intValue();
    }

    @Override
    public boolean equals(Object o) {
        if (o instanceof Pair) {
            Pair other = (Pair) o;
            return (first.compareTo(other.first) == 0 && second.compareTo(other.second) == 0)
                || (first.compareTo(other.second) == 0 && second.compareTo(other.first) == 0);
        }
        return false;
    }

    @Override
    public int compareTo(Pair other) {
        if ((first.compareTo(other.first) == 0 && second.compareTo(other.second) == 0)
            || (first.compareTo(other.second) == 0 && second.compareTo(other.first) == 0)) {
            return 0;
        } else {
            int cmp = first.compareTo(other.first);
            if (cmp != 0) {
                return cmp;
            }
            return second.compareTo(other.second);
        }
    }
}
And the rest:
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class PairCount {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: paircount <in-dir> <out-dir>");
            System.exit(2);
        }
        Job job = new Job(conf, "word count");
        job.setJarByClass(PairCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(IntSumReducer.class);
        job.setMapOutputKeyClass(Pair.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Pair.class);
        job.setOutputValueClass(IntWritable.class);
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static class TokenizerMapper extends Mapper<Object, Text, Pair, IntWritable> {
        private final static IntWritable one = new IntWritable(1);

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                context.write(new Pair(itr.nextToken(), itr.nextToken()), one);
            }
        }
    }

    public static class IntSumReducer extends Reducer<Pair, IntWritable, Pair, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Pair key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
}
Edit: I added unit tests for the hashCode() and compareTo() functions. They work fine.
import org.junit.Test;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertTrue;

public class Tests {

    @Test
    public void testPairComparison() {
        assertTrue( 0 == new Pair("a", "a").compareTo(new Pair("a", "a")) );
        assertTrue( 0 == new Pair("a", "b").compareTo(new Pair("b", "a")) );
        assertTrue( 0 == new Pair("a", "c").compareTo(new Pair("c", "a")) );
        assertTrue( 0 == new Pair("a", "d").compareTo(new Pair("d", "a")) );
        assertTrue( 0 == new Pair("b", "b").compareTo(new Pair("b", "b")) );
        assertTrue( 0 == new Pair("b", "c").compareTo(new Pair("c", "b")) );
        assertTrue( 0 == new Pair("b", "d").compareTo(new Pair("d", "b")) );
        assertTrue( 0 == new Pair("c", "c").compareTo(new Pair("c", "c")) );
        assertTrue( 0 == new Pair("c", "d").compareTo(new Pair("d", "c")) );
        assertTrue( 0 == new Pair("d", "d").compareTo(new Pair("d", "d")) );
        assertTrue( 0 > new Pair("a", "a").compareTo(new Pair("b", "b")) );
        assertTrue( 0 > new Pair("a", "a").compareTo(new Pair("c", "b")) );
        assertTrue( 0 < new Pair("d", "d").compareTo(new Pair("c", "b")) );
        assertTrue( 0 < new Pair("c", "d").compareTo(new Pair("c", "a")) );
    }

    @Test
    public void testPairHashcode() {
        assertTrue( 0 != new Pair("a", "a").hashCode());
        assertTrue( 0 != new Pair("a", "b").hashCode());
        assertTrue( 0 != new Pair("a", "c").hashCode());
        assertTrue( 0 != new Pair("a", "d").hashCode());
        assertTrue( 0 != new Pair("b", "b").hashCode());
        assertTrue( 0 != new Pair("b", "c").hashCode());
        assertTrue( 0 != new Pair("b", "d").hashCode());
        assertTrue( 0 != new Pair("c", "c").hashCode());
        assertTrue( 0 != new Pair("c", "d").hashCode());
        assertTrue( 0 != new Pair("d", "d").hashCode());
        assertEquals( new Pair("a", "a").hashCode(), new Pair("a", "a").hashCode() );
        assertEquals( new Pair("a", "b").hashCode(), new Pair("b", "a").hashCode() );
        assertEquals( new Pair("a", "c").hashCode(), new Pair("c", "a").hashCode() );
        assertEquals( new Pair("a", "d").hashCode(), new Pair("d", "a").hashCode() );
        assertEquals( new Pair("b", "b").hashCode(), new Pair("b", "b").hashCode() );
        assertEquals( new Pair("b", "c").hashCode(), new Pair("c", "b").hashCode() );
        assertEquals( new Pair("b", "d").hashCode(), new Pair("d", "b").hashCode() );
        assertEquals( new Pair("c", "c").hashCode(), new Pair("c", "c").hashCode() );
        assertEquals( new Pair("c", "d").hashCode(), new Pair("d", "c").hashCode() );
        assertEquals( new Pair("d", "d").hashCode(), new Pair("d", "d").hashCode() );
        assertNotEquals( new Pair("a", "a").hashCode(), new Pair("b", "b").hashCode() );
        assertNotEquals( new Pair("a", "b").hashCode(), new Pair("b", "d").hashCode() );
        assertNotEquals( new Pair("a", "c").hashCode(), new Pair("d", "a").hashCode() );
        assertNotEquals( new Pair("a", "d").hashCode(), new Pair("a", "a").hashCode() );
    }
}
But I realized that changing compareTo() to always return 0 causes every pair to be treated as the same, leading to the output:
156483558 c b 1000000
Changing hashCode() to always return 0 instead (for the same input data as above) leads to the same result as above, except that the keys are zero:
0 a a 62822
0 a b 62516
0 a c 62248
0 a d 62495
0 b a 62334
0 b b 62232
0 b d 62759
0 c a 62200
0 c b 124966
0 c c 62347
0 d c 125047
0 d a 62653
0 d b 62603
0 d d 62778
Edit:
I investigated further and made compareTo() print what was being compared. This showed that some keys, such as a,b and b,a, are never compared against each other and therefore are never grouped together.
How can grouping work at all if not every key is compared against every other key (other than by using hashCode() instead)?
I think I am missing something small. I would be glad about any ideas! Thank you very much in advance.
Best regards
I think I see the problem here. You have not implemented a partitioner.
When you say you run into the problem with large data sets, I assume you are using more than one reducer. If you are using a single reducer, your code works. But with multiple reducers, you need a partitioner that tells the framework that ab and ba are essentially the same key and should go to the same reducer.
Here is a link that explains it: LINK
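A minimal sketch of what this answer suggests, building on the Pair class above (the class name PairPartitioner is my own invention; note that the default HashPartitioner already partitions by key.hashCode(), which is symmetric here, so this sketch mainly makes that routing explicit):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical partitioner: routes a,b and b,a to the same reducer by
// relying on the symmetric Pair.hashCode().
public class PairPartitioner extends Partitioner<Pair, IntWritable> {
    @Override
    public int getPartition(Pair key, IntWritable value, int numPartitions) {
        // Mask the sign bit so the partition index is never negative.
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

It would be registered in the driver with job.setPartitionerClass(PairPartitioner.class).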
The problem is in the compareTo() function.
First check whether the two pairs are equal in the sense that a,b equals b,a. If that is not the case, compare the smaller elements of the two pairs first; if those match, compare the larger elements of the respective pairs. This solves the problem.
This is how I implemented it now:
@Override
public int compareTo(Pair other) {
    int cmpFirstFirst = first.compareTo(other.first);
    int cmpSecondSecond = second.compareTo(other.second);
    int cmpFirstSecond = first.compareTo(other.second);
    int cmpSecondFirst = second.compareTo(other.first);
    if (cmpFirstFirst == 0 && cmpSecondSecond == 0 || cmpFirstSecond == 0 && cmpSecondFirst == 0) {
        return 0;
    }
    String thisSmaller;
    String otherSmaller;
    String thisBigger;
    String otherBigger;
    if (this.first.compareTo(this.second) < 0) {
        thisSmaller = this.first;
        thisBigger = this.second;
    } else {
        thisSmaller = this.second;
        thisBigger = this.first;
    }
    if (other.first.compareTo(other.second) < 0) {
        otherSmaller = other.first;
        otherBigger = other.second;
    } else {
        otherSmaller = other.second;
        otherBigger = other.first;
    }
    int cmpThisSmallerOtherSmaller = thisSmaller.compareTo(otherSmaller);
    int cmpThisBiggerOtherBigger = thisBigger.compareTo(otherBigger);
    if (cmpThisSmallerOtherSmaller == 0) {
        return cmpThisBiggerOtherBigger;
    } else {
        return cmpThisSmallerOtherSmaller;
    }
}
This means that, contrary to my assumption, the grouping of the map output is done via a transitive relation rather than over the cross product of all keys: the framework sorts the keys and then groups adjacent ones that compare equal, so compareTo() must define a stable total order. Once you know and understand that, it makes perfect sense.
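To make the broken contract concrete, here is a small test sketch (my addition, not from the original post; it passes against the original compareTo(), and the last assertion fails once the fix above is in, as it should):

import static org.junit.Assert.assertTrue;

import org.junit.Test;

public class BrokenOrderTest {
    @Test
    public void originalCompareToViolatesTransitivity() {
        Pair ab = new Pair("a", "b");
        Pair ba = new Pair("b", "a");
        Pair ac = new Pair("a", "c");
        // ab and ba compare as equal...
        assertTrue(ab.compareTo(ba) == 0);
        // ...yet ac sorts after ab and before ba, so a sort may place ac
        // between them, and the grouping of adjacent equal keys never
        // sees that ab and ba belong together.
        assertTrue(ab.compareTo(ac) < 0);
        assertTrue(ba.compareTo(ac) > 0);
    }
}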
Given the initial requirement that {a,b} =:= {b,a}, wouldn't it be easier to simply sort the tuple elements in the constructor?
public Pair(String first, String second) {
    boolean swap = first.compareTo(second) > 0;
    this.first = swap ? second : first;
    this.second = swap ? first : second;
}
That would simplify methods such as compareTo and equals, and there would be no need to implement a Partitioner.
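With the elements canonicalized at construction time, compareTo and equals indeed collapse to plain lexicographic comparisons. A sketch of the simplification (my addition, assuming every Pair is created through the sorting constructor above):

// Assumes the constructor already stores the elements in sorted order,
// so a,b and b,a have identical field values.
@Override
public int compareTo(Pair other) {
    int cmp = first.compareTo(other.first);
    return cmp != 0 ? cmp : second.compareTo(other.second);
}

@Override
public boolean equals(Object o) {
    return o instanceof Pair && compareTo((Pair) o) == 0;
}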