Java Hadoop 奇怪的连接行为
Java Hadoop wierd join behaviour
瞄准
我有两个 csv 文件试图在它们之间进行连接。一个包含 movieId、title,另一个包含 userId、movieId、comment-tag。我想通过打印标题 comment_count 来找出每部电影有多少 comments-tags。所以我的代码:
Driver
public class Driver
{
public Driver(String[] args)
{
if (args.length < 3) {
System.err.println("input path ");
}
try {
Job job = Job.getInstance();
job.setJobName("movie tag count");
// set file input/output path
MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, TagMapper.class);
MultipleInputs.addInputPath(job, new Path(args[2]), TextInputFormat.class, MovieMapper.class);
FileOutputFormat.setOutputPath(job, new Path(args[3]));
// set jar class name
job.setJarByClass(Driver.class);
// set mapper and reducer to job
job.setReducerClass(Reducer.class);
// set output key class
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
int returnValue = job.waitForCompletion(true) ? 0 : 1;
System.out.println(job.isSuccessful());
System.exit(returnValue);
} catch (IOException | ClassNotFoundException | InterruptedException e) {
e.printStackTrace();
}
}
}
MovieMapper
public class MovieMapper extends org.apache.hadoop.mapreduce.Mapper<Object, Text, Text, Text>
{
@Override
protected void map(Object key, Text value, Context context) throws IOException, InterruptedException
{
String line = value.toString();
String[] items = line.split("(?!\B\"[^\"]*),(?![^\"]*\"\B)"); //comma not in quotes
String movieId = items[0].trim();
if(tryParseInt(movieId))
{
context.write(new Text(movieId), new Text(items[1].trim()));
}
}
private boolean tryParseInt(String s)
{
try {
Integer.parseInt(s);
return true;
} catch (NumberFormatException e) {
return false;
}
}
}
标签映射器
public class TagMapper extends org.apache.hadoop.mapreduce.Mapper<Object, Text, Text, Text>
{
@Override
protected void map(Object key, Text value, Context context) throws IOException, InterruptedException
{
String line = value.toString();
String[] items = line.split("(?!\B\"[^\"]*),(?![^\"]*\"\B)");
String movieId = items[1].trim();
if(tryParseInt(movieId))
{
context.write(new Text(movieId), new Text("_"));
}
}
private boolean tryParseInt(String s)
{
try {
Integer.parseInt(s);
return true;
} catch (NumberFormatException e) {
return false;
}
}
}
减速器
public class Reducer extends org.apache.hadoop.mapreduce.Reducer<Text, Text, Text, IntWritable>
{
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException
{
int noOfFrequency = 0;
Text movieTitle = new Text();
for (Text o : values)
{
if(o.toString().trim().equals("_"))
{
noOfFrequency++;
}
else
{
System.out.println(o.toString());
movieTitle = o;
}
}
context.write(movieTitle, new IntWritable(noOfFrequency));
}
}
问题
我得到的结果是这样的:
标题,计数
_, 计数
标题,计数
标题,计数
_, 计数
标题,计数
_, 计数
这个 _ 是如何成为关键的?我无法理解。有一个 if 语句检查是否有 _ 计数并且不要将其作为标题。是不是toString()方法有问题导致equals操作失败?有什么想法吗?
这并不奇怪,因为您遍历了 values
,而 o
是指向 values
元素的指针,这里是 Text
。在某个时间点,您使 movieTitle
指向 o
指向 movieTitle = o
的位置。在接下来的迭代中,o
指向 "_"
并且 movieTitle
指向 "_"
.
如果您像这样更改代码,一切正常:
int noOfFrequency = 0;
Text movieTitle = null;
for (Text o : values)
{
if(o.toString().trim().equals("_"))
{
noOfFrequency++;
}
else
{
movieTitle = new Text(o.toString());
}
}
context.write(movieTitle, new IntWritable(noOfFrequency));
瞄准
我有两个 csv 文件试图在它们之间进行连接。一个包含 movieId、title,另一个包含 userId、movieId、comment-tag。我想通过打印标题 comment_count 来找出每部电影有多少 comments-tags。所以我的代码:
Driver
public class Driver
{
public Driver(String[] args)
{
if (args.length < 3) {
System.err.println("input path ");
}
try {
Job job = Job.getInstance();
job.setJobName("movie tag count");
// set file input/output path
MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, TagMapper.class);
MultipleInputs.addInputPath(job, new Path(args[2]), TextInputFormat.class, MovieMapper.class);
FileOutputFormat.setOutputPath(job, new Path(args[3]));
// set jar class name
job.setJarByClass(Driver.class);
// set mapper and reducer to job
job.setReducerClass(Reducer.class);
// set output key class
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
int returnValue = job.waitForCompletion(true) ? 0 : 1;
System.out.println(job.isSuccessful());
System.exit(returnValue);
} catch (IOException | ClassNotFoundException | InterruptedException e) {
e.printStackTrace();
}
}
}
MovieMapper
public class MovieMapper extends org.apache.hadoop.mapreduce.Mapper<Object, Text, Text, Text>
{
@Override
protected void map(Object key, Text value, Context context) throws IOException, InterruptedException
{
String line = value.toString();
String[] items = line.split("(?!\B\"[^\"]*),(?![^\"]*\"\B)"); //comma not in quotes
String movieId = items[0].trim();
if(tryParseInt(movieId))
{
context.write(new Text(movieId), new Text(items[1].trim()));
}
}
private boolean tryParseInt(String s)
{
try {
Integer.parseInt(s);
return true;
} catch (NumberFormatException e) {
return false;
}
}
}
标签映射器
public class TagMapper extends org.apache.hadoop.mapreduce.Mapper<Object, Text, Text, Text>
{
@Override
protected void map(Object key, Text value, Context context) throws IOException, InterruptedException
{
String line = value.toString();
String[] items = line.split("(?!\B\"[^\"]*),(?![^\"]*\"\B)");
String movieId = items[1].trim();
if(tryParseInt(movieId))
{
context.write(new Text(movieId), new Text("_"));
}
}
private boolean tryParseInt(String s)
{
try {
Integer.parseInt(s);
return true;
} catch (NumberFormatException e) {
return false;
}
}
}
减速器
public class Reducer extends org.apache.hadoop.mapreduce.Reducer<Text, Text, Text, IntWritable>
{
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException
{
int noOfFrequency = 0;
Text movieTitle = new Text();
for (Text o : values)
{
if(o.toString().trim().equals("_"))
{
noOfFrequency++;
}
else
{
System.out.println(o.toString());
movieTitle = o;
}
}
context.write(movieTitle, new IntWritable(noOfFrequency));
}
}
问题
我得到的结果是这样的:
标题,计数
_, 计数
标题,计数
标题,计数
_, 计数
标题,计数
_, 计数
这个 _ 是如何成为关键的?我无法理解。有一个 if 语句检查是否有 _ 计数并且不要将其作为标题。是不是toString()方法有问题导致equals操作失败?有什么想法吗?
这并不奇怪,因为您遍历了 values
,而 o
是指向 values
元素的指针,这里是 Text
。在某个时间点,您使 movieTitle
指向 o
指向 movieTitle = o
的位置。在接下来的迭代中,o
指向 "_"
并且 movieTitle
指向 "_"
.
如果您像这样更改代码,一切正常:
int noOfFrequency = 0;
Text movieTitle = null;
for (Text o : values)
{
if(o.toString().trim().equals("_"))
{
noOfFrequency++;
}
else
{
movieTitle = new Text(o.toString());
}
}
context.write(movieTitle, new IntWritable(noOfFrequency));