How to solve expected org.apache.hadoop.io.Text, received org.apache.hadoop.io.LongWritable in mapreduce job

I am trying to write a job that analyzes some information from a YouTube data set. I believe I have set the map output keys correctly in the driver class, but I am still getting the error above. I am posting the code and the exception here.

Mapper class

public class YouTubeDataMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private static final IntWritable one = new IntWritable(1);
    private Text category = new Text();

    public void mapper(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String str[] = value.toString().split("\t");
        category.set(str[3]);
        context.write(category, one);
    }
}

Reducer class

public class YouTubeDataReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable count : values) {
            sum += count.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

Driver class

public class YouTubeDataDriver {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        @SuppressWarnings("deprecation")
        Job job = new Job(conf, "categories");
        job.setJarByClass(YouTubeDataDriver.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // job.setNumReduceTasks(0);
        job.setOutputKeyClass(Text.class); // Here I have set the output keys
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(YouTubeDataMapper.class);
        job.setReducerClass(YouTubeDataReducer.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        Path out = new Path(args[1]);
        out.getFileSystem(conf).delete(out);
        job.waitForCompletion(true);
    }
}

The exception I get:

java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, received org.apache.hadoop.io.LongWritable
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1069)
    at org.apache.hadoop.mapred.MapTask$NewOutputCollector.write(MapTask.java:712)
    at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:89)
    at org.apache.hadoop.mapreduce.lib.map.WrappedMapper$Context.write(WrappedMapper.java:112)
    at org.apache.hadoop.mapreduce.Mapper.map(Mapper.java:124)
    at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:145)
    at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:784)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
    at org.apache.hadoop.mapred.YarnChild.run(YarnChild.java:168)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:422)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1642)
    at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:163)

I have already set the output keys in the driver class:

    job.setOutputKeyClass(Text.class); // Here I have set the output keys
    job.setOutputValueClass(IntWritable.class);

So why am I still getting the error? Please help, I am new to MapReduce.

The code below (with LongWritable updated to Object) worked for me -

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class YouTubeDataDriver {

    public static class YouTubeDataMapper
            extends Mapper<Object, Text, Text, IntWritable>{

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context
        ) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class YouTubeDataReducer
            extends Reducer<Text,IntWritable,Text,IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values,
                           Context context
        ) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        @SuppressWarnings("deprecation")
        Job job = new Job(conf, "categories");
        job.setJarByClass(YouTubeDataDriver.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // job.setNumReduceTasks(0);
        job.setOutputKeyClass(Text.class); // Here I have set the output keys
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(YouTubeDataMapper.class);
        job.setReducerClass(YouTubeDataReducer.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        Path out = new Path(args[1]);
        out.getFileSystem(conf).delete(out);
        job.waitForCompletion(true);

    }

}
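
For reference, assuming the class above is packaged into a jar (the jar name and HDFS paths below are illustrative, not from the original post), the job can be launched with the standard hadoop jar command:

hadoop jar youtube-analysis.jar YouTubeDataDriver /user/hadoop/youtube/input /user/hadoop/youtube/output

Note that because the driver deletes the output path before submitting the job, re-running with the same output directory will not fail with an "output directory already exists" error.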

The mapper() method has been renamed to map() (see the official docs).

What was happening is that your mapper never actually processed any data. Execution never reached your mapper() method (because the framework looks for a method named map()), so the map phase fell back to the default identity mapper, which writes each input key/value pair through unchanged - and that is why the map output key was still a LongWritable.
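
Put differently, the minimal fix to the original mapper is just the method name. Adding @Override is a cheap safeguard here (a sketch based on the question's own mapper, with nothing else changed): the compiler rejects the annotation if the method does not actually override anything, so a typo like mapper() becomes a compile error instead of a silent fallback to the identity mapper.

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class YouTubeDataMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private static final IntWritable one = new IntWritable(1);
    private Text category = new Text();

    @Override // would not compile if this didn't actually override Mapper.map()
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String str[] = value.toString().split("\t");
        category.set(str[3]);
        context.write(category, one);
    }
}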

As an aside,

String str[] = value.toString().split("\t");
category.set(str[3]);

is quite dangerous. Assuming that every line of your input contains at least 3 \t characters is risky. When you process large volumes of data, there will almost always be some records that are not in the format you expect, and you don't want your whole job to die when that happens. Consider doing something like this:

String valueStr = value.toString();
if (valueStr != null) {
    String str[] = valueStr.split("\t");
    if (str != null && str.length > 3) {
        category.set(str[3]);
        context.write(category, one);
    }
}
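
If you also want visibility into how many bad records are being skipped, one option is to bump a custom counter in an else branch; counters appear in the job's summary when it completes. This is just a sketch that assumes the same `category` and `one` fields as the mapper above - the group and counter names ("YouTubeData", "MALFORMED_LINES") are arbitrary labels made up for illustration, not anything defined by Hadoop:

@Override
public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
    String str[] = value.toString().split("\t");
    if (str != null && str.length > 3) {
        category.set(str[3]);
        context.write(category, one);
    } else {
        // Tally skipped lines; this shows up in the job's final counter summary
        context.getCounter("YouTubeData", "MALFORMED_LINES").increment(1);
    }
}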