MapReduce stopwords not being found
I'm new to MapReduce and am trying to write a program that counts the number of stop words in a file. I pass my stopword.txt file in from the command line, but every time I run the job the result is Stop Words = 0 and Good Words = 30 (it should be 5 and 25). I don't get any exceptions; the code compiles and runs fine. I'm stuck on what to try next.
My code is below. The Hadoop version is 2.0.
StopWord.java
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class StopWord {

    public enum COUNTERS {
        STOPWORDS, GOODWORDS
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        GenericOptionsParser parser = new GenericOptionsParser(conf, args);
        args = parser.getRemainingArgs();

        Job job = new Job(conf, "StopWord");
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setJarByClass(StopWord.class);
        job.setMapperClass(MyMapper.class);
        job.setNumReduceTasks(0);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        List<String> other_args = new ArrayList<String>();
        for (int i = 0; i < args.length; i++) {
            if ("-skip".equals(args[i])) {
                DistributedCache.addCacheFile(new Path(args[++i]).toUri(),
                        job.getConfiguration());
                if (i + 1 < args.length) {
                    i++;
                } else {
                    break;
                }
            }
            other_args.add(args[i]);
        }

        FileInputFormat.setInputPaths(job, new Path(other_args.get(0)));
        FileOutputFormat.setOutputPath(job, new Path(other_args.get(1)));
        job.waitForCompletion(true);

        Counters counters = job.getCounters();
        System.out.printf("Good Words: %d, Stop Words: %d\n",
                counters.findCounter(COUNTERS.GOODWORDS).getValue(),
                counters.findCounter(COUNTERS.STOPWORDS).getValue());
    }
}
MyMapper.java
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    private Text word = new Text();
    private Set<String> stopWordList = new HashSet<String>();
    private BufferedReader fis;

    protected void setup(Context context) throws java.io.IOException,
            InterruptedException {
        try {
            Path[] stopWordFiles = new Path[0];
            stopWordFiles = context.getLocalCacheFiles();
            System.out.println(stopWordFiles.toString());
            if (stopWordFiles != null && stopWordFiles.length > 0) {
                for (Path stopWordFile : stopWordFiles) {
                    readStopWordFile(stopWordFile);
                }
            }
        } catch (IOException e) {
            System.err.println("Exception reading stop word file: " + e);
        }
    }

    // reading the stop word file
    private void readStopWordFile(Path stopWordFile) {
        try {
            fis = new BufferedReader(new FileReader(stopWordFile.toString()));
            String stopWord = null;
            while ((stopWord = fis.readLine()) != null) {
                stopWordList.add(stopWord);
            }
        } catch (IOException e) {
            System.err.println("Exception while reading stop word file '"
                    + stopWordFile + "' : " + e.toString());
        }
    }

    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        StringTokenizer tokenizer = new StringTokenizer(line);
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            if (stopWordList.contains(token)) {
                context.getCounter(StopWord.COUNTERS.STOPWORDS)
                        .increment(1);
            } else {
                context.getCounter(StopWord.COUNTERS.GOODWORDS)
                        .increment(1);
                word.set(token);
                context.write(word, null);
            }
        }
    }
}
As far as I can tell, your stopWordFiles array is probably empty: you add the file to the distributed cache after the job has already been initialized, so the mapper never sees it. Register the cache file before the Job is created (a minimal sketch follows the link below).
See this post for more information: Accessing files in hadoop distributed cache
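For illustration, here is a minimal sketch of that reordering, assuming the imports and class declaration stay exactly as in StopWord.java above: the -skip option is parsed and the stop-word file is registered on the Configuration before the Job is constructed, so the setting is not lost when the Job takes its copy of the configuration. This is a sketch of one possible fix, not a drop-in guarantee for your cluster setup.

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    args = new GenericOptionsParser(conf, args).getRemainingArgs();

    // Handle -skip first and register the cache file on conf,
    // *before* the Job (and its copy of the configuration) is created.
    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; i++) {
        if ("-skip".equals(args[i])) {
            DistributedCache.addCacheFile(new Path(args[++i]).toUri(), conf);
        } else {
            other_args.add(args[i]);
        }
    }

    Job job = new Job(conf, "StopWord"); // conf already carries the cache file
    job.setJarByClass(StopWord.class);
    job.setMapperClass(MyMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(other_args.get(0)));
    FileOutputFormat.setOutputPath(job, new Path(other_args.get(1)));
    job.waitForCompletion(true);

    Counters counters = job.getCounters();
    System.out.printf("Good Words: %d, Stop Words: %d\n",
            counters.findCounter(COUNTERS.GOODWORDS).getValue(),
            counters.findCounter(COUNTERS.STOPWORDS).getValue());
}

If your Hadoop 2 build exposes the newer Job-level API, calling job.addCacheFile(new Path(args[++i]).toUri()) on the Job before submission is an equivalent alternative to the deprecated DistributedCache helper.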