头歌Hadoop WordCount词频统计
时间: 2024-04-05 10:10:16 浏览: 200
hadoop word count 单词统计
好的,以下是一个简单的 Hadoop WordCount 词频统计程序示例:
Mapper 类:
```java
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Mapper of the word-count job: emits a {@code (word, 1)} pair for every
 * whitespace-delimited token found in an input line.
 *
 * <p>Tokens are separated by any run of spaces, tabs, newlines, carriage
 * returns or form feeds, so blank lines and repeated/leading/trailing
 * whitespace never produce an empty "word".
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    /** Count emitted with every token; a single instance is reused for all writes. */
    private static final LongWritable ONE = new LongWritable(1);

    /** Output key holder, refilled for each token instead of allocating a new Text. */
    private final Text word = new Text();

    /**
     * Splits one input record into words and writes {@code (word, 1)} for each.
     *
     * @param key     input key; not used by this mapper
     * @param value   the text of the input record to tokenize
     * @param context job context that receives the emitted pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // StringTokenizer's default delimiters are " \t\n\r\f" and it never
        // returns empty tokens, unlike String.split(" ") which yields "" for
        // blank lines and for consecutive or leading spaces.
        StringTokenizer tokens = new StringTokenizer(value.toString());
        while (tokens.hasMoreTokens()) {
            word.set(tokens.nextToken());
            context.write(word, ONE);
        }
    }
}
```
Reducer 类:
```java
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Reducer of the word-count job: adds up all counts received for a word and
 * emits a single {@code (word, total)} pair.
 */
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    /**
     * Writes the given word together with the sum of its counts.
     *
     * @param key     the word being aggregated
     * @param values  the counts collected for that word
     * @param context job context that receives the aggregated pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    public void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        final long total = sumOf(values);
        final LongWritable result = new LongWritable(total);
        context.write(key, result);
    }

    /** Returns the arithmetic sum of every count in {@code counts} (0 when there are none). */
    private static long sumOf(Iterable<LongWritable> counts) {
        long total = 0L;
        for (LongWritable count : counts) {
            total = total + count.get();
        }
        return total;
    }
}
```
驱动类(包含 main 方法):
```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver of the word-count job: configures the job with
 * {@link WordCountMapper} and {@link WordCountReducer}, then submits it and
 * waits for completion.
 */
public class WordCount {

    /**
     * Configures and runs the word-count job.
     *
     * <p>The process exits with status 0 when the job succeeds, 1 when it
     * fails, and 2 when the required arguments are missing.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the
     *             output path
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        // Fail with a readable message instead of an
        // ArrayIndexOutOfBoundsException when a path argument is missing.
        if (args.length < 2) {
            System.err.println("Usage: WordCount <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);

        job.setMapperClass(WordCountMapper.class);
        // Summing counts is associative and commutative, so the reducer can
        // also pre-aggregate map output as the combiner.
        job.setCombinerClass(WordCountReducer.class);
        job.setReducerClass(WordCountReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
```
在执行该程序时,需要在命令行中输入以下命令:
```bash
hadoop jar WordCount.jar WordCount /input /output
```
其中,WordCount.jar 是打包好的程序文件,WordCount 是包含 main 方法的驱动类名(如果该类位于某个包中,需要写全限定类名),后面的 /input 和 /output 分别是输入路径和输出路径。注意:/output 目录在作业运行前不能已经存在,否则作业会因输出目录已存在而失败。
阅读全文