写一个Hadoop的Java程序可以将txt文本的第一列和第四列提取并排序
时间: 2024-02-21 08:56:39 浏览: 70
以下是一个简单的 Hadoop MapReduce Java 程序:它读取一个以制表符(Tab)分隔的 txt 文件,提取每行的第一列和第四列(第四列需为整数),对第一列相同的行将其第四列的值求和,最后按第一列排序输出。该程序需要在 Hadoop 环境下运行,运行时需传入两个参数:输入路径和输出路径。
```java
import java.io.IOException;
import java.util.StringTokenizer;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class SortColumns {
public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
String[] columns = itr.nextToken().split("\t");
if (columns.length >= 4) {
word.set(columns[0]);
context.write(word, new IntWritable(Integer.parseInt(columns[3])));
}
}
}
}
public static class IntSumReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
private TreeMap<String, Integer> sortedMap = new TreeMap<String, Integer>();
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
sortedMap.put(key.toString(), sum);
}
protected void cleanup(Context context) throws IOException, InterruptedException {
for (String key : sortedMap.keySet()) {
context.write(new Text(key), new IntWritable(sortedMap.get(key)));
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "sort columns");
job.setJarByClass(SortColumns.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
```
在该程序中,首先定义了一个Mapper类来提取第一列和第四列,并将第一列作为键,第四列作为值输出。接着定义了一个Reducer类来对相同键的值进行求和,并使用TreeMap对键进行排序。最后在main方法中配置作业,包括输入和输出路径以及Mapper和Reducer类,然后提交作业并等待完成。
阅读全文