mapreduce (四) MapReduce實現Grep+sort

1.txt
dong xi cheng
xi dong cheng
wo ai beijing
tian an men
qiche
dong
dong
dong
2.txt
dong xi cheng
xi dong cheng
wo ai beijing
tian an men
qiche
dong
dong
dong

import java.io.IOException;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.map.InverseMapper;
import org.apache.hadoop.mapreduce.lib.map.RegexMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer;

public class IGrep {

    public static void main(String[] args) throws IOException,
            ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();

        String dir_in = "hdfs://localhost:9000/input_grep";
        String dir_out = "hdfs://localhost:9000/output_grep";
        String reg = ".ng";//匹配三個字符的字符串,且以ng結尾。

        conf.set(RegexMapper.PATTERN, reg);
        conf.setInt(RegexMapper.GROUP, 0);

        Path in = new Path(dir_in);
        Path tmp = new Path("grep-temp-"
                + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
        Path out = new Path(dir_out);

        try {
            Job grepJob = new Job(conf, "grep-search");

            grepJob.setJarByClass(IGrep.class);

            grepJob.setInputFormatClass(TextInputFormat.class);
            grepJob.setMapperClass(RegexMapper.class);
            grepJob.setCombinerClass(LongSumReducer.class);
            grepJob.setPartitionerClass(HashPartitioner.class);

            grepJob.setMapOutputKeyClass(Text.class);
            grepJob.setMapOutputValueClass(LongWritable.class);
            FileInputFormat.addInputPath(grepJob, in);

            grepJob.setReducerClass(LongSumReducer.class);
            // job.setNumReduceTasks(1);
            grepJob.setOutputFormatClass(SequenceFileOutputFormat.class);

            grepJob.setOutputKeyClass(Text.class);
            grepJob.setOutputValueClass(LongWritable.class);
            FileOutputFormat.setOutputPath(grepJob, tmp);

            grepJob.waitForCompletion(true);

            Job sortJob = new Job(conf, "grep-sort");

            sortJob.setJarByClass(IGrep.class);

            sortJob.setInputFormatClass(SequenceFileInputFormat.class);
            sortJob.setMapperClass(InverseMapper.class);
            FileInputFormat.addInputPath(sortJob, tmp);

            sortJob.setNumReduceTasks(1);【全局排序】
            sortJob.setSortComparatorClass(LongWritable.DecreasingComparator.class);//逆序

            FileOutputFormat.setOutputPath(sortJob, out);

            sortJob.waitForCompletion(true);
            
        } finally {
            FileSystem.get(conf).delete(tmp, true);
        }
    }
}

輸出結果:
10    ong
4    eng
2    ing
相關文章
相關標籤/搜索