Java編程MapReduce實現WordCount
1.編寫Mapper
package net.toocruel.yarn.mapreduce.wordcount; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import java.io.IOException; import java.util.StringTokenizer; /** * @author : 宋同煜 * @version : 1.0 * @createTime : 2017/4/12 14:15 * @description : */ public class WordCountMapper extends Mapper<Object,Text,Text,IntWritable>{ //對於每一個單詞賦予出現頻數1,由於單詞是一個一個取出來的,因此每一個數量都爲1 private final static IntWritable one = new IntWritable(1); //存儲取出來的一行單詞 private Text word = new Text(); @Override protected void map(Object key, Text value, Context context) throws IOException, InterruptedException { //StringTokenizer 對輸入單詞進行切分 StringTokenizer itr = new StringTokenizer(value.toString()); while(itr.hasMoreTokens()) { word.set(itr.nextToken()); context.write(word, one); } } } 123456789101112131415161718192021222324252627282930313233
2.編寫Reducer
package net.toocruel.yarn.mapreduce.wordcount; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Reducer; import java.io.IOException; /** * @author : 宋同煜 * @version : 1.0 * @createTime : 2017/4/12 14:16 * @description : */ public class WordCountReducer extends Reducer<Text,IntWritable,Text,IntWritable>{ //存取對應單詞總頻數 private IntWritable result = new IntWritable(); @Override protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { //計算頻數 int sum = 0; for(IntWritable value:values){ sum+=value.get(); } result.set(sum); //寫入輸出 context.write(key, result); } } 12345678910111213141516171819202122232425262728293031
3.編寫Job提交器
package net.toocruel.yarn.mapreduce.wordcount; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; /** * wordcount 提交器 打包在hadoop集羣任意機器執行 hadoop jar XXX.jar net.toocruel.yarn.mapreduce.wordcount WordCount * @author : 宋同煜 * @version : 1.0 * @createTime : 2017/4/12 14:15 * @description : */ public class WordCount { public static void main(String[] args)throws Exception { //初始化配置 Configuration conf = new Configuration(); System.setProperty("HADOOP_USER_NAME","hdfs"); //建立一個job提交器對象 Job job = Job.getInstance(conf); job.setJobName("WordCount"); job.setJarByClass(WordCount.class); //設置map,reduce處理 job.setMapperClass(WordCountMapper.class); job.setReducerClass(WordCountReducer.class); //設置輸出格式處理類 job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); //設置輸入輸出路徑 FileSystem.get(new Configuration()).delete(new Path("/sty/wordcount/output")); //先清空輸出目錄 FileInputFormat.addInputPath(job, new Path("hdfs://cloudera:8020/sty/wordcount/input")); FileOutputFormat.setOutputPath(job, new Path("hdfs://cloudera:8020/sty/wordcount/output")); boolean res = job.waitForCompletion(true); System.out.println("任務名稱: "+job.getJobName()); System.out.println("任務成功: "+(res?"Yes":"No")); System.exit(res?0:1); } } 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
4.打包
我用的maven打包,也能夠Eclipse的直接導出jar包或Idea的build artifacts
hadoopSimple-1.0.jar
5.運行
在Yarn的ResourceManager 或NodeManager節點機器上運行
hadoop jar hadoopSimple-1.0.jar net.toocruel.yarn.mapreduce.wordcount.WordCount
6.運行結果
[root@cloudera ~]# hadoop jar hadoopSimple-1.0.jar net.toocruel.yarn.mapreduce.wordcount.WordCount 17/04/13 12:57:13 INFO client.RMProxy: Connecting to ResourceManager at cloudera/192.168.254.203:8032 17/04/13 12:57:14 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this. 17/04/13 12:57:18 INFO input.FileInputFormat: Total input paths to process : 1 17/04/13 12:57:18 INFO mapreduce.JobSubmitter: number of splits:1 17/04/13 12:57:18 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1491999347093_0012 17/04/13 12:57:19 INFO impl.YarnClientImpl: Submitted application application_1491999347093_0012 17/04/13 12:57:19 INFO mapreduce.Job: The url to track the job: http://cloudera:8088/proxy/application_1491999347093_0012/ 17/04/13 12:57:19 INFO mapreduce.Job: Running job: job_1491999347093_0012 17/04/13 12:57:32 INFO mapreduce.Job: Job job_1491999347093_0012 running in uber mode : false 17/04/13 12:57:32 INFO mapreduce.Job: map 0% reduce 0% 17/04/13 12:57:39 INFO mapreduce.Job: map 100% reduce 0% 17/04/13 12:57:47 INFO mapreduce.Job: map 100% reduce 33% 17/04/13 12:57:49 INFO mapreduce.Job: map 100% reduce 67% 17/04/13 12:57:53 INFO mapreduce.Job: map 100% reduce 100% 17/04/13 12:57:54 INFO mapreduce.Job: Job job_1491999347093_0012 completed successfully 17/04/13 12:57:54 INFO mapreduce.Job: Counters: 49 File System Counters FILE: Number of bytes read=162 FILE: Number of bytes written=497579 FILE: Number of read operations=0 FILE: Number of large read operations=0 FILE: Number of write operations=0 HDFS: Number of bytes read=233 HDFS: Number of bytes written=62 HDFS: Number of read operations=12 HDFS: Number of large read operations=0 HDFS: Number of write operations=6 Job Counters Launched map tasks=1 Launched reduce tasks=3 Data-local map tasks=1 Total time spent by all maps in occupied slots (ms)=5167 Total time spent by all reduces in 
occupied slots (ms)=18520 Total time spent by all map tasks (ms)=5167 Total time spent by all reduce tasks (ms)=18520 Total vcore-seconds taken by all map tasks=5167 Total vcore-seconds taken by all reduce tasks=18520 Total megabyte-seconds taken by all map tasks=5291008 Total megabyte-seconds taken by all reduce tasks=18964480 Map-Reduce Framework Map input records=19 Map output records=18 Map output bytes=193 Map output materialized bytes=150 Input split bytes=111 Combine input records=0 Combine output records=0 Reduce input groups=7 Reduce shuffle bytes=150 Reduce input records=18 Reduce output records=7 Spilled Records=36 Shuffled Maps =3 Failed Shuffles=0 Merged Map outputs=3 GC time elapsed (ms)=320 CPU time spent (ms)=4280 Physical memory (bytes) snapshot=805298176 Virtual memory (bytes) snapshot=11053834240 Total committed heap usage (bytes)=529731584 Shuffle Errors BAD_ID=0 CONNECTION=0 IO_ERROR=0 WRONG_LENGTH=0 WRONG_MAP=0 WRONG_REDUCE=0 File Input Format Counters Bytes Read=122 File Output Format Counters Bytes Written=62 任務名稱: WordCount 任務成功: Yes