【hadoop】看懂WordCount例子

時間 2019-11-12

原文鏈接

前言：今天剛開始看到map和reduce類裏面的內容時，說實話一片迷茫，who are you？最後實在沒辦法，上B站看別人的解說視頻，再加上自己去網上查java的包的解釋，終於把WordCount例子看懂，準備後面自己寫一遍！實話說，現在實在肝不動了，每天只有晚上有點時間來學習，代碼貼上來，睡覺！

正文：實在不想寫太多，解釋都在代碼的註釋裏面，饒了我吧！

貼一個講的比較好的網址：https://www.cnblogs.com/houji/p/7161468.html

代碼如下：

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.hadoop;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Word-count MapReduce job: the mapper emits {@code <token, 1>} for every
 * token of the input, and the reducer (also installed as the combiner) adds
 * up the counts it receives for each token.
 */
public class WordCount {

  /**
   * Splits each input value into tokens and writes one {@code <token, 1>}
   * pair per token.
   *
   * <p>The type arguments follow {@code Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>}:
   * input key, input value, output key, output value. The input key is not
   * used here (presumably the line offset when reading plain text files;
   * confirm against the configured input format).
   */
  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable>{

    /** Constant count of 1 written for every token. */
    private static final IntWritable ONE = new IntWritable(1);

    /** Reused output key, overwritten with each token before it is written. */
    private final Text token = new Text();

    /**
     * Tokenizes {@code value} and writes {@code <token, 1>} for each token.
     *
     * @param key     input key; ignored
     * @param value   text to tokenize
     * @param context sink that receives the output pairs
     * @throws IOException          propagated from {@code context.write}
     * @throws InterruptedException propagated from {@code context.write}
     */
    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      // StringTokenizer with no explicit delimiters splits on whitespace.
      for (StringTokenizer tokens = new StringTokenizer(value.toString());
           tokens.hasMoreTokens(); ) {
        token.set(tokens.nextToken());
        context.write(token, ONE);
      }
    }
  }

  /**
   * Adds up all counts received for a key and writes {@code <key, total>}.
   */
  public static class IntSumReducer
       extends Reducer<Text,IntWritable,Text,IntWritable> {

    /** Reused output value holding the total for the current key. */
    private final IntWritable total = new IntWritable();

    /**
     * Sums {@code values} and writes the total under {@code key}.
     *
     * @param key     the key whose counts are being summed
     * @param values  the counts collected for {@code key}
     * @param context sink that receives the output pair
     * @throws IOException          propagated from {@code context.write}
     * @throws InterruptedException propagated from {@code context.write}
     */
    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      int runningSum = 0;
      for (IntWritable count : values) {
        runningSum = runningSum + count.get();
      }
      total.set(runningSum);
      context.write(key, total);
    }
  }

  /**
   * Configures the word-count job, runs it, and exits the JVM.
   *
   * <p>After generic options are parsed out of {@code args}, the remaining
   * arguments are read as one or more input paths followed by a single
   * output path. The process exits with 2 when fewer than two such arguments
   * remain, with 0 when {@code waitForCompletion} returns true, and with 1
   * otherwise.
   *
   * @param args command-line arguments
   * @throws Exception any failure raised while configuring or running the job
   */
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Whatever GenericOptionsParser does not consume is treated as paths.
    String[] paths = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (paths.length < 2) {
      System.err.println("Usage: wordcount <in> [<in>...] <out>");
      System.exit(2);
    }

    // Create a new Job from the configuration.
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);

    // Map, combine and reduce stages; the reducer doubles as the combiner.
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    // Key and value types of the job's output pairs.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Every argument except the last is an input path; the last is the output path.
    final int outputIndex = paths.length - 1;
    int next = 0;
    while (next < outputIndex) {
      FileInputFormat.addInputPath(job, new Path(paths[next]));
      next++;
    }
    FileOutputFormat.setOutputPath(job, new Path(paths[outputIndex]));

    // Run the job and translate its boolean result into an exit status.
    boolean succeeded = job.waitForCompletion(true);
    System.exit(succeeded ? 0 : 1);
  }
}

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。