從搭建大數據環境提及,到執行WordCount所遇到的坑

從搭建大數據環境提及,到執行WordCount所遇到的坑

背景說明

最近(2020年12月20日)在瞭解大數據相關架構及技術體系。java

雖說只是瞭解,不須要親自動手去搭建一個環境並執行相應的jobnode

可是,技術嘛。就是要靠下笨功夫,一點點的積累。該動手的仍是不能少。linux

因此,就從搭環境(基於docker)開始,一直到成功執行了一個基於yarn調度的wordcountjobgit

期間,遇到了很多坑點,一個一個填好,大概花了10個小時左右的時間。github

但願能將這種血淚教訓,分享給須要的人。花更少的時間,去完成整個流程。web

注意:我的本地環境爲macOS Big Sursql

基於docker compose的大數據環境搭建

參考 docker-hadoop-spark-hive 快速構建你的大數據環境 搭建了一個大數據環境,調整了部分參數,以適用於mac osdocker

主要是以下五個文件:apache

.
├── copy-jar.sh # spark yarn支持
├── docker-compose.yml # docker compose文件
├── hadoop-hive.env # 環境變量配置
├── run.sh # 啓動腳本
└── stop.sh # 中止腳本

注意:mac osdocker有一個坑點就是沒法直接在宿主機訪問容器,我使用Docker for Mac 的網絡問題及解決辦法(新增方法四)中的方法四解決的。數組

注意:須要在宿主機配置好相應docker容器對應的ip,這才能保證job成功執行,且各個服務在宿主機訪問的時候,跳轉不會出現問題。這坑很深,慎踩

# switch_local

172.21.0.3 namenode
172.21.0.8 resourcemanager
172.21.0.9 nodemanager
172.21.0.10 historyserver

docker-compose.yml

version: '2' 
services:
  namenode:
    image: bde2020/hadoop-namenode:1.1.0-hadoop2.8-java8
    container_name: namenode
    volumes:
      - ~/data/namenode:/hadoop/dfs/name
    environment:
      - CLUSTER_NAME=test
    env_file:
      - ./hadoop-hive.env
    ports:
      - 50070:50070
      - 8020:8020
  resourcemanager:
    image: bde2020/hadoop-resourcemanager:1.1.0-hadoop2.8-java8
    container_name: resourcemanager
    environment:
      - CLUSTER_NAME=test
    env_file:
      - ./hadoop-hive.env
    ports:
      - 8088:8088
  historyserver:
    image: bde2020/hadoop-historyserver:1.1.0-hadoop2.8-java8
    container_name: historyserver
    environment:
      - CLUSTER_NAME=test
    env_file:
      - ./hadoop-hive.env
    ports:
      - 8188:8188
  datanode:
    image: bde2020/hadoop-datanode:1.1.0-hadoop2.8-java8
    depends_on: 
      - namenode
    volumes:
      - ~/data/datanode:/hadoop/dfs/data
    env_file:
      - ./hadoop-hive.env
    ports:
      - 50075:50075
  datanode2:
    image: bde2020/hadoop-datanode:1.1.0-hadoop2.8-java8
    depends_on: 
      - namenode
    volumes:
      - ~/data/datanode2:/hadoop/dfs/data
    env_file:
      - ./hadoop-hive.env
    ports:
      - 50076:50075
  datanode3:
    image: bde2020/hadoop-datanode:1.1.0-hadoop2.8-java8
    depends_on: 
      - namenode
    volumes:
      - ~/data/datanode3:/hadoop/dfs/data
    env_file:
      - ./hadoop-hive.env
    ports:
      - 50077:50075
  nodemanager:
    image: bde2020/hadoop-nodemanager:1.1.0-hadoop2.8-java8
    container_name: nodemanager
    hostname: nodemanager
    environment:
      - CLUSTER_NAME=test
    env_file:
      - ./hadoop-hive.env
    ports:
      - 8042:8042
  hive-server:
    image: bde2020/hive:2.1.0-postgresql-metastore
    container_name: hive-server
    env_file:
      - ./hadoop-hive.env
    environment:
      - "HIVE_CORE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore/metastore"
    ports:
      - "10000:10000"
  hive-metastore:
    image: bde2020/hive:2.1.0-postgresql-metastore
    container_name: hive-metastore
    env_file:
      - ./hadoop-hive.env
    command: /opt/hive/bin/hive --service metastore
    ports:
      - 9083:9083
  hive-metastore-postgresql:
    image: bde2020/hive-metastore-postgresql:2.1.0
    ports:
      - 5432:5432
    volumes:
      - ~/data/postgresql/:/var/lib/postgresql/data
  spark-master:
    image: bde2020/spark-master:2.1.0-hadoop2.8-hive-java8
    container_name: spark-master
    hostname: spark-master
    volumes:
      - ./copy-jar.sh:/copy-jar.sh
    ports:
      - 18080:8080
      - 7077:7077
    env_file:
      - ./hadoop-hive.env
  spark-worker:
    image: bde2020/spark-worker:2.1.0-hadoop2.8-hive-java8
    depends_on:
      - spark-master
    environment:
      - SPARK_MASTER=spark://spark-master:7077
    ports:
      - "18081:8081"
    env_file:
      - ./hadoop-hive.env

hadoop-hive.env

HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql/metastore
HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive
HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive
HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
HIVE_SITE_CONF_hive_metastore_uris=thrift://hive-metastore:9083
HIVE_SITE_CONF_hive_metastore_warehouse_dir=hdfs://namenode:8020/user/hive/warehouse

CORE_CONF_fs_defaultFS=hdfs://namenode:8020
CORE_CONF_fs_default_name=hdfs://namenode:8020
CORE_CONF_hadoop_http_staticuser_user=root
CORE_CONF_hadoop_proxyuser_hue_hosts=*
CORE_CONF_hadoop_proxyuser_hue_groups=*

HDFS_CONF_dfs_webhdfs_enabled=true
HDFS_CONF_dfs_permissions_enabled=false

YARN_CONF_yarn_log___aggregation___enable=true
YARN_CONF_yarn_resourcemanager_recovery_enabled=true
YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
YARN_CONF_yarn_timeline___service_enabled=true
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
YARN_CONF_yarn_timeline___service_hostname=historyserver
YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
YARN_CONF_yarn_resourcemanager_resource__tracker_address=resourcemanager:8031
YARN_CONF_yarn_resourcemanager_resource__tracker_address=resourcemanager:8031
YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle

run.sh

#!/bin/bash

# 啓動容器
docker-compose -f docker-compose.yml up -d namenode hive-metastore-postgresql
docker-compose -f docker-compose.yml up -d datanode datanode2 datanode3 hive-metastore
docker-compose -f docker-compose.yml up -d resourcemanager
docker-compose -f docker-compose.yml up -d nodemanager
docker-compose -f docker-compose.yml up -d historyserver
sleep 5
docker-compose -f docker-compose.yml up -d hive-server
docker-compose -f docker-compose.yml up -d spark-master spark-worker

# 獲取ip地址並打印到控制檯
my_ip=`ifconfig | grep 'inet.*netmask.*broadcast' |  awk '{print $2;exit}'`
echo "Namenode: http://${my_ip}:50070"
echo "Datanode: http://${my_ip}:50075"
echo "Spark-master: http://${my_ip}:18080"

# 執行腳本,spark yarn支持
docker-compose exec spark-master bash -c "./copy-jar.sh && exit"

copy-jar.sh

#!/bin/bash

cd /opt/hadoop-2.8.0/share/hadoop/yarn/lib/ && cp jersey-core-1.9.jar jersey-client-1.9.jar /spark/jars/ && rm -rf /spark/jars/jersey-client-2.22.2.jar

stop.sh

#!/bin/bash
docker-compose stop

基於IDEA提交MapReduceyarn

參考列表

  1. IDEA向hadoop集羣提交MapReduce做業
  2. java操做hadoop hdfs,實現文件上傳下載demo
  3. IDEA遠程提交mapreduce任務至linux,遇到ClassNotFoundException: Mapper

注意:在提交至yarn的時候,要將代碼打成jar包,不然會報錯ClassNotFoundExeption。具體參考《IDEA遠程提交mapreduce任務至linux,遇到ClassNotFoundException: Mapper》。

pom.xml

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.switchvov</groupId>
    <artifactId>hadoop-test</artifactId>
    <version>1.0.0</version>

    <name>hadoop-test</name>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.8.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.8.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.8.0</version>
        </dependency>
    </dependencies>
</project>

log4j.properties

log4j.rootLogger=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.Target=System.out
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=[%p] %d{yyyy-MM-dd HH:mm:ss,SSS} method:%l%m%n

words.txt

this is a tests
this is a tests
this is a tests
this is a tests
this is a tests
this is a tests
this is a tests
this is a tests
this is a tests

HdfsDemo.java

package com.switchvov.hadoop.hdfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

import java.io.InputStream;

/**
 * @author switch
 * @since 2020/12/18
 */
public class HdfsDemo {
    /**
     * hadoop fs的配置文件
     */
    private static final Configuration CONFIGURATION = new Configuration();

    static {
        // 指定hadoop fs的地址
        CONFIGURATION.set("fs.default.name", "hdfs://namenode:8020");
    }

    /**
     * 將本地文件(filePath)上傳到HDFS服務器的指定路徑(dst)
     */
    public static void uploadFileToHDFS(String filePath, String dst) throws Exception {
        // 建立一個文件系統
        FileSystem fs = FileSystem.get(CONFIGURATION);
        Path srcPath = new Path(filePath);
        Path dstPath = new Path(dst);
        long start = System.currentTimeMillis();
        fs.copyFromLocalFile(false, srcPath, dstPath);
        System.out.println("Time:" + (System.currentTimeMillis() - start));
        System.out.println("________準備上傳文件" + CONFIGURATION.get("fs.default.name") + "____________");
        fs.close();
    }

    /**
     * 下載文件
     */
    public static void downLoadFileFromHDFS(String src) throws Exception {
        FileSystem fs = FileSystem.get(CONFIGURATION);
        Path srcPath = new Path(src);
        InputStream in = fs.open(srcPath);
        try {
            // 將文件COPY到標準輸出(即控制檯輸出)
            IOUtils.copyBytes(in, System.out, 4096, false);
        } finally {
            IOUtils.closeStream(in);
            fs.close();
        }
    }

    public static void main(String[] args) throws Exception {
        String filename = "words.txt";
//        uploadFileToHDFS(
//                "/Users/switch/projects/OtherProjects/bigdata-enviroment/hadoop-test/data/" + filename,
//                "/share/" + filename
//        );
        downLoadFileFromHDFS("/share/output12/" + filename + "/part-r-00000");
    }
}

WordCountRunner.java

package com.switchvov.hadoop.mapreduce.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;


/**
 * @author switch
 * @since 2020/12/17
 */
public class WordCountRunner {

    /**
     * LongWritable 行號 類型
     * Text 輸入的value 類型
     * Text 輸出的key 類型
     * IntWritable 輸出的value 類型
     *
     * @author switch
     * @since 2020/12/17
     */
    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        /**
         * @param key     行號
         * @param value   第一行的內容 如  this is a tests
         * @param context 輸出
         * @throws IOException          異常
         * @throws InterruptedException 異常
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            // 以空格分割獲取字符串數組
            String[] words = line.split(" ");
            for (String word : words) {
                context.write(new Text(word), new IntWritable(1));
            }
        }
    }

    /**
     * Text 輸入的key的類型
     * IntWritable 輸入的value的類型
     * Text 輸出的key類型
     * IntWritable 輸出的value類型
     *
     * @author switch
     * @since 2020/12/17
     */
    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        /**
         * @param key     輸入map的key
         * @param values  輸入map的value
         * @param context 輸出
         * @throws IOException          異常
         * @throws InterruptedException 異常
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            context.write(key, new IntWritable(count));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // 跨平臺,保證在 Windows 下能夠提交 mr job
        conf.set("mapreduce.app-submission.cross-platform", "true");
        // 配置yarn調度
        conf.set("mapreduce.framework.name", "yarn");
        // 配置resourcemanager的主機名
        conf.set("yarn.resourcemanager.hostname", "resourcemanager");
        // 配置默認了namenode訪問地址
        conf.set("fs.defaultFS", "hdfs://namenode:8020");
        conf.set("fs.default.name", "hdfs://namenode:8020");
        // 配置代碼jar包,不然會出現ClassNotFound異常,參考:https://blog.csdn.net/qq_19648191/article/details/56684268
        conf.set("mapred.jar", "/Users/switch/projects/OtherProjects/bigdata-enviroment/hadoop-test/out/artifacts/hadoop/hadoop.jar");
        // 任務名
        Job job = Job.getInstance(conf, "word count");
        // 指定Class
        job.setJarByClass(WordCountRunner.class);
        // 指定 Mapper Class
        job.setMapperClass(WordCountMapper.class);
        // 指定 Combiner Class,與 reduce 計算邏輯同樣
        job.setCombinerClass(WordCountReducer.class);
        // 指定Reucer Class
        job.setReducerClass(WordCountReducer.class);
        // 指定輸出的KEY的格式
        job.setOutputKeyClass(Text.class);
        // 指定輸出的VALUE的格式
        job.setOutputValueClass(IntWritable.class);
        //設置Reducer 個數默認1
        job.setNumReduceTasks(1);
        // Mapper<Object, Text, Text, IntWritable> 輸出格式必須與繼承類的後兩個輸出類型一致
        String filename = "words.txt";
        String args0 = "hdfs://namenode:8020/share/" + filename;
        String args1 = "hdfs://namenode:8020/share/output12/" + filename;
        // 輸入路徑
        FileInputFormat.addInputPath(job, new Path(args0));
        // 輸出路徑
        FileOutputFormat.setOutputPath(job, new Path(args1));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

分享並記錄所學所見

相關文章
相關標籤/搜索