Environment: JDK 1.8.0, Hadoop 2.6.0, Scala 2.11.8, Spark 2.1.2, Oozie 4.1, Hue 3.9
The job runs with Kryo serialization enabled and class registration required:

```
spark.serializer=org.apache.spark.serializer.KryoSerializer
spark.kryo.registrationRequired=true
```
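For reference, the same two settings can also be applied programmatically on the `SparkConf` instead of in a config file; a minimal sketch (the class name here is illustrative):

```java
import org.apache.spark.SparkConf;

public class KryoConfExample {
    public static SparkConf build() {
        // Enable Kryo and require explicit registration: serializing an
        // unregistered class then fails fast with an exception instead of
        // silently writing fully-qualified class names into the stream.
        return new SparkConf()
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                .set("spark.kryo.registrationRequired", "true");
    }
}
```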
```java
import dw.common.util.HdfsHelper;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;

import java.util.Arrays;

public class WordCount {
    public static void main(String[] args) throws ClassNotFoundException {
        // Input file
        String wordFile = "/user/qhy/input/wordcount/idea.txt";
        SparkConf sparkConf = new SparkConf();
        // With spark.kryo.registrationRequired=true, every class that gets
        // serialized must be registered, including these internal ones
        sparkConf.registerKryoClasses(new Class<?>[]{
                java.lang.Class.class,
                Object[].class,
                Class.forName("scala.reflect.ClassTag$$anon$1")
        });
        SparkSession spark = SparkSession.builder()
                .appName("WordCount")
                .config(sparkConf)
                .config("spark.executor.instances", 10)
                .config("spark.executor.memory", "4g")
                .config("spark.executor.cores", 1)
                .config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
                .getOrCreate();
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
        JavaRDD<String> hdfsText = jsc.textFile(wordFile);
        // Split each line into words
        JavaRDD<String> words = hdfsText.flatMap(line -> Arrays.asList(line.split("\\s+")).iterator());
        // Map each word to a count of 1
        JavaPairRDD<String, Integer> pairs = words.mapToPair(word -> new Tuple2<>(word, 1));
        // Sum the counts per word
        JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey((v1, v2) -> v1 + v2);
        // Swap key and value so we can sort by count
        JavaPairRDD<Integer, String> swapWordCounts = wordCounts.mapToPair(Tuple2::swap);
        // Sort by count, descending, into a single partition
        swapWordCounts = swapWordCounts.sortByKey(false, 1).repartition(1);
        // Format each record as "count word"; RDDs are immutable, so the
        // mapped result must be assigned and saved rather than discarded
        JavaRDD<String> result = swapWordCounts.map(tuple -> tuple._1 + " " + tuple._2);
        // Save the result to HDFS, deleting any previous output first
        String outDir = "/user/qhy/output/wordcount";
        HdfsHelper.deleteDir(jsc, outDir);
        result.saveAsTextFile(outDir);
        jsc.close();
    }
}
```
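`dw.common.util.HdfsHelper` is the author's own utility and its source is not shown here. A plausible minimal sketch of `deleteDir`, assuming the standard Hadoop FileSystem API, might look like this (not the original implementation):

```java
// Hypothetical sketch of HdfsHelper.deleteDir, not the author's original source.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaSparkContext;

public class HdfsHelper {
    /** Recursively deletes an HDFS directory if it exists. */
    public static void deleteDir(JavaSparkContext jsc, String dir) {
        try {
            Configuration conf = jsc.hadoopConfiguration();
            FileSystem fs = FileSystem.get(conf);
            Path path = new Path(dir);
            if (fs.exists(path)) {
                fs.delete(path, true); // true = recursive
            }
        } catch (IOException e) {
            throw new RuntimeException("Failed to delete " + dir, e);
        }
    }
}
```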
Symptoms: the job fails with stack traces containing keywords such as:

```
java.io.EOFException
java.io.IOException: java.lang.NullPointerException
java.lang.IndexOutOfBoundsException
com.esotericsoftware.kryo.KryoException
TorrentBroadcast
```
To rule out other causes first, you can switch

```
spark.serializer=org.apache.spark.serializer.KryoSerializer
```

back to the default Java serializer:

```
spark.serializer=org.apache.spark.serializer.JavaSerializer
```

If the job then runs cleanly, the failure is indeed tied to Kryo.
The likely cause of this error is that two conflicting jar files, kryo-2.21.jar and kryo-shaded-3.0.3.jar, both exist in the runtime environment; deleting kryo-2.21.jar resolves it.
If the job is scheduled through Oozie, restart Oozie afterwards, or the job may fail with an error (JA008: File does not exist).
In walker's environment, the HDFS directory holding the two jars was .../share/lib/lib_20190930130812/spark
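For illustration only (the path prefix stays elided as in the original), removing the conflicting jar from that sharelib directory could look like `hdfs dfs -rm .../share/lib/lib_20190930130812/spark/kryo-2.21.jar`, after which Oozie is restarted so it picks up the changed sharelib.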
A related error, seen when a task result contains a Hadoop Writable type such as Text:

```
ERROR scheduler.TaskSetManager: Task 0.0 in stage 1.0 (TID 0) had a not serializable result: org.apache.hadoop.io.Text
```
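Hadoop's `Text` does not implement `java.io.Serializable`, so it cannot be shipped as a task result under the Java serializer. A minimal sketch of one common remedy, converting Writables to plain Strings immediately after reading (`readAsStrings` and `inputPath` are hypothetical names for illustration):

```java
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class WritableToString {
    // Convert Hadoop Text keys/values to plain Strings right after reading,
    // so no non-Serializable (and mutable, reused) Writable objects reach
    // a shuffle, a collect(), or a task result.
    public static JavaPairRDD<String, String> readAsStrings(JavaSparkContext jsc, String inputPath) {
        JavaPairRDD<Text, Text> raw = jsc.sequenceFile(inputPath, Text.class, Text.class);
        return raw.mapToPair(t -> new Tuple2<>(t._1().toString(), t._2().toString()));
    }
}
```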
This article is by walker.