pom依賴
<!-- Maven fragment for the demo: the properties pin the Scala (2.11.8), Hadoop (2.7.4) and Spark (2.1.3) versions; every Spark artifact below is the _2.11 build and takes its version from ${spark.version}. -->
<!-- NOTE(review): hadoop.version is declared but no dependency in this fragment references it; confirm whether it is used elsewhere in the pom. -->
<properties> <scala.version>2.11.8</scala.version> <hadoop.version>2.7.4</hadoop.version> <spark.version>2.1.3</spark.version> </properties> <dependencies> <dependency> <groupId>org.scala-lang</groupId> <artifactId>scala-library</artifactId> <version>${scala.version}</version> </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-core_2.11</artifactId> <version>${spark.version}</version> </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-sql_2.11</artifactId> <version>${spark.version}</version> </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-hive_2.11</artifactId> <version>${spark.version}</version> </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-streaming_2.11</artifactId> <version>${spark.version}</version> </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-streaming-flume_2.11</artifactId> <version>${spark.version}</version> </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-streaming-kafka-0-10_2.11</artifactId> <version>${spark.version}</version> </dependency> </dependencies>
demo代碼
package com.blaze.kafka2streaming;

import com.blaze.conf.ConfigurationManager;
import com.blaze.constant.Constants;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.dstream.DStream;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import scala.Tuple2;

import java.util.*;

/**
 * create by zy 2019/3/15 9:26
 *
 * <p>Kafka-to-Spark-Streaming example written with Java 8 lambdas. It subscribes to the
 * configured Kafka topics through the direct (kafka-0-10) stream, tags every record value,
 * and prints two aggregations per batch: a running count per word since start-up and a
 * count per word over a sliding 15 second window.
 */
public class BlazeDemo {

    /**
     * Directory passed to {@link JavaStreamingContext#checkpoint(String)}.
     *
     * <p>NOTE(review): the configuration also defines
     * {@code Constants.STREAMING_CHECKPOINT_DIR}; this demo deliberately keeps the
     * hard-coded path it has always used. Switch to the configured value if the
     * checkpoint should live on HDFS.
     */
    private static final String CHECKPOINT_DIR = "/streaming_checkpoint";

    /**
     * Builds the streaming job, starts it and blocks until it terminates.
     *
     * @param args unused
     * @throws IllegalStateException if a required configuration property is missing or blank
     */
    public static void main(String[] args) {
        // Resolve configuration first so a broken config fails before Spark is started.
        Map<String, Object> kafkaParams = buildKafkaParams();
        Collection<String> topics = readTopics();

        SparkConf conf = new SparkConf().setAppName("BlazeDemo").setMaster("local[2]");
        // Batch interval: every 5 seconds the job processes the data received in that interval.
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

        try {
            // updateStateByKey below needs a checkpoint directory.
            jssc.checkpoint(CHECKPOINT_DIR);

            JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaUtils.createDirectStream(
                    jssc,
                    LocationStrategies.PreferConsistent(),
                    ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));

            // One output element per Kafka record; this is the hook for per-record processing.
            JavaDStream<String> words = stream.flatMap(record -> tagRecord(record.value()));

            // (word, 1) pairs, the input of both aggregations.
            JavaPairDStream<String, Integer> wordsAndOne =
                    words.mapToPair(word -> new Tuple2<>(word, 1));

            // Running total per word across all batches; the state stream is checkpointed
            // every 60 seconds. The function is held in a typed variable because the chained
            // checkpoint(...) call gives the lambda no target type to infer from.
            Function2<List<Integer>, Optional<Integer>, Optional<Integer>> addToRunningTotal =
                    (values, state) -> {
                        int total = state.isPresent() ? state.get() : 0;
                        for (Integer value : values) {
                            total += value;
                        }
                        return Optional.of(total);
                    };
            DStream<Tuple2<String, Integer>> result =
                    wordsAndOne.updateStateByKey(addToRunningTotal).checkpoint(Durations.seconds(60));
            result.print();

            // Sliding window: every 5 seconds, aggregate the previous 15 seconds of data.
            JavaPairDStream<String, Integer> result2 = wordsAndOne.reduceByKeyAndWindow(
                    (x, y) -> x + y, Durations.seconds(15), Durations.seconds(5));
            result2.print();

            jssc.start();
            jssc.awaitTermination();
        } catch (InterruptedException e) {
            // Preserve the interrupt for whoever owns this thread, then shut down.
            Thread.currentThread().interrupt();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Always release the context, including when set-up or the job itself fails.
            jssc.close();
        }
    }

    /** Builds the Kafka consumer configuration used by the direct stream. */
    private static Map<String, Object> buildKafkaParams() {
        Map<String, Object> kafkaParams = new HashMap<>();
        // Comma-separated host:port list of the Kafka brokers to bootstrap from.
        kafkaParams.put("bootstrap.servers", requireProperty(Constants.KAFKA_BOOTSTRAP_SERVERS));
        // Record keys and values are both decoded as strings.
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        // Consumer group id; any name may be chosen.
        kafkaParams.put("group.id", requireProperty(Constants.GROUP_ID));
        // Start from the newest offsets when the group has no committed offset. This consumer
        // accepts earliest/latest/none; the older largest/smallest values are rejected.
        kafkaParams.put("auto.offset.reset", "latest");
        // Do not let the consumer commit offsets automatically in the background.
        kafkaParams.put("enable.auto.commit", false);
        return kafkaParams;
    }

    /** Reads the comma-separated topic list from configuration, ignoring blanks and padding. */
    private static Collection<String> readTopics() {
        Collection<String> topics = new HashSet<>();
        for (String topic : requireProperty(Constants.KAFKA_TOPICS).split(",")) {
            String trimmed = topic.trim();
            if (!trimmed.isEmpty()) {
                topics.add(trimmed);
            }
        }
        if (topics.isEmpty()) {
            throw new IllegalStateException(
                    "Configuration property " + Constants.KAFKA_TOPICS + " contains no topic names");
        }
        return topics;
    }

    /** Returns the trimmed value of a required property, or fails naming the missing key. */
    private static String requireProperty(String key) {
        String value = ConfigurationManager.getProperty(key);
        if (value == null || value.trim().isEmpty()) {
            throw new IllegalStateException("Missing required configuration property: " + key);
        }
        return value.trim();
    }

    /** Logs one record value (one line per record) and emits it with the demo suffix appended. */
    private static Iterator<String> tagRecord(String value) {
        System.out.println("***************************" + value + "***************************");
        return Collections.singletonList(value + "23333").iterator();
    }
}
相關配置文件
package com.blaze.conf;

import java.io.InputStream;
import java.util.Properties;

/**
 * create by zy 2019/3/15 9:33
 *
 * <p>Static accessor for the settings in {@code blaze.properties}, which is loaded once from
 * the classpath when this class is initialised. Loading is best-effort: if the resource is
 * missing or unreadable a message is written to {@code System.err} and every lookup then
 * behaves as if the key were absent.
 */
public class ConfigurationManager {

    /** Classpath resource the settings are read from. */
    private static final String CONFIG_RESOURCE = "blaze.properties";

    /** Loaded settings; populated once by the static initialiser. */
    private static final Properties prop = new Properties();

    static {
        // try-with-resources closes the stream; a null resource is legal and simply not closed.
        try (InputStream in = ConfigurationManager.class
                .getClassLoader().getResourceAsStream(CONFIG_RESOURCE)) {
            if (in == null) {
                System.err.println("ConfigurationManager: resource '" + CONFIG_RESOURCE
                        + "' was not found on the classpath; no settings loaded");
            } else {
                prop.load(in);
            }
        } catch (Exception e) {
            // Deliberately best-effort: report the failure and continue with no settings.
            System.err.println("ConfigurationManager: failed to load '" + CONFIG_RESOURCE + "'");
            e.printStackTrace();
        }
    }

    /**
     * Returns the raw value configured for {@code key}.
     *
     * @param key property name
     * @return the value exactly as stored, or {@code null} if the key is absent
     */
    public static String getProperty(String key) {
        return prop.getProperty(key);
    }

    /**
     * Returns an integer setting.
     *
     * @param key property name
     * @return the parsed value, or {@code 0} if the key is absent or not a valid integer
     */
    public static Integer getInteger(String key) {
        String value = trimmedValue(key);
        if (value == null) {
            return 0;
        }
        try {
            return Integer.valueOf(value);
        } catch (NumberFormatException e) {
            reportInvalid(key, value, "an integer");
            return 0;
        }
    }

    /**
     * Returns a boolean setting.
     *
     * @param key property name
     * @return {@code true} only if the value equals "true" ignoring case; {@code false}
     *         otherwise, including when the key is absent
     */
    public static Boolean getBoolean(String key) {
        // Boolean.valueOf never throws and maps null to false, so no guard is needed.
        return Boolean.valueOf(trimmedValue(key));
    }

    /**
     * Returns a long setting.
     *
     * @param key property name
     * @return the parsed value, or {@code 0L} if the key is absent or not a valid long
     */
    public static Long getLong(String key) {
        String value = trimmedValue(key);
        if (value == null) {
            return 0L;
        }
        try {
            return Long.valueOf(value);
        } catch (NumberFormatException e) {
            reportInvalid(key, value, "a long");
            return 0L;
        }
    }

    /**
     * Looks up {@code key} and strips surrounding whitespace; Properties.load keeps trailing
     * blanks in values, which would otherwise make numeric parsing fail.
     */
    private static String trimmedValue(String key) {
        String value = getProperty(key);
        return value == null ? null : value.trim();
    }

    /** Reports a value that could not be parsed as the requested type. */
    private static void reportInvalid(String key, String value, String expected) {
        System.err.println("ConfigurationManager: property '" + key + "' has value '" + value
                + "' which is not " + expected + "; using the default");
    }
}
package com.blaze.constant; /** * create by zy 2019/3/15 9:31 * TODO:常量接口 */ public interface Constants { String GROUP_ID = "group.id"; String KAFKA_TOPICS = "kafka.topics"; String KAFKA_BOOTSTRAP_SERVERS = "bootstrap.servers"; String STREAMING_CHECKPOINT_DIR = "streaming.checkpoint.dir"; }
blaze.properties
# Kafka broker list; the demo passes this value as the consumer's "bootstrap.servers".
bootstrap.servers=192.168.44.41:9092,192.168.44.42:9092,192.168.44.43:9092
# Topics to subscribe to; the demo splits this value on ",".
kafka.topics=sparkDemo
# Kafka consumer group id; the demo passes this value as the consumer's "group.id".
group.id=blaze
# Checkpoint directory. NOTE(review): the demo's only read of this key is commented out and
# it checkpoints to the hard-coded "/streaming_checkpoint" instead; confirm which is intended.
streaming.checkpoint.dir=hdfs://192.168.44.41:9000/streaming_checkpoint