I've been focusing on Spark development recently. This post records my own work and learning journey, and I hope to exchange ideas and grow together with everyone.
This article leans toward a hands-on case study. For framework internals and basic usage, please read the related articles listed in the references at the end.
Posts on setting up the ZooKeeper/Kafka/HBase/Hadoop cluster environments will follow.
After publication, errors in the article will be corrected and content added promptly; subscriptions are welcome.
QQ: 86608625  WeChat: guofei1990123
Kafka ingests data in real time from the collection tool Flume or from business systems' real-time interfaces, and serves as a message buffer that provides a reliable data feed to the downstream real-time computation framework. Since version 1.3, Spark supports two mechanisms for integrating with Kafka (the Receiver-based Approach and the Direct Approach); see the official documentation linked at the end of this article for details. HBase is used for data storage.
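For contrast with the Direct approach used in the main class later in this post, here is a minimal sketch of the Receiver-based approach against the Spark 1.3-era API. The consumer group id and the topic map are illustrative values of mine; the ZooKeeper quorum assumes the three-node layout described below.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object ReceiverApproachSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("ReceiverApproachSketch")
    val ssc = new StreamingContext(conf, Seconds(5))
    // Receiver-based approach: the consumer connects through ZooKeeper, and
    // offsets are tracked per consumer group in ZK (unlike the Direct approach,
    // which talks to the brokers and tracks offsets itself).
    val zkQuorum = "hadoop1:2181,hadoop2:2181,hadoop3:2181"
    val stream = KafkaUtils.createStream(ssc, zkQuorum, "pageview-group", Map("user_events" -> 1))
    stream.map(_._2).print()
    ssc.start()
    ssc.awaitTermination()
  }
}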
Because the author's machine is limited, the Hadoop/ZooKeeper/Kafka clusters are co-located on hosts named hadoop1, hadoop2, and hadoop3; HBase runs as a single node on hadoop1.
Because the author's skills are limited, the code has some design flaws. For example, the logic that saves Spark Streaming results to HBase performs poorly; suggestions are welcome so corrections can be made promptly.
Kafka message simulator
package clickstream

import java.util.{Properties, Random, UUID}

import kafka.producer.{KeyedMessage, Producer, ProducerConfig}
import org.codehaus.jettison.json.JSONObject

/**
  * Created by 郭飛 on 2016/5/31.
  */
object KafkaMessageGenerator {
  private val random = new Random()
  private var pointer = -1
  private val os_type = Array("Android", "IPhone OS", "None", "Windows Phone")

  // Random click count in [0, 10)
  def click(): Double = {
    random.nextInt(10)
  }

  // Cycle through the device types in round-robin order
  def getOsType(): String = {
    pointer = pointer + 1
    if (pointer >= os_type.length) {
      pointer = 0
    }
    os_type(pointer)
  }

  def main(args: Array[String]): Unit = {
    val topic = "user_events"
    // Kafka broker list (on the local VMs)
    val brokers = "hadoop1:9092,hadoop2:9092,hadoop3:9092"
    val props = new Properties()
    props.put("metadata.broker.list", brokers)
    props.put("serializer.class", "kafka.serializer.StringEncoder")

    val kafkaConfig = new ProducerConfig(props)
    val producer = new Producer[String, String](kafkaConfig)

    while (true) {
      // prepare event data
      val event = new JSONObject()
      event
        .put("uid", UUID.randomUUID())                         // random user id
        .put("event_time", System.currentTimeMillis.toString)  // event timestamp
        .put("os_type", getOsType)                             // device type
        .put("click_count", click)                             // click count

      // produce event message
      producer.send(new KeyedMessage[String, String](topic, event.toString))
      println("Message sent: " + event)
      Thread.sleep(200)
    }
  }
}
Spark Streaming main class
package clickstream

import kafka.serializer.StringDecoder
import net.sf.json.JSONObject
import org.apache.hadoop.hbase.client.{HTable, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Created by 郭飛 on 2016/5/31.
  */
object PageViewStream {
  def main(args: Array[String]): Unit = {
    var masterUrl = "local[2]"
    if (args.length > 0) {
      masterUrl = args(0)
    }

    // Create a StreamingContext with the given master URL
    val conf = new SparkConf().setMaster(masterUrl).setAppName("PageViewStream")
    val ssc = new StreamingContext(conf, Seconds(5))

    // Kafka configurations: the topic must match the one the generator produces to
    val topics = Set("user_events")
    // Kafka broker list (on the local VMs)
    val brokers = "hadoop1:9092,hadoop2:9092,hadoop3:9092"
    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> brokers,
      "serializer.class" -> "kafka.serializer.StringEncoder")

    // Create a direct stream
    val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topics)

    // Parse each message value as JSON
    val events = kafkaStream.flatMap(line => {
      val data = JSONObject.fromObject(line._2)
      Some(data)
    })

    // Compute per-user click counts within each batch
    val userClicks = events.map(x => (x.getString("uid"), x.getInt("click_count"))).reduceByKey(_ + _)
    userClicks.foreachRDD(rdd => {
      rdd.foreachPartition(partitionOfRecords => {
        // HBase configuration and table handle: created once per partition rather
        // than once per record, since neither is serializable
        val tableName = "PageViewStream"
        val hbaseConf = HBaseConfiguration.create()
        hbaseConf.set("hbase.zookeeper.quorum", "hadoop1") // ZooKeeper host; the client port is set separately below
        hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
        hbaseConf.set("hbase.defaults.for.version.skip", "true")
        val statTable = new HTable(hbaseConf, TableName.valueOf(tableName))
        statTable.setAutoFlush(false, false)
        // Client-side write buffer
        statTable.setWriteBufferSize(3 * 1024 * 1024)
        partitionOfRecords.foreach(pair => {
          val uid = pair._1   // user id
          val click = pair._2 // click count
          // Assemble the Put: row key = uid, column Stat:ClickStat = click count
          val put = new Put(Bytes.toBytes(uid))
          put.add("Stat".getBytes, "ClickStat".getBytes, Bytes.toBytes(click))
          statTable.put(put)
        })
        // Flush the buffered puts and release the table
        statTable.flushCommits()
        statTable.close()
      })
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
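The job assumes that the PageViewStream table with column family Stat already exists in HBase. A minimal sketch to create it with the 0.96 client API used in this project (the CreateStatTable object name is mine):

import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}

object CreateStatTable {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "hadoop1")
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    val admin = new HBaseAdmin(conf)
    try {
      if (!admin.tableExists("PageViewStream")) {
        val desc = new HTableDescriptor(TableName.valueOf("PageViewStream"))
        desc.addFamily(new HColumnDescriptor("Stat")) // column family written by the streaming job
        admin.createTable(desc)
      }
    } finally {
      admin.close()
    }
  }
}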
Maven pom.xml:

<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.guofei.spark</groupId>
  <artifactId>RiskControl</artifactId>
  <version>1.0-SNAPSHOT</version>
  <packaging>jar</packaging>
  <name>RiskControl</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <!-- Spark core and streaming -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>1.3.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.10</artifactId>
      <version>1.3.0</version>
    </dependency>
    <!-- Spark-Kafka integration -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka_2.10</artifactId>
      <version>1.3.0</version>
    </dependency>
    <!-- HBase integration -->
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase</artifactId>
      <version>0.96.2-hadoop2</version>
      <type>pom</type>
    </dependency>
    <!-- HBase dependencies -->
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-server</artifactId>
      <version>0.96.2-hadoop2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-client</artifactId>
      <version>0.96.2-hadoop2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-common</artifactId>
      <version>0.96.2-hadoop2</version>
    </dependency>
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>1.3.2</version>
    </dependency>
    <dependency>
      <groupId>commons-logging</groupId>
      <artifactId>commons-logging</artifactId>
      <version>1.1.3</version>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>
    <dependency>
      <groupId>com.google.protobuf</groupId>
      <artifactId>protobuf-java</artifactId>
      <version>2.5.0</version>
    </dependency>
    <dependency>
      <groupId>io.netty</groupId>
      <artifactId>netty</artifactId>
      <version>3.6.6.Final</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-protocol</artifactId>
      <version>0.96.2-hadoop2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.zookeeper</groupId>
      <artifactId>zookeeper</artifactId>
      <version>3.4.5</version>
    </dependency>
    <dependency>
      <groupId>org.cloudera.htrace</groupId>
      <artifactId>htrace-core</artifactId>
      <version>2.01</version>
    </dependency>
    <dependency>
      <groupId>org.codehaus.jackson</groupId>
      <artifactId>jackson-mapper-asl</artifactId>
      <version>1.9.13</version>
    </dependency>
    <dependency>
      <groupId>org.codehaus.jackson</groupId>
      <artifactId>jackson-core-asl</artifactId>
      <version>1.9.13</version>
    </dependency>
    <dependency>
      <groupId>org.codehaus.jackson</groupId>
      <artifactId>jackson-jaxrs</artifactId>
      <version>1.9.13</version>
    </dependency>
    <dependency>
      <groupId>org.codehaus.jackson</groupId>
      <artifactId>jackson-xc</artifactId>
      <version>1.9.13</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>1.6.4</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
      <version>1.6.4</version>
    </dependency>
    <!-- Hadoop dependencies -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.6.4</version>
    </dependency>
    <dependency>
      <groupId>commons-configuration</groupId>
      <artifactId>commons-configuration</artifactId>
      <version>1.6</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-auth</artifactId>
      <version>2.6.4</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.6.4</version>
    </dependency>
    <dependency>
      <groupId>net.sf.json-lib</groupId>
      <artifactId>json-lib</artifactId>
      <version>2.4</version>
      <classifier>jdk15</classifier>
    </dependency>
    <dependency>
      <groupId>org.codehaus.jettison</groupId>
      <artifactId>jettison</artifactId>
      <version>1.1</version>
    </dependency>
    <dependency>
      <groupId>redis.clients</groupId>
      <artifactId>jedis</artifactId>
      <version>2.5.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-pool2</artifactId>
      <version>2.2</version>
    </dependency>
  </dependencies>

  <build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
      <plugin>
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <version>3.2.2</version>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
            <configuration>
              <args>
                <arg>-make:transitive</arg>
                <arg>-dependencyfile</arg>
                <arg>${project.build.directory}/.scala_dependencies</arg>
              </args>
            </configuration>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>2.4.3</version>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <filters>
                <filter>
                  <artifact>*:*</artifact>
                  <excludes>
                    <exclude>META-INF/*.SF</exclude>
                    <exclude>META-INF/*.DSA</exclude>
                    <exclude>META-INF/*.RSA</exclude>
                  </excludes>
                </filter>
              </filters>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
A final note on the write path above: every object referenced inside these closures must be serializable, because Spark serializes the closures on the driver and ships them to the executors.

userClicks.foreachRDD(rdd => {
  rdd.foreachPartition(partitionOfRecords => {
    partitionOfRecords.foreach(pair => {
      // Every object referenced in here must be serializable
      // (or be constructed inside the closure, like the HBase client above)
    })
  })
})
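To make that concrete, here is a minimal sketch of a serialization-safe sink, reusing the table name, column family, and ZooKeeper address from the job above (the HBaseSink object name is mine). Everything HBase-related is constructed inside the partition closure, on the executor, so nothing non-serializable is ever captured from the driver.

import org.apache.hadoop.hbase.client.{HTable, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.spark.rdd.RDD

object HBaseSink {
  def save(rdd: RDD[(String, Int)]): Unit = {
    rdd.foreachPartition { partition =>
      // Built on the executor: HBaseConfiguration and HTable are not serializable,
      // so they must not be created on the driver and captured by this closure
      val conf = HBaseConfiguration.create()
      conf.set("hbase.zookeeper.quorum", "hadoop1")
      conf.set("hbase.zookeeper.property.clientPort", "2181")
      val table = new HTable(conf, TableName.valueOf("PageViewStream"))
      partition.foreach { case (uid, clicks) =>
        val put = new Put(Bytes.toBytes(uid))
        put.add("Stat".getBytes, "ClickStat".getBytes, Bytes.toBytes(clicks))
        table.put(put)
      }
      table.close()
    }
  }
}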
Author: MichaelFly
Link: https://www.jianshu.com/p/ccba410462ba
Source: Jianshu
Copyright belongs to the author. For any form of reproduction, please contact the author for authorization and cite the source.