Spark RDD Concepts Learning Series (18): RDD Persistence, Broadcast Variables, and Accumulators

 

 

1. RDD persistence

2. Broadcast variables

3. Accumulators

 

 

1. RDD persistence

With spark-shell, we can quickly verify our ideas and operations.

 

Start the HDFS cluster

spark@SparkSingleNode:/usr/local/hadoop/hadoop-2.6.0$ sbin/start-dfs.sh

 

Start the Spark cluster

spark@SparkSingleNode:/usr/local/spark/spark-1.5.2-bin-hadoop2.6$ sbin/start-all.sh

 

Start spark-shell

spark@SparkSingleNode:/usr/local/spark/spark-1.5.2-bin-hadoop2.6/bin$ ./spark-shell --master spark://SparkSingleNode:7077 --executor-memory 1g

 

reduce

scala> sc
res0: org.apache.spark.SparkContext = org.apache.spark.SparkContext@3bcc8f13

scala> val numbers = sc.parallelize
<console>:21: error: missing arguments for method parallelize in class SparkContext;
follow this method with `_' if you want to treat it as a partially applied function
val numbers = sc.parallelize
^

scala> val numbers = sc.parallelize(1 to 100)
numbers: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:21

scala> numbers.reduce(_+_)

 

took 11.790246 s
res1: Int = 5050

As we can see, reduce is an action.

 

scala> val result = numbers.map(2*_)
result: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[1] at map at <console>:23

scala> val data = result.collect

 

reduce source code

/**
 * Reduces the elements of this RDD using the specified commutative and
 * associative binary operator.
 */
def reduce(f: (T, T) => T): T = withScope {
  val cleanF = sc.clean(f)
  val reducePartition: Iterator[T] => Option[T] = iter => {
    if (iter.hasNext) {
      Some(iter.reduceLeft(cleanF))
    } else {
      None
    }
  }
  var jobResult: Option[T] = None
  val mergeResult = (index: Int, taskResult: Option[T]) => {
    if (taskResult.isDefined) {
      jobResult = jobResult match {
        case Some(value) => Some(f(value, taskResult.get))
        case None => taskResult
      }
    }
  }
  sc.runJob(this, reducePartition, mergeResult)
  // Get the final result out of our Option, or throw an exception if the RDD was empty
  jobResult.getOrElse(throw new UnsupportedOperationException("empty collection"))
}
As we can see, this is also an action.

 

collect

data: Array[Int] = Array(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198, 200)

scala>

 

collect source code

/**
 * Return an array that contains all of the elements in this RDD.
 */
def collect(): Array[T] = withScope {
  val results = sc.runJob(this, (iter: Iterator[T]) => iter.toArray)
  Array.concat(results: _*)
}
As we can see, this is also an action.

 

From the point of view of collecting results: to see the execution result at the command-line terminal, you must call collect.

From the source-code point of view, every action-level operation triggers sc.runJob. Note that in Spark one application may contain multiple jobs, whereas in Hadoop MapReduce one application has exactly one job.
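As a quick illustration (a minimal sketch, reusing the numbers RDD from above): each action triggers its own sc.runJob, so the snippet below produces two separate jobs in the same application.

// Transformations only build the lineage; no job is launched yet.
val doubled = numbers.map(_ * 2)

// Each action below triggers sc.runJob, i.e. one job each in the Spark UI.
doubled.count()     // job 1
doubled.collect()   // job 2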

 

count

scala> numbers
res2: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:21

scala> 1 to 100
res3: scala.collection.immutable.Range.Inclusive = Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100)

scala> numbers.count

 

took 0.649005 s
res4: Long = 100

 

count source code

/**
 * Return the number of elements in the RDD.
 */
def count(): Long = sc.runJob(this, Utils.getIteratorSize _).sum

As we can see, this is also an action.

 

take

 

scala> val topN = numbers.take(5)

 

topN: Array[Int] = Array(1, 2, 3, 4, 5)

 

take source code

 

/**
 * Take the first num elements of the RDD. It works by first scanning one partition, and use the
 * results from that partition to estimate the number of additional partitions needed to satisfy
 * the limit.
 *
 * @note due to complications in the internal implementation, this method will raise
 * an exception if called on an RDD of `Nothing` or `Null`.
 */
def take(num: Int): Array[T] = withScope {
  if (num == 0) {
    new Array[T](0)
  } else {
    val buf = new ArrayBuffer[T]
    val totalParts = this.partitions.length
    var partsScanned = 0
    while (buf.size < num && partsScanned < totalParts) {
      // The number of partitions to try in this iteration. It is ok for this number to be
      // greater than totalParts because we actually cap it at totalParts in runJob.
      var numPartsToTry = 1
      if (partsScanned > 0) {
        // If we didn't find any rows after the previous iteration, quadruple and retry.
        // Otherwise, interpolate the number of partitions we need to try, but overestimate
        // it by 50%. We also cap the estimation in the end.
        if (buf.size == 0) {
          numPartsToTry = partsScanned * 4
        } else {
          // the left side of max is >=1 whenever partsScanned >= 2
          numPartsToTry = Math.max((1.5 * num * partsScanned / buf.size).toInt - partsScanned, 1)
          numPartsToTry = Math.min(numPartsToTry, partsScanned * 4)
        }
      }

      val left = num - buf.size
      val p = partsScanned until math.min(partsScanned + numPartsToTry, totalParts)
      val res = sc.runJob(this, (it: Iterator[T]) => it.take(left).toArray, p)

      res.foreach(buf ++= _.take(num - buf.size))
      partsScanned += numPartsToTry
    }

    buf.toArray
  }
}
As we can see, this is also an action.

 

 

countByKey

 

scala> val scores = Array(Tuple2(1,100),Tuple2(1,100),Tuple2(2,100),Tuple2(2,100),Tuple2(3,100))
scores: Array[(Int, Int)] = Array((1,100), (1,100), (2,100), (2,100), (3,100))

scala> val content = sc.parallelize
<console>:21: error: missing arguments for method parallelize in class SparkContext;
follow this method with `_' if you want to treat it as a partially applied function
val content = sc.parallelize
^

scala> val content = sc.parallelize(scores)
content: org.apache.spark.rdd.RDD[(Int, Int)] = ParallelCollectionRDD[0] at parallelize at <console>:23

scala> val data = content.countByKey()

 

took 10.556634 s
data: scala.collection.Map[Int,Long] = Map(2 -> 2, 1 -> 2, 3 -> 1)

 

countByKey source code

 

/**
 * Count the number of elements for each key, collecting the results to a local Map.
 *
 * Note that this method should only be used if the resulting map is expected to be small, as
 * the whole thing is loaded into the driver's memory.
 * To handle very large results, consider using rdd.mapValues(_ => 1L).reduceByKey(_ + _), which
 * returns an RDD[T, Long] instead of a map.
 */
def countByKey(): Map[K, Long] = self.withScope {
  self.mapValues(_ => 1L).reduceByKey(_ + _).collect().toMap
}

As we can see, this is also an action.



saveAsTextFile
Previously, in the post on RDD hands-on practice (basic RDD operations with transformation/action flow diagrams and source code), we ran:
scala> val partitionsReadmeRdd =  sc.textFile("hdfs://SparkSingleNode:9000/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).saveAsTextFile("~/partition1README.txt")

 

Here, we run:

scala> val partitionsReadmeRdd =  sc.textFile("/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).saveAsTextFile("/partition1README.txt")


 

 

saveAsTextFile source code

/**
 * Save this RDD as a text file, using string representations of elements.
 */
def saveAsTextFile(path: String): Unit = withScope {
  // https://issues.apache.org/jira/browse/SPARK-2075
  //
  // NullWritable is a `Comparable` in Hadoop 1.+, so the compiler cannot find an implicit
  // Ordering for it and will use the default `null`. However, it's a `Comparable[NullWritable]`
  // in Hadoop 2.+, so the compiler will call the implicit `Ordering.ordered` method to create an
  // Ordering for `NullWritable`. That's why the compiler will generate different anonymous
  // classes for `saveAsTextFile` in Hadoop 1.+ and Hadoop 2.+.
  //
  // Therefore, here we provide an explicit Ordering `null` to make sure the compiler generate
  // same bytecodes for `saveAsTextFile`.
  val nullWritableClassTag = implicitly[ClassTag[NullWritable]]
  val textClassTag = implicitly[ClassTag[Text]]
  val r = this.mapPartitions { iter =>
    val text = new Text()
    iter.map { x =>
      text.set(x.toString)
      (NullWritable.get(), text)
    }
  }
  RDD.rddToPairRDDFunctions(r)(nullWritableClassTag, textClassTag, null)
    .saveAsHadoopFile[TextOutputFormat[NullWritable, Text]](path)
}

/**
 * Save this RDD as a compressed text file, using string representations of elements.
 */
def saveAsTextFile(path: String, codec: Class[_ <: CompressionCodec]): Unit = withScope {
  // https://issues.apache.org/jira/browse/SPARK-2075
  val nullWritableClassTag = implicitly[ClassTag[NullWritable]]
  val textClassTag = implicitly[ClassTag[Text]]
  val r = this.mapPartitions { iter =>
    val text = new Text()
    iter.map { x =>
      text.set(x.toString)
      (NullWritable.get(), text)
    }
  }
  RDD.rddToPairRDDFunctions(r)(nullWritableClassTag, textClassTag, null)
    .saveAsHadoopFile[TextOutputFormat[NullWritable, Text]](path, codec)
}


saveAsTextFile can write not only into the cluster (HDFS) but also to the local filesystem; which one depends on how Hadoop is running.
So, as we can see, it is an action as well.
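As a hedged sketch of that point (the output paths below are made up for illustration; the URI scheme decides where the data lands):

// Assuming wordCounts is the (word, count) RDD built above.
// Write into HDFS, i.e. the cluster:
wordCounts.saveAsTextFile("hdfs://SparkSingleNode:9000/output/wordcount")

// Write to the local filesystem instead -- note the file:// scheme:
wordCounts.saveAsTextFile("file:///tmp/wordcount")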

The above covers the first aspect of RDD persistence: action-level operations.
The second aspect of RDD persistence is persisting via persist.
Why does persist show up everywhere in Spark?
Reason one: by default, Spark keeps data in memory, which suits fast iteration. For example, if a stage has 1,000 steps, data is actually read only at step 1 and written out only at step 1,000, with no intermediate temporary data produced. But a distributed system fails easily, so fault tolerance has to be designed in.

Because an RDD carries its lineage, if a later RDD's data partition is lost, or the RDD itself is lost, it can be recomputed from the lineage of the RDDs it depends on.
But with 1,000 steps, if no parent RDD along the way was persisted or cached, the recomputation has to start from the very beginning.


When should you persist?
1. When a step is very expensive to compute                                  (manual)
2. When the computation chain is particularly long                           (manual)
3. The RDD being checkpointed must also be persisted (note: persist before checkpoint; see the sketch after this list)   (manual)
   checkpoint is an RDD operator: in code, call rdd.cache() (or rdd.persist()) first, and then rdd.checkpoint().
4. After a shuffle (shuffled data travelled over the network, so the risk of losing it is high)   (manual)
5. Before a shuffle (the framework does this for us by default, persisting the shuffle data to local disk)
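A minimal sketch of items 3 and 4, assuming an existing SparkContext sc and an HDFS checkpoint directory chosen just for this example:

// The checkpoint directory must be set before calling checkpoint().
sc.setCheckpointDir("hdfs://SparkSingleNode:9000/checkpoint")

val counts = sc.textFile("/README.md")
  .flatMap(_.split(" "))
  .map(word => (word, 1))
  .reduceByKey(_ + _)        // shuffle happens here

counts.cache()        // persist first, so checkpointing does not recompute the whole lineage
counts.checkpoint()   // mark for checkpointing; the data is written when the first job runs
counts.count()        // the action that actually materializes both the cache and the checkpoint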



checkpoint source code

/**
 * Mark this RDD for checkpointing. It will be saved to a file inside the checkpoint
 * directory set with `SparkContext#setCheckpointDir` and all references to its parent
 * RDDs will be removed. This function must be called before any job has been
 * executed on this RDD. It is strongly recommended that this RDD is persisted in
 * memory, otherwise saving it on a file will require recomputation.
 */
def checkpoint(): Unit = RDDCheckpointData.synchronized {
  // NOTE: we use a global lock here due to complexities downstream with ensuring
  // children RDD partitions point to the correct parent partitions. In the future
  // we should revisit this consideration.
  if (context.checkpointDir.isEmpty) {
    throw new SparkException("Checkpoint directory has not been set in the SparkContext")
  } else if (checkpointData.isEmpty) {
    checkpointData = Some(new ReliableRDDCheckpointData(this))
  }
}



persist source code
/**
 * Mark this RDD for persisting using the specified level.
 *
 * @param newLevel the target storage level
 * @param allowOverride whether to override any existing level with the new one
 */
private def persist(newLevel: StorageLevel, allowOverride: Boolean): this.type = {
  // TODO: Handle changes of StorageLevel
  if (storageLevel != StorageLevel.NONE && newLevel != storageLevel && !allowOverride) {
    throw new UnsupportedOperationException(
      "Cannot change storage level of an RDD after it was already assigned a level")
  }
  // If this is the first time this RDD is marked for persisting, register it
  // with the SparkContext for cleanups and accounting. Do this only once.
  if (storageLevel == StorageLevel.NONE) {
    sc.cleaner.foreach(_.registerRDDForCleanup(this))
    sc.persistRDD(this)
  }
  storageLevel = newLevel
  this
}

/**
 * Set this RDD's storage level to persist its values across operations after the first time
 * it is computed. This can only be used to assign a new storage level if the RDD does not
 * have a storage level set yet. Local checkpointing is an exception.
 */
def persist(newLevel: StorageLevel): this.type = {
  if (isLocallyCheckpointed) {
    // This means the user previously called localCheckpoint(), which should have already
    // marked this RDD for persisting. Here we should override the old storage level with
    // one that is explicitly requested by the user (after adapting it to use disk).
    persist(LocalRDDCheckpointData.transformStorageLevel(newLevel), allowOverride = true)
  } else {
    persist(newLevel, allowOverride = false)
  }
}

/** Persist this RDD with the default storage level (`MEMORY_ONLY`). */
def persist(): this.type = persist(StorageLevel.MEMORY_ONLY)

/** Persist this RDD with the default storage level (`MEMORY_ONLY`). */
def cache(): this.type = persist()



StorageLevel defines many storage levels.

 

Serialization comes into play here.

Q: why serialize?

A: to save space and shrink the data. When memory is not enough, the data held in memory can be stored in serialized form.

  Of course, there is a downside: serialized data must be deserialized when it is read, and deserialization costs CPU.

 

MEMORY_AND_DISK
Suppose we choose MEMORY_AND_DISK as the storage level. Does that mean the data is stored in memory and on disk at the same time?
A: No. Spark always prefers memory; as long as memory is sufficient, it never touches disk. Only when memory runs out does it spill part of the data to disk.

This greatly reduces the probability of data loss.

 
MEMORY_ONLY
Suppose we choose MEMORY_ONLY. Then the data is kept only in memory; when memory runs out, you may hit an OOM error or lose cached data.

OFF_HEAP
This involves Tachyon, a memory-based distributed storage system.

Why keep two replicas, and what is the benefit?

If a computation is very time-consuming and memory-based, and one replica is lost, the job can switch quickly to the other replica. This is trading space for time, and it matters a great deal.
Note that this is not parallel computation; it simply stores two replicas of the computed result.
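A minimal sketch of picking a storage level explicitly (the RDD is the word-count pipeline used throughout this post; remember that an RDD's storage level can be assigned only once):

import org.apache.spark.storage.StorageLevel

val counts = sc.textFile("/README.md").flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)

counts.persist(StorageLevel.MEMORY_AND_DISK)     // memory first, spill to disk only when memory is full
// counts.persist(StorageLevel.MEMORY_ONLY_SER)  // serialized in memory: smaller footprint, extra CPU to read
// counts.persist(StorageLevel.MEMORY_ONLY_2)    // two in-memory replicas: "space for time"

counts.count()       // the first action actually fills the cache
counts.unpersist()   // eagerly drop the cached data once it is no longer needed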



 

scala> val partitionsReadmeRdd = sc.textFile("/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).count

 

took 6.270138 s

 

scala> val partitionsReadmeRdd =  sc.textFile("/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).cache.count

 

took 4.147545 s

 

scala> val partitionsReadmeRdd = sc.textFile("/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).cache.count

 

took 4.914212 s

 

 scala> val cacheRdd = sc.textFile("/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).cache

 scala> cacheRdd.count

 

 took 3.371621

 

 scala> val cacheRdd = sc.textFile("/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).cache

 scala> cacheRdd.count

 

 

took 0.943499 s

Wow, what a difference!

 

 scala>  sc.textFile("/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).cache.count

 

took 5.603903

 

 scala>  sc.textFile("/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).cache.count

 

 

took 4.146627

 

scala>  sc.textFile("/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).cache.count

 

took 3.071122

 

Do not chain another operator immediately after cache!

In real projects, if another operator follows cache directly in the same expression (instead of keeping the cached RDD in a variable), the whole computation is triggered again.
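A minimal sketch of the wrong and right patterns, using the same word-count pipeline as the timings above:

// Wrong: the cached RDD is never kept in a variable, so every such line rebuilds the whole lineage.
sc.textFile("/README.md").flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _, 1).cache.count

// Right: keep a reference to the cached RDD and reuse it; only the first action pays the full price.
val cacheRdd = sc.textFile("/README.md").flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _, 1).cache
cacheRdd.count   // slow: computes and populates the cache
cacheRdd.count   // fast: served from the cache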

 

Note: cache is not an action.

 

How do you invalidate a cached RDD?

A: unpersist.

 

persist is lazy, while unpersist is eager. cache is a special case of persist.

 

What is the difference between cache and persist?

A: persist can put the data on disk, in memory, or in both, and can keep multiple replicas.

  cache can only put the data in memory, with a single replica.

When memory is not enough, which directory on disk does persist spill to?

A: the local directories of the executor process.
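For reference, the spill location is governed by the executor's local directories (the spark.local.dir setting, /tmp by default); a sketch of overriding it in a standalone application, with a made-up path:

import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf()
  .setAppName("persist-spill-demo")
  .set("spark.local.dir", "/data/spark-local")   // example path; point this at a fast local disk
val sc = new SparkContext(conf)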

 

OK, those are the two aspects of RDD persistence.

The first aspect: the commonly used action-level operations.
The second aspect: the different ways of persisting, and how they behave internally.

  Tip: do not chain another operator immediately after cache! In real projects, if another operator follows cache directly, the whole computation is triggered again.

 

As a rule, Spark does not reach across machines to grab in-memory data; it would rather queue tasks. Moving code to the data beats moving data to the code.

 

 

2. Broadcast variables

Why do we need broadcasts when working with RDDs?

A: large variables, joins, redundancy, reducing data movement, communication, state, cluster-wide messages, sharing, shipping data ahead of time because network transfer is slow, large data that strains the network, less communication, and the need to keep things in sync.

 

Why do large variables need to be broadcast?

A: because when tasks run and read the full data set, each task copies its own replica of the data as it executes. If the variable is large, say a million entries, every task copies all million.

 

(2) Broadcast (shared within the Executor's threads): there is no need for every task to copy its own replica, because the broadcast value is globally unique per Executor. This greatly reduces OOM risk, communication, and redundancy for shared variables. Broadcasting places the data in the Executor's memory, and all tasks inside that Executor share the single global copy, which cuts down network transfer.
        Without broadcasting, each task copies the data (the variable) when it reads it, because of functional programming (variables are immutable) and because not copying would make the state easy to corrupt. That is fine when the data is small (a small reference or a small value), but a large variable easily causes OOM (each task holds its copy in memory while running), and network transfer is slow, so the data should be shipped ahead of time; hence the goals of removing redundancy, sharing, and reducing communication.
Broadcast variables:

Broadcast variables allow the programmer to cache a read-only variable on each machine rather than shipping a copy of it with tasks. They can be used, for example, to give every node a copy of a large input dataset efficiently. Spark also attempts to distribute broadcast variables using efficient broadcast algorithms to reduce communication cost.

Spark actions are executed through a series of stages, separated by distributed shuffle operations. Spark automatically broadcasts the common data needed by tasks within each stage. Data broadcast this way is cached in serialized form and deserialized before each task runs. This means that explicitly creating broadcast variables is only useful when tasks across multiple stages need the same data, or when caching the data in deserialized form is important.

(This passage is adapted from: http://blog.csdn.net/happyanger6/article/details/46576831)
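To make the "join" motivation concrete, here is a hedged sketch of a map-side join with a broadcast lookup table (all names and data below are made up for illustration):

// A small dimension table that every task needs: broadcast it once per Executor.
val cityNames = Map(1 -> "Beijing", 2 -> "Shanghai", 3 -> "Guangzhou")
val bcCityNames = sc.broadcast(cityNames)

// A large fact RDD of (cityId, amount) pairs.
val orders = sc.parallelize(Seq((1, 100.0), (2, 250.0), (3, 80.0), (1, 40.0)))

// Map-side join: look up the broadcast value inside each task, no shuffle required.
val withNames = orders.map { case (cityId, amount) =>
  (bcCityNames.value.getOrElse(cityId, "unknown"), amount)
}
withNames.collect().foreach(println)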

 

 

(Figure: broadcast mechanism diagram)

 

Reference: http://blog.csdn.net/kxr0502/article/details/50574561

 

 

Q: does reading a broadcast variable consume network transfer?

A: No. The broadcast value sits in the Executor's memory, so reading it costs no network traffic.

 

Q: is it correct that a broadcast variable ships one copy of the data to each Executor, rather than one copy to each task?

A: Yes.

 

A broadcast is a memory-level, global, read-only variable that the Driver sends to every Executor allocated to the current Application. The threads in each Executor's thread pool share that global variable, which greatly reduces network transfer (otherwise every task would ship the variable once), saves a great deal of memory, and in turn implicitly improves effective CPU utilization.

 

Hands-on: creating a broadcast variable

 

scala> val number = 10
number: Int = 10

scala> val broadcastNumber = sc.broadcast(number)

16/09/29 17:26:47 INFO storage.MemoryStore: ensureFreeSpace(40) called with curMem=1782734, maxMem=560497950
16/09/29 17:26:47 INFO storage.MemoryStore: Block broadcast_38 stored as values in memory (estimated size 40.0 B, free 532.8 MB)
16/09/29 17:26:48 INFO storage.MemoryStore: ensureFreeSpace(97) called with curMem=1782774, maxMem=560497950
16/09/29 17:26:48 INFO storage.MemoryStore: Block broadcast_38_piece0 stored as bytes in memory (estimated size 97.0 B, free 532.8 MB)
16/09/29 17:26:48 INFO storage.BlockManagerInfo: Added broadcast_38_piece0 in memory on 192.168.80.128:40914 (size: 97.0 B, free: 534.4 MB)
16/09/29 17:26:48 INFO spark.SparkContext: Created broadcast 38 from broadcast at <console>:23
broadcastNumber: org.apache.spark.broadcast.Broadcast[Int] = Broadcast(38)

 

scala> val data = sc.parallelize
<console>:21: error: missing arguments for method parallelize in class SparkContext;
follow this method with `_' if you want to treat it as a partially applied function
val data = sc.parallelize
^

scala> val data = sc.parallelize(1 to 100)
data: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[61] at parallelize at <console>:21

scala> val bn = data.map(_* broadcastNumber.value)
bn: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[62] at map at <console>:27

scala>

Note that it is the tasks that consume the broadcast value, but in our code we still program against the RDD.

// A broadcast variable is created by calling SparkContext.broadcast(v) on a variable v. It is a wrapper around v, and its value can be accessed through the value method.

 

Q: can a broadcast variable hold many values?

A: of course; just wrap them in a Java bean or a Scala class (see the sketch after the collect output below).

  For example, here the broadcast variable is broadcastNumber, which exposes members such as value.

scala> val broadcastNumber = sc.broadcast(number)

 scala> val bn = data.map(_* broadcastNumber.value) 

 

 

scala> bn.collect

 

res12: Array[Int] = Array(10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390, 400, 410, 420, 430, 440, 450, 460, 470, 480, 490, 500, 510, 520, 530, 540, 550, 560, 570, 580, 590, 600, 610, 620, 630, 640, 650, 660, 670, 680, 690, 700, 710, 720, 730, 740, 750, 760, 770, 780, 790, 800, 810, 820, 830, 840, 850, 860, 870, 880, 890, 900, 910, 920, 930, 940, 950, 960, 970, 980, 990, 1000)

scala>
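A hedged sketch of wrapping several read-only values into a single broadcast, using a made-up case class:

// A made-up container type: one broadcast object carrying several fields.
case class ModelConfig(factor: Int, offset: Int, label: String)

val bcConfig = sc.broadcast(ModelConfig(factor = 10, offset = 3, label = "demo"))

val nums = sc.parallelize(1 to 100)
val adjusted = nums.map(x => x * bcConfig.value.factor + bcConfig.value.offset)
adjusted.take(5)   // Array(13, 23, 33, 43, 53)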

 

From the mechanism, the diagrams, and the hands-on run above, we now have a fairly complete picture of broadcasting.

 

broadcast source-code analysis

Reference: http://www.cnblogs.com/seaspring/p/5682053.html

 

 

BroadcastManager source code

 

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.broadcast

import java.util.concurrent.atomic.AtomicLong

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.util.Utils

private[spark] class BroadcastManager(
    val isDriver: Boolean,
    conf: SparkConf,
    securityManager: SecurityManager)
  extends Logging {

  private var initialized = false
  private var broadcastFactory: BroadcastFactory = null

  initialize()

  // Called by SparkContext or Executor before using Broadcast
  private def initialize() {
    synchronized {
      if (!initialized) {
        val broadcastFactoryClass =
          conf.get("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory")

        broadcastFactory =
          Utils.classForName(broadcastFactoryClass).newInstance.asInstanceOf[BroadcastFactory]

        // Initialize appropriate BroadcastFactory and BroadcastObject
        broadcastFactory.initialize(isDriver, conf, securityManager)

        initialized = true
      }
    }
  }

  def stop() {
    broadcastFactory.stop()
  }

  private val nextBroadcastId = new AtomicLong(0)

  def newBroadcast[T: ClassTag](value_ : T, isLocal: Boolean): Broadcast[T] = {
    broadcastFactory.newBroadcast[T](value_, isLocal, nextBroadcastId.getAndIncrement())
  }

  def unbroadcast(id: Long, removeFromDriver: Boolean, blocking: Boolean) {
    broadcastFactory.unbroadcast(id, removeFromDriver, blocking)
  }
}


 

 

Broadcast source code

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.broadcast

import java.io.Serializable

import org.apache.spark.SparkException
import org.apache.spark.Logging
import org.apache.spark.util.Utils

import scala.reflect.ClassTag

/**
* A broadcast variable. Broadcast variables allow the programmer to keep a read-only variable
* cached on each machine rather than shipping a copy of it with tasks. They can be used, for
* example, to give every node a copy of a large input dataset in an efficient manner. Spark also
* attempts to distribute broadcast variables using efficient broadcast algorithms to reduce
* communication cost.
*
* Broadcast variables are created from a variable `v` by calling
* [[org.apache.spark.SparkContext#broadcast]].
* The broadcast variable is a wrapper around `v`, and its value can be accessed by calling the
* `value` method. The interpreter session below shows this:
*
* {{{
* scala> val broadcastVar = sc.broadcast(Array(1, 2, 3))
* broadcastVar: org.apache.spark.broadcast.Broadcast[Array[Int]] = Broadcast(0)
*
* scala> broadcastVar.value
* res0: Array[Int] = Array(1, 2, 3)
* }}}
*
* After the broadcast variable is created, it should be used instead of the value `v` in any
* functions run on the cluster so that `v` is not shipped to the nodes more than once.
* In addition, the object `v` should not be modified after it is broadcast in order to ensure
* that all nodes get the same value of the broadcast variable (e.g. if the variable is shipped
* to a new node later).
*
* @param id A unique identifier for the broadcast variable.
* @tparam T Type of the data contained in the broadcast variable.
*/
abstract class Broadcast[T: ClassTag](val id: Long) extends Serializable with Logging {

/**
* Flag signifying whether the broadcast variable is valid
* (that is, not already destroyed) or not.
*/
@volatile private var _isValid = true

private var _destroySite = ""

/** Get the broadcasted value. */
def value: T = {
assertValid()
getValue()
}

/**
* Asynchronously delete cached copies of this broadcast on the executors.
* If the broadcast is used after this is called, it will need to be re-sent to each executor.
*/
def unpersist() {
unpersist(blocking = false)
}

/**
* Delete cached copies of this broadcast on the executors. If the broadcast is used after
* this is called, it will need to be re-sent to each executor.
* @param blocking Whether to block until unpersisting has completed
*/
def unpersist(blocking: Boolean) {
assertValid()
doUnpersist(blocking)
}


/**
* Destroy all data and metadata related to this broadcast variable. Use this with caution;
* once a broadcast variable has been destroyed, it cannot be used again.
* This method blocks until destroy has completed
*/
def destroy() {
destroy(blocking = true)
}

/**
* Destroy all data and metadata related to this broadcast variable. Use this with caution;
* once a broadcast variable has been destroyed, it cannot be used again.
* @param blocking Whether to block until destroy has completed
*/
private[spark] def destroy(blocking: Boolean) {
assertValid()
_isValid = false
_destroySite = Utils.getCallSite().shortForm
logInfo("Destroying %s (from %s)".format(toString, _destroySite))
doDestroy(blocking)
}

/**
* Whether this Broadcast is actually usable. This should be false once persisted state is
* removed from the driver.
*/
private[spark] def isValid: Boolean = {
_isValid
}

/**
* Actually get the broadcasted value. Concrete implementations of Broadcast class must
* define their own way to get the value.
*/
protected def getValue(): T

/**
* Actually unpersist the broadcasted value on the executors. Concrete implementations of
* Broadcast class must define their own logic to unpersist their own data.
*/
protected def doUnpersist(blocking: Boolean)

/**
* Actually destroy all data and metadata related to this broadcast variable.
* Implementation of Broadcast class must define their own logic to destroy their own
* state.
*/
protected def doDestroy(blocking: Boolean)

/** Check if this broadcast is valid. If not valid, exception is thrown. */
protected def assertValid() {
if (!_isValid) {
throw new SparkException(
"Attempted to use %s after it was destroyed (%s) ".format(toString, _destroySite))
}
}

override def toString: String = "Broadcast(" + id + ")"
}

The rest is not covered one by one here.

 

 



 

 

 

3. Accumulators

Why do we need accumulators?

A: First, each task runs on its own copy of the data.

       Second, we already have global and local variables, and we have broadcasts; why do we still need accumulators?

 

(3) Accumulator: a globally unique state object, created through SparkContext and controlled by the Driver. When tasks actually run, every modification is guaranteed to apply to this single global object. It is readable in the Driver; Executors can only add to it.

        An accumulator is a variable that is only "added to" through the relevant operations and can therefore be supported efficiently in parallel. It can be used to implement counters and sums. Spark natively supports only numeric accumulators, and programmers can add support for new types. If an accumulator is created with a name, it shows up in Spark's web UI, which helps in understanding the progress of each stage. (This is not yet supported for Python.)

        An accumulator is created by calling SparkContext.accumulator(v) on an initialized value v. Tasks running on the cluster can then add to it with add or the "+=" operator, but they cannot read its value. Only the driver program can read the accumulator's value, through its value method.

   Characteristics of accumulators: they are global. Executors can only modify them but cannot read them; only the Driver can read them (because the Driver controls the state of the whole cluster). Updates from different Executors do not overwrite each other (a locking mechanism is used).

   

Hands-on with accumulators:

 

scala> val sum = sc.accumulator(0)
sum: org.apache.spark.Accumulator[Int] = 0

scala> val data = sc.parallelize(1 to 100)
data: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[63] at parallelize at <console>:21

scala> val result = data.foreach(item =>sum += item)

 

 

took 6.548568 s
result: Unit = ()

scala> println(sum)
5050

 

 

Accumulators are extremely important for recording globally unique state across the cluster; they hold the one global state variable, so their importance speaks for itself.
Values are read in the Driver and computed in the Executors:
1. An accumulator is globally (cluster-wide) unique and only ever grows (tasks inside Executors modify it, i.e. add to it);
2. An accumulator is shared across Executors.
My understanding: it is a cluster-wide global variable; whoever updates it conceptually gets it from the Driver, and when the next Executor uses it, it builds on the result accumulated so far, i.e. the value kept by the Driver.
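A hedged sketch of a named accumulator used the way the text describes (Executors only add, the Driver reads; the name makes it visible in the web UI; the data and names are made up). This is the Spark 1.x API matching this post; Spark 2.x would use sc.longAccumulator instead.

val badRecords = sc.accumulator(0, "badRecords")   // named, so it shows up in the web UI

val lines = sc.parallelize(Seq("1", "2", "oops", "4", "x"))
val parsed = lines.flatMap { s =>
  try { Some(s.toInt) }
  catch { case _: NumberFormatException => badRecords += 1; None }   // tasks may only add
}

parsed.count()             // an action must run before the accumulator is populated
println(badRecords.value)  // only the Driver may read the value: prints 2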
 

 

Accumulator source code

 

 

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark

import java.io.{ObjectInputStream, Serializable}

import scala.collection.generic.Growable
import scala.collection.Map
import scala.collection.mutable
import scala.ref.WeakReference
import scala.reflect.ClassTag

import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.util.Utils

/**
* A data type that can be accumulated, ie has an commutative and associative "add" operation,
* but where the result type, `R`, may be different from the element type being added, `T`.
*
* You must define how to add data, and how to merge two of these together. For some data types,
* such as a counter, these might be the same operation. In that case, you can use the simpler
* [[org.apache.spark.Accumulator]]. They won't always be the same, though -- e.g., imagine you are
* accumulating a set. You will add items to the set, and you will union two sets together.
*
* @param initialValue initial value of accumulator
* @param param helper object defining how to add elements of type `R` and `T`
* @param name human-readable name for use in Spark's web UI
* @param internal if this [[Accumulable]] is internal. Internal [[Accumulable]]s will be reported
* to the driver via heartbeats. For internal [[Accumulable]]s, `R` must be
* thread safe so that they can be reported correctly.
* @tparam R the full accumulated data (result type)
* @tparam T partial data that can be added in
*/
class Accumulable[R, T] private[spark] (
@transient initialValue: R,
param: AccumulableParam[R, T],
val name: Option[String],
internal: Boolean)
extends Serializable {

private[spark] def this(
@transient initialValue: R, param: AccumulableParam[R, T], internal: Boolean) = {
this(initialValue, param, None, internal)
}

def this(@transient initialValue: R, param: AccumulableParam[R, T], name: Option[String]) =
this(initialValue, param, name, false)

def this(@transient initialValue: R, param: AccumulableParam[R, T]) =
this(initialValue, param, None)

val id: Long = Accumulators.newId

@volatile @transient private var value_ : R = initialValue // Current value on master
val zero = param.zero(initialValue) // Zero value to be passed to workers
private var deserialized = false

Accumulators.register(this)

/**
* If this [[Accumulable]] is internal. Internal [[Accumulable]]s will be reported to the driver
* via heartbeats. For internal [[Accumulable]]s, `R` must be thread safe so that they can be
* reported correctly.
*/
private[spark] def isInternal: Boolean = internal

/**
* Add more data to this accumulator / accumulable
* @param term the data to add
*/
def += (term: T) { value_ = param.addAccumulator(value_, term) }

/**
* Add more data to this accumulator / accumulable
* @param term the data to add
*/
def add(term: T) { value_ = param.addAccumulator(value_, term) }

/**
* Merge two accumulable objects together
*
* Normally, a user will not want to use this version, but will instead call `+=`.
* @param term the other `R` that will get merged with this
*/
def ++= (term: R) { value_ = param.addInPlace(value_, term)}

/**
* Merge two accumulable objects together
*
* Normally, a user will not want to use this version, but will instead call `add`.
* @param term the other `R` that will get merged with this
*/
def merge(term: R) { value_ = param.addInPlace(value_, term)}

/**
* Access the accumulator's current value; only allowed on master.
*/
def value: R = {
if (!deserialized) {
value_
} else {
throw new UnsupportedOperationException("Can't read accumulator value in task")
}
}

/**
* Get the current value of this accumulator from within a task.
*
* This is NOT the global value of the accumulator. To get the global value after a
* completed operation on the dataset, call `value`.
*
* The typical use of this method is to directly mutate the local value, eg., to add
* an element to a Set.
*/
def localValue: R = value_

/**
* Set the accumulator's value; only allowed on master.
*/
def value_= (newValue: R) {
if (!deserialized) {
value_ = newValue
} else {
throw new UnsupportedOperationException("Can't assign accumulator value in task")
}
}

/**
* Set the accumulator's value; only allowed on master
*/
def setValue(newValue: R) {
this.value = newValue
}

// Called by Java when deserializing an object
private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
in.defaultReadObject()
value_ = zero
deserialized = true
// Automatically register the accumulator when it is deserialized with the task closure.
//
// Note internal accumulators sent with task are deserialized before the TaskContext is created
// and are registered in the TaskContext constructor. Other internal accumulators, such SQL
// metrics, still need to register here.
val taskContext = TaskContext.get()
if (taskContext != null) {
taskContext.registerAccumulator(this)
}
}

override def toString: String = if (value_ == null) "null" else value_.toString
}

/**
* Helper object defining how to accumulate values of a particular type. An implicit
* AccumulableParam needs to be available when you create [[Accumulable]]s of a specific type.
*
* @tparam R the full accumulated data (result type)
* @tparam T partial data that can be added in
*/
trait AccumulableParam[R, T] extends Serializable {
/**
* Add additional data to the accumulator value. Is allowed to modify and return `r`
* for efficiency (to avoid allocating objects).
*
* @param r the current value of the accumulator
* @param t the data to be added to the accumulator
* @return the new value of the accumulator
*/
def addAccumulator(r: R, t: T): R

/**
* Merge two accumulated values together. Is allowed to modify and return the first value
* for efficiency (to avoid allocating objects).
*
* @param r1 one set of accumulated data
* @param r2 another set of accumulated data
* @return both data sets merged together
*/
def addInPlace(r1: R, r2: R): R

/**
* Return the "zero" (identity) value for an accumulator type, given its initial value. For
* example, if R was a vector of N dimensions, this would return a vector of N zeroes.
*/
def zero(initialValue: R): R
}

private[spark] class
GrowableAccumulableParam[R <% Growable[T] with TraversableOnce[T] with Serializable: ClassTag, T]
extends AccumulableParam[R, T] {

def addAccumulator(growable: R, elem: T): R = {
growable += elem
growable
}

def addInPlace(t1: R, t2: R): R = {
t1 ++= t2
t1
}

def zero(initialValue: R): R = {
// We need to clone initialValue, but it's hard to specify that R should also be Cloneable.
// Instead we'll serialize it to a buffer and load it back.
val ser = new JavaSerializer(new SparkConf(false)).newInstance()
val copy = ser.deserialize[R](ser.serialize(initialValue))
copy.clear() // In case it contained stuff
copy
}
}

/**
* A simpler value of [[Accumulable]] where the result type being accumulated is the same
* as the types of elements being merged, i.e. variables that are only "added" to through an
* associative operation and can therefore be efficiently supported in parallel. They can be used
* to implement counters (as in MapReduce) or sums. Spark natively supports accumulators of numeric
* value types, and programmers can add support for new types.
*
* An accumulator is created from an initial value `v` by calling [[SparkContext#accumulator]].
* Tasks running on the cluster can then add to it using the [[Accumulable#+=]] operator.
* However, they cannot read its value. Only the driver program can read the accumulator's value,
* using its value method.
*
* The interpreter session below shows an accumulator being used to add up the elements of an array:
*
* {{{
* scala> val accum = sc.accumulator(0)
* accum: spark.Accumulator[Int] = 0
*
* scala> sc.parallelize(Array(1, 2, 3, 4)).foreach(x => accum += x)
* ...
* 10/09/29 18:41:08 INFO SparkContext: Tasks finished in 0.317106 s
*
* scala> accum.value
* res2: Int = 10
* }}}
*
* @param initialValue initial value of accumulator
* @param param helper object defining how to add elements of type `T`
* @tparam T result type
*/
class Accumulator[T] private[spark] (
@transient private[spark] val initialValue: T,
param: AccumulatorParam[T],
name: Option[String],
internal: Boolean)
extends Accumulable[T, T](initialValue, param, name, internal) {

def this(initialValue: T, param: AccumulatorParam[T], name: Option[String]) = {
this(initialValue, param, name, false)
}

def this(initialValue: T, param: AccumulatorParam[T]) = {
this(initialValue, param, None, false)
}
}

/**
* A simpler version of [[org.apache.spark.AccumulableParam]] where the only data type you can add
* in is the same type as the accumulated value. An implicit AccumulatorParam object needs to be
* available when you create Accumulators of a specific type.
*
* @tparam T type of value to accumulate
*/
trait AccumulatorParam[T] extends AccumulableParam[T, T] {
def addAccumulator(t1: T, t2: T): T = {
addInPlace(t1, t2)
}
}

object AccumulatorParam {

// The following implicit objects were in SparkContext before 1.2 and users had to
// `import SparkContext._` to enable them. Now we move them here to make the compiler find
// them automatically. However, as there are duplicate codes in SparkContext for backward
// compatibility, please update them accordingly if you modify the following implicit objects.

implicit object DoubleAccumulatorParam extends AccumulatorParam[Double] {
def addInPlace(t1: Double, t2: Double): Double = t1 + t2
def zero(initialValue: Double): Double = 0.0
}

implicit object IntAccumulatorParam extends AccumulatorParam[Int] {
def addInPlace(t1: Int, t2: Int): Int = t1 + t2
def zero(initialValue: Int): Int = 0
}

implicit object LongAccumulatorParam extends AccumulatorParam[Long] {
def addInPlace(t1: Long, t2: Long): Long = t1 + t2
def zero(initialValue: Long): Long = 0L
}

implicit object FloatAccumulatorParam extends AccumulatorParam[Float] {
def addInPlace(t1: Float, t2: Float): Float = t1 + t2
def zero(initialValue: Float): Float = 0f
}

// TODO: Add AccumulatorParams for other types, e.g. lists and strings
}

// TODO: The multi-thread support in accumulators is kind of lame; check
// if there's a more intuitive way of doing it right
private[spark] object Accumulators extends Logging {
/**
* This global map holds the original accumulator objects that are created on the driver.
* It keeps weak references to these objects so that accumulators can be garbage-collected
* once the RDDs and user-code that reference them are cleaned up.
*/
val originals = mutable.Map[Long, WeakReference[Accumulable[_, _]]]()

private var lastId: Long = 0

def newId(): Long = synchronized {
lastId += 1
lastId
}

def register(a: Accumulable[_, _]): Unit = synchronized {
originals(a.id) = new WeakReference[Accumulable[_, _]](a)
}

def remove(accId: Long) {
synchronized {
originals.remove(accId)
}
}

// Add values to the original accumulators with some given IDs
def add(values: Map[Long, Any]): Unit = synchronized {
for ((id, value) <- values) {
if (originals.contains(id)) {
// Since we are now storing weak references, we must check whether the underlying data
// is valid.
originals(id).get match {
case Some(accum) => accum.asInstanceOf[Accumulable[Any, Any]] ++= value
case None =>
throw new IllegalAccessError("Attempted to access garbage collected Accumulator.")
}
} else {
logWarning(s"Ignoring accumulator update for unknown accumulator id $id")
}
}
}

}

private[spark] object InternalAccumulator {
val PEAK_EXECUTION_MEMORY = "peakExecutionMemory"
val TEST_ACCUMULATOR = "testAccumulator"

// For testing only.
// This needs to be a def since we don't want to reuse the same accumulator across stages.
private def maybeTestAccumulator: Option[Accumulator[Long]] = {
if (sys.props.contains("spark.testing")) {
Some(new Accumulator(
0L, AccumulatorParam.LongAccumulatorParam, Some(TEST_ACCUMULATOR), internal = true))
} else {
None
}
}

/**
* Accumulators for tracking internal metrics.
*
* These accumulators are created with the stage such that all tasks in the stage will
* add to the same set of accumulators. We do this to report the distribution of accumulator
* values across all tasks within each stage.
*/
def create(sc: SparkContext): Seq[Accumulator[Long]] = {
val internalAccumulators = Seq(
// Execution memory refers to the memory used by internal data structures created
// during shuffles, aggregations and joins. The value of this accumulator should be
// approximately the sum of the peak sizes across all such data structures created
// in this task. For SQL jobs, this only tracks all unsafe operators and ExternalSort.
new Accumulator(
0L, AccumulatorParam.LongAccumulatorParam, Some(PEAK_EXECUTION_MEMORY), internal = true)
) ++ maybeTestAccumulator.toSeq
internalAccumulators.foreach { accumulator =>
sc.cleaner.foreach(_.registerAccumulatorForCleanup(accumulator))
}
internalAccumulators
}
}

 

 

 

 

Reference

Teacher Wang Jialin (王家林), a master of big data technology and a leading Spark expert in China:

DT大數據夢工廠 (DT Big Data Dream Factory)

Sina Weibo: www.weibo.com/ilovepains/

WeChat public account: DT_Spark

Blog: http://.blog.sina.com.cn/ilovepains

Tel: 18610086859

Email: 18610086859@vip.126.com

 

 

 

Reference links:

http://blog.csdn.net/kxr0502/article/details/50574561

http://blog.csdn.net/happyanger6/article/details/46576831

http://blog.csdn.net/happyanger6/article/details/46552823
