1. Custom Partitioner
1.1 Overview
The default partitioning strategy is hash partitioning, which is similar to Hadoop. For a detailed introduction to partitioning, see: https://blog.csdn.net/high2011/article/details/68491115
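As a quick illustration of that default, here is a minimal local-mode sketch (the keys and partition count are made up for this example, not from the original post): reduceByKey falls back to a HashPartitioner when no partitioner is supplied, and the same placement can be requested explicitly with partitionBy.

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

object HashPartitionDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("HashPartitionDemo").setMaster("local[2]")
    val sc = new SparkContext(conf)

    val pairs = sc.parallelize(List(("a", 1), ("b", 1), ("a", 2), ("c", 1)))

    // Hash partitioning into 3 partitions: a key k lands in
    // k.hashCode modulo 3 (made non-negative)
    val hashed = pairs.partitionBy(new HashPartitioner(3))

    // Print which partition each pair ended up in
    hashed.mapPartitionsWithIndex((idx, it) =>
      it.map { case (k, v) => s"partition $idx: ($k, $v)" }
    ).collect().foreach(println)

    sc.stop()
  }
}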
1.2 Implementation
package cn.itcast.spark.day3

import java.net.URL

import org.apache.spark.{HashPartitioner, Partitioner, SparkConf, SparkContext}

import scala.collection.mutable

/**
 * Created by root on 2016/5/18.
 */
object UrlCountPartition {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("UrlCountPartition").setMaster("local[2]")
    val sc = new SparkContext(conf)

    // rdd1 splits each line and emits (URL, 1) tuples
    val rdd1 = sc.textFile("c://itcast.log").map(line => {
      val f = line.split("\t")
      (f(1), 1)
    })
    val rdd2 = rdd1.reduceByKey(_ + _)
    // Re-key by host: (host, (url, count))
    val rdd3 = rdd2.map(t => {
      val url = t._1
      val host = new URL(url).getHost
      (host, (url, t._2))
    })
    // Collect the distinct hosts to the driver and build one partition per host
    val ints = rdd3.map(_._1).distinct().collect()
    val hostParitioner = new HostParitioner(ints)
    // val rdd4 = rdd3.partitionBy(new HashPartitioner(ints.length))

    // After repartitioning by host, keep the top 2 URLs by count within each partition
    val rdd4 = rdd3.partitionBy(hostParitioner).mapPartitions(it => {
      it.toList.sortBy(_._2._2).reverse.take(2).iterator
    })
    rdd4.saveAsTextFile("c://out4")
    //println(rdd4.collect().toBuffer)
    sc.stop()
  }
}

/**
 * Decides which partition a given key goes to
 * @param ins the distinct hosts; one partition per host
 */
class HostParitioner(ins: Array[String]) extends Partitioner {

  val parMap = new mutable.HashMap[String, Int]()
  var count = 0
  for (i <- ins) {
    parMap += (i -> count)
    count += 1
  }

  override def numPartitions: Int = ins.length

  override def getPartition(key: Any): Int = {
    // Unknown hosts fall back to partition 0
    parMap.getOrElse(key.toString, 0)
  }
}
// The idea carries over from Hadoop, so it is not repeated here.
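Since a custom Partitioner is just an ordinary class with two members (numPartitions, and a getPartition that must map every key into [0, numPartitions)), it can be sanity-checked without starting a Spark job. A minimal sketch, reusing the HostParitioner above with hypothetical host names:

object HostParitionerCheck {
  def main(args: Array[String]): Unit = {
    val hosts = Array("java.itcast.cn", "php.itcast.cn", "net.itcast.cn")
    val p = new HostParitioner(hosts)

    assert(p.numPartitions == 3)
    // Every known host maps to its own stable partition index
    hosts.foreach(h => println(s"$h -> ${p.getPartition(h)}"))
    // Unknown keys fall back to partition 0 via getOrElse
    println(s"unknown.host -> ${p.getPartition("unknown.host")}")
  }
}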
2. Custom Sorting
This essentially combines the implicit conversions covered earlier. (A case class is used here so that instances can be obtained without new; case classes can also be used in pattern matching.)
package cn.itcast.spark.day3

import org.apache.spark.{SparkConf, SparkContext}

object OrderContext {
  // Rule: higher faceValue ranks higher; on equal faceValue, the younger (smaller age) ranks higher
  implicit val girlOrdering = new Ordering[Girl] {
    override def compare(x: Girl, y: Girl): Int = {
      if (x.faceValue > y.faceValue) 1
      else if (x.faceValue == y.faceValue) {
        if (x.age > y.age) -1 else 1
      } else -1
    }
  }
}

/**
 * Created by root on 2016/5/18.
 */
// Sort rule: first by faceValue, then compare age
// Fields: name, faceValue, age
object CustomSort {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("CustomSort").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val rdd1 = sc.parallelize(List(("yuihatano", 90, 28, 1), ("angelababy", 90, 27, 2), ("JuJingYi", 95, 22, 3)))
    import OrderContext._
    val rdd2 = rdd1.sortBy(x => Girl(x._2, x._3), false)
    println(rdd2.collect().toBuffer)
    sc.stop()
  }
}

/**
 * First approach: have the case class extend Ordered directly
 * @param faceValue
 * @param age
case class Girl(val faceValue: Int, val age: Int) extends Ordered[Girl] with Serializable {
  override def compare(that: Girl): Int = {
    if (this.faceValue == that.faceValue) {
      that.age - this.age
    } else {
      this.faceValue - that.faceValue
    }
  }
}
*/

/**
 * Second approach: sort via an implicit Ordering (implicit conversion)
 * @param faceValue
 * @param age
 */
case class Girl(faceValue: Int, age: Int) extends Serializable
// A review of implicit conversions; essentially nothing new here.
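One last design note: the hand-written compare in OrderContext can also be expressed with Ordering.by, encoding "higher faceValue first, and on ties the younger wins" as a tuple key (the negated age makes a smaller age compare as greater). A minimal equivalent sketch, assuming the same Girl case class as above:

import org.apache.spark.{SparkConf, SparkContext}

object CustomSortByTuple {
  // Equivalent to the hand-written compare: ascending by faceValue,
  // then ascending by -age (i.e. the younger Girl is "greater")
  implicit val girlOrdering: Ordering[Girl] = Ordering.by(g => (g.faceValue, -g.age))

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CustomSortByTuple").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val rdd1 = sc.parallelize(List(("yuihatano", 90, 28, 1), ("angelababy", 90, 27, 2), ("JuJingYi", 95, 22, 3)))
    // ascending = false puts the "greatest" Girl first
    val rdd2 = rdd1.sortBy(x => Girl(x._2, x._3), false)
    println(rdd2.collect().toBuffer)
    sc.stop()
  }
}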