<iframe width="800" height="500" src="//player.bilibili.com/player.html?aid=38193405&cid=67137841&page=2" scrolling="no" border="0" frameborder="no" framespacing="0" allowfullscreen="true"> </iframe>
```scala
// Read a text file as a Dataset[String]; each element is one line
val dataSet = spark.read.textFile("/home/liuwen/data/a.txt")

// Count the number of lines in the Dataset
dataSet.count()
```
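The snippets here assume a live `spark` session, as provided by spark-shell. For a standalone program, a minimal sketch of the setup might look like the following; the app name and master URL are assumptions for illustration:

```scala
import org.apache.spark.sql.SparkSession

// Minimal sketch: build a local SparkSession. In spark-shell the
// `spark` object already exists, so this step is unnecessary there.
val spark = SparkSession.builder()
  .appName("DatasetBasics") // hypothetical app name
  .master("local[*]")       // assumption: run locally with all cores
  .getOrCreate()
```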
```scala
// first() actually delegates to head() under the hood
dataSet.first()

// By default, show() prints the first 20 rows and truncates each string to 20 characters
dataSet.show()

// Print the first 10 rows without truncation
dataSet.show(10, false)
```
```scala
// Keep only the lines that contain the word "spark"
dataSet.filter(line => line.contains("spark"))
```
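`filter` is lazy and only returns a new Dataset; nothing runs until an action is called. A small follow-up sketch that materializes the result:

```scala
// Count how many lines mention "spark"; count() is the action
// that actually triggers the computation
val sparkLines = dataSet.filter(line => line.contains("spark"))
println(s"lines containing 'spark': ${sparkLines.count()}")
```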
```scala
import spark.implicits._

// Map each line to its word count, then sum the counts:
// the result is the total number of words in the file
val lineWordLength = dataSet.map(line => line.split(" ").size)
val result = lineWordLength.reduce((a, b) => a + b)
```
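The same total can be obtained by flattening the lines into individual words first; an equivalent sketch:

```scala
// Equivalent total word count: flatten lines into words, then count them
val totalWords = dataSet.flatMap(line => line.split(" ")).count()
```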
```scala
import spark.implicits._

// Map each line to its word count, then keep the maximum:
// the result is the word count of the longest line
val lineWordLength = dataSet.map(line => line.split(" ").size)
val result = lineWordLength.reduce((a, b) => Math.max(a, b))
```
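The same maximum can also be computed declaratively. A sketch, assuming the default column name `value` that Spark assigns to a `Dataset[Int]`:

```scala
import org.apache.spark.sql.functions.max

// Declarative equivalent: aggregate the "value" column with max()
lineWordLength.agg(max("value")).show()
```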
```scala
import spark.implicits._

// Reduce pairwise, keeping whichever line has more words:
// this returns the longest line itself rather than its length
val result = dataSet.reduce((a, b) => {
  if (a.split(" ").size > b.split(" ").size) a else b
})
```
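Each comparison above re-splits both lines. A sketch that computes each line's word count once by carrying it along as a tuple, in the same spirit as the example in the Spark quick start:

```scala
// Pair each line with its word count once, then reduce on the count
val longest = dataSet
  .map(line => (line, line.split(" ").size))
  .reduce((a, b) => if (a._2 > b._2) a else b)
  ._1
```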
```scala
import spark.implicits._

val distFile = spark.read.textFile("hdfs://standalone.com:9000/home/liuwen/data/word.txt")

// Approach 1: group the words by themselves and count each group
//val dataset = distFile.flatMap(line => line.split(" ")).groupByKey(x => x).count()

// Approach 2: map each word to (word, 1), then group and sum the 1s.
// Keying by the whole (word, 1) tuple works because every occurrence
// of a word maps to the identical tuple.
val dataset = distFile.flatMap(line => line.split(" "))
  .map(x => (x, 1))
  .groupByKey(x => x)
  .reduceGroups((a, b) => (a._1, a._2 + b._2))

// Approach 3: same as approach 1, using identity as the key function
//val dataset = distFile.flatMap(line => line.split(" ")).groupByKey(identity).count()
```
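A fourth way, as a sketch, uses the untyped DataFrame API; it assumes the default column name `value` for a `Dataset[String]`:

```scala
// Untyped equivalent: group on the default "value" column and count
val wordCounts = distFile
  .flatMap(line => line.split(" "))
  .groupBy("value")
  .count()

wordCounts.show()
```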