Trying out SQL with Spark SQL
// data from 2014-09-12
val transfer = sc.textFile("hdfs://LDKJSERVER1046:8020/user/flume/transfer20/year=2014/month=09/day=12/*.tsv")

// keep only well-formed lines (33 tab-separated columns) and pick out
// the sender, receiver, md5, file name, size and timestamp columns
val structuredTransfer20RDD = transfer.map { line =>
  val trunks = line.split("\t")
  if (trunks.length == 33) {
    (trunks(6), trunks(8), trunks(9), trunks(14), trunks(12), trunks(3))
  }
}
// malformed lines were mapped to Unit above; drop them, then cast back to tuples
val rdd = structuredTransfer20RDD
  .filter(arg => arg != ())
  .map(arg => arg.asInstanceOf[(String, String, String, String, String, String)])

// ORM class; note the short field names "f" (from), "t" (to), etc.
// Field names must not be SQL keywords, and upper-case names seem to be illegal.
case class ZapyaTransfer(f: String, t: String, mdfive: String, name: String, size: String, time: String)
val ormTransfers = rdd.map(arg => ZapyaTransfer(arg._1, arg._2, arg._3, arg._4, arg._5, arg._6))

val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.createSchemaRDD
ormTransfers.registerAsTable("transfers")

// this SQL is just an easy example
val temp = sqlContext.sql("SELECT count(*) FROM transfers WHERE f = '8677380150235314813F370E464'")
temp.map(c => "count(*): " + c(0)).collect().foreach(println)
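The filter-then-asInstanceOf dance above works in the shell, but it throws away type information. A cleaner sketch of the same parsing step (assuming the same 33-column TSV layout) flatMaps over an Option, so malformed lines are dropped without any cast:

// sketch: parse straight into the case class; None silently drops bad lines
val parsedTransfers = transfer.flatMap { line =>
  val trunks = line.split("\t")
  if (trunks.length == 33)
    Some(ZapyaTransfer(trunks(6), trunks(8), trunks(9), trunks(14), trunks(12), trunks(3)))
  else
    None
}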
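Once the table is registered, any SQL that Spark SQL understands can run against it. As a slightly richer example on the same "transfers" table (the query and the alias "cnt" are illustrative, not from the original job), counting transfers per receiver:

// group transfer counts by receiver "t"
val perReceiver = sqlContext.sql("SELECT t, count(*) AS cnt FROM transfers GROUP BY t")
perReceiver.map(row => row(0) + ": " + row(1)).collect().foreach(println)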