1.1 輕量級快速處理
Spark允許傳統Hadoop集羣中的應用程序在內存中以100倍的速度運行,即便在磁盤上也比傳統的Hadoop快10倍。Spark通過減少對磁盤的IO達到性能上的提升,它將中間處理的數據放到內存中。Spark使用了RDD(Resilient Distributed Datasets)數據抽象,
這允許它在內存中存儲數據,因此減少了運行時間。
1.2 易於使用
Spark支持多種語言:允許使用Java、Scala、Python及R語言,並允許通過shell進行交互式查詢。
1.3 支持複雜的查詢
除了簡單的map和reduce操作之外,Spark還支持filter、foreach、reduceByKey、aggregate以及SQL查詢、流式查詢等複雜查詢。Spark更爲強大之處是用戶可以在同一個工作流中無縫地搭配這些功能,例如Spark可以通過Spark Streaming(1.2.2小節對Spark Streaming有詳細介紹)獲取流數據,然後對數據進行實時SQL查詢或使用MLlib庫進行系統推薦,而且這些複雜業務的集成並不複雜,因爲它們都基於RDD這一抽象數據集在不同業務過程中進行轉換,轉換代價小,體現了統一引擎解決不同類型工作場景的特點。
1.4 實時的流處理
相比之下,MapReduce只能處理離線數據,而Spark還能支持實時的流計算。Spark Streaming主要用來對數據進行實時的處理。YARN的NodeManager統一調度管理很厲害,在YARN產生後,Hadoop也可以整合資源進行實時的處理。
2. 時代的產物
2.1 MapReduce產生時磁盤廉價,因此許多設計沒有考慮到內存的使用;而Spark產生時內存相對廉價,且對計算速度有所要求,因此Spark是基於內存計算的框架結構。MapReduce需要編寫複雜的程序進行計算。
1.spark的體系結構
Spark的體系結構不同於Hadoop的MapReduce和HDFS。Spark主要包括Spark Core,以及在Spark Core基礎上建立的應用框架:Spark SQL、Spark Streaming、MLlib和GraphX。
Core庫主要包括上下文(SparkContext)、抽象的數據集(RDD)、調度器(Scheduler)、洗牌(shuffle)和序列化器(Serializer)等。Spark系統中的計算、IO、調度和shuffle等系統基本功能都在其中。
在Core庫之上,根據業務需求分爲用於交互式查詢的SQL、實時流處理Streaming、機器學習MLlib和圖計算GraphX四大框架。HDFS迄今是不可替代的。
Spark架構組成圖
Hive建立數據庫 建立表 true
驗證策略 |
腳本 |
Hive |
Spark-sql |
建立庫 刪除庫 |
Create database lvhou_hive Create database lvhou_spark Drop database lvhou_hive Drop database lvhou_spark |
True |
True |
建立表 刪除表 |
Use lvhou_hive Create table hive_test(a string,b string) Use lvhou_spark Create table spark_test(a string,b string) Drop table hive_test Drop table spark_test
|
True |
True |
CTAS |
Create table lvhou_test as select * from lvhou_test1;
|
true |
false |
Insert
|
Insert into lvhou_hive values('hhah','heheh') |
true |
false |
insert |
Insert into lvhou_spark values('12','32'),('asd','asdf') |
True |
false |
Select |
Select * from lvhou_hive Select * from lvhou_spark |
True |
True
|
Select in |
|
|
|
|
|
|
|
Select子查詢 in 兩條數據
not in 兩條數據 |
select * from test1 where a,b in (select a,b from test2 where a = 'aa');
select * from test1 where a,b not in (select a,b from test2 where a = 'aa');
|
false |
false |
Select union查詢 union all |
select * from test union all select * from test0;(合一) select * from test union select * from test0;(去重) |
true |
false |
Select union 3查詢 union all
|
select * from (select * from test union select * from test0) a;
select a from (select * from test union all select * from test0) a; |
false |
False |
Select exists / not exists |
select * from test t where exists(select * from test0 t0 where t0.a = t.a); select * from lv_test where exists(select * from test t where lv_test.a = t.a); |
True |
False |
update |
update test1 set b = 'abc' where a = 'aa'; update test1 set a = 'abc';
Update test1 set b = 'abc'; |
True |
False |
delete |
delete from test1 where a = 'aa'; delete from test1; |
True |
False |
TRUNCATE TABLE
|
Truncate table test; |
True |
False |
Alter |
alter table test1 add columns (d string); alter table test drop a; alter table test rename a to a1;
|
True |
False |
索引 |
create index index_a on table test(a) as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' with deferred rebuild; |
True |
False |
INTERSECT 交集 |
select a from test INTERSECT select a from test0; |
False |
False |
EXCEPT
|
select a from test EXCEPT select a from test0; |
False |
False |
Minus 返回第一個結果中不一樣的 |
select a from test minus select a from test0; |
False |
False |
order by 排序 |
select a from test order by a desc;
|
True |
False |
sort by 排序 |
select a,b from test sort by b desc; |
True |
False |
distribute by
|
select a,b from test distribute by a; |
True |
False |
distribute by + sort by |
select a,b from test distribute by a sort by b asc; |
True |
False |
cluster by
|
select a,b from test cluster by a;
|
True |
False |
trim(string a) 去空格 |
select trim(' aaa ') from test00; |
True |
True
|
substr(string A,int start,int len) 截取字符串 |
select substr('abcdefg',3,2) from test;
select substr('abcdefg',-3,2) from test; |
True |
True |
like
|
select * from test where a like '%a%'; |
True |
False |
Count
|
select count(*) from test00;
select count(distinct *) from test00; |
True |
False |
Sum
|
select sum(c) from test00;
select sum(distinct c) from test00; |
True |
False |
Avg
|
select avg(c) from test00; select avg(distinct c) from test00; |
True |
False |
Min
|
select min(distinct c) from test00; |
True |
False |
Max
|
select max(distinct c) from test00; |
True |
False |
group by
|
select a from test00 group by a ; select a,sum(c) from test00 group by a; select a,avg(c) from test00 group by a; |
True |
False |
Having
|
select a,avg(c) as ac from test00 group by a having ac=1; |
True |
False |
load |
load data local inpath '/tmp/qichangjian/test01.txt' overwrite into table test_load; |
True |
False |