用戶名,月份,訪問次數
A,2015-01,5 A,2015-01,15 B,2015-01,5 A,2015-01,8 B,2015-01,25 A,2015-01,5 A,2015-02,4 A,2015-02,6 B,2015-02,10 B,2015-02,5 A,2015-03,16 A,2015-03,22 B,2015-03,23 B,2015-03,10 B,2015-03,1
-- Raw access log table: user name, month, and visit count per record.
-- Declared EXTERNAL with its data directory at /hive/t_access.
USE myhive;

CREATE EXTERNAL TABLE IF NOT EXISTS t_access (
    uname  STRING COMMENT '用戶名',
    umonth STRING COMMENT '月份',
    ucount INT    COMMENT '訪問次數'
)
COMMENT '用戶訪問表'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ","
LOCATION "/hive/t_access";

-- Load the comma-separated sample file from the local filesystem.
LOAD DATA LOCAL INPATH "/home/hadoop/access.txt"
INTO TABLE t_access;

-- Preview the loaded rows.
SELECT *
FROM t_access;
現要求出:
每一個用戶截止到每個月爲止的最大單月訪問次數和累計到該月的總訪問次數,結果數據格式如下
此結果需要根據用戶+月份進行分組
-- Compute each user's total visit count per month.
-- The comment sits on its own line: a `--` comment runs to the end of the
-- physical line, so sharing a line with the statements below would comment
-- them out and tmp_access would never be created or filled.
CREATE TABLE tmp_access (
    name STRING,
    mon  STRING,
    num  INT
);

-- One output row per (user, month): the sum of that month's visit counts.
INSERT INTO TABLE tmp_access
SELECT
    t.uname,
    t.umonth,
    SUM(t.ucount)
FROM t_access t
GROUP BY
    t.uname,
    t.umonth;

-- Preview the monthly totals.
SELECT *
FROM tmp_access;
-- Self-join of tmp_access on user name: every monthly row of a user (a* columns)
-- is paired with every monthly row of the same user (b* columns).
-- No month condition is applied here; the view exposes all pairs.
CREATE VIEW tmp_view AS
SELECT
    cur.name  AS anme,
    cur.mon   AS amon,
    cur.num   AS anum,
    prev.name AS bname,
    prev.mon  AS bmon,
    prev.num  AS bnum
FROM tmp_access cur
INNER JOIN tmp_access prev
    ON cur.name = prev.name;

-- Preview the paired rows.
SELECT *
FROM tmp_view;
-- For every user and month, report:
--   max_access: the largest single-month visit count up to and including that month
--   sum_access: the cumulative visit count up to and including that month
-- Uses window functions directly over tmp_access rather than filtering the
-- tmp_view self-join with amon >= bmon and re-grouping. The result is the same,
-- but the per-user pairing of every month with every other month is avoided.
-- Assumes tmp_access holds one row per (name, mon), as produced by its
-- GROUP BY load; verify if the table is populated any other way.
-- Output column names (including the historical spelling `anme`) are kept so
-- the result header is unchanged.
SELECT
    name AS anme,
    mon  AS amon,
    num  AS anum,
    MAX(num) OVER w AS max_access,
    SUM(num) OVER w AS sum_access
FROM tmp_access
WINDOW w AS (
    PARTITION BY name
    ORDER BY mon
    -- ROWS frame: exactly the rows from the user's first month through this one.
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
);
-- Score table for the "math score higher than Chinese score" exercise.
USE myhive;

CREATE TABLE `course` (
    `id`     INT,
    `sid`    INT,
    `course` STRING,
    `score`  INT
);
-- Insert sample data.
-- Field meanings, in order: id, student number (sid), course, score.
-- Comments use `--` on their own lines: `//` is not a comment marker in HiveQL,
-- and a comment sharing the line with the statements would hide them.
-- A single multi-row INSERT loads the same six rows as six separate statements.
INSERT INTO TABLE `course` VALUES
    (1, 1, 'yuwen', 43),
    (2, 1, 'shuxue', 55),
    (3, 2, 'yuwen', 77),
    (4, 2, 'shuxue', 88),
    (5, 3, 'yuwen', 98),
    (6, 3, 'shuxue', 65);
求:所有數學課程成績大於語文課程成績的學生的學號
-- Spread each score row into two columns: the score lands in the column that
-- matches the row's course, and the other column is filled with 0.
-- NOTE(review): because the non-matching column defaults to 0, a student with
-- no row for one of the courses is indistinguishable from one who scored 0 in it
-- — confirm this is intended.
CREATE VIEW tmp_course_view AS
SELECT
    sid,
    CASE WHEN course = "shuxue" THEN score ELSE 0 END AS shuxue,
    CASE WHEN course = "yuwen"  THEN score ELSE 0 END AS yuwen
FROM course;

-- Preview the spread rows.
SELECT *
FROM tmp_course_view;
-- Collapse the per-course rows into one row per student by taking the
-- maximum of each score column.
CREATE VIEW tmp_course_view1 AS
SELECT
    per_course.sid,
    MAX(per_course.shuxue) AS shuxue,
    MAX(per_course.yuwen)  AS yuwen
FROM tmp_course_view per_course
GROUP BY per_course.sid;

-- Preview one row per student.
SELECT *
FROM tmp_course_view1;
-- Students whose math (shuxue) score is strictly higher than their Chinese (yuwen) score.
SELECT *
FROM tmp_course_view1
WHERE yuwen < shuxue;
數據格式
2010012325
具體數據
2014010114 2014010216 2014010317 2014010410 2014010506 2012010609 2012010732 2012010812 2012010919 2012011023 2001010116 2001010212 2001010310 2001010411 2001010529 2013010619 2013010722 2013010812 2013010929 2013011023 2008010105 2008010216 2008010337 2008010414 2008010516 2007010619 2007010712 2007010812 2007010999 2007011023 2010010114 2010010216 2010010317 2010010410 2010010506 2015010649 2015010722 2015010812 2015010999 2015011023
數據解釋
2010012325表示在2010年01月23日的氣溫爲25度
比如:2010012325表示在2010年01月23日的氣溫爲25度。現在要求使用hive,計算每年出現過的最大氣溫的日期+溫度。
要計算出每年的最大氣溫。我用
-- Per-year maximum of the temperature field.
-- Characters 1-4 of `data` are the year; characters 9-10 are the temperature.
-- Returns only (year, max temperature); the day it occurred is not available here.
SELECT
    SUBSTR(data, 1, 4),
    MAX(SUBSTR(data, 9, 2))
FROM table2
GROUP BY
    SUBSTR(data, 1, 4);
出來的是 年份 + 溫度 這兩列數據,例如 2015 99
但是如果我想 select 的是:具體每年最大氣溫的那一天 + 溫度 。例如 20150109 99
請問該怎麼執行hive語句。。
group by 只須要substr(data,1,4),
但是select substr(data,1,8),又不在group by 的範圍內。
是我陷入了思維死角。一直想不出所以然。。求大神指點一下。
如果select 中所需要的列不在group by的條件裏,這種情況該如何去分析?
-- Split each fixed-width record into its parts by character position:
--   1-4 year, 5-6 month, 7-8 day, 9-10 temperature.
-- NOTE(review): the source table `weather` (with a `data` column) is not
-- created anywhere in this file — confirm it exists before running.
CREATE TABLE tmp_weather AS
SELECT
    SUBSTR(data, 1, 4) AS years,
    SUBSTR(data, 5, 2) AS months,
    SUBSTR(data, 7, 2) AS days,
    SUBSTR(data, 9, 2) AS temp
FROM weather;

-- Preview the split records.
SELECT *
FROM tmp_weather;
-- Maximum temperature field per year, computed from the raw records.
-- NOTE(review): MAX is applied to the two-character substring, presumably
-- compared as text; that matches numeric order only while readings are
-- zero-padded two-digit values — confirm against the real data.
CREATE TABLE tmp_year_weather AS
SELECT
    SUBSTR(data, 1, 4)      AS years,
    MAX(SUBSTR(data, 9, 2)) AS max_temp
FROM weather
GROUP BY
    SUBSTR(data, 1, 4);

-- Preview the yearly maxima.
SELECT *
FROM tmp_year_weather;
-- Attach the date parts to each yearly maximum: keep the daily rows whose year
-- and temperature both match that year's maximum. A year whose maximum occurs
-- on several days yields one output row per such day.
SELECT *
FROM tmp_year_weather a
INNER JOIN tmp_weather b
    ON  a.years    = b.years
    AND a.max_temp = b.temp;
id course 1,a 1,b 1,c 1,e 2,a 2,c 2,d 2,f 3,a 3,b 3,c 3,e
表示有id爲1,2,3的學生選修了課程a,b,c,d,e,f中其中幾門。
-- Course selection table: one comma-separated (student id, course) pair per input line.
CREATE TABLE t_course (
    id     INT,
    course STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ",";

-- Load the selections from the local filesystem.
LOAD DATA LOCAL INPATH "/home/hadoop/course/course.txt"
INTO TABLE t_course;
編寫Hive的HQL語句來實現以下結果:表中的1表示選修,表中的0表示未選修
id a b c d e f 1 1 1 1 0 1 0 2 1 0 1 1 0 1 3 1 1 1 0 1 0
第一步:
-- Step 1: collect the distinct set of all courses into a single array.
-- Reads t_course, the table this file creates and loads; no table named
-- id_course is defined in this file.
SELECT
    COLLECT_SET(course) AS courses
FROM t_course;
第二步:
-- Step 2: build one row per student holding
--   id_courses: the set of courses that student selected
--   courses:    the set of all courses, identical on every row
-- The full course set is a single-row subquery attached to every student row,
-- which is a cartesian product, so the strict-mode check is switched off first.
SET hive.strict.checks.cartesian.product=false;

-- Reads t_course, the table this file creates and loads; no table named
-- id_course is defined in this file.
CREATE TABLE id_courses AS
SELECT
    per_student.id     AS id,
    per_student.course AS id_courses,
    all_courses.course AS courses
FROM (
    SELECT
        id                  AS id,
        COLLECT_SET(course) AS course
    FROM t_course
    GROUP BY id
) per_student
CROSS JOIN (
    -- COLLECT_SET does not promise any element order; sorting makes the array
    -- positions stable (a, b, c, ...) for readers that index into it.
    SELECT
        SORT_ARRAY(COLLECT_SET(course)) AS course
    FROM t_course
) all_courses;
啓用嚴格模式:hive.mapred.mode = strict // Deprecated
hive.strict.checks.large.query = true
該設置會禁用:1. 不指定 limit(分頁)的 order by
2. 對分區表不指定分區進行查詢
3. 和數據量無關,只是一個查詢模式
hive.strict.checks.type.safety = true
嚴格類型安全,該屬性不允許以下操作:1. bigint和string之間的比較
2. bigint和double之間的比較
hive.strict.checks.cartesian.product = true
該屬性不允許笛卡爾積操作
第三步:得出最終結果:
思路:
拿出course字段中的每個元素在id_courses中進行判斷,看是否存在。
-- Step 3: pivot to one 0/1 flag per course. Each flag is 1 when the student's
-- own course set (id_courses) contains the n-th course of the full course list.
-- The full list is sorted before indexing: it was built with COLLECT_SET, which
-- does not guarantee element order, so an unsorted courses[0] is not reliably
-- course "a" and the flags could land under the wrong column.
SELECT
    id,
    CASE WHEN ARRAY_CONTAINS(id_courses, SORT_ARRAY(courses)[0]) THEN 1 ELSE 0 END AS a,
    CASE WHEN ARRAY_CONTAINS(id_courses, SORT_ARRAY(courses)[1]) THEN 1 ELSE 0 END AS b,
    CASE WHEN ARRAY_CONTAINS(id_courses, SORT_ARRAY(courses)[2]) THEN 1 ELSE 0 END AS c,
    CASE WHEN ARRAY_CONTAINS(id_courses, SORT_ARRAY(courses)[3]) THEN 1 ELSE 0 END AS d,
    CASE WHEN ARRAY_CONTAINS(id_courses, SORT_ARRAY(courses)[4]) THEN 1 ELSE 0 END AS e,
    CASE WHEN ARRAY_CONTAINS(id_courses, SORT_ARRAY(courses)[5]) THEN 1 ELSE 0 END AS f
FROM id_courses;
a,01,150 a,01,200 b,01,1000 b,01,800 c,01,250 c,01,220 b,01,6000 a,02,2000 a,02,3000 b,02,1000 b,02,1500 c,02,350 c,02,280 a,03,350 a,03,250
店鋪,月份,金額
-- Store sales records: store name, month, and amount per comma-separated line.
USE class;

CREATE TABLE t_store (
    name   STRING,
    months INT,
    money  INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ",";

-- Load the sample sales file from the local filesystem.
LOAD DATA LOCAL INPATH "/home/hadoop/store.txt"
INTO TABLE t_store;
編寫Hive的HQL語句求出每一個店鋪的當月銷售額和累計到當月的總銷售額
(1)按照商店名稱和月份進行分組統計
-- Monthly sales total per store: one row per (store, month).
CREATE TABLE tmp_store1 AS
SELECT
    sales.name       AS name,
    sales.months     AS months,
    SUM(sales.money) AS money
FROM t_store sales
GROUP BY
    sales.name,
    sales.months;

-- Preview the monthly totals.
SELECT *
FROM tmp_store1;
(2)對tmp_store1 表裏面的數據進行自連接
-- Self-join of the monthly totals on store name: every month of a store
-- (a* columns) is paired with every month of the same store (b* columns).
-- Rows are ordered by store and month of the a-side.
CREATE TABLE tmp_store2 AS
SELECT
    cur.name    AS aname,
    cur.months  AS amonths,
    cur.money   AS amoney,
    prev.name   AS bname,
    prev.months AS bmonths,
    prev.money  AS bmoney
FROM tmp_store1 cur
INNER JOIN tmp_store1 prev
    ON cur.name = prev.name
ORDER BY
    aname,
    amonths;

-- Preview the paired rows.
SELECT *
FROM tmp_store2;
(3)比較統計
-- For every store and month, report that month's sales (amoney) and the
-- cumulative sales up to and including that month (total).
-- Uses a window function directly over the monthly totals in tmp_store1 rather
-- than filtering the tmp_store2 self-join with amonths >= bmonths and
-- re-grouping. The result is the same, without pairing every month of a store
-- with every other month.
-- Assumes tmp_store1 holds one row per (name, months), as produced by its
-- GROUP BY; verify if the table is populated any other way.
-- Output column names are unchanged.
SELECT
    name   AS aname,
    months AS amonths,
    money  AS amoney,
    SUM(money) OVER (
        PARTITION BY name
        ORDER BY months
        -- ROWS frame: exactly the rows from the store's first month through this one.
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS total
FROM tmp_store1;