1、數據準備apache
一、users.dat
數據格式:2::M::56::16::70072,
數據條數:共有6040條數據
對應字段:UserID BigInt, Gender String, Age Int, Occupation String, Zipcode String
字段解釋:用戶id,性別,年齡,職業,郵政編碼oop
二、movies.dat
數據格式:2::Jumanji (1995)::Adventure|Children’s|Fantasy,
數據條數:共有3883條數據
對應字段:MovieID BigInt, Title String, Genres String
字段解釋:電影ID,電影名字,電影類型編碼
三、ratings.dat
數據格式:1::1193::5::978300760,
數據條數:共有1000209條數據
對應字段:UserID BigInt, MovieID BigInt, Rating Double, Timestamped String
字段解釋:用戶ID,電影ID,評分,評分時間戳spa
2、需求分析3d
一、正確建表,導入數據(三張表,三份數據),並驗證是否正確code
(1)建立表orm
create table t_user( userid bigint, --用戶id gender string, --性別 age int, --年齡 occupation string, --職業 zipcode string) --郵政編碼 row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s %5$s') stored as textfile; create table t_movie( movieid bigint, --電影ID title string, --電影名字 genres string) --電影類型 row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' with serdeproperties('input.regex'='(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s') stored as textfile; create table t_rating( userid bigint, --用戶ID movieid bigint, --電影ID rating double, --評分 timestamped String) --評分時間戳 row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s') stored as textfile;
(2)導入數據blog
load data local inpath '/usr/mydir/data/users.dat' into table t_user; load data local inpath '/usr/mydir/data/movies.dat' into table t_movie; load data local inpath '/usr/mydir/data/ratings.dat' into table t_rating;
(3)驗證數據ip
select * from t_user; select count(1) from t_user ; --6040條數據 select * from t_movie; select count(1) from t_movie ; --3883條數據 select * from t_rating; select count(1) from t_rating ; --1000209條數據
user表:hadoop
movie表:
rating表:
二、求被評分次數最多的10部電影,並給出評分次數(電影名,評分次數)
SELECT title, rating_count FROM (SELECT movieid, COUNT(rating) rating_count from t_rating group by movieid ) t1 left join t_movie t2 on t1.movieid = t2.movieid order by rating_count DESC LIMIT 10;
三、分別求男性,女性當中評分最高的10部電影(性別,電影名,影評分),評分人數大於等於50
SELECT * from (SELECT 'M' as gender, title, AVG(rating) avg_rating, count(rating) count_rating from (SELECT --三張表關聯,拿到性別,電影名,影評分 t2.gender, t3.title, t1.rating from t_rating t1 left join t_user t2 on t1.userid = t2.userid left join t_movie t3 on t1.movieid = t3.movieid where t2.gender = 'M' ) t4 group by title HAVING count_rating >= 50 order by avg_rating DESC LIMIT 10 ) a UNION ALL SELECT * from (SELECT 'F' as gender, title, AVG(rating) avg_rating, count(rating) count_rating from (SELECT --三張表關聯,拿到性別,電影名,影評分 t2.gender, t3.title, t1.rating from t_rating t1 left join t_user t2 on t1.userid = t2.userid left join t_movie t3 on t1.movieid = t3.movieid where t2.gender = 'F' ) t4 group by title HAVING count_rating >= 50 order by avg_rating DESC LIMIT 10 ) b
四、求好片(評分>=4.0)最多的那個年份的最好看的10部電影
SELECT title, avg_rating from (SELECT t1.title, SUBSTRING(t1.title,-5,4) year, avg(t2.rating) avg_rating from t_movie t1 left join t_rating t2 on t1.movieid = t2.movieid where t2.rating >= 4 group by t1.title ) t4 where year IN (SELECT year from (SELECT year, COUNT(year) count_year from (SELECT t1.title, SUBSTRING(t1.title,-5,4) year from t_movie t1 left join t_rating t2 on t1.movieid = t2.movieid where t2.rating >= 4 group by t1.title ) t3 group by year order by count_year DESC LIMIT 1) t5 ) order by avg_rating DESC limit 10;
五、求1997年上映的電影中,評分最高的10部Comedy類電影
SELECT title, genres, AVG(rating) avg_rating from (SELECT t1.title, substr(t1.title,-5,4) year, t1.genres, t2.rating from t_movie t1 left join t_rating t2 on t1.movieid = t2.movieid ) t3 where year = '1997' and genres LIKE '%Comedy%' group by title,genres order by avg_rating DESC LIMIT 10;
六、該影評庫中各類類型電影中評價最高的5部電影(類型,電影名,平均影評分)
SELECT type, title FROM (SELECT title, type, ROW_NUMBER() over(partition by type order by avg_rating DESC) rn from (SELECT title, type, AVG(rating) avg_rating from (select t2.genres, t2.title, t1.rating from t_rating t1 left join t_movie t2 on t1.movieid =t2.movieid ) t3 lateral view explode(split(t3.genres,"\\|")) tmp as type group by type,title ) t4 )t5 where rn <= 5;
七、各年評分最高的電影類型(年份,類型,影評分)
with tmp as (SELECT title, type, rating, movie_year from (select t2.genres, t2.title, t1.rating, SUBSTRING(t2.title,-5,4) as movie_year from t_rating t1 left join t_movie t2 on t1.movieid =t2.movieid ) t3 lateral view explode(split(t3.genres,"\\|")) tmp as type ) select movie_year, type, avg_rating from (select movie_year, type, avg_rating, ROW_NUMBER() over(partition by movie_year order by avg_rating DESC) rn FROM (select movie_year, type, AVG(rating) avg_rating FROM tmp group by movie_year,type ) t1 ) t2 where rn = 1;