一、数据准备 1、users.dat 2、movies.dat 3、ratings.dat
二、需求分析 1、正确建表,导入数据(三张表,三份数据),并验证是否正确 (1)创建表 create table t_user( userid bigint, --用户id gender string, --性别 age int, --年龄 occupation string, --职业 zipcode string) --邮政编码 row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s %5$s') stored as textfile; create table t_movie( movieid bigint, --电影ID title string, --电影名字 genres string) --电影类型 row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' with serdeproperties('input.regex'='(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s') stored as textfile; create table t_rating( userid bigint, --用户ID movieid bigint, --电影ID rating double, --评分 timestamped String) --评分时间戳 row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s') stored as textfile; (2)导入数据 load data local inpath '/usr/mydir/data/users.dat' into table t_user; load data local inpath '/usr/mydir/data/movies.dat' into table t_movie; load data local inpath '/usr/mydir/data/ratings.dat' into table t_rating; (3)验证数据 select * from t_user; select count(1) from t_user ; --6040条数据 select * from t_movie; select count(1) from t_movie ; --3883条数据 select * from t_rating; select count(1) from t_rating ; --1000209条数据 user表: movie表: rating表:
2、求被评分次数最多的10部电影,并给出评分次数(电影名,评分次数) SELECT title, rating_count FROM (SELECT movieid, COUNT(rating) rating_count from t_rating group by movieid ) t1 left join t_movie t2 on t1.movieid = t2.movieid order by rating_count DESC LIMIT 10;
3、分别求男性,女性当中评分最高的10部电影(性别,电影名,影评分),评分人数大于等于50 SELECT * from (SELECT 'M' as gender, title, AVG(rating) avg_rating, count(rating) count_rating from (SELECT --三张表关联,拿到性别,电影名,影评分 t2.gender, t3.title, t1.rating from t_rating t1 left join t_user t2 on t1.userid = t2.userid left join t_movie t3 on t1.movieid = t3.movieid where t2.gender = 'M' ) t4 group by title HAVING count_rating >= 50 order by avg_rating DESC LIMIT 10 ) a UNION ALL SELECT * from (SELECT 'F' as gender, title, AVG(rating) avg_rating, count(rating) count_rating from (SELECT --三张表关联,拿到性别,电影名,影评分 t2.gender, t3.title, t1.rating from t_rating t1 left join t_user t2 on t1.userid = t2.userid left join t_movie t3 on t1.movieid = t3.movieid where t2.gender = 'F' ) t4 group by title HAVING count_rating >= 50 order by avg_rating DESC LIMIT 10 ) b
4、求好片(评分>=4.0)最多的那个年份的最好看的10部电影 SELECT title, avg_rating from (SELECT t1.title, SUBSTRING(t1.title,-5,4) year, avg(t2.rating) avg_rating from t_movie t1 left join t_rating t2 on t1.movieid = t2.movieid where t2.rating >= 4 group by t1.title ) t4 where year IN (SELECT year from (SELECT year, COUNT(year) count_year from (SELECT t1.title, SUBSTRING(t1.title,-5,4) year from t_movie t1 left join t_rating t2 on t1.movieid = t2.movieid where t2.rating >= 4 group by t1.title ) t3 group by year order by count_year DESC LIMIT 1) t5 ) order by avg_rating DESC limit 10;
5、求1997年上映的电影中,评分最高的10部Comedy类电影 SELECT title, genres, AVG(rating) avg_rating from (SELECT t1.title, substr(t1.title,-5,4) year, t1.genres, t2.rating from t_movie t1 left join t_rating t2 on t1.movieid = t2.movieid ) t3 where year = '1997' and genres LIKE '%Comedy%' group by title,genres order by avg_rating DESC LIMIT 10;
6、该影评库中各种类型电影中评价最高的5部电影(类型,电影名,平均影评分) SELECT type, title FROM (SELECT title, type, ROW_NUMBER() over(partition by type order by avg_rating DESC) rn from (SELECT title, type, AVG(rating) avg_rating from (select t2.genres, t2.title, t1.rating from t_rating t1 left join t_movie t2 on t1.movieid =t2.movieid ) t3 lateral view explode(split(t3.genres,"\\|")) tmp as type group by type,title ) t4 )t5 where rn <= 5;
7、各年评分最高的电影类型(年份,类型,影评分) with tmp as (SELECT title, type, rating, movie_year from (select t2.genres, t2.title, t1.rating, SUBSTRING(t2.title,-5,4) as movie_year from t_rating t1 left join t_movie t2 on t1.movieid =t2.movieid ) t3 lateral view explode(split(t3.genres,"\\|")) tmp as type ) select movie_year, type, avg_rating from (select movie_year, type, avg_rating, ROW_NUMBER() over(partition by movie_year order by avg_rating DESC) rn FROM (select movie_year, type, AVG(rating) avg_rating FROM tmp group by movie_year,type ) t1 ) t2 where rn = 1; |
|