HIVE简单实战
需求:统计各个城市所属区域下最受欢迎的Top 3产品
数据准备:
mysql:
city_info : 城市信息表
product_info: 产品信息表
hive:
user_click:用户行为点击日志表
create table user_click(
user_id int,
session_id string,
action_time string,
city_id int,
product_id int
) partitioned by (day string)
row format delimited fields terminated by ',';
一、在hive中创建city_info,product_info
create table city_info(
city_id int,
city_name string,
area string
)row format delimited fields terminated by '\t';
create table product_info(
product_id int,
product_name string,
extend_info string
)row format delimited fields terminated by '\t';
二、使用sqoop导入数据
sqoop用于大数据平台与关系型数据库之间的传输
>sqoop help 可以查看命令帮助
sqoop import \
--connect jdbc:mysql://localhost:3306/test06 \
--username root --password root \
--delete-target-dir \ ##导入之前删除目标目录
--table city_info \ ## 目标表
--hive-database test06\ ## hive 目标数据库
--hive-import \
--hive-table city_info \
--hive-overwrite \
--fields-terminated-by '\t' \
--lines-terminated-by '\n' \
--split-by city_id \ ## 设定主键方便进行分割
-m 2 ## 输出文件数
现在hive里面有三张表:city_info product_info user_click
三、创建商品基础信息表:先将city_info 与 user_click 关联起来
create table tmp_product_click_basic_info
as
select u.product_id, u.city_id, c.city_name, c.area
from
(select product_id, city_id from user_click where day='2016-05-05' ) u
join
(select city_id, city_name,area from city_info) c
on u.city_id = c.city_id;
四、统计各区域下各商品点击次数:按照product_id , area 进行分组,再写入到新的临时表
create table tmp_area_product_click_count
as
select
product_id, area, count(1) click_count
from
tmp_product_click_basic_info
group by
product_id, area ;
五:获取完整的商品信息的各区域的访问次数:将tmp_area_product_click_count 与procuct_info 进行关联
create table tmp_area_product_click_count_full_info
as
select
a.product_id, b.product_name, a.area, a.click_count
from
tmp_area_product_click_count a join product_info b
on a.product_id = b.product_id;
六、利用窗口函数统计各区域下点击次数top 3 的商品
create table area_product_click_count_top3 as
select *
from (
select
product_id, product_name,area, click_count,
row_number() over(partition by area order by click_count desc) rank
from
tmp_area_product_click_count_full_info
) t where t.rank <=3;