写点什么

大数据培训用 SQL 来实现用户行为漏斗分析

作者:@零度
  • 2022 年 5 月 27 日
  • 本文字数:21447 字

    阅读完需:约 70 分钟

​需求一:用户活跃主题


DWS 层--(用户行为宽表层) 目标:统计当日、当周、当月活动的每个设备明细


1 每日活跃设备明细 dwd_start_log--->dws_uv_detail_day


--把相同的字段 collect_set 到一个数组, 按 mid_id 分组(便于后边统计)


collect_set 将某字段的值进行去重汇总,产生 array 类型字段。如: concat_ws('|', collect_set(user_id)) user_id,


建分区表 dws_uv_detail_day:partitioned by (dt string) -- 注意分区列名不能加引号


drop table if exists dws_uv_detail_day;


create table dws_uv_detail_day(


mid_id string COMMENT '设备唯一标识',


user_id string COMMENT '用户标识',


version_code string COMMENT '程序版本号',


version_name string COMMENT '程序版本名',


lang string COMMENT '系统语言',


source string COMMENT '渠道号',


os string COMMENT '安卓系统版本',


area string COMMENT '区域',


model string COMMENT '手机型号',


brand string COMMENT '手机品牌',


sdk_version string COMMENT 'sdkVersion',


gmail string COMMENT 'gmail',


height_width string COMMENT '屏幕宽高',


app_time string COMMENT '客户端日志产生时的时间',


network string COMMENT '网络模式',


lng string COMMENT '经度',


lat string COMMENT '纬度'


) COMMENT '活跃用户按天明细'


PARTITIONED BY ( dt string)


stored as parquet


location '/warehouse/gmall/dws/dws_uv_detail_day/'


;


数据导入


按周分区;过滤出一周内的数据;按设备 id 分组;===>count(*)得到最终结果;


partition(dt='2019-02-10') from dwd_start_log where dt='2019-02-10' group by mid_id ( mid_id 设备唯一标示 )


以用户单日访问为 key 进行聚合,如果某个用户在一天中使用了两种操作系统、两个系统版本、多个地区,登录不同账号,只取其中之一


hive (gmall)>


set hive.exec.dynamic.partition.mode=nonstrict;


insert overwrite table dws_uv_detail_day partition(dt='2019-02-10')


select


mid_id,


concat_ws('|', collect_set(user_id)) user_id,


concat_ws('|', collect_set(version_code)) version_code,


concat_ws('|', collect_set(version_name)) version_name,


concat_ws('|', collect_set(lang))lang,


concat_ws('|', collect_set(source)) source,


concat_ws('|', collect_set(os)) os,


concat_ws('|', collect_set(area)) area,


concat_ws('|', collect_set(model)) model,


concat_ws('|', collect_set(brand)) brand,


concat_ws('|', collect_set(sdk_version)) sdk_version,


concat_ws('|', collect_set(gmail)) gmail,


concat_ws('|', collect_set(height_width)) height_width,


concat_ws('|', collect_set(app_time)) app_time,


concat_ws('|', collect_set(network)) network,


concat_ws('|', collect_set(lng)) lng,


concat_ws('|', collect_set(lat)) lat


from dwd_start_log


where dt='2019-02-10'


group by mid_id;


查询导入结果;


hive (gmall)> select * from dws_uv_detail_day limit 1;


###最后 count(*)即是每日活跃设备的个数;


hive (gmall)> select count(*) from dws_uv_detail_day;


2 每周(dws_uv_detail_wk)活跃设备明细 partition(wk_dt)


周一到周日 concat(date_add(next_day('2019-02-10', 'MO'), -7), '_', date_add(next_day('2019-02-10', 'MO'), -1))即 2019-02-04_2019-02-10


创建分区表:partitioned by('wk_dt' string)


hive (gmall)>


drop table if exists dws_uv_detail_wk;


create table dws_uv_detail_wk(


mid_id string COMMENT '设备唯一标识',


user_id string COMMENT '用户标识',


version_code string COMMENT '程序版本号',


version_name string COMMENT '程序版本名',


lang string COMMENT '系统语言',


source string COMMENT '渠道号',


os string COMMENT '安卓系统版本',


area string COMMENT '区域',


model string COMMENT '手机型号',


brand string COMMENT '手机品牌',


sdk_version string COMMENT 'sdkVersion',


gmail string COMMENT 'gmail',


height_width string COMMENT '屏幕宽高',


app_time string COMMENT '客户端日志产生时的时间',


network string COMMENT '网络模式',


lng string COMMENT '经度',


lat string COMMENT '纬度',


monday_date string COMMENT '周一日期',


sunday_date string COMMENT '周日日期'


) COMMENT '活跃用户按周明细'


PARTITIONED BY (wk_dt string)


stored as parquet


location '/warehouse/gmall/dws/dws_uv_detail_wk/'


;


导入数据:以周为分区;过滤出一周内的数据,按设备 id 分组;


周一:date_add(next_day('2019-05-16','MO'),-7);


周日:date_add(next_day('2019-05-16','MO'),-1);


周一---周日:concat(date_add(next_day('2019-05-16', 'MO'), -7), "_", date_add(next_day('2019-05-16', 'MO'), -1));


insert overwrite table dws_uv_detail_wk partition(wk_dt)


select mid_id,


concat_ws('|', collect_set(user_id)) user_id,


concat_ws('|', collect_set(version_code)) version_code,


concat_ws('|', collect_set(version_name)) version_name,


concat_ws('|', collect_set(lang)) lang,


concat_ws('|', collect_set(source)) source,


concat_ws('|', collect_set(os)) os,


concat_ws('|', collect_set(area)) area,


concat_ws('|', collect_set(model)) model,


concat_ws('|', collect_set(brand)) brand,


concat_ws('|', collect_set(sdk_version)) sdk_version,


concat_ws('|', collect_set(gmail)) gmail,


concat_ws('|', collect_set(height_width)) height_width,


concat_ws('|', collect_set(app_time)) app_time,


concat_ws('|', collect_set(network)) network,


concat_ws('|', collect_set(lng)) lng,


concat_ws('|', collect_set(lat)) lat,


date_add(next_day('2019-02-10', 'MO'), -7),


date_add(next_day('2019-02-10', 'MO'), -1),


concat(date_add(next_day('2019-02-10', 'MO'), -7), '_', date_add(next_day('2019-02-10', 'MO'), -1))


from dws_uv_detail_day


where dt >= date_add(next_day('2019-02-10', 'MO'), -7) and dt <= date_add(next_day('2019-02-10', 'MO'), -1)


group by mid_id;


查询导入结果


hive (gmall)> select * from dws_uv_detail_wk limit 1;


hive (gmall)> select count(*) from dws_uv_detail_wk;


3 每月活跃设备明细 dws_uv_detail_mn partition(mn) - 把每日的数据插入进去


DWS 层创建分区表 partitioned by(mn string)


hive (gmall)>


drop table if exists dws_uv_detail_mn;


create external table dws_uv_detail_mn(


mid_id string COMMENT '设备唯一标识',


user_id string COMMENT '用户标识',


version_code string COMMENT '程序版本号',


version_name string COMMENT '程序版本名',


lang string COMMENT '系统语言',


source string COMMENT '渠道号',


os string COMMENT '安卓系统版本',


area string COMMENT '区域',


model string COMMENT '手机型号',


brand string COMMENT '手机品牌',


sdk_version string COMMENT 'sdkVersion',


gmail string COMMENT 'gmail',


height_width string COMMENT '屏幕宽高',


app_time string COMMENT '客户端日志产生时的时间',


network string COMMENT '网络模式',


lng string COMMENT '经度',


lat string COMMENT '纬度'


) COMMENT '活跃用户按月明细'


PARTITIONED BY (mn string)


stored as parquet


location '/warehouse/gmall/dws/dws_uv_detail_mn/'


;


数据导入 按月分区;过滤出一个月内的数据,按照设备 id 分组;


date_format('2019-03-10', 'yyyy-MM') ---> 2019-03


where date_format(dt, 'yyyy-MM') = date_format('2019-02-10', 'yyyy-MM') group by mid_id; -- dt 是列名,不能写成字符串常量 'dt'


hive (gmall)>


set hive.exec.dynamic.partition.mode=nonstrict;


insert overwrite table dws_uv_detail_mn partition(mn)


select


mid_id,


concat_ws('|', collect_set(user_id)) user_id,


concat_ws('|', collect_set(version_code)) version_code,


concat_ws('|', collect_set(version_name)) version_name,


concat_ws('|', collect_set(lang)) lang,


concat_ws('|', collect_set(source)) source,


concat_ws('|', collect_set(os)) os,


concat_ws('|', collect_set(area)) area,


concat_ws('|', collect_set(model)) model,


concat_ws('|', collect_set(brand)) brand,


concat_ws('|', collect_set(sdk_version)) sdk_version,


concat_ws('|', collect_set(gmail)) gmail,


concat_ws('|', collect_set(height_width)) height_width,


concat_ws('|', collect_set(app_time)) app_time,


concat_ws('|', collect_set(network)) network,


concat_ws('|', collect_set(lng)) lng,


concat_ws('|', collect_set(lat)) lat,


date_format('2019-02-10','yyyy-MM')


from dws_uv_detail_day


where date_format(dt,'yyyy-MM') = date_format('2019-02-10','yyyy-MM')


group by mid_id;


查询导入结果


hive (gmall)> select * from dws_uv_detail_mn limit 1;


hive (gmall)> select count(*) from dws_uv_detail_mn ;


DWS 层加载数据脚本


在 hadoop101 的/home/kris/bin 目录下创建脚本


[kris@hadoop101 bin]$ vim dws.sh


#!/bin/bash
# dws.sh — load the DWS daily / weekly / monthly active-device detail tables
# for one business date.
# Usage: dws.sh [yyyy-MM-dd]   (defaults to yesterday when no argument given)

# Variables kept at the top so environment changes need only one edit.
APP=gmall
hive=/opt/module/hive/bin/hive

# Use the date passed as $1 when present; otherwise default to yesterday.
if [ -n "$1" ] ;then
    do_date=$1
else
    # BUG FIX: the original wrote `do_date=date -d "-1 day" +%F`, which assigns
    # the literal string "date" — command substitution $() is required.
    do_date=$(date -d "-1 day" +%F)
fi

sql="
set hive.exec.dynamic.partition.mode=nonstrict;

-- Daily active devices: one row per mid_id for the day; all other attributes
-- are deduplicated and collapsed into a '|'-joined string with collect_set.
-- BUG FIX: the original INSERT target line was garbled
-- (insert overwrite table \"do_date') ) — restored from the interactive version above.
insert overwrite table "$APP".dws_uv_detail_day partition(dt='$do_date')
select
    mid_id,
    concat_ws('|', collect_set(user_id)) user_id,
    concat_ws('|', collect_set(version_code)) version_code,
    concat_ws('|', collect_set(version_name)) version_name,
    concat_ws('|', collect_set(lang)) lang,
    concat_ws('|', collect_set(source)) source,
    concat_ws('|', collect_set(os)) os,
    concat_ws('|', collect_set(area)) area,
    concat_ws('|', collect_set(model)) model,
    concat_ws('|', collect_set(brand)) brand,
    concat_ws('|', collect_set(sdk_version)) sdk_version,
    concat_ws('|', collect_set(gmail)) gmail,
    concat_ws('|', collect_set(height_width)) height_width,
    concat_ws('|', collect_set(app_time)) app_time,
    concat_ws('|', collect_set(network)) network,
    concat_ws('|', collect_set(lng)) lng,
    concat_ws('|', collect_set(lat)) lat
from "$APP".dwd_start_log
where dt='$do_date'
group by mid_id;

-- Weekly active devices: aggregate the daily detail over Monday..Sunday of
-- the week containing \$do_date. next_day(d,'MO') is the NEXT Monday, so
-- Monday of the current week is next_day-7 and Sunday is next_day-1.
insert overwrite table "$APP".dws_uv_detail_wk partition(wk_dt)
select
    mid_id,
    concat_ws('|', collect_set(user_id)) user_id,
    concat_ws('|', collect_set(version_code)) version_code,
    concat_ws('|', collect_set(version_name)) version_name,
    concat_ws('|', collect_set(lang)) lang,
    concat_ws('|', collect_set(source)) source,
    concat_ws('|', collect_set(os)) os,
    concat_ws('|', collect_set(area)) area,
    concat_ws('|', collect_set(model)) model,
    concat_ws('|', collect_set(brand)) brand,
    concat_ws('|', collect_set(sdk_version)) sdk_version,
    concat_ws('|', collect_set(gmail)) gmail,
    concat_ws('|', collect_set(height_width)) height_width,
    concat_ws('|', collect_set(app_time)) app_time,
    concat_ws('|', collect_set(network)) network,
    concat_ws('|', collect_set(lng)) lng,
    concat_ws('|', collect_set(lat)) lat,
    date_add(next_day('$do_date','MO'),-7),
    date_add(next_day('$do_date','MO'),-1),
    concat(date_add(next_day('$do_date','MO'),-7), '_', date_add(next_day('$do_date','MO'),-1))
from "$APP".dws_uv_detail_day
-- BUG FIX: the original filter was
--   where dt>=date_add(next_day('do_date','MO'),-1)
-- which (a) dropped the \$ on do_date and (b) selected the wrong range;
-- the week is [Monday, Sunday] inclusive, matching the interactive query above.
where dt >= date_add(next_day('$do_date','MO'),-7) and dt <= date_add(next_day('$do_date','MO'),-1)
group by mid_id;

-- Monthly active devices: aggregate the daily detail over the calendar month
-- containing \$do_date; the dynamic partition value is the 'yyyy-MM' string.
insert overwrite table "$APP".dws_uv_detail_mn partition(mn)
select
    mid_id,
    concat_ws('|', collect_set(user_id)) user_id,
    concat_ws('|', collect_set(version_code)) version_code,
    concat_ws('|', collect_set(version_name)) version_name,
    concat_ws('|', collect_set(lang)) lang,
    concat_ws('|', collect_set(source)) source,
    concat_ws('|', collect_set(os)) os,
    concat_ws('|', collect_set(area)) area,
    concat_ws('|', collect_set(model)) model,
    concat_ws('|', collect_set(brand)) brand,
    concat_ws('|', collect_set(sdk_version)) sdk_version,
    concat_ws('|', collect_set(gmail)) gmail,
    concat_ws('|', collect_set(height_width)) height_width,
    concat_ws('|', collect_set(app_time)) app_time,
    concat_ws('|', collect_set(network)) network,
    concat_ws('|', collect_set(lng)) lng,
    concat_ws('|', collect_set(lat)) lat,
    date_format('$do_date','yyyy-MM')
from "$APP".dws_uv_detail_day
where date_format(dt,'yyyy-MM') = date_format('$do_date','yyyy-MM')
group by mid_id;
"

# BUG FIX: the original script ended with a bare `sql"` and never executed
# anything — the SQL must be handed to the hive CLI.
$hive -e "$sql"


增加脚本执行权限 chmod 777 dws.sh


脚本使用[kris@hadoop101 module]$ dws.sh 2019-02-11


查询结果


hive (gmall)> select count(*) from dws_uv_detail_day;


hive (gmall)> select count(*) from dws_uv_detail_wk;


hive (gmall)> select count(*) from dws_uv_detail_mn ;


脚本执行时间;企业开发中一般在每日凌晨 30 分~1 点


ADS 层 目标:当日、当周、当月活跃设备数 使用 day_count 表 join wk_count join mn_count , 把 3 张表连接一起


建表 ads_uv_count 表:


字段有 day_count、wk_count、mn_count is_weekend if(date_add(next_day('2019-02-10', 'MO'), -1) = '2019-02-10', 'Y', 'N') is_monthend if(last_day('2019-02-10') = '2019-02-10', 'Y', 'N')


drop table if exists ads_uv_count;


create external table ads_uv_count(


dt string comment '统计日期',


day_count bigint comment '当日用户量',


wk_count bigint comment '当周用户量',


mn_count bigint comment '当月用户量',


is_weekend string comment 'Y,N 是否是周末,用于得到本周最终结果',


is_monthend string comment 'Y,N 是否是月末,用于得到本月最终结果'


) comment '每日活跃用户数量'


stored as parquet


location '/warehouse/gmall/ads/ads_uv_count/';


导入数据:


hive (gmall)>


insert overwrite table ads_uv_count


select


'2019-02-10' dt,


daycount.ct,


wkcount.ct,


mncount.ct,


if(date_add(next_day('2019-02-10','MO'),-1)='2019-02-10','Y','N') ,


if(last_day('2019-02-10')='2019-02-10','Y','N')


from


(


select


'2019-02-10' dt,


count(*) ct


from dws_uv_detail_day


where dt='2019-02-10'


)daycount join


(


select


'2019-02-10' dt,


count (*) ct


from dws_uv_detail_wk


where wk_dt=concat(date_add(next_day('2019-02-10','MO'),-7),'_' ,date_add(next_day('2019-02-10','MO'),-1) )


) wkcount on daycount.dt=wkcount.dt


join


(


select


'2019-02-10' dt,


count (*) ct


from dws_uv_detail_mn


where mn=date_format('2019-02-10','yyyy-MM')


)mncount on daycount.dt=mncount.dt


;


查询导入结果


hive (gmall)> select * from ads_uv_count ;


ADS 层加载数据脚本


1)在 hadoop101 的/home/kris/bin 目录下创建脚本


[kris@hadoop101 bin]$ vim ads.sh


#!/bin/bash
# ads.sh — load ads_uv_count (daily / weekly / monthly active-device counts)
# for one business date by joining the three DWS detail tables.
# Usage: ads.sh [yyyy-MM-dd]   (defaults to yesterday when no argument given)

# Variables kept at the top so environment changes need only one edit.
APP=gmall
hive=/opt/module/hive/bin/hive

# Use the date passed as $1 when present; otherwise default to yesterday.
if [ -n "$1" ] ;then
    do_date=$1
else
    # BUG FIX: command substitution $() was missing in the original assignment.
    do_date=$(date -d "-1 day" +%F)
fi

sql="
set hive.exec.dynamic.partition.mode=nonstrict;

insert into table "$APP".ads_uv_count
select
    '$do_date' dt,
    daycount.ct,
    wkcount.ct,
    mncount.ct,
    -- BUG FIX: the two if() expressions were garbled in the original
    -- (if(date_add(next_day('do_date','Y','N')); restored from the
    -- interactive version: 'Y' when \$do_date is the Sunday / last day.
    if(date_add(next_day('$do_date','MO'),-1)='$do_date','Y','N'),
    if(last_day('$do_date')='$do_date','Y','N')
from
(
    -- devices active on the day itself
    select
        '$do_date' dt,
        count(*) ct
    from "$APP".dws_uv_detail_day
    where dt='$do_date'
) daycount
join
(
    -- devices active in the week containing the day
    select
        '$do_date' dt,
        count(*) ct
    from "$APP".dws_uv_detail_wk
    -- BUG FIX: the original dropped the \$ on do_date inside next_day().
    where wk_dt=concat(date_add(next_day('$do_date','MO'),-7),'_',date_add(next_day('$do_date','MO'),-1))
) wkcount on daycount.dt=wkcount.dt
join
(
    -- devices active in the month containing the day
    select
        '$do_date' dt,
        count(*) ct
    from "$APP".dws_uv_detail_mn
    where mn=date_format('$do_date','yyyy-MM')
) mncount on daycount.dt=mncount.dt;
"

# BUG FIX: the original ended with a bare `sql"` and never ran the statement.
$hive -e "$sql"


增加脚本执行权限 chmod 777 ads.sh


脚本使用 ads.sh 2019-02-11


查询导入结果 hive (gmall)> select * from ads_uv_count ;


需求二:用户新增主题


首次联网使用应用的用户。如果一个用户首次打开某 APP,那这个用户定义为新增用户;卸载再安装的设备,不会被算作一次新增。新增用户包括日新增用户、周新增用户、月新增用户。


每日新增(老用户不算,之前没登陆过,今天是第一次登陆)设备--没有分区 -->以往的新增库里边没有他,但他今天活跃了即新增加的用户_大数据培训


1 DWS 层(每日新增设备明细表) 创建每日新增设备明细表:dws_new_mid_day


hive (gmall)>


drop table if exists dws_new_mid_day;


create table dws_new_mid_day


(


mid_id string COMMENT '设备唯一标识',


user_id string COMMENT '用户标识',


version_code string COMMENT '程序版本号',


version_name string COMMENT '程序版本名',


lang string COMMENT '系统语言',


source string COMMENT '渠道号',


os string COMMENT '安卓系统版本',


area string COMMENT '区域',


model string COMMENT '手机型号',


brand string COMMENT '手机品牌',


sdk_version string COMMENT 'sdkVersion',


gmail string COMMENT 'gmail',


height_width string COMMENT '屏幕宽高',


app_time string COMMENT '客户端日志产生时的时间',


network string COMMENT '网络模式',


lng string COMMENT '经度',


lat string COMMENT '纬度',


create_date string comment '创建时间'


) COMMENT '每日新增设备信息'


stored as parquet


location '/warehouse/gmall/dws/dws_new_mid_day/';


dws_uv_detail_day(每日活跃设备明细) left join dws_new_mid_day nm(以往的新增用户表, 新建字段 create_date='2019-02-10'),取 nm.mid_id is null 的记录;


导入数据


用每日活跃用户表 left join 每日新增设备表,关联的条件是 mid_id 相等。如果是每日新增的设备,则在每日新增设备表中为 null。


from dws_uv_detail_day ud left join dws_new_mid_day nm on ud.mid_id=nm.mid_id


where ud.dt='2019-02-10' and nm.mid_id is null;


hive (gmall)>


insert into table dws_new_mid_day


select


ud.mid_id,


ud.user_id ,


ud.version_code ,


ud.version_name ,


ud.lang ,


ud.source,


ud.os,


ud.area,


ud.model,


ud.brand,


ud.sdk_version,


ud.gmail,


ud.height_width,


ud.app_time,


ud.network,


ud.lng,


ud.lat,


'2019-02-10'


from dws_uv_detail_day ud left join dws_new_mid_day nm on ud.mid_id=nm.mid_id


where ud.dt='2019-02-10' and nm.mid_id is null;


查询导入数据


hive (gmall)> select count(*) from dws_new_mid_day ;


2 ADS 层(每日新增设备表) 创建每日新增设备表 ads_new_mid_count


hive (gmall)>


drop table if exists ads_new_mid_count;


create table ads_new_mid_count


(


create_date string comment '创建时间' ,


new_mid_count BIGINT comment '新增设备数量'


) COMMENT '每日新增设备信息数量'


row format delimited fields terminated by '\t'


location '/warehouse/gmall/ads/ads_new_mid_count/';


导入数据 count(*) dws_new_mid_day 表即可


加了 create_date 就必须 group by create_date,否则报错:not in GROUP BY key 'create_date'


hive (gmall)>


insert into table ads_new_mid_count


select create_date , count(*) from dws_new_mid_day


where create_date='2019-02-10'


group by create_date ;


查询导入数据


hive (gmall)> select * from ads_new_mid_count;


扩展每月新增:


--每月新增


drop table if exists dws_new_mid_mn;


create table dws_new_mid_mn(


mid_id string COMMENT '设备唯一标识',


user_id string COMMENT '用户标识',


version_code string COMMENT '程序版本号',


version_name string COMMENT '程序版本名',


lang string COMMENT '系统语言',


source string COMMENT '渠道号',


os string COMMENT '安卓系统版本',


area string COMMENT '区域',


model string COMMENT '手机型号',


brand string COMMENT '手机品牌',


sdk_version string COMMENT 'sdkVersion',


gmail string COMMENT 'gmail',


height_width string COMMENT '屏幕宽高',


app_time string COMMENT '客户端日志产生时的时间',


network string COMMENT '网络模式',


lng string COMMENT '经度',


lat string COMMENT '纬度'


)comment "每月新增明细"


partitioned by(mn string)


stored as parquet


location "/warehouse/gmall/dws/dws_new_mid_mn";


-- Monthly new devices: devices active this month (dws_uv_detail_mn) that do
-- not yet appear in the cumulative new-device table (anti-join via LEFT JOIN).
insert overwrite table dws_new_mid_mn partition(mn)
select
    um.mid_id,
    um.user_id,
    um.version_code,
    um.version_name,
    um.lang,
    um.source,
    um.os,
    um.area,
    um.model,
    um.brand,
    um.sdk_version,
    um.gmail,
    um.height_width,
    um.app_time,
    um.network,
    um.lng,
    um.lat,
    date_format('2019-02-10', 'yyyy-MM')
from dws_uv_detail_mn um
left join dws_new_mid_mn nm on um.mid_id = nm.mid_id
where um.mn = date_format('2019-02-10', 'yyyy-MM')
  -- BUG FIX (answers the original author's question "为什么加上它就是空的?"):
  -- `nm.mid_id = null` always evaluates to UNKNOWN under SQL three-valued
  -- logic, so the WHERE clause filtered out every row. NULL must be tested
  -- with IS NULL for the anti-join to work.
  and nm.mid_id is null;


--##注意这里不能写出 date_format(um.mn, 'yyyy-MM') =date_format('2019-02-10', 'yyyy-MM')


|


需求三:用户留存主题


如果不考虑 2019-02-11 和 2019-02-12 的新增用户:2019-02-10 新增 100 人,一天后它的留存率是 30%,2 天 12 号它的留存率是 25%,3 天后留存率 32%;


站在 2019-02-12 号看 02-11 的留存率:新增 200 人,12 号的留存率是 20%;


站在 2019-02-13 号看 02-12 的留存率:新增 100 人,13 号即一天后留存率是 25%;


用户留存率的分析:昨日的新增且今天是活跃的 / 昨日的新增用户量


如今天 11 日,要统计 10 日的 用户留存率---->10 日的新设备且是 11 日活跃的 / 10 日新增设备   分母:10 日的新增设备(每日活跃 left join 以往新增设备表(nm) nm.mid_id is null )   分子:每日活跃表(ud) join 每日新增表(nm) where ud.dt='今天' and nm.create_date = '昨天'


① DWS 层(每日留存用户明细表 dws_user_retention_day) 用户 1 天留存的分析:===>>


留存用户=前一天新增 join 今天活跃


用户留存率=留存用户/前一天新增


创建表:dws_user_retention_day


hive (gmall)>


drop table if exists dws_user_retention_day;


create table dws_user_retention_day


(


mid_id string COMMENT '设备唯一标识',


user_id string COMMENT '用户标识',


version_code string COMMENT '程序版本号',


version_name string COMMENT '程序版本名',


lang string COMMENT '系统语言',


source string COMMENT '渠道号',


os string COMMENT '安卓系统版本',


area string COMMENT '区域',


model string COMMENT '手机型号',


brand string COMMENT '手机品牌',


sdk_version string COMMENT 'sdkVersion',


gmail string COMMENT 'gmail',


height_width string COMMENT '屏幕宽高',


app_time string COMMENT '客户端日志产生时的时间',


network string COMMENT '网络模式',


lng string COMMENT '经度',


lat string COMMENT '纬度',


create_date string comment '设备新增时间',


retention_day int comment '截止当前日期留存天数'


) COMMENT '每日用户留存情况'


PARTITIONED BY ( dt string)


stored as parquet


location '/warehouse/gmall/dws/dws_user_retention_day/'


;


导入数据(每天计算前 1 天的新用户访问留存明细)


from dws_uv_detail_day 每日活跃设备 ud join dws_new_mid_day 每日新增设备 nm on ud.mid_id =nm.mid_id where ud.dt='2019-02-11' and nm.create_date=date_add('2019-02-11',-1);


hive (gmall)>


insert overwrite table dws_user_retention_day partition(dt="2019-02-11")


select


nm.mid_id,


nm.user_id ,


nm.version_code ,


nm.version_name ,


nm.lang ,


nm.source,


nm.os,


nm.area,


nm.model,


nm.brand,


nm.sdk_version,


nm.gmail,


nm.height_width,


nm.app_time,


nm.network,


nm.lng,


nm.lat,


nm.create_date,


1 retention_day


from dws_uv_detail_day ud join dws_new_mid_day nm on ud.mid_id =nm.mid_id


where ud.dt='2019-02-11' and nm.create_date=date_add('2019-02-11',-1);


查询导入数据(每天计算前 1 天的新用户访问留存明细)


hive (gmall)> select count(*) from dws_user_retention_day;


② DWS 层(1,2,3,n 天留存用户明细表)直接插入数据:dws_user_retention_day 用 union all 连接起来,汇总到一个表中;1)直接导入数据(每天计算前 1,2,3,n 天的新用户访问留存明细) 直接改变这个即可以,date_add('2019-02-11',-3); -1 是一天的留存率;-2 是两天的留存率、-3 是三天的留存率


hive (gmall)>


insert overwrite table dws_user_retention_day partition(dt="2019-02-11")


select


nm.mid_id,


nm.user_id ,


nm.version_code ,


nm.version_name ,


nm.lang ,


nm.source,


nm.os,


nm.area,


nm.model,


nm.brand,


nm.sdk_version,


nm.gmail,


nm.height_width,


nm.app_time,


nm.network,


nm.lng,


nm.lat,


nm.create_date,


1 retention_day


from dws_uv_detail_day ud join dws_new_mid_day nm on ud.mid_id =nm.mid_id


where ud.dt='2019-02-11' and nm.create_date=date_add('2019-02-11',-1)


union all


select


nm.mid_id,


nm.user_id ,


nm.version_code ,


nm.version_name ,


nm.lang ,


nm.source,


nm.os,


nm.area,


nm.model,


nm.brand,


nm.sdk_version,


nm.gmail,


nm.height_width,


nm.app_time,


nm.network,


nm.lng,


nm.lat,


nm.create_date,


2 retention_day


from dws_uv_detail_day ud join dws_new_mid_day nm on ud.mid_id =nm.mid_id


where ud.dt='2019-02-11' and nm.create_date=date_add('2019-02-11',-2)


union all


select


nm.mid_id,


nm.user_id ,


nm.version_code ,


nm.version_name ,


nm.lang ,


nm.source,


nm.os,


nm.area,


nm.model,


nm.brand,


nm.sdk_version,


nm.gmail,


nm.height_width,


nm.app_time,


nm.network,


nm.lng,


nm.lat,


nm.create_date,


3 retention_day


from dws_uv_detail_day ud join dws_new_mid_day nm on ud.mid_id =nm.mid_id


where ud.dt='2019-02-11' and nm.create_date=date_add('2019-02-11',-3);


2)查询导入数据(每天计算前 1,2,3 天的新用户访问留存明细)


hive (gmall)> select retention_day , count(*) from dws_user_retention_day group by retention_day;


③ ADS 层 留存用户数 ads_user_retention_day_count 直接 count( * )即可 1)创建 ads_user_retention_day_count 表:


hive (gmall)>


drop table if exists ads_user_retention_day_count;


create table ads_user_retention_day_count


(


create_date string comment '设备新增日期',


retention_day int comment '截止当前日期留存天数',


retention_count bigint comment '留存数量'


) COMMENT '每日用户留存情况'


stored as parquet


location '/warehouse/gmall/ads/ads_user_retention_day_count/';


导入数据 按创建日期 create_date 和 留存天数 retention_day 进行分组 group by;


hive (gmall)>


insert into table ads_user_retention_day_count


select


create_date,


retention_day,


count(*) retention_count


from dws_user_retention_day


where dt='2019-02-11'


group by create_date,retention_day;


查询导入数据


hive (gmall)> select * from ads_user_retention_day_count;


---> 2019-02-10 1 112


④ 留存用户比率 retention_count / new_mid_count 即留存个数 / 新增个数 创建表 ads_user_retention_day_rate


hive (gmall)>


drop table if exists ads_user_retention_day_rate;


create table ads_user_retention_day_rate


(


stat_date string comment '统计日期',


create_date string comment '设备新增日期',


retention_day int comment '截止当前日期留存天数',


retention_count bigint comment '留存数量',


new_mid_count string comment '当日设备新增数量',


retention_ratio decimal(10,2) comment '留存率'


) COMMENT '每日用户留存情况'


stored as parquet


location '/warehouse/gmall/ads/ads_user_retention_day_rate/';


导入数据


join ads_new_mid_count --->每日新增设备表


hive (gmall)>


insert into table ads_user_retention_day_rate


select


'2019-02-11' ,


ur.create_date,


ur.retention_day,


ur.retention_count ,


nc.new_mid_count,


ur.retention_count/nc.new_mid_count*100


from


(


select


create_date,


retention_day,


count(*) retention_count


from dws_user_retention_day


where dt='2019-02-11'


group by create_date,retention_day


) ur join ads_new_mid_count nc on nc.create_date=ur.create_date;


查询导入数据


hive (gmall)>select * from ads_user_retention_day_rate;


2019-02-11 2019-02-10 1 112 442 25.34


需求四:沉默用户数


沉默用户:指的是只在安装当天启动过,且启动时间是在一周前


使用日活明细表 dws_uv_detail_day 作为 DWS 层数据


建表语句


hive (gmall)>


drop table if exists ads_slient_count;


create external table ads_slient_count(


dt string COMMENT '统计日期',


slient_count bigint COMMENT '沉默设备数'


)


row format delimited fields terminated by '\t'


location '/warehouse/gmall/ads/ads_slient_count';


导入数据


hive (gmall)>


insert into table ads_slient_count


select


'2019-02-20' dt,


count(*) slient_count


from


(


select mid_id


from dws_uv_detail_day


where dt<='2019-02-20'


group by mid_id


having count(*)=1 and min(dt)<date_add('2019-02-20',-7)


) t1;


需求五:本周回流用户数


本周回流=本周活跃-本周新增-上周活跃


使用日活明细表 dws_uv_detail_day 作为 DWS 层数据


本周回流(上周以前活跃过,上周没活跃,本周活跃了)=本周活跃-本周新增-上周活跃 本周回流=本周活跃 left join 本周新增 left join 上周活跃,且本周新增 id 为 null,上周活跃 id 为 null_大数据视频


建表:


hive (gmall)>


drop table if exists ads_back_count;


create external table ads_back_count(


dt string COMMENT '统计日期',


wk_dt string COMMENT '统计日期所在周',


wastage_count bigint COMMENT '回流设备数'


)


row format delimited fields terminated by '\t'


location '/warehouse/gmall/ads/ads_back_count';


导入数据


hive (gmall)>


insert into table ads_back_count


select


'2019-02-20' dt,


concat(date_add(next_day('2019-02-20','MO'),-7),'_',date_add(next_day('2019-02-20','MO'),-1)) wk_dt,


count(*)


from


(


select t1.mid_id


from


(


select mid_id


from dws_uv_detail_wk


where wk_dt=concat(date_add(next_day('2019-02-20','MO'),-7),'_',date_add(next_day('2019-02-20','MO'),-1))


)t1


left join


(


select mid_id


from dws_new_mid_day


where create_date<=date_add(next_day('2019-02-20','MO'),-1) and create_date>=date_add(next_day('2019-02-20','MO'),-7)


)t2


on t1.mid_id=t2.mid_id


left join


(


select mid_id


from dws_uv_detail_wk


where wk_dt=concat(date_add(next_day('2019-02-20','MO'),-7*2),'_',date_add(next_day('2019-02-20','MO'),-7-1))


)t3


on t1.mid_id=t3.mid_id


where t2.mid_id is null and t3.mid_id is null


)t4;


需求六:流失用户数


流失用户:最近 7 天未登录我们称之为流失用户


使用日活明细表 dws_uv_detail_day 作为 DWS 层数据


建表语句


hive (gmall)>


drop table if exists ads_wastage_count;


create external table ads_wastage_count(


dt string COMMENT '统计日期',


wastage_count bigint COMMENT '流失设备数'


)


row format delimited fields terminated by '\t'


location '/warehouse/gmall/ads/ads_wastage_count';


导入数据


hive (gmall)>


insert into table ads_wastage_count


select


'2019-02-20',


count(*)


from


(


select mid_id


from dws_uv_detail_day


group by mid_id


having max(dt)<=date_add('2019-02-20',-7)


)t1;


需求七:最近连续 3 周活跃用户数


最近 3 周连续活跃的用户:通常是周一对前 3 周的数据做统计,该数据一周计算一次。


使用周活明细表 dws_uv_detail_wk 作为 DWS 层数据


建表语句


hive (gmall)>


drop table if exists ads_continuity_wk_count;


create external table ads_continuity_wk_count(


dt string COMMENT '统计日期,一般用结束周周日日期,如果每天计算一次,可用当天日期',


wk_dt string COMMENT '持续时间',


continuity_count bigint


)


row format delimited fields terminated by '\t'


location '/warehouse/gmall/ads/ads_continuity_wk_count';


导入数据


hive (gmall)>


-- Devices active in each of the last 3 complete weeks (run on Monday).
-- A device with a row in dws_uv_detail_wk for all 3 week partitions has
-- count(*) = 3 after grouping by mid_id.
insert into table ads_continuity_wk_count
select
    '2019-02-20',
    concat(date_add(next_day('2019-02-20','MO'),-7*3),'_',date_add(next_day('2019-02-20','MO'),-1)),
    count(*)
from
(
    select mid_id
    from dws_uv_detail_wk
    -- BUG FIX: the original wrote -73 and -72-1, losing the multiplication
    -- sign; the lower bound is the week starting 3 Mondays back
    -- (-7*3 .. -7*2-1), otherwise the range is off by about two months.
    where wk_dt >= concat(date_add(next_day('2019-02-20','MO'),-7*3),'_',date_add(next_day('2019-02-20','MO'),-7*2-1))
      and wk_dt <= concat(date_add(next_day('2019-02-20','MO'),-7),'_',date_add(next_day('2019-02-20','MO'),-1))
    group by mid_id
    having count(*)=3
) t1;


需求八:最近七天内连续三天活跃用户数


说明:最近 7 天内连续 3 天活跃用户数


使用日活明细表 dws_uv_detail_day 作为 DWS 层数据


建表


hive (gmall)>


drop table if exists ads_continuity_uv_count;


create external table ads_continuity_uv_count(


dt string COMMENT '统计日期',


wk_dt string COMMENT '最近 7 天日期',


continuity_count bigint


) COMMENT '连续活跃设备数'


row format delimited fields terminated by '\t'


location '/warehouse/gmall/ads/ads_continuity_uv_count';


导入数据


hive (gmall)>


insert into table ads_continuity_uv_count


select


'2019-02-12',


concat(date_add('2019-02-12',-6),'_','2019-02-12'),


count(*)


from


(


select mid_id


from


(


select mid_id


from


(


select


mid_id,


date_sub(dt,rank) date_dif


from


(


select


mid_id,


dt,


rank() over(partition by mid_id order by dt) rank


from dws_uv_detail_day


where dt>=date_add('2019-02-12',-6) and dt<='2019-02-12'


)t1


)t2


group by mid_id,date_dif


having count(*)>=3


)t3


group by mid_id


)t4;


ODS 层跟原始字段要一模一样;


DWD 层   dwd_order_info 订单表   dwd_order_detail 订单详情(订单和商品)   dwd_user_info 用户表   dwd_payment_info 支付流水   dwd_sku_info 商品表(增加分类)


每日用户行为宽表 dws_user_action


字段:user_id、order_count、order_amount、payment_count、payment_amount 、comment_count


drop table if exists dws_user_action;


create external table dws_user_action(


user_id string comment '用户 id',


order_count bigint comment '用户下单数',


order_amount decimal(16, 2) comment '下单金额',


payment_count bigint comment '支付次数',


payment_amount decimal(16, 2) comment '支付金额',


comment_count bigint comment '评论次数'


)comment '每日用户行为宽表'


partitioned by(dt string)


stored as parquet


location '/warehouse/gmall/dws/dws_user_action/'


tblproperties("parquet.compression"="snappy");


导入数据


0 占位符,第一个字段要有别名


with tmp_order as(


select user_id, count(*) order_count, sum(oi.total_amount) order_amount from dwd_order_info oi


where date_format(oi.create_time, 'yyyy-MM-dd')='2019-02-10' group by user_id


),


tmp_payment as(


select user_id, count(*) payment_count, sum(pi.total_amount) payment_amount from dwd_payment_info pi


where date_format(pi.payment_time, 'yyyy-MM-dd')='2019-02-10' group by user_id


),


tmp_comment as(


select user_id, count(*) comment_count from dwd_comment_log c


where date_format(c.dt, 'yyyy-MM-dd')='2019-02-10' group by user_id


)


insert overwrite table dws_user_action partition(dt='2019-02-10')


select user_actions.user_id, sum(user_actions.order_count), sum(user_actions.order_amount),


sum(user_actions.payment_count),


sum(user_actions.payment_amount),


sum(user_actions.comment_count) from(


select user_id, order_count, order_amount, 0 payment_count, 0 payment_amount, 0 comment_count from tmp_order


union all select user_id, 0, 0, payment_count, payment_amount, 0 from tmp_payment


union all select user_id, 0, 0, 0, 0, comment_count from tmp_comment


) user_actions group by user_id;


GMV 拍下订单金额;包括付款和未付款;


建表 ads_gmv_sum_day 语句:


drop table if exists ads_gmv_sum_day;


create table ads_gmv_sum_day(


dt string comment '统计日期',


gmv_count bigint comment '当日 GMV 订单个数',


gmv_amount decimal(16, 2) comment '当日 GMV 订单总额',


gmv_payment decimal(16, 2) comment '当日支付金额'


) comment 'GMV'


row format delimited fields terminated by '\t'


location '/warehouse/gmall/ads/ads_gmv_sum_day';


导入数据:from 用户行为宽表 dws_user_action


sum(order_count) gmv_count 、 sum(order_amount) gmv_amount 、sum(payment_amount) payment_amount 过滤日期,以 dt 分组;


insert into table ads_gmv_sum_day


select '2019-02-10' dt, sum(order_count) gmv_count, sum(order_amount) gmv_amount, sum(payment_amount) gmv_payment


from dws_user_action where dt='2019-02-10' group by dt;


编写脚本:


#!/bin/bash
# Load ads_gmv_sum_day (daily GMV order count / order amount / payment amount)
# from the user-action wide table for one business date.
# Usage: <script> [yyyy-MM-dd]   (defaults to yesterday)
# BUG FIX: the original shebang was `#/bin/bash` (missing the `!`).

APP=gmall
hive=/opt/module/hive/bin/hive

# Use the date passed as $1 when present; otherwise default to yesterday.
if [ -n "$1" ]; then
    do_date=$1
else
    # BUG FIX: command substitution $() was missing in the original assignment.
    do_date=$(date -d "-1 day" +%F)
fi

sql="
insert into table "$APP".ads_gmv_sum_day
select '$do_date' dt, sum(order_count) gmv_count, sum(order_amount) gmv_amount, sum(payment_amount) gmv_payment
-- BUG FIX: the original FROM clause was garbled (from \"do_date' group by dt;);
-- restored from the interactive version above.
from "$APP".dws_user_action
where dt='$do_date'
group by dt;
"

# BUG FIX: the original ended with `sql";` and never invoked hive.
$hive -e "$sql"


需求十:转化率=新增用户/日活用户


ads_user_convert_day


dt


uv_m_count 当日活跃设备


new_m_count 当日新增设备


new_m_ratio 新增占日活比率


ads_uv_count 用户活跃数(在行为数仓中;) day_count dt


ads_new_mid_count 用户新增表(行为数仓中) new_mid_count create_date


建表 ads_user_convert_day


-- Daily new-device / active-device conversion table (ADS layer).
drop table if exists ads_user_convert_day;
create table ads_user_convert_day(
dt string comment '统计日期',
uv_m_count bigint comment '当日活跃设备',
new_m_count bigint comment '当日新增设备',
-- NOTE(review): spelled new_m_radio, but the spec above calls it new_m_ratio.
-- Likely a typo carried through the pipeline; renaming the column would break
-- any existing readers, so it is only flagged here — TODO confirm downstream use.
new_m_radio decimal(10, 2) comment '当日新增占日活比率'
)comment '转化率'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_user_convert_day/';


数据导入 cast(sum( uc.nmc)/sum( uc.dc)*100 as decimal(10,2)) new_m_ratio ;使用 union all


-- New-device-to-active-device conversion for the day. The two legs come from
-- disjoint sources (active counts vs. new-device counts), so UNION ALL is
-- correct — no dedup is wanted.
insert into table ads_user_convert_day
select
    '2019-02-10',
    sum(uc.dc)  sum_dc,
    sum(uc.nmc) sum_nmc,
    cast(sum(uc.nmc) / sum(uc.dc) * 100 as decimal(10, 2)) new_m_radio
from (
    select day_count dc, 0 nmc
    from ads_uv_count
    where dt = '2019-02-10'
    union all
    select 0 dc, new_mid_count
    from ads_new_mid_count
    where create_date = '2019-02-10'
) uc;


访问到下单转化率| 下单到支付转化率


ads_user_action_convert_day


dt


total_visitor_m_count 总访问人数


order_u_count 下单人数


visitor2order_convert_ratio 访问到下单转化率


payment_u_count 支付人数


order2payment_convert_ratio 下单到支付转化率


dws_user_action (宽表中)


user_id


order_count


order_amount


payment_count


payment_amount


comment_count


ads_uv_count 用户活跃数(行为数仓中)


dt


day_count


wk_count


mn_count


is_weekend


is_monthend


建表


-- Funnel-ratio table (ADS layer): visit→order and order→payment conversion.
drop table if exists ads_user_action_convert_day;
create table ads_user_action_convert_day(
    dt string comment '统计日期',
    total_visitor_m_count bigint comment '总访问人数',
    order_u_count bigint comment '下单人数',
    visitor2order_convert_radio decimal(10, 2) comment '访问到下单转化率',
    payment_u_count bigint comment '支付人数',
    order2payment_convert_radio decimal(10, 2) comment '下单到支付的转化率'
) COMMENT '用户行为漏斗分析'
row format delimited fields terminated by '\t'
-- Fixed: location previously pointed at ads_user_convert_day's directory,
-- which would mix this table's files with a different table's data.
location '/warehouse/gmall/ads/ads_user_action_convert_day/'
;


插入数据


-- Daily funnel ratios: visit→order and order→payment, as percentages.
-- Fixed: the implicit comma join (") ua, ads_uv_count uv where ...") is now an
-- explicit CROSS JOIN — ua is a single aggregate row, so the cross join is the
-- intended shape, and the date filter on uv stays in WHERE.
insert into table ads_user_action_convert_day
select
    '2019-02-10',
    uv.day_count,
    ua.order_count,
    cast(ua.order_count / uv.day_count * 100 as decimal(10, 2)) visitor2order_convert_radio,
    ua.payment_count,
    cast(ua.payment_count / ua.order_count * 100 as decimal(10, 2)) order2payment_convert_radio
from (
    -- Distinct users who ordered / paid at least once on the day.
    select
        sum(if(order_count > 0, 1, 0))   order_count,
        sum(if(payment_count > 0, 1, 0)) payment_count
    from dws_user_action
    where dt = '2019-02-10'
) ua
cross join ads_uv_count uv
where uv.dt = '2019-02-10';


需求十一:品牌复购率


需求:以月为单位统计,购买 2 次以上商品的用户,用户购买商品明细表 dws_sale_detail_daycount:(宽表)建表 dws_sale_detail_daycount


-- Per-user / per-sku daily purchase detail wide table (DWS layer).
drop table if exists dws_sale_detail_daycount;
create external table dws_sale_detail_daycount(
    user_id string comment '用户 id',
    sku_id string comment '商品 Id',
    user_gender string comment '用户性别',
    user_age string comment '用户年龄',
    user_level string comment '用户等级',
    order_price decimal(10,2) comment '商品价格',
    sku_name string comment '商品名称',
    sku_tm_id string comment '品牌 id',
    sku_category3_id string comment '商品三级品类 id',
    sku_category2_id string comment '商品二级品类 id',
    sku_category1_id string comment '商品一级品类 id',
    sku_category3_name string comment '商品三级品类名称',
    sku_category2_name string comment '商品二级品类名称',
    sku_category1_name string comment '商品一级品类名称',
    spu_id string comment '商品 spu',
    sku_num int comment '购买个数',
    -- Fixed: the two measures below were declared string although they hold
    -- count/sum results and are aggregated downstream (sum(order_count) in the
    -- repurchase report); type them numerically instead of relying on coercion.
    order_count bigint comment '当日下单单数',
    order_amount decimal(16, 2) comment '当日下单金额'
) comment '用户购买商品明细表'
partitioned by(dt string)
stored as parquet
location '/warehouse/gmall/dws/dws_sale_detail_daycount'
tblproperties("parquet.compression"="snappy");


数据导入


ods_order_detail 订单详情表、dwd_user_info 用户表、dwd_sku_info 商品表


-- Populate the per-user / per-sku daily purchase detail wide table from the
-- order-detail fact joined with user and sku dimensions.
with tmp_detail as(
    -- Per (user, sku) on the target day: units bought, order-line count, and
    -- line amount. Fixed: 'count()' is invalid (must be count(*)) and
    -- 'order_pricesku_num' had its '*' eaten in transcription.
    select
        user_id,
        sku_id,
        sum(sku_num) sku_num,
        count(*) order_count,
        sum(od.order_price * sku_num) order_amount
    from ods_order_detail od
    where od.dt = '2019-02-10' and user_id is not null
    group by user_id, sku_id
)
insert overwrite table dws_sale_detail_daycount partition(dt='2019-02-10')
select
    tmp_detail.user_id,
    tmp_detail.sku_id,
    u.gender,
    -- Age derived from birthday at the statistics date.
    months_between('2019-02-10', u.birthday)/12 age,
    u.user_level,
    price,
    sku_name,
    tm_id,
    category3_id,
    category2_id,
    category1_id,
    category3_name,
    category2_name,
    category1_name,
    spu_id,
    tmp_detail.sku_num,
    tmp_detail.order_count,
    tmp_detail.order_amount
from tmp_detail
-- Left joins keep the sale row even when a dimension row is missing.
left join dwd_user_info u on u.id = tmp_detail.user_id and u.dt = '2019-02-10'
left join dwd_sku_info s on s.id = tmp_detail.sku_id and s.dt = '2019-02-10';


ADS 层 品牌复购率报表分析 建表 ads_sale_tm_category1_stat_mn


buycount 购买人数、buy_twice_last 两次以上购买人数、


buy_twice_last_ratio '单次复购率'、


buy_3times_last '三次以上购买人数',


buy_3times_last_ratio '多次复购率'


-- Monthly brand / level-1-category repurchase-rate report table (ADS layer).
-- Fixed: 'drop table' without IF EXISTS fails on first run.
drop table if exists ads_sale_tm_category1_stat_mn;
create table ads_sale_tm_category1_stat_mn
(
    tm_id string comment '品牌 id ' ,
    category1_id string comment '1 级品类 id ',
    category1_name string comment '1 级品类名称 ',
    buycount bigint comment '购买人数',
    buy_twice_last bigint comment '两次以上购买人数',
    buy_twice_last_ratio decimal(10,2) comment '单次复购率',
    buy_3times_last bigint comment '三次以上购买人数',
    buy_3times_last_ratio decimal(10,2) comment '多次复购率' ,
    stat_mn string comment '统计月份',
    stat_date string comment '统计日期'
) COMMENT '复购率统计'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_sale_tm_category1_stat_mn/'
;


插入数据


-- Month-to-date repurchase rate per (brand, level-1 category): the inner
-- query rolls orders up to (brand, category, user); the outer query counts
-- buyers by purchase-frequency threshold and derives the ratios.
insert into table ads_sale_tm_category1_stat_mn
select
    mn.sku_tm_id,
    mn.sku_category1_id,
    mn.sku_category1_name,
    sum(if(mn.order_count >= 1, 1, 0)) buycount,
    sum(if(mn.order_count >= 2, 1, 0)) buyTwiceLast,
    sum(if(mn.order_count >= 2, 1, 0)) / sum(if(mn.order_count >= 1, 1, 0)) buyTwiceLastRatio,
    sum(if(mn.order_count >= 3, 1, 0)) buy3timeLast,
    sum(if(mn.order_count >= 3, 1, 0)) / sum(if(mn.order_count >= 1, 1, 0)) buy3timeLastRadio,
    date_format('2019-02-10', 'yyyy-MM') stat_mn,
    '2019-02-10' stat_date
from (
    -- Total orders per user within the month, keyed by brand and category.
    select
        sd.sku_tm_id,
        sd.sku_category1_id,
        sd.sku_category1_name,
        user_id,
        sum(order_count) order_count
    from dws_sale_detail_daycount sd
    where date_format(dt, 'yyyy-MM') <= date_format('2019-02-10', 'yyyy-MM')
    group by sd.sku_tm_id, sd.sku_category1_id, user_id, sd.sku_category1_name
) mn
group by mn.sku_tm_id, mn.sku_category1_id, mn.sku_category1_name
;


数据导入脚本


1)在/home/kris/bin 目录下创建脚本 ads_sale.sh


[kris@hadoop101 bin]$ vim ads_sale.sh


#!/bin/bash
# Monthly brand repurchase-rate load into ads_sale_tm_category1_stat_mn.
# Fixed: date call lacked command substitution; two comment lines had lost
# their '#'; the final line was a bare sql" that never invoked hive.

# Variables kept at the top for easy modification.
APP=gmall
hive=/opt/module/hive/bin/hive

# Use the date passed as $1; otherwise default to yesterday.
if [ -n "$1" ] ;then
    do_date=$1
else
    do_date=$(date -d "-1 day" +%F)
fi

sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert into table "$APP".ads_sale_tm_category1_stat_mn
select
    mn.sku_tm_id,
    mn.sku_category1_id,
    mn.sku_category1_name,
    sum(if(mn.order_count>=1,1,0)) buycount,
    sum(if(mn.order_count>=2,1,0)) buyTwiceLast,
    sum(if(mn.order_count>=2,1,0))/sum( if(mn.order_count>=1,1,0)) buyTwiceLastRatio,
    sum(if(mn.order_count>=3,1,0)) buy3timeLast ,
    sum(if(mn.order_count>=3,1,0))/sum( if(mn.order_count>=1,1,0)) buy3timeLastRatio ,
    date_format('\$do_date' ,'yyyy-MM') stat_mn,
    '\$do_date' stat_date
from
(
    select od.sku_tm_id,
        od.sku_category1_id,
        od.sku_category1_name,
        user_id ,
        sum(order_count) order_count
    from "$APP".dws_sale_detail_daycount od
    where date_format(dt,'yyyy-MM')<=date_format('\$do_date' ,'yyyy-MM')
    group by od.sku_tm_id, od.sku_category1_id, user_id, od.sku_category1_name
) mn
group by mn.sku_tm_id, mn.sku_category1_id, mn.sku_category1_name;
"
$hive -e "$sql"


增加脚本执行权限


[kris@hadoop101 bin]$ chmod 777 ads_sale.sh


执行脚本导入数据


[kris@hadoop101 bin]$ ads_sale.sh 2019-02-11


查看导入数据


hive (gmall)>select * from ads_sale_tm_category1_stat_mn limit 2;


品牌复购率结果输出到 MySQL


1)在 MySQL 中创建 ads_sale_tm_category1_stat_mn 表


-- MySQL sink table for the Sqoop export of ads_sale_tm_category1_stat_mn.
-- NOTE(review): the count/ratio columns are varchar(200) presumably because
-- the export arrives as tab-separated text; if MySQL-side arithmetic is
-- needed, consider bigint/decimal — TODO confirm with consumers.
create table ads_sale_tm_category1_stat_mn
(
    tm_id varchar(200) comment '品牌 id ' ,
    category1_id varchar(200) comment '1 级品类 id ',
    category1_name varchar(200) comment '1 级品类名称 ',
    buycount varchar(200) comment '购买人数',
    buy_twice_last varchar(200) comment '两次以上购买人数',
    buy_twice_last_ratio varchar(200) comment '单次复购率',
    buy_3times_last varchar(200) comment '三次以上购买人数',
    buy_3times_last_ratio varchar(200) comment '多次复购率' ,
    stat_mn varchar(200) comment '统计月份',
    stat_date varchar(200) comment '统计日期'
);  -- fixed: terminating semicolon was missing


2)编写 Sqoop 导出脚本


在/home/kris/bin 目录下创建脚本 sqoop_export.sh


[kris@hadoop101 bin]$ vim sqoop_export.sh


#!/bin/bash
# Export ADS result tables from HDFS to MySQL with Sqoop.
db_name=gmall

# $1: table name; reads /warehouse/<db>/ads/<table> and upserts into MySQL.
export_data() {
/opt/module/sqoop/bin/sqoop export \
--connect "jdbc:mysql://hadoop101:3306/${db_name}?useUnicode=true&characterEncoding=utf-8" \
--username root \
--password 123456 \
--table $1 \
--num-mappers 1 \
--export-dir /warehouse/$db_name/ads/$1 \
--input-fields-terminated-by "\t" \
--update-key "tm_id,category1_id,stat_mn,stat_date" \
--update-mode allowinsert \
--input-null-string '\N' \
--input-null-non-string '\N'
}
# NOTE(review): --export-dir read '/warehouse/1' in the transcription; the
# warehouse layout elsewhere in this file ('/warehouse/gmall/ads/<table>')
# implies '/warehouse/$db_name/ads/$1' — verify against the cluster.

case $1 in
"ads_sale_tm_category1_stat_mn")
export_data "ads_sale_tm_category1_stat_mn"
;;
"all")
export_data "ads_sale_tm_category1_stat_mn"
;;
esac


3)执行 Sqoop 导出脚本


[kris@hadoop101 bin]$ chmod 777 sqoop_export.sh


[kris@hadoop101 bin]$ sqoop_export.sh all


4)在 MySQL 中查看结果


SELECT * FROM ads_sale_tm_category1_stat_mn;


需求十二:求每个等级的用户对应的复购率前十的商品排行


1)每个等级,每种商品,买一次的用户数,买两次的用户数=》得出复购率


2)利用开窗函数,取每个等级的前十


3)形成脚本


用户购买明细宽表 dws_sale_detail_daycount


① t1--按 user_level, sku_id, user_id 统计下单次数


-- Step ① (t1): per (user_level, sku_id, user_id), total orders placed in the
-- statistics month. The 'limit 10' is only for eyeballing this intermediate
-- result; later steps use the same query without it.
select
user_level,
sku_id,
user_id,
sum(order_count) order_count_sum
from dws_sale_detail_daycount
where date_format(dt, 'yyyy-MM') = date_format('2019-02-13', 'yyyy-MM')
group by user_level, sku_id, user_id limit 10;


② t2 --求出每个等级,每种商品,买一次的用户数,买两次的用户数 得出复购率


-- Step ② (t2): per (user_level, sku_id) — number of buyers, number of repeat
-- buyers, and the repurchase rate as a percentage.
select
    per_user.user_level,
    per_user.sku_id,
    sum(if(per_user.order_count_sum > 0, 1, 0)) buyOneCount,
    sum(if(per_user.order_count_sum > 1, 1, 0)) buyTwiceCount,
    sum(if(per_user.order_count_sum > 1, 1, 0)) / sum(if(per_user.order_count_sum > 0, 1, 0)) * 100 buyTwiceCountRatio,
    '2019-02-13' stat_date
from (
    -- Monthly order total per (level, sku, user).
    select
        user_level,
        sku_id,
        user_id,
        sum(order_count) order_count_sum
    from dws_sale_detail_daycount
    where date_format(dt, 'yyyy-MM') = date_format('2019-02-13', 'yyyy-MM')
    group by user_level, sku_id, user_id
) per_user
group by per_user.user_level, per_user.sku_id;


③ t3 --按用户等级分区,复购率排序


-- Step ③ (t3): name the per-(level, sku) repurchase aggregation t2 so the
-- ranking step can wrap it. Fixed: the statement was unterminated.
select
    t2.user_level,
    t2.sku_id,
    t2.buyOneCount,
    t2.buyTwiceCount,
    t2.buyTwiceCountRatio,
    t2.stat_date
from (
    select
        t1.user_level,
        t1.sku_id,
        sum(if(t1.order_count_sum > 0, 1, 0)) buyOneCount,
        sum(if(t1.order_count_sum > 1, 1, 0)) buyTwiceCount,
        sum(if(t1.order_count_sum > 1, 1, 0)) / sum(if(t1.order_count_sum > 0, 1, 0)) * 100 buyTwiceCountRatio,
        '2019-02-13' stat_date
    from (
        -- Monthly order total per (level, sku, user).
        select
            user_level,
            sku_id,
            user_id,
            sum(order_count) order_count_sum
        from dws_sale_detail_daycount
        where date_format(dt, 'yyyy-MM') = date_format('2019-02-13', 'yyyy-MM')
        group by user_level, sku_id, user_id
    ) t1
    group by t1.user_level, t1.sku_id
) t2;


④ -分区排序 rank()


-- Step ④: rank each sku's repurchase rate WITHIN its user level.
-- Fixed: the window was 'partition by t2.sku_id order by t2.buyTwiceCount'
-- (ascending), which ranks levels inside each sku and can never produce the
-- required per-level top-10; the requirement ("按用户等级分区, 复购率排序",
-- top 10 per level) calls for partitioning by user_level and ordering by the
-- repurchase rate descending. Also fixed: statement was unterminated.
select
    t2.user_level,
    t2.sku_id,
    t2.buyOneCount,
    t2.buyTwiceCount,
    t2.buyTwiceCountRatio,
    rank() over(partition by t2.user_level order by t2.buyTwiceCountRatio desc) rankNo
from (
    select
        t1.user_level,
        t1.sku_id,
        sum(if(t1.order_count_sum > 0, 1, 0)) buyOneCount,
        sum(if(t1.order_count_sum > 1, 1, 0)) buyTwiceCount,
        sum(if(t1.order_count_sum > 1, 1, 0)) / sum(if(t1.order_count_sum > 0, 1, 0)) * 100 buyTwiceCountRatio,
        '2019-02-13' stat_date
    from (
        -- Monthly order total per (level, sku, user).
        select
            user_level,
            sku_id,
            user_id,
            sum(order_count) order_count_sum
        from dws_sale_detail_daycount
        where date_format(dt, 'yyyy-MM') = date_format('2019-02-13', 'yyyy-MM')
        group by user_level, sku_id, user_id
    ) t1
    group by t1.user_level, t1.sku_id
) t2;


⑤ 作为子查询取前 10


-- Step ⑤: final result — top-10 skus by repurchase rate for EACH user level.
-- Fixed: the window was 'partition by t2.sku_id order by t2.buyTwiceCount';
-- combined with 'where rankNo <= 10' that cannot yield a per-level top-10
-- (each sku partition holds at most a handful of levels). Partition by
-- user_level and order by the repurchase rate descending instead.
select t3.user_level, t3.sku_id, t3.buyOneCount, t3.buyTwiceCount, t3.buyTwiceCountRatio, t3.rankNo
from (
    select
        t2.user_level,
        t2.sku_id,
        t2.buyOneCount,
        t2.buyTwiceCount,
        t2.buyTwiceCountRatio,
        rank() over(partition by t2.user_level order by t2.buyTwiceCountRatio desc) rankNo
    from (
        select
            t1.user_level,
            t1.sku_id,
            sum(if(t1.order_count_sum > 0, 1, 0)) buyOneCount,
            sum(if(t1.order_count_sum > 1, 1, 0)) buyTwiceCount,
            sum(if(t1.order_count_sum > 1, 1, 0)) / sum(if(t1.order_count_sum > 0, 1, 0)) * 100 buyTwiceCountRatio,
            '2019-02-13' stat_date
        from (
            -- Monthly order total per (level, sku, user).
            select
                user_level,
                sku_id,
                user_id,
                sum(order_count) order_count_sum
            from dws_sale_detail_daycount
            where date_format(dt, 'yyyy-MM') = date_format('2019-02-13', 'yyyy-MM')
            group by user_level, sku_id, user_id
        ) t1
        group by t1.user_level, t1.sku_id
    ) t2
) t3 where rankNo <= 10;


文章转载来源于大数据学习与分享


用户头像

@零度

关注

关注尚硅谷,轻松学IT 2021.11.23 加入

IT培训 www.atguigu.com

评论

发布
暂无评论
大数据培训用SQL来实现用户行为漏斗分析_大数据开发_@零度_InfoQ写作社区