// Imports this snippet relies on (ConfigUtil and DateUtil are the project's own utility classes;
// import them from the project's util package).
import java.lang
import java.util.Properties

import com.alibaba.fastjson.{JSON, JSONObject}
import org.apache.flink.streaming.api.functions.ProcessFunction
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaProducer, KafkaSerializationSchema}
import org.apache.flink.table.api.Table
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment
import org.apache.flink.types.Row
import org.apache.flink.util.Collector
import org.apache.kafka.clients.producer.ProducerRecord

case class DwdInfo(iceberg_ods_tbl_name: String, kafka_dwd_topic: String, browse_product_code: String,
                   browse_product_tpcode: String, user_ip: String, obtain_points: String, user_id1: String,
                   user_id2: String, front_product_url: String, log_time: String, browse_product_url: String,
                   id: String, ip: String, login_tm: String, logout_tm: String)
object ProduceODSDataToDWD {
  private val kafkaBrokers: String = ConfigUtil.KAFKA_BROKERS
  def main(args: Array[String]): Unit = {
    // 1. Prepare the environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val tblEnv: StreamTableEnvironment = StreamTableEnvironment.create(env)
    env.enableCheckpointing(5000)
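    // Not in the original code: enableCheckpointing(5000) defaults to EXACTLY_ONCE checkpointing.
    // If you want to make the mode explicit (or tune it), something like the following could be
    // added here:
    //   env.getCheckpointConfig.setCheckpointingMode(org.apache.flink.streaming.api.CheckpointingMode.EXACTLY_ONCE)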
    import org.apache.flink.streaming.api.scala._
    /**
     * 2. Create the catalog first.
     * The Iceberg tables themselves need to be created in Hive ahead of time rather than in this
     * code, because creating an Iceberg table from Flink does not support the
     * "create table if not exists ..." syntax.
     */
    tblEnv.executeSql(
      """
        |create catalog hadoop_iceberg with (
        | 'type'='iceberg',
        | 'catalog-type'='hadoop',
        | 'warehouse'='hdfs://mycluster/lakehousedata'
        |)
      """.stripMargin)
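    /**
     * Not shown in this snippet: per the note above, the Iceberg ODS/DWD tables referenced later
     * (e.g. ODS_BROWSELOG, ODS_USER_LOGIN) are assumed to have been created beforehand, outside
     * this job, against the same warehouse. A hypothetical DDL for one of them (database and
     * column names here are illustrative, not necessarily the project's actual schema):
     *
     *   create table hadoop_iceberg.icebergdb.ODS_BROWSELOG (
     *     log_time string,
     *     user_id string,
     *     user_ip string,
     *     browse_product_code string,
     *     browse_product_tpcode string,
     *     front_product_url string,
     *     browse_product_url string,
     *     obtain_points string
     *   )
     */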
    /**
     * 3. Create the Kafka connector table to consume the ODS data from Kafka.
     */
    tblEnv.executeSql(
      """
        |create table kafka_ods_tbl(
        | iceberg_ods_tbl_name string,
        | kafka_dwd_topic string,
        | data string
        |) with (
        | 'connector' = 'kafka',
        | 'topic' = 'KAFKA-ODS-TOPIC',
        | 'properties.bootstrap.servers'='node1:9092,node2:9092,node3:9092',
        | 'scan.startup.mode'='latest-offset', -- earliest-offset or latest-offset can be specified
        | 'properties.group.id' = 'my-group-id',
        | 'format' = 'json'
        |)
      """.stripMargin)
    val odsTbl: Table = tblEnv.sqlQuery(
      """
        |select iceberg_ods_tbl_name, data, kafka_dwd_topic from kafka_ods_tbl
      """.stripMargin)
    val odsDS: DataStream[Row] = tblEnv.toAppendStream[Row](odsTbl)
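    // Note (not in the original): on newer Flink versions toAppendStream is deprecated; the
    // equivalent call would be:
    //   val odsDS: DataStream[Row] = tblEnv.toDataStream(odsTbl)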
    // 4. Side-output tag used to route data to the Kafka DWD sink
    val kafkaDataTag = new OutputTag[JSONObject]("kafka_data")
    /**
     * 5. Convert the table into the corresponding DataStream, clean the ODS data, and store it
     * into Iceberg. Sample input messages:
     * {
     *   "iceberg_ods_tbl_name": "ODS_BROWSELOG",
     *   "data": "{\"browseProductCode\":\"yyRAteviDb\",\"browseProductTpCode\":\"120\",\"userIp\":\"117.233.5.190\",\"obtainPoints\":\"24\",
     *            \"userId\":\"uid464936\",\"frontProductUrl\":\"https://1P//2RQbHFS2\",\"logTime\":\"1647065858856\",\"browseProductUrl\":\"https://RXm/iOUxR/Tliu9TE0\"}",
     *   "kafka_dwd_topic": "KAFKA-DWD-BROWSE-LOG-TOPIC"
     * }
     *
     * {
     *   "iceberg_ods_tbl_name": "ODS_USER_LOGIN",
     *   "data": "{\"database\":\"lakehousedb\",\"xid\":\"14942\",\"user_id\":\"uid283876\",\"ip\":\"215.148.233.254\",\"commit\":\"true\",
     *            \"id\":\"10052\",\"type\":\"insert\",\"logout_tm\":\"1647066506140\",\"table\":\"mc_user_login\",\"ts\":\"1647066504\",\"login_tm\":\"1647051931534\"}",
     *   "kafka_dwd_topic": "KAFKA-DWD-USER-LOGIN-TOPIC"
     * }
     *
     * The data is converted into a DataStream here and later converted back into a table that is
     * written into Iceberg (a sketch of that step follows the process function below).
     */
    // Only the time fields need cleaning; filter out rows with null fields first, then convert
    // each row into a DwdInfo.
    val dwdDS: DataStream[DwdInfo] = odsDS.filter(row => {
      row.getField(0) != null && row.getField(1) != null && row.getField(2) != null
    }).process(new ProcessFunction[Row, DwdInfo]() {
      override def processElement(row: Row, context: ProcessFunction[Row, DwdInfo]#Context,
                                  collector: Collector[DwdInfo]): Unit = {
        val iceberg_ods_tbl_name: String = row.getField(0).toString
        val data: String = row.getField(1).toString
        val kafka_dwd_topic: String = row.getField(2).toString
        val jsonObj: JSONObject = JSON.parseObject(data)

        // Clean the date fields: convert epoch-millisecond strings into formatted timestamps
        jsonObj.put("logTime", DateUtil.getDateYYYYMMDDHHMMSS(jsonObj.getString("logTime")))
        jsonObj.put("login_tm", DateUtil.getDateYYYYMMDDHHMMSS(jsonObj.getString("login_tm")))
        jsonObj.put("logout_tm", DateUtil.getDateYYYYMMDDHHMMSS(jsonObj.getString("logout_tm")))
        // Parse the nested json fields; fields absent for a given event type remain null
        val browse_product_code: String = jsonObj.getString("browseProductCode")
        val browse_product_tpcode: String = jsonObj.getString("browseProductTpCode")
        val user_ip: String = jsonObj.getString("userIp")
        val obtain_points: String = jsonObj.getString("obtainPoints")
        val user_id1: String = jsonObj.getString("user_id")
        val user_id2: String = jsonObj.getString("userId")
        val front_product_url: String = jsonObj.getString("frontProductUrl")
        val log_time: String = jsonObj.getString("logTime")
        val browse_product_url: String = jsonObj.getString("browseProductUrl")
        val id: String = jsonObj.getString("id")
        val ip: String = jsonObj.getString("ip")
        val login_tm: String = jsonObj.getString("login_tm")
        val logout_tm: String = jsonObj.getString("logout_tm")
        // Add the target DWD sink topic to the json object and emit it to the side output
        jsonObj.put("kafka_dwd_topic", kafka_dwd_topic)
        context.output(kafkaDataTag, jsonObj)
        collector.collect(DwdInfo(iceberg_ods_tbl_name, kafka_dwd_topic, browse_product_code, browse_product_tpcode,
          user_ip, obtain_points, user_id1, user_id2, front_product_url, log_time, browse_product_url,
          id, ip, login_tm, logout_tm))
      }
    })
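    /**
     * The comment above mentions converting the cleaned DataStream back into a table and writing
     * it into Iceberg. That step is not part of this snippet; a minimal sketch, assuming the DWD
     * Iceberg tables were pre-created in the same catalog (table and column names below are
     * illustrative), could look like:
     *
     *   tblEnv.createTemporaryView("dwd_tbl", dwdDS)
     *   tblEnv.executeSql(
     *     """
     *       |insert into hadoop_iceberg.icebergdb.DWD_BROWSELOG
     *       |select log_time, user_id2, user_ip, browse_product_code, browse_product_tpcode,
     *       |       front_product_url, browse_product_url, obtain_points
     *       |from dwd_tbl where iceberg_ods_tbl_name = 'ODS_BROWSELOG'
     *     """.stripMargin)
     */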
    val props = new Properties()
    props.setProperty("bootstrap.servers", kafkaBrokers)
    /**
     * 6. Write the side-output data into its own DWD-layer Kafka topic. Instead of SQL, the plain
     * DataStream API is used here so each record can be routed to the topic named in its
     * "kafka_dwd_topic" field.
     */
    dwdDS.getSideOutput(kafkaDataTag).addSink(new FlinkKafkaProducer[JSONObject](
      "KAFKA-DWD-DEFAULT-TOPIC",
      new KafkaSerializationSchema[JSONObject] {
        override def serialize(jsonObj: JSONObject, aLong: lang.Long): ProducerRecord[Array[Byte], Array[Byte]] = {
          val sinkDwdTopic: String = jsonObj.getString("kafka_dwd_topic")
          new ProducerRecord[Array[Byte], Array[Byte]](sinkDwdTopic, null, jsonObj.toString.getBytes())
        }
      },
      props,
      FlinkKafkaProducer.Semantic.AT_LEAST_ONCE))
    env.execute()
  }
}
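/**
 * DateUtil.getDateYYYYMMDDHHMMSS is a project utility that is not shown here. Judging from the
 * sample data above (epoch-millisecond strings such as "1647065858856"), a minimal sketch of such
 * a helper could look like the following; the real project implementation may differ.
 */
object DateUtilSketch {
  // Convert an epoch-millisecond string to "yyyy-MM-dd HH:mm:ss"; pass null through unchanged so
  // that fields missing from a given event type (e.g. login_tm on browse logs) stay null.
  def getDateYYYYMMDDHHMMSS(ms: String): String = {
    if (ms == null) null
    else new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new java.util.Date(ms.toLong))
  }
}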