# datawarehouse

**Repository Path**: ddwanglife/datawarehouse

## Basic Information

- **Project Name**: datawarehouse
- **Description**: Data warehouse - big data project
- **Primary Language**: Java
- **License**: Apache-2.0
- **Default Branch**: master
- **Homepage**: None
- **GVP Project**: No

## Statistics

- **Stars**: 0
- **Forks**: 1
- **Created**: 2020-07-15
- **Last Updated**: 2021-10-23

## Categories & Tags

**Categories**: Uncategorized
**Tags**: None

## README

# Project Overview

```
ScalaLearn                    Scala language, Spark, and Spark ML learning material, with study notes
log-collector                 Log generator for the data warehouse
spring-boot-echarts-master    Dashboard (big-screen) display system for the warehouse
flume-intercept               Flume log interceptor (filter) toolkit
hive-function                 Hive log-processing function (UDF) toolkit
```

# DataWareHouse

## 1. Introduction

Data warehouse - big data project.

## 2. Data Collection

### Log Format

```json
# Startup log
{
  "action":"1",             // action: start loading = 1, load succeeded = 2, load failed = 3
  "ar":"MX",                // (String) region
  "ba":"Huawei",            // (String) phone brand
  "detail":"102",
  "en":"start",
  "entry":"2",
  "extend1":"",             // extension field Extend1
  "g":"6XH11TR2@gmail.com", // (String) gmail
  "hw":"750*1134",          // (String) height*width, screen resolution
  "l":"es",                 // (String) system language
  "la":"-43.2",             // (double) lat, latitude
  "ln":"-65.9",             // (double) lng, longitude
  "loading_time":"19",      // loading time: from pull-down start until the API returns data (0 on load start; reported only on success or failure)
  "md":"Huawei-7",          // (String) phone model
  "mid":"0",                // (String) unique device id
  "nw":"WIFI",              // (String) network type
  "open_ad_type":"1",
  "os":"8.2.7",             // (String) Android version
  "sr":"E",                 // (String) channel the app was installed from
  "sv":"V2.8.3",            // (String) sdkVersion
  "t":"1595286964091",      // (String) time the log was produced on the client
  "uid":"0",                // (String) user id
  "vc":"11",                // (String) versionCode
  "vn":"1.0.7"              // (String) versionName
}

# Event log
1595294381518|{
  "cm":{                        // common fields
    "ln":"-85.3",               // (double) lng, longitude
    "sv":"V2.1.9",              // (String) sdkVersion
    "os":"8.0.1",               // (String) Android version
    "g":"R3083858@gmail.com",   // (String) gmail
    "mid":"2",                  // (String) unique device id
    "nw":"3G",                  // (String) network type
    "l":"pt",                   // (String) system language
    "vc":"8",                   // (String) versionCode
    "hw":"640*960",             // (String) height*width, screen resolution
    "ar":"MX",                  // (String) region
    "uid":"2",                  // (String) user id
    "t":"1595203106461",        // (String) time the log was produced on the client
    "la":"21.6",                // (double) lat, latitude
    "md":"HTC-9",               // (String) phone model
    "vn":"1.2.5",               // (String) versionName
    "ba":"HTC",                 // (String) phone brand
    "sr":"K"                    // (String) channel the app was installed from
  },
  "ap":"app",                   // data source of the project: app or pc
  "et":[                        // events
    {
      "ett":"1595248652565",    // time the event was produced on the client
      "en":"newsdetail",        // event name
      "kv":{                    // event payload, arbitrary key-value pairs
        "entry":"1",
        "goodsid":"0",
        "news_staytime":"4",
        "loading_time":"16",
        "action":"1",
        "showtype":"1",
        "category":"73",
        "type1":""
      }
    },
    {
      "ett":"1595198210890",
      "en":"notification",
      "kv":{
        "ap_time":"1595198045595",
        "action":"4",
        "type":"2",
        "content":""
      }
    },
    {
      "ett":"1595204756330",
      "en":"active_foreground",
      "kv":{
        "access":"1",
        "push_id":"3"
      }
    },
    {
      "ett":"1595219440069",
      "en":"active_background",
      "kv":{
        "active_source":"2"
      }
    },
    {
      "ett":"1595204810031",
      "en":"error",
      "kv":{
        "errorDetail":"at cn.lift.dfdfdf.control.CommandUtil.getInfo(CommandUtil.java:67)\n at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n at java.lang.reflect.Method.invoke(Method.java:606)\n",
        "errorBrief":"at cn.lift.appIn.control.CommandUtil.getInfo(CommandUtil.java:67)"
      }
    }
  ]
}
```
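A start log is a single JSON object, while an event log line prefixes the JSON body with the server timestamp and a `|` separator; the downstream Flume interceptors and Hive functions all rely on this shape. The sketch below is only an illustration of how such a line can be taken apart in Java — the class name is hypothetical, the `org.json` dependency is an assumption, and this is not code from this repository.

```java
import org.json.JSONArray;
import org.json.JSONObject;

// Hypothetical helper, for illustration only: splits an event-log line into its
// server timestamp and JSON payload, then walks the "et" event array.
public class EventLogLineParser {

    public static void parse(String line) {
        // Event log lines look like: server_time|{...json...}
        int sep = line.indexOf('|');
        if (sep < 0) {
            return; // startup logs have no '|' prefix and are plain JSON
        }
        String serverTime = line.substring(0, sep).trim();
        JSONObject body = new JSONObject(line.substring(sep + 1));

        JSONObject common = body.getJSONObject("cm"); // common fields
        JSONArray events = body.getJSONArray("et");   // individual events

        for (int i = 0; i < events.length(); i++) {
            JSONObject event = events.getJSONObject(i);
            System.out.printf("server_time=%s mid=%s event=%s kv=%s%n",
                    serverTime,
                    common.getString("mid"),
                    event.getString("en"),
                    event.getJSONObject("kv"));
        }
    }
}
```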
### 2.1 Hadoop Installation and Usage

#### 2.1.1 HDFS Multi-directory Configuration

If the machines have multiple disks mounted, add them to the Hadoop configuration up front; adding them later requires restarting the Hadoop cluster.

#### 2.1.2 LZO Compression Support

#### 2.1.3 Benchmarking

```shell
# Test write throughput
hadoop jar /opt/module/hadoop-2.7.2/share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-2.7.2-tests.jar TestDFSIO -write -nrFiles 10 -fileSize 128MB

# Test read throughput
hadoop jar /opt/module/hadoop-2.7.2/share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-2.7.2-tests.jar TestDFSIO -read -nrFiles 10 -fileSize 128MB

# Clean up the test data
hadoop jar /opt/module/hadoop-2.7.2/share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-2.7.2-tests.jar TestDFSIO -clean
```

#### 2.1.4 HDFS Parameter Tuning

### 2.2 ZooKeeper Installation and Usage

### 2.3 Log Collection: Flume Installation and Usage

Copy the log generator jar to /opt/module and run:

```shell
java -classpath /opt/module/log-collector-1.0-SNAPSHOT-jar-with-dependencies.jar com.atguigu.appclient.AppMain >/opt/module/test.log
```

vim file-flume-kafka.conf

```properties
a1.sources=r1
a1.channels=c1 c2

# configure source
a1.sources.r1.type = TAILDIR
a1.sources.r1.positionFile = /opt/module/flume/test/log_position.json
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /tmp/logs/app.+
a1.sources.r1.fileHeader = true
a1.sources.r1.channels = c1 c2

#interceptor
a1.sources.r1.interceptors = i1 i2
a1.sources.r1.interceptors.i1.type = com.ddwanglife.flume.interceptor.LogETLInterceptor$Builder
a1.sources.r1.interceptors.i2.type = com.ddwanglife.flume.interceptor.LogTypeInterceptor$Builder

a1.sources.r1.selector.type = multiplexing
a1.sources.r1.selector.header = topic
a1.sources.r1.selector.mapping.topic_start = c1
a1.sources.r1.selector.mapping.topic_event = c2

# configure channel
a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers = 192.168.208.50:9092
a1.channels.c1.kafka.topic = topic_start
a1.channels.c1.parseAsFlumeEvent = false
a1.channels.c1.kafka.consumer.group.id = flume-consumer

a1.channels.c2.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c2.kafka.bootstrap.servers = 192.168.208.50:9092
a1.channels.c2.kafka.topic = topic_event
a1.channels.c2.parseAsFlumeEvent = false
a1.channels.c2.kafka.consumer.group.id = flume-consumer
```

Add the flume-interceptor module, which provides the custom interceptors (see the sketch after the start/stop script below), then start the agent:

```shell
bin/flume-ng agent --name a1 --conf-file conf/file-flume-kafka.conf &
```

Start/stop script:

```shell
#!/bin/bash
case $1 in
"start"){
        echo " -------- starting collection flume -------"
        nohup /usr/local/apache-flume-1.7.0-bin/bin/flume-ng agent --conf-file /usr/local/apache-flume-1.7.0-bin/conf/file-flume-kafka.conf --name a1 -Dflume.root.logger=INFO,LOGFILE >/dev/null 2>&1 &
};;
"stop"){
        echo " -------- stopping collection flume -------"
        ps -ef | grep file-flume-kafka | grep -v grep | awk '{print $2}' | xargs kill
};;
esac
```
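file-flume-kafka.conf above references two custom interceptors from the flume-intercept module: an ETL interceptor that drops malformed lines, and a type interceptor that fills the `topic` header consumed by the multiplexing selector. Their actual source is not reproduced in this README; the following is a minimal sketch of what the type interceptor could look like, under the assumption that start logs can be recognized by their `"en":"start"` field — it is not the project's real `LogTypeInterceptor`.

```java
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

// Minimal sketch of a type interceptor: it inspects the event body and sets the
// "topic" header that the multiplexing selector uses to route start logs to c1
// and event logs to c2.
public class LogTypeInterceptorSketch implements Interceptor {

    @Override
    public void initialize() { }

    @Override
    public Event intercept(Event event) {
        String body = new String(event.getBody(), StandardCharsets.UTF_8);
        Map<String, String> headers = event.getHeaders();
        // Assumption: start logs carry the "start" event name, event logs do not.
        if (body.contains("\"en\":\"start\"")) {
            headers.put("topic", "topic_start");
        } else {
            headers.put("topic", "topic_event");
        }
        return event;
    }

    @Override
    public List<Event> intercept(List<Event> events) {
        List<Event> out = new ArrayList<>(events.size());
        for (Event e : events) {
            out.add(intercept(e));
        }
        return out;
    }

    @Override
    public void close() { }

    // Flume instantiates interceptors through a nested Builder, which is why the
    // configuration references ...LogTypeInterceptor$Builder.
    public static class Builder implements Interceptor.Builder {
        @Override
        public Interceptor build() {
            return new LogTypeInterceptorSketch();
        }

        @Override
        public void configure(Context context) { }
    }
}
```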
Create the consuming Flume agent:

```shell
# Extra Java CLASSPATH elements.
export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:${HADOOP_HOME}/share/hadoop/common
export JAVA_LIBRARY_PATH=${JAVA_LIBRARY_PATH}:${HADOOP_HOME}/lib/native
```

vim kafka-flume-hdfs.conf

```properties
## components
a1.sources=r1 r2
a1.channels=c1 c2
a1.sinks=k1 k2

## source1
a1.sources.r1.type = org.apache.flume.source.kafka.KafkaSource
a1.sources.r1.batchSize = 5000
a1.sources.r1.batchDurationMillis = 2000
a1.sources.r1.kafka.bootstrap.servers = 192.168.208.50:9092
a1.sources.r1.kafka.topics=topic_start

## source2
a1.sources.r2.type = org.apache.flume.source.kafka.KafkaSource
a1.sources.r2.batchSize = 5000
a1.sources.r2.batchDurationMillis = 2000
a1.sources.r2.kafka.bootstrap.servers = 192.168.208.50:9092
a1.sources.r2.kafka.topics=topic_event

## channel1
a1.channels.c1.type = file
a1.channels.c1.checkpointDir = /opt/module/flume/checkpoint/behavior1
a1.channels.c1.dataDirs = /opt/module/flume/data/behavior1/
a1.channels.c1.maxFileSize = 2146435071
a1.channels.c1.capacity = 1000000
a1.channels.c1.keep-alive = 6

## channel2
a1.channels.c2.type = file
a1.channels.c2.checkpointDir = /opt/module/flume/checkpoint/behavior2
a1.channels.c2.dataDirs = /opt/module/flume/data/behavior2/
a1.channels.c2.maxFileSize = 2146435071
a1.channels.c2.capacity = 1000000
a1.channels.c2.keep-alive = 6

## sink1
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /origin_data/gmall/log/topic_start/%Y-%m-%d
a1.sinks.k1.hdfs.filePrefix = logstart-
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
a1.sinks.k1.hdfs.roundUnit = second

## sink2
a1.sinks.k2.type = hdfs
a1.sinks.k2.hdfs.path = /origin_data/gmall/log/topic_event/%Y-%m-%d
a1.sinks.k2.hdfs.filePrefix = logevent-
a1.sinks.k2.hdfs.round = true
a1.sinks.k2.hdfs.roundValue = 10
a1.sinks.k2.hdfs.roundUnit = second

## avoid producing large numbers of small files
a1.sinks.k1.hdfs.rollInterval = 10
a1.sinks.k1.hdfs.rollSize = 134217728
a1.sinks.k1.hdfs.rollCount = 0
a1.sinks.k2.hdfs.rollInterval = 10
a1.sinks.k2.hdfs.rollSize = 134217728
a1.sinks.k2.hdfs.rollCount = 0

## write compressed output files rather than plain DataStream files
a1.sinks.k1.hdfs.fileType = CompressedStream
a1.sinks.k2.hdfs.fileType = CompressedStream
a1.sinks.k1.hdfs.codeC = lzop
a1.sinks.k2.hdfs.codeC = lzop

## wiring
a1.sources.r1.channels = c1
a1.sinks.k1.channel= c1
a1.sources.r2.channels = c2
a1.sinks.k2.channel= c2
```

Start/stop script for the consumer agent:

```shell
#!/bin/bash
case $1 in
"start"){
        echo " -------- starting flume: consume kafka, write to hadoop -------"
        nohup /usr/local/apache-flume-1.7.0-bin/bin/flume-ng agent --conf-file /usr/local/apache-flume-1.7.0-bin/conf/kafka-flume-hdfs.conf --name a1 -Dflume.root.logger=INFO,LOGFILE >/opt/module/flume/log.txt 2>&1 &
};;
"stop"){
        echo " -------- stopping flume: consume kafka, write to hadoop -------"
        ps -ef | grep kafka-flume-hdfs | grep -v grep | awk '{print $2}' | xargs kill
};;
esac
```
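Before starting the consumer agent it is worth confirming that events really arrive in `topic_start` and `topic_event`, for example with `kafka-console-consumer.sh`. The Java equivalent below is purely illustrative — it assumes a kafka-clients 2.x dependency and is not part of this repository.

```java
import java.time.Duration;
import java.util.Collections;
import java.util.Properties;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

// Illustrative check that start logs reach Kafka; assumes kafka-clients 2.x on the classpath.
public class TopicStartChecker {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "192.168.208.50:9092");
        props.put("group.id", "readme-check"); // throwaway consumer group
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("auto.offset.reset", "earliest");

        try (KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props)) {
            consumer.subscribe(Collections.singletonList("topic_start"));
            ConsumerRecords<String, String> records = consumer.poll(Duration.ofSeconds(5));
            for (ConsumerRecord<String, String> record : records) {
                System.out.println(record.value());
            }
        }
    }
}
```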
Cleanup commands used during testing:

```shell
# Delete a file
hadoop fs -rm -r -skipTrash /path_to_file/file_name

# Delete directories
hadoop fs -rm -r -skipTrash /origin_data/gmall/log/topic_start/2020-07-20
hadoop fs -rm -r -skipTrash /origin_data/gmall/log/topic_event/2020-07-20

# Delete the flume log
rm -f /opt/module/flume/log.txt
tail -f /opt/module/flume/log.txt
```

```java
/**
 * Because this Flume configuration uses Hadoop's native libraries, a Hadoop installation
 * is required on the node that uploads to HDFS.
 * See https://blog.csdn.net/Kk_is_me/article/details/106864402 when setting up the LZO
 * environment; the root cause of the error can be seen in the code below.
 */
public Class getCompressorType() {
    if (!isNativeLzoLoaded(this.conf)) {
        throw new RuntimeException("native-lzo library not available");
    }
    return (Class)LzoCompressor.class;
}

public static boolean isNativeLzoLoaded(Configuration conf) {
    assert conf != null : "Configuration cannot be null!";
    return (nativeLzoLoaded && conf.getBoolean("hadoop.native.lib", true));
}
// hadoop.native.lib must be true; mine was set to false (copied from somewhere), which triggered the error above.
```

### 2.4 Kafka Installation and Usage

## 3. User Behavior Data Warehouse

ODS layer: the raw data layer. It stores the original logs and data exactly as loaded, without any processing.

```sql
-- Create a partitioned table whose input format is LZO and whose output is text; JSON parsing happens downstream
drop table if exists ods_start_log;
CREATE EXTERNAL TABLE ods_start_log (`line` string)
PARTITIONED BY (`dt` string)
STORED AS
  INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '/warehouse/gmall/ods/ods_start_log';

-- Load the data (from the hive CLI)
hive (gmall)> load data inpath '/origin_data/gmall/log/topic_start/2020-07-18' into table gmall.ods_start_log partition(dt='2020-07-18');

-- Verify the load
select * from ods_start_log limit 2;
```

DWD layer: cleans the ODS data (remove nulls, dirty records, and out-of-range values; switch from row storage to columnar storage; change the compression format).

```sql
drop table if exists dwd_start_log;
CREATE EXTERNAL TABLE dwd_start_log(
    `mid_id` string,
    `user_id` string,
    `version_code` string,
    `version_name` string,
    `lang` string,
    `source` string,
    `os` string,
    `area` string,
    `model` string,
    `brand` string,
    `sdk_version` string,
    `gmail` string,
    `height_width` string,
    `app_time` string,
    `network` string,
    `lng` string,
    `lat` string,
    `entry` string,
    `open_ad_type` string,
    `action` string,
    `loading_time` string,
    `detail` string,
    `extend1` string
)
PARTITIONED BY (dt string)
location '/warehouse/gmall/dwd/dwd_start_log/';

insert overwrite table dwd_start_log
PARTITION (dt='2020-07-18')
select
    get_json_object(line,'$.mid') mid_id,
    get_json_object(line,'$.uid') user_id,
    get_json_object(line,'$.vc') version_code,
    get_json_object(line,'$.vn') version_name,
    get_json_object(line,'$.l') lang,
    get_json_object(line,'$.sr') source,
    get_json_object(line,'$.os') os,
    get_json_object(line,'$.ar') area,
    get_json_object(line,'$.md') model,
    get_json_object(line,'$.ba') brand,
    get_json_object(line,'$.sv') sdk_version,
    get_json_object(line,'$.g') gmail,
    get_json_object(line,'$.hw') height_width,
    get_json_object(line,'$.t') app_time,
    get_json_object(line,'$.nw') network,
    get_json_object(line,'$.ln') lng,
    get_json_object(line,'$.la') lat,
    get_json_object(line,'$.entry') entry,
    get_json_object(line,'$.open_ad_type') open_ad_type,
    get_json_object(line,'$.action') action,
    get_json_object(line,'$.loading_time') loading_time,
    get_json_object(line,'$.detail') detail,
    get_json_object(line,'$.extend1') extend1
from ods_start_log
where dt='2020-07-18';
```

The event log is parsed with two custom functions: a UDF that flattens the common fields and a UDTF that explodes the event array (see the sketch below):

```sql
create temporary function base_analizer as 'com.atguigu.udf.BaseFieldUDF';
create temporary function flat_analizer as 'com.atguigu.udtf.EventJsonUDTF';

set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table dwd_base_event_log PARTITION (dt='2020-07-18')
select
    mid_id,
    user_id,
    version_code,
    version_name,
    lang,
    source,
    os,
    area,
    model,
    brand,
    sdk_version,
    gmail,
    height_width,
    app_time,
    network,
    lng,
    lat,
    event_name,
    event_json,
    server_time
from
(
select
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[0]  as mid_id,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[1]  as user_id,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[2]  as version_code,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[3]  as version_name,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[4]  as lang,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[5]  as source,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[6]  as os,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[7]  as area,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[8]  as model,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[9]  as brand,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[10] as sdk_version,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[11] as gmail,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[12] as height_width,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[13] as app_time,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[14] as network,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[15] as lng,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[16] as lat,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[17] as ops,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[18] as server_time
from ods_event_log where dt='2020-07-18' and base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la')<>''
) sdk_log lateral view flat_analizer(ops) tmp_k as event_name, event_json;
```
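base_analizer and flat_analizer come from the hive-function module (hivefunction-1.0-SNAPSHOT.jar, added in the script further below); their source is not shown here. To illustrate the contract the query above depends on, a UDF roughly like the following could produce the tab-separated string that `split(...)[n]` indexes into: the requested `cm` fields in key order, then the raw `et` array, then the server timestamp. This is only a sketch, assuming `org.json` is available, and is not the project's actual `BaseFieldUDF`.

```java
import org.apache.hadoop.hive.ql.exec.UDF;
import org.json.JSONObject;

// Sketch of a base-field UDF with the contract the SQL above relies on:
// evaluate(line, "mid,uid,...") returns the requested cm fields, the raw "et"
// array and the server timestamp, joined by '\t'.
public class BaseFieldUDFSketch extends UDF {

    public String evaluate(String line, String jsonKeys) {
        StringBuilder sb = new StringBuilder();
        try {
            // Event log lines look like: server_time|{"cm":{...},"et":[...]}
            String[] parts = line.split("\\|");
            if (parts.length != 2) {
                return ""; // malformed line: filtered out by the <>'' predicate in the SQL
            }
            String serverTime = parts[0].trim();
            JSONObject body = new JSONObject(parts[1].trim());
            JSONObject cm = body.getJSONObject("cm");

            for (String key : jsonKeys.split(",")) {
                sb.append(cm.optString(key.trim())).append('\t');
            }
            sb.append(body.getJSONArray("et").toString()).append('\t');
            sb.append(serverTime);
        } catch (Exception e) {
            return "";
        }
        return sb.toString();
    }
}
```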
The same load is scripted for daily runs:

vim dwd_base_log.sh

```shell
#!/bin/bash

# Variables, defined here so they are easy to change
APP=gmall
hive=/usr/local/hive/bin/hive

# Use the date passed as the first argument if given; otherwise default to yesterday
if [ -n "$1" ] ;then
    do_date=$1
else
    do_date=`date -d "-1 day" +%F`
fi

sql="
add jar /usr/local/hive/hivefunction-1.0-SNAPSHOT.jar;

create temporary function base_analizer as 'com.atguigu.udf.BaseFieldUDF';
create temporary function flat_analizer as 'com.atguigu.udtf.EventJsonUDTF';

set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table "$APP".dwd_base_event_log PARTITION (dt='$do_date')
select
    mid_id,
    user_id,
    version_code,
    version_name,
    lang,
    source,
    os,
    area,
    model,
    brand,
    sdk_version,
    gmail,
    height_width,
    network,
    lng,
    lat,
    app_time,
    event_name,
    event_json,
    server_time
from
(
select
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[0]  as mid_id,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[1]  as user_id,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[2]  as version_code,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[3]  as version_name,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[4]  as lang,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[5]  as source,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[6]  as os,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[7]  as area,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[8]  as model,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[9]  as brand,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[10] as sdk_version,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[11] as gmail,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[12] as height_width,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[13] as app_time,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[14] as network,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[15] as lng,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[16] as lat,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[17] as ops,
    split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[18] as server_time
from "$APP".ods_event_log where dt='$do_date' and base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la')<>''
) sdk_log lateral view flat_analizer(ops) tmp_k as event_name, event_json;
"

$hive -e "$sql"
```
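flat_analizer (`com.atguigu.udtf.EventJsonUDTF`) explodes the `et` array emitted by base_analizer into one row per event with `event_name` and `event_json` columns. The sketch below only shows the general shape of such a UDTF, again assuming `org.json`; it is not the project's actual implementation.

```java
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.json.JSONArray;
import org.json.JSONObject;

// Sketch of a UDTF that turns an "et" JSON array into (event_name, event_json) rows.
public class EventJsonUDTFSketch extends GenericUDTF {

    @Override
    public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
        // Declare the two output columns and their types
        List<String> fieldNames = new ArrayList<>();
        List<ObjectInspector> fieldOIs = new ArrayList<>();
        fieldNames.add("event_name");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("event_json");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    @Override
    public void process(Object[] args) throws HiveException {
        String input = args[0] == null ? "" : args[0].toString();
        if (input.isEmpty()) {
            return;
        }
        try {
            JSONArray events = new JSONArray(input);
            for (int i = 0; i < events.length(); i++) {
                JSONObject event = events.getJSONObject(i);
                // one output row per event: (event name, full event JSON)
                forward(new String[]{event.getString("en"), event.toString()});
            }
        } catch (Exception e) {
            // skip malformed rows instead of failing the whole query
        }
    }

    @Override
    public void close() throws HiveException { }
}
```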
## 4. Business Data Warehouse

```shell
[root@dev-environment ~]# hadoop checknative
20/07/20 09:46:16 INFO bzip2.Bzip2Factory: Successfully loaded & initialized native-bzip2 library system-native
20/07/20 09:46:16 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
Native library checking:
hadoop:  true /usr/local/hadoop-2.7.7/lib/native/libhadoop.so.1.0.0
zlib:    true /lib64/libz.so.1
snappy:  true /lib64/libsnappy.so.1
lz4:     true revision:99
bzip2:   true /lib64/libbz2.so.1
openssl: false Cannot load libcrypto.so (libcrypto.so: cannot open shared object file: No such file or directory)!
```

Install Sqoop.

## 5. Ad-hoc Query Warehouse

## 6. CDH Data Warehouse

## 7. Interview Questions