练习 1
1.查看可用于 Hadoop 分布式文件系统的命令:
# Change to the MapReduce lab directory, then list the commands available
# for the Hadoop Distributed File System ("hadoop fs" with no arguments
# prints the usage/help text).
cd /home/oracle/movie/moviework/mapreduce
hadoop fs
2.列出 /user/oracle 的内容
hadoop fs -ls /user/oracle
3.在 /user/oracle 文件夹中创建一个名为“my_stuff”的子目录,然后确保已创建该目录:
# Create the "my_stuff" subdirectory under /user/oracle, then list
# /user/oracle to verify it was created.
hadoop fs -mkdir /user/oracle/my_stuff
hadoop fs -ls /user/oracle
4.删除“my_stuff”目录,然后确保该目录已经删除:
# Remove the "my_stuff" directory recursively (-r), then list /user/oracle
# to confirm it is gone. Absolute paths are used for consistency with
# step 3, which created the directory as /user/oracle/my_stuff; the
# original relative form only resolved correctly from the oracle user's
# HDFS home directory.
hadoop fs -rm -r /user/oracle/my_stuff
hadoop fs -ls /user/oracle
5.检查压缩的 Avro 应用日志:
# Inspect the compressed Avro application log using the helper script
# shipped with the lab.
cd /home/oracle/movie/moviework/mapreduce
./read_avro_file.sh
6.查看可用于 Hadoop 分布式文件系统的命令并将压缩的 Avro 文件复制到 HDFS 中:
# Show the HDFS command list again, then copy the Avro log file from the
# local filesystem into HDFS under /user/oracle/moviework/applog_avro.
hadoop fs
hadoop fs -put movieapp_3months.avro /user/oracle/moviework/applog_avro
7.通过列出 HDFS 中的目录内容来验证复制:
hadoop fs -ls /user/oracle/moviework/applog_avro
练习 2
1.在 Linux 提示符下键入 hive,进入 Hive 命令行:
hive
2.新建一个名为 moviework 的 hive 数据库。确保该数据库已经成功创建:
-- Create the moviework database, then list all databases to confirm it
-- exists. (HiveQL keywords are case-insensitive; upper case matches the
-- DDL style used later in this lab.)
CREATE DATABASE moviework;
SHOW DATABASES;
3.查看 avro 文件的模式定义,然后使用该模式文件定义一个表
hadoop fs -cat moviework/schemas/activity.avsc
4.要在数据库中创建表,您可以完全限定表名称(即在表名前加上数据库),也可以指定您希望所有 DDL 和 DML 操作应用于特定数据库。为简单起见,您将对 moviework 数据库应用后续操作:
-- Make moviework the current database so subsequent DDL/DML statements
-- need not be fully qualified.
use moviework;
-- Define an EXTERNAL table over the Avro log files already loaded into
-- HDFS. The Avro SerDe reads the column layout from the schema file
-- named in avro.schema.url, so no column list is declared here; being
-- external, dropping the table leaves the files in LOCATION untouched.
CREATE EXTERNAL TABLE movieapp_log_avro
ROW FORMAT
SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
WITH SERDEPROPERTIES ('avro.schema.url'='hdfs://bigdatalite.localdomain/user/oracle/moviework/schemas/activity.avsc')
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION '/user/oracle/moviework/applog_avro';
-- Sanity check: show the first 20 rows of the new table.
SELECT * FROM movieapp_log_avro LIMIT 20;
5.HiveQL 支持许多标准 SQL 操作。找到日志文件中最小和最大的时间段:
SELECT MIN(time), MAX(time) FROM movieapp_log_avro;
练习 3
1.编写一个查询,只选择与启动、浏览、填写或购买电影相对应的点击。使用一个 CASE 语句将 RECOMMENDED 列转换成整数,其中“Y”代表 1,“N”代表 0。同样,确保 GENREID 不为 null。仅包含前 25 行:
-- Select only the clicks whose activity code is one of the four types of
-- interest (2, 4, 5, 11), mapping recommended 'Y'/'N' to 1/0 and
-- replacing non-positive genre ids with -1; first 25 rows only.
SELECT
    custid,
    movieid,
    CASE WHEN genreid > 0 THEN genreid ELSE -1 END AS genreid,
    time,
    CASE WHEN recommended = 'Y' THEN 1 ELSE 0 END AS recommended,
    activity,
    price
FROM movieapp_log_avro
WHERE activity IN (2, 4, 5, 11)
LIMIT 25;
2.编写一个查询,选择每部电影的客户 ID、电影 ID、推荐状态和最近评分。
-- For each (customer, movie, genre) combination, keep the row recorded
-- at the most recent activity-1 event: subquery m2 computes MAX(time)
-- per group, and the self-join retains only m1 rows whose time equals
-- that maximum. Both sides are restricted to activity = 1 inside the
-- join condition.
SELECT
m1.custid,
m1.movieid,
m1.time,
m1.activity,
m1.rating
FROM movieapp_log_avro m1
JOIN
-- m2: latest event time per (custid, movieid, genreid, activity) group.
(SELECT
custid,
movieid,
CASE WHEN genreid > 0 THEN genreid ELSE -1 END genreid,
MAX(time) max_time,
activity
FROM movieapp_log_avro
GROUP BY custid,
movieid,
genreid,
activity
) m2
-- Join keys: customer, movie, normalized genre, and the max timestamp;
-- the genre CASE is repeated on both sides so -1 matches -1.
ON (
m1.custid = m2.custid
AND m1.movieid = m2.movieid
AND CASE WHEN m1.genreid > 0 THEN m1.genreid ELSE -1 END = CASE WHEN m2.genreid > 0 THEN m2.genreid ELSE -1 END
AND m1.time = m2.max_time
AND m1.activity = 1
AND m2.activity = 1
) LIMIT 25;
3.将上面两个查询的结果加载到一个临时表中。首先,创建临时表:
-- Staging table that will hold the combined results of the two previous
-- queries (click/purchase rows plus latest-rating rows). Stored as
-- tab-delimited text so it is easy to inspect and export.
CREATE TABLE movieapp_log_stage (
custId INT,
movieId INT,
genreId INT,
time STRING,
recommended INT,
activity INT,
rating INT,
sales FLOAT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
4.接下来,将查询结果加载到这个临时表中:
-- Populate the staging table with the union of the two earlier queries.
-- Each branch pads the column it lacks (rating vs. price) with a typed
-- NULL so both sides produce the same 8-column layout; the UNION ALL is
-- wrapped in a subquery because older Hive versions only accept UNION
-- inside a FROM clause.
INSERT OVERWRITE TABLE movieapp_log_stage
SELECT * FROM (
-- Branch 1: click/purchase events (activities 2, 4, 5, 11); no rating.
SELECT custid,
movieid,
CASE WHEN genreid > 0 THEN genreid ELSE -1 END genreid,
time,
CAST((CASE recommended WHEN 'Y' THEN 1 ELSE 0 END) AS INT) recommended,
activity,
cast(null AS INT) rating,
price
FROM movieapp_log_avro
WHERE activity IN (2,4,5,11)
UNION ALL
-- Branch 2: latest rating per (customer, movie, genre); no price.
SELECT
m1.custid,
m1.movieid,
CASE WHEN m1.genreid > 0 THEN m1.genreid ELSE -1 END genreid,
m1.time,
CAST((CASE m1.recommended WHEN 'Y' THEN 1 ELSE 0 END) AS INT) recommended,
m1.activity,
m1.rating,
cast(null as float) price
FROM movieapp_log_avro m1
JOIN
-- m2: latest event time per (custid, movieid, genreid, activity) group.
(SELECT
custid,
movieid,
CASE WHEN genreid > 0 THEN genreid ELSE -1 END genreid,
MAX(time) max_time,
activity
FROM movieapp_log_avro
GROUP BY custid,
movieid,
genreid,
activity
) m2
ON (
m1.custid = m2.custid
AND m1.movieid = m2.movieid
AND CASE WHEN m1.genreid > 0 THEN m1.genreid ELSE -1 END = CASE WHEN m2.genreid > 0 THEN m2.genreid ELSE -1 END
AND m1.time = m2.max_time
AND m1.activity = 1
AND m2.activity = 1
)
) union_result;
可选:Pig
1.启动 Grunt shell 并执行以下语句,使用点击流数据建立一个数据流。注:Pig Latin 语句组装成 MapReduce 作业,然后在执行 DUMP 或 STORE 语句时启动。
# Start the Grunt (Pig interactive) shell.
pig
-- Register the jars that AvroStorage needs: piggybank itself plus its
-- Avro, JSON, Snappy and Jackson dependencies.
REGISTER /usr/lib/pig/piggybank.jar
REGISTER /usr/lib/pig/lib/avro-1.7.4.jar
REGISTER /usr/lib/pig/lib/json-simple-1.1.jar
REGISTER /usr/lib/pig/lib/snappy-java-1.0.4.1.jar
REGISTER /usr/lib/pig/lib/jackson-core-asl-1.8.8.jar
REGISTER /usr/lib/pig/lib/jackson-mapper-asl-1.8.8.jar
-- Load the Avro click-stream data via piggybank's AvroStorage, reading
-- the record layout from the schema file given by its HDFS URL
-- ('no_schema_check' skips reconciling it against the data files).
applogs = LOAD '/user/oracle/moviework/applog_avro'
USING org.apache.pig.piggybank.storage.avro.AvroStorage(
'no_schema_check',
'schema_file',
'hdfs://bigdatalite.localdomain/user/oracle/moviework/schemas/activity.avsc');
DESCRIBE applogs;
-- Take a ~0.1% random sample. Note: Pig is lazy — nothing runs until
-- the DUMP (or a STORE) triggers the MapReduce job.
log_sample = SAMPLE applogs 0.001;
DESCRIBE log_sample;
DUMP log_sample;
2.按电影对 log_sample 分组并转储生成的包。
-- Group the sampled records by movieId and dump the resulting bags.
-- (DUMP upper-cased for consistency with the earlier statements; Pig
-- Latin keywords are case-insensitive, so behavior is unchanged.)
movie_logs = GROUP log_sample BY movieId;
DUMP movie_logs;