2012年(91)
分类:
2012-09-27 01:34:57
原文地址:Hive基础 作者:skybin090804
基本类型
整形:
Tinyint 1字节
Smallint 2字节
Int 4字节
Bigint 8字节
布尔型:
Boolean:true false
浮点数:
单精度 float
双精度 double
字符串:
String
复合类型
Structs maps(键值对) list(索引链表)
内建运算符和函数
关系运算符
<
<=
>
>=
is null
is not null
like
rlike
regexpb
算术运算符
+ - * / % & | ^ ~
逻辑运算符
And && 与
Or ||或
Not !非
复合类型运算
结构 A.x
链表A[x] 从0开始计算
映射M[key]
内建函数
Round(double a) return bigint
Floor(double a) return bigint
Ceil(double a) return bigint
Rand() rand(int seed) return double
Concat(string a, string b) return string
Substr(string str, int start) return string
Substr(string str, int start, int length) return string
Upper(string a) return string
Ucase(string a) return string
Lower(string a) return string
Lcase(string a) return string
Trim(string a) return string
Rtrim(string a) return string
Size(map
Size(array
Type Cast(type)
From_unixtime(int unixtime) return string
String to_date(string stringtimestamp) return string
Year(string date) return int
Month(string date) return int
Day(string date) return int
get_json_object(string json_string, string path) return string
hive内建聚合函数
count(1) count(distinct) return int 计数
sum(col) sum(distinct col) return double 求和
avg(col) avg(distinct col) return double 求平均值
min(col) return double 最小值
max(col) return double 最大值
HQL实例
创建表
CREATE TABLE page_view(viewTime INT, userid BIGINT,
page_url STRING, referrer_url STRING,
ip STRING COMMENT 'IP Address of the User')
COMMENT 'This is the page view table'
PARTITIONED BY(dt STRING, country STRING)
STORED AS SEQUENCEFILE;
加入字段分隔符
CREATE TABLE page_view(viewTime INT, userid BIGINT,
page_url STRING, referrer_url STRING,
ip STRING COMMENT 'IP Address of the User')
COMMENT 'This is the page view table'
PARTITIONED BY(dt STRING, country STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '1'
STORED AS SEQUENCEFILE;
加入分组,排序和buckets
CREATE TABLE page_view(viewTime INT, userid BIGINT,
page_url STRING, referrer_url STRING,
friends ARRAY
ip STRING COMMENT 'IP Address of the User')
COMMENT 'This is the page view table'
PARTITIONED BY(dt STRING, country STRING)
CLUSTERED BY(userid) SORTED BY(viewTime) INTO 32 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '1'
COLLECTION ITEMS TERMINATED BY '2'
MAP KEYS TERMINATED BY '3'
STORED AS SEQUENCEFILE;
显示表和分区
Show tables;
Show partitions page_view; 显示表分区
Describe tablesname
Describe extended page_view
DESCRIBE EXTENDED page_view PARTITION (ds='2008-08-08');
加载数据
CREATE EXTERNAL TABLE page_view_stg(viewTime INT, userid BIGINT,
page_url STRING, referrer_url STRING,
ip STRING COMMENT 'IP Address of the User',
country STRING COMMENT 'country of origination')
COMMENT 'This is the staging page view table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '44' LINES TERMINATED BY '12'
STORED AS TEXTFILE
LOCATION '/user/data/staging/page_view';
hadoop dfs -put /tmp/pv_2008-06-08.txt /user/data/staging/page_view
FROM page_view_stg pvs
INSERT OVERWRITE TABLE page_view PARTITION(dt='2008-06-08', country='US')
SELECT pvs.viewTime, pvs.userid, pvs.page_url, pvs.referrer_url, null, null, pvs.ip
WHERE pvs.country = 'US';
外部数据文件
LOAD DATA LOCAL INPATH `/tmp/pv_2008-06-08_us.txt` INTO TABLE page_view PARTITION(date='2008-06-08', country='US')
Hdfs上的数据文件
LOAD DATA INPATH '/user/data/pv_2008-06-08_us.txt' INTO TABLE page_view PARTITION(date='2008-06-08', country='US')
简单查询
INSERT OVERWRITE TABLE user_active
SELECT user.*
FROM user
WHERE user.active = 1;
基于划分的查询
INSERT OVERWRITE TABLE xyz_com_page_views
SELECT page_views.*
FROM page_views
WHERE page_views.date >= '2008-03-01' AND page_views.date <= '2008-03-31' AND
page_views.referrer_url like '%xyz.com';
连接
等值连接
INSERT OVERWRITE TABLE pv_users
SELECT pv.*, u.gender, u.age
FROM user u JOIN page_view pv ON (pv.userid = u.id)
WHERE pv.date = '2008-03-03';
全外连接
INSERT OVERWRITE TABLE pv_users
SELECT pv.*, u.gender, u.age
FROM user u FULL OUTER JOIN page_view pv ON (pv.userid = u.id)
WHERE pv.date = '2008-03-03';
左连接
INSERT OVERWRITE TABLE pv_users
SELECT u.*
FROM user u LEFT SEMI JOIN page_view pv ON (pv.userid = u.id)
WHERE pv.date = '2008-03-03';
连接多个表
INSERT OVERWRITE TABLE pv_friends
SELECT pv.*, u.gender, u.age, f.friends
FROM page_view pv JOIN user u ON (pv.userid = u.id) JOIN friend_list f ON (u.id = f.uid)
WHERE pv.date = '2008-03-03';
聚合函数
计数
INSERT OVERWRITE TABLE pv_gender_sum
SELECT pv_users.gender, count (DISTINCT pv_users.userid)
FROM pv_users
GROUP BY pv_users.gender;
求和
INSERT OVERWRITE TABLE pv_gender_agg
SELECT pv_users.gender, count(DISTINCT pv_users.userid), count(1), sum(DISTINCT pv_users.userid)
FROM pv_users
GROUP BY pv_users.gender;
多表或者文件插入
FROM pv_users
INSERT OVERWRITE TABLE pv_gender_sum
SELECT pv_users.gender, count_distinct(pv_users.userid)
GROUP BY pv_users.gender
INSERT OVERWRITE DIRECTORY '/user/data/tmp/pv_age_sum'
SELECT pv_users.age, count_distinct(pv_users.userid)
GROUP BY pv_users.age;
动态划分插入
hive> set hive.exec.dynamic.partition.mode=nonstrict;
hive> FROM page_view_stg pvs
INSERT OVERWRITE TABLE page_view PARTITION(dt, country)
SELECT * FROM (
SELECT pvs.viewTime, pvs.userid, pvs.page_url, pvs.referrer_url, null, null, pvs.ip,
from_unixtimestamp(pvs.viewTime, 'yyyy-MM-dd') ds, pvs.country
DISTRIBUTE BY ds, country
) T;
插入到本地文件
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/pv_gender_sum'
SELECT pv_gender_sum.*
FROM pv_gender_sum;
抽样
INSERT OVERWRITE TABLE pv_gender_sum_sample
SELECT pv_gender_sum.*
FROM pv_gender_sum TABLESAMPLE(BUCKET 3 OUT OF 32);
并集运算
INSERT OVERWRITE TABLE actions_users
SELECT u.id, actions.date
FROM (
SELECT av.uid AS uid
FROM action_video av
WHERE av.date = '2008-06-03'
UNION ALL
SELECT ac.uid AS uid
FROM action_comment ac
WHERE ac.date = '2008-06-03'
) actions JOIN users u ON(u.id = actions.uid);
数组运算
SELECT pv.friends[2]
FROM page_views pv;
SELECT pv.userid, size(pv.friends)
FROM page_view pv;
映射操作
INSERT OVERWRITE page_views_map
SELECT pv.userid, pv.properties['page type']
FROM page_views pv;
SELECT size(pv.properties)
FROM page_view pv;
自定义MapReduce脚本语言
FROM (
FROM pv_users
MAP pv_users.userid, pv_users.date
USING 'map_script'
AS dt, uid
CLUSTER BY dt) map_output
INSERT OVERWRITE TABLE pv_users_reduced
REDUCE map_output.dt, map_output.uid
USING 'reduce_script'
AS date, count;
Python代码
import sys
import datetime
for line in sys.stdin:
line = line.strip()
userid, unixtime = line.split('\t')
weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
print ','.join([userid, str(weekday)])
SELECT TRANSFORM(pv_users.userid, pv_users.date) USING 'map_script' AS dt, uid CLUSTER BY dt FROM pv_users;
分组处理
FROM (
FROM (
FROM action_video av
SELECT av.uid AS uid, av.id AS id, av.date AS date
UNION ALL
FROM action_comment ac
SELECT ac.uid AS uid, ac.id AS id, ac.date AS date
) union_actions
SELECT union_actions.uid, union_actions.id, union_actions.date
CLUSTER BY union_actions.uid) map
INSERT OVERWRITE TABLE actions_reduced
SELECT TRANSFORM(map.uid, map.id, map.date) USING 'reduce_script' AS (uid, id, reduced_val);
修改表
ALTER TABLE old_table_name RENAME TO new_table_name;
ALTER TABLE old_table_name REPLACE COLUMNS (col1 TYPE, ...);
ALTER TABLE tab1 ADD COLUMNS (c1 INT COMMENT 'a new int column', c2 STRING DEFAULT 'def val');
删除表
DROP TABLE pv_users;
ALTER TABLE pv_users DROP PARTITION (ds='2008-08-08')