In this post, I will share the basics of Hive functions.
1. Built-in operators (https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF)
2. Built-in functions (https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF)
# List and inspect functions
SHOW FUNCTIONS;
DESCRIBE FUNCTION <function_name>;
DESCRIBE FUNCTION EXTENDED <function_name>;
3. A quick way to test the various built-in functions:
a. Create a dual table:
create table dual(id string);
b. Load a file (a single line containing one space) into the dual table:
load data local inpath '/home/hadoop/dual.dat' into table dual;
c. select substr('angelababy',2,3) from dual; (returns nge)
4. Hive user-defined functions and TRANSFORM
When Hive's built-in functions cannot meet your business needs, you can consider user-defined functions (UDF: user-defined function).
Categories of user-defined functions:
UDF: operates on a single row and produces a single row as output (e.g. math and string functions).
UDAF (user-defined aggregate function): accepts multiple input rows and produces a single output row (e.g. count, max). A quick illustration of the difference follows.
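To make the distinction concrete, here is a minimal HiveQL sketch; the table t(id int, name string) is hypothetical and used only for illustration:

-- UDF: applied to each row independently; one output row per input row
select lower(name) from t;
-- UDAF: many input rows are aggregated into a single output row
select count(1), max(id) from t;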
5. UDF development example (a simple UDF)
a. First, write a Java class that extends UDF and overloads the evaluate method:
package com.empire.bigdata.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public final class Lower extends UDF {
    public Text evaluate(final Text s) {
        if (s == null) {
            return null;
        }
        return new Text(s.toString().toLowerCase());
    }
}
b. Package it as a jar and upload it to the server.
c. Add the jar to Hive's classpath:
hive> add JAR /home/hadoop/udf.jar;
# list the registered jars
hive> list jar;
d. Create a temporary function bound to the Java class you just developed:
hive> create temporary function tolowercase as 'com.empire.bigdata.udf.Lower';
e. The custom function tolowercase can now be used in HQL, as shown below.
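For example, an illustrative query reusing the dual table from section 3:

select tolowercase('HELLO Hive') from dual;
-- expected result: hello hive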
6. Developing a JSON-parsing UDF
Sample data:
{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"} {"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"} {"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"} {"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"} {"movie":"2355","rate":"5","timeStamp":"978824291","uid":"1"} {"movie":"1197","rate":"3","timeStamp":"978302268","uid":"1"} {"movie":"1287","rate":"5","timeStamp":"978302039","uid":"1"} {"movie":"2804","rate":"5","timeStamp":"978300719","uid":"1"} {"movie":"594","rate":"4","timeStamp":"978302268","uid":"1"} {"movie":"919","rate":"4","timeStamp":"978301368","uid":"1"} {"movie":"595","rate":"5","timeStamp":"978824268","uid":"1"} {"movie":"938","rate":"4","timeStamp":"978301752","uid":"1"} {"movie":"2398","rate":"4","timeStamp":"978302281","uid":"1"} {"movie":"2918","rate":"4","timeStamp":"978302124","uid":"1"} {"movie":"1035","rate":"5","timeStamp":"978301753","uid":"1"} {"movie":"2791","rate":"4","timeStamp":"978302188","uid":"1"} {"movie":"2687","rate":"3","timeStamp":"978824268","uid":"1"} {"movie":"2018","rate":"4","timeStamp":"978301777","uid":"1"} {"movie":"3105","rate":"5","timeStamp":"978301713","uid":"1"} {"movie":"2797","rate":"4","timeStamp":"978302039","uid":"1"} {"movie":"2321","rate":"3","timeStamp":"978302205","uid":"1"} {"movie":"720","rate":"3","timeStamp":"978300760","uid":"1"} {"movie":"1270","rate":"5","timeStamp":"978300055","uid":"1"} {"movie":"527","rate":"5","timeStamp":"978824195","uid":"1"}
Program code:
package com.empire.hive.empire_hive;

/**
 * Bean that each JSON record is deserialized into.
 */
//{"movie":"1721","rate":"3","timeStamp":"965440048","uid":"5114"}
public class MovieRateBean {
    private String movie;
    private String rate;
    private String timeStamp;
    private String uid;

    public String getMovie() {
        return movie;
    }

    public void setMovie(String movie) {
        this.movie = movie;
    }

    public String getRate() {
        return rate;
    }

    public void setRate(String rate) {
        this.rate = rate;
    }

    public String getTimeStamp() {
        return timeStamp;
    }

    public void setTimeStamp(String timeStamp) {
        this.timeStamp = timeStamp;
    }

    public String getUid() {
        return uid;
    }

    public void setUid(String uid) {
        this.uid = uid;
    }

    @Override
    public String toString() {
        return movie + "\t" + rate + "\t" + timeStamp + "\t" + uid;
    }
}
package com.empire.hive.empire_hive;

import org.apache.hadoop.hive.ql.exec.UDF;

import parquet.org.codehaus.jackson.map.ObjectMapper;

//Custom JSON-parsing UDF
public class JsonParser extends UDF {

    public String evaluate(String jsonLine) {
        ObjectMapper objectMapper = new ObjectMapper();
        try {
            MovieRateBean bean = objectMapper.readValue(jsonLine, MovieRateBean.class);
            return bean.toString();
        } catch (Exception e) {
            // Malformed line: fall through and return an empty string.
        }
        return "";
    }
}
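Registration and a sample query for this UDF, as also shown in the run log below (the jar path matches step 5; rat_json is the raw table created in section 7):

hive> add JAR /home/hadoop/udf.jar;
hive> create temporary function parsejson as 'com.empire.hive.empire_hive.JsonParser';
hive> select split(parsejson(line),'\t')[0] as movieid,
             split(parsejson(line),'\t')[1] as rate
      from rat_json limit 10;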
7. TRANSFORM (calling scripts)
Hive's TRANSFORM keyword lets you call your own scripts from within SQL.
It suits cases where you need functionality Hive lacks but do not want to write a UDF for it.
TRANSFORM example:

1. Load the rating.json file into a raw Hive table rat_json:

create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;

2. Parse the JSON data into four fields and insert them into a new table t_rating:

create table t_rating(movieid string, rate int, timestring string, uid string)
row format delimited fields terminated by '\t';

insert overwrite table t_rating
select get_json_object(line,'$.movie') as movieid,
       get_json_object(line,'$.rate') as rate,
       get_json_object(line,'$.timeStamp') as timestring,
       get_json_object(line,'$.uid') as uid
from rat_json;

3. Use TRANSFORM + Python to convert the unixtime to a weekday.
First write a Python script:

vi weekday_mapper.py

#!/bin/python
import sys
import datetime

for line in sys.stdin:
    line = line.strip()
    movieid, rating, unixtime, userid = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print '\t'.join([movieid, rating, str(weekday), userid])

Save the file. Then add it to Hive's classpath and run the TRANSFORM query:

hive> add FILE /home/hadoop/weekday_mapper.py;
hive> create TABLE u_data_new as
      SELECT TRANSFORM (movieid, rate, timestring, uid)
      USING 'python weekday_mapper.py'
      AS (movieid, rate, weekday, uid)
      FROM t_rating;

select distinct(weekday) from u_data_new limit 10;
Run output:
0: jdbc:hive2://centos-aaron-h1:10000> show functions;
OK
+-------------------------+--+
|        tab_name         |
+-------------------------+--+
| !                       |
| !=                      |
| %                       |
| &                       |
| *                       |
| +                       |
| -                       |
| /                       |
| <                       |
| <=                      |
| <=>                     |
| <>                      |
| =                       |
| ==                      |
| >                       |
| >=                      |
| ^                       |
| abs                     |
| acos                    |
| add_months              |
| and                     |
| array                   |
| array_contains          |
| ascii                   |
| asin                    |
| assert_true             |
| atan                    |
| avg                     |
| base64                  |
| between                 |
| bin                     |
| case                    |
| cbrt                    |
| ceil                    |
| ceiling                 |
| coalesce                |
| collect_list            |
| collect_set             |
| compute_stats           |
| concat                  |
| concat_ws               |
| context_ngrams          |
| conv                    |
| corr                    |
| cos                     |
| count                   |
| covar_pop               |
| covar_samp              |
| create_union            |
| cume_dist               |
| current_database        |
| current_date            |
| current_timestamp       |
| current_user            |
| date_add                |
| date_format             |
| date_sub                |
| datediff                |
| day                     |
| dayofmonth              |
| decode                  |
| degrees                 |
| dense_rank              |
| div                     |
| e                       |
| elt                     |
| encode                  |
| ewah_bitmap             |
| ewah_bitmap_and         |
| ewah_bitmap_empty       |
| ewah_bitmap_or          |
| exp                     |
| explode                 |
| factorial               |
| field                   |
| find_in_set             |
| first_value             |
| floor                   |
| format_number           |
| from_unixtime           |
| from_utc_timestamp      |
| get_json_object         |
| greatest                |
| hash                    |
| hex                     |
| histogram_numeric       |
| hour                    |
| if                      |
| in                      |
| in_file                 |
| index                   |
| initcap                 |
| inline                  |
| instr                   |
| isnotnull               |
| isnull                  |
| java_method             |
| json_tuple              |
| lag                     |
| last_day                |
+-------------------------+--+
|        tab_name         |
+-------------------------+--+
| last_value              |
| lcase                   |
| lead                    |
| least                   |
| length                  |
| levenshtein             |
| like                    |
| ln                      |
| locate                  |
| log                     |
| log10                   |
| log2                    |
| lower                   |
| lpad                    |
| ltrim                   |
| map                     |
| map_keys                |
| map_values              |
| matchpath               |
| max                     |
| min                     |
| minute                  |
| month                   |
| months_between          |
| named_struct            |
| negative                |
| next_day                |
| ngrams                  |
| noop                    |
| noopstreaming           |
| noopwithmap             |
| noopwithmapstreaming    |
| not                     |
| ntile                   |
| nvl                     |
| or                      |
| parse_url               |
| parse_url_tuple         |
| percent_rank            |
| percentile              |
| percentile_approx       |
| pi                      |
| pmod                    |
| posexplode              |
| positive                |
| pow                     |
| power                   |
| printf                  |
| radians                 |
| rand                    |
| rank                    |
| reflect                 |
| reflect2                |
| regexp                  |
| regexp_extract          |
| regexp_replace          |
| repeat                  |
| reverse                 |
| rlike                   |
| round                   |
| row_number              |
| rpad                    |
| rtrim                   |
| second                  |
| sentences               |
| shiftleft               |
| shiftright              |
| shiftrightunsigned      |
| sign                    |
| sin                     |
| size                    |
| sort_array              |
| soundex                 |
| space                   |
| split                   |
| sqrt                    |
| stack                   |
| std                     |
| stddev                  |
| stddev_pop              |
| stddev_samp             |
| str_to_map              |
| struct                  |
| substr                  |
| substring               |
| sum                     |
| tan                     |
| to_date                 |
| to_unix_timestamp       |
| to_utc_timestamp        |
| translate               |
| trim                    |
| trunc                   |
| ucase                   |
| unbase64                |
| unhex                   |
| unix_timestamp          |
| upper                   |
| var_pop                 |
| var_samp                |
+-------------------------+--+
|        tab_name         |
+-------------------------+--+
| variance                |
| weekofyear              |
| when                    |
| windowingtablefunction  |
| xpath                   |
| xpath_boolean           |
| xpath_double            |
| xpath_float             |
| xpath_int               |
| xpath_long              |
| xpath_number            |
| xpath_short             |
| xpath_string            |
| year                    |
| |                       |
| ~                       |
+-------------------------+--+
216 rows selected (0.677 seconds)
0: jdbc:hive2://centos-aaron-h1:10000> list jar
0: jdbc:hive2://centos-aaron-h1:10000> ;
+-----------+--+
| resource  |
+-----------+--+
+-----------+--+
No rows selected (0.006 seconds)
0: jdbc:hive2://centos-aaron-h1:10000> DESCRIBE FUNCTION case;
+------------------------------------------------------------------------------------------------------------------+--+
| tab_name |
+------------------------------------------------------------------------------------------------------------------+--+
| CASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END - When a = b, returns c; when a = d, return e; else return f |
+------------------------------------------------------------------------------------------------------------------+--+
1 row selected (0.039 seconds)
0: jdbc:hive2://centos-aaron-h1:10000> DESCRIBE FUNCTION EXTENDED case
0: jdbc:hive2://centos-aaron-h1:10000> ;
+------------------------------------------------------------------------------------------------------------------+--+
| tab_name |
+------------------------------------------------------------------------------------------------------------------+--+
| CASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END - When a = b, returns c; when a = d, return e; else return f |
| Example: |
|  SELECT |
|  CASE deptno |
|    WHEN 1 THEN Engineering |
|    WHEN 2 THEN Finance |
|    ELSE admin |
|  END, |
|  CASE zone |
|    WHEN 7 THEN Americas |
|    ELSE Asia-Pac |
|  END |
| FROM emp_details |
+------------------------------------------------------------------------------------------------------------------+--+
13 rows selected (0.016 seconds)
0: jdbc:hive2://centos-aaron-h1:10000> create table rat_json(line string) row format delimited;
No rows affected (0.628 seconds)
0: jdbc:hive2://centos-aaron-h1:10000> load data local inpath '/home/hadoop/rating.json' into table rat_json;
INFO : Loading data to table default.rat_json from file:/home/hadoop/rating.json
INFO : Table default.rat_json stats: [numFiles=1, totalSize=65602698]
No rows affected (2.373 seconds)
0: jdbc:hive2://centos-aaron-h1:10000> add JAR /home/hadoop/udf.jar;
INFO : Added [/home/hadoop/udf.jar] to class path
INFO : Added resources: [/home/hadoop/udf.jar]
No rows affected (0.007 seconds)
0: jdbc:hive2://centos-aaron-h1:10000> list jar;
+-----------------------+--+
|       resource        |
+-----------------------+--+
| /home/hadoop/udf.jar  |
+-----------------------+--+
1 row selected (0.005 seconds)
0: jdbc:hive2://centos-aaron-h1:10000> create temporary function parsejson as 'com.empire.hive.empire_hive.JsonParser';
No rows affected (0.016 seconds)
0: jdbc:hive2://centos-aaron-h1:10000> select split(parsejson(line),'\t')[0]as movieid,split(parsejson(line),'\t')[1] as rate,split(parsejson(line),'\t')[2] as timestring,split(parsejson(line),'\t')[3] as uid from rat_json limit 10;
+----------+-------+-------------+------+--+
| movieid  | rate  | timestring  | uid  |
+----------+-------+-------------+------+--+
| 1193     | 5     | 978300760   | 1    |
| 661      | 3     | 978302109   | 1    |
| 914      | 3     | 978301968   | 1    |
| 3408     | 4     | 978300275   | 1    |
| 2355     | 5     | 978824291   | 1    |
| 1197     | 3     | 978302268   | 1    |
| 1287     | 5     | 978302039   | 1    |
| 2804     | 5     | 978300719   | 1    |
| 594      | 4     | 978302268   | 1    |
| 919      | 4     | 978301368   | 1    |
+----------+-------+-------------+------+--+
10 rows selected (0.766 seconds)
0: jdbc:hive2://centos-aaron-h1:10000> create table t_rating(movieid string,rate int,timestring string,uid string)row format delimited fields terminated by '\t';
No rows affected (0.075 seconds)
0: jdbc:hive2://centos-aaron-h1:10000> insert overwrite table t_rating
0: jdbc:hive2://centos-aaron-h1:10000> select split(parsejson(line),'\t')[0]as movieid,split(parsejson(line),'\t')[1] as rate,split(parsejson(line),'\t')[2] as timestring,split(parsejson(line),'\t')[3] as uid from rat_json limit 10;
INFO : Number of reduce tasks determined at compile time: 1
INFO : In order to change the average load for a reducer (in bytes):
INFO : set hive.exec.reducers.bytes.per.reducer=<number>
INFO : In order to limit the maximum number of reducers:
INFO : set hive.exec.reducers.max=<number>
INFO : In order to set a constant number of reducers:
INFO : set mapreduce.job.reduces=<number>
INFO : number of splits:1
INFO : Submitting tokens for job: job_1548540680535_0001
INFO : The url to track the job: http://centos-aaron-h1:8088/proxy/application_1548540680535_0001/
INFO : Starting Job = job_1548540680535_0001, Tracking URL = http://centos-aaron-h1:8088/proxy/application_1548540680535_0001/
INFO : Kill Command = /home/hadoop/apps/hadoop-2.9.1/bin/hadoop job -kill job_1548540680535_0001
INFO : Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
INFO : 2019-01-27 08:18:39,805 Stage-1 map = 0%, reduce = 0%
INFO : 2019-01-27 08:18:51,508 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 3.29 sec
INFO : 2019-01-27 08:19:11,125 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 4.89 sec
INFO : MapReduce Total cumulative CPU time: 4 seconds 890 msec
INFO : Ended Job = job_1548540680535_0001
INFO : Loading data to table default.t_rating from hdfs://centos-aaron-h1:9000/user/hive/warehouse/t_rating/.hive-staging_hive_2019-01-27_08-18-11_540_2930931099472349170-1/-ext-10000
INFO : Table default.t_rating stats: [numFiles=1, numRows=10, totalSize=186, rawDataSize=176]
No rows affected (61.028 seconds)
0: jdbc:hive2://centos-aaron-h1:10000> select * from t_rating;
+-------------------+----------------+----------------------+---------------+--+
| t_rating.movieid  | t_rating.rate  | t_rating.timestring  | t_rating.uid  |
+-------------------+----------------+----------------------+---------------+--+
| 919               | 4              | 978301368            | 1             |
| 594               | 4              | 978302268            | 1             |
| 2804              | 5              | 978300719            | 1             |
| 1287              | 5              | 978302039            | 1             |
| 1197              | 3              | 978302268            | 1             |
| 2355              | 5              | 978824291            | 1             |
| 3408              | 4              | 978300275            | 1             |
| 914               | 3              | 978301968            | 1             |
| 661               | 3              | 978302109            | 1             |
| 1193              | 5              | 978300760            | 1             |
+-------------------+----------------+----------------------+---------------+--+
10 rows selected (0.079 seconds)
0: jdbc:hive2://centos-aaron-h1:10000> select get_json_object(line,'$.movie') as moive,get_json_object(line,'$.rate') as rate from rat_json limit 10;
+--------+-------+--+
| moive  | rate  |
+--------+-------+--+
| 1193   | 5     |
| 661    | 3     |
| 914    | 3     |
| 3408   | 4     |
| 2355   | 5     |
| 1197   | 3     |
| 1287   | 5     |
| 2804   | 5     |
| 594    | 4     |
| 919    | 4     |
+--------+-------+--+
10 rows selected (0.23 seconds)
0: jdbc:hive2://centos-aaron-h1:10000> add FILE /home/hadoop/weekday_mapper.py;
INFO : Added resources: [/home/hadoop/weekday_mapper.py]
No rows affected (0.046 seconds)
0: jdbc:hive2://centos-aaron-h1:10000> create TABLE u_data_new as
0: jdbc:hive2://centos-aaron-h1:10000> SELECT
0: jdbc:hive2://centos-aaron-h1:10000> TRANSFORM (movieid, rate, timestring,uid)
0: jdbc:hive2://centos-aaron-h1:10000> USING 'python weekday_mapper.py'
0: jdbc:hive2://centos-aaron-h1:10000> AS (movieid, rate, weekday,uid)
0: jdbc:hive2://centos-aaron-h1:10000> FROM t_rating;
INFO : Number of reduce tasks is set to 0 since there's no reduce operator
INFO : number of splits:1
INFO : Submitting tokens for job: job_1548540680535_0002
INFO : The url to track the job: http://centos-aaron-h1:8088/proxy/application_1548540680535_0002/
INFO : Starting Job = job_1548540680535_0002, Tracking URL = http://centos-aaron-h1:8088/proxy/application_1548540680535_0002/
INFO : Kill Command = /home/hadoop/apps/hadoop-2.9.1/bin/hadoop job -kill job_1548540680535_0002
INFO : Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
INFO : 2019-01-27 08:26:56,867 Stage-1 map = 0%, reduce = 0%
INFO : 2019-01-27 08:27:06,186 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 0.86 sec
INFO : MapReduce Total cumulative CPU time: 860 msec
INFO : Ended Job = job_1548540680535_0002
INFO : Stage-4 is selected by condition resolver.
INFO : Stage-3 is filtered out by condition resolver.
INFO : Stage-5 is filtered out by condition resolver.
INFO : Moving data to: hdfs://centos-aaron-h1:9000/user/hive/warehouse/.hive-staging_hive_2019-01-27_08-26-42_414_5787842099636710618-4/-ext-10001 from hdfs://centos-aaron-h1:9000/user/hive/warehouse/.hive-staging_hive_2019-01-27_08-26-42_414_5787842099636710618-4/-ext-10003
INFO : Moving data to: hdfs://centos-aaron-h1:9000/user/hive/warehouse/u_data_new from hdfs://centos-aaron-h1:9000/user/hive/warehouse/.hive-staging_hive_2019-01-27_08-26-42_414_5787842099636710618-4/-ext-10001
INFO : Table default.u_data_new stats: [numFiles=1, numRows=10, totalSize=106, rawDataSize=96]
No rows affected (25.058 seconds)
0: jdbc:hive2://centos-aaron-h1:10000> select distinct(weekday) from u_data_new limit 10;
INFO : Number of reduce tasks not specified. Estimated from input data size: 1
INFO : In order to change the average load for a reducer (in bytes):
INFO : set hive.exec.reducers.bytes.per.reducer=<number>
INFO : In order to limit the maximum number of reducers:
INFO : set hive.exec.reducers.max=<number>
INFO : In order to set a constant number of reducers:
INFO : set mapreduce.job.reduces=<number>
INFO : number of splits:1
INFO : Submitting tokens for job: job_1548540680535_0003
INFO : The url to track the job: http://centos-aaron-h1:8088/proxy/application_1548540680535_0003/
INFO : Starting Job = job_1548540680535_0003, Tracking URL = http://centos-aaron-h1:8088/proxy/application_1548540680535_0003/
INFO : Kill Command = /home/hadoop/apps/hadoop-2.9.1/bin/hadoop job -kill job_1548540680535_0003
INFO : Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
INFO : 2019-01-27 08:29:36,422 Stage-1 map = 0%, reduce = 0%
INFO : 2019-01-27 08:29:43,605 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 0.69 sec
INFO : 2019-01-27 08:29:51,811 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 1.81 sec
INFO : MapReduce Total cumulative CPU time: 1 seconds 810 msec
INFO : Ended Job = job_1548540680535_0003
+----------+--+
| weekday  |
+----------+--+
| 1        |
| 7        |
+----------+--+
2 rows selected (27.742 seconds)
0: jdbc:hive2://centos-aaron-h1:10000>
Summary: a temporary function is valid only for the current session; to keep using it after your next login, create a permanent function, as sketched below.
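A minimal sketch of creating a permanent function (supported since Hive 0.13); the HDFS jar location is an assumption you must adapt to your cluster:

create function tolowercase as 'com.empire.bigdata.udf.Lower'
using jar 'hdfs://centos-aaron-h1:9000/user/hive/jars/udf.jar';  -- hypothetical path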
Final words: that is all for this post. If you found it useful, please give it a like; if you are interested in my other big-data articles or in me, please follow my blog, and feel free to reach out and chat anytime.