CREATE DATABASE [IF NOT EXISTS] database_name [COMMENT database_comment] [LOCATION hdfs_path]
DROP DATABASE [IF EXISTS] database_name [RESTRICT|CASCADE]; -- RESTRICT: the default; only an empty database can be dropped -- CASCADE: drops the database together with the tables in it
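A minimal usage sketch (the name db_demo and the location are assumptions for illustration):

create database if not exists db_demo comment 'demo database' location '/user/hive/warehouse/db_demo.db';
use db_demo;
drop database if exists db_demo cascade;  -- cascade also drops any tables inside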
Create a table:

CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
  [(col_name data_type [COMMENT col_comment], ...)]
  [COMMENT table_comment]
  [PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
  [CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
  [
   [ROW FORMAT row_format]
   [STORED AS file_format]
  ]
  [LOCATION hdfs_path]
  [AS select_statement];

CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
  LIKE existing_table_or_view_name
  [LOCATION hdfs_path];

primitive_data_type
  : TINYINT | SMALLINT | INT | BIGINT | BOOLEAN | FLOAT | DOUBLE | DOUBLE PRECISION
  | STRING | BINARY | TIMESTAMP | DECIMAL | DECIMAL(precision, scale) | DATE | VARCHAR | CHAR

row_format
  : DELIMITED [FIELDS TERMINATED BY char [ESCAPED BY char]]
        [COLLECTION ITEMS TERMINATED BY char]
        [MAP KEYS TERMINATED BY char]
        [LINES TERMINATED BY char]
        [NULL DEFINED AS char]
  | SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]

file_format
  : SEQUENCEFILE
  | TEXTFILE    -- (Default, depending on hive.default.fileformat configuration)
  | RCFILE
  | ORC
  | PARQUET
  | AVRO
  | JSONFILE
  | INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname
Specifying EXTERNAL at creation time makes the table an external table; otherwise it is a managed (internal) table. Alternatively, desc formatted table_name; prints the table details, where the Table Type: row shows the storage directory information and whether the table is a managed table (MANAGED_TABLE) or an external table (EXTERNAL_TABLE).
Managed table data is managed by Hive itself; external table data is managed by HDFS.
Managed table data is stored under hive.metastore.warehouse.dir (default: /user/hive/warehouse); the storage location of external table data is specified by yourself.
Dropping a managed table deletes both the metadata and the stored data; dropping an external table deletes only the metadata, and the files on HDFS are not removed.
Modifications to a managed table are synced to the metadata directly, while after modifying the structure or partitions of an external table a repair is needed (MSCK REPAIR TABLE table_name;).
External tables suit shared data on HDFS, e.g. when multiple departments use the same data.
-- Create a managed (internal) table
create table tb_managed(
  id int,
  name string,
  hobby array<string>,
  add map<String,string>
)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':'
lines terminated by '\n'
;

-- The data file and its content: complex_data_type.txt
1,xiaoming,book-TV-code,beijing:chaoyang-shagnhai:pudong
2,lilei,book-code,nanjing:jiangning-taiwan:taibei
3,lihua,music-book,heilongjiang:haerbin

-- Load the data
load data local inpath '/usr/local/hive-2.1.1/data_dir/complex_data_type.txt' overwrite into table tb_managed;

-- Query the table schema information
0: jdbc:hive2://node225:10000/db01> desc formatted tb_managed;
OK
| col_name | data_type | comment |
| # col_name | data_type | comment |
| | NULL | NULL |
| id | int | |
| name | string | |
| hobby | array<string> | |
| add | map<string,string> | |
| | NULL | NULL |
| # Detailed Table Information | NULL | NULL |
| Database: | db01 | NULL |
| Owner: | root | NULL |
| CreateTime: | Wed Oct 10 15:21:31 CST 2018 | NULL |
| LastAccessTime: | UNKNOWN | NULL |
| Retention: | 0 | NULL |
| Location: | hdfs://ns1/user/hive/warehouse/db01.db/tb_managed | NULL |
| Table Type: | MANAGED_TABLE | NULL |
| Table Parameters: | NULL | NULL |
| | numFiles | 1 |
| | numRows | 0 |
| | rawDataSize | 0 |
| | totalSize | 147 |
| | transient_lastDdlTime | 1539156255 |
| | NULL | NULL |
| # Storage Information | NULL | NULL |
| SerDe Library: | org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe | NULL |
| InputFormat: | org.apache.hadoop.mapred.TextInputFormat | NULL |
| OutputFormat: | org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat | NULL |
| Compressed: | No | NULL |
| Num Buckets: | -1 | NULL |
| Bucket Columns: | [] | NULL |
| Sort Columns: | [] | NULL |
| Storage Desc Params: | NULL | NULL |
| | colelction.delim | - |
| | field.delim | , |
| | mapkey.delim | : |
| | serialization.format | , |
35 rows selected (0.346 seconds)

-- Create an external table
create external table tb_external(
  id int,
  name string,
  hobby array<string>,
  add map<String,string>
)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':'
lines terminated by '\n'
location '/tmp/hive/tb_external'
;

-- Load the data
load data local inpath '/usr/local/hive-2.1.1/data_dir/complex_data_type.txt' overwrite into table tb_external;

0: jdbc:hive2://node225:10000/db01> desc formatted tb_external;
OK
| col_name | data_type | comment |
| # col_name | data_type | comment |
| | NULL | NULL |
| id | int | |
| name | string | |
| hobby | array<string> | |
| add | map<string,string> | |
| | NULL | NULL |
| # Detailed Table Information | NULL | NULL |
| Database: | db01 | NULL |
| Owner: | root | NULL |
| CreateTime: | Wed Oct 10 15:39:29 CST 2018 | NULL |
| LastAccessTime: | UNKNOWN | NULL |
| Retention: | 0 | NULL |
| Location: | hdfs://ns1/tmp/hive | NULL |
| Table Type: | MANAGED_TABLE | NULL |
| Table Parameters: | NULL | NULL |
| | numFiles | 1 |
| | totalSize | 147 |
| | transient_lastDdlTime | 1539157215 |
| | NULL | NULL |
| # Storage Information | NULL | NULL |
| SerDe Library: | org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe | NULL |
| InputFormat: | org.apache.hadoop.mapred.TextInputFormat | NULL |
| OutputFormat: | org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat | NULL |
| Compressed: | No | NULL |
| Num Buckets: | -1 | NULL |
| Bucket Columns: | [] | NULL |
| Sort Columns: | [] | NULL |
| Storage Desc Params: | NULL | NULL |
| | colelction.delim | - |
| | field.delim | , |
| | line.delim | \n |
| | mapkey.delim | : |
| | serialization.format | , |
34 rows selected (0.22 seconds)
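A minimal sketch of the drop behaviour described above (paths as shown in the create statements and desc formatted output):

drop table tb_managed;   -- removes the metadata and the data under hdfs://ns1/user/hive/warehouse/db01.db/tb_managed
drop table tb_external;  -- removes only the metadata; the files under /tmp/hive/tb_external remain on HDFS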
Partitioning means dividing the data into multiple partitions by the value(s) of one or more columns. Physically, a partition is a subdirectory of the table's directory; by specifying the partition column value in a query, data is read directly from the specified partition and a full table scan is avoided.
When creating a partitioned table, the keyword partitioned by (partition_col data_type) declares that the table is partitioned by partition_col; all records with the same partition_col value are stored in one partition. You can partition by multiple columns, i.e. the data of a partition is further partitioned by additional columns.
When loading data into a partitioned table, you must explicitly declare which partition the data goes into with the keyword partition(partition_col=value).
A partition packages the records that satisfy certain conditions and marks them, to improve query efficiency. The partition column is shown to the client in query results, but it is not actually stored in the table's data files; it is a so-called pseudo column.
A partitioned table's partitions are reflected in the storage paths on the file system: a subdirectory named partition_col=value is created under the table directory, and the data loaded into that partition is placed under that directory.
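Illustratively, after the example below loads data into the partition part_tag='first', the file lands under a path such as:

/user/hive/warehouse/db01.db/tb_partitions/part_tag=first/complex_data_type.txt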
show partitions table_name;
create table tb_partitions(
  id int,
  name string,
  hobby array<string>,
  add map<String,string>
)
partitioned by (part_tag string)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':'
lines terminated by '\n'
;

load data local inpath '/usr/local/hive-2.1.1/data_dir/complex_data_type.txt' into table tb_partitions partition(part_tag = 'first');
load data local inpath '/usr/local/hive-2.1.1/data_dir/complex_data_type.txt' into table tb_partitions partition(part_tag = 'second');

select * from tb_partitions where part_tag='first';

insert overwrite table table_name partition(partition_col=value) select ...

0: jdbc:hive2://node225:10000/db01> show partitions tb_partitions;
OK
+-----------------+--+
| partition       |
+-----------------+--+
| part_tag=first  |
+-----------------+--+
1 row selected (0.278 seconds)
Hive is a batch-processing system. To improve the efficiency of inserting data into many partitions, Hive provides dynamic partitioning, which can infer partition names from the position of the query columns and create the partitions accordingly.
set hive.exec.dynamic.partition=true (default false): enables dynamic partitioning.
set hive.exec.dynamic.partition.mode=nonstrict (default strict): allows all partitions to be dynamic; otherwise there must be at least one static partition column.
By default the last field(s) of the query supply the partition values, so the partition columns must be placed last and the order must not be mixed up. The partition name is inferred from the position of the query fields, not from the field names.
create table tb_part_dynamic_1(
  id int,
  name string,
  hobby array<string>,
  add map<String,string>
)
partitioned by (tag string)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':'
lines terminated by '\n'
;
insert overwrite table tb_part_dynamic_1 partition(tag) select id,name,hobby,add,part_tag from tb_partitions;

create table tb_part_dynamic_2(
  id int,
  name string,
  hobby array<string>,
  add map<String,string>
)
partitioned by (categ string,tag string)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':'
lines terminated by '\n'
;
insert overwrite table tb_part_dynamic_2 partition(categ = 'big',tag) select id,name,hobby,add,part_tag from tb_partitions;

create table tb_part_dynamic_3(
  id int,
  name string,
  hobby array<string>,
  add map<String,string>
)
partitioned by (tag1 string,tag2 string)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':'
lines terminated by '\n'
;
insert overwrite table tb_part_dynamic_3 partition(tag1,tag2) select id,name,hobby,add,categ,tag from tb_part_dynamic_2;
Bucketing divides the data at a finer granularity than partitioning. It distributes the entire data set by the hash value of some column: to split the data into num_buckets buckets on the clustered_col column, the hash of the clustered_col value is taken modulo num_buckets and rows are assigned to buckets by the result. Rows whose modulo result is 0, 1, 2, ..., num_buckets-1 are stored in separate files under the table directory.
Before bucketing, run set hive.enforce.bucketing=true;
Use the keyword clustered by to specify the column to bucket on, and also specify the number of buckets, num_buckets.
The bucketing column is an existing column of the table, so no data type is specified.
Viewing bucketed data requires tablesample.
select * from table_name tablesample(bucket 1 out of 3 on id)
A table can be both bucketed and partitioned; the file directory information shows the partition directories, but not the bucket sub-files inside them.
0: jdbc:hive2://node225:10000/db01> set hive.enforce.bucketing=true;
No rows affected (0.008 seconds)
0: jdbc:hive2://node225:10000/db01> set hive.enforce.bucketing;
+------------------------------+--+
| set                          |
+------------------------------+--+
| hive.enforce.bucketing=true  |
+------------------------------+--+
1 row selected (0.016 seconds)

create table tb_clusters(
  id int,
  name string,
  hobby array<string>,
  add map<String,string>
)
clustered by (id)
sorted by (name asc)
into 2 buckets
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':'
lines terminated by '\n'
;

insert overwrite table tb_clusters
select id,name,hobby,add from tb_partitions;
The CLUSTERED BY and SORTED BY creation commands do not affect how data is inserted into a table – only how it is read.
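A minimal sketch of reading one bucket back with tablesample (assuming tb_clusters was created with 2 buckets as above; bucket numbering starts at 1):

select * from tb_clusters tablesample(bucket 1 out of 2 on id);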
Testing here ran into some problems; the observed behaviour is recorded first.
# Write the normal data to the local file system with the so-called default delimiters
insert overwrite local directory '/usr/local/hive-2.1.1/data_dir/tb_insert_multi_02'
row format delimited
fields terminated by '^A'
collection items terminated by '^B'
map keys terminated by '^C'
lines terminated by '\n'
stored as textfile
select * from tb_insert_multi_02
;

# Content of the local file
[root@node225 ~]# cat /usr/local/hive-2.1.1/data_dir/tb_insert_multi_02/000000_0
1^xiaoming^book^TV^code^beijing^chaoyang^shagnhai^pudong^first^100
2^lilei^book^code^nanjing^jiangning^taiwan^taibei^first^100
3^lihua^music^book^heilongjiang^haerbin^first^100
1^xiaoming^book^TV^code^beijing^chaoyang^shagnhai^pudong^first^100
2^lilei^book^code^nanjing^jiangning^taiwan^taibei^first^100
3^lihua^music^book^heilongjiang^haerbin^first^100

# Create a table with the so-called default delimiters
create table tb_data_format(
  id int,
  name string,
  hobby array<string>,
  add map<String,string>,
  tag1 string,
  tag2 int
)
row format delimited
fields terminated by '^A'
collection items terminated by '^B'
map keys terminated by '^C'
lines terminated by '\n'
;

0: jdbc:hive2://node225:10000/db01> desc formatted tb_data_format;
OK
| col_name | data_type | comment |
| # col_name | data_type | comment |
| | NULL | NULL |
| id | int | |
| name | string | |
| hobby | array<string> | |
| add | map<string,string> | |
| tag1 | string | |
| tag2 | int | |
| | NULL | NULL |
| # Detailed Table Information | NULL | NULL |
| Database: | db01 | NULL |
| Owner: | root | NULL |
| CreateTime: | Thu Oct 11 16:42:25 CST 2018 | NULL |
| LastAccessTime: | UNKNOWN | NULL |
| Retention: | 0 | NULL |
| Location: | hdfs://ns1/user/hive/warehouse/db01.db/tb_data_format | NULL |
| Table Type: | MANAGED_TABLE | NULL |
| Table Parameters: | NULL | NULL |
| | COLUMN_STATS_ACCURATE | {\"BASIC_STATS\":\"true\"} |
| | numFiles | 0 |
| | numRows | 0 |
| | rawDataSize | 0 |
| | totalSize | 0 |
| | transient_lastDdlTime | 1539247345 |
| | NULL | NULL |
| # Storage Information | NULL | NULL |
| SerDe Library: | org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe | NULL |
| InputFormat: | org.apache.hadoop.mapred.TextInputFormat | NULL |
| OutputFormat: | org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat | NULL |
| Compressed: | No | NULL |
| Num Buckets: | -1 | NULL |
| Bucket Columns: | [] | NULL |
| Sort Columns: | [] | NULL |
| Storage Desc Params: | NULL | NULL |
| | colelction.delim | ^B |
| | field.delim | ^A |
| | line.delim | \n |
| | mapkey.delim | ^C |
| | serialization.format | ^A |

# Load the data
load data local inpath '/usr/local/hive-2.1.1/data_dir/tb_insert_multi_02/000000_0' into table tb_data_format;

# Query the table to confirm: the data was not loaded in the expected form
0: jdbc:hive2://node225:10000/db01> select * from tb_data_format;
OK
| tb_data_format.id | tb_data_format.name | tb_data_format.hobby | tb_data_format.add | tb_data_format.tag1 | tb_data_format.tag2 |
| 1 | xiaoming | ["book"] | {"TV":null} | code | NULL |
| 2 | lilei | ["book"] | {"code":null} | nanjing | NULL |
| 3 | lihua | ["music"] | {"book":null} | heilongjiang | NULL |
| 1 | xiaoming | ["book"] | {"TV":null} | code | NULL |
| 2 | lilei | ["book"] | {"code":null} | nanjing | NULL |
| 3 | lihua | ["music"] | {"book":null} | heilongjiang | NULL |

# Create a table with the built-in default delimiters and check its structure; the defaults shown are not the so-called "^A" etc.
0: jdbc:hive2://node225:10000/db01> create table tb_data_delimit(
. . . . . . . . . . . . . . . . . > id int,
. . . . . . . . . . . . . . . . . > name string,
. . . . . . . . . . . . . . . . . > hobby array<string>,
. . . . . . . . . . . . . . . . . > add map<String,string>,
. . . . . . . . . . . . . . . . . > tag1 string,
. . . . . . . . . . . . . . . . . > tag2 int
. . . . . . . . . . . . . . . . . > )
. . . . . . . . . . . . . . . . . > ;
OK
No rows affected (0.207 seconds)
0: jdbc:hive2://node225:10000/db01> desc formatted tb_data_delimit;
OK
| col_name | data_type | comment |
| # col_name | data_type | comment |
| | NULL | NULL |
| id | int | |
| name | string | |
| hobby | array<string> | |
| add | map<string,string> | |
| tag1 | string | |
| tag2 | int | |
| | NULL | NULL |
| # Detailed Table Information | NULL | NULL |
| Database: | db01 | NULL |
| Owner: | root | NULL |
| CreateTime: | Thu Oct 11 16:43:28 CST 2018 | NULL |
| LastAccessTime: | UNKNOWN | NULL |
| Retention: | 0 | NULL |
| Location: | hdfs://ns1/user/hive/warehouse/db01.db/tb_data_delimit | NULL |
| Table Type: | MANAGED_TABLE | NULL |
| Table Parameters: | NULL | NULL |
| | COLUMN_STATS_ACCURATE | {\"BASIC_STATS\":\"true\"} |
| | numFiles | 0 |
| | numRows | 0 |
| | rawDataSize | 0 |
| | totalSize | 0 |
| | transient_lastDdlTime | 1539247408 |
| | NULL | NULL |
| # Storage Information | NULL | NULL |
| SerDe Library: | org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe | NULL |
| InputFormat: | org.apache.hadoop.mapred.TextInputFormat | NULL |
| OutputFormat: | org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat | NULL |
| Compressed: | No | NULL |
| Num Buckets: | -1 | NULL |
| Bucket Columns: | [] | NULL |
| Sort Columns: | [] | NULL |
| Storage Desc Params: | NULL | NULL |
| | serialization.format | 1 |

# Replacing the delimiters with '\001' etc. likewise had problems
0: jdbc:hive2://node225:10000/db01> create table tb_data_format(
. . . . . . . . . . . . . . . . . > id int,
. . . . . . . . . . . . . . . . . > name string,
. . . . . . . . . . . . . . . . . > hobby array<string>,
. . . . . . . . . . . . . . . . . > add map<String,string>,
. . . . . . . . . . . . . . . . . > tag1 string,
. . . . . . . . . . . . . . . . . > tag2 int
. . . . . . . . . . . . . . . . . > )
. . . . . . . . . . . . . . . . . > row format delimited
. . . . . . . . . . . . . . . . . > fields terminated by '\001'
. . . . . . . . . . . . . . . . . > collection items terminated by '\002'
. . . . . . . . . . . . . . . . . > map keys terminated by '\003'
. . . . . . . . . . . . . . . . . > lines terminated by '\n'
. . . . . . . . . . . . . . . . . > ;
DELIMITED
Between fields:
FIELDS TERMINATED BY ','
Between elements within a collection, and between K-V pairs:
COLLECTION ITEMS TERMINATED BY '-'
Inside each K-V pair:
MAP KEYS TERMINATED BY ':'
Between records, the newline character:
LINES TERMINATED BY '\n'
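Applied to a sample row from the data file used earlier, the delimiters split the record like this (illustrative):

1,xiaoming,book-TV-code,beijing:chaoyang-shagnhai:pudong
-- fields ','           -> id=1, name=xiaoming, hobby=book-TV-code, add=beijing:chaoyang-shagnhai:pudong
-- collection items '-' -> hobby = ["book","TV","code"]; map entries beijing:chaoyang and shagnhai:pudong
-- map keys ':'         -> add = {"beijing":"chaoyang","shagnhai":"pudong"}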
SERDE
SerDe is short for Serialize/Deserialize, i.e. serialization and deserialization.
Serialization is the process of converting an object into a byte sequence.
Deserialization is the process of restoring a byte sequence back into an object.
Serialization converts the java objects Hive works with into byte sequences that can be written to HDFS, or into stream files that other systems can recognize; deserialization converts strings or binary streams into java objects that Hive can recognize. For example, a select statement uses deserialization to parse the data read from HDFS, while an insert statement uses serialization, because writing the data into HDFS requires serializing it.
When creating a table, Hive specifies how the data is serialized and deserialized through a custom SerDe or one of Hive's built-in SerDe types.
The SerDe type is declared with the row format clause.
Built-in SerDe types include
Avro
ORC
RegEx
Thrift
Parquet
CSV
JsonSerDe
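For instance, a hedged sketch using the JsonSerDe listed above (the table name tb_json is hypothetical, and the hive-hcatalog-core jar may need to be on the classpath first):

create table tb_json(
  id int,
  name string,
  hobby array<string>
)
row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'
stored as textfile;
-- each line of the data file is one JSON object, e.g. {"id":1,"name":"xiaoming","hobby":["book","TV"]}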
RegexSerDe is commonly used to parse multi-character delimiters.
It requires two parameters:
input.regex = "(.*)::(.*)::(.*)"
output.format.string = "%1$s %2$s %3$s"
create table t_user(
  userid bigint comment 'user id',
  gender string comment 'gender',
  age int comment 'age',
  occupation string comment 'occupation',
  zipcode string comment 'zip code'
) comment 'user information table'
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s %5$s')
stored as textfile;
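A hedged usage sketch (the file name users.dat, its path, and the sample line are hypothetical; each input line must match input.regex):

-- hypothetical input line: 1::F::25::teacher::100083
load data local inpath '/usr/local/hive-2.1.1/data_dir/users.dat' into table t_user;
select * from t_user limit 3;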
TEXTFILE: the default format. Loading data copies the data file directly to HDFS without processing; disk overhead is high and data parsing overhead is high.
Tables in SEQUENCEFILE, RCFILE, or ORCFILE format cannot load data directly from local files. The data must first be loaded into a table in textfile format, and then inserted from that table into the SequenceFile, RCFile, or ORCFile table with insert.
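A minimal sketch of that two-step load into an ORC table (the table name tb_managed_orc is hypothetical; tb_managed is the textfile table created earlier):

-- step 1: load the raw file into the textfile table
load data local inpath '/usr/local/hive-2.1.1/data_dir/complex_data_type.txt' overwrite into table tb_managed;
-- step 2: create the ORC table and populate it via insert ... select
create table tb_managed_orc(
  id int,
  name string,
  hobby array<string>,
  add map<String,string>
)
stored as orc;
insert overwrite table tb_managed_orc select id,name,hobby,add from tb_managed;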
Compared with TEXTFILE and SEQUENCEFILE, RCFILE's columnar storage makes data loading more expensive, but it gives a better compression ratio and query response. Since the characteristic of a data warehouse is write once, read many, RCFILE has a fairly clear advantage over the other two formats overall.
SEQUENCEFILE
A binary file format supported by the Hadoop API; it is easy to use, splittable, and compressible. It supports three compression options: NONE, RECORD, and BLOCK. Record-level compression has a low compression ratio, so BLOCK compression is generally recommended.
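A hedged sketch of writing a BLOCK-compressed SequenceFile table (the table name tb_managed_seq is hypothetical; io.seqfile.compression.type is the Hadoop property selecting NONE, RECORD, or BLOCK):

set hive.exec.compress.output=true;
set io.seqfile.compression.type=BLOCK;
create table tb_managed_seq(
  id int,
  name string
)
stored as sequencefile;
insert overwrite table tb_managed_seq select id,name from tb_managed;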
RCFILE
A storage format combining row and columnar storage. First, it splits the data into row groups, ensuring that a single record resides in one block, so reading one record never requires reading multiple blocks. Second, within each row group the data is stored by column, which benefits data compression and fast column access.