load data local inpath '/home/hadoop/emp.txt' into table emp_1;
Syntax: load data local inpath '<file path>' [overwrite] into table <table name>
hive> create table emp_1
    > (empno int,
    > ename string,
    > job string,
    > mgr int,
    > hirdate string,
    > sal double,
    > comm double,
    > deptno int) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
OK
Time taken: 0.157 seconds
hive> load data local inpath '/home/hadoop/emp.txt' overwrite into table emp_1;
Loading data to table default.emp_1
Table default.emp_1 stats: [numFiles=1, numRows=0, totalSize=700, rawDataSize=0]
OK
Time taken: 0.444 seconds
hive> select * from emp_1;
OK
7369  SMITH   CLERK      7902  1980-12-17  800.0    NULL    20
7499  ALLEN   SALESMAN   7698  1981-2-20   1600.0   300.0   30
7521  WARD    SALESMAN   7698  1981-2-22   1250.0   500.0   30
7566  JONES   MANAGER    7839  1981-4-2    2975.0   NULL    20
7654  MARTIN  SALESMAN   7698  1981-9-28   1250.0   1400.0  30
7698  BLAKE   MANAGER    7839  1981-5-1    2850.0   NULL    30
7782  CLARK   MANAGER    7839  1981-6-9    2450.0   NULL    10
7788  SCOTT   ANALYST    7566  1987-4-19   3000.0   NULL    20
7839  KING    PRESIDENT  NULL  1981-11-17  5000.0   NULL    10
7844  TURNER  SALESMAN   7698  1981-9-8    1500.0   0.0     30
7876  ADAMS   CLERK      7788  1987-5-23   1100.0   NULL    20
7900  JAMES   CLERK      7698  1981-12-3   950.0    NULL    30
7902  FORD    ANALYST    7566  1981-12-3   3000.0   NULL    20
7934  MILLER  CLERK      7782  1982-1-23   1300.0   NULL    10
8888  HIVE    PROGRAM    7839  1988-1-23   10300.0  NULL    NULL
Time taken: 0.125 seconds, Fetched: 15 row(s)
Check job progress in YARN:
http://192.168.83.11:8088/cluster
Hive needs to know the field delimiter, so specify one when creating the table (the default delimiter is \001, i.e. ^A); spaces and tabs (\t) are common choices.
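For example, to map a comma-separated file instead, only the delimiter clause changes; a minimal sketch (emp_csv is a hypothetical table name used for illustration):
create table emp_csv (empno int, ename string, sal double)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';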
Upload emp1.txt to Hadoop:
hadoop fs -put /home/hadoop/emp1.txt /
create table emp_2 as select * from emp_1 where 1>1;
load data inpath '/emp1.txt' into table emp_2;
Note: `where 1>1` is always false, so the CTAS above copies only the schema, no rows. Also note that a table created this way does not carry the delimiter clause:
hive> show create table emp_2;
OK
CREATE TABLE `emp_2`(
  `empno` int,
  `ename` string,
  `job` string,
  `mgr` int,
  `hirdate` string,
  `sal` double,
  `comm` double,
  `deptno` int)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'hdfs://hd1:9000/user/hive/warehouse/emp_2'
TBLPROPERTIES (
  'COLUMN_STATS_ACCURATE'='false',
  'numFiles'='2',
  'numRows'='-1',
  'rawDataSize'='-1',
  'totalSize'='700',
  'transient_lastDdlTime'='1540480415')
Time taken: 0.138 seconds, Fetched: 24 row(s)
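If the delimiter does matter, it can be written into the CTAS statement itself; a minimal sketch (this exact statement is an assumption, not taken from the session above):
create table emp_3
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
AS select * from emp_1;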
load data local inpath '/home/hadoop/emp1.txt' into table emp1;
The EXTERNAL keyword lets you create a table and provide a LOCATION so that Hive does not use a default location for this table. This comes in handy if you already have data generated. When dropping an EXTERNAL table, data in the table is NOT deleted from the file system.
create external table exter_emp
(empno int,
ename string,
job string,
mgr int,
hirdate string,
sal double,
comm double,
deptno int) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
location '/external/';
location '/external' is an HDFS path: drop files into it and the table loads the data automatically according to the delimiter. If there are multiple files under '/external/', the external table exter_emp loads the data from all of them.
Of course, an external table can also load local data: load data local inpath '/home/hadoop/emp1.txt' into table exter_emp;
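To see the directory-based loading in action, a file can be dropped straight into the location from within the hive shell (a sketch; hive's dfs command mirrors hadoop fs):
hive> dfs -put /home/hadoop/emp1.txt /external/;
hive> select count(*) from exter_emp;  -- rows from the new file are picked up automatically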
Partitioned tables can be created using the PARTITIONED BY clause. A table can have one or more partition columns and a separate data directory is created for each distinct value combination in the partition columns. Further, tables or partitions can be bucketed using CLUSTERED BY columns, and data can be sorted within that bucket via SORT BY columns. This can improve performance on certain kinds of queries.
create table part_emp(
empno int,
ename string,
job string,
hirdate string,
sal double,
comm double,
deptno int)
partitioned by(mgr int)
row format
delimited fields terminated by '\t';
load data local inpath '/home/hadoop/emp.txt' into table part_emp PARTITION (mgr=10);
Note two things: a static LOAD targets exactly one partition value per statement, and the partition column must not appear in the table's column list. When loading via LOAD DATA you therefore need to remove the mgr column from the txt file; otherwise the columns shift and the txt data no longer matches the partitioned table's data.
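The CLUSTERED BY / SORT BY clauses quoted earlier follow the same DDL pattern; a minimal sketch of a bucketed table (bucket_emp is a hypothetical name):
create table bucket_emp(
empno int,
ename string,
sal double,
deptno int)
clustered by (deptno) sorted by (sal) into 4 buckets
row format delimited fields terminated by '\t';
Before inserting, set hive.enforce.bucketing=true so Hive writes one file per bucket.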
Usage: INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1 FROM from_statement;
Note: the select column list must correspond one-to-one with the target table's columns.
hive> insert overwrite table t1 select * from emp_1;
hive> insert overwrite table t2 partition(mgr=10) select empno,ename,job,hirdate,sal,comm,deptno from part_emp;
Note: t2 must itself be a partitioned table, and the number of select columns must match the number of non-partition columns in t2.
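When the partition value should come from the data itself, a dynamic-partition insert avoids writing one statement per value; a sketch, assuming t2 has the same layout as part_emp:
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table t2 partition(mgr)
select empno,ename,job,hirdate,sal,comm,deptno,mgr from part_emp;
The partition column (mgr) must come last in the select list.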
Multi-insert (one scan of the source table feeding several inserts):
hive> create table emp2 (j string,s double,c double);
OK
Time taken: 0.114 seconds
hive> from emp_1
> insert into emp1 select empno,ename
> insert into emp2 select job,sal,comm ;
Query ID = hadoop_20181026015757_f2abf70b-7249-44e5-895d-ad22eddfcd7a
Total jobs = 5
Launching Job 1 out of 5
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1540475767311_0010, Tracking URL = http://hd1:8088/proxy/application_1540475767311_0010/
Kill Command = /home/hadoop/hadoop-2.6.0-cdh5.7.0/bin/hadoop job -kill job_1540475767311_0010
Hadoop job information for Stage-2: number of mappers: 1; number of reducers: 0
2018-10-26 02:02:47,297 Stage-2 map = 0%, reduce = 0%
2018-10-26 02:02:54,835 Stage-2 map = 100%, reduce = 0%, Cumulative CPU 1.51 sec
MapReduce Total cumulative CPU time: 1 seconds 510 msec
Ended Job = job_1540475767311_0010
Stage-5 is selected by condition resolver.
Stage-4 is filtered out by condition resolver.
Stage-6 is filtered out by condition resolver.
Stage-11 is selected by condition resolver.
Stage-10 is filtered out by condition resolver.
Stage-12 is filtered out by condition resolver.
Moving data to: hdfs://hd1:9000/user/hive/warehouse/emp1/.hive-staging_hive_2018-10-26_02-02-34_121_1621965597337759090-1/-ext-10000
Moving data to: hdfs://hd1:9000/user/hive/warehouse/emp2/.hive-staging_hive_2018-10-26_02-02-34_121_1621965597337759090-1/-ext-10002
Loading data to table default.emp1
Loading data to table default.emp2
Table default.emp1 stats: [numFiles=1, numRows=0, totalSize=164, rawDataSize=0]
Table default.emp2 stats: [numFiles=1, numRows=0, totalSize=278, rawDataSize=0]
MapReduce Jobs Launched:
Stage-Stage-2: Map: 1 Cumulative CPU: 1.51 sec HDFS Read: 4567 HDFS Write: 580 SUCCESS
Total MapReduce CPU Time Spent: 1 seconds 510 msec
OK
Time taken: 22.873 seconds
create database if not exists <db_name>; # create a database
alter database dbname set dbproperties('edited-by'='joe'); # modify a database (database properties cannot be deleted or "reset")
describe database extended dbname; # describe a database
drop database [if exists] dbname; # drop a database
desc database extended <db>; # show a database's extended info
hive> create external table dat0204(filmname string, filmdate date, filmscore string)
    > comment 'alias'
    > row format delimited
    > fields terminated by '\t'
    > lines terminated by '\n'
    > stored as textfile; # create an external table; external tables are relatively safer, organize data more flexibly, and make it easy to share the source data.
hive> create table if not exists dat0204(id int, name string, age int)
    > comment 'alias'
    > row format delimited
    > fields terminated by '\t'
    > lines terminated by '\n'
    > stored as textfile; # create a managed (internal) table
desc <table>; # describe the table
desc formatted <table>; # show the table structure, formatted
desc extended <table>; # show the table's extended info
select * from <table>; # query the table's data
create table db1.table1 like db2.table2; # copy a table (structure only, no data)
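To copy the data as well, use CREATE TABLE ... AS SELECT instead (hypothetical names; note the delimiter clause is lost, as shown with emp_2 above):
create table db1.table1 as select * from db2.table2; # copy a table (structure + data)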
alter table hive1.test2 add partition(province='hebei',city='baoding'); # add a partition
show partitions hive1.test2; # show a table's partitions
insert overwrite table test2 partition(province='hebei',city='shijiazhuang') select id, name, age from test1; # insert data into a partition
drop table <table>; # drop a table
drop database <db> cascade; # drop a non-empty database (DROP TABLE has no CASCADE in Hive)
show tables like '*name*'; # fuzzy-match table names
hive> load data local inpath 'path/filename' overwrite into table <table>; # import data from the local filesystem into a Hive table
hive> load data inpath 'path/filename' into table <table>; # import data from HDFS into a Hive table
hive> insert overwrite directory '<hdfs directory>' select user, login_time from user_login; # write query results to an HDFS directory (the directory need not exist beforehand)
$ hive -e "<sql statement>" > /tmp/out.txt # save a query's output to a local file
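Two related CLI switches (both standard hive options): -S suppresses progress messages so only the result reaches the file, and -f runs a SQL script from a file (query.sql is a hypothetical path):
$ hive -S -e "select * from emp_1" > /tmp/out.txt
$ hive -f /home/hadoop/query.sql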
hive> dfs -lsr /; // list files on HDFS: path/database/table/file
hive> dfs -rmr /<dir>; // dfs command: delete a directory
hive> !clear; // run a shell command from within hive
hive> !hadoop fs -lsr /; // run an HDFS command from within hive
1. Origin: open-sourced by Facebook to handle statistics over massive volumes of structured logs;
2. Structure: Hive is a data-warehouse tool built on Hadoop that maps structured data files onto tables and provides SQL-like querying;
(HQL as the query interface; HDFS for storage; MapReduce for computation;)
In essence: it translates HQL into MapReduce programs.
3. Use case: suited to offline (batch) data processing.
4. Schema: table metadata is stored in a database (the metastore).
5. Databases and tables are simply paths (directories) on HDFS.
6. Hive does not validate data on write; it validates on read (schema-on-read).
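Point 6 (schema-on-read) is easy to demonstrate: LOAD DATA only moves the file, and a field that fails to parse against the declared type simply reads back as NULL. A sketch (bad.txt is a hypothetical file whose second column is not numeric):
hive> create table t_sor(a int, b int) row format delimited fields terminated by '\t';
hive> load data local inpath '/home/hadoop/bad.txt' into table t_sor;  -- accepted with no validation
hive> select * from t_sor;  -- the unparseable field comes back as NULL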
We have created quite a few tables in Hive by now. As mentioned earlier, Hive's metadata is stored in MySQL, so where in MySQL does it live?
use <hive metastore database>;
select * from VERSION; # check the hive version
select * from TBLS \G; # list the tables registered in the metastore (\G prints one record per block, easier to read)
select * from SDS \G; # view the metadata of each table's HDFS directory
select * from PARTITIONS where TBL_ID=1 \G; # view a table's partitions
select * from COLUMNS_V2; # view a table's columns
select * from PARTITION_KEYS; # view a table's partition keys
select * from DBS; # view the databases (warehouse info)
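These metastore tables join on DB_ID and SD_ID, so a table's HDFS location can be resolved in one query; a sketch against the standard metastore schema:
select d.NAME as db_name, t.TBL_NAME, s.LOCATION
from TBLS t
join DBS d on t.DB_ID = d.DB_ID
join SDS s on t.SD_ID = s.SD_ID;  # map every table to its HDFS directory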
Tuning
1. explain: show the execution plan
explain select count(*) from test2;
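For more detail, the extended variant also prints the abstract syntax tree and the input/output paths:
explain extended select count(*) from test2;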
mysql> select * from TBLS;
+--------+-------------+-------+------------------+--------+-----------+-------+-----------+----------------+--------------------+--------------------+
| TBL_ID | CREATE_TIME | DB_ID | LAST_ACCESS_TIME | OWNER  | RETENTION | SD_ID | TBL_NAME  | TBL_TYPE       | VIEW_EXPANDED_TEXT | VIEW_ORIGINAL_TEXT |
+--------+-------------+-------+------------------+--------+-----------+-------+-----------+----------------+--------------------+--------------------+
|      1 |  1540322901 |     1 |                0 | hadoop |         0 |     1 | test      | MANAGED_TABLE  | NULL               | NULL               |
|      6 |  1540378940 |     6 |                0 | hadoop |         0 |     6 | t1        | MANAGED_TABLE  | NULL               | NULL               |
|     13 |  1540477772 |     1 |                0 | hadoop |         0 |    13 | emp_1     | MANAGED_TABLE  | NULL               | NULL               |
|     16 |  1540481399 |     1 |                0 | hadoop |         0 |    16 | emp_2     | MANAGED_TABLE  | NULL               | NULL               |
|     17 |  1540481750 |     1 |                0 | hadoop |         0 |    17 | emp_3     | MANAGED_TABLE  | NULL               | NULL               |
|     21 |  1540482372 |     1 |                0 | hadoop |         0 |    21 | exter_emp | EXTERNAL_TABLE | NULL               | NULL               |
|     26 |  1540484035 |     1 |                0 | hadoop |         0 |    26 | part_emp  | MANAGED_TABLE  | NULL               | NULL               |
+--------+-------------+-------+------------------+--------+-----------+-------+-----------+----------------+--------------------+--------------------+
7 rows in set (0.00 sec)
They all live in the hive metastore's TBLS table.
Modifying table columns:
hive> alter table test change id id string;
OK
Time taken: 0.388 seconds
hive> desc test;
OK
id string
Time taken: 0.179 seconds, Fetched: 1 row(s)
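Other common ALTER TABLE forms (standard Hive DDL; the column and table names here are illustrative):
hive> alter table test add columns (remark string);  -- append a column
hive> alter table test rename to test_bak;           -- rename the table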
hive> create table t1 like emp_1;
OK
Time taken: 0.146 seconds
hive> desc formatted t1l;
FAILED: SemanticException [Error 10001]: Table not found t1l
hive> desc formatted emp_1;
OK
# col_name              data_type               comment
empno                   int
ename                   string
job                     string
mgr                     int
hirdate                 string
sal                     double
comm                    double
deptno                  int

# Detailed Table Information
Database:               default
Owner:                  hadoop
CreateTime:             Thu Oct 25 22:29:32 CST 2018
LastAccessTime:         UNKNOWN
Protect Mode:           None
Retention:              0
Location:               hdfs://hd1:9000/user/hive/warehouse/emp_1
Table Type:             MANAGED_TABLE
Table Parameters:
        COLUMN_STATS_ACCURATE   true
        numFiles                1
        numRows                 0
        rawDataSize             0
        totalSize               700
        transient_lastDdlTime   1540477792

# Storage Information
SerDe Library:          org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
InputFormat:            org.apache.hadoop.mapred.TextInputFormat
OutputFormat:           org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
Compressed:             No
Num Buckets:            -1
Bucket Columns:         []
Sort Columns:           []
Storage Desc Params:
        field.delim             \t
        serialization.format    \t
Time taken: 0.214 seconds, Fetched: 39 row(s)