學習筆記--Impala

參考 https://www.cloudera.com/documentation/enterprise/5-5-x/topics/impala_datetime_functions.html

HDFS 文件操作

$ whoami
cloudera
$ hdfs dfs -ls /user
Found 3 items
drwxr-xr-x  - cloudera cloudera            0 2013-04-22 18:54 /user/cloudera
drwxrwx---  - mapred  mapred              0 2013-03-15 20:11 /user/history
drwxr-xr-x  - hue      supergroup          0 2013-03-15 20:10 /user/hive

$ hdfs dfs -mkdir -p /user/cloudera/sample_data/tab1 /user/cloudera/sample_data/tab2
$ hdfs dfs -put tab1.csv /user/cloudera/sample_data/tab1
$ hdfs dfs -cat /user/hive/tpcds/customer/customer.dat

Impala和關係數據庫

Impala 關係型數據庫
Impala 使用類似於 HiveQL 的類 SQL 查詢語言。 關係數據庫使用 SQL 語言。
在 Impala 中,您無法更新或刪除單個記錄。 在關係數據庫中,可以更新或刪除單個記錄。
Impala 不支持事務。 關係數據庫支持事務。
Impala 不支持索引。 關係數據庫支持索引。
Impala 存儲和管理大量數據(PB)。 與 Impala 相比,關係數據庫處理的數據量較少(TB)。

HBase、Hive和Impala

HBase Hive Impala
HBase是基於Apache Hadoop的寬列存儲數據庫。 它使用BigTable的概念。 Hive是一個數據倉庫軟件。 使用它,我們可以訪問和管理基於Hadoop的大型分佈式數據集。 Impala是一個管理,分析存儲在Hadoop上的數據的工具。
HBase的數據模型是寬列存儲。 Hive遵循關係模型。 Impala遵循關係模型。
HBase是使用Java語言開發的。 Hive是使用Java語言開發的。 Impala是使用C++開發的。
HBase的數據模型是無模式的。 Hive的數據模型是基於模式的。 Impala的數據模型是基於模式的。
HBase提供Java,RESTful和Thrift API。 Hive提供JDBC,ODBC,Thrift API。 Impala提供JDBC和ODBC API。
支持C,C#,C++,Groovy,Java,PHP,Python和Scala等編程語言。 支持C++,Java,PHP和Python等編程語言。 Impala支持全部支持JDBC/ODBC的語言。
HBase提供對觸發器的支持。 Hive不提供任何觸發器支持。 Impala不提供對觸發器的任何支持。

打開 Impala-shell

[root@quickstart cloudera] # impala-shell

SQL

利用外部數據文件建立 Impala 的表

-- Create an EXTERNAL table over CSV data already uploaded to HDFS;
-- dropping an EXTERNAL table leaves the underlying files in place.
CREATE EXTERNAL TABLE tab1
(
  id INT,
  col_1 BOOLEAN,
  col_2 DOUBLE,
  col_3 TIMESTAMP
)
-- Plain-text rows with comma-separated fields (matches the tab1.csv put into HDFS above).
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
-- Directory created earlier with: hdfs dfs -mkdir -p /user/cloudera/sample_data/tab1
LOCATION '/user/cloudera/sample_data/tab1';

建立分區

-- Create a database to hold the partitioned example table.
CREATE DATABASE external_partitions;
USE external_partitions;

-- Partition columns (year/month/day/host) become HDFS directory levels,
-- not stored fields in the data files.
CREATE TABLE logs (
  field1 STRING,
  field2 STRING,
  field3 STRING
)
PARTITIONED BY (year STRING, month STRING, day STRING, host STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

-- Insert one row into each of two host partitions for the same date.
INSERT INTO logs PARTITION (year="2013", month="07", day="28", host="host1") VALUES ("foo","foo","foo");
INSERT INTO logs PARTITION (year="2013", month="07", day="28", host="host2") VALUES ("foo","foo","foo");

ALTER TABLE

ALTER TABLE [old_db_name.]old_table_name RENAME TO [new_db_name.]new_table_name

ALTER TABLE name ADD COLUMNS (col_spec[, col_spec ...])
ALTER TABLE name DROP [COLUMN] column_name
ALTER TABLE name CHANGE column_name new_name new_type
ALTER TABLE name REPLACE COLUMNS (col_spec[, col_spec ...])

ALTER TABLE name { ADD [IF NOT EXISTS] | DROP [IF EXISTS] } PARTITION (partition_spec) [PURGE]
ALTER TABLE name RECOVER PARTITIONS

ALTER TABLE name [PARTITION (partition_spec)]
  SET { FILEFORMAT file_format
  | LOCATION 'hdfs_path_of_directory'
  | TBLPROPERTIES (table_properties)
  | SERDEPROPERTIES (serde_properties) }

ALTER TABLE name [PARTITION (partition_spec)] SET { CACHED IN 'pool_name' [WITH REPLICATION = integer] | UNCACHED }

new_name ::= [new_database.]new_table_name

col_spec ::= col_name type_name

partition_spec ::= partition_col=constant_value

table_properties ::= 'name'='value'[, 'name'='value' ...]

serde_properties ::= 'name'='value'[, 'name'='value' ...]

file_format ::= { PARQUET | TEXTFILE | RCFILE | SEQUENCEFILE | AVRO }

ALTER VIEW

ALTER VIEW [database_name.]view_name AS select_statement
ALTER VIEW [database_name.]view_name RENAME TO [database_name.]view_name

COMPUTE STATS

COMPUTE STATS [db_name.]table_name
COMPUTE INCREMENTAL STATS [db_name.]table_name [PARTITION (partition_spec)]

partition_spec ::= partition_col=constant_value

CREATE DATABASE

CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name [COMMENT 'database_comment']
  [LOCATION hdfs_path];

CREATE FUNCTION

--Creates a user-defined function (UDF)
CREATE FUNCTION [IF NOT EXISTS] [db_name.]function_name([arg_type[, arg_type...]])
  RETURNS return_type
  LOCATION 'hdfs_path'
  SYMBOL='symbol_or_class'
--user-defined aggregate function (UDA)
CREATE [AGGREGATE] FUNCTION [IF NOT EXISTS] [db_name.]function_name([arg_type[, arg_type...]])
  RETURNS return_type
  LOCATION 'hdfs_path'
  [INIT_FN='function']
  UPDATE_FN='function'
  MERGE_FN='function'
  [PREPARE_FN='function']
  [CLOSE_FN='function']
  [SERIALIZE_FN='function']
  [FINALIZE_FN='function']
  [INTERMEDIATE type_spec]

CREATE ROLE

CREATE ROLE role_name

CREATE TABLE

--Explicit column definitions:
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
  (col_name data_type [COMMENT 'col_comment'], ...)
  [PARTITIONED BY (col_name data_type [COMMENT 'col_comment'], ...)]
  [COMMENT 'table_comment']
  [WITH SERDEPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
  [
  [ROW FORMAT row_format] [STORED AS file_format]
  ]
  [LOCATION 'hdfs_path']
  [TBLPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
  [CACHED IN 'pool_name' [WITH REPLICATION = integer] | UNCACHED]
--Column definitions inferred from data file:
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
  LIKE PARQUET 'hdfs_path_of_parquet_file'
  [COMMENT 'table_comment']
  [PARTITIONED BY (col_name data_type [COMMENT 'col_comment'], ...)]
  [WITH SERDEPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
  [
  [ROW FORMAT row_format] [STORED AS file_format]
  ]
  [LOCATION 'hdfs_path']
  [TBLPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
  [CACHED IN 'pool_name' [WITH REPLICATION = integer] | UNCACHED]
data_type:
    primitive_type
  | array_type
  | map_type
  | struct_type
--CREATE TABLE AS SELECT:
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
  [COMMENT 'table_comment']
  [WITH SERDEPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
  [
  [ROW FORMAT row_format] [STORED AS ctas_file_format]
  ]
  [LOCATION 'hdfs_path']
  [TBLPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
  [CACHED IN 'pool_name' [WITH REPLICATION = integer] | UNCACHED]
AS
  select_statement
primitive_type:
    TINYINT
  | SMALLINT
  | INT
  | BIGINT
  | BOOLEAN
  | FLOAT
  | DOUBLE
  | DECIMAL
  | STRING
  | CHAR
  | VARCHAR
  | TIMESTAMP

complex_type:
    struct_type
  | array_type
  | map_type

struct_type: STRUCT < name : primitive_or_complex_type [COMMENT 'comment_string'], ... >

array_type: ARRAY < primitive_or_complex_type >

map_type: MAP < primitive_type, primitive_or_complex_type >

row_format:
  DELIMITED [FIELDS TERMINATED BY 'char' [ESCAPED BY 'char']]
  [LINES TERMINATED BY 'char']

file_format:
    PARQUET
  | TEXTFILE
  | AVRO
  | SEQUENCEFILE
  | RCFILE

ctas_file_format:
    PARQUET
  | TEXTFILE

CREATE VIEW

CREATE VIEW [IF NOT EXISTS] view_name [(column_list)]
  AS select_statement

DESCRIBE

DESCRIBE [FORMATTED] [db_name.]table_name[.complex_col_name ...]

DROP DATABASE

DROP (DATABASE|SCHEMA) [IF EXISTS] database_name [RESTRICT | CASCADE];

DROP TABLE

DROP TABLE [IF EXISTS] [db_name.]table_name [PURGE]

DROP VIEW

DROP VIEW [IF EXISTS] [db_name.]view_name

DROP FUNCTION

DROP [AGGREGATE] FUNCTION [IF EXISTS] [db_name.]function_name(type[, type...])

DROP ROLE

DROP ROLE role_name

DROP STATS

DROP STATS [database_name.]table_name
DROP INCREMENTAL STATS [database_name.]table_name PARTITION (partition_spec)

partition_spec ::= partition_col=constant_value

EXPLAIN

EXPLAIN { select_query | ctas_stmt | insert_stmt }

GRANT

GRANT ROLE role_name TO GROUP group_name

GRANT privilege ON object_type object_name
  TO [ROLE] roleName
  [WITH GRANT OPTION]

privilege ::= SELECT | SELECT(column_name) | INSERT | ALL
object_type ::= TABLE | DATABASE | SERVER | URI

INSERT

[with_clause]
INSERT { INTO | OVERWRITE } [TABLE] table_name
  [(column_list)]
  [ PARTITION (partition_clause)]
{
    [hint_clause] select_statement
  | VALUES (value [, value ...]) [, (value [, value ...]) ...]
}

partition_clause ::= col_name [= constant] [, col_name [= constant] ...]

hint_clause ::= [SHUFFLE] | [NOSHUFFLE]    (Note: the square brackets are part of the syntax.)

INVALIDATE METADATA

INVALIDATE METADATA [[db_name.]table_name]

LOAD DATA

LOAD DATA INPATH 'hdfs_file_or_directory_path' [OVERWRITE] INTO TABLE tablename
  [PARTITION (partcol1=val1, partcol2=val2 ...)]

REFRESH

REFRESH [db_name.]table_name

REVOKE

REVOKE ROLE role_name FROM GROUP group_name

REVOKE privilege ON object_type object_name
  FROM [ROLE] role_name

privilege ::= SELECT | SELECT(column_name) | INSERT | ALL
object_type ::= TABLE | DATABASE | SERVER | URI

SELECT

[WITH name AS (select_expression) [, ...] ]
SELECT
  [ALL | DISTINCT]
  [STRAIGHT_JOIN]
  expression [, expression ...]
FROM table_reference [, table_reference ...]
[[FULL | [LEFT | RIGHT] INNER | [LEFT | RIGHT] OUTER | [LEFT | RIGHT] SEMI | [LEFT | RIGHT] ANTI | CROSS]
  JOIN table_reference
  [ON join_equality_clauses | USING (col1[, col2 ...])] ...
WHERE conditions
GROUP BY { column | expression [, ...] }
HAVING conditions
ORDER BY { column | expression [ASC | DESC] [NULLS FIRST | NULLS LAST] [, ...] }
LIMIT expression [OFFSET expression]
[UNION [ALL] select_statement] ...]

SET

SET [query_option=option_value]

SHOW

SHOW DATABASES [[LIKE] 'pattern']
SHOW SCHEMAS [[LIKE] 'pattern'] - an alias for SHOW DATABASES
SHOW TABLES [IN database_name] [[LIKE] 'pattern']
SHOW [AGGREGATE | ANALYTIC] FUNCTIONS [IN database_name] [[LIKE] 'pattern']
SHOW CREATE TABLE [database_name.]table_name
SHOW TABLE STATS [database_name.]table_name
SHOW COLUMN STATS [database_name.]table_name
SHOW PARTITIONS [database_name.]table_name
SHOW FILES IN [database_name.]table_name [PARTITION (key_col=value [, key_col=value])]

SHOW ROLES
SHOW CURRENT ROLES
SHOW ROLE GRANT GROUP group_name
SHOW GRANT ROLE role_name

TRUNCATE TABLE

TRUNCATE TABLE [db_name.]table_name

USE

USE db_name

經常使用函數

數學函數

abs(numeric_type a)
acos(double a)
asin(double a)
atan(double a)
atan2(double a, double b)
bin(bigint a)
ceil(double a), ceil(decimal(p,s) a), ceiling(double a), ceiling(decimal(p,s) a), dceil(double a), dceil(decimal(p,s) a)
conv(bigint num, int from_base, int to_base), conv(string num, int from_base, int to_base)
cos(double a)
cosh(double a)
cot(double a)
degrees(double a)
e()
exp(double a), dexp(double a)
factorial(integer_type a)

位運算函數

bitand(integer_type a, same_type b)
bitnot(integer_type a)
bitor(integer_type a, same_type b)
bitxor(integer_type a, same_type b)

類型轉換函數

cast(expr AS type)
typeof(type value)

時間日期函數

add_months(timestamp date, int months), add_months(timestamp date, bigint months)
adddate(timestamp startdate, int days), adddate(timestamp startdate, bigint days),
current_timestamp()
now()
date_add(timestamp startdate, int days), date_add(timestamp startdate, interval_expression)
date_part(string, timestamp)
date_sub(timestamp startdate, int days), date_sub(timestamp startdate, interval_expression)
datediff(timestamp enddate, timestamp startdate)
day(timestamp date), dayofmonth(timestamp date)
dayname(timestamp date)
dayofweek(timestamp date)
dayofyear(timestamp date)
days_add(timestamp startdate, int days), days_add(timestamp startdate, bigint days)
days_sub(timestamp startdate, int days), days_sub(timestamp startdate, bigint days)
extract(timestamp, string unit), extract(unit FROM timestamp)

條件函數

CASE a WHEN b THEN c [WHEN d THEN e]... [ELSE f] END
if(boolean condition, type ifTrue, type ifFalseOrNull)
ifnull(type a, type ifNull)
isfalse(boolean)
isnotfalse(boolean)
isnull(type a, type ifNull)
istrue(boolean)

字符串函數

ascii(string str)
btrim(string a), btrim(string a, string chars_to_trim)
char_length(string a), character_length(string a)
chr(int character_code)
concat(string a, string b...)
find_in_set(string str, string strList)
length(string a)
lower(string a), lcase(string a)
regexp_extract(string subject, string pattern, int index)
regexp_like(string source, string pattern[, string options])
repeat(string str, int n)
reverse(string a)
substr(string a, int start [, int len]), substring(string a, int start [, int len])
translate(string input, string from, string to)
trim(string a)
upper(string a), ucase(string a)

聚合函數

APPX_MEDIAN([DISTINCT | ALL] expression)
AVG([DISTINCT | ALL] expression) [OVER (analytic_clause)]
COUNT([DISTINCT | ALL] expression) [OVER (analytic_clause)]
GROUP_CONCAT([ALL] expression [, separator])
MAX([DISTINCT | ALL] expression) [OVER (analytic_clause)]
MIN([DISTINCT | ALL] expression) [OVER (analytic_clause)]
NDV([DISTINCT | ALL] expression)
{ STDDEV | STDDEV_SAMP | STDDEV_POP } ([DISTINCT | ALL] expression)
SUM([DISTINCT | ALL] expression) [OVER (analytic_clause)]
{ VARIANCE | VAR[IANCE]_SAMP | VAR[IANCE]_POP } ([DISTINCT | ALL] expression)
相關文章
相關標籤/搜索