經常使用SQL
建立表
1
2
3
4
5
6
7
|
-- Equivalent of the legacy declaration MergeTree(eventDate, (impid, eventDate), 8192),
-- written with the explicit clause syntax: monthly partitions derived from eventDate,
-- sorting key (impid, eventDate), index granularity 8192.
CREATE TABLE b6logs
(
    eventDate Date,
    impid UInt64,
    uid String,
    idfa String,
    imei String
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(eventDate)
ORDER BY (impid, eventDate)
SETTINGS index_granularity = 8192;
|
通常狀況下, 都建議使用 MergeTree 引擎. 這個引擎須要有一個 Date 類型的列來做爲分區和索引的依據, 即上面的 eventDate.
導入CSV數據
1
|
# Feed the CSV file to the client on stdin and insert it into b6logs.
clickhouse-client --query="INSERT INTO b6logs FORMAT CSV" < xxx.csv
|
指定分隔符
1
|
# Same import, but with "|" as the CSV field delimiter.
clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO b6logs FORMAT CSV" < xxx.csv
|
導入數據時忽略錯誤
1
|
# Client flags that tolerate bad input rows: an allowed error count and an
# allowed error ratio. Append the usual --query="INSERT ..." when importing.
clickhouse-client \
    --input_format_allow_errors_num=100000 \
    --input_format_allow_errors_ratio=0.2
|
--input_format_allow_errors_num: 是容許的錯誤數
--input_format_allow_errors_ratio: 是容許的錯誤率, 範圍是 [0, 1]
導出 CSV 數據
1
|
# Join b2logs and b6logs on impid within one impid range and write
# uid, idfa, imei as CSV to a log file.
clickhouse-client --query="
    SELECT uid, idfa, imei
    FROM
    (
        SELECT impid, uid
        FROM b2logs
        WHERE impid >= 15289903030261609347 AND impid <= 15289904230261609347
    )
    ANY INNER JOIN
    (
        SELECT impid, idfa, imei
        FROM b6logs
        WHERE impid >= 15289903030261609347 AND impid <= 15289904230261609347
    ) USING (impid)
    FORMAT CSV
" > 9c9dc608-269b-4f02-b122-ef5dffb2669d.log
|
即語法爲 select xxxx format CSV
重命名表
1
|
-- Rename table tbl1 to tbl2.
RENAME TABLE tbl1 TO tbl2;
|
刪除表
1
|
-- Remove table tbl.
DROP TABLE tbl;
|
添加列
1
|
-- Add a UInt32 column named cost to dsp_statis, with a default of 0.
ALTER TABLE dsp_statis
    ADD COLUMN cost UInt32 DEFAULT 0;
|
查看表結構
1
|
-- Show the column definitions of table tbl.
DESCRIBE TABLE tbl;
|
更多語法, 參考官方文檔: https://clickhouse.yandex/docs/en/query_language/queries/
MergeTree 引擎中刪除分區
注意, 默認狀況下 MergeTree 引擎是按月分區的, 刪除分區時分區名的格式爲 201808 (即 YYYYMM).
若是想修改成按日分區, 則在建表時加上:
1
2
3
|
-- Tail of a CREATE TABLE statement that partitions by the eventDate value
-- itself (one partition per day).
ENGINE = MergeTree
PARTITION BY eventDate
ORDER BY imp_id
SETTINGS index_granularity = 8192;

-- With that partitioning, a single day can be dropped by its date.
ALTER TABLE xxx DROP PARTITION '2018-08-08';
|
默認狀況下, ClickHouse 不容許刪除大小超過 50GB 的分區或表. 能夠經過修改 server 的配置文件來永久配置, 也能夠臨時設置一下來刪除而不用重啓服務.
永久配置
1
2
3
4
5
6
7
8
|
# Open the ClickHouse server configuration file for editing as root.
sudo vim /etc/clickhouse-server/config.xml
而後找到下面兩行並取消註釋 (去掉 <!-- 和 -->), 使其生效:
<!-- <max_table_size_to_drop>0</max_table_size_to_drop> -->
<!-- <max_partition_size_to_drop>0</max_partition_size_to_drop> -->
0 表示不限制. 或者你也能夠設置爲你想限制的最大大小 (單位爲字節).
|
臨時設置
建立一個標誌文件:
1
|
# Create the force_drop_table flag file, then make it readable and writable
# by every user (mode 666).
sudo touch '/home/username/clickhouse/flags/force_drop_table' \
    && sudo chmod 666 '/home/username/clickhouse/flags/force_drop_table'
|
建立好以後, 就能夠執行上面的刪除分區或表的命令了.
查看表大小
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
|
-- Per-table storage summary computed from the active parts in system.parts.
-- The inner query aggregates raw numbers per (database, table); the outer
-- query only turns the byte counts into human-readable strings.
SELECT
    database,
    table,
    formatReadableSize(size) AS size,
    formatReadableSize(bytes_on_disk) AS bytes_on_disk,
    formatReadableSize(data_uncompressed_bytes) AS data_uncompressed_bytes,
    formatReadableSize(data_compressed_bytes) AS data_compressed_bytes,
    compress_rate,
    rows,
    days,
    formatReadableSize(avgDaySize) AS avgDaySize
FROM
(
    SELECT
        database,
        table,
        sum(bytes) AS size,
        sum(rows) AS rows,
        min(min_date) AS min_date,
        max(max_date) AS max_date,
        sum(bytes_on_disk) AS bytes_on_disk,
        sum(data_uncompressed_bytes) AS data_uncompressed_bytes,
        sum(data_compressed_bytes) AS data_compressed_bytes,
        -- Compressed size as a percentage of the uncompressed size.
        (data_compressed_bytes / data_uncompressed_bytes) * 100 AS compress_rate,
        -- Number of days between the earliest and latest part dates.
        max_date - min_date AS days,
        -- Divide by at least 1 so a table whose parts all share one date
        -- reports its total size instead of a division-by-zero result.
        size / greatest(max_date - min_date, 1) AS avgDaySize
    FROM system.parts
    WHERE active
    GROUP BY
        database,
        table
    ORDER BY
        database ASC,
        size DESC
)
|
執行 SQL 文件
1
|
# Run the statements in the SQL file against the given database;
# --multiquery allows more than one statement in the input.
clickhouse-client --database=數據庫 --multiquery < /tmp/your.sql.file
|
查看分區信息
1
|
-- List partition, part name and active flag for the parts of table 'visits'.
SELECT
    partition,
    name,
    active
FROM system.parts
WHERE table = 'visits'
|
性能相關收集
join 表性能
切記, 要用大表 join 小表, 即把小表放在 JOIN 的右側. ClickHouse 會把右表整個加載到內存中構建哈希表, 所以右表越小越好 (從經驗上看, 用大表做爲驅動表, 性能遠遠快於用小表做爲驅動表). (MySQL 裏的話, 則是小表驅動大表).
優化 distinct count
以前
1
|
-- Baseline: per-yob row count and distinct count of (uid, idfa, imei)
-- for a single eventDate.
SELECT
    yob,
    count(),
    count(DISTINCT uid, idfa, imei)
FROM nginx_bid_log
WHERE eventDate = '2018-9-1'
GROUP BY yob;
|
以後
1
|
-- Optimized variant: count distinct 64-bit hashes instead of distinct tuples.
-- The three columns are passed to sipHash64 as separate arguments rather than
-- concatenated, so triples such as ('ab', 'c', '') and ('a', 'bc', '') are not
-- collapsed into the same string before hashing.
SELECT
    yob,
    count(),
    count(DISTINCT sipHash64(uid, idfa, imei))
FROM nginx_bid_log
WHERE eventDate = '2018-9-1'
GROUP BY yob;
|
查看數據分佈
1
|
-- Histogram of upstream_resp_time for one eventDate, emitted as CSV.
-- The filter is applied directly; no wrapping subquery is needed.
SELECT histogram(100)(upstream_resp_time)
FROM nginx_bid_log
WHERE eventDate = '2018-12-13'
FORMAT CSV;
|
histogram(100) 表示最多分成 100 個區間 (bin), 後面的 upstream_resp_time 是你的列名, 即按這個列的數據來進行統計.
bar
1
|
-- Template for bar(x, min, max, width): draws a band proportional to (x - min)
-- that reaches `width` characters when x = max. The last argument is the bar
-- width, not a step size.
SELECT upstream_resp_time, bar(列名, 最小值, 最大值, 寬度) FROM tableXX;
|
顯示簡單的圖形.
hex 十六進制 轉換爲 十進制
1
|
-- Convert a hex string to a decimal integer: decode it with unhex, reverse the
-- resulting bytes, then reinterpret them as an Int64.
-- NOTE(review): the reverse() presumably puts the bytes into the order
-- reinterpretAsInt64 expects — confirm against the ClickHouse docs.
SELECT reinterpretAsInt64(reverse(unhex('123')));
|
md5 分區
1
2
3
4
5
|
-- 1) Turn the first hex character of md5_field into an integer.
SELECT reinterpretAsInt64(reverse(unhex(substring(md5_field, 1, 1))));
-- 2) md5 => hex => decimal => modulo: reduce that integer modulo 5.
SELECT reinterpretAsInt64(reverse(unhex(substring(md5_field, 1, 1)))) % 5;
|