若在Hive中執行INSERT OVERWRITE重寫同一個表的數據時,有可能會造成數據丟失。
如:INSERT OVERWRITE TABLE table_name SELECT * FROM table_name;
1、新建一張分區表
-- Demo table: three string data columns, partitioned by the string column ds,
-- stored as plain text files.
create table test_chj_cols (
    id   string,
    name string,
    age  string
)
partitioned by (ds string)
stored as textfile;
2、插入一條記錄
-- Seed a single row (id, name, age) into the ds='20181224' partition.
insert into test_chj_cols
    partition (ds = '20181224')
values
    ('1', 'chj', '18');
3、確認表數據及結構
> select * from test_chj_cols; OK test_chj_cols.id test_chj_cols.name test_chj_cols.age test_chj_cols.ds 1 chj 18 20181224 > desc formatted test_chj_cols partition (ds='20181224'); OK col_name data_type comment # col_name data_type comment id string name string age string # Partition Information # col_name data_type comment ds string # Detailed Partition Information Partition Value: [20181224] Database: hduser05db Table: test_chj_cols CreateTime: Mon Dec 24 19:35:28 CST 2018 LastAccessTime: UNKNOWN Protect Mode: None Location: hdfs://bdphdp02/user/hive/warehouse/hduser05/hduser05db.db/test_chj_cols/ds=20181224 Partition Parameters: COLUMN_STATS_ACCURATE true numFiles 1 numRows 1 rawDataSize 8 totalSize 17 transient_lastDdlTime 1545651329 # Storage Information SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe InputFormat: org.apache.hadoop.mapred.TextInputFormat OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat Compressed: No Num Buckets: -1 Bucket Columns: [] Sort Columns: [] Storage Desc Params: serialization.format 1 Time taken: 0.099 seconds, Fetched: 37 row(s)
4、在表中間新增字段
-- Replace the table's column list so that a new `money` column sits between `name` and `age`;
-- the text after the statement on this line is the pasted output of `desc formatted test_chj_cols`.
alter table test_chj_cols replace columns (id string, name string, money string, age string); > desc formatted test_chj_cols; OK col_name data_type comment # col_name data_type comment id string name string money string age string # Partition Information # col_name data_type comment ds string # Detailed Table Information Database: hduser05db Owner: hadoop CreateTime: Mon Dec 24 19:34:46 CST 2018 LastAccessTime: UNKNOWN Protect Mode: None Retention: 0 Location: hdfs://bdphdp02/user/hive/warehouse/hduser05/hduser05db.db/test_chj_cols Table Type: MANAGED_TABLE Table Parameters: last_modified_by hadoop last_modified_time 1545651722 transient_lastDdlTime 1545651722 # Storage Information SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe InputFormat: org.apache.hadoop.mapred.TextInputFormat OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat Compressed: No Num Buckets: -1 Bucket Columns: [] Sort Columns: [] Storage Desc Params: serialization.format 1 Time taken: 0.099 seconds, Fetched: 36 row(s)
5、重寫數據
-- Overwrite the ds='20181224' partition with rows read back from the same table
-- (the self-overwrite scenario this walkthrough reproduces).
-- The select list is kept exactly as in the demonstration: id, name, age, name.
insert overwrite table test_chj_cols partition (ds='20181224')
select id, name, age, name
from test_chj_cols;
6、age字段數據丟失
> select * from test_chj_cols; OK test_chj_cols.id test_chj_cols.name test_chj_cols.age test_chj_cols.money test_chj_cols.ds 1 chj NULL NULL 20181224