GreenPlum 大數據平臺--外部表(三)

一,外部表介紹

  Greenplum 在數據加載上有一個明顯的優點,就是支持數據的併發加載,gpfdist是併發加載的工具,數據庫中對應的就是外部表。

  所謂外部表,就是在數據庫中只有表定義、沒有數據,數據都存放在數據庫以外的數據文件。Greenplum可以對一個外部表執行正常的DML操作,當讀取數據的時候,數據庫從數據文件中加載數據。外部表支持在segment上併發地高速從gpfdist導入數據,由於是從segment上導入數據,所以效率很高。

  結構圖:

  外部表需要指定gpfdist的IP和端口,還要有詳細的目錄地址,文件名支持通配符匹配。可以編寫多個gpfdist地址,但是總數不能超過總的segment數量,否則會報錯。

  GPDB提供兩種外部表:可讀外部表用於數據裝載、可寫外部表用於數據卸載。外部表可基於文件、亦可基於WEB,這兩種都能實現可讀、可寫。

當一個查詢使用一個常規的外部表,該外部表被認爲是可重讀的,由於在該查詢期間數據是靜態的。而對於WEB外部表,數據是不可重讀的,由於在該查詢的執行期間數據可能會發生變化。

    可寫外部表用以從數據庫表中選擇記錄並輸出到文件、命名管道或其他可執行程序。 比如,可以從GPDB中卸載數據並發送到一個可執行程序,該程序連接到其他數據庫或者ETL工具並裝載數據到其他地方。 可寫外部表還可以用於輸出到GPDB的並行MapReduce計算。

    可寫外部表被定義後,數據即可從數據庫表中被選擇並插入到該可寫外部表。 可寫外部表只允許INSERT操作 – SELECT、 UPDATE、 DELETE或TRUNCATE是不允許的。可寫外部表輸出數據到一個可執行程序,該程序要能夠接受流輸入數據。

    在創建外部表的時候,可以指定分隔符、err表、指定允許出錯的數據條數,以及源文件的編碼等信息。

二,外部表語法

CREATE [READABLE] EXTERNAL TABLE table_name    
    ( column_name data_type [, ...] | LIKE other_table )
      LOCATION ('file://seghost[:port]/path/file' [, ...])
        | ('gpfdist://filehost[:port]/file_pattern[#transform]'
        | ('gpfdists://filehost[:port]/file_pattern[#transform]'
            [, ...])
        | ('gphdfs://hdfs_host[:port]/path/file')
      FORMAT 'TEXT'
            [( [HEADER]
               [DELIMITER [AS] 'delimiter' | 'OFF']
               [NULL [AS] 'null string']
               [ESCAPE [AS] 'escape' | 'OFF']
               [NEWLINE [ AS ] 'LF' | 'CR' | 'CRLF']
               [FILL MISSING FIELDS] )]
           | 'CSV'
            [( [HEADER]
               [QUOTE [AS] 'quote']
               [DELIMITER [AS] 'delimiter']
               [NULL [AS] 'null string']
               [FORCE NOT NULL column [, ...]]
               [ESCAPE [AS] 'escape']
               [NEWLINE [ AS ] 'LF' | 'CR' | 'CRLF']
               [FILL MISSING FIELDS] )]
           | 'AVRO'
           | 'PARQUET'
 
           | 'CUSTOM' (Formatter=<formatter specifications>)
     [ ENCODING 'encoding' ]
     [ [LOG ERRORS [INTO error_table]] SEGMENT REJECT LIMIT count
       [ROWS | PERCENT] ]
 
CREATE [READABLE] EXTERNAL WEB TABLE table_name    
   ( column_name data_type [, ...] | LIKE other_table )
      LOCATION ('http://webhost[:port]/path/file' [, ...])
    | EXECUTE 'command' [ON ALL
                          | MASTER
                          | number_of_segments
                          | HOST ['segment_hostname']
                          | SEGMENT segment_id ]
      FORMAT 'TEXT'
            [( [HEADER]
               [DELIMITER [AS] 'delimiter' | 'OFF']
               [NULL [AS] 'null string']
               [ESCAPE [AS] 'escape' | 'OFF']
               [NEWLINE [ AS ] 'LF' | 'CR' | 'CRLF']
               [FILL MISSING FIELDS] )]
           | 'CSV'
            [( [HEADER]
               [QUOTE [AS] 'quote']
               [DELIMITER [AS] 'delimiter']
               [NULL [AS] 'null string']
               [FORCE NOT NULL column [, ...]]
               [ESCAPE [AS] 'escape']
               [NEWLINE [ AS ] 'LF' | 'CR' | 'CRLF']
               [FILL MISSING FIELDS] )]
           | 'CUSTOM' (Formatter=<formatter specifications>)
     [ ENCODING 'encoding' ]
     [ [LOG ERRORS [INTO error_table]] SEGMENT REJECT LIMIT count
       [ROWS | PERCENT] ]
 
CREATE WRITABLE EXTERNAL TABLE table_name
    ( column_name data_type [, ...] | LIKE other_table )
     LOCATION('gpfdist://outputhost[:port]/filename[#transform]'
      | ('gpfdists://outputhost[:port]/file_pattern[#transform]'
          [, ...])
      | ('gphdfs://hdfs_host[:port]/path')
      FORMAT 'TEXT'
               [( [DELIMITER [AS] 'delimiter']
               [NULL [AS] 'null string']
               [ESCAPE [AS] 'escape' | 'OFF'] )]
          | 'CSV'
               [([QUOTE [AS] 'quote']
               [DELIMITER [AS] 'delimiter']
               [NULL [AS] 'null string']
               [FORCE QUOTE column [, ...]] ]
               [ESCAPE [AS] 'escape'] )]
           | 'AVRO'
           | 'PARQUET'
 
           | 'CUSTOM' (Formatter=<formatter specifications>)
    [ ENCODING 'write_encoding' ]
    [ DISTRIBUTED BY (column, [ ... ] ) | DISTRIBUTED RANDOMLY ]
 
CREATE WRITABLE EXTERNAL WEB TABLE table_name
    ( column_name data_type [, ...] | LIKE other_table )
    EXECUTE 'command' [ON ALL]
    FORMAT 'TEXT'
               [( [DELIMITER [AS] 'delimiter']
               [NULL [AS] 'null string']
               [ESCAPE [AS] 'escape' | 'OFF'] )]
          | 'CSV'
               [([QUOTE [AS] 'quote']
               [DELIMITER [AS] 'delimiter']
               [NULL [AS] 'null string']
               [FORCE QUOTE column [, ...]] ]
               [ESCAPE [AS] 'escape'] )]
           | 'CUSTOM' (Formatter=<formatter specifications>)
    [ ENCODING 'write_encoding' ]
    [ DISTRIBUTED BY (column, [ ... ] ) | DISTRIBUTED RANDOMLY ]

三,建立外部表

  01,語法

gpfdist [-d directory] [-p http_port] [-l log_file] [-t timeout]
[-S] [-w time] [-v | -V] [-s] [-m max_length] [--ssl certificate_path]
gpfdist -? | --help
gpfdist --version

  02,啓動進程

--創建gpfdist進程
[gpadmin@greenplum02 ~]$ mkdir script
[gpadmin@greenplum02 ~]$ nohup gpfdist -d /home/gpadmin/script/ -p 8081 -l /home/gpadmin/script/gpfdist.log &
[1] 6904
[gpadmin@greenplum02 ~]$ nohup: ignoring input and appending output to ‘nohup.out’
[gpadmin@greenplum02 ~]$ ss -lntup|grep 8081
tcp    LISTEN     0      128      :::8081                 :::*                   users:(("gpfdist",pid=6904,fd=6))
---配置讀取文件
[gpadmin@greenplum02 script]$ cat test.txt
Prague,Jan,101,4875.33
Rome,Mar,87,1557.39
Bangalore,May,317,8936.99
Beijing,Jul,411,11600.67
San Francisco,Sept,156,6846.34
Paris,Nov,159,7134.56
San Francisco,Jan,113,5397.89
Prague,Dec,333,9894.77
Bangalore,Jul,271,8320.55
Beijing,Dec,100,4248.41

q
[gpadmin@greenplum02 script]$ pwd
/home/gpadmin/script
--test.txt最後兩行(空行和q)是故意加入的錯誤數據,用來演示錯誤行的處理

  03,建立外部表

-- Readable external table over test.txt, served by the gpfdist process
-- listening on 192.168.0.222:8081. No data is stored in the database;
-- rows are read from the file each time the table is queried.
create external table public.test
(
    country varchar(128),
    name    varchar(128),
    id      int,
    sale    varchar(128)
)
location ('gpfdist://192.168.0.222:8081/test.txt')
format 'text'
(delimiter ',' null as '' escape 'off')
encoding 'utf8'
-- The clause keyword is LOG ERRORS (plural); "log error" is rejected by the parser.
-- Malformed rows are skipped and logged; the query fails once a segment
-- reaches 3 rejected rows.
log errors segment reject limit 3 rows;

--- location   文件所在位置,可以直接是本地路徑、gpfdist地址、gpfdists地址、gphdfs地址。
--- format     文本類型
--- delimiter  分隔符
--- encoding    編碼
--- log errors into  錯誤數據表,記錄錯誤數據,會自動創建。通常都是tablename_err格式,例如t1_err。
--- segment reject limit  錯誤數據的條數/百分比(rows/percent),超過設置值會報錯。最小值是2。用來確保數據的完整性。
結果:
postgres=# create external table public.test99(country varchar(128),name varchar(128),id int,sale varchar(128))location ('gpfdist://192.168.0.222:8081/test.txt')format 'text'(delimiter ',' null as '' escape 'off')encoding 'utf8'log errors segment reject limit 3 rows;
CREATE EXTERNAL TABLE
postgres=# SELECT * from public.test99
postgres-# ;
NOTICE:  Found 2 data formatting errors (2 or more input rows). Rejected related input data.
    country    | name | id  |   sale
---------------+------+-----+----------
 Prague        | Jan  | 101 | 4875.33
 Rome          | Mar  |  87 | 1557.39
 Bangalore     | May  | 317 | 8936.99
 Beijing       | Jul  | 411 | 11600.67
 San Francisco | Sept | 156 | 6846.34
 Paris         | Nov  | 159 | 7134.56
 San Francisco | Jan  | 113 | 5397.89
 Prague        | Dec  | 333 | 9894.77
 Bangalore     | Jul  | 271 | 8320.55
 Beijing       | Dec  | 100 | 4248.41
(10 rows)

postgres=# SELECT * from test99;
NOTICE:  Found 2 data formatting errors (2 or more input rows). Rejected related input data.
    country    | name | id  |   sale
---------------+------+-----+----------
 Prague        | Jan  | 101 | 4875.33
 Rome          | Mar  |  87 | 1557.39
 Bangalore     | May  | 317 | 8936.99
 Beijing       | Jul  | 411 | 11600.67
 San Francisco | Sept | 156 | 6846.34
 Paris         | Nov  | 159 | 7134.56
 San Francisco | Jan  | 113 | 5397.89
 Prague        | Dec  | 333 | 9894.77
 Bangalore     | Jul  | 271 | 8320.55
 Beijing       | Dec  | 100 | 4248.41
(10 rows)

  04,數據裝載

-- Load the rows exposed by an external table into a regular (internal) table.
-- target_table and target_table_ext are placeholders for real table names;
-- the bare word "table" is reserved and cannot be used as an identifier here.
insert into target_table select * from target_table_ext;

內部表<----外部表
相關文章
相關標籤/搜索