hbase快速入門

hbase 是什麼?

 Apache HBase is an open-source, distributed, versioned, non-relational database modeled after Google's Bigtable: A Distributed Storage System for Structured Data by Chang et al. Just as Bigtable leverages the distributed data storage provided by the Google File System, Apache HBase provides Bigtable-like capabilities on top of Hadoop and HDFS.

hbase的應用場景

Use Apache HBase™ when you need random, realtime read/write access to your Big Data. This project's goal is the hosting of very large tables -- billions of rows X millions of columns -- atop clusters of commodity hardware.

hbase的特性

>>>Linear and modular scalability.
>>>Strictly consistent reads and writes.
>>>Automatic and configurable sharding of tables
>>>Automatic failover support between RegionServers.
>>>Convenient base classes for backing Hadoop MapReduce jobs with Apache HBase tables.
>>>Easy to use Java API for client access.
>>>Block cache and Bloom Filters for real-time queries.
>>>Query predicate push down via server side Filters
>>>Thrift gateway and a REST-ful Web service that supports XML, Protobuf, and binary data encoding options
>>>Extensible jruby-based (JIRB) shell
>>>Support for exporting metrics via the Hadoop metrics subsystem to files or Ganglia; or via JMX

hbase 有哪些操作?

先看一下hbase的基本操作:建立一個表,刪除一個表,增長一條記錄,刪除一條記錄,遍歷一條記錄。

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
 
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
 
public class HBaseTest {

    private static Configuration conf = null;

    /**
     * One-time initialization: build the HBase client configuration
     * (reads hbase-site.xml / hbase-default.xml from the classpath).
     */
    static {
        conf = HBaseConfiguration.create();
    }

    /**
     * Create a table with the given column families; no-op if it already exists.
     * (Method name kept as "creatTable" — sic — so existing callers keep working.)
     *
     * @param tableName name of the table to create
     * @param familys   column family names
     * @throws Exception if the master cannot be reached or creation fails
     */
    public static void creatTable(String tableName, String[] familys)
            throws Exception {
        HBaseAdmin admin = new HBaseAdmin(conf);
        try {
            if (admin.tableExists(tableName)) {
                System.out.println("table already exists!");
            } else {
                HTableDescriptor tableDesc = new HTableDescriptor(tableName);
                for (int i = 0; i < familys.length; i++) {
                    tableDesc.addFamily(new HColumnDescriptor(familys[i]));
                }
                admin.createTable(tableDesc);
                System.out.println("create table " + tableName + " ok.");
            }
        } finally {
            admin.close(); // was leaked in the original
        }
    }

    /**
     * Disable and then delete a table (HBase requires disabling first).
     * Connection problems are logged and swallowed, as in the original
     * best-effort behavior.
     *
     * @param tableName table to delete
     * @throws Exception propagated from disable/delete on other failures
     */
    public static void deleteTable(String tableName) throws Exception {
        HBaseAdmin admin = null;
        try {
            admin = new HBaseAdmin(conf);
            admin.disableTable(tableName);
            admin.deleteTable(tableName);
            System.out.println("delete table " + tableName + " ok.");
        } catch (MasterNotRunningException e) {
            e.printStackTrace();
        } catch (ZooKeeperConnectionException e) {
            e.printStackTrace();
        } finally {
            if (admin != null) {
                admin.close(); // release the admin connection
            }
        }
    }

    /**
     * Put (insert or update) a single cell. IOExceptions are logged and
     * swallowed, preserving the original best-effort behavior.
     *
     * @param tableName target table
     * @param rowKey    row key
     * @param family    column family
     * @param qualifier column qualifier (may be empty)
     * @param value     cell value
     */
    public static void addRecord(String tableName, String rowKey,
            String family, String qualifier, String value) throws Exception {
        HTable table = null;
        try {
            table = new HTable(conf, tableName);
            Put put = new Put(Bytes.toBytes(rowKey));
            put.add(Bytes.toBytes(family), Bytes.toBytes(qualifier),
                    Bytes.toBytes(value));
            table.put(put);
            System.out.println("insert record " + rowKey + " to table "
                    + tableName + " ok.");
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (table != null) {
                table.close(); // was leaked in the original
            }
        }
    }

    /**
     * Delete a whole row by its row key.
     *
     * @param tableName target table
     * @param rowKey    row key to delete
     * @throws IOException on communication failure
     */
    public static void delRecord(String tableName, String rowKey)
            throws IOException {
        HTable table = new HTable(conf, tableName);
        try {
            List<Delete> list = new ArrayList<Delete>();
            // Bytes.toBytes encodes as UTF-8, matching the Put path above;
            // the original used rowKey.getBytes(), which depends on the
            // platform default charset.
            list.add(new Delete(Bytes.toBytes(rowKey)));
            table.delete(list);
            System.out.println("del record " + rowKey + " ok.");
        } finally {
            table.close();
        }
    }

    /**
     * Get a single row and print every cell as "row family:qualifier ts value".
     *
     * @param tableName target table
     * @param rowKey    row key to fetch
     * @throws IOException on communication failure
     */
    public static void getOneRecord(String tableName, String rowKey)
            throws IOException {
        HTable table = new HTable(conf, tableName);
        try {
            Get get = new Get(Bytes.toBytes(rowKey));
            Result rs = table.get(get);
            for (KeyValue kv : rs.raw()) {
                // Bytes.toString decodes UTF-8, matching Bytes.toBytes used
                // on the write path (original used platform-charset String).
                System.out.print(Bytes.toString(kv.getRow()) + " ");
                System.out.print(Bytes.toString(kv.getFamily()) + ":");
                System.out.print(Bytes.toString(kv.getQualifier()) + " ");
                System.out.print(kv.getTimestamp() + " ");
                System.out.println(Bytes.toString(kv.getValue()));
            }
        } finally {
            table.close();
        }
    }

    /**
     * Scan the whole table and print every cell. IOExceptions are logged and
     * swallowed, preserving the original best-effort behavior.
     *
     * @param tableName table to scan
     */
    public static void getAllRecord(String tableName) {
        HTable table = null;
        ResultScanner ss = null;
        try {
            table = new HTable(conf, tableName);
            Scan s = new Scan();
            ss = table.getScanner(s);
            for (Result r : ss) {
                for (KeyValue kv : r.raw()) {
                    System.out.print(Bytes.toString(kv.getRow()) + " ");
                    System.out.print(Bytes.toString(kv.getFamily()) + ":");
                    System.out.print(Bytes.toString(kv.getQualifier()) + " ");
                    System.out.print(kv.getTimestamp() + " ");
                    System.out.println(Bytes.toString(kv.getValue()));
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (ss != null) {
                ss.close(); // scanners hold server-side resources; was leaked
            }
            if (table != null) {
                try {
                    table.close();
                } catch (IOException ignored) {
                    // best-effort cleanup; nothing useful to do here
                }
            }
        }
    }

    /**
     * Demo driver: create a "scores" table, insert rows, read one row,
     * scan all rows, delete a row, scan again.
     */
    public static void main(String[] agrs) {
        try {
            String tablename = "scores";
            String[] familys = { "grade", "course" };
            HBaseTest.creatTable(tablename, familys);

            // add record zkb
            HBaseTest.addRecord(tablename, "zkb", "grade", "", "5");
            HBaseTest.addRecord(tablename, "zkb", "course", "", "90");
            HBaseTest.addRecord(tablename, "zkb", "course", "math", "97");
            HBaseTest.addRecord(tablename, "zkb", "course", "art", "87");
            // add record baoniu
            HBaseTest.addRecord(tablename, "baoniu", "grade", "", "4");
            HBaseTest.addRecord(tablename, "baoniu", "course", "math", "89");

            System.out.println("===========get one record========");
            HBaseTest.getOneRecord(tablename, "zkb");

            System.out.println("===========show all record========");
            HBaseTest.getAllRecord(tablename);

            System.out.println("===========del one record========");
            HBaseTest.delRecord(tablename, "baoniu");
            HBaseTest.getAllRecord(tablename);

            System.out.println("===========show all record========");
            HBaseTest.getAllRecord(tablename);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

基本概念

HTable
核心概念,實現了Table,用來和hbase的一個單表進行通訊。輕量級的,提供獲取和關閉方法。
這個類不能經過構造函數直接構建出來。能夠經過Connection獲取該類的一個實例。參考ConnectionFactory類生成實例:
 // HBase 1.0+ idiom: obtain a lightweight Table from a Connection instead of
 // constructing HTable directly; close the Table first, then the Connection.
 Connection connection = ConnectionFactory.createConnection(config);
 Table table = connection.getTable(TableName.valueOf("table1"));
 try {
   // Use the table as needed, for a single operation and a single thread
 } finally {
    table.close();
    connection.close();
  }

HTable的字段有哪些呢?

// Field listing of HTable, excerpted from the HBase client source: the state a
// table handle carries (connection, config, buffered mutator, RPC helpers).
public class HTable implements HTableInterface, RegionLocator {
  private static final Log LOG = LogFactory.getLog(HTable.class);
  protected ClusterConnection connection;
  private final TableName tableName;
  private volatile Configuration configuration;
  private TableConfiguration tableConfiguration;
  protected BufferedMutatorImpl mutator;
  private boolean autoFlush = true;
  private boolean closed = false;
  protected int scannerCaching;
  private ExecutorService pool;  // For Multi & Scan
  private int operationTimeout;
  private final boolean cleanupPoolOnClose; // shutdown the pool in close()
  private final boolean cleanupConnectionOnClose; // close the connection in close()
  private Consistency defaultConsistency = Consistency.STRONG;

  /** The Async process for batch */
  protected AsyncProcess multiAp;
  private RpcRetryingCallerFactory rpcCallerFactory;
  private RpcControllerFactory rpcControllerFactory;
}

HTableDescriptor包含了HBase表的詳細信息,例如全部列家族的描述,該表是否一個分類表,ROOT或者hbase:meta,該表是否只讀,當region分片時memstore的最大值,關聯的coprocessor等等。

HTable繼承並實現了Table,Table用來和一個hbase單表進行通訊,從表中獲取,插入,刪除或者掃描數據。使用Connection來獲取Table實例,使用完畢後調用close()方法。

HTable也繼承實現了RegionLocator,RegionLocator用來定位一張Hbase單表的區域位置信息,能夠經過Connection獲取該類的實例,RegionLocator的getRegionLocation方法返回HRegionLocation。

HRegionLocation

記錄HRegionInfo和HRegionServer的主機地址的數據結構。

構造函數:

  // HRegionLocation constructors (excerpt): the two-argument form delegates to
  // the full form with HConstants.NO_SEQNUM as the sequence number.
  public HRegionLocation(HRegionInfo regionInfo, ServerName serverName) {
    this(regionInfo, serverName, HConstants.NO_SEQNUM);
  }

  public HRegionLocation(HRegionInfo regionInfo, ServerName serverName, long seqNum) {
    this.regionInfo = regionInfo;
    this.serverName = serverName;
    this.seqNum = seqNum;
  }
HRegionInfo

  一個區域的信息。區域是在一張表的整個鍵空間中一系列的鍵,一個標識(時間戳)區分不一樣子序列(在區間分隔以後),一個複製ID區分同一序列和同一區域狀態信息的不一樣實例。

  一個區域有一個位於的名稱,名稱由下列的字段組成:

    表名(tableName):表的名稱。

    開始鍵(startKey):一個區間的開始鍵。

    區域ID(regionId):建立區域的時間戳。

    複製ID(replicaId):一個區分同一區域序列的Id,從0開始,保存到不一樣的服務器,同一個區域的序列能夠保存在多個位置中。

    加密後的名稱(encodedName):md5加密後的區域名稱。

除了區域名稱外,區域信息還包含:
    結束鍵(endkey):區域的結束鍵(獨有的)
    分片(split):區域是否分片
    離線(offline):區域是否離線
在0.98版本或者以前,一組表的區域會徹底包含全部的鍵空間,在任何時間點,一個行鍵一般屬於一個單獨的區域,該單獨區域又屬於一個單獨的服務器。在0.99+版本,一個區域能夠有多個實例(叫作備份),所以一行能夠對應多個HRegionInfo。這些HRI除了備份Id字段外均可以共用字段。若備份Id未設置,默認爲0。
   // Excerpt from HRegionInfo: region-name format documentation and the
   // fields that back a region descriptor.
   /**
   * The new format for a region name contains its encodedName at the end.
   * The encoded name also serves as the directory name for the region
   * in the filesystem.
   *
   * New region name format:
   *    &lt;tablename>,,&lt;startkey>,&lt;regionIdTimestamp>.&lt;encodedName>.
   * where,
   *    &lt;encodedName> is a hex version of the MD5 hash of
   *    &lt;tablename>,&lt;startkey>,&lt;regionIdTimestamp>
   *
   * The old region name format:
   *    &lt;tablename>,&lt;startkey>,&lt;regionIdTimestamp>
   * For region names in the old format, the encoded name is a 32-bit
   * JenkinsHash integer value (in its decimal notation, string form).
   *<p>
   * **NOTE**
   *
   * The first hbase:meta region, and regions created by an older
   * version of HBase (0.20 or prior) will continue to use the
   * old region name format.
   */

  /** Separator used to demarcate the encodedName in a region name
   * in the new format. See description on new format above.
   */
  private static final int ENC_SEPARATOR = '.';
  public  static final int MD5_HEX_LENGTH   = 32; // hex MD5 length in the new name format
  /** A non-capture group so that this can be embedded. */
  public static final String ENCODED_REGION_NAME_REGEX = "(?:[a-f0-9]+)";

  // to keep appended int's sorted in string format. Only allows 2 bytes to be
  // sorted for replicaId
  public static final String REPLICA_ID_FORMAT = "%04X";

  public static final byte REPLICA_ID_DELIMITER = (byte)'_';

  private static final int MAX_REPLICA_ID = 0xFFFF;
  static final int DEFAULT_REPLICA_ID = 0;

private byte [] endKey = HConstants.EMPTY_BYTE_ARRAY;
  // This flag is in the parent of a split while the parent is still referenced
  // by daughter regions.  We USED to set this flag when we disabled a table
  // but now table state is kept up in zookeeper as of 0.90.0 HBase.
  private boolean offLine = false;
  private long regionId = -1; // the regionIdTimestamp component of the name
  private transient byte [] regionName = HConstants.EMPTY_BYTE_ARRAY;
  private boolean split = false;
  private byte [] startKey = HConstants.EMPTY_BYTE_ARRAY;
  private int hashCode = -1;
  //TODO: Move NO_HASH to HStoreFile which is really the only place it is used.
  public static final String NO_HASH = null;
  private String encodedName = null;
  private byte [] encodedNameAsBytes = null;
  private int replicaId = DEFAULT_REPLICA_ID;

  // Current TableName
  private TableName tableName = null;

  /** HRegionInfo for first meta region */
  public static final HRegionInfo FIRST_META_REGIONINFO =
      new HRegionInfo(1L, TableName.META_TABLE_NAME);

先了解一下HBase的數據結構:

組成部件說明:

Row Key:     Table主鍵 行鍵 Table中記錄按照Row Key排序
Timestamp:     每次對數據操做對應的時間戳,也即數據的version number
Column Family:  列簇,一個table在水平方向有一個或者多個列簇,列簇可由任意多個Column組成,列簇支持動態擴展,無須預約義數量及類型,二進制存儲,用戶需自行進行類型轉換。

行操做


Get用來對一個單獨的行進行Get操做:
獲取一個row的全部信息前,須要實例化一個Get對象。
獲取特定家族的全部列,使用addFamily(byte[])。
獲取特定列,使用addColumn(byte[], byte[])
獲取在特定的時間戳內的一系列列,使用setTimeRange(long, long)
獲取在特定時間戳的列,使用setTimeStamp(long)
限制返回的列數,使用setMaxVersions(int)
增長過濾器,使用setFilter(Filter)。

Put用來對一個單獨的列進行Put操做:
使用Put前需初始化Put對象,來插入一行使用add(byte[], byte[], byte[]),若設定時間戳則使用add(byte[], byte[], long, byte[])。

Append 操做:
對一行增長多個列,使用add(byte[], byte[], byte[]);


 參考文獻:

【1】http://hbase.apache.org/

【2】https://autofei.wordpress.com/2012/04/02/java-example-code-using-hbase-data-model-operations/

【3】http://www.cnblogs.com/shitouer/archive/2012/06/04/2533518.html

相關文章
相關標籤/搜索