You are on page 1 of 14

HIVE HBASE INTEGRATION

Hive Storage Handlers


When STORED BY is specified, then row_format (DELIMITED or SERDE) and STORED AS cannot be specified. Optional SERDEPROPERTIES can be set.

CREATE TABLE hbase_table_1(key int, value string) STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' WITH SERDEPROPERTIES ( "hbase.columns.mapping" = "cf:string", "hbase.table.name" = "hbase_table_0" );
DROP TABLE works as usual, but ALTER TABLE is not yet supported.

JARS
$HIVE_SRC/build/dist/bin/hive --auxpath $HIVE_SRC/build/dist/lib/hive-hbase-handler-0.9.0.jar,$HIVE_SRC/build/dist/lib/hbase-0.92.0.jar,$HIVE_SRC/build/dist/lib/zookeeper-3.3.4.jar,$HIVE_SRC/build/dist/lib/guava-r09.jar -hiveconf hbase.master=hbase.yoyodyne.com:60000 Here's an example which instead targets a distributed HBase cluster where a quorum of 3 zookeepers is used to elect the HBase master: $HIVE_SRC/build/dist/bin/hive --auxpath $HIVE_SRC/build/dist/lib/hive-hbase-handler-0.9.0.jar,$HIVE_SRC/build/dist/lib/hbase-0.92.0.jar,$HIVE_SRC/build/dist/lib/zookeeper-3.3.4.jar,$HIVE_SRC/build/dist/lib/guava-r09.jar -hiveconf hbase.zookeeper.quorum=zk1.yoyodyne.com,zk2.yoyodyne.com,zk3.yoyodyne.com

HIVE-HBASE
Create an HBase table managed by Hive: CREATE TABLE hbase_table_1(key int, value string) STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf1:val") TBLPROPERTIES ("hbase.table.name" = "xyz"); INSERT OVERWRITE TABLE hbase_table_1 SELECT * FROM pokes WHERE foo=98; Inserting large amounts of data may be slow due to WAL overhead. You can disable it: set hive.hbase.wal.enabled=false; Warning: disabling the WAL may lead to data loss if an HBase failure occurs.

HBASE-HIVE
If you want to give Hive access to an existing HBase table

CREATE EXTERNAL TABLE hbase_table_2(key int, value string) STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' WITH SERDEPROPERTIES ("hbase.columns.mapping" = "cf1:val") TBLPROPERTIES("hbase.table.name" = "some_existing_table");

Multiple Columns and Families


CREATE TABLE hbase_table_1(key int, value1 string, value2 int, value3 int) STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' WITH SERDEPROPERTIES ( "hbase.columns.mapping" = ":key,a:b,a:c,d:e" ); INSERT OVERWRITE TABLE hbase_table_1 SELECT foo, bar, foo+1, foo+2 FROM pokes WHERE foo=98 OR foo=100;

Illegal
CREATE TABLE hbase_table_1(key int, value string) STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' WITH SERDEPROPERTIES ( "hbase.columns.mapping" = ":key,cf:" ); FAILED: Error in metadata: java.lang.RuntimeException: MetaException(message:org.apache.hadoop.hive.serde2.SerDeException org.apache.hadoop.hive.hbase.HBaseSerDe: hbase column family 'cf:' should be mapped to map<string,?> but is mapped to string)

Binary columns
CREATE TABLE hbase_table_1 (key int, value string, foobar double) STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' WITH SERDEPROPERTIES ( "hbase.columns.mapping" = ":key#b,cf:val,cf:foo#b" ); Specifying hbase.table.default.storage.type: CREATE TABLE hbase_table_1 (key int, value string, foobar double) STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' WITH SERDEPROPERTIES ( "hbase.columns.mapping" = ":key,cf:val#s,cf:foo", "hbase.table.default.storage.type" = "binary" ); (Binary or string)

Key Uniqueness
A key difference between HBase tables and Hive tables is that HBase tables have a unique key, whereas Hive tables do not. When multiple rows with the same key are inserted into HBase, only one of them is stored (the choice is arbitrary).

This is in contrast to Hive, which is happy to store multiple rows with the same key and different values.

Hive HBase Bulk Load


CREATE TABLE new_hbase_table(rowkey string, x int, y int) STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf:x,cf:y"); SET hive.hbase.bulk=true;

INSERT OVERWRITE TABLE new_hbase_table SELECT rowkey_expression, x, y FROM ...any_hive_query...;

Add necessary JARs


hadoop dfs -put /usr/lib/hive/lib/hbase-VERSION.jar /user/hive/hbase-VERSION.jar hadoop dfs -put /usr/lib/hive/lib/hive-hbase-handler-VERSION.jar /user/hive/hive-hbase-handler-VERSION.jar

Then add them to your hive-site.xml:


<property> <name>hive.aux.jars.path</name> <value>/user/hive/hbase-VERSION.jar,/user/hive/hive-hbase-handler-VERSION.jar</value> </property>

create external table hb_range_keys(transaction_id_range_start string) row format serde 'org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe' stored as inputformat 'org.apache.hadoop.mapred.TextInputFormat' outputformat 'org.apache.hadoop.hive.ql.io.HiveNullValueSequenceFileOutputFormat' location '/tmp/hb_range_keys'; insert overwrite table hb_range_keys select transaction_id from (select transaction_id from transactions tablesample(bucket 1 out of 10000 on transaction_id) s order by transaction_id limit 10000000) x where (row_sequence() % 910000)=0 order by transaction_id limit 11;

Sort Data
set mapred.reduce.tasks=12; set hive.mapred.partitioner=org.apache.hadoop.mapred.lib.TotalOrderPartitioner; set total.order.partitioner.path=/tmp/hb_range_key_list; set hfile.compression=gz; create table hbsort(transaction_id string, user_name string, amount double, ...) stored as INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.hbase.HiveHFileOutputFormat' TBLPROPERTIES ('hfile.family.path' = '/tmp/hbsort/cf'); insert overwrite table hbsort select transaction_id, user_name, amount, ... from transactions cluster by transaction_id;

You might also like