Etiquetas

, ,

  • Crear proyecto cliente JDBC Hive

mvn archetype:generate -DgroupId=test -DartifactId=hiveclientjdbc

  • Crear el pom.xml a utilizar con las dependencias necesarias

pom.xml

HiveClientJdbc

  • Cargar ficheros de datos en HDFS. Los ficheros que se utilizan se encuentran en el raíz de la distribución de Pivotal (/retail_demo)

./load_data_to_HDFS.sh

load_data_to_HDFS

  • Crear las tablas de Hive

hive -f create_hive_tables.sql

  • Cargar las tablas de Hive con los datos de los ficheros

hive -f loadDataHiveTables.sql

loadDataHieTables

  • Ejecutar el proyecto maven cliente

mvn compile exec:java -Dexec.mainClass=”test.HiveClientJdbc”

Ficheros utilizados:

pom.xml

  <modelVersion>4.0.0</modelVersion>
  <groupId>test</groupId>
  <artifactId>hiveclientjdbc</artifactId>
  <version>1.0-SNAPSHOT</version>
  <packaging>jar</packaging>
  <name>hiveclientjdbc</name>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
    <repositories>
        <repository>
            <name>Cloudera repository</name>
            <id>cloudera-releases</id>
        </repository>
    </repositories>
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>0.7.1-cdh3u5</version>
            <type>jar</type>
            <exclusions>
                <exclusion>
                    <groupId>log4j</groupId>
                    <artifactId>log4j</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop.hive</groupId>
            <artifactId>hive-metastore</artifactId>
            <version>0.7.1-cdh3u5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop.hive</groupId>
            <artifactId>hive-service</artifactId>
            <version>0.7.1-cdh3u5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>0.7.1-cdh3u5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-core</artifactId>
            <version>0.20.2</version>
        </dependency>
  </dependencies>
</project>

——————-

HiveClientJdbc.java

package test;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
public class HiveClientJdbc {
    private static final String HIVE_DRIVER = “org.apache.hadoop.hive.jdbc.HiveDriver”;
    private static final String CONNECTION_URL = “jdbc:hive://localhost:10000/default”;
    public static void main(String[] args) throws SQLException, ClassNotFoundException {
        Class.forName(HIVE_DRIVER);
        Connection con = DriverManager.getConnection(CONNECTION_URL, “”, “”);
        Statement stmt = con.createStatement();
        ResultSet rslt = stmt.executeQuery(“select count(*) from retail_demo.email_addresses_dim_hive”);
        if (rslt.next()) {
            System.out.println(rslt.getInt(1));
        }
        con.close();
    }
}

——————-

load_data_to_HDFS.sh

#!/bin/bash
base_dir=”/retail_demo”
# Clean up any previous load
echo “hadoop fs -rm -r -skipTrash $base_dir”
hadoop fs -rm -r -skipTrash $base_dir
echo “hadoop fs -mkdir $base_dir”
hadoop fs -mkdir $base_dir
for file in *.tsv.gz
do
  dir=`echo $file | perl -ne ‘s/^(.+?)\..+$/$1/;print;’`
  echo “hadoop fs -mkdir $base_dir/$dir”
  hadoop fs -mkdir $base_dir/$dir
  echo “hadoop fs -put $file $base_dir/$dir/”
  hadoop fs -put $file $base_dir/$dir/
done
——————-

create_hive_tables.sql

CREATE DATABASE IF NOT EXISTS retail_demo;
CREATE TABLE retail_demo.order_lineitems_hive
(
  Order_ID                      string
, Order_Item_ID                 bigint
, Product_ID                    int
, Product_Name                  string
, Customer_ID                   int
, Store_ID                      int
, Item_Shipment_Status_Code     string
, Order_Datetime                timestamp
, Ship_Datetime                 timestamp
, Item_Return_Datetime          timestamp
, Item_Refund_Datetime          timestamp
, Product_Category_ID           int
, Product_Category_Name         string
, Payment_Method_Code           string
, Tax_Amount                    double
, Item_Quantity                 int
, Item_Price                    double
, Discount_Amount               double
, Coupon_Code                   string
, Coupon_Amount                 double
, Ship_Address_Line1            string
, Ship_Address_Line2            string
, Ship_Address_Line3            string
, Ship_Address_City             string
, Ship_Address_State            string
, Ship_Address_Postal_Code      string
, Ship_Address_Country          string
, Ship_Phone_Number             string
, Ship_Customer_Name            string
, Ship_Customer_Email_Address   string
, Ordering_Session_ID           string
, Website_URL                   string
)
  ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘\t’
  STORED AS TEXTFILE
  LOCATION ‘/retail_demo/order_lineitems/’;
CREATE TABLE retail_demo.orders_hive
(
  Order_ID                      string
, Customer_ID                   int
, Store_ID                      int
— , Order_Datetime_Orig           timestamp
, Order_Datetime                timestamp
, Ship_Completion_Datetime      timestamp
, Return_Datetime               timestamp
, Refund_Datetime               timestamp
, Payment_Method_Code           string
, Total_Tax_Amount              double
, Total_Paid_Amount             double
, Total_Item_Quantity           int
, Total_Discount_Amount         double
, Coupon_Code                   string
, Coupon_Amount                 double
, Order_Canceled_Flag           string
, Has_Returned_Items_Flag       string
, Has_Refunded_Items_Flag       string
, Fraud_Code                    string
, Fraud_Resolution_Code         string
, Billing_Address_Line1         string
, Billing_Address_Line2         string
, Billing_Address_Line3         string
, Billing_Address_City          string
, Billing_Address_State         string
, Billing_Address_Postal_Code   string
, Billing_Address_Country       string
, Billing_Phone_Number          string
, Customer_Name                 string
, Customer_Email_Address        string
, Ordering_Session_ID           string
, Website_URL                   string
)
  ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘\t’
  STORED AS TEXTFILE
  LOCATION ‘/retail_demo/orders/’;
CREATE TABLE retail_demo.products_dim_hive
(
  Product_ID      int,
  Category_ID     smallint,
  Price           double,
  Product_Name    string
)
  ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘\t’
  STORED AS TEXTFILE
  LOCATION ‘/retail_demo/products_dim/’;
CREATE TABLE retail_demo.categories_dim_hive
(
  Category_ID    bigint,
  Category_Name  string
)
  ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘\t’
  STORED AS TEXTFILE
  LOCATION ‘/retail_demo/categories_dim/’;
CREATE TABLE retail_demo.email_addresses_dim_hive
(
  Customer_ID     int,
  Email_Address   string
)
  ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘\t’
  STORED AS TEXTFILE
  LOCATION ‘/retail_demo/email_addresses_dim/’;
CREATE TABLE retail_demo.date_dim_hive
(
  calendar_day      timestamp,
  reporting_year    smallint,
  reporting_quarter smallint,
  reporting_month   smallint,
  reporting_week    smallint,
  reporting_dow     smallint
)
  ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘\t’
  STORED AS TEXTFILE
  LOCATION ‘/retail_demo/date_dim/’;
CREATE TABLE retail_demo.customers_dim_hive
(
  Customer_ID    bigint,
  First_Name     string,
  Last_Name      string,
  Gender         string
)
  ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘\t’
  STORED AS TEXTFILE
  LOCATION ‘/retail_demo/customers_dim/’;
CREATE TABLE retail_demo.payment_methods_hive
(
  Payment_Method_ID    SMALLINT,
  Payment_Method_Code  string
)
  ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘\t’
  STORED AS TEXTFILE
  LOCATION ‘/retail_demo/payment_methods/’;
CREATE TABLE retail_demo.customer_addresses_dim_hive
(
  Customer_Address_ID  bigint,
  Customer_ID          bigint,
  Valid_From_Timestamp timestamp,
  Valid_To_Timestamp   timestamp,
  House_Number         string,
  Street_Name          string,
  Appt_Suite_No        string,
  City                 string,
  State_Code           string,
  Zip_Code             string,
  Zip_Plus_Four        string,
  Country              string,
  Phone_Number         string
)
  ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘\t’
  STORED AS TEXTFILE
  LOCATION ‘/retail_demo/customer_addresses_dim/’;

——————-

loadDataHiveTables.sql

LOAD DATA LOCAL INPATH ‘/retail_demo/orders/orders.tsv.gz’ OVERWRITE INTO TABLE retail_demo.orders_hive;
LOAD DATA LOCAL INPATH ‘/retail_demo/products_dim/products_dim.tsv.gz’ OVERWRITE INTO TABLE retail_demo.products_dim_hive;
LOAD DATA LOCAL INPATH ‘/retail_demo/categories_dim/categories_dim.tsv.gz’ OVERWRITE INTO TABLE retail_demo.categories_dim_hive;
LOAD DATA LOCAL INPATH ‘/retail_demo/email_addresses_dim/email_addresses_dim.tsv.gz’ OVERWRITE INTO TABLE retail_demo.email_addresses_dim_hive;
LOAD DATA LOCAL INPATH ‘/retail_demo/date_dim/date_dim.tsv.gz’ OVERWRITE INTO TABLE retail_demo.date_dim_hive;
LOAD DATA LOCAL INPATH ‘/retail_demo/customers_dim/customers_dim.tsv.gz’ OVERWRITE INTO TABLE retail_demo.customers_dim_hive;
LOAD DATA LOCAL INPATH ‘/retail_demo/payment_methods/payment_methods.tsv.gz’ OVERWRITE INTO TABLE retail_demo.payment_methods_hive;
LOAD DATA LOCAL INPATH ‘/retail_demo/customer_addresses_dim/customer_addresses_dim.tsv.gz’ OVERWRITE INTO TABLE retail_demo.customer_addresses_dim_hive;

Anuncios