Trino + Hive Metastore + K8s + MinIO environment preparation

minio+presto

Build the Hive Docker image

 1FROM openjdk:8u302
 2
 3WORKDIR /opt/hive
 4ENV HADOOP_HOME=/opt/hadoop-3.2.3
 5
 6ADD ./hadoop-3.2.3 /opt/hadoop-3.2.3
 7ADD ./apache-hive-3.1.3-bin /opt/hive
 8ADD ./hadoop-3.2.3/share/hadoop/common/lib/guava-27.0-jre.jar /opt/hive/lib/
 9ADD ./hadoop-3.2.3/share/hadoop/tools/lib/hadoop-aws-3.2.3.jar /opt/hive/lib/
10ADD ./hadoop-3.2.3/share/hadoop/tools/lib/aws-java-sdk-bundle-1.11.901.jar /opt/hive/lib/
11
12RUN echo 'export HIVE_HOME=/opt/hive \n export HIVE_CONF_DIR=/opt/hive/conf \n export PATH=$PATH:$HIVE_HOME/bin' >> ~/.bashrc
13# RUN curl https://jdbc.postgresql.org/download/postgresql-42.7.1.jar --output /opt/hive/lib/postgresql-42.7.1.jar
14RUN rm -rf /opt/hive/lib/guava-19.0.jar

Define the docker-compose.yaml

 1version: '3.7'
 2services:
 3  minio:
 4    image: 'minio/minio:latest'
 5    hostname: minio
 6    container_name: minio
 7    ports:
 8      - '9000:9000'
 9      - '9001:9001'
10    volumes:
11      - ./minio-data:/data
12    environment:
13      MINIO_ACCESS_KEY: xxxx
14      MINIO_SECRET_KEY: xxxx
15    command: server --console-address ":9001" /data
16    networks:
17      - trino-network
18
19  postgres:
20    image: 'postgres:11'
21    hostname: postgres
22    expose:
23      - '5432'
24    ports:
25      - '5431:5432'
26    volumes:
27      - ./hive-schema-3.1.0.sql:/docker-entrypoint-initdb.d/hive-schema-3.1.0.sql
28    environment:
29      POSTGRES_USER: xxxx
30      POSTGRES_PASSWORD: xxxx
31      POSTGRES_DB: hive_db
32    networks:
33      - trino-network 
34
35  hive:
36    image: 'hivems:3.1.2'
37    hostname: hive
38    ports:
39      - '9083:9083' # Metastore Thrift
40    command: ["/opt/hive/bin/hive", "--service", "metastore"]
41    volumes:
42      - ./hive-site.xml:/opt/hive/conf/hive-site.xml:ro
43    environment:
44      METASTORE_DB_HOSTNAME: postgres
45      METASTORE_TYPE: postgres
46    depends_on:
47      - postgres
48    networks:
49      - trino-network       
50
51networks:
52  trino-network:
53    driver: bridge

The hive-site.xml file content

 1<?xml version="1.0" encoding="UTF-8" standalone="no"?>
 2<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 3<configuration>
 4    <property>
 5        <name>fs.s3a.access.key</name>
 6        <value>xxxx</value>
 7    </property>
 8    <property>
 9        <name>fs.s3a.secret.key</name>
10        <value>xxxx</value>
11    </property>
12    <property>
13        <name>fs.s3a.connection.ssl.enabled</name>
14        <value>false</value>
15    </property>
16    <property>
17        <name>fs.s3a.path.style.access</name>
18        <value>true</value>
19    </property>
20    <property>
21        <name>fs.s3a.endpoint</name>
22        <value>http://minio:9000</value>
23    </property>
24    <property>
25        <name>javax.jdo.option.ConnectionURL</name>
26        <value>jdbc:postgresql://postgres:5432/hive_db?allowPublicKeyRetrieval=true;useSSL=false;serverTimezone=UTC</value>
27    </property>
28    <property>
29        <name>javax.jdo.option.ConnectionDriverName</name>
30        <value>org.postgresql.Driver</value>
31    </property>
32    <property>
33        <name>javax.jdo.option.ConnectionUserName</name>
34        <value>xxxx</value>
35    </property>
36    <property>
37        <name>javax.jdo.option.ConnectionPassword</name>
38        <value>xxxx</value>
39    </property>
40    <property>
41        <name>hive.metastore.event.db.notification.api.auth</name>
42        <value>false</value>
43    </property>
44    <property>
45        <name>metastore.thrift.uris</name>
46        <value>thrift://hive:9083</value>
47        <description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
48    </property>
49    <property>
50        <name>metastore.task.threads.always</name>
51        <value>org.apache.hadoop.hive.metastore.events.EventCleanerTask</value>
52    </property>
53    <property>
54        <name>metastore.expression.proxy</name>
55        <value>org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy</value>
56    </property>
57    <property>
58        <name>metastore.warehouse.dir</name>
59        <value>s3a://warehouse/hive</value>
60    </property>
61</configuration>

Import the SQL schema into PostgreSQL

The SQL file can be found at this URL: hive schema 3.1.0 sql

1psql --user xxxx --password -d hive_db < hive-schema-3.1.0.sql

=================== The following steps support AWS S3 because only Hadoop 2.8.0 and above supports it =======================

Install the hive metastore

 1# add helm repo
 2helm repo add bigdata-gradiant https://gradiant.github.io/bigdata-charts/
 3helm repo update
 4# search the charts of bigdata-gradiant repo
 5helm search repo bigdata-gradiant
 6
 7# create namespace
 8kubectl create ns analytics
 9
10# launch the hive metastore deployment
11helm install hivems bigdata-gradiant/hive-metastore -n analytics
12# view the pods
13kubectl -n analytics get pods
14# change the hivems Docker image because the original image doesn't include the hadoop-aws-2.7.4.jar file
15docker pull jboothomas/hive-metastore-s3:v6
16# load the docker image to Kind cluster
17kind load docker-image jboothomas/hive-metastore-s3:v6  -n local-k8s
18# change the hivems image to modified
19# bde2020/hive:2.3.2-postgresql-metastore
20kubectl patch statefulset  hivems-hive-metastore -n analytics  -p '{"spec": { "template": { "spec": {"containers":[{"name": "metastore","image": "jboothomas/hive-metastore-s3:v6"}]}}}}'

Deploy MinIO to the analytics namespace

  • kubectl apply -f ./minio-deployment.yaml
 1apiVersion: apps/v1 #  for k8s versions before 1.9.0 use apps/v1beta2  and before 1.8.0 use extensions/v1beta1
 2kind: Deployment
 3metadata:
 4  # This name uniquely identifies the Deployment
 5  name: minio-deployment
 6spec:
 7  selector:
 8    matchLabels:
 9      app: minio
10  strategy:
11    type: Recreate
12  template:
13    metadata:
14      labels:
15        # Label is used as selector in the service.
16        app: minio
17    spec:
18      # Refer to the PVC created earlier
19      volumes:
20      - name: storage
21        persistentVolumeClaim:
22          # Name of the PVC created earlier
23          claimName: minio-pv-claim
24      containers:
25      - name: minio
26        # Pulls the default Minio image from Docker Hub
27        image: minio/minio:latest
28        args:
29        - server
30        - /storage
31        env:
32        # Minio access key and secret key
33        - name: MINIO_ACCESS_KEY
34          value: "minio"
35        - name: MINIO_SECRET_KEY
36          value: "minio123"
37        ports:
38        - containerPort: 9000
39          hostPort: 9000
40        # Mount the volume into the pod
41        volumeMounts:
42        - name: storage # must match the volume name, above
43          mountPath: "/storage"
44---
45apiVersion: v1
46kind: Service
47metadata:
48  name: minio-service
49spec:
50  selector:
51    app: minio
52  ports:
53    - protocol: TCP
54      port: 9000
55      targetPort: 9000          
56---
57apiVersion: v1
58kind: PersistentVolumeClaim
59metadata:
60  name: minio-pv-claim
61spec:
62  accessModes:
63    - ReadWriteOnce
64  resources:
65    requests:
66      storage: 10Gi          

Download the New York City Taxi datasets from s3

Manually uploading files into the container's mount folder may not work; only port-forwarding to the MinIO web UI and then creating the bucket and uploading the file there works reliably.

 1# view the datasets bucket
 2aws s3 ls s3://nyc-tlc/
 3# download some csv
 4aws s3 cp s3://nyc-tlc/csv_backup/yellow_tripdata_2022-02.csv ./
 5# This way doesn't work; please use the MinIO web UI to upload instead
 6# copy the dataset file to Kind k8s cluster host mount folder
 7# also this is minio mount pvc path in container
 8# docker cp ./yellow_tripdata_2022-02.csv local-k8s-worker2:/var/local-path-provisioner/pvc-1a14ef0e-49b2-4ad4-ac51-cf8e877306be_analytics_minio-pv-claim/
 9# # create bucket in mount folder and move the file to that bucket
10# mkdir nyctaxi
11# cp yellow_tripdata_2022-02.csv nyctaxi/
12
13# can also port-forward the MinIO service to localhost
14# and open http://127.0.0.1:34389 in a browser to access the MinIO web UI to create buckets and upload files
15k port-forward deploy/minio-deployment -n analytics 34389:34389  9000:9000

Install the HDFS chart. This is not necessary; you can instead change the warehouse dir in hive-site.xml.

1helm pull bigdata-gradiant/hdfs
2tar -xzvf ./hdfs-0.1.10.tgz
3# change the pdb version to policy/v1 in yaml file
4cd hdfs && vi ./templates/hdfs-dn-pdb.yaml 
5vi ./templates/hdfs-nn-pdb.yaml 
6# install the hdfs in local cluster
7helm install hdfs ./hdfs -n analytics

Change the hivems configmap to add the MinIO properties

 1$ kubectl -n analytics get configmap hivems-hive-metastore -o yaml  > hivems-hive-metastore.yaml
 2$ vi hivems-hive-metastore.yaml 
 3### I MAKE THE FOLLOWING CHANGE/ADDITION ###
 4data:
 5  hive-site.xml: |
 6    <?xml version="1.0"?>
 7    <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 8    <configuration>
 9        <property>
10          <name>fs.s3a.endpoint</name>
11          <value>http://minio-service:9000</value> 
12        </property>
13        <property>
14          <name>fs.s3a.access.key</name>
15          <value>xxxxxx</value> 
16        </property>
17        <property>
18          <name>fs.s3a.secret.key</name>
19          <value>xxxxxx</value> 
20        </property>
21        <property>
22          <name>fs.s3a.connection.ssl.enabled</name>
23          <value>false</value>
24        </property>
25        <property>
26            <name>fs.s3a.path.style.access</name>
27            <value>true</value>
28            <description>Enable S3 path style access.</description>
29        </property>
30        <property>
31            <name>fs.s3a.impl</name>
32            <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
33            <description>The implementation of S3A Filesystem</description>
34        </property>    
  • also change the hive.metastore.warehouse.dir section
 1#<property>
 2#    <name>hive.metastore.warehouse.dir</name>
 3#    <value>hdfs://hdfs:8020/user/hive/warehouse</value>
 4#</property>
 5
 6# TO
 7<property>
 8<name>hive.metastore.warehouse.dir</name>
 9<value>s3a://hive/warehouse</value>
10</property>

Create the table schema

 1# enter pod 
 2k exec -it hivems-hive-metastore-0 -n analytics --  sh
 3# execute
 4hive
 5hive> 
 6create external table if not exists nyctaxi(
 7    VendorID bigint,
 8    tpep_pickup_datetime timestamp,
 9    tpep_dropoff_datetime timestamp,
10    passenger_count double,
11    trip_distance double,
12    RatecodeID double,
13    store_and_fwd_flag string,
14    PULocationID bigint,
15    DOLocationID bigint,
16    payment_type bigint,
17    fare_amount double,
18    extra double,
19    mta_tax double,
20    tip_amount double,
21    tolls_amount double,
22    improvement_surcharge double,
23    total_amount double
24    )
25    ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES('separatorChar'=',','quoteChar'='\"')
26    STORED AS TEXTFILE 
27    LOCATION 's3a://test/' TBLPROPERTIES('skip.header.line.count'='1');

The related jar packages can be downloaded from:

Reference