Preparing a Trino + Hive metastore + K8s + MinIO environment
Build the Hive image
```dockerfile
FROM openjdk:8u302

WORKDIR /opt/hive
ENV HADOOP_HOME=/opt/hadoop-3.2.3

ADD ./hadoop-3.2.3 /opt/hadoop-3.2.3
ADD ./apache-hive-3.1.3-bin /opt/hive
# Copy the newer guava and the S3A jars from the Hadoop distribution into Hive's lib dir
ADD ./hadoop-3.2.3/share/hadoop/common/lib/guava-27.0-jre.jar /opt/hive/lib/
ADD ./hadoop-3.2.3/share/hadoop/tools/lib/hadoop-aws-3.2.3.jar /opt/hive/lib/
ADD ./hadoop-3.2.3/share/hadoop/tools/lib/aws-java-sdk-bundle-1.11.901.jar /opt/hive/lib/

# printf (not echo) so the \n escapes become real newlines in ~/.bashrc
RUN printf 'export HIVE_HOME=/opt/hive\nexport HIVE_CONF_DIR=/opt/hive/conf\nexport PATH=$PATH:$HIVE_HOME/bin\n' >> ~/.bashrc
# RUN curl https://jdbc.postgresql.org/download/postgresql-42.7.1.jar --output /opt/hive/lib/postgresql-42.7.1.jar
# Drop the old guava bundled with Hive; it conflicts with the guava-27 copied above
RUN rm -f /opt/hive/lib/guava-19.0.jar
```
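
With the Dockerfile in place, build the image and tag it to match the `hivems:3.1.2` reference used in the compose file below (assuming the Hadoop and Hive distributions are already unpacked next to the Dockerfile):

```bash
# build the metastore image; the tag must match docker-compose.yaml
docker build -t hivems:3.1.2 .
```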
Define the docker-compose.yaml
```yaml
version: '3.7'
services:
  minio:
    image: 'minio/minio:latest'
    hostname: minio
    container_name: minio
    ports:
      - '9000:9000'
      - '9001:9001'
    volumes:
      - ./minio-data:/data
    environment:
      MINIO_ACCESS_KEY: xxxx
      MINIO_SECRET_KEY: xxxx
    command: server --console-address ":9001" /data
    networks:
      - trino-network

  postgres:
    image: 'postgres:11'
    hostname: postgres
    expose:
      - '5432'
    ports:
      - '5431:5432'
    volumes:
      - ./hive-schema-3.1.0.sql:/docker-entrypoint-initdb.d/hive-schema-3.1.0.sql
    environment:
      POSTGRES_USER: xxxx
      POSTGRES_PASSWORD: xxxx
      POSTGRES_DB: hive_db
    networks:
      - trino-network

  hive:
    image: 'hivems:3.1.2'
    hostname: hive
    ports:
      - '9083:9083' # Metastore Thrift
    command: ["/opt/hive/bin/hive", "--service", "metastore"]
    volumes:
      - ./hive-site.xml:/opt/hive/conf/hive-site.xml:ro
    environment:
      METASTORE_DB_HOSTNAME: postgres
      METASTORE_TYPE: postgres
    depends_on:
      - postgres
    networks:
      - trino-network

networks:
  trino-network:
    driver: bridge
```
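
A quick way to bring the stack up and sanity-check it (service names as defined above):

```bash
# start everything in the background and confirm all three services are running
docker compose up -d
docker compose ps
# the metastore logs should show it listening on Thrift port 9083
docker compose logs hive
```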
The hive-site.xml file content
```xml
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>fs.s3a.access.key</name>
    <value>xxxx</value>
  </property>
  <property>
    <name>fs.s3a.secret.key</name>
    <value>xxxx</value>
  </property>
  <property>
    <name>fs.s3a.connection.ssl.enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>fs.s3a.path.style.access</name>
    <value>true</value>
  </property>
  <property>
    <name>fs.s3a.endpoint</name>
    <value>http://minio:9000</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <!-- plain PostgreSQL JDBC URL; allowPublicKeyRetrieval/useSSL/serverTimezone are MySQL-only options -->
    <value>jdbc:postgresql://postgres:5432/hive_db</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>org.postgresql.Driver</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>xxxx</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>xxxx</value>
  </property>
  <property>
    <name>hive.metastore.event.db.notification.api.auth</name>
    <value>false</value>
  </property>
  <property>
    <name>metastore.thrift.uris</name>
    <value>thrift://hive:9083</value>
    <description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
  </property>
  <property>
    <name>metastore.task.threads.always</name>
    <value>org.apache.hadoop.hive.metastore.events.EventCleanerTask</value>
  </property>
  <property>
    <name>metastore.expression.proxy</name>
    <value>org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy</value>
  </property>
  <property>
    <name>metastore.warehouse.dir</name>
    <value>s3a://warehouse/hive</value>
  </property>
</configuration>
```
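
For the Trino side of this setup, a catalog file pointing at this metastore and at MinIO can look like the sketch below. This is an assumption based on Trino's documented Hive connector properties, not part of the original stack; adjust the path (`etc/catalog/hive.properties`) and the placeholder credentials to your deployment:

```bash
# sketch: write a Trino Hive catalog that talks to the metastore and MinIO defined above
cat > etc/catalog/hive.properties <<'EOF'
connector.name=hive
hive.metastore.uri=thrift://hive:9083
# MinIO endpoint and the same placeholder keys as in hive-site.xml
hive.s3.endpoint=http://minio:9000
hive.s3.aws-access-key=xxxx
hive.s3.aws-secret-key=xxxx
hive.s3.path-style-access=true
hive.s3.ssl.enabled=false
EOF
```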
Import the SQL into PostgreSQL
The SQL file can be found at this URL: hive schema 3.1.0 sql. (The compose file above also mounts it into `docker-entrypoint-initdb.d`, so a fresh Postgres container runs it automatically on first start; the manual import below is for an already-initialized database.)

```bash
# note: psql takes --username (or -U); --user is a MySQL-style option
psql --username xxxx --password -d hive_db < hive-schema-3.1.0.sql
```
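
To confirm the schema landed, a quick check (assuming the same xxxx credentials):

```bash
# the imported schema should contain metastore tables such as "DBS", "TBLS" and "VERSION"
psql --username xxxx --password -d hive_db -c '\dt'
```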
=================== The following steps require Hadoop 2.8.0 or above, because only those versions support AWS S3 (S3A) ===================
Install the Hive metastore
```bash
# add the helm repo
helm repo add bigdata-gradiant https://gradiant.github.io/bigdata-charts/
helm repo update
# search the charts of the bigdata-gradiant repo
helm search repo bigdata-gradiant

# create the namespace
kubectl create ns analytics

# launch the hive metastore deployment
helm install hivems bigdata-gradiant/hive-metastore -n analytics
# view the pods
kubectl -n analytics get pods
# change the hivems docker image, because the original image doesn't include the hadoop-aws-2.7.4.jar file
docker pull jboothomas/hive-metastore-s3:v6
# load the docker image into the Kind cluster
kind load docker-image jboothomas/hive-metastore-s3:v6 --name local-k8s
# change the hivems image from the chart default
# (bde2020/hive:2.3.2-postgresql-metastore) to the modified one
kubectl patch statefulset hivems-hive-metastore -n analytics -p '{"spec": { "template": { "spec": {"containers":[{"name": "metastore","image": "jboothomas/hive-metastore-s3:v6"}]}}}}'
```
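
After patching, it's worth confirming the StatefulSet rolled out with the new image (a sketch using the resource names above):

```bash
kubectl -n analytics rollout status statefulset/hivems-hive-metastore
# print the image the pod is actually running
kubectl -n analytics get pod hivems-hive-metastore-0 -o jsonpath='{.spec.containers[0].image}'
```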
Deploy MinIO to the analytics namespace

```bash
kubectl apply -f ./minio-deployment.yaml -n analytics
```
```yaml
apiVersion: apps/v1 # for k8s versions before 1.9.0 use apps/v1beta2 and before 1.8.0 use extensions/v1beta1
kind: Deployment
metadata:
  # This name uniquely identifies the Deployment
  name: minio-deployment
spec:
  selector:
    matchLabels:
      app: minio
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        # Label is used as selector in the service.
        app: minio
    spec:
      # Refer to the PVC created below
      volumes:
        - name: storage
          persistentVolumeClaim:
            # Name of the PVC
            claimName: minio-pv-claim
      containers:
        - name: minio
          # Pulls the default MinIO image from Docker Hub
          image: minio/minio:latest
          args:
            - server
            - /storage
          env:
            # MinIO access key and secret key
            - name: MINIO_ACCESS_KEY
              value: "minio"
            - name: MINIO_SECRET_KEY
              value: "minio123"
          ports:
            - containerPort: 9000
              hostPort: 9000
          # Mount the volume into the pod
          volumeMounts:
            - name: storage # must match the volume name, above
              mountPath: "/storage"
---
apiVersion: v1
kind: Service
metadata:
  name: minio-service
spec:
  selector:
    app: minio
  ports:
    - protocol: TCP
      port: 9000
      targetPort: 9000
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: minio-pv-claim
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 10Gi
```
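
A quick check that everything in the manifest came up (resource names as defined above):

```bash
kubectl -n analytics get deployment minio-deployment
kubectl -n analytics get svc minio-service
kubectl -n analytics get pvc minio-pv-claim
kubectl -n analytics get pods -l app=minio
```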
Download the New York City Taxi datasets from S3

Manually copying files into the container's mount folder may not work; what does work is port-forwarding the MinIO service, then creating the bucket and uploading the file through the MinIO web UI.
```bash
# view the datasets bucket
aws s3 ls s3://nyc-tlc/
# download some csv
aws s3 cp s3://nyc-tlc/csv_backup/yellow_tripdata_2022-02.csv ./
# The approach below doesn't work; use the MinIO web UI to upload instead
# copy the dataset file into the Kind k8s cluster host mount folder
# (this is also the MinIO PVC mount path inside the container)
# docker cp ./yellow_tripdata_2022-02.csv local-k8s-worker2:/var/local-path-provisioner/pvc-1a14ef0e-49b2-4ad4-ac51-cf8e877306be_analytics_minio-pv-claim/
# # create a bucket directory in the mount folder and move the file into it
# mkdir nyctaxi
# cp yellow_tripdata_2022-02.csv nyctaxi/

# you can also port-forward the MinIO service to localhost
# and open http://127.0.0.1:34389 for the MinIO web UI to create the bucket
k port-forward deploy/minio-deployment -n analytics 34389:34389 9000:9000
```
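
As an alternative to the web UI, the MinIO client can create the bucket and upload the file over the port-forward (a sketch; `mc` must be installed locally, and the minio/minio123 credentials come from the Deployment above):

```bash
# point mc at the port-forwarded MinIO API, create the dataset bucket, and upload
mc alias set localminio http://127.0.0.1:9000 minio minio123
mc mb localminio/nyctaxi
mc cp ./yellow_tripdata_2022-02.csv localminio/nyctaxi/
```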
Install HDFS
(Note: this is not necessary; instead of installing HDFS you can change the warehouse dir in hive-site.xml to point at S3, as shown in the next section.)
```bash
helm pull bigdata-gradiant/hdfs
tar -xzvf ./hdfs-0.1.10.tgz
# change the PodDisruptionBudget apiVersion to policy/v1 in both yaml files
cd hdfs && vi ./templates/hdfs-dn-pdb.yaml
vi ./templates/hdfs-nn-pdb.yaml
# install HDFS in the local cluster
helm install hdfs ./hdfs -n analytics
```
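
To confirm the chart installed cleanly (pod names depend on the release name used above):

```bash
kubectl -n analytics get pods | grep hdfs
```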
Change the hivems ConfigMap to add the MinIO properties
```bash
$ kubectl -n analytics get configmap hivems-hive-metastore -o yaml > hivems-hive-metastore.yaml
$ vi hivems-hive-metastore.yaml
```

I make the following change/addition:

```yaml
data:
  hive-site.xml: |
    <?xml version="1.0"?>
    <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
    <configuration>
      <property>
        <name>fs.s3a.endpoint</name>
        <value>http://minio-service:9000</value>
      </property>
      <property>
        <name>fs.s3a.access.key</name>
        <value>xxxxxx</value>
      </property>
      <property>
        <name>fs.s3a.secret.key</name>
        <value>xxxxxx</value>
      </property>
      <property>
        <name>fs.s3a.connection.ssl.enabled</name>
        <value>false</value>
      </property>
      <property>
        <name>fs.s3a.path.style.access</name>
        <value>true</value>
        <description>Enable S3 path style access.</description>
      </property>
      <property>
        <name>fs.s3a.impl</name>
        <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
        <description>The implementation of S3A Filesystem</description>
      </property>
```
- Also change the hive.metastore.warehouse.dir section in the same file:
```xml
<!-- FROM -->
<!--
<property>
  <name>hive.metastore.warehouse.dir</name>
  <value>hdfs://hdfs:8020/user/hive/warehouse</value>
</property>
-->
<!-- TO -->
<property>
  <name>hive.metastore.warehouse.dir</name>
  <value>s3a://hive/warehouse</value>
</property>
```
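
The edited ConfigMap then has to be applied back, and the metastore pod restarted so it picks up the new hive-site.xml (a sketch using the resource names above):

```bash
kubectl apply -f hivems-hive-metastore.yaml -n analytics
# StatefulSet pods only reread the mounted ConfigMap after a restart
kubectl delete pod hivems-hive-metastore-0 -n analytics
```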
Create the table schema
```bash
# enter the pod
k exec -it hivems-hive-metastore-0 -n analytics -- sh
# start the Hive CLI and run the DDL
hive
hive>
create external table if not exists nyctaxi(
  VendorID bigint,
  tpep_pickup_datetime timestamp,
  tpep_dropoff_datetime timestamp,
  passenger_count double,
  trip_distance double,
  RatecodeID double,
  store_and_fwd_flag string,
  PULocationID bigint,
  DOLocationID bigint,
  payment_type bigint,
  fare_amount double,
  extra double,
  mta_tax double,
  tip_amount double,
  tolls_amount double,
  improvement_surcharge double,
  total_amount double
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES('separatorChar'=',','quoteChar'='\"')
STORED AS TEXTFILE
LOCATION 's3a://test/' TBLPROPERTIES('skip.header.line.count'='1');
```
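
Once the table exists, it should be queryable from Trino through the Hive catalog (a sketch; the `hive` catalog from the earlier properties file and a coordinator on localhost:8080 are assumptions):

```bash
# count the rows Trino can see through the metastore + MinIO wiring
trino --server http://localhost:8080 --catalog hive --schema default \
  --execute 'SELECT count(*) FROM nyctaxi'
```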
The related JAR packages can be downloaded via the links in the Reference section below.
Reference
- New York City Taxi and Limousine Commission
- minio with hive
- Hive-metastore on K8S with S3 external table
- Create your first table on Hive using data from CSV
- Modern Data Lake with MinIO : Part 2
- Access MinIO S3 Storage in Trino with Hive Metastore
- Hive sql syntax
- hive schema 3.1.0 sql
- Querying S3 Object Stores with Presto or Trino
- Configure Apache Hive to use Postgres as Metastore
- Determine Compatibility of hadoop-aws and aws-java-sdk-bundle JARs
- trino-minio-docker compose
- https://blog.csdn.net/w8998036/article/details/134568944
- Hive Standalone Metastore for Trino in Docker