Commit d360d1d

SergeAlexandre authored and idirze committed
feat: Create hive metastore docker image
1 parent 6a73f25 commit d360d1d

File tree

docker/Dockerfile-build
docker/Dockerfile-download
docker/hive.sudoer
docker/metastore.sh

4 files changed: 343 additions, 0 deletions

docker/Dockerfile-build

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
# Need these versions of maven and jdk
FROM maven:3.8-jdk-8 as builder

ARG ARG_METASTORE_VERSION

WORKDIR /workspace

RUN git clone https://github.com/apache/hive.git
RUN cd hive && git checkout rel/release-${ARG_METASTORE_VERSION}
RUN cd hive && mvn -pl standalone-metastore package -Pdist -DskipTests -Dmaven.javadoc.skip=true

# Taken as reference from:
# https://github.com/apache/spark-docker
# 3.5.0/scala2.12-java11-ubuntu/Dockerfile
#
# https://hub.docker.com/_/eclipse-temurin/tags?page=1&name=11-jre-focal
#
# NB: Got a jdbc issue with FROM eclipse-temurin:8u402-b06-jre-alpine
#
FROM eclipse-temurin:11-jre-focal

ARG ARG_HADOOP_VERSION
ARG ARG_METASTORE_VERSION
ARG ARG_JDBC_VERSION

ENV HADOOP_VERSION=$ARG_HADOOP_VERSION
ENV METASTORE_VERSION=$ARG_METASTORE_VERSION
ENV JDBC_VERSION=$ARG_JDBC_VERSION

ENV HADOOP_HOME=/opt/hadoop-${HADOOP_VERSION}
ENV HIVE_HOME=/opt/apache-hive-metastore-${METASTORE_VERSION}-bin

RUN apt-get update && apt-get install -y postgresql-client sudo && apt-get clean

WORKDIR /opt

COPY --from=builder /workspace/hive/standalone-metastore/target/apache-hive-metastore-*-bin.tar.gz /tmp/

RUN tar zxf /tmp/apache-hive-metastore-*-bin.tar.gz && rm /tmp/apache-hive-metastore-*-bin.tar.gz

# dlcdn.apache.org is faster, but does not provide the 3.2.0 version, so fall back on archive.apache.org
#RUN curl -L https://dlcdn.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | tar zxf - && \
#    rm -f hadoop-${HADOOP_VERSION}/share/hadoop/common/lib/slf4j-log4j12-*.jar

RUN curl -L https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | tar zxf - && \
    rm -f hadoop-${HADOOP_VERSION}/share/hadoop/common/lib/slf4j-log4j12-*.jar

RUN curl -L https://jdbc.postgresql.org/download/postgresql-${JDBC_VERSION}.jar -o /opt/apache-hive-metastore-${METASTORE_VERSION}-bin/lib/postgresql-${JDBC_VERSION}.jar

COPY docker/metastore.sh /metastore.sh

RUN groupadd -r hive --gid=1000 && \
    useradd -r -g hive --uid=1000 -d ${HIVE_HOME} hive && \
    chown hive:hive -R ${HIVE_HOME} && \
    chown hive:hive /metastore.sh && chmod +x /metastore.sh

# Security risk: the container must run in a namespace with a relaxed PSP to be able to sudo
COPY docker/hive.sudoer /etc/sudoers.d/hive

USER 1000
EXPOSE 9083

ENTRYPOINT ["sh", "-c", "/metastore.sh"]
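
For reference, a hypothetical invocation of this build-from-source variant. The version values below are illustrative assumptions, not pinned by the Dockerfile; ARG_METASTORE_VERSION must match an existing rel/release-<version> tag in the apache/hive repository, since the builder stage checks that tag out and packages the standalone-metastore module.

docker build -f docker/Dockerfile-build \
  --build-arg ARG_METASTORE_VERSION=3.1.3 \
  --build-arg ARG_HADOOP_VERSION=3.2.0 \
  --build-arg ARG_JDBC_VERSION=42.7.3 \
  -t hive-metastore:3.1.3 .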

docker/Dockerfile-download

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
# Taken as reference from:
# https://github.com/apache/spark-docker
# 3.5.0/scala2.12-java11-ubuntu/Dockerfile
#
# https://hub.docker.com/_/eclipse-temurin/tags?page=1&name=11-jre-focal
#
# NB: Got a jdbc issue with FROM eclipse-temurin:8u402-b06-jre-alpine
#
FROM eclipse-temurin:11-jre-focal

ARG ARG_HADOOP_VERSION
ARG ARG_METASTORE_VERSION
ARG ARG_JDBC_VERSION

ENV HADOOP_VERSION=$ARG_HADOOP_VERSION
ENV METASTORE_VERSION=$ARG_METASTORE_VERSION
ENV JDBC_VERSION=$ARG_JDBC_VERSION

ENV HADOOP_HOME=/opt/hadoop-${HADOOP_VERSION}
ENV HIVE_HOME=/opt/apache-hive-metastore-${METASTORE_VERSION}-bin

RUN apt-get update && apt-get install -y postgresql-client sudo && apt-get clean

WORKDIR /opt

RUN curl -L https://repo1.maven.org/maven2/org/apache/hive/hive-standalone-metastore/${METASTORE_VERSION}/hive-standalone-metastore-${METASTORE_VERSION}-bin.tar.gz | tar xvzf -

# dlcdn.apache.org is faster, but does not provide the 3.2.0 version, so fall back on archive.apache.org
#RUN curl -L https://dlcdn.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | tar zxf - && \
#    rm -f hadoop-${HADOOP_VERSION}/share/hadoop/common/lib/slf4j-log4j12-*.jar

RUN curl -L https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | tar zxf - && \
    rm -f hadoop-${HADOOP_VERSION}/share/hadoop/common/lib/slf4j-log4j12-*.jar

RUN curl -L https://jdbc.postgresql.org/download/postgresql-${JDBC_VERSION}.jar -o /opt/apache-hive-metastore-${METASTORE_VERSION}-bin/lib/postgresql-${JDBC_VERSION}.jar

COPY docker/metastore.sh /metastore.sh

RUN groupadd -r hive --gid=1000 && \
    useradd -r -g hive --uid=1000 -d ${HIVE_HOME} hive && \
    chown hive:hive -R ${HIVE_HOME} && \
    chown hive:hive /metastore.sh && chmod +x /metastore.sh

# Security risk: the container must run in a namespace with a relaxed PSP to be able to sudo
COPY docker/hive.sudoer /etc/sudoers.d/hive

USER 1000
EXPOSE 9083

ENTRYPOINT ["sh", "-c", "/metastore.sh"]
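
This variant skips the Maven build and instead downloads the prebuilt hive-standalone-metastore-${METASTORE_VERSION}-bin.tar.gz from Maven Central, so ARG_METASTORE_VERSION must be a version for which that -bin artifact was actually published. A hypothetical invocation, with illustrative version values:

docker build -f docker/Dockerfile-download \
  --build-arg ARG_METASTORE_VERSION=3.0.0 \
  --build-arg ARG_HADOOP_VERSION=3.2.0 \
  --build-arg ARG_JDBC_VERSION=42.7.3 \
  -t hive-metastore:3.0.0 .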

docker/hive.sudoer

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
hive ALL=(ALL) NOPASSWD:ALL

docker/metastore.sh

Lines changed: 227 additions & 0 deletions
@@ -0,0 +1,227 @@
#!/bin/sh

MODE=$1

if [ "$MODE" = "hms" ]; then
  echo "Will run the metastore"
elif [ "$MODE" = "init" ]; then
  echo "Will initialize the DB if not yet done"
else
  echo "First parameter must be 'hms' or 'init'"
  exit 20
fi

if [ -z "${HIVEMS_DB}" ]; then echo "HIVEMS_DB env variable must be defined!"; exit 1; fi
if [ -z "${HIVEMS_USER}" ]; then echo "HIVEMS_USER env variable must be defined!"; exit 1; fi
if [ -z "${HIVEMS_PASSWORD}" ]; then echo "HIVEMS_PASSWORD env variable must be defined!"; exit 1; fi
if [ -z "${DB_HOST}" ]; then echo "DB_HOST env variable must be defined!"; exit 1; fi
if [ -z "${DB_PORT}" ]; then echo "DB_PORT env variable must be defined!"; exit 1; fi
if [ -z "${METASTORE_VERSION}" ]; then echo "METASTORE_VERSION env variable must be defined!"; exit 1; fi
if [ -z "${HADOOP_VERSION}" ]; then echo "HADOOP_VERSION env variable must be defined!"; exit 1; fi

# May be empty when AWS instance roles are used
#if [ -z "${S3_ENDPOINT}" ]; then echo "S3_ENDPOINT env variable must be defined!"; exit 1; fi
#if [ -z "${S3_ACCESS_KEY}" ]; then echo "S3_ACCESS_KEY env variable must be defined!"; exit 1; fi
#if [ -z "${S3_SECRET_KEY}" ]; then echo "S3_SECRET_KEY env variable must be defined!"; exit 1; fi

if [ -z "${JAVA_HOME}" ]; then export JAVA_HOME=/usr/local/openjdk-8; fi
if [ -z "${BASEDIR}" ]; then export BASEDIR=/opt; fi
if [ -z "${LOG_LEVEL}" ]; then export LOG_LEVEL=INFO; fi
if [ -z "${THRIFT_LISTENING_PORT}" ]; then export THRIFT_LISTENING_PORT=9083; fi
if [ -z "${S3_REQUEST_TIMEOUT}" ]; then export S3_REQUEST_TIMEOUT=0; fi

export HADOOP_HOME=${BASEDIR}/hadoop-${HADOOP_VERSION}
export HADOOP_CLASSPATH=${HADOOP_HOME}/share/hadoop/tools/lib/aws-java-sdk-bundle-*.jar:${HADOOP_HOME}/share/hadoop/tools/lib/hadoop-aws-${HADOOP_VERSION}.jar

echo ""
echo "METASTORE_VERSION=$METASTORE_VERSION"
echo "HADOOP_VERSION=$HADOOP_VERSION"
echo ""

cat >${BASEDIR}/apache-hive-metastore-${METASTORE_VERSION}-bin/conf/metastore-log4j2.properties <<-EOF
status = INFO
name = MetastoreLog4j2
packages = org.apache.hadoop.hive.metastore
# list of all appenders
appenders = console
# console appender
appender.console.type = Console
appender.console.name = console
appender.console.target = SYSTEM_ERR
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %d{ISO8601} %5p [%t] %c{2}: %m%n
# list of all loggers
loggers = DataNucleus, Datastore, JPOX, PerfLogger
logger.DataNucleus.name = DataNucleus
logger.DataNucleus.level = ERROR
logger.Datastore.name = Datastore
logger.Datastore.level = ERROR
logger.JPOX.name = JPOX
logger.JPOX.level = ERROR
logger.PerfLogger.name = org.apache.hadoop.hive.ql.log.PerfLogger
logger.PerfLogger.level = INFO
# root logger
rootLogger.level = ${LOG_LEVEL}
rootLogger.appenderRefs = root
rootLogger.appenderRef.root.ref = console
EOF

cat >${BASEDIR}/apache-hive-metastore-${METASTORE_VERSION}-bin/conf/metastore-site.xml <<-EOF
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>metastore.thrift.uris</name>
    <value>thrift://localhost:9083</value>
    <description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
  </property>
  <property>
    <name>metastore.task.threads.always</name>
    <value>org.apache.hadoop.hive.metastore.events.EventCleanerTask,org.apache.hadoop.hive.metastore.MaterializationsRebuildLockCleanerTask</value>
  </property>
  <property>
    <name>metastore.expression.proxy</name>
    <value>org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy</value>
  </property>
  <property>
    <name>metastore.server.min.threads</name>
    <value>5</value>
  </property>
  <property>
    <name>metastore.server.max.threads</name>
    <value>20</value>
  </property>
  <property>
    <name>javax.jdo.option.Multithreaded</name>
    <value>true</value>
    <description>Set this to true if multiple threads access metastore through JDO concurrently.</description>
  </property>
  <property>
    <name>javax.jdo.PersistenceManagerFactoryClass</name>
    <value>org.datanucleus.api.jdo.JDOPersistenceManagerFactory</value>
    <description>class implementing the jdo persistence</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>org.postgresql.Driver</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:postgresql://${DB_HOST}:${DB_PORT}/${HIVEMS_DB}</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>${HIVEMS_USER}</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>${HIVEMS_PASSWORD}</value>
  </property>
  <property>
    <name>fs.s3a.path.style.access</name>
    <value>true</value>
  </property>
  <property>
    <name>fs.s3a.connection.request.timeout</name>
    <value>${S3_REQUEST_TIMEOUT}</value>
  </property>
EOF

if [ ! -z "${S3_ENDPOINT}" ]
then
cat >>${BASEDIR}/apache-hive-metastore-${METASTORE_VERSION}-bin/conf/metastore-site.xml <<-EOF
  <property>
    <name>fs.s3a.endpoint</name>
    <value>${S3_ENDPOINT}</value>
  </property>
EOF
fi

if [ ! -z "${S3_ACCESS_KEY}" ]
then
cat >>${BASEDIR}/apache-hive-metastore-${METASTORE_VERSION}-bin/conf/metastore-site.xml <<-EOF
  <property>
    <name>fs.s3a.access.key</name>
    <value>${S3_ACCESS_KEY}</value>
  </property>
  <property>
    <name>fs.s3a.secret.key</name>
    <value>${S3_SECRET_KEY}</value>
  </property>
EOF
fi

if [ ! -z "${ASSUME_ROLE_ARN}" ]
then
cat >>${BASEDIR}/apache-hive-metastore-${METASTORE_VERSION}-bin/conf/metastore-site.xml <<-EOF
  <property>
    <name>fs.s3a.aws.credentials.provider</name>
    <value>org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider</value>
  </property>
  <property>
    <name>fs.s3a.assumed.role.credentials.provider</name>
    <value>com.amazonaws.auth.InstanceProfileCredentialsProvider</value>
  </property>
  <property>
    <name>fs.s3a.assumed.role.arn</name>
    <value>${ASSUME_ROLE_ARN}</value>
  </property>
EOF
fi

cat >>${BASEDIR}/apache-hive-metastore-${METASTORE_VERSION}-bin/conf/metastore-site.xml <<-EOF
</configuration>
EOF

# set +x

export PGPASSWORD=${HIVEMS_PASSWORD}

echo "Will wait for the postgresql server to be ready"
while ! pg_isready --host ${DB_HOST} --port ${DB_PORT}; do echo "Waiting for postgresql to be ready..."; sleep 2; done;

echo "Will wait for \"${HIVEMS_DB}\" to exist"
while ! psql --host ${DB_HOST} --port ${DB_PORT} -U ${HIVEMS_USER} -d ${HIVEMS_DB} -c "\c ${HIVEMS_DB}" >/dev/null 2>&1; do echo "Waiting for ${HIVEMS_DB} database to be ready..."; sleep 2; done;

if [ "$MODE" = "init" ]; then
  echo "Initialize schema if the DBS table does not exist"
  psql --host ${DB_HOST} --port ${DB_PORT} -U ${HIVEMS_USER} -d ${HIVEMS_DB} -c 'SELECT "DB_ID" FROM "DBS"' >/dev/null 2>&1;
  if [ $? -ne 0 ]
  then
    echo "Will initialize the DB"
    ${BASEDIR}/apache-hive-metastore-${METASTORE_VERSION}-bin/bin/schematool -initSchema -dbType postgres
  fi
  echo "DATABASE SCHEMA SHOULD BE OK NOW!!"
  exit 0
fi

# MODE = "hms" here

echo "Will wait for the database schema to be ready...."
while ! psql --host ${DB_HOST} --port ${DB_PORT} -U ${HIVEMS_USER} -d ${HIVEMS_DB} -c 'SELECT "SCHEMA_VERSION" FROM "VERSION"' >/dev/null 2>&1; do echo "Waiting for ${HIVEMS_DB} schema to be ready..."; sleep 2; done;
echo "DATABASE SCHEMA IS OK. CAN LAUNCH!!"
echo ""

unset PGPASSWORD

export HADOOP_CLIENT_OPTS="$HADOOP_CLIENT_OPTS -Dcom.amazonaws.sdk.disableCertChecking=true"

# WARNING: Kubernetes sets this variable to something like tcp://XX.XX.XX.XX:9083, while the
# metastore expects it to hold only the listening port as a single number, so startup would fail.
unset METASTORE_PORT

${BASEDIR}/apache-hive-metastore-${METASTORE_VERSION}-bin/bin/start-metastore -p $THRIFT_LISTENING_PORT
err=$?

if [ -n "$WAIT_ON_ERROR" ]; then
  if [ $err -ne 0 ]; then
    echo "ERROR: rc=$err. Will wait $WAIT_ON_ERROR sec...."
    sleep $WAIT_ON_ERROR
  fi
fi

exit $err
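
A minimal sketch of exercising the two modes of this script locally; the image tag, database host, and credentials are illustrative assumptions, and a reachable PostgreSQL instance with the target database already created is assumed. The entrypoint is overridden because the image's ENTRYPOINT wraps the script in sh -c, which does not forward extra container arguments to it; METASTORE_VERSION and HADOOP_VERSION are already baked into the image as ENV values.

# One-time schema initialization (creates the metastore tables if the DBS table is missing)
docker run --rm --entrypoint /metastore.sh \
  -e HIVEMS_DB=hivems -e HIVEMS_USER=hive -e HIVEMS_PASSWORD=changeme \
  -e DB_HOST=postgres.example.local -e DB_PORT=5432 \
  hive-metastore:3.1.3 init

# Run the metastore service itself, listening on the default Thrift port 9083
docker run -d -p 9083:9083 --entrypoint /metastore.sh \
  -e HIVEMS_DB=hivems -e HIVEMS_USER=hive -e HIVEMS_PASSWORD=changeme \
  -e DB_HOST=postgres.example.local -e DB_PORT=5432 \
  hive-metastore:3.1.3 hms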
