
Apache Kylin : Analytical Data Warehouse for Big Data


Welcome to Kylin Wiki.


Page History

Date          Author             Comment
2021-01-06    xxyu@apache.org    Create for Kylin 4.0.0-beta.



Kylin on EMR 5.31 

Create an EMR cluster

Create Cluster
# Create an EMR cluster
$ aws emr create-cluster --applications Name=Hadoop Name=Hive Name=Pig Name=Spark Name=Sqoop Name=Tez Name=ZooKeeper \
	--release-label emr-5.31.0  \
	--ec2-attributes '{"KeyName":"XiaoxiangYu","InstanceProfile":"EMR_EC2_DefaultRole","SubnetId":"subnet-XXX","EmrManagedSlaveSecurityGroup":"XXX","EmrManagedMasterSecurityGroup":"XXX"}'  \
	--log-uri 's3n://aws-logs-XXX/elasticmapreduce/xiaoxiangyu' \
	--instance-groups '[{"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":100,"VolumeType":"gp2"},"VolumesPerInstance":1}]},"InstanceGroupType":"MASTER","InstanceType":"m5.xlarge","Configurations":[{"Classification":"hive-site","Properties":{"hive.optimize.sort.dynamic.partition":"false"}}],"Name":"Master Node"},{"InstanceCount":2,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":50,"VolumeType":"gp2"},"VolumesPerInstance":1}]},"InstanceGroupType":"CORE","InstanceType":"m5.xlarge","Configurations":[{"Classification":"hive-site","Properties":{"hive.optimize.sort.dynamic.partition":"false"}}],"Name":"Worker Node"}]' \
	--configurations '[{"Classification":"mapred-site","Properties":{"mapreduce.map.memory.mb":"3072","mapreduce.reduce.memory.mb":"6144","mapreduce.map.java.opts":"-Xmx2458m","mapreduce.reduce.java.opts":"-Xmx4916m"}},{"Classification":"yarn-site","Properties":{"yarn.nodemanager.resource.cpu-vcores":"4","yarn.nodemanager.resource.memory-mb":"12288","yarn.scheduler.maximum-allocation-mb":"12288","yarn.app.mapreduce.am.resource.mb":"6144"}},{"Classification":"emrfs-site","Properties":{"fs.s3.consistent":"false"}}]' --auto-scaling-role EMR_AutoScaling_DefaultRole \
	--ebs-root-volume-size 50 --service-role EMR_DefaultRole --enable-debugging --scale-down-behavior TERMINATE_AT_TASK_COMPLETION \
	--name 'OSS-Dev-Cluster' \
	--region XXX

# Log in to the master node
$ ssh -i ~/XXX.pem hadoop@ec2-XXX.compute.amazonaws.com.cn 
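
Cluster provisioning takes several minutes. Before logging in, you can poll the cluster state with the AWS CLI; the cluster id below is a placeholder for the ClusterId returned by create-cluster.

# Check the cluster state (replace j-XXX with the ClusterId returned by create-cluster)
$ aws emr describe-cluster --cluster-id j-XXX --region XXX --query 'Cluster.Status.State'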

Check Hadoop version and download Kylin and Spark

[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# hadoop version
Hadoop 2.10.0-amzn-0
Subversion git@aws157git.com:/pkg/Aws157BigTop -r d1e860a34cc1aea3d600c57c5c0270ea41579e8c
Compiled by ec2-user on 2020-09-19T02:05Z
Compiled with protoc 2.5.0
From source with checksum 61f0bc74ab37bcbfbc09b3846ee32b
This command was run using /usr/lib/hadoop/hadoop-common-2.10.0-amzn-0.jar
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# hive --version
Hive 2.3.7-amzn-1
Git git://ip-10-0-0-57/workspace/workspace/bigtop.release-rpm-5.31.0/build/hive/rpm/BUILD/apache-hive-2.3.7-amzn-1-src -r d1e860a34cc1aea3d600c57c5c0270ea41579e8c
Compiled by ec2-user on Sat Sep 19 02:48:49 UTC 2020
From source with checksum b7d9cc83f78a0b3e0f2b22c78e54aae1


[root@ip-172-31-1-253 hadoop]# aws s3 cp s3://XXX/xxyu_upload/apache-kylin-4.0.0-SNAPSHOT-bin.tar-b08c1be22eb51796fb58c3694e86e60f948337f6.gz .
download: s3://xiaoxiang-yu/xxyu_upload/apache-kylin-4.0.0-SNAPSHOT-bin.tar-b08c1be22eb51796fb58c3694e86e60f948337f6.gz to ./apache-kylin-4.0.0-SNAPSHOT-bin.tar-b08c1be22eb51796fb58c3694e86e60f948337f6.gz
[root@ip-172-31-1-253 hadoop]# tar zxf apache-kylin-4.0.0-SNAPSHOT-bin.tar-b08c1be22eb51796fb58c3694e86e60f948337f6.gz
[root@ip-172-31-1-253 hadoop]# cd apache-kylin-4.0.0-SNAPSHOT-bin/
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# aws s3 cp s3://XXX/xxyu_upload/spark-2.4.6-bin-hadoop2.7.tgz .
download: s3://XXX/xxyu_upload/spark-2.4.6-bin-hadoop2.7.tgz to ./spark-2.4.6-bin-hadoop2.7.tgz
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# tar zxf spark-2.4.6-bin-hadoop2.7.tgz
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# mv spark-2.4.6-bin-hadoop2.7 spark
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# cat commit_SHA1
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
b08c1be22eb51796fb58c3694e86e60f948337f6
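
Optionally, confirm that the unpacked Spark is the expected 2.4.6 build; spark-submit simply prints its version banner and exits.

$ ./spark/bin/spark-submit --version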

Prepare kylin.properties

Edit $KYLIN_HOME/conf/kylin.properties and add the following content.

kylin.metadata.url=kylin_default_instance@jdbc,url=jdbc:mysql://${MASTER_HOST}:3306/kylin,driverClassName=org.mariadb.jdbc.Driver,username=${USER_NAME},password=${PASSWORD}
kylin.env.zookeeper-connect-string=${MASTER_HOST}
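
For illustration only, a filled-in configuration might look like the following; the user name and password below are placeholders, and the host should be your own EMR master node.

# Example values only -- substitute your own EMR master host, user name and password
kylin.metadata.url=kylin_default_instance@jdbc,url=jdbc:mysql://ip-172-31-1-253.cn-northwest-1.compute.internal:3306/kylin,driverClassName=org.mariadb.jdbc.Driver,username=kylin_user,password=kylin_password
kylin.env.zookeeper-connect-string=ip-172-31-1-253.cn-northwest-1.compute.internal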

Prepare Metastore (optional, for test purposes only)

[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# mysql
Welcome to the MariaDB monitor.  Commands end with ; or \g.
Your MariaDB connection id is 70
Server version: 5.5.68-MariaDB MariaDB Server

Copyright (c) 2000, 2018, Oracle, MariaDB Corporation Ab and others.

Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.

MariaDB [(none)]> CREATE USER '${USER_NAME}'@'${MASTER_HOST}' IDENTIFIED BY '${PASSWORD}' ;
Query OK, 0 rows affected (0.00 sec)

MariaDB [(none)]> GRANT ALL PRIVILEGES ON *.* TO '${USER_NAME}'@'${MASTER_HOST}' WITH GRANT OPTION ;
Query OK, 0 rows affected (0.00 sec)

MariaDB [(none)]> FLUSH PRIVILEGES;
Query OK, 0 rows affected (0.00 sec)

MariaDB [(none)]> Bye

[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# mysql -u ${USER_NAME} -h ${MASTER_HOST}  -p
Enter password:
Welcome to the MariaDB monitor.  Commands end with ; or \g.
Your MariaDB connection id is 73
Server version: 5.5.68-MariaDB MariaDB Server

Copyright (c) 2000, 2018, Oracle, MariaDB Corporation Ab and others.

Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.

MariaDB [(none)]> create database kylin;
Query OK, 1 row affected (0.00 sec)

MariaDB [(none)]> show databases;
+--------------------+
| Database           |
+--------------------+
| information_schema |
| hive               |
| hue                |
| kylin              |
| mysql              |
| performance_schema |
+--------------------+
6 rows in set (0.00 sec)

MariaDB [(none)]> Bye
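
The interactive session above can also be scripted. A minimal sketch using the same ${USER_NAME}, ${PASSWORD} and ${MASTER_HOST} placeholders:

$ mysql -e "CREATE USER '${USER_NAME}'@'${MASTER_HOST}' IDENTIFIED BY '${PASSWORD}';"
$ mysql -e "GRANT ALL PRIVILEGES ON *.* TO '${USER_NAME}'@'${MASTER_HOST}' WITH GRANT OPTION; FLUSH PRIVILEGES;"
$ mysql -u ${USER_NAME} -h ${MASTER_HOST} -p -e "CREATE DATABASE kylin;"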

Replace jars under $KYLIN_HOME/spark/jars

The Spark binary we downloaded is built for Apache Hadoop 2.7, so replace the Hadoop-related jars with the EMR-provided ones.

// Step 1 : Add the JDBC driver (if you use MariaDB)
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# find /usr/lib -name "*mariadb*"
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# mkdir ext
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# cp /usr/lib/hive/lib/mariadb-connector-java.jar ext


// Step 2 : Replace Hadoop-related jars
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# rm -rf $SPARK_HOME/jars/hadoop-*.jar
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# cp /usr/lib/spark/jars/hadoop-*.jar $SPARK_HOME/jars/
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# cp /usr/lib/spark/jars/emr-spark-goodies.jar  $SPARK_HOME/jars/
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# cp /usr/lib/spark/jars/htrace-core4-4.1.0-incubating.jar $SPARK_HOME/jars
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# cp /usr/lib/hadoop-lzo/lib/hadoop-lzo-0.4.19.jar $SPARK_HOME/jars
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# cp /usr/lib/hadoop/lib/woodstox-core-5.0.3.jar $SPARK_HOME/jars/
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# cp /usr/lib/hadoop/lib/stax2-api-3.1.4.jar $SPARK_HOME/jars/
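
A quick sanity check after the copy: the Hadoop jars under $SPARK_HOME/jars should now carry the EMR build suffix (2.10.0-amzn-0 on this cluster) rather than 2.7.x.

$ ls $SPARK_HOME/jars | grep "^hadoop-"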

Start Kylin 

[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# sh bin/kylin.sh start
Retrieving hadoop conf dir...
KYLIN_HOME is set to /home/hadoop/apache-kylin-4.0.0-SNAPSHOT-bin
Retrieving hive dependency...
Retrieving hadoop conf dir...
Retrieving Spark dependency...
Start replacing hadoop jars under /home/hadoop/apache-kylin-4.0.0-SNAPSHOT-bin/spark/jars.
find: ‘/opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/../hadoop/’: No such file or directory
2.10.0-amzn-0.jar
Find platform specific jars: , will replace with these jars under /home/hadoop/apache-kylin-4.0.0-SNAPSHOT-bin/spark/jars.
Please confirm that the corresponding hadoop jars have been replaced. The automatic replacement program cannot be executed correctly.
Done hadoop jars replacement under /home/hadoop/apache-kylin-4.0.0-SNAPSHOT-bin/spark/jars.
Start to check whether we need to migrate acl tables
Not HBase metadata. Skip check.

A new Kylin instance is started by root. To stop it, run 'kylin.sh stop'
Check the log at /home/hadoop/apache-kylin-4.0.0-SNAPSHOT-bin/logs/kylin.log
Web UI is at http://ip-172-31-1-253.cn-northwest-1.compute.internal:7070/kylin
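
If the web UI does not come up, the startup log is the first place to look; the curl below should return an HTTP response once the instance is fully up.

$ tail -n 100 $KYLIN_HOME/logs/kylin.log
$ curl -I http://localhost:7070/kylin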

Modify $KYLIN_HOME/hadoop_conf/hive-site.xml (after the Kylin instance has started)

[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# ll hadoop_conf/
total 0
lrwxrwxrwx 1 root root 30 Jan  6 11:46 core-site.xml -> /etc/hadoop/conf/core-site.xml
lrwxrwxrwx 1 root root 30 Jan  6 11:46 hadoop-env.sh -> /etc/hadoop/conf/hadoop-env.sh
lrwxrwxrwx 1 root root 30 Jan  6 11:46 hdfs-site.xml -> /etc/hadoop/conf/hdfs-site.xml
lrwxrwxrwx 1 root root 28 Jan  6 11:46 hive-site.xml -> /etc/hive/conf/hive-site.xml
lrwxrwxrwx 1 root root 32 Jan  6 11:46 mapred-site.xml -> /etc/hadoop/conf/mapred-site.xml
lrwxrwxrwx 1 root root 31 Jan  6 11:46 ssl-client.xml -> /etc/hadoop/conf/ssl-client.xml
lrwxrwxrwx 1 root root 30 Jan  6 11:46 yarn-site.xml -> /etc/hadoop/conf/yarn-site.xml
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# rm hadoop_conf/hive-site.xml
rm: remove symbolic link 'hadoop_conf/hive-site.xml'? y
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# cp /etc/hive/conf/hive-site.xml hadoop_conf/

# Change the value of "hive.execution.engine" from "tez" to "mr"
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# vim hadoop_conf/hive-site.xml
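
For reference, the property being changed in hadoop_conf/hive-site.xml is the standard Hive setting below; only the value changes from tez to mr.

<property>
  <name>hive.execution.engine</name>
  <value>mr</value>
</property>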

Kylin on EMR 5.31 with working directory set to S3

EmrFileSystem not found

If you configure "kylin.env.hdfs-working-dir=s3://XXX/kylin/", you will face a ClassNotFoundException.

21/01/06 13:07:15 INFO JobWorker: Start running job.
21/01/06 13:07:15 INFO SparkApplication: Executor task org.apache.kylin.engine.spark.job.ResourceDetectBeforeCubingJob with args : {"distMetaUrl":"kylin_default_instance@hdfs,path=s3://xiaoxiang-yu/kylin-workingdir/121212/kylin_default_instance/learn_kylin/job_tmp/6af48fa3-35a7-4007-bb54-81a2b3ae9c67-00/meta","submitter":"ADMIN","dataRangeEnd":"1356998400000","targetModel":"0928468a-9fab-4185-9a14-6f2e7c74823f","dataRangeStart":"1325376000000","project":"learn_kylin","className":"org.apache.kylin.engine.spark.job.ResourceDetectBeforeCubingJob","segmentName":"20120101000000_20130101000000","parentId":"6af48fa3-35a7-4007-bb54-81a2b3ae9c67","jobId":"6af48fa3-35a7-4007-bb54-81a2b3ae9c67","outputMetaUrl":"kylin_default_instance@jdbc,url=jdbc:mysql://ip-172-31-1-253.cn-northwest-1.compute.internal:3306/kylin,driverClassName=org.mariadb.jdbc.Driver,username=xxyu,password=newpassword","segmentId":"25dd0543-f7e2-813a-610c-dbd892ad4c99","cuboidsNum":"9","cubeName":"kylin_sales_cube_S3","jobType":"BUILD","cubeId":"6415d237-6671-284c-b7bc-12c02ef124eb","segmentIds":"25dd0543-f7e2-813a-610c-dbd892ad4c99"}
21/01/06 13:07:15 INFO MetaDumpUtil: Ready to load KylinConfig from uri: kylin_default_instance@hdfs,path=s3://xiaoxiang-yu/kylin-workingdir/121212/kylin_default_instance/learn_kylin/job_tmp/6af48fa3-35a7-4007-bb54-81a2b3ae9c67-00/meta
21/01/06 13:07:15 ERROR SparkApplication: The spark job execute failed!
java.lang.RuntimeException: java.lang.ClassNotFoundException: Class com.amazon.ws.emr.hadoop.fs.EmrFileSystem not found
	at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2395)
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3256)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3288)
	at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:120)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3339)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3307)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:473)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:361)
	at org.apache.kylin.engine.spark.utils.MetaDumpUtil.loadKylinConfigFromHdfs(MetaDumpUtil.java:122)
	at org.apache.kylin.engine.spark.application.SparkApplication.execute(SparkApplication.java:225)
	at org.apache.kylin.engine.spark.application.SparkApplication.execute(SparkApplication.java:89)
	at org.apache.spark.application.JobWorker$$anon$2.run(JobWorker.scala:55)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: Class com.amazon.ws.emr.hadoop.fs.EmrFileSystem not found
	at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2299)
	at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2393)
	... 14 more

Copy emrfs-hadoop-assembly.jar

To fix this, copy the related jar from the EMR environment into Spark's jar directory.

[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# cp /usr/share/aws/emr/emrfs/lib/emrfs-hadoop-assembly-2.43.0.jar $SPARK_HOME/jars/
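
The assembly version differs between EMR releases, so it is safer to locate the jar first instead of hard-coding 2.43.0.

$ find /usr/share/aws/emr/emrfs/lib -name "emrfs-hadoop-assembly-*.jar"
$ cp /usr/share/aws/emr/emrfs/lib/emrfs-hadoop-assembly-*.jar $SPARK_HOME/jars/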

Screenshots

  • Monitor Pages

  • Sparder (Query Engine) UI


Kylin on EMR 6.0.0

Each step is the same as for EMR 5.31 except for "Replace jars under $KYLIN_HOME/spark/jars".
[hadoop@ip-172-31-8-116 apache-kylin-4.0.0-SNAPSHOT-bin]$ hadoop version
Hadoop 3.2.1-amzn-0
Source code repository git@aws157git.com:/pkg/Aws157BigTop -r 702dcbb487699cf833043bee677ea99c0136673e
Compiled by ec2-user on 2020-02-19T04:10Z
Compiled with protoc 2.5.0
From source with checksum d467a0d98b48769d63fc56b247d9b7e1
This command was run using /usr/lib/hadoop/hadoop-common-3.2.1-amzn-0.jar


Replace jars under $KYLIN_HOME/spark/jars

// Step 0 : Add the JDBC driver (if you use MariaDB)
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# find /usr/lib -name "*mariadb*"
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# mkdir ext
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# cp /usr/lib/hive/lib/mariadb-connector-java.jar ext


// Step 1 : Replace Hadoop-related jars
[hadoop@ip-172-31-8-116 apache-kylin-4.0.0-SNAPSHOT-bin]$ cp commons-configuration-1.10.jar lib/
[hadoop@ip-172-31-8-116 apache-kylin-4.0.0-SNAPSHOT-bin]$ rm -rf $SPARK_HOME/jars/hadoop-*.jar
[hadoop@ip-172-31-8-116 apache-kylin-4.0.0-SNAPSHOT-bin]$ cp /usr/lib/spark/jars/hadoop-*.jar $SPARK_HOME/jars/
[hadoop@ip-172-31-8-116 apache-kylin-4.0.0-SNAPSHOT-bin]$ cp /usr/lib/spark/jars/emr-spark-goodies.jar  $SPARK_HOME/jars/
[hadoop@ip-172-31-8-116 apache-kylin-4.0.0-SNAPSHOT-bin]$ cp /usr/lib/spark/jars/htrace-core4-4.1.0-incubating.jar $SPARK_HOME/jars/
[hadoop@ip-172-31-8-116 apache-kylin-4.0.0-SNAPSHOT-bin]$ cp /usr/lib/hadoop/lib/woodstox-core-5.0.3.jar $SPARK_HOME/jars/
[hadoop@ip-172-31-8-116 apache-kylin-4.0.0-SNAPSHOT-bin]$ cp /usr/lib/spark/jars/commons-configuration2-2.1.1.jar $SPARK_HOME/jars/
[hadoop@ip-172-31-8-116 apache-kylin-4.0.0-SNAPSHOT-bin]$ cp /usr/lib/spark/jars/re2j-1.1.jar $SPARK_HOME/jars/
[root@ip-172-31-1-253 apache-kylin-4.0.0-SNAPSHOT-bin]# cp /usr/lib/hadoop/lib/stax2-api-3.1.4.jar $SPARK_HOME/jars/
[hadoop@ip-172-31-8-116 apache-kylin-4.0.0-SNAPSHOT-bin]$ cp /usr/lib/spark/jars/hive-exec-1.2.1-spark2-amzn-1.jar $SPARK_HOME/jars/
[hadoop@ip-172-31-8-116 apache-kylin-4.0.0-SNAPSHOT-bin]$ rm spark/jars/hive-exec-1.2.1.spark2.jar


// Step 2 : Change the value of "hive.execution.engine" from "tez" to "mr"
[hadoop@ip-172-31-8-116 apache-kylin-4.0.0-SNAPSHOT-bin]$ rm hadoop_conf/hive-site.xml
[hadoop@ip-172-31-8-116 apache-kylin-4.0.0-SNAPSHOT-bin]$ cp /etc/hive/conf/hive-site.xml hadoop_conf/
[hadoop@ip-172-31-8-116 apache-kylin-4.0.0-SNAPSHOT-bin]$ vim hadoop_conf/hive-site.xml 

// Step 3 : Add LZO jars
[hadoop@ip-172-31-8-116 apache-kylin-4.0.0-SNAPSHOT-bin]$ cp /usr/lib/hadoop-lzo/lib/hadoop-lzo-0.4.19.jar $SPARK_HOME/jars
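
As with EMR 5.31, a quick check that $SPARK_HOME/jars now contains the EMR-built jars (3.2.1-amzn-0 Hadoop jars on this cluster) helps catch a missed copy.

$ ls $SPARK_HOME/jars | grep -E "^(hadoop-|hive-exec)"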




