Official site: http://atlas.apache.org
Apache Atlas provides open metadata management and governance capabilities for organizations to build a catalog of their data assets, classify and govern those assets, and offer collaboration features around them for data scientists, analysts, and data governance teams.
Data governance is a tedious and complex process, so a system like Atlas is needed to help data better serve the business.
Environment
Installation
Modify the configuration files
File: pom.xml
```xml
<!-- Keep these consistent with the CDH component versions -->
<hadoop.version>3.0.0</hadoop.version>
<hbase.version>2.1.0</hbase.version>
<kafka.version>2.1.0</kafka.version>
<zookeeper.version>3.4.5</zookeeper.version>
```
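If you are not sure which versions your CDH cluster ships, you can check from any cluster node before editing the pom. A quick sanity check, assuming the CDH client binaries are on the PATH:

```bash
# Print the Hadoop and HBase versions bundled with this CDH installation
hadoop version | head -1
hbase version 2>&1 | head -1
```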
File: apache-atlas-2.0.0-sources\apache-atlas-sources-2.0.0\distro\src\conf\atlas-application.properties
```properties
atlas.graph.storage.backend=hbase2
atlas.graph.storage.hbase.table=apache_atlas_janus
# ZooKeeper quorum address
atlas.graph.storage.hostname=cdh6-1:2181,cdh6-2:2181,cdh6-3:2181
atlas.graph.storage.hbase.regions-per-server=1
atlas.graph.storage.lock.wait-time=10000
atlas.EntityAuditRepository.impl=org.apache.atlas.repository.audit.HBaseBasedAuditRepository
# Graph Search Index
atlas.graph.index.search.backend=solr
#Solr
#Solr cloud mode properties
atlas.graph.index.search.solr.mode=cloud
# ZooKeeper address for Solr Cloud
atlas.graph.index.search.solr.zookeeper-url=cdh6-1:2181,cdh6-2:2181,cdh6-3:2181/solr
atlas.graph.index.search.solr.zookeeper-connect-timeout=60000
atlas.graph.index.search.solr.zookeeper-session-timeout=60000
atlas.graph.index.search.solr.wait-searcher=true
# Solr-specific configuration property
atlas.graph.index.search.max-result-set-size=150
######### Notification Configs #########
atlas.notification.embedded=false
atlas.kafka.data=${sys:atlas.home}/data/kafka
# ZooKeeper address for Kafka
atlas.kafka.zookeeper.connect=cdh6-1:2181,cdh6-2:2181,cdh6-3:2181
atlas.kafka.bootstrap.servers=cdh6-3:9092,cdh6-4:9092,cdh6-5:9092
atlas.kafka.zookeeper.session.timeout.ms=4000
atlas.kafka.zookeeper.connection.timeout.ms=2000
atlas.kafka.zookeeper.sync.time.ms=20
atlas.kafka.auto.commit.interval.ms=1000
atlas.kafka.hook.group.id=atlas
atlas.kafka.enable.auto.commit=true
atlas.kafka.auto.offset.reset=earliest
atlas.kafka.session.timeout.ms=30000
atlas.kafka.offsets.topic.replication.factor=1
atlas.kafka.poll.timeout.ms=1000
atlas.notification.create.topics=true
atlas.notification.replicas=1
atlas.notification.topics=ATLAS_HOOK,ATLAS_ENTITIES
atlas.notification.log.failed.messages=true
atlas.notification.consumer.retry.interval=500
atlas.notification.hook.retry.interval=1000
######### Security Properties #########
# SSL config
atlas.enableTLS=false
atlas.authentication.method.kerberos=false
atlas.authentication.method.file=true
#### ldap.type= LDAP or AD
atlas.authentication.method.ldap.type=none
#### user credentials file
atlas.authentication.method.file.filename=${sys:atlas.home}/conf/users-credentials.properties
######### Server Properties #########
atlas.rest.address=http://localhost:21000
# If enabled and set to true, this will run setup steps when the server starts
#atlas.server.run.setup.on.start=false
######### Entity Audit Configs #########
atlas.audit.hbase.tablename=apache_atlas_entity_audit
atlas.audit.zookeeper.session.timeout.ms=1000
atlas.audit.hbase.zookeeper.quorum=localhost:2181
######### High Availability Configuration ########
atlas.server.ha.enabled=false
######### Atlas Authorization #########
atlas.authorizer.impl=simple
atlas.authorizer.simple.authz.policy.file=atlas-simple-authz-policy.json
######### CSRF Configs #########
atlas.rest-csrf.enabled=true
atlas.rest-csrf.browser-useragents-regex=^Mozilla.*,^Opera.*,^Chrome.*
atlas.rest-csrf.methods-to-ignore=GET,OPTIONS,HEAD,TRACE
atlas.rest-csrf.custom-header=X-XSRF-HEADER
#Set to false to disable gremlin search.
atlas.search.gremlin.enable=false
# Hive hook integration (these can go at the bottom of the file)
# Every operation in Hive is picked up by the hook, which emits an event to the
# Kafka topic Atlas subscribes to; Atlas then generates and stores the metadata
######### Hive Hook Configs #######
atlas.hook.hive.synchronous=false
atlas.hook.hive.numRetries=3
atlas.hook.hive.queueSize=10000
atlas.cluster.name=primary
# Other settings changed for the integration
atlas.rest.address=http://cdh01.cm:21000
# Web address/port. Changing this value does not take effect; it defaults to local port 21000, which conflicts with Impala
atlas.server.run.setup.on.start=false
# If enabled and set to true, setup steps will run when the server starts
atlas.audit.hbase.zookeeper.quorum=cdh6-1:2181,cdh6-2:2181,cdh6-3:2181
```
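Per the Hive hook notes above, every Hive operation becomes an event on the ATLAS_HOOK topic. Once the hook is wired up (see the Hive integration steps below), a quick way to confirm events are flowing is to watch that topic while running a DDL statement in Hive. A verification sketch using the broker list configured above:

```bash
# Watch hook events; run e.g. CREATE TABLE ... in another Hive session
kafka-console-consumer \
  --bootstrap-server cdh6-3:9092,cdh6-4:9092,cdh6-5:9092 \
  --topic ATLAS_HOOK \
  --from-beginning
```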
File: apache-atlas-2.0.0-sources\apache-atlas-sources-2.0.0\distro\src\conf\atlas-env.sh
```bash
# indicates whether or not a local instance of HBase should be started for Atlas
export MANAGE_LOCAL_HBASE=false
# indicates whether or not a local instance of Solr should be started for Atlas
export MANAGE_LOCAL_SOLR=false
# indicates whether or not cassandra is the embedded backend for Atlas
export MANAGE_EMBEDDED_CASSANDRA=false
# indicates whether or not a local instance of Elasticsearch should be started for Atlas
export MANAGE_LOCAL_ELASTICSEARCH=false
export HBASE_CONF_DIR=/usr/local/src/atlas/apache-atlas-2.0.0/conf/hbase/conf
```
Build and install
```bash
export MAVEN_OPTS="-Xms2g -Xmx2g"
mvn clean -DskipTests install
mvn clean -DskipTests package -Pdist
```
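If the build succeeds, the packaged distributions should land under distro/target/ in the source tree (the usual Atlas source layout; adjust if your checkout differs):

```bash
# The -Pdist profile writes the binary tarball here
ls distro/target/apache-atlas-2.0.0-*.tar.gz
```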
Using a pre-built package
Contact this site or send an email to request the files.
Install
```bash
mkdir /usr/local/src/atlas
cd /usr/local/src/atlas
# Copy apache-atlas-2.0.0-bin.tar.gz to the installation directory
tar -zxvf apache-atlas-2.0.0-bin.tar.gz
cd apache-atlas-2.0.0/conf
# The configuration here was modified before compiling. If you downloaded a
# pre-built package instead, apply the same changes to the matching files under conf/
```
Integrate Solr
The Solr service here was added directly through CDH.
Copy conf/solr under the Atlas directory to /opt/cloudera/parcels/CDH/etc/solr/conf.dist/:
```bash
# Only a single Solr instance is used here
scp -r /usr/local/src/atlas/apache-atlas-2.0.0/conf/solr/* root@172.24.10.22:/opt/cloudera/parcels/CDH/etc/solr/conf.dist/
```
Switch to the solr user and create the collections:
```bash
# Each of the following create commands is a single line
/opt/cloudera/parcels/CDH/lib/solr/bin/solr create -c vertex_index -d /opt/cloudera/parcels/CDH/etc/solr/conf.dist/ -shards 1 -replicationFactor 1
/opt/cloudera/parcels/CDH/lib/solr/bin/solr create -c edge_index -d /opt/cloudera/parcels/CDH/etc/solr/conf.dist/ -shards 1 -replicationFactor 1
/opt/cloudera/parcels/CDH/lib/solr/bin/solr create -c fulltext_index -d /opt/cloudera/parcels/CDH/etc/solr/conf.dist/ -shards 1 -replicationFactor 1
```
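To confirm the three collections exist, you can query the Solr Collections API (assuming Solr listens on its default port 8983):

```bash
# vertex_index, edge_index and fulltext_index should all be listed
curl 'http://localhost:8983/solr/admin/collections?action=LIST&wt=json'
```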
Integrate HBase
```bash
# Link the cluster's HBase configuration into the Atlas conf directory
ln -s /etc/hbase/conf/ /usr/local/src/atlas/apache-atlas-2.0.0/conf/hbase/
```
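The symlink lets Atlas find the cluster's hbase-site.xml through the HBASE_CONF_DIR exported in atlas-env.sh. After Atlas starts for the first time, its JanusGraph and audit tables should appear in HBase; a quick check:

```bash
# apache_atlas_janus and apache_atlas_entity_audit should show up in the list
echo "list" | hbase shell
```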
Integrate Kafka
```bash
kafka-topics --zookeeper cdh6-1:2181,cdh6-2:2181,cdh6-3:2181 --create --replication-factor 3 --partitions 3 --topic _HOATLASOK
kafka-topics --zookeeper cdh6-1:2181,cdh6-2:2181,cdh6-3:2181 --create --replication-factor 3 --partitions 3 --topic ATLAS_ENTITIES
kafka-topics --zookeeper cdh6-1:2181,cdh6-2:2181,cdh6-3:2181 --create --replication-factor 3 --partitions 3 --topic ATLAS_HOOK
```
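Verify the topics were created against the same ZooKeeper quorum:

```bash
# ATLAS_HOOK and ATLAS_ENTITIES should appear in the output
kafka-topics --zookeeper cdh6-1:2181,cdh6-2:2181,cdh6-3:2181 --list
```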
Integrate Hive
```bash
# You must run zip from this directory so the file is added at the top level of the jar
cd /usr/local/src/atlas/apache-atlas-2.0.0/conf
zip -u /usr/local/src/atlas/apache-atlas-2.0.0/hook/hive/atlas-plugin-classloader-2.0.0.jar atlas-application.properties
```
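To confirm the properties file really landed at the top level of the jar (and not under a subdirectory), list the archive:

```bash
# atlas-application.properties must appear without any leading path
unzip -l /usr/local/src/atlas/apache-atlas-2.0.0/hook/hive/atlas-plugin-classloader-2.0.0.jar | grep atlas-application.properties
```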
Modify hive-site.xml: in the CDH Hive configuration, search for the file name and add this XML snippet:
```xml
<property>
<name>hive.exec.post.hooks</name>
<value>org.apache.atlas.hive.hook.HiveHook</value>
</property>
```
Modify the Gateway Client Environment Advanced Configuration Snippet (Safety Valve) for hive-env.sh:
```bash
HIVE_AUX_JARS_PATH=/usr/local/src/atlas/apache-atlas-2.0.0/hook/hive
```
Set HIVE_AUX_JARS_PATH:
```
/usr/local/src/atlas/apache-atlas-2.0.0/hook/hive
```
Modify the HiveServer2 Advanced Configuration Snippet (Safety Valve) for hive-site.xml:
```xml
<property>
<name>hive.exec.post.hooks</name>
<value>org.apache.atlas.hive.hook.HiveHook</value>
</property>
<property>
<name>hive.reloadable.aux.jars.path</name>
<value>/usr/local/src/atlas/apache-atlas-2.0.0/hook/hive</value>
</property>
```
Modify the HiveServer2 Environment Advanced Configuration Snippet:
```bash
HIVE_AUX_JARS_PATH=/usr/local/src/atlas/apache-atlas-2.0.0/hook/hive
```
Distribute the configured Atlas package to each Hive node, then restart the cluster:
```bash
scp -r /usr/local/src/atlas/apache-atlas-2.0.0 root@cdh02.cm:/usr/local/src/atlas/
# Restart the cluster
# Copy the Atlas configuration file to /etc/hive/conf on every node in the cluster
scp /usr/local/src/atlas/apache-atlas-2.0.0/conf/atlas-application.properties root@cdh6-1:/etc/hive/conf
scp /usr/local/src/atlas/apache-atlas-2.0.0/conf/atlas-application.properties root@cdh6-2:/etc/hive/conf
scp /usr/local/src/atlas/apache-atlas-2.0.0/conf/atlas-application.properties root@cdh6-3:/etc/hive/conf
scp /usr/local/src/atlas/apache-atlas-2.0.0/conf/atlas-application.properties root@cdh6-4:/etc/hive/conf
scp /usr/local/src/atlas/apache-atlas-2.0.0/conf/atlas-application.properties root@cdh6-5:/etc/hive/conf
```
Start
```bash
# Start
./bin/atlas_start.py
# Stop: ./bin/atlas_stop.py
# Then visit http://localhost:21000
```
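Besides the web UI, you can check the server over its REST API; the default file-based credentials are admin/admin unless you changed users-credentials.properties:

```bash
# A JSON version payload means the server is up and authentication works
curl -u admin:admin http://localhost:21000/api/atlas/admin/version
```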
Importing Hive metadata into Atlas
Add the Hive environment variables on all nodes:
```bash
vim /etc/profile
export HIVE_HOME=/opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/lib/hive
export HIVE_CONF_DIR=/etc/hive/conf
export PATH=$HIVE_HOME/bin:$PATH
source /etc/profile
```
Run the Atlas import script
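The import script shipped with the Atlas distribution is import-hive.sh; depending on how the package was built it sits under hook-bin/ or bin/. A typical invocation (it prompts for the Atlas username and password, admin/admin by default) might look like this:

```bash
cd /usr/local/src/atlas/apache-atlas-2.0.0
# Import existing Hive databases and tables into Atlas
# (use ./bin/import-hive.sh if your layout places the script there)
./hook-bin/import-hive.sh
```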
