#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
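# Overview (summary added for orientation): this file stands up a single-node
# Spark cluster (spark, spark-worker), a Beam Python SDK worker pool
# (beam-python-workers), a Beam job server for Spark (beam-job-server), and a
# one-shot container (beam-job) that submits the wordcount pipeline. A usage
# sketch follows at the end of the file.
#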
version: '3'
services:
  # Spark master node
  spark:
    image: docker.io/bitnami/spark:3.1.2
    environment:
      - SPARK_MODE=master
    volumes:
      # Optionally, mount Spark conf for easier configuration
      - ./spark-conf/cluster.conf:/opt/bitnami/spark/conf/spark-defaults.conf
    ports:
      # Spark master UI
      - "8080:8080"
    depends_on:
      - spark-worker

  # Spark worker node
  # Note:
  # Unless the Python worker pool runs inside the same container, we cannot use
  # more than one Spark worker, as the required port range 8100-8200 can't be
  # mapped more than once; a sketch of the failure mode follows below.
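  # For example (this command is not part of the setup itself):
  #   docker-compose up --scale spark-worker=2
  # The second replica would fail to start, because host ports 8100-8200 are
  # already published by the first one and Docker allows only one binding per
  # host port.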
  spark-worker:
    image: docker.io/bitnami/spark:3.1.2
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark:7077
      # By default, Beam expects the workers of the SDK worker pool to connect
      # to a Spark worker on `localhost`. When the worker pool runs in Docker
      # on a Mac, this isn't possible due to the lack of `host` networking.
      # With BEAM_WORKER_POOL_IN_DOCKER_VM=1, Beam uses `host.docker.internal`
      # to communicate via the Docker host instead.
      - BEAM_WORKER_POOL_IN_DOCKER_VM=1
      # DOCKER_MAC_CONTAINER=1 limits the ports a Spark worker uses to
      # communicate with SDK workers to the range 8100-8200 instead of random
      # ports. Ports in that range are used in a round-robin fashion and have
      # to be published.
      - DOCKER_MAC_CONTAINER=1
    volumes:
      # Optionally, mount Spark conf for easier configuration
      - ./spark-conf/cluster.conf:/opt/bitnami/spark/conf/spark-defaults.conf
      # Mount the work directory to a local directory for easy access to logs
      - ./work/spark:/opt/bitnami/spark/work
    ports:
      # Spark worker UI
      - "8081:8081"
      # Ports used by SDK workers to communicate with the Spark worker
      - "8100-8200:8100-8200"
    links:
      # Not strictly necessary: we could instead configure the job with
      # --environment_config=beam-python-workers:50000. The alias just
      # demonstrates that only the Spark worker requests a Python worker
      # from the pool.
      - "beam-python-workers:pysdk"
    depends_on:
      - beam-python-workers

  # Beam Python SDK harness / worker pool
  beam-python-workers:
    image: apache/beam_python3.8_sdk:2.41.0
    command: ["--worker_pool"]
    volumes:
      # Mount /tmp to access result files
      - ./work/pythonsdk:/tmp
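      # With the /tmp mount above, files the pipeline writes to /tmp/result
      # show up on the host under ./work/pythonsdk, e.g. result-00000-of-00001
      # (Beam's default <prefix>-<shard>-of-<total> shard naming; the exact
      # file names may differ).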
  # Beam job server for Apache Spark
  # Note: The Spark dependency of the job server must match the cluster's Spark version.
  beam-job-server:
    # Until the fix for https://github.com/apache/beam/issues/21092 is released,
    # you have to build the image yourself:
    #   ./gradlew :runners:spark:3:job-server:container:docker
    image: apache/beam_spark3_job_server:latest
    command: ["--spark-master-url=spark://spark:7077", "--clean-artifacts-per-job=true"]
    ports:
      # Spark job UI on the driver
      - "4040:4040"
      # Job endpoint
      - "8099:8099"
      # Artifact endpoint
      - "8098:8098"
    depends_on:
      - spark

  beam-job:
    image: apache/beam_python3.8_sdk:2.41.0
    entrypoint: [
      "python",
      "/tmp/app/wordcount.py",
      "--runner=PortableRunner",
      "--job_endpoint=beam-job-server:8099",
      "--artifact_endpoint=beam-job-server:8098",
      "--environment_type=EXTERNAL",
      # Used by Spark workers to connect to the SDK worker pool
      "--environment_config=pysdk:50000",
      # Output files will be written on the SDK worker pool
      "--output=/tmp/result"
    ]
    volumes:
      - ./wordcount.py:/tmp/app/wordcount.py
    depends_on:
      - beam-job-server
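
# Usage sketch (assumes the job-server image was built locally as noted above;
# the result file pattern is an assumption based on Beam's default shard naming):
#   docker-compose up -d spark spark-worker beam-python-workers beam-job-server
#   docker-compose up beam-job    # submits wordcount.py; wait for it to exit
#   cat work/pythonsdk/result-*   # inspect the word counts on the host
#   docker-compose down           # tear everything down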