-
Notifications
You must be signed in to change notification settings - Fork 0
/
dynamic_torque.conf
221 lines (170 loc) · 8.25 KB
/
dynamic_torque.conf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
[global]
# pool_min_num is the min number of cores that should be launched (only applied to passive mode)
#pool_min_num: 1
# pool_max_num is the max number of cores that can be used (only applied to passive mode)
#pool_max_num: 40
# max_idle_time (in seconds) the worker node will be deleted if it has been idle for this amount of time
# default is 600
#max_idle_time: 600
# max_down_time (in seconds) the worker node will be deleted if it has been down for this amount of time
# default is 480
#max_down_time: 480
# max_inaccessible_time (in seconds) it takes some time for a VM to be built in the cloud, from when a request is sent to the cloud
# if the VM is still not in a useful state after this amount of time, it will be destroyed and another attempt will be made
# default is 1200
#max_inaccessible_time: 1200
# job_poller_interval (in seconds) the time interval at which Dynamic Torque queries Torque
# default is 10
#job_poller_interval: 10
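# cloud_poller_interval (in seconds) the time interval at which Dynamic Torque queries the cloud for instance states
# default is 20 (NOTE(review): the commented example below shows 10, which is not the documented default - confirm)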
#cloud_poller_interval: 10
# persistent_file_location is the file that keeps the list of current worker nodes;
# Dynamic Torque uses it to pick up previous worker nodes after a restart
# it is in the same format as Torque's nodes file
# default is /var/lib/dynamictorque/nodes
#persistent_file_location: /var/lib/dynamictorque/nodes
# a script to be executed after adding a node to Torque (just before setting it to online)
# it takes two parameters which are the hostname of the VM and the IP of the VM
#post_add_node_command: /the/path/some.sh {0} {1}
# a script to be executed after removing a node from Torque (after it is destroyed from the cloud)
# it takes two parameters which are the hostname of the VM and the IP of the VM
#post_remove_node_command: /the/path/some.sh {0} {1}
# a script to be executed after cloud provisioning is finished (VM state in the cloud becomes ACTIVE)
# it takes three parameters which are the hostname of the VM, the IP of the VM and its name in dynamic torque
#post_vm_provision_command: /the/path/some.sh {0} {1} {2}
# a script to be executed after a VM is destroyed from the cloud
# it takes three parameters which are the hostname of the VM, the IP of the VM and its name in dynamic torque
#post_vm_destroy_command: /the/path/some.sh {0} {1} {2}
# the command to query jobs in the queue
# it must return data in XML format (-x)
#qstat_command: /usr/bin/qstat -x -t
# the command to run pbsnodes with different options
# it takes two parameters, the option and the hostname of the VM
# -x query node's details
# -o hold node (set it to offline)
# -c release node (clear OFFLINE)
# it must return data in XML format (-x)
#pbsnodes_command: /usr/bin/pbsnodes {0} {1}
# the command to add a new node to torque
# it takes one parameter which is the hostname of the VM
#add_node_command: /usr/bin/qmgr -c "create node {0}"
# the command to check node state in maui
# it takes one parameter which is the hostname of the VM
#check_node_command: /usr/bin/checknode {0}
# the command to delete node from torque
# it takes one parameter which is the hostname of the VM
#remove_node_command: /usr/bin/qmgr -c "delete node {0}"
# the command to set a property to node in torque
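# it takes three parameters: {0} is the hostname of the VM, {1} is the name of the property, {2} is the value of the property
# NOTE(review): the example template below also contains a fourth placeholder {3} that is not described here - confirm its meaning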
#set_node_command: /usr/bin/qmgr -c "set node {0} {1} {2} {3}"
# the command to get jobs' priorities
#diagnose_p_command: /usr/bin/diagnose -p
# the command to set account_string to a node
#setres_command: /usr/bin/setres {0} {1} {2}
# the command to unset account_string in a node
#releaseres_command: /usr/bin/releaseres {0}
# at the moment only OpenStack is supported
[cloud]
# username
cloud_username: username@userorg
# password
cloud_password: abcdefg
# tenant name
cloud_tenant_name: CoEPP-Tier3
# image uuid
cloud_image_uuid: 13afb676-9d8a-4445-877b-6f5b0e40eada
# keystone URL
cloud_auth_url: https://keystone.rc.nectar.org.au:5000/v2.0/
# SSH key name
cloud_key_name: torque
# security group name
cloud_security_groups: default
# private key file path
cloud_private_key_location: /opt/torque.pem
# the default availability zone to launch new VMs
cloud_availability_zone: melbourne-qh2
# prefix of the VM's name
cloud_vm_prefix: pbsdynwn-
# deprecated, leave it blank
cloud_vm_init_file:
# user data file, used when launching a new VM
cloud_vm_userdata_file: /opt/userdata.sh
# after vm contextualization finishes, this file has to be created
# so that dynamic torque can check the existence of this file
# to determine if contextualization is finished
# we recommend cloud-init for contextualization
#cloud_vm_init_finish_file:
# the user to ssh to the VMs to check existence of cloud_vm_init_finish_file
#cloud_shell_user: root
# the number of static WNs
# static WNs stay up even when no jobs are running
# the value is in the form of x[:location_property]+...
# x:somewhere means a total of x static VMs in 'somewhere'
# if 'somewhere' is not present, it will be in the default zone
static_worker_nodes:
# the number of dynamic WNs
# dynamic WNs will be shut down if no jobs are running
# the value is an integer
number_of_dynamic_worker_nodes:
# the flavor of a worker node instance
# the default value is m1.small
worker_node_flavor: m1.small
# Another config file that keeps information about other cloud resources, e.g. for other tenants
# Dynamic Torque uses these credentials to launch special worker nodes exclusively for these tenants
cloud_resources_config: cloud_resources.conf
[torque]
# only monitor jobs submitted to these queues
# if not set, all queues will be monitored
torque_queue_to_monitor: long,short
# node property value that will be added to the node when a node is added to Torque
# it is normally used to associate the node with a specific queue via "set queue queuename resources_default.neednodes = property_value"
node_property: PRODUCTION
# a list of mappings between node location property and availability zone name; this list is mandatory even if only one AZ is used
# syntax: [LOCATION_PROPERTY_NAME]:[AZ_NAME],...
node_location_property: MEL:melbourne-qh2,MON:monash-01
# if location is not specified in a job, the value of default_location tells the system in which AZ to launch the new WN
# 0: use the one in cloud_availability_zone of [cloud] section
# 1: randomly choose one from node_location_property
# the default value is 0
default_location: 0
# If unset, DT reads in all jobs to process.
# If set, DT reads in only the specified number of jobs. There may be thousands of jobs in the queue,
# but with a limited number of cores there is no need to read in all of them, since no more WNs can be started anyway.
#max_number_of_jobs:
[logging]
# log_level specifies how much information from VM Pool to log.
#
# Choose from VERBOSE, DEBUG, INFO, WARNING, ERROR and CRITICAL
# The default is INFO
#
# WARNING!!! - DO NOT USE VERBOSE WITH LARGE NUMBERS OF JOBS OR DEBUG
# WITH VERY LARGE NUMBERS OF JOBS
#log_level: INFO
# log_format is the format of the logging output. It is a string in the
# Python logging module format, as specified in its module
# documentation here:
# http://docs.python.org/library/logging.html#formatter
#
#
#
# The default is "%(asctime)s - %(levelname)s - %(threadName)s - %(message)s",
# which yields messages like the following:
# "2010-07-13 11:02:08,722 - DEBUG - MainThread - message"
#log_format: %(asctime)s - %(levelname)s - %(threadName)s - %(message)s
# log_location specifies where to put the VM Pool log file. If left
# blank, logging will just be sent to standard out
#
#log_location: /var/log/dynamictorque/dynamictorque.log
# log_stdout specifies whether to log to standard out. If set to true, this
# will log to stdout in addition to logging to a file specified
# in log_location. If set to false, VM Pool won't log to
# stdout, even if there is no value specified for log_location
#
# The default is false
#log_stdout: false
# log_max_size is the maximum filesize in Bytes for your log file.
#
# The default is unlimited file size. This allows you to use logrotate
# if you prefer to use it to manage the rotation of your log files.
#log_max_size: 2097152