
Commit 2af8d35

Merge branch 'develop' into AN-146-batch-vm-cost
2 parents 26f94ed + 4d01b91

22 files changed: +98 −98 lines

CHANGELOG.md

Lines changed: 10 additions & 1 deletion
@@ -35,6 +35,7 @@ be found [here](https://cromwell.readthedocs.io/en/stable/backends/HPC/#optional
 - Fixed `google_project` and `google_compute_service_account` workflow options not taking effect when using GCP Batch backend
 - Added a way to use a custom LogsPolicy for job execution. Setting `backend.providers.batch.config.batch.logs-policy` to "CLOUD_LOGGING" (the default) keeps the current behavior; setting it to "PATH" saves the logs to the mounted disk, and at the end of execution this log file is copied to the Google Cloud Storage bucket as "task.log".
 - When "CLOUD_LOGGING" is used, many more Cromwell / WDL labels for workflow, root workflow, call, shard etc. are now assigned to GCP Batch log entries.
+- Fixed subnet selection for networks that use custom subnet creation

 ### Improved handling of Life Sciences API quota errors

@@ -54,7 +55,9 @@ The `IX_WORKFLOW_STORE_ENTRY_WS` index is removed from `WORKFLOW_STORE_ENTRY`.

 The index had low cardinality and workflow pickup is faster without it. Migration time depends on workflow store size, but should be very fast for most installations. Terminal workflows are removed from the workflow store, so only running workflows contribute to the cost.

-### Bug fixes
+### Bug fixes and small changes
+
+* Changed default boot disk size from 10GB to 20GB in PipelinesAPI and Google Batch backends

 #### Improved `size()` function performance on arrays

@@ -88,6 +91,12 @@ The config keys `services.HealthMonitor.config.check-gcs` and `.gcs-bucket-to-ch
 Code relating to the Google Genomics API (aka `v1Alpha`) has been removed since Google has entirely disabled that service.
 Cloud Life Sciences (aka `v2Beta`, deprecated) and Google Batch (aka `batch`, recommended) remain the two viable GCP backends.

+#### GPU changes
+* Removed support for Nvidia K80 "Kepler" GPUs, which were [discontinued by GCP in May 2024](https://cloud.google.com/compute/docs/eol/k80-eol).
+* Default GPU on Life Sciences is now the Nvidia P100.
+* Default GPU on GCP Batch is now the Nvidia T4.
+* Updated runtime attributes documentation to clarify that the `nvidiaDriverVersion` key is ignored on GCP Batch.
+
 ## 87 Release Notes

 ### GCP Batch
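
For context, the `logs-policy` entry above names a key under the Batch provider's configuration. A minimal sketch of that stanza, with illustrative values only:

```
backend.providers.batch.config.batch {
  # "CLOUD_LOGGING" (the default) keeps job logs in Google Cloud Logging;
  # "PATH" writes them to the mounted disk, and the file is copied to the
  # workflow's GCS bucket as "task.log" when the job finishes.
  logs-policy = "PATH"
}
```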

centaur/src/main/resources/standardTestCases/docker_size_dockerhub.test

Lines changed: 2 additions & 2 deletions
@@ -11,8 +11,8 @@ files {

 metadata {
   status: Succeeded
-  "outputs.docker_size_dockerhub.large_dockerhub_image_with_hash.bootDiskSize": 17
-  "outputs.docker_size_dockerhub.large_dockerhub_image_with_tag.bootDiskSize": 17
+  "outputs.docker_size_dockerhub.large_dockerhub_image_with_hash.bootDiskSize": 27
+  "outputs.docker_size_dockerhub.large_dockerhub_image_with_tag.bootDiskSize": 27
 }

 workflowType: WDL

centaur/src/main/resources/standardTestCases/docker_size_gcr.test

Lines changed: 2 additions & 2 deletions
@@ -11,8 +11,8 @@ files {

 metadata {
   status: Succeeded
-  "outputs.docker_size_gcr.large_gcr_image_with_hash.bootDiskSize": 17
-  "outputs.docker_size_gcr.large_gcr_image_with_tag.bootDiskSize": 17
+  "outputs.docker_size_gcr.large_gcr_image_with_hash.bootDiskSize": 27
+  "outputs.docker_size_gcr.large_gcr_image_with_tag.bootDiskSize": 27
 }

 workflowType: WDL
Lines changed: 8 additions & 3 deletions
@@ -1,14 +1,19 @@
 name: gpu_cuda_image
 testFormat: workflowsuccess
 backends: [Papi, GCPBATCH]
-ignore: true

 files {
   workflow: gpu_on_papi/gpu_cuda_image.wdl
 }

+# As of November 2024, GCP Batch was using driver 550 and Life Sciences 535.
+# Neither was on the 418 version that used to be specified in this test.
+#
+# On Life Sciences, the driver version seems to be ignored outright by the API.
+#
+# In Batch it is not wired through Cromwell, and we may not do so unless we find a reason to.
+
 metadata {
   status: Succeeded
-  "outputs.gpu_cuda_image.modprobe_check.0": "good"
-  "outputs.gpu_cuda_image.smi_check.0": "good"
+  "outputs.gpu_cuda_image.smi_check": "gpu_good\nvram_good"
 }

centaur/src/main/resources/standardTestCases/gpu_on_papi/gpu_cuda_image.wdl

Lines changed: 7 additions & 23 deletions
@@ -2,48 +2,32 @@ version 1.0

 workflow gpu_cuda_image {

-  input {
-    Array[String] driver_versions = [ "418.87.00" ]
-  }
-
-  scatter (driver_version in driver_versions) {
-    call get_machine_info { input: driver_version = driver_version }
-  }
+  call get_machine_info

   output {
-    Array[String] modprobe_check = get_machine_info.modprobe_check
-    Array[String] smi_check = get_machine_info.smi_check
-
-    Array[File] modprobe_contents = get_machine_info.modprobe_content
-    Array[File] smi_contents = get_machine_info.smi_content
+    String smi_check = get_machine_info.smi_check
+    File smi_contents = get_machine_info.smi_content
   }
 }

 task get_machine_info {
-  input {
-    String driver_version
-  }

   command <<<
-    nvidia-modprobe --version > modprobe
-    cat modprobe | grep -q "~{driver_version}" && echo "good" > modprobe_check || echo "bad" > modprobe_check
     nvidia-smi > smi
-    cat smi | grep -q "~{driver_version}" && echo "good" > smi_check || echo "bad" > smi_check
+    cat smi | grep -q "Tesla T4" && echo "gpu_good" > smi_check || echo "bad" > smi_check
+    cat smi | grep -q "15360MiB" && echo "vram_good" >> smi_check || echo "bad" >> smi_check
   >>>

   runtime {
-    docker: "nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04"
+    docker: "nvidia/cuda:12.6.2-cudnn-devel-ubuntu24.04"
     bootDiskSizeGb: 20
-    gpuType: "nvidia-tesla-k80"
+    gpuType: "nvidia-tesla-t4"
     gpuCount: 1
-    nvidiaDriverVersion: driver_version
     zones: "us-central1-c"
   }

   output {
-    String modprobe_check = read_string("modprobe_check")
     String smi_check = read_string("smi_check")
-    File modprobe_content = "modprobe"
     File smi_content = "smi"
   }
 }

centaur/src/main/resources/standardTestCases/missing_input_failure_papiv2.test

Lines changed: 2 additions & 1 deletion
@@ -11,5 +11,6 @@ metadata {
   workflowName: missing_input_failure
   status: Failed
   "failures.0.message": "Workflow failed"
-  "failures.0.causedBy.0.message": "Failed to evaluate 'missing_input_failure.hello.addressee' (reason 1 of 1): Evaluating read_string(wf_hello_input) failed: [Attempted 1 time(s)] - IOException: Could not read from gs://nonexistingbucket/path/doesnt/exist: File not found: gs://nonexistingbucket/path/doesnt/exist"
+  # The GCS error message occasionally varies for unknown reasons. Do not try to assert on it; just make sure we get the right Cromwell exception. (AN-273)
+  "failures.0.causedBy.0.message": "Failed to evaluate 'missing_input_failure.hello.addressee' (reason 1 of 1): Evaluating read_string(wf_hello_input) failed: [Attempted 1 time(s)] - IOException: Could not read from gs://nonexistingbucket/path/doesnt/exist"
 }

docs/RuntimeAttributes.md

Lines changed: 4 additions & 3 deletions
@@ -395,19 +395,20 @@ Make sure to choose a zone for which the type of GPU you want to attach is avail

 The types of compute GPU supported are:

-* `nvidia-tesla-k80`
 * `nvidia-tesla-v100`
 * `nvidia-tesla-p100`
 * `nvidia-tesla-p4`
 * `nvidia-tesla-t4`

 For the latest list of supported GPUs, please visit [Google's GPU documentation](nvidia-drivers-us-public).

-The default driver is `418.87.00`, you may specify your own via the `nvidiaDriverVersion` key. Make sure that driver exists in the `nvidia-drivers-us-public` beforehand, per the [Google Pipelines API documentation](https://cloud.google.com/genomics/reference/rest/Shared.Types/Metadata#VirtualMachine).
+On the Life Sciences API, the default driver is `418.87.00`. You may specify your own via the `nvidiaDriverVersion` key. Make sure that driver exists in the `nvidia-drivers-us-public` bucket beforehand, per the [Google Pipelines API documentation](https://cloud.google.com/genomics/reference/rest/Shared.Types/Metadata#VirtualMachine).
+
+On GCP Batch, `nvidiaDriverVersion` is currently ignored; Batch selects the correct driver version automatically.

 ```
 runtime {
-  gpuType: "nvidia-tesla-k80"
+  gpuType: "nvidia-tesla-t4"
   gpuCount: 2
   nvidiaDriverVersion: "418.87.00"
   zones: ["us-central1-c"]

docs/backends/GCPBatch.md

Lines changed: 11 additions & 8 deletions
@@ -253,14 +253,17 @@ backend {
 ```

 The `network-name` and `subnetwork-name` should reference the name of your private network and subnetwork within that
-network respectively. The `subnetwork-name` is an optional config. Note that in the
-PAPI v2 backend `subnetwork-name` was an optional configuration parameter which accepted a `*` wildcard for choosing the
-appropriate subnetwork region, but in GCP Batch the `subnetwork-name` specification can be omitted
-and GCP Batch will choose the appropriate subnetwork automatically.
-
-For example, if your `virtual-private-cloud` config looks like the one above, then Cromwell will use the value of the
-configuration key, which is `vpc-network` here, as the name of private network and run the jobs on this network.
-If the network name is not present in the config Cromwell will fall back to trying to run jobs on the default network.
+network respectively. For example, if your `virtual-private-cloud` config looks like the one above, then Cromwell will
+use the value of the configuration key, which is `vpc-network` here, as the name of the private network and run the jobs on
+this network. If the network name is not present in the config, Cromwell will fall back to trying to run jobs on the
+default network.
+
+`subnetwork-name` is an optional configuration parameter which accepts a `*` wildcard for choosing the appropriate
+subnetwork region. If your network uses "auto" subnet creation, the `subnetwork-name` specification can be omitted and
+GCP Batch will choose the appropriate subnetwork automatically. If the network's subnet creation strategy is "custom,"
+the full subnetwork name (with `*` for the region) must be supplied (e.g. `"projects/${projectId}/regions/*/subnetworks/subnetwork"`).
+Note that wildcard regions are not supported by GCP Batch itself; Cromwell replaces `*` with the correct region at job
+creation time.

 If the `network-name` or `subnetwork-name` values contain the string `${projectId}` then that value will be replaced
 by Cromwell with the name of the project running GCP Batch.
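
To make the custom-subnet case concrete, here is a minimal sketch of the stanza described above; the network and subnetwork names are placeholders:

```
virtual-private-cloud {
  network-name = "projects/${projectId}/global/networks/my-network"
  # Custom subnet creation: the full subnetwork name must be supplied, and
  # Cromwell substitutes the `*` wildcard with the job's region at job creation time.
  subnetwork-name = "projects/${projectId}/regions/*/subnetworks/my-subnetwork"
}
```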

src/ci/resources/gcp_batch_shared_application.inc.conf

Lines changed: 3 additions & 4 deletions
@@ -120,11 +120,10 @@ backend {
 virtual-private-cloud {
   # integration testing:
   # - fully qualified name
-  # - hardcoded project id
+  # - templated project and subnet region
   # - does not end with `/`
-  network-name = "projects/broad-dsde-cromwell-dev/global/networks/cromwell-ci-gcpbatch-vpc-network"
-  # For GCP Batch we do not reference the subnetwork name, Batch has to work that out for itself in order to
-  # enable running jobs in regions that are different from the region of the GCP Batch to which we send jobs.
+  network-name = "projects/${projectId}/global/networks/cromwell-ci-gcpbatch-vpc-network"
+  subnetwork-name = "projects/${projectId}/regions/*/subnetworks/cromwell-ci-gcpbatch-vpc-network"
 }

 # Have the engine authenticate to docker.io. See BT-141 for more info.

supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala

Lines changed: 19 additions & 8 deletions
@@ -54,10 +54,15 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe

     val network = NetworkInterface.newBuilder
       .setNoExternalIpAddress(data.gcpBatchParameters.runtimeAttributes.noAddress)
-      .setNetwork(vpcAndSubnetworkProjectLabelValues.networkName(data.gcpBatchParameters.projectId))
+      .setNetwork(vpcAndSubnetworkProjectLabelValues.networkName(data.createParameters.projectId))
+
+    // When selecting a subnet region, prefer zones set in runtime attrs, then fall back to
+    // the region the host google project is in. Note that zones in runtime attrs will always
+    // be in a single region.
+    val region = zonesToRegion(data.createParameters.runtimeAttributes.zones).getOrElse(data.gcpBatchParameters.region)

     vpcAndSubnetworkProjectLabelValues
-      .subnetNameOption(data.gcpBatchParameters.projectId)
+      .subnetNameOption(projectId = data.createParameters.projectId, region = region)
       .foreach(network.setSubnetwork)

     network

@@ -85,7 +90,15 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe
   ): InstancePolicy.Builder = {

     // set GPU count to 0 if not included in workflow
-    val gpuAccelerators = accelerators.getOrElse(Accelerator.newBuilder.setCount(0).setType("")) // TODO: Driver version
+    // `setDriverVersion()` is available but we're using the Batch default for now
+    //
+    // Nvidia lifecycle reference:
+    // https://docs.nvidia.com/datacenter/tesla/drivers/index.html#cuda-drivers
+    //
+    // GCP docs:
+    // https://cloud.google.com/batch/docs/create-run-job-gpus#install-gpu-drivers
+    // https://cloud.google.com/batch/docs/reference/rest/v1/projects.locations.jobs#Accelerator.FIELDS.driver_version
+    val gpuAccelerators = accelerators.getOrElse(Accelerator.newBuilder.setCount(0).setType(""))

     val instancePolicy = InstancePolicy.newBuilder
       .setProvisioningModel(spotModel)

@@ -164,9 +177,9 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe

   override def submitRequest(data: GcpBatchRequest, jobLogger: JobLogger): CreateJobRequest = {

-    val runtimeAttributes = data.gcpBatchParameters.runtimeAttributes
     val createParameters = data.createParameters
-    val retryCount = data.gcpBatchParameters.runtimeAttributes.preemptible
+    val runtimeAttributes = createParameters.runtimeAttributes
+    val retryCount = runtimeAttributes.preemptible
     val allDisksToBeMounted: Seq[GcpBatchAttachedDisk] =
       createParameters.disks ++ createParameters.referenceDisksForLocalizationOpt.getOrElse(List.empty)
     val gcpBootDiskSizeMb = convertGbToMib(runtimeAttributes)

@@ -226,7 +239,6 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe
     val monitoringShutdown: List[Runnable] = monitoringShutdownRunnables(createParameters)
     val checkpointingStart: List[Runnable] = checkpointingSetupRunnables(createParameters, allVolumes)
     val checkpointingShutdown: List[Runnable] = checkpointingShutdownRunnables(createParameters, allVolumes)
-    val sshAccess: List[Runnable] = List.empty // sshAccessActions(createPipelineParameters, mounts)

     val sortedRunnables: List[Runnable] = RunnableUtils.sortRunnables(
       containerSetup = containerSetup,

@@ -238,7 +250,6 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe
       monitoringShutdown = monitoringShutdown,
       checkpointingStart = checkpointingStart,
       checkpointingShutdown = checkpointingShutdown,
-      sshAccess = sshAccess,
       isBackground = _.getBackground
     )

@@ -252,7 +263,7 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe
     )
     val instancePolicy =
       createInstancePolicy(cpuPlatform = cpuPlatform, spotModel, accelerators, allDisks, machineType = machineType)
-    val locationPolicy = LocationPolicy.newBuilder.addAllowedLocations(zones).build
+    val locationPolicy = LocationPolicy.newBuilder.addAllAllowedLocations(zones.asJava).build
     val allocationPolicy =
       createAllocationPolicy(data, locationPolicy, instancePolicy.build, networkPolicy, gcpSa, accelerators)
     val logsPolicy = data.gcpBatchParameters.batchAttributes.logsPolicy match {
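
`zonesToRegion` is referenced above but not defined in this diff. A plausible sketch, hypothetical and not the actual Cromwell helper, assuming zone names like "us-central1-c" (a region plus a `-<letter>` suffix) and that all zones in runtime attributes share one region:

```scala
// Hypothetical sketch only -- the real helper lives elsewhere in the codebase.
// A GCP zone name such as "us-central1-c" is its region ("us-central1") plus a
// "-<letter>" suffix, so the region is everything before the final '-'.
private def zonesToRegion(zones: Seq[String]): Option[String] =
  zones.headOption
    .filter(_.contains('-'))
    .map(zone => zone.substring(0, zone.lastIndexOf('-')))
```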
