
Commit 2af8d35

Merge branch 'develop' into AN-146-batch-vm-cost
2 parents 26f94ed + 4d01b91

22 files changed: +98 −98 lines

CHANGELOG.md

Lines changed: 10 additions & 1 deletion
@@ -35,6 +35,7 @@ be found [here](https://cromwell.readthedocs.io/en/stable/backends/HPC/#optional
 - Fixed `google_project` and `google_compute_service_account` workflow options not taking effect when using GCP Batch backend
 - Added a way to use a custom LogsPolicy for job execution. Setting `backend.providers.batch.config.batch.logs-policy` to "CLOUD_LOGGING" (the default) keeps the current behavior; setting it to "PATH" saves the logs to the mounted disk, and at the end of execution this log file is copied to the Google Cloud Storage bucket as "task.log".
 - When "CLOUD_LOGGING" is used, many more Cromwell / WDL labels for workflow, root workflow, call, shard etc. are now assigned to GCP Batch log entries.
+- Fixed subnet selection for networks that use custom subnet creation

 ### Improved handling of Life Sciences API quota errors

@@ -54,7 +55,9 @@ The `IX_WORKFLOW_STORE_ENTRY_WS` index is removed from `WORKFLOW_STORE_ENTRY`.

 The index had low cardinality and workflow pickup is faster without it. Migration time depends on workflow store size, but should be very fast for most installations. Terminal workflows are removed from the workflow store, so only running workflows contribute to the cost.

-### Bug fixes
+### Bug fixes and small changes
+
+* Changed default boot disk size from 10GB to 20GB in PipelinesAPI and Google Batch backends

 #### Improved `size()` function performance on arrays

@@ -88,6 +91,12 @@ The config keys `services.HealthMonitor.config.check-gcs` and `.gcs-bucket-to-ch
 Code relating to the Google Genomics API (aka `v1Alpha`) has been removed since Google has entirely disabled that service.
 Cloud Life Sciences (aka `v2Beta`, deprecated) and Google Batch (aka `batch`, recommended) remain the two viable GCP backends.

+#### GPU changes
+* Removed support for Nvidia K80 "Kepler" GPUs, which were [discontinued by GCP in May 2024](https://cloud.google.com/compute/docs/eol/k80-eol).
+* Default GPU on Life Sciences is now the Nvidia P100.
+* Default GPU on GCP Batch is now the Nvidia T4.
+* Updated runtime attributes documentation to clarify that the `nvidiaDriverVersion` key is ignored on GCP Batch.
+
 ## 87 Release Notes

 ### GCP Batch
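
For context, the `logs-policy` entry above names a key under the Batch provider's configuration. A minimal sketch of that stanza, with illustrative values only:

```
backend.providers.batch.config.batch {
  # "CLOUD_LOGGING" (the default) keeps job logs in Google Cloud Logging;
  # "PATH" writes them to the mounted disk, and the file is copied to the
  # workflow's GCS bucket as "task.log" when the job finishes.
  logs-policy = "PATH"
}
```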

centaur/src/main/resources/standardTestCases/docker_size_dockerhub.test

Lines changed: 2 additions & 2 deletions
@@ -11,8 +11,8 @@ files {

 metadata {
   status: Succeeded
-  "outputs.docker_size_dockerhub.large_dockerhub_image_with_hash.bootDiskSize": 17
-  "outputs.docker_size_dockerhub.large_dockerhub_image_with_tag.bootDiskSize": 17
+  "outputs.docker_size_dockerhub.large_dockerhub_image_with_hash.bootDiskSize": 27
+  "outputs.docker_size_dockerhub.large_dockerhub_image_with_tag.bootDiskSize": 27
 }

 workflowType: WDL

centaur/src/main/resources/standardTestCases/docker_size_gcr.test

Lines changed: 2 additions & 2 deletions
@@ -11,8 +11,8 @@ files {

 metadata {
   status: Succeeded
-  "outputs.docker_size_gcr.large_gcr_image_with_hash.bootDiskSize": 17
-  "outputs.docker_size_gcr.large_gcr_image_with_tag.bootDiskSize": 17
+  "outputs.docker_size_gcr.large_gcr_image_with_hash.bootDiskSize": 27
+  "outputs.docker_size_gcr.large_gcr_image_with_tag.bootDiskSize": 27
 }

 workflowType: WDL
Lines changed: 8 additions & 3 deletions
@@ -1,14 +1,19 @@
 name: gpu_cuda_image
 testFormat: workflowsuccess
 backends: [Papi, GCPBATCH]
-ignore: true

 files {
   workflow: gpu_on_papi/gpu_cuda_image.wdl
 }

+# As of November 2024, GCP Batch was using driver 550 and Life Sciences 535.
+# Neither was on the 418 version that used to be specified in this test.
+#
+# On Life Sciences, the driver version seems to be ignored outright by the API.
+#
+# In Batch it is not wired through Cromwell, and we may not do so unless we find a reason to.
+
 metadata {
   status: Succeeded
-  "outputs.gpu_cuda_image.modprobe_check.0": "good"
-  "outputs.gpu_cuda_image.smi_check.0": "good"
+  "outputs.gpu_cuda_image.smi_check": "gpu_good\nvram_good"
 }

centaur/src/main/resources/standardTestCases/gpu_on_papi/gpu_cuda_image.wdl

Lines changed: 7 additions & 23 deletions
@@ -2,48 +2,32 @@ version 1.0

 workflow gpu_cuda_image {

-  input {
-    Array[String] driver_versions = [ "418.87.00" ]
-  }
-
-  scatter (driver_version in driver_versions) {
-    call get_machine_info { input: driver_version = driver_version }
-  }
+  call get_machine_info

   output {
-    Array[String] modprobe_check = get_machine_info.modprobe_check
-    Array[String] smi_check = get_machine_info.smi_check
-
-    Array[File] modprobe_contents = get_machine_info.modprobe_content
-    Array[File] smi_contents = get_machine_info.smi_content
+    String smi_check = get_machine_info.smi_check
+    File smi_contents = get_machine_info.smi_content
   }
 }

 task get_machine_info {
-  input {
-    String driver_version
-  }

   command <<<
-    nvidia-modprobe --version > modprobe
-    cat modprobe | grep -q "~{driver_version}" && echo "good" > modprobe_check || echo "bad" > modprobe_check
     nvidia-smi > smi
-    cat smi | grep -q "~{driver_version}" && echo "good" > smi_check || echo "bad" > smi_check
+    cat smi | grep -q "Tesla T4" && echo "gpu_good" > smi_check || echo "bad" > smi_check
+    cat smi | grep -q "15360MiB" && echo "vram_good" >> smi_check || echo "bad" >> smi_check
   >>>

   runtime {
-    docker: "nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04"
+    docker: "nvidia/cuda:12.6.2-cudnn-devel-ubuntu24.04"
     bootDiskSizeGb: 20
-    gpuType: "nvidia-tesla-k80"
+    gpuType: "nvidia-tesla-t4"
     gpuCount: 1
-    nvidiaDriverVersion: driver_version
     zones: "us-central1-c"
   }

   output {
-    String modprobe_check = read_string("modprobe_check")
     String smi_check = read_string("smi_check")
-    File modprobe_content = "modprobe"
     File smi_content = "smi"
   }
 }

centaur/src/main/resources/standardTestCases/missing_input_failure_papiv2.test

Lines changed: 2 additions & 1 deletion
@@ -11,5 +11,6 @@ metadata {
   workflowName: missing_input_failure
   status: Failed
   "failures.0.message": "Workflow failed"
-  "failures.0.causedBy.0.message": "Failed to evaluate 'missing_input_failure.hello.addressee' (reason 1 of 1): Evaluating read_string(wf_hello_input) failed: [Attempted 1 time(s)] - IOException: Could not read from gs://nonexistingbucket/path/doesnt/exist: File not found: gs://nonexistingbucket/path/doesnt/exist"
+  # The GCS error message occasionally varies for unknown reasons. Do not try to assert on it; just make sure we get the right Cromwell exception. (AN-273)
+  "failures.0.causedBy.0.message": "Failed to evaluate 'missing_input_failure.hello.addressee' (reason 1 of 1): Evaluating read_string(wf_hello_input) failed: [Attempted 1 time(s)] - IOException: Could not read from gs://nonexistingbucket/path/doesnt/exist"
 }

docs/RuntimeAttributes.md

Lines changed: 4 additions & 3 deletions
@@ -395,19 +395,20 @@ Make sure to choose a zone for which the type of GPU you want to attach is avail

 The types of compute GPU supported are:

-* `nvidia-tesla-k80`
 * `nvidia-tesla-v100`
 * `nvidia-tesla-p100`
 * `nvidia-tesla-p4`
 * `nvidia-tesla-t4`

 For the latest list of supported GPUs, please visit [Google's GPU documentation](nvidia-drivers-us-public).

-The default driver is `418.87.00`, you may specify your own via the `nvidiaDriverVersion` key. Make sure that driver exists in the `nvidia-drivers-us-public` beforehand, per the [Google Pipelines API documentation](https://cloud.google.com/genomics/reference/rest/Shared.Types/Metadata#VirtualMachine).
+On the Life Sciences API, the default driver is `418.87.00`. You may specify your own via the `nvidiaDriverVersion` key. Make sure that driver exists in the `nvidia-drivers-us-public` bucket beforehand, per the [Google Pipelines API documentation](https://cloud.google.com/genomics/reference/rest/Shared.Types/Metadata#VirtualMachine).
+
+On GCP Batch, `nvidiaDriverVersion` is currently ignored; Batch selects the correct driver version automatically.

 ```
 runtime {
-  gpuType: "nvidia-tesla-k80"
+  gpuType: "nvidia-tesla-t4"
   gpuCount: 2
   nvidiaDriverVersion: "418.87.00"
   zones: ["us-central1-c"]

docs/backends/GCPBatch.md

Lines changed: 11 additions & 8 deletions
@@ -253,14 +253,17 @@ backend {
 ```

 The `network-name` and `subnetwork-name` should reference the name of your private network and subnetwork within that
-network respectively. The `subnetwork-name` is an optional config. Note that in the
-PAPI v2 backend `subnetwork-name` was an optional configuration parameter which accepted a `*` wildcard for choosing the
-appropriate subnetwork region, but in GCP Batch the `subnetwork-name` specification can be omitted
-and GCP Batch will choose the appropriate subnetwork automatically.
-
-For example, if your `virtual-private-cloud` config looks like the one above, then Cromwell will use the value of the
-configuration key, which is `vpc-network` here, as the name of private network and run the jobs on this network.
-If the network name is not present in the config Cromwell will fall back to trying to run jobs on the default network.
+network respectively. For example, if your `virtual-private-cloud` config looks like the one above, then Cromwell will
+use the value of the configuration key, which is `vpc-network` here, as the name of the private network and run the jobs on
+this network. If the network name is not present in the config, Cromwell will fall back to trying to run jobs on the
+default network.
+
+`subnetwork-name` is an optional configuration parameter which accepts a `*` wildcard for choosing the appropriate
+subnetwork region. If your network uses "auto" subnet creation, the `subnetwork-name` specification can be omitted and
+GCP Batch will choose the appropriate subnetwork automatically. If the network's subnet creation strategy is "custom,"
+the full subnetwork name (with `*` for the region) must be supplied (e.g. `"projects/${projectId}/regions/*/subnetworks/subnetwork"`).
+Note that wildcard regions are not supported by GCP Batch itself; Cromwell replaces `*` with the correct region at job
+creation time.

 If the `network-name` or `subnetwork-name` values contain the string `${projectId}` then that value will be replaced
 by Cromwell with the name of the project running GCP Batch.
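
To make the custom-subnet case concrete, here is a minimal sketch of the stanza described above; the network and subnetwork names are placeholders:

```
virtual-private-cloud {
  network-name = "projects/${projectId}/global/networks/my-network"
  # Custom subnet creation: the full subnetwork name must be supplied, and
  # Cromwell substitutes the `*` wildcard with the job's region at job creation time.
  subnetwork-name = "projects/${projectId}/regions/*/subnetworks/my-subnetwork"
}
```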

src/ci/resources/gcp_batch_shared_application.inc.conf

Lines changed: 3 additions & 4 deletions
@@ -120,11 +120,10 @@ backend {
 virtual-private-cloud {
   # integration testing:
   # - fully qualified name
-  # - hardcoded project id
+  # - templated project and subnet region
   # - does not end with `/`
-  network-name = "projects/broad-dsde-cromwell-dev/global/networks/cromwell-ci-gcpbatch-vpc-network"
-  # For GCP Batch we do not reference the subnetwork name, Batch has to work that out for itself in order to
-  # enable running jobs in regions that are different from the region of the GCP Batch to which we send jobs.
+  network-name = "projects/${projectId}/global/networks/cromwell-ci-gcpbatch-vpc-network"
+  subnetwork-name = "projects/${projectId}/regions/*/subnetworks/cromwell-ci-gcpbatch-vpc-network"
 }

 # Have the engine authenticate to docker.io. See BT-141 for more info.

supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala

Lines changed: 19 additions & 8 deletions
@@ -54,10 +54,15 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe

     val network = NetworkInterface.newBuilder
       .setNoExternalIpAddress(data.gcpBatchParameters.runtimeAttributes.noAddress)
-      .setNetwork(vpcAndSubnetworkProjectLabelValues.networkName(data.gcpBatchParameters.projectId))
+      .setNetwork(vpcAndSubnetworkProjectLabelValues.networkName(data.createParameters.projectId))
+
+    // When selecting a subnet region, prefer zones set in runtime attrs, then fall back to
+    // the region the host google project is in. Note that zones in runtime attrs will always
+    // be in a single region.
+    val region = zonesToRegion(data.createParameters.runtimeAttributes.zones).getOrElse(data.gcpBatchParameters.region)

     vpcAndSubnetworkProjectLabelValues
-      .subnetNameOption(data.gcpBatchParameters.projectId)
+      .subnetNameOption(projectId = data.createParameters.projectId, region = region)
       .foreach(network.setSubnetwork)

     network

@@ -85,7 +90,15 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe
   ): InstancePolicy.Builder = {

     // set GPU count to 0 if not included in workflow
-    val gpuAccelerators = accelerators.getOrElse(Accelerator.newBuilder.setCount(0).setType("")) // TODO: Driver version
+    // `setDriverVersion()` is available but we're using the Batch default for now
+    //
+    // Nvidia lifecycle reference:
+    // https://docs.nvidia.com/datacenter/tesla/drivers/index.html#cuda-drivers
+    //
+    // GCP docs:
+    // https://cloud.google.com/batch/docs/create-run-job-gpus#install-gpu-drivers
+    // https://cloud.google.com/batch/docs/reference/rest/v1/projects.locations.jobs#Accelerator.FIELDS.driver_version
+    val gpuAccelerators = accelerators.getOrElse(Accelerator.newBuilder.setCount(0).setType(""))

     val instancePolicy = InstancePolicy.newBuilder
       .setProvisioningModel(spotModel)

@@ -164,9 +177,9 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe

   override def submitRequest(data: GcpBatchRequest, jobLogger: JobLogger): CreateJobRequest = {

-    val runtimeAttributes = data.gcpBatchParameters.runtimeAttributes
     val createParameters = data.createParameters
-    val retryCount = data.gcpBatchParameters.runtimeAttributes.preemptible
+    val runtimeAttributes = createParameters.runtimeAttributes
+    val retryCount = runtimeAttributes.preemptible
     val allDisksToBeMounted: Seq[GcpBatchAttachedDisk] =
       createParameters.disks ++ createParameters.referenceDisksForLocalizationOpt.getOrElse(List.empty)
     val gcpBootDiskSizeMb = convertGbToMib(runtimeAttributes)

@@ -226,7 +239,6 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe
     val monitoringShutdown: List[Runnable] = monitoringShutdownRunnables(createParameters)
     val checkpointingStart: List[Runnable] = checkpointingSetupRunnables(createParameters, allVolumes)
     val checkpointingShutdown: List[Runnable] = checkpointingShutdownRunnables(createParameters, allVolumes)
-    val sshAccess: List[Runnable] = List.empty // sshAccessActions(createPipelineParameters, mounts)

     val sortedRunnables: List[Runnable] = RunnableUtils.sortRunnables(
       containerSetup = containerSetup,

@@ -238,7 +250,6 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe
       monitoringShutdown = monitoringShutdown,
       checkpointingStart = checkpointingStart,
       checkpointingShutdown = checkpointingShutdown,
-      sshAccess = sshAccess,
       isBackground = _.getBackground
     )

@@ -252,7 +263,7 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe
     )
     val instancePolicy =
       createInstancePolicy(cpuPlatform = cpuPlatform, spotModel, accelerators, allDisks, machineType = machineType)
-    val locationPolicy = LocationPolicy.newBuilder.addAllowedLocations(zones).build
+    val locationPolicy = LocationPolicy.newBuilder.addAllAllowedLocations(zones.asJava).build
     val allocationPolicy =
       createAllocationPolicy(data, locationPolicy, instancePolicy.build, networkPolicy, gcpSa, accelerators)
     val logsPolicy = data.gcpBatchParameters.batchAttributes.logsPolicy match {
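
`zonesToRegion` is referenced above but not defined in this diff. A plausible sketch, hypothetical and not the actual Cromwell helper, assuming zone names like "us-central1-c" (a region plus a `-<letter>` suffix) and that all zones in runtime attributes share one region:

```scala
// Hypothetical sketch only -- the real helper lives elsewhere in the codebase.
// A GCP zone name such as "us-central1-c" is its region ("us-central1") plus a
// "-<letter>" suffix, so the region is everything before the final '-'.
private def zonesToRegion(zones: Seq[String]): Option[String] =
  zones.headOption
    .filter(_.contains('-'))
    .map(zone => zone.substring(0, zone.lastIndexOf('-')))
```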
