diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py
index 9c9af639fb..60debc51e3 100644
--- a/common/library/module_utils/local_repo/config.py
+++ b/common/library/module_utils/local_repo/config.py
@@ -81,7 +81,7 @@
 }
 CLI_FILE_PATH = "/root/.config/pulp/cli.toml"
 POST_TIMEOUT = 3600
-TAR_POLL_VAL = 3
+TAR_POLL_VAL = 25
 FILE_POLL_VAL = 1
 ISO_POLL_VAL = 15
 FILE_URI = "/pulp/api/v3/content/file/files/"
diff --git a/common/library/modules/parallel_file_copy.py b/common/library/modules/parallel_file_copy.py
new file mode 100644
index 0000000000..a697764683
--- /dev/null
+++ b/common/library/modules/parallel_file_copy.py
@@ -0,0 +1,175 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/python
+# pylint: disable=import-error,no-name-in-module,line-too-long
+
+"""
+Ansible module for parallel copying of files.
+
+Supports copying multiple source → destination pairs in parallel,
+with logging, retries, and optional cleanup.
+"""
+
+import os
+import shutil
+import threading
+from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from ansible.module_utils.basic import AnsibleModule
+from ansible.module_utils.local_repo.standard_logger import setup_standard_logger
+
+# ============================================================
+# Default Values
+# ============================================================
+
+DEFAULT_MAX_WORKERS = 4
+DEFAULT_RETRY_COUNT = 2
+DEFAULT_DELETE_EXISTING = True
+PARALLEL_FILE_COPY_LOG = '/opt/omnia/log/core/playbooks/parallel_file_copy.log'
+
+# ============================================================
+# Copy Worker Function
+# ============================================================
+
+def copy_single_file(src_file, dest_dir, retry_count, delete_existing, slogger, summary):
+    """Copy one source file into dest_dir with retry support."""
+    thread_name = threading.current_thread().name
+    start_time = datetime.now()
+
+    if not os.path.isfile(src_file):
+        slogger.info(f"NOT COPIED - Source file missing: {src_file}")
+        summary["skipped"].append(src_file)
+        return
+
+    os.makedirs(dest_dir, exist_ok=True)
+    dest_file = os.path.join(dest_dir, os.path.basename(src_file))
+
+    for attempt in range(1, retry_count + 1):
+        try:
+            slogger.info(f"[{thread_name}] START {start_time} Copying {src_file} (Attempt {attempt})")
+
+            if delete_existing and os.path.exists(dest_file):
+                os.remove(dest_file)
+                slogger.info(f"Deleted existing file: {dest_file}")
+
+            shutil.copy2(src_file, dest_file)
+
+            end_time = datetime.now()
+            duration = (end_time - start_time).total_seconds()
+            slogger.info(f"[{thread_name}] SUCCESS {end_time} Copied {src_file} -> {dest_file} (Duration={duration:.2f}s)")
+
+            summary["copied"].append(src_file)
+            return
+
+        except Exception as err:
+            slogger.error(f"[{thread_name}] ERROR copying {src_file} (Attempt {attempt}) Reason: {err}")
+            if attempt == retry_count:
+                summary["failed"].append(src_file)
+
+# ============================================================
+# Main Parallel Copy Logic
+# ============================================================
+
+def execute_parallel_copy(module, copy_pairs, max_workers, retry_count, delete_existing, slogger):
+    """
+    Executes parallel copy for all pairs.
+    Returns summary dict.
+    """
+    summary = {"copied": [], "skipped": [], "failed": []}
+    futures = []
+
+    slogger.info("===== PARALLEL FILE COPY STARTED =====")
+    slogger.info(f"Copy pairs received: {copy_pairs}")
+    slogger.info(f"Max workers: {max_workers}")
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        for src_dir, dest_dir in copy_pairs:
+
+            if not os.path.isdir(src_dir):
+                slogger.info(f"NOT COPIED - Source directory missing: {src_dir}")
+                summary["skipped"].append(src_dir)
+                continue
+
+            files = [os.path.join(src_dir, f) for f in os.listdir(src_dir) if os.path.isfile(os.path.join(src_dir, f))]
+            if not files:
+                slogger.info(f"NOT COPIED - No files found in directory: {src_dir}")
+                summary["skipped"].append(src_dir)
+                continue
+
+            # ⚡ Show Ansible warning for in-progress copy
+            module.warn(f"Copy in progress for {src_dir} -> {dest_dir}. Please wait ...")
+
+            slogger.info(f"Copying {len(files)} files from {src_dir} -> {dest_dir} ...")
+
+            for file_path in files:
+                futures.append(executor.submit(copy_single_file, file_path, dest_dir, retry_count, delete_existing, slogger, summary))
+
+        # Wait for all copies to finish
+        for future in as_completed(futures):
+            future.result()
+
+    slogger.info("===== PARALLEL FILE COPY FINISHED =====")
+    return summary
+
+# ============================================================
+# Ansible Module Entry Point
+# ============================================================
+
+def main():
+    """Main Ansible module execution entrypoint."""
+    module_args = dict(
+        copy_pairs=dict(type="list", required=True),
+        max_workers=dict(type="int", required=False, default=DEFAULT_MAX_WORKERS),
+        retry_count=dict(type="int", required=False, default=DEFAULT_RETRY_COUNT),
+        delete_existing=dict(type="bool", required=False, default=DEFAULT_DELETE_EXISTING),
+        slog_file=dict(type="str", required=False, default=PARALLEL_FILE_COPY_LOG),
+    )
+
+    module = AnsibleModule(argument_spec=module_args, supports_check_mode=True)
+
+    copy_pairs = module.params["copy_pairs"]
+    max_workers = module.params["max_workers"]
+    retry_count = module.params["retry_count"]
+    delete_existing = module.params["delete_existing"]
+    slog_file = module.params["slog_file"]
+
+    slogger = setup_standard_logger(slog_file)
+
+    result = dict(changed=False, copied=[], skipped=[], failed=[])
+
+    try:
+        summary = execute_parallel_copy(module, copy_pairs, max_workers, retry_count, delete_existing, slogger)
+
+        result["copied"] = summary["copied"]
+        result["skipped"] = summary["skipped"]
+        result["failed"] = summary["failed"]
+        if summary["copied"]:
+            result["changed"] = True
+
+        overall_status = "SUCCESS"
+        if summary["failed"] and summary["copied"]:
+            overall_status = "PARTIAL"
+        elif summary["failed"] and not summary["copied"]:
+            overall_status = "FAILURE"
+
+        result["overall_status"] = overall_status
+        module.exit_json(**result)
+
+    except Exception as err:
+        slogger.error(f"Parallel copy execution failed: {err}")
+        module.fail_json(msg=str(err), **result)
+
+if __name__ == "__main__":
+    main()
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index de236ed958..bc3068843a 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -98,7 +98,7 @@ echo "[INFO] Setting up shared CUDA directory..." # Create and mount shared directory for compute nodes mkdir -p /shared-cuda-toolkit - mount -t nfs {{ cloud_init_nfs_path }}/cuda/ /shared-cuda-toolkit + mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit if [ $? -ne 0 ]; then echo "[ERROR] Failed to mount NFS cuda share. Exiting." diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 3195fad9e3..a1f8a55f50 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -105,7 +105,7 @@ echo "[INFO] Setting up shared CUDA directory..." # Create and mount shared directory for compute nodes mkdir -p /shared-cuda-toolkit - mount -t nfs {{ cloud_init_nfs_path }}/cuda/ /shared-cuda-toolkit + mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit if [ $? -ne 0 ]; then echo "[ERROR] Failed to mount NFS cuda share. Exiting." @@ -190,6 +190,18 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} + - path: /usr/local/bin/install_openmpi.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_openmpi.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_ucx.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_ucx.sh.j2') | indent(12) }} + - path: /etc/hosts append: true content: | @@ -207,6 +219,18 @@ permissions: '0644' content: | {{ lookup('template', 'templates/nodes/apptainer_mirror.conf.j2') | indent(12) }} + + - path: /usr/local/bin/install_nvhpc_sdk.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_nvhpc_sdk.sh.j2') | indent(12) }} + + - path: /usr/local/bin/configure_nvhpc_env.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/configure_nvhpc_env.sh.j2') | indent(12) }} runcmd: - /usr/local/bin/set-ssh.sh @@ -299,66 +323,22 @@ {% endif %} {% if hostvars['localhost']['ucx_support'] %} - # UCX build and install - - | - UCX_BIN={{ client_mount_path }}/benchmarks/ucx - mkdir -p {{ client_mount_path }}/compile/ucx - mkdir -p {{ client_mount_path }}/benchmarks/ucx - cd {{ client_mount_path }}/compile/ucx - wget --no-check-certificate https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/x86_64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/ucx/ucx.tar.gz -O ucx.tar.gz - tar xzf ucx.tar.gz - cd ucx-* - mkdir -p build - cd build - ../contrib/configure-release --prefix={{ client_mount_path }}/benchmarks/ucx - make -j 8 - make install + - echo "===== UCX Setup =====" + - echo "UCX support is enabled." 
+ - /usr/local/bin/install_ucx.sh + # - echo "Build script available at" + # - echo " /usr/local/bin/install_ucx.sh" + # - echo "NFS must be mounted at {{ client_mount_path }} before running." {% endif %} {% if hostvars['localhost']['openmpi_support'] %} - # OpenMPI build and install with UCX + Slurm detection - - | - OPENMPI_INSTALL_PREFIX="{{ client_mount_path }}/benchmarks/openmpi" - OPENMPI_SRC="{{ client_mount_path }}/compile/openmpi" - mkdir -p $OPENMPI_SRC - mkdir -p $OPENMPI_INSTALL_PREFIX - - cd $OPENMPI_SRC - wget --no-check-certificate https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/x86_64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/openmpi/openmpi.tar.gz -O openmpi.tar.gz - - tar xzf openmpi.tar.gz - cd openmpi-* - mkdir -p build - - # Check Slurm - if sinfo >/dev/null 2>&1; then - SLURM_FLAG="--with-slurm=yes --with-munge=/usr" - else - SLURM_FLAG="--with-slurm=no" - fi - - # Check UCX - if [ -x "{{ client_mount_path }}/benchmarks/ucx/bin/ucx_info" ]; then - {{ client_mount_path }}/benchmarks/ucx/bin/ucx_info -v - if [ $? -eq 0 ]; then - UCX_FLAG="--with-ucx={{ client_mount_path }}/benchmarks/ucx" - else - echo "ucx_info failed, disabling UCX" - UCX_FLAG="" - fi - else - echo "ucx_info not found, disabling UCX" - UCX_FLAG="" - fi - - cd build - ../configure --prefix=$OPENMPI_INSTALL_PREFIX \ - --enable-mpi1-compatibility \ - --enable-prte-prefix-by-default \ - $SLURM_FLAG $UCX_FLAG 2>&1 | tee config.out - - make -j 8 - make install + - echo "===== OpenMPI Setup =====" + - echo "OpenMPI support is enabled." + - /usr/local/bin/install_openmpi.sh + # - echo "Build script available at" + # - echo " /usr/local/bin/install_openmpi.sh" + # - echo "Run UCX installation first if UCX support is enabled." + # - echo "NFS must be mounted at {{ client_mount_path }} before running." {% endif %} {% if hostvars['localhost']['ldms_support'] %} @@ -366,4 +346,8 @@ - /root/ldms_sampler.sh {% endif %} + + # nvidia sdk install + - /usr/local/bin/install_nvhpc_sdk.sh + - /usr/local/bin/configure_nvhpc_env.sh - echo "Cloud-Init has completed successfully." 
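The runcmd changes above move the inline UCX, OpenMPI, and NVHPC steps into the /usr/local/bin install scripts written by this template. As a quick manual check after cloud-init finishes on a login/compiler node, something like the sketch below can confirm those scripts took effect; it only assumes the /etc/profile.d files, the /opt/nvidia/nvhpc mount, and the log path named in these templates, and is illustrative rather than part of the boot sequence:

# Manual post-boot sanity check (illustrative); run as root on the login/compiler node.
for f in /etc/profile.d/ucx.sh /etc/profile.d/openmpi.sh /etc/profile.d/nvhpc.sh; do
    [ -f "$f" ] && source "$f" || echo "[WARN] $f not found"
done
command -v ucx_info >/dev/null 2>&1 && ucx_info -v | head -n 1
command -v mpirun   >/dev/null 2>&1 && mpirun --version | head -n 1
command -v nvc      >/dev/null 2>&1 && nvc --version | head -n 1
mountpoint -q /opt/nvidia/nvhpc && echo "[OK] NVHPC SDK mounted" || echo "[WARN] NVHPC SDK not mounted"
tail -n 5 /var/log/nvhpc_sdk_install.log 2>/dev/null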
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index cc784bdd10..9b3ac1a501 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -127,7 +127,7 @@ # Create mount point mkdir -p /usr/local/cuda - cuda_nfs_share="{{ cloud_init_nfs_path }}/cuda" + cuda_nfs_share="{{ cloud_init_nfs_path }}/hpc_tools/cuda" echo "[INFO] Mounting CUDA toolkit from NFS: $cuda_nfs_share" mount -t nfs "$cuda_nfs_share" /usr/local/cuda diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 5128aee1d1..64315adf38 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -135,7 +135,7 @@ # Create mount point mkdir -p /usr/local/cuda - cuda_nfs_share="{{ cloud_init_nfs_path }}/cuda" + cuda_nfs_share="{{ cloud_init_nfs_path }}/hpc_tools/cuda" echo "[INFO] Mounting CUDA toolkit from NFS: $cuda_nfs_share" mount -t nfs "$cuda_nfs_share" /usr/local/cuda @@ -408,6 +408,24 @@ permissions: '0644' content: | {{ lookup('template', 'templates/nodes/apptainer_mirror.conf.j2') | indent(12) }} + + - path: /usr/local/bin/configure_ucx_openmpi_env.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/configure_ucx_openmpi_env.sh.j2') | indent(12) }} + + - path: /usr/local/bin/setup_nvhpc_sdk.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/setup_nvhpc_sdk.sh.j2') | indent(12) }} + + - path: /usr/local/bin/export_nvhpc_env.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/export_nvhpc_env.sh.j2') | indent(12) }} runcmd: - /usr/local/bin/set-ssh.sh @@ -455,6 +473,18 @@ - mkdir -p {{ client_mount_path }} - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab - mount -a + - echo "One or more shared components (UCX / OpenMPI / LDMS) are enabled." + - echo "Shared NFS mount is available at: {{ client_mount_path }}" + - /usr/local/bin/configure_ucx_openmpi_env.sh + # - echo "" + # - echo "IMPORTANT:" + # - echo "1. Install UCX and/or OpenMPI on the LOGIN / COMPILER node first." + # - echo "2. Ensure they are installed under the shared mount:" + # - echo " {{ client_mount_path }}/hpc_tools/benchmarks/" + # - echo "3. On this node, run the environment setup script when ready:" + # - echo "" + # - echo "This step is intentionally NOT run automatically." + - echo "==================================================" {% endif %} {% if hostvars['localhost']['ldms_support'] %} @@ -462,4 +492,6 @@ - /root/ldms_sampler.sh {% endif %} + - /usr/local/bin/setup_nvhpc_sdk.sh + - /usr/local/bin/export_nvhpc_env.sh - echo "Cloud-Init has completed successfully." 
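On Slurm nodes, the new runcmd entries depend on the login/compiler node having finished its NVHPC install: setup_nvhpc_sdk.sh (added further below) blocks until the .nvhpc_env_ready marker written by configure_nvhpc_env.sh appears on the shared export. A small illustrative check of that handshake, assuming the default /shared-nvhpc-sdk and /opt/nvidia/nvhpc paths used in these templates:

# Illustrative handshake check on a Slurm node (not part of the template).
MARKER=/shared-nvhpc-sdk/nvhpc/.nvhpc_env_ready
if [ -f "$MARKER" ]; then
    echo "[OK] NVHPC readiness marker present: $MARKER"
else
    echo "[WAIT] Marker not yet written by the login/compiler node"
fi
grep -E 'nvidia_sdk|hpc_tools' /etc/fstab || echo "[WARN] no hpc_tools NFS entries in fstab yet"
mountpoint -q /opt/nvidia/nvhpc && echo "[OK] /opt/nvidia/nvhpc mounted"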
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 new file mode 100644 index 0000000000..3c7efbc88b --- /dev/null +++ b/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 @@ -0,0 +1,71 @@ +#!/bin/bash +set -e + +LOGFILE="/var/log/nvhpc_env_config.log" +exec >> "$LOGFILE" 2>&1 + +echo "===== Configuring NVIDIA HPC SDK environment =====" + +# Cloud-init safe defaults +export HOME=/root + +NVCOMPILERS="{{ nvhpc_local_mount | default('/opt/nvidia/nvhpc') }}" +NVARCH="$(uname -s)_$(uname -m)" +NVHPC_VERSION="{{ nvhpc_version | default('25.11') }}" + +NVHPC_BASE="$NVCOMPILERS/$NVARCH/$NVHPC_VERSION" +PROFILE_FILE="/etc/profile.d/nvhpc.sh" + +if [ ! -d "$NVHPC_BASE/compilers/bin" ]; then + echo "[ERROR] NVHPC compilers not found at $NVHPC_BASE" + exit 1 +fi + +echo "[INFO] NVHPC detected at $NVHPC_BASE" +echo "[INFO] Writing persistent environment to $PROFILE_FILE" + +cat << EOF > "$PROFILE_FILE" +# NVIDIA HPC SDK environment +export NVCOMPILERS=$NVCOMPILERS +export NVARCH=$NVARCH +export NVHPC_VERSION=$NVHPC_VERSION + +export PATH=\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/compilers/bin:\$PATH +export MANPATH=\${MANPATH:-}:\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/compilers/man + +# MPI (optional but recommended) +export PATH=\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/comm_libs/mpi/bin:\$PATH +export MANPATH=\${MANPATH:-}:\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/comm_libs/mpi/man + +# Modules support (optional) +export MODULEPATH=\$NVCOMPILERS/modulefiles:\${MODULEPATH:-} +EOF + +chmod 644 "$PROFILE_FILE" + +# Source profile for current shell and all future non-login shells +if [ -f "$PROFILE_FILE" ]; then + echo "[INFO] Sourcing NVHPC profile for current shell" + source "$PROFILE_FILE" + grep -q "nvhpc.sh" /etc/bashrc || echo "source $PROFILE_FILE" >> /etc/bashrc +fi + +# NVHPC marker file path +MARKER_TARGET="{{ nvhpc_local_mount | default('/shared-nvhpc-sdk/nvhpc') }}/.nvhpc_env_ready" + +if ! grep -q "{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc" /etc/fstab; then + echo "[ERROR] NVHPC NFS path not found in /etc/fstab" + exit 1 +fi + +echo "[INFO] NVHPC NFS entry found in /etc/fstab" + +if [ ! -d "{{ nvhpc_local_mount | default('/shared-nvhpc-sdk/nvhpc') }}" ]; then + echo "[ERROR] Marker directory missing: {{ nvhpc_local_mount | default('/shared-nvhpc-sdk/nvhpc') }}" + exit 1 +fi + +touch "$MARKER_TARGET" +echo "[SUCCESS] NVHPC marker created: $MARKER_TARGET" + +echo "===== NVHPC environment configuration completed successfully =====" \ No newline at end of file diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2 new file mode 100644 index 0000000000..4064eddbb1 --- /dev/null +++ b/discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2 @@ -0,0 +1,56 @@ +#!/bin/bash +LOGFILE="/var/log/configure_ucx_openmpi_env.log" +exec > >(tee -a "$LOGFILE") 2>&1 + +echo "===== Configuring UCX / OpenMPI environment (Slurm node) =====" + +CLIENT_MOUNT="{{ client_mount_path }}" +UCX_PREFIX="{{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx" +OPENMPI_PREFIX="{{ client_mount_path }}/slurm/hpc_tools/benchmarks/openmpi" + +PROFILE_DIR="/etc/profile.d" + +# Ensure client mount exists and is mounted +if ! mountpoint -q "$CLIENT_MOUNT"; then + echo "[WARN] $CLIENT_MOUNT is not mounted. 
Skipping UCX/OpenMPI env setup." + exit 0 +fi + +# ---------------- UCX ---------------- +if [ -d "$UCX_PREFIX/bin" ]; then + echo "[INFO] UCX detected at $UCX_PREFIX" + + cat > "$PROFILE_DIR/ucx.sh" < "$PROFILE_DIR/openmpi.sh" < >(tee -a "$LOGFILE") 2>&1 + +# Check that NFS is mounted +if ! mountpoint -q "$CLIENT_MOUNT"; then + echo "[ERROR] $CLIENT_MOUNT is not mounted." + echo " Please mount the NFS path before running export_nvhpc_env.sh" + exit 1 +fi + +echo "===== NVHPC environment export started =====" + +# Validate compilers directory exists +if [ ! -d "$NVHPC_BASE/compilers/bin" ]; then + echo "[ERROR] NVHPC compilers not found at:" + echo " $NVHPC_BASE/compilers/bin" + exit 1 +fi + +echo "[INFO] Writing persistent NVHPC profile at $PROFILE_FILE" + +# Write environment file system-wide +cat > "$PROFILE_FILE" </dev/null"; then + echo "[ERROR] nvc verification failed" + exit 1 +fi + +# Verify nvfortran +if ! bash -lc "command -v nvfortran && nvfortran --version >/dev/null"; then + echo "[ERROR] nvfortran verification failed" + exit 1 +fi + +echo "[SUCCESS] NVHPC environment exported successfully" +echo "[INFO] Environment file configured in $PROFILE_FILE" +echo "===== NVHPC export completed =====" \ No newline at end of file diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 new file mode 100644 index 0000000000..26f3fd1775 --- /dev/null +++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 @@ -0,0 +1,75 @@ +#!/bin/bash +set -e + +LOGFILE="/var/log/nvhpc_sdk_install.log" +exec > >(tee -a "$LOGFILE") 2>&1 + +echo "===== Starting NVIDIA HPC SDK installation =====" + +NVHPC_PKG_NAME="{{ nvhpc_pkg_name | default('nvhpc_2025_2511_Linux_x86_64_cuda_13.0') }}" +NVHPC_EXPORT="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk" +NVHPC_MOUNT="/shared-nvhpc-sdk" +NVHPC_TARBALL="$NVHPC_MOUNT/${NVHPC_PKG_NAME}.tar.gz" +NVHPC_INSTALL_DIR_NFS="$NVHPC_MOUNT/nvhpc" +NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc" +NVHPC_EXTRACT_DIR="$NVHPC_MOUNT/${NVHPC_PKG_NAME}" + +# Skip if already mounted +if mountpoint -q "$NVHPC_LOCAL_MOUNT"; then + echo "[INFO] $NVHPC_LOCAL_MOUNT already mounted. Skipping installation." + exit 0 +fi + +# Skip if local directory exists +if [ -d "$NVHPC_LOCAL_MOUNT" ]; then + echo "[INFO] $NVHPC_LOCAL_MOUNT exists. Assuming installed. Skipping." + exit 0 +fi + +mkdir -p "$NVHPC_MOUNT" +mount -t nfs "$NVHPC_EXPORT" "$NVHPC_MOUNT" + +# Check tarball +echo "[INFO] Checking NVIDIA HPC SDK tarball at $NVHPC_TARBALL..." +if [ ! -f "$NVHPC_TARBALL" ]; then + echo "[ERROR] NVIDIA HPC SDK tarball not found. Skipping installation." + exit 0 +fi + +# Extract if needed +EXTRACT_SIZE_GB=$(du -sBG "$NVHPC_EXTRACT_DIR" 2>/dev/null | cut -f1 | tr -d 'G') +if [ -d "$NVHPC_EXTRACT_DIR" ] && [ "$EXTRACT_SIZE_GB" -ge 13 ] && [ -f "$NVHPC_EXTRACT_DIR/install" ]; then + echo "[INFO] NVHPC already extracted. Skipping." +else + echo "[INFO] Extracting NVIDIA HPC SDK tarball..." + tar -xzf "$NVHPC_TARBALL" -C "$NVHPC_MOUNT" \ + --checkpoint=2000 \ + --checkpoint-action=echo="[INFO] Extracting NVHPC... please wait" +fi + +mkdir -p "$NVHPC_INSTALL_DIR_NFS" +INSTALL_BIN_DIR="$NVHPC_INSTALL_DIR_NFS/Linux_x86_64/25.11/compilers/bin" + +if [ -x "$INSTALL_BIN_DIR/nvc" ]; then + echo "[INFO] NVHPC already installed. Skipping installer." +else + echo "[INFO] Running NVIDIA HPC SDK installer..." 
+ cd "$NVHPC_EXTRACT_DIR" + NVHPC_SILENT=true NVHPC_INSTALL_DIR="$NVHPC_INSTALL_DIR_NFS" NVHPC_INSTALL_TYPE=auto ./install +fi + +echo "[SUCCESS] NVIDIA HPC SDK installation completed." + +# Mount NVHPC locally +mkdir -p "$NVHPC_LOCAL_MOUNT" +NVHPC_INSTALL_EXPORT="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc" +FSTAB_ENTRY="$NVHPC_INSTALL_EXPORT $NVHPC_LOCAL_MOUNT nfs defaults,_netdev 0 0" + +if ! grep -qE "^[^#].*$NVHPC_INSTALL_EXPORT[[:space:]]+$NVHPC_LOCAL_MOUNT[[:space:]]+nfs" /etc/fstab; then + echo "[INFO] Adding NVHPC mount to /etc/fstab" + echo "$FSTAB_ENTRY" >> /etc/fstab +fi + +echo "[INFO] Mounting $NVHPC_LOCAL_MOUNT..." +mount "$NVHPC_LOCAL_MOUNT" +echo "[INFO] NVHPC successfully mounted at $NVHPC_LOCAL_MOUNT" \ No newline at end of file diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 new file mode 100644 index 0000000000..44e1a786b7 --- /dev/null +++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 @@ -0,0 +1,73 @@ +#!/bin/bash +set -e + +CLIENT_MOUNT="{{ client_mount_path }}" +OPENMPI_PREFIX="{{ client_mount_path }}/slurm/hpc_tools/benchmarks/openmpi" +OPENMPI_BUILD="{{ client_mount_path }}/slurm/hpc_tools/compile/openmpi" + +# Check that NFS is mounted +if ! mountpoint -q "$CLIENT_MOUNT"; then + echo "[ERROR] $CLIENT_MOUNT is not mounted." + echo " Please mount the NFS path before running install_openmpi.sh" + exit 1 +fi + +echo "===== OpenMPI build started =====" + +mkdir -p "$OPENMPI_BUILD" "$OPENMPI_PREFIX" +cd "$OPENMPI_BUILD" + +if [ ! -f openmpi.tar.gz ]; then + wget --no-check-certificate \ + https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/x86_64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/openmpi/openmpi.tar.gz \ + -O openmpi.tar.gz \ + >> "$OPENMPI_PREFIX/openmpi_tar_output.log" 2>&1 +else + echo "openmpi.tar.gz already exists, skipping download." \ + >> "$OPENMPI_PREFIX/openmpi_tar_output.log" +fi + +tar xzf openmpi.tar.gz +cd openmpi-* +mkdir -p build + +# Slurm detection +if sinfo >/dev/null 2>&1; then + SLURM_FLAG="--with-slurm=yes --with-munge=/usr" +else + SLURM_FLAG="--with-slurm=no" +fi + +# UCX detection +if [ -x "{{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx/bin/ucx_info" ]; then + UCX_FLAG="--with-ucx={{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx" +else + UCX_FLAG="" +fi + +cd build +../configure --prefix="$OPENMPI_PREFIX" \ + --enable-mpi1-compatibility \ + --enable-prte-prefix-by-default \ + $SLURM_FLAG $UCX_FLAG + +make -j {{ openmpi_build_threads | default(8) }} +make install + +# Configure OpenMPI environment variables system-wide +OPENMPI_ENV_FILE="/etc/profile.d/openmpi.sh" + +cat > "$OPENMPI_ENV_FILE" <> "$UCX_PREFIX/ucx_tar_output.log" 2>&1 +else + echo "ucx.tar.gz already exists, skipping download." 
\ + >> "$UCX_PREFIX/ucx_tar_output.log" +fi + +tar xzf ucx.tar.gz +cd ucx-* +mkdir -p build +cd build + +../contrib/configure-release --prefix="$UCX_PREFIX" +make -j {{ ucx_build_threads | default(8) }} +make install + +# Configure UCX environment variables system-wide +UCX_ENV_FILE="/etc/profile.d/ucx.sh" + +cat > "$UCX_ENV_FILE" < >(tee -a "$LOGFILE") 2>&1 + +echo "===== NVHPC SDK setup (mount + wait) =====" + +PARENT_NFS="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk" +PARENT_MOUNT="/shared-nvhpc-sdk" + +NVHPC_NFS_SHARE="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc" +NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc" + +NVHPC_MARKER="$PARENT_MOUNT/nvhpc/.nvhpc_env_ready" + +WAIT_TIMEOUT=3600 +SLEEP_INTERVAL=20 +ELAPSED=0 + +# 1. Mount parent export +mkdir -p "$PARENT_MOUNT" + +if ! mountpoint -q "$PARENT_MOUNT"; then + mount -t nfs "$PARENT_NFS" "$PARENT_MOUNT" +fi + +if ! mountpoint -q "$PARENT_MOUNT"; then + echo "[ERROR] Failed to mount NVHPC parent export" + exit 1 +fi + +echo "[INFO] Parent NVHPC export mounted" + +# 2. Wait for readiness marker +echo "[INFO] Waiting for NVHPC readiness marker..." + +while [ ! -f "$NVHPC_MARKER" ]; do + if [ "$ELAPSED" -ge "$WAIT_TIMEOUT" ]; then + echo "[ERROR] Timeout waiting for NVHPC readiness marker" + exit 1 + fi + sleep "$SLEEP_INTERVAL" + ELAPSED=$((ELAPSED + SLEEP_INTERVAL)) +done + +echo "[SUCCESS] NVHPC readiness marker detected" + +# 3. Ensure fstab entry exists +if ! grep -qF "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT" /etc/fstab; then + echo "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT nfs defaults,_netdev 0 0" >> /etc/fstab + echo "[INFO] NVHPC fstab entry added" +else + echo "[INFO] NVHPC fstab entry already present" +fi + +# 4. Mount NVHPC SDK +mkdir -p "$NVHPC_LOCAL_MOUNT" + +if ! mountpoint -q "$NVHPC_LOCAL_MOUNT"; then + mount "$NVHPC_LOCAL_MOUNT" +fi + +if ! 
mountpoint -q "$NVHPC_LOCAL_MOUNT"; then + echo "[ERROR] Failed to mount NVHPC SDK" + exit 1 +fi + +echo "[SUCCESS] NVHPC SDK mounted at $NVHPC_LOCAL_MOUNT" +echo "===== NVHPC setup completed =====" \ No newline at end of file diff --git a/discovery/roles/slurm_config/tasks/hpc_tools.yml b/discovery/roles/slurm_config/tasks/hpc_tools.yml index c8bdb5d335..46260da267 100644 --- a/discovery/roles/slurm_config/tasks/hpc_tools.yml +++ b/discovery/roles/slurm_config/tasks/hpc_tools.yml @@ -25,6 +25,7 @@ - runfile - scripts - container_images + - nvidia_sdk - name: Deploy download_container_image.sh to NFS share ansible.builtin.template: @@ -122,34 +123,27 @@ ansible.builtin.set_fact: oim_shared_path: "{{ hostvars['localhost']['oim_shared_path'] }}" -- name: Check if source directory exists - ansible.builtin.stat: - path: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/" - register: src_dir_check_x86_64 +- name: Build parallel copy list for HPC tools + ansible.builtin.set_fact: + parallel_copy_pairs: [] -- name: Check if source directory exists +- name: Check which parallel copy source directories exist ansible.builtin.stat: - path: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/" - register: src_dir_check_aarch64 - -- name: Copy cuda run file using copy module for aarch64 - ansible.builtin.copy: - src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/" - dest: "{{ slurm_config_path }}/hpc_tools/runfile/" - mode: '0755' - owner: root - group: root - directory_mode: '0755' - remote_src: true - when: src_dir_check_aarch64.stat.exists and src_dir_check_aarch64.stat.isdir + path: "{{ item.src }}" + loop: "{{ parallel_copy_candidates }}" + register: copy_source_checks + failed_when: false -- name: Copy cuda run file using copy module for x86_64 - ansible.builtin.copy: - src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/" - dest: "{{ slurm_config_path }}/hpc_tools/runfile/" - mode: '0755' - owner: root - group: root - directory_mode: '0755' - remote_src: true - when: src_dir_check_x86_64.stat.exists and src_dir_check_x86_64.stat.isdir +- name: Add only valid copy pairs (source exists) + ansible.builtin.set_fact: + parallel_copy_pairs: >- + {{ parallel_copy_pairs + + [[ item.item.src, item.item.dest ]] }} + loop: "{{ copy_source_checks.results }}" + when: item.stat.exists + +- name: Parallel copy HPC tool files + parallel_file_copy: + copy_pairs: "{{ parallel_copy_pairs }}" + max_workers: "{{ parallel_copy_max_workers }}" + when: parallel_copy_pairs | length > 0 diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 3a8c43ad93..f911ce975e 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -130,4 +130,33 @@ offline_path_aarch64: ssh_private_key_path: /root/.ssh/oim_rsa +# nvidia sdk vars +nvhpc_package_name: "nvhpc_2025_2511_Linux_x86_64_cuda_13.0" +nvhpc_tarball_x86_64_relpath: "offline_repo/cluster/x86_64/rhel/10.0/tarball/{{ nvhpc_package_name }}/{{ nvhpc_package_name }}.tar.gz" +nvhpc_nfs_rel_dir: "hpc_tools/nvidia_sdk" + +# parallel file copy +parallel_copy_max_workers: 4 + +# ------------------------------------------------------------ +# Parallel Copy Candidates (Only path existence matters) +# ------------------------------------------------------------ + +parallel_copy_candidates: + + # CUDA Runfile (aarch64 repo path) + - name: cuda_runfile_aarch64 + src: 
"{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/" + dest: "{{ slurm_config_path }}/hpc_tools/runfile/" + + # CUDA Runfile (x86_64 repo path) + - name: cuda_runfile_x86_64 + src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/" + dest: "{{ slurm_config_path }}/hpc_tools/runfile/" + + # NVIDIA HPC SDK (x86_64 tarball extracted dir) + - name: nvhpc_sdk_x86_64 + src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/" + dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/" + backup_dir: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/backup_{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}" diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index 9531239fd2..ecf628883b 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -34,6 +34,11 @@ {"package": "cuda-run", "type": "iso", "url": "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_580.95.05_linux.run" + }, + { + "package": "nvhpc_2025_2511_Linux_x86_64_cuda_13.0", + "type": "tarball", + "url": "https://developer.download.nvidia.com/hpc-sdk/25.11/nvhpc_2025_2511_Linux_x86_64_cuda_13.0.tar.gz" } ] },