Skip to content

Torch Nightly

Torch Nightly #35

name: Torch Nightly

# Triggers: one scheduled run per day, plus manual dispatch with a
# selectable test suite.
on:
  schedule:
    # 08:00 UTC daily (staggered 2 hours after the ONNX nightly at 06:00 UTC).
    - cron: "0 8 * * *"
  workflow_dispatch:
    inputs:
      suite:
        description: "Test suite to run"
        type: choice
        required: false
        default: nightly-torch
        options:
          - nightly-torch
          - smoke-torch
          - weekly-torch
# Workflow-level tool versions. All values are quoted so YAML keeps them as
# strings (an unquoted 3.10 would be read as the float 3.1).
env:
  # Interpreter installed and used for the virtualenv in the
  # "Setup Python environment with UV" step.
  VER_PYTHON: '3.10'
  # Version specifier appended to `torch==` when installing PyTorch.
  VER_TORCH: '2.*'
  # Not referenced by any step visible in this file; the same value is
  # hard-coded in the build job's `variants` JSON.
  VER_CUDA: '12.1.1'
  # Last path segment of the PyTorch wheel index URL used at install time.
  VER_CUDA_SHORT: 'cu121'
jobs:
  #=============================================================================
  # Build AIMET wheel with Torch and GPU support
  #=============================================================================
  build-aimet-torch-gpu:
    name: "Build AIMET (Torch + GPU)"
    # Calls the repository's reusable wheel-build workflow, forwarding all
    # of this workflow's secrets to it.
    uses: ./.github/workflows/build-wheels.yml
    secrets: inherit
    with:
      image-tag: latest
      # Single-entry build matrix, passed as a JSON string (folded scalar:
      # the lines below are joined with spaces into one line of JSON).
      # The version literals repeat the workflow-level `env` values; per
      # GitHub's context-availability table the `env` context cannot be used
      # in a reusable-workflow call's `with:`, so keep the two in sync by hand.
      variants: >
        {"include":[{
        "id":"torch-gpu",
        "runs-on":"k8s-gpu",
        "VER_PYTHON":"3.10",
        "VER_TORCH":"2.*",
        "VER_CUDA":"12.1.1",
        "ENABLE_TESTS":"OFF",
        "PIP_INDEX":""
        }]}
#=============================================================================
# Run Torch regression tests on GPU
#=============================================================================
  torch-gpu:
    name: "Run Torch Regression (GPU)"
    # Starts only after the wheel build job has finished successfully.
    needs: build-aimet-torch-gpu
    runs-on: k8s-gpu
    # 720 minutes = 12 hours upper bound for the whole job.
    timeout-minutes: 720
    container:
      image: "${{ vars.DOCKER_REGISTRY }}/${{ vars.DOCKER_IMAGE }}-torch-gpu:latest"
      # Expose every GPU on the host to the job container.
      options: --gpus all
      credentials:
        username: ${{ secrets.DOCKER_LOGIN }}
        password: ${{ secrets.DOCKER_CREDENTIALS }}
    env:
      PYTHONUNBUFFERED: "1"
      # Suite chosen at manual dispatch; falls back to 'nightly-torch' when no
      # dispatch input is present (e.g. on the scheduled trigger).
      INPUT_SUITE: ${{ github.event.inputs.suite || 'nightly-torch' }}
      # NOTE(review): presumably read by the ONNXRegression tooling - confirm.
      WORKFLOW_FILE: "torch-nightly.yaml"
      PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"
    steps:
#=========================================================================
# Setup: Code and dependencies
#=========================================================================
      - name: Checkout code
        uses: actions/checkout@v4

      # Restores model/dataset caches from earlier runs. The exact key
      # changes whenever anything under ONNXRegression/ changes; restore-keys
      # then falls back to the most recent cache for this runner OS.
      # ~/.cache/huggingface and ~/.cache/torch are already covered by
      # ~/.cache; the list is left unchanged here.
      # NOTE(review): confirm the effect on existing cache entries before
      # pruning the redundant paths.
      - name: Cache models and datasets
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache
            ~/.qai_hub_cache
            ~/.qaihm
            ~/.cache/huggingface
            ~/.cache/torch
          key: aimet-torch-cache-${{ runner.os }}-${{ hashFiles('ONNXRegression/**') }}
          restore-keys: aimet-torch-cache-${{ runner.os }}-

      # Uses v4: GitHub has retired v3 of the artifact actions, and the other
      # actions in this job are already on v4.
      # NOTE(review): download-artifact v4 only sees artifacts uploaded with
      # upload-artifact v4 - confirm build-wheels.yml uploads
      # "torch-gpu-wheel" with v4.
      - name: Download AIMET wheel
        uses: actions/download-artifact@v4
        with:
          name: torch-gpu-wheel
          path: downloads

      - name: Install system dependencies
        run: |
          apt-get update -qq
          apt-get install -y \
            libgl1-mesa-glx libglib2.0-0 libsm6 libxext6 \
            libxrender-dev libgomp1 git openssh-client unzip curl
          apt-get clean
          rm -rf /var/lib/apt/lists/*

      # Installs the interpreter, bootstraps uv, creates .venv and installs
      # PyTorch plus the test-time Python dependencies into it.
      - name: Setup Python environment with UV
        run: |
          set -e
          # The package lists were removed by the previous step; refresh them.
          apt-get update -qq
          apt-get install -y python${{ env.VER_PYTHON }} python${{ env.VER_PYTHON }}-venv python${{ env.VER_PYTHON }}-dev
          curl -LsSf https://astral.sh/uv/install.sh | sh
          # The PATH export is per-step, so later steps that call uv repeat it.
          export PATH="$HOME/.local/bin:$PATH"
          uv venv .venv --python python${{ env.VER_PYTHON }}
          . .venv/bin/activate
          # Install PyTorch with CUDA support. The requirement is quoted so
          # the shell never treats the '*' in the version spec as a glob.
          uv pip install "torch==${{ env.VER_TORCH }}" torchvision --index-url "https://download.pytorch.org/whl/${{ env.VER_CUDA_SHORT }}"
          # Install QAI Hub packages
          uv pip install qai-hub qai-hub-models
          uv pip install pynvml pytest requests object-detection-metrics

      # Installs the wheel downloaded above and checks that it imports.
      - name: Install AIMET
        run: |
          . .venv/bin/activate
          export PATH="$HOME/.local/bin:$PATH"
          uv pip install downloads/*.whl
          python -c "import aimet_torch; print('AIMET Torch version:', aimet_torch.__version__)"

      # Fails the job early (via the assert) when CUDA is not usable.
      - name: Verify PyTorch GPU
        run: |
          . .venv/bin/activate
          python -c "
          import torch
          print('PyTorch version:', torch.__version__)
          print('CUDA available:', torch.cuda.is_available())
          if torch.cuda.is_available():
              print('CUDA device:', torch.cuda.get_device_name(0))
              print('CUDA version:', torch.version.cuda)
          assert torch.cuda.is_available(), 'CUDA not available!'
          "
#=========================================================================
# Configure AI Hub
#=========================================================================
      # Runs the project's `configure-aihub` utility inside the virtualenv.
      - name: Configure AI Hub
        run: |
          . .venv/bin/activate
          python -m ONNXRegression.workflow.utils configure-aihub
        env:
          # Step-level variable: the token is exposed to this step only.
          QAI_HUB_API_TOKEN: ${{ secrets.AIMET_BOT_AIHUB_DEV_TOKEN }}
#=========================================================================
# Baseline management
#=========================================================================
      # Best effort: a failure here (continue-on-error) does not fail the job.
      - name: Download baseline from previous runs
        continue-on-error: true
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          . .venv/bin/activate
          python -m ONNXRegression.workflow.artifacts download-baseline \
            --output-dir ONNXRegression/baselines/downloaded

      # The suite comes from the job-level INPUT_SUITE variable (dispatch
      # input, defaulting to 'nightly-torch') rather than from an expression
      # expanded inside the script, so the value is never parsed as shell code.
      - name: Setup baseline for comparison
        run: |
          . .venv/bin/activate
          python -m ONNXRegression.workflow.utils setup-baseline \
            --suite "$INPUT_SUITE"
#=========================================================================
# Run test suite
#=========================================================================
      # The suite comes from the job-level INPUT_SUITE variable (dispatch
      # input, defaulting to 'nightly-torch') rather than from an expression
      # expanded inside the script, so the value is never parsed as shell code.
      - name: Run test suite
        run: |
          . .venv/bin/activate
          python -m ONNXRegression.suite_runner \
            --suite "$INPUT_SUITE"
#=========================================================================
# Baseline comparison and reporting
#=========================================================================
      # Compares this run's results CSV against the baselines directory and
      # requests a GitHub step summary. Fails fast when the CSV is missing.
      - name: Compare with baseline and generate report
        run: |
          . .venv/bin/activate
          # Read the suite from the job-level variable instead of expanding
          # a workflow expression inside the script.
          SUITE="$INPUT_SUITE"
          RESULTS_CSV="ONNXRegression/reports/results_${SUITE}.csv"
          if [ ! -f "$RESULTS_CSV" ]; then
            echo "Results CSV not found: $RESULTS_CSV"
            exit 1
          fi
          python ONNXRegression/baseline_comparison.py run \
            --results "$RESULTS_CSV" \
            --suite-name "$SUITE" \
            --baselines-dir "ONNXRegression/baselines" \
            --github-summary
#=========================================================================
# Generate lockfile and artifacts
#=========================================================================
      # Runs even when earlier steps failed.
      - name: Generate environment lockfile
        if: always()
        run: |
          . .venv/bin/activate
          export PATH="$HOME/.local/bin:$PATH"
          python -m ONNXRegression.workflow.utils generate-lockfile \
            --run-id "${{ github.run_id }}"

      # Exposes `steps.timestamp.outputs.datetime` (YYYYmmdd-HHMMSS) for the
      # report artifact name; runs even when earlier steps failed.
      - name: Generate timestamp for artifacts
        if: always()
        id: timestamp
        run: |
          echo "datetime=$(date '+%Y%m%d-%H%M%S')" >> "$GITHUB_OUTPUT"

      # No `if:` here, so this step runs only when the preceding steps
      # succeeded. The branch name and suite reach Python through environment
      # variables instead of being spliced into the source text, so a ref
      # name containing quotes, `$` or backticks cannot break or inject code.
      - name: Generate baseline artifact name
        id: artifact_name
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REF_NAME: ${{ github.ref_name }}
        run: |
          . .venv/bin/activate
          ARTIFACT_NAME=$(python -c "
          import os
          from ONNXRegression.workflow.artifacts import ArtifactManager, WorkflowConfig
          config = WorkflowConfig.from_env()
          manager = ArtifactManager(config)
          print(manager.get_artifact_name(os.environ['REF_NAME'], os.environ['INPUT_SUITE']))
          ")
          echo "name=$ARTIFACT_NAME" >> "$GITHUB_OUTPUT"
          echo "Generated artifact name: $ARTIFACT_NAME"
#=========================================================================
# Upload artifacts
#=========================================================================
      # Uses v4: GitHub has retired v3 of the artifact actions. v4 requires
      # artifact names to be unique within a run; the two names used in this
      # section are distinct.
      # Runs only when the preceding steps succeeded; a missing baseline
      # file fails the step (if-no-files-found: error).
      - name: Upload baseline artifact
        uses: actions/upload-artifact@v4
        with:
          name: ${{ steps.artifact_name.outputs.name }}
          path: ONNXRegression/baselines/latest.json
          retention-days: 30
          if-no-files-found: error

      # Always uploaded, so reports are available for failed runs as well.
      - name: Upload test reports
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: torch-test-reports-${{ steps.timestamp.outputs.datetime }}-${{ github.run_id }}
          path: ONNXRegression/reports/**/*
          retention-days: 30