Torch Nightly #35

Workflow file for this run

.github/workflows/torch-nightly.yaml at 6561f0e

	name: Torch Nightly

	on:
	schedule:
	# Run at 8 AM UTC (staggered 2 hours after ONNX nightly at 6 AM)
	- cron: '0 8 * * *'

	workflow_dispatch:
	inputs:
	suite:
	description: 'Test suite to run'
	required: false
	type: choice
	options:
	- nightly-torch
	- smoke-torch
	- weekly-torch
	default: 'nightly-torch'

	env:
	VER_PYTHON: "3.10"
	VER_TORCH: "2.*"
	VER_CUDA: "12.1.1"
	VER_CUDA_SHORT: "cu121"

	jobs:
	#=============================================================================
	# Build AIMET wheel with Torch and GPU support
	#=============================================================================
	build-aimet-torch-gpu:
	name: Build AIMET (Torch + GPU)
	uses: ./.github/workflows/build-wheels.yml
	with:
	variants: >
	{"include":[{
	"id":"torch-gpu",
	"runs-on":"k8s-gpu",
	"VER_PYTHON":"3.10",
	"VER_TORCH":"2.*",
	"VER_CUDA":"12.1.1",
	"ENABLE_TESTS":"OFF",
	"PIP_INDEX":""
	}]}
	image-tag: latest
	secrets: inherit

	#=============================================================================
	# Run Torch regression tests on GPU
	#=============================================================================
	torch-gpu:
	name: Run Torch Regression (GPU)
	needs: build-aimet-torch-gpu
	runs-on: k8s-gpu

	container:
	image: "${{ vars.DOCKER_REGISTRY }}/${{ vars.DOCKER_IMAGE }}-torch-gpu:latest"
	credentials:
	username: ${{ secrets.DOCKER_LOGIN }}
	password: ${{ secrets.DOCKER_CREDENTIALS }}
	options: --gpus all

	timeout-minutes: 720

	env:
	PYTHONUNBUFFERED: "1"
	INPUT_SUITE: ${{ github.event.inputs.suite \|\| 'nightly-torch' }}
	WORKFLOW_FILE: "torch-nightly.yaml"
	PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"

	steps:
	#=========================================================================
	# Setup: Code and dependencies
	#=========================================================================

	- name: Checkout code
	uses: actions/checkout@v4

	- name: Cache models and datasets
	uses: actions/cache@v4
	with:
	path: \|
	~/.cache
	~/.qai_hub_cache
	~/.qaihm
	~/.cache/huggingface
	~/.cache/torch
	key: aimet-torch-cache-${{ runner.os }}-${{ hashFiles('ONNXRegression/**') }}
	restore-keys: aimet-torch-cache-${{ runner.os }}-

	- name: Download AIMET wheel
	uses: actions/download-artifact@v3
	with:
	name: torch-gpu-wheel
	path: downloads

	- name: Install system dependencies
	run: \|
	apt-get update -qq
	apt-get install -y \
	libgl1-mesa-glx libglib2.0-0 libsm6 libxext6 \
	libxrender-dev libgomp1 git openssh-client unzip curl
	apt-get clean
	rm -rf /var/lib/apt/lists/*

	- name: Setup Python environment with UV
	run: \|
	set -e

	apt-get update -qq
	apt-get install -y python${{ env.VER_PYTHON}} python${{ env.VER_PYTHON}}-venv python${{ env.VER_PYTHON}}-dev

	curl -LsSf https://astral.sh/uv/install.sh \| sh
	export PATH="$HOME/.local/bin:$PATH"

	uv venv .venv --python python${{ env.VER_PYTHON}}
	. .venv/bin/activate

	# Install PyTorch with CUDA support
	uv pip install torch==${{ env.VER_TORCH}} torchvision --index-url https://download.pytorch.org/whl/${{ env.VER_CUDA_SHORT }}

	# Install QAI Hub packages
	uv pip install qai-hub qai-hub-models
	uv pip install pynvml pytest requests object-detection-metrics

	- name: Install AIMET
	run: \|
	. .venv/bin/activate
	export PATH="$HOME/.local/bin:$PATH"
	uv pip install downloads/*.whl
	python -c "import aimet_torch; print('AIMET Torch version:', aimet_torch.__version__)"

	- name: Verify PyTorch GPU
	run: \|
	. .venv/bin/activate
	python -c "
	import torch
	print('PyTorch version:', torch.__version__)
	print('CUDA available:', torch.cuda.is_available())
	if torch.cuda.is_available():
	print('CUDA device:', torch.cuda.get_device_name(0))
	print('CUDA version:', torch.version.cuda)
	assert torch.cuda.is_available(), 'CUDA not available!'
	"

	#=========================================================================
	# Configure AI Hub
	#=========================================================================

	- name: Configure AI Hub
	env:
	QAI_HUB_API_TOKEN: ${{ secrets.AIMET_BOT_AIHUB_DEV_TOKEN }}
	run: \|
	. .venv/bin/activate
	python -m ONNXRegression.workflow.utils configure-aihub

	#=========================================================================
	# Baseline management
	#=========================================================================

	- name: Download baseline from previous runs
	continue-on-error: true
	env:
	GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	run: \|
	. .venv/bin/activate
	python -m ONNXRegression.workflow.artifacts download-baseline \
	--output-dir ONNXRegression/baselines/downloaded

	- name: Setup baseline for comparison
	run: \|
	. .venv/bin/activate
	python -m ONNXRegression.workflow.utils setup-baseline \
	--suite "${{ github.event.inputs.suite \|\| 'nightly-torch' }}"

	#=========================================================================
	# Run test suite
	#=========================================================================

	- name: Run test suite
	run: \|
	. .venv/bin/activate
	python -m ONNXRegression.suite_runner \
	--suite "${{ github.event.inputs.suite \|\| 'nightly-torch' }}"

	#=========================================================================
	# Baseline comparison and reporting
	#=========================================================================

	- name: Compare with baseline and generate report
	run: \|
	. .venv/bin/activate

	SUITE="${{ github.event.inputs.suite \|\| 'nightly-torch' }}"
	RESULTS_CSV="ONNXRegression/reports/results_${SUITE}.csv"

	if [ ! -f "$RESULTS_CSV" ]; then
	echo "Results CSV not found: $RESULTS_CSV"
	exit 1
	fi

	python ONNXRegression/baseline_comparison.py run \
	--results "$RESULTS_CSV" \
	--suite-name "$SUITE" \
	--baselines-dir "ONNXRegression/baselines" \
	--github-summary

	#=========================================================================
	# Generate lockfile and artifacts
	#=========================================================================

	- name: Generate environment lockfile
	if: always()
	run: \|
	. .venv/bin/activate
	export PATH="$HOME/.local/bin:$PATH"
	python -m ONNXRegression.workflow.utils generate-lockfile \
	--run-id "${{ github.run_id }}"

	- name: Generate timestamp for artifacts
	if: always()
	id: timestamp
	run: \|
	echo "datetime=$(date '+%Y%m%d-%H%M%S')" >> $GITHUB_OUTPUT

	- name: Generate baseline artifact name
	id: artifact_name
	env:
	GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	run: \|
	. .venv/bin/activate
	ARTIFACT_NAME=$(python -c "
	from ONNXRegression.workflow.artifacts import ArtifactManager, WorkflowConfig
	config = WorkflowConfig.from_env()
	manager = ArtifactManager(config)
	print(manager.get_artifact_name('${{ github.ref_name }}', '${{ github.event.inputs.suite \|\| 'nightly-torch' }}'))
	")
	echo "name=$ARTIFACT_NAME" >> $GITHUB_OUTPUT
	echo "Generated artifact name: $ARTIFACT_NAME"

	#=========================================================================
	# Upload artifacts
	#=========================================================================

	- name: Upload baseline artifact
	uses: actions/upload-artifact@v3
	with:
	name: ${{ steps.artifact_name.outputs.name }}
	path: ONNXRegression/baselines/latest.json
	retention-days: 30
	if-no-files-found: error

	- name: Upload test reports
	if: always()
	uses: actions/upload-artifact@v3
	with:
	name: torch-test-reports-${{ steps.timestamp.outputs.datetime }}-${{ github.run_id }}
	path: ONNXRegression/reports/*/
	retention-days: 30

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Torch Nightly #35

Workflow file

Torch Nightly #35

Uh oh!

Workflow file for this run