Torch Nightly #36
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Torch regression workflow: runs on a daily schedule and on manual dispatch.
name: Torch Nightly

on:
  schedule:
    # Daily at 08:00 UTC, two hours after the ONNX nightly (06:00 UTC).
    - cron: "0 8 * * *"
  workflow_dispatch:
    inputs:
      # Manual runs may pick a suite; the default matches the scheduled run.
      suite:
        description: "Test suite to run"
        type: choice
        required: false
        default: nightly-torch
        options:
          - nightly-torch
          - smoke-torch
          - weekly-torch

# Version pins for the toolchain. Quoted so "3.10" is not read as the float 3.1.
env:
  VER_PYTHON: "3.10"
  VER_TORCH: "2.*"
  VER_CUDA: "12.1.1"
  VER_CUDA_SHORT: "cu121"

jobs:
| #============================================================================= | |
| # Build AIMET wheel with Torch and GPU support | |
| #============================================================================= | |
| build-aimet-torch-gpu: | |
| name: Build AIMET (Torch + GPU) | |
| uses: ./.github/workflows/build-wheels.yml | |
| with: | |
| variants: > | |
| {"include":[{ | |
| "id":"torch-gpu", | |
| "runs-on":"k8s-gpu", | |
| "VER_PYTHON":"3.10", | |
| "VER_TORCH":"2.*", | |
| "VER_CUDA":"12.1.1", | |
| "ENABLE_TESTS":"OFF", | |
| "PIP_INDEX":"" | |
| }]} | |
| image-tag: latest | |
| secrets: inherit | |
| #============================================================================= | |
| # Run Torch regression tests on GPU | |
| #============================================================================= | |
| torch-gpu: | |
| name: Run Torch Regression (GPU) | |
| needs: build-aimet-torch-gpu | |
| runs-on: k8s-gpu | |
| container: | |
| image: "${{ vars.DOCKER_REGISTRY }}/${{ vars.DOCKER_IMAGE }}-torch-gpu:latest" | |
| credentials: | |
| username: ${{ secrets.DOCKER_LOGIN }} | |
| password: ${{ secrets.DOCKER_CREDENTIALS }} | |
| options: --gpus all | |
| timeout-minutes: 720 | |
| env: | |
| PYTHONUNBUFFERED: "1" | |
| INPUT_SUITE: ${{ github.event.inputs.suite || 'nightly-torch' }} | |
| WORKFLOW_FILE: "torch-nightly.yaml" | |
| PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" | |
| steps: | |
| #========================================================================= | |
| # Setup: Code and dependencies | |
| #========================================================================= | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Cache models and datasets | |
| uses: actions/cache@v4 | |
| with: | |
| path: | | |
| ~/.cache | |
| ~/.qai_hub_cache | |
| ~/.qaihm | |
| ~/.cache/huggingface | |
| ~/.cache/torch | |
| key: aimet-torch-cache-${{ runner.os }}-${{ hashFiles('ONNXRegression/**') }} | |
| restore-keys: aimet-torch-cache-${{ runner.os }}- | |
| - name: Download AIMET wheel | |
| uses: actions/download-artifact@v3 | |
| with: | |
| name: torch-gpu-wheel | |
| path: downloads | |
| - name: Install system dependencies | |
| run: | | |
| apt-get update -qq | |
| apt-get install -y \ | |
| libgl1-mesa-glx libglib2.0-0 libsm6 libxext6 \ | |
| libxrender-dev libgomp1 git openssh-client unzip curl | |
| apt-get clean | |
| rm -rf /var/lib/apt/lists/* | |
| - name: Setup Python environment with UV | |
| run: | | |
| set -e | |
| apt-get update -qq | |
| apt-get install -y python${{ env.VER_PYTHON}} python${{ env.VER_PYTHON}}-venv python${{ env.VER_PYTHON}}-dev | |
| curl -LsSf https://astral.sh/uv/install.sh | sh | |
| export PATH="$HOME/.local/bin:$PATH" | |
| uv venv .venv --python python${{ env.VER_PYTHON}} | |
| . .venv/bin/activate | |
| # Install PyTorch with CUDA support | |
| uv pip install torch==${{ env.VER_TORCH}} torchvision --index-url https://download.pytorch.org/whl/${{ env.VER_CUDA_SHORT }} | |
| # Install QAI Hub packages | |
| uv pip install qai-hub qai-hub-models==0.43.0 # TODO: Remove after next QAI Hub Models release | |
| uv pip install pynvml pytest requests object-detection-metrics | |
| - name: Install AIMET | |
| run: | | |
| . .venv/bin/activate | |
| export PATH="$HOME/.local/bin:$PATH" | |
| uv pip install downloads/*.whl | |
| python -c "import aimet_torch; print('AIMET Torch version:', aimet_torch.__version__)" | |
| - name: Verify PyTorch GPU | |
| run: | | |
| . .venv/bin/activate | |
| python -c " | |
| import torch | |
| print('PyTorch version:', torch.__version__) | |
| print('CUDA available:', torch.cuda.is_available()) | |
| if torch.cuda.is_available(): | |
| print('CUDA device:', torch.cuda.get_device_name(0)) | |
| print('CUDA version:', torch.version.cuda) | |
| assert torch.cuda.is_available(), 'CUDA not available!' | |
| " | |
| #========================================================================= | |
| # Configure AI Hub | |
| #========================================================================= | |
| - name: Configure AI Hub | |
| env: | |
| QAI_HUB_API_TOKEN: ${{ secrets.AIMET_BOT_AIHUB_DEV_TOKEN }} | |
| run: | | |
| . .venv/bin/activate | |
| python -m ONNXRegression.workflow.utils configure-aihub | |
| #========================================================================= | |
| # Baseline management | |
| #========================================================================= | |
| - name: Download baseline from previous runs | |
| continue-on-error: true | |
| env: | |
| GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | |
| run: | | |
| . .venv/bin/activate | |
| python -m ONNXRegression.workflow.artifacts download-baseline \ | |
| --output-dir ONNXRegression/baselines/downloaded | |
| - name: Setup baseline for comparison | |
| run: | | |
| . .venv/bin/activate | |
| python -m ONNXRegression.workflow.utils setup-baseline \ | |
| --suite "${{ github.event.inputs.suite || 'nightly-torch' }}" | |
| #========================================================================= | |
| # Run test suite | |
| #========================================================================= | |
| - name: Run test suite | |
| run: | | |
| . .venv/bin/activate | |
| python -m ONNXRegression.suite_runner \ | |
| --suite "${{ github.event.inputs.suite || 'nightly-torch' }}" | |
| #========================================================================= | |
| # Baseline comparison and reporting | |
| #========================================================================= | |
| - name: Compare with baseline and generate report | |
| run: | | |
| . .venv/bin/activate | |
| SUITE="${{ github.event.inputs.suite || 'nightly-torch' }}" | |
| RESULTS_CSV="ONNXRegression/reports/results_${SUITE}.csv" | |
| if [ ! -f "$RESULTS_CSV" ]; then | |
| echo "Results CSV not found: $RESULTS_CSV" | |
| exit 1 | |
| fi | |
| python ONNXRegression/baseline_comparison.py run \ | |
| --results "$RESULTS_CSV" \ | |
| --suite-name "$SUITE" \ | |
| --baselines-dir "ONNXRegression/baselines" \ | |
| --github-summary | |
| #========================================================================= | |
| # Generate lockfile and artifacts | |
| #========================================================================= | |
| - name: Generate environment lockfile | |
| if: always() | |
| run: | | |
| . .venv/bin/activate | |
| export PATH="$HOME/.local/bin:$PATH" | |
| python -m ONNXRegression.workflow.utils generate-lockfile \ | |
| --run-id "${{ github.run_id }}" | |
| - name: Generate timestamp for artifacts | |
| if: always() | |
| id: timestamp | |
| run: | | |
| echo "datetime=$(date '+%Y%m%d-%H%M%S')" >> $GITHUB_OUTPUT | |
| - name: Generate baseline artifact name | |
| id: artifact_name | |
| env: | |
| GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | |
| run: | | |
| . .venv/bin/activate | |
| ARTIFACT_NAME=$(python -c " | |
| from ONNXRegression.workflow.artifacts import ArtifactManager, WorkflowConfig | |
| config = WorkflowConfig.from_env() | |
| manager = ArtifactManager(config) | |
| print(manager.get_artifact_name('${{ github.ref_name }}', '${{ github.event.inputs.suite || 'nightly-torch' }}')) | |
| ") | |
| echo "name=$ARTIFACT_NAME" >> $GITHUB_OUTPUT | |
| echo "Generated artifact name: $ARTIFACT_NAME" | |
| #========================================================================= | |
| # Upload artifacts | |
| #========================================================================= | |
| - name: Upload baseline artifact | |
| uses: actions/upload-artifact@v3 | |
| with: | |
| name: ${{ steps.artifact_name.outputs.name }} | |
| path: ONNXRegression/baselines/latest.json | |
| retention-days: 30 | |
| if-no-files-found: error | |
| - name: Upload test reports | |
| if: always() | |
| uses: actions/upload-artifact@v3 | |
| with: | |
| name: torch-test-reports-${{ steps.timestamp.outputs.datetime }}-${{ github.run_id }} | |
| path: ONNXRegression/reports/**/* | |
| retention-days: 30 |