Skip to content

CICD NeMo

CICD NeMo #1378

Workflow file for this run

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: CICD NeMo
on:
schedule:
- cron: "0 */2 * * *"
push:
branches:
- main
- "pull-request/[0-9]+"
- "deploy-release/*"
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
pre-flight:
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.53.0
linting:
runs-on: ubuntu-latest
needs: [pre-flight]
if: |
!(needs.pre-flight.outputs.is_deployment_workflow == 'true'
|| needs.pre-flight.outputs.docs_only == 'true')
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
- name: Install ruff
run: |
pip install ruff
# - name: Check lint
# run: |
# pip install pre-commit==3.6.0
# pre-commit install
# pre-commit run --all-files --show-diff-on-failure --color=always
cicd-wait-in-queue:
runs-on: ubuntu-latest
needs: [pre-flight, linting]
environment: test
if: |
!(needs.pre-flight.outputs.is_ci_workload == 'true'
|| needs.pre-flight.outputs.is_deployment_workflow == 'true'
|| needs.pre-flight.outputs.docs_only == 'true')
steps:
- name: Running CI tests
run: |
echo "Running CI tests"
cicd-unit-tests-latest:
strategy:
fail-fast: false
matrix:
include:
- bucket: "unit_tests"
- bucket: "unit_tests/data/"
- bucket: "unit_tests/dist_checkpointing/*.py"
- bucket: "unit_tests/dist_checkpointing/models/"
- bucket: "unit_tests/transformer/*.py"
- bucket: "unit_tests/transformer/moe"
needs: [pre-flight, cicd-wait-in-queue]
runs-on: nvidia-ci-aws-gpu-x8
name: "${{ matrix.bucket }} - latest"
environment: nemo-ci
if: |
(
success()
|| needs.pre-flight.outputs.is_ci_workload == 'true'
|| needs.pre-flight.outputs.force_run_all == 'true'
)
&& !cancelled()
steps:
- name: Checkout
uses: actions/checkout@v4
- name: main
uses: ./.github/actions
with:
test_case: tests/${{ matrix.bucket }}
tag: latest
timeout: ${{ matrix.timeout || 30 }}
is_unit_test: "true"
PAT: ${{ secrets.PAT }}
cicd-functional-tests-latest:
strategy:
fail-fast: false
matrix:
include:
- model: "gpt"
test_case: "gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G"
- model: "gpt"
test_case: "gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G"
- model: "moe"
test_case: "gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer"
- model: "moe"
test_case: "gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed"
needs:
- pre-flight
- cicd-wait-in-queue
- cicd-unit-tests-latest
runs-on: nvidia-ci-aws-gpu-x8
name: "${{ matrix.model }}/${{ matrix.test_case }} - latest"
environment: nemo-ci
if: |
(
success()
|| needs.pre-flight.outputs.is_ci_workload == 'true'
|| needs.pre-flight.outputs.force_run_all == 'true'
)
&& !cancelled()
steps:
- name: Checkout
uses: actions/checkout@v4
- name: main
uses: ./.github/actions
with:
test_case: ${{ matrix.test_case }}
model: ${{ matrix.model }}
tag: latest
timeout: ${{ matrix.timeout || 30 }}
is_unit_test: "false"
PAT: ${{ secrets.PAT }}
Nemo_CICD_Test:
needs:
- pre-flight
- cicd-unit-tests-latest
- cicd-functional-tests-latest
if: |
(
needs.pre-flight.outputs.docs_only == 'true'
|| needs.pre-flight.outputs.is_deployment_workflow == 'true'
|| success()
)
&& !cancelled()
runs-on: ubuntu-latest
permissions: write-all
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Get workflow result
id: result
env:
GH_TOKEN: ${{ github.token }}
RUN_ID: ${{ github.run_id }}
run: |
# Get workflow run details and check job conclusions
LATEST_ATTEMPT=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion != null) | .conclusion] | last')
NUM_FAILED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "failure") | .name] | length')
NUM_CANCELLED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "cancelled") | .name] | length')
if [[ $NUM_FAILED -eq 0 && $NUM_CANCELLED -eq 0 ]]; then
RESULT="success"
elif [[ $NUM_CANCELLED -gt 0 ]]; then
RESULT="cancelled"
else
RESULT="failure"
fi
# Output the final status
echo "code=$RESULT" | tee -a $GITHUB_OUTPUT
- name: Checkout for GH CLI
uses: actions/checkout@v4
- name: Remove label if not cancelled
if: |
steps.result.outputs.code != 'cancelled'
&& github.event.label.name == 'Run CICD'
&& github.event.pull_request.head.repo.full_name == github.repository
env:
GH_TOKEN: ${{ github.token }}
PR_NUMBER: ${{ github.event.number }}
run: gh pr edit "$PR_NUMBER" --remove-label "Run CICD"
- name: Pipeline successful, add PR comment
if: |
steps.result.outputs.code == 'success'
&& github.event_name == 'pull_request'
&& env.SLACK_WEBHOOK != ''
uses: peter-evans/create-or-update-comment@v4
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPOSITORY: ${{ github.repository }}
RUN_ID: ${{ github.run_id }}
with:
issue-number: ${{ github.event.number }}
body: |
[🤖]: Hi @${{ github.event.pull_request.user.login }} 👋,
We wanted to let you know that a [CICD pipeline](https://github.com/${{ env.REPOSITORY }}/actions/runs/${{ env.RUN_ID }}) for this PR just finished successfully.
So it might be time to merge this PR or get some approvals.
//cc @chtruong814 @ko3n1g @pablo-garay @thomasdhc
- name: "Pipeline not successful and not cancelled: Send Slack alert & create step summary"
if: |
steps.result.outputs.code == 'failure'
&& github.event.label.name == 'Run CICD'
&& env.SLACK_WEBHOOK != ''
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPOSITORY: ${{ github.repository }}
RUN_ID: ${{ github.run_id }}
PR_NUMBER: ${{ github.event.number }}
SERVER_URL: ${{ github.server_url }}
run: |
set -x
pip install PyGithub
export BRANCH_NAME=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}
python .github/scripts/notify.py
- name: Exit
if: ${{ always() }}
env:
RESULT: ${{ steps.result.outputs.code }}
run: |
if [ $RESULT == "success" ]; then
exit 0
else
exit 1
fi
Coverage_Fake:
runs-on: ubuntu-latest
needs: [Nemo_CICD_Test, pre-flight]
if: |
(needs.pre-flight.outputs.docs_only == 'true'
|| needs.pre-flight.outputs.is_deployment_workflow == 'true')
&& !cancelled()
environment: nemo-ci
steps:
- name: Generate fake coverage report
uses: actions/github-script@v6
with:
github-token: ${{ secrets.PAT }}
script: |
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,
repo: context.repo.repo,
sha: context.sha,
state: 'success',
description: 'No code changes - coverage check skipped',
context: 'codecov/patch'
});
Coverage:
runs-on: ubuntu-latest
needs: [Nemo_CICD_Test]
if: |
(
needs.pre-flight.outputs.docs_only == 'false'
&& needs.pre-flight.outputs.is_deployment_workflow == 'false'
&& success()
)
&& !cancelled()
strategy:
matrix:
flag: [unit-test, e2e]
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Download coverage reports of current branch
uses: actions/download-artifact@v4
with:
pattern: coverage-${{ matrix.flag }}-*
- name: List coverage files
run: find . -type f -name "*.xml" -o -name "*.lcov"
- name: Get total coverage of current branch
shell: bash -x -e -u -o pipefail {0}
if: always()
run: |
pip install coverage
ls -al .
ls -al coverage-*/
coverage combine --keep $(ls coverage-*/.coverage)
coverage report -i
rm -rf coverage-*
ls -al
- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
verbose: true
flags: ${{ matrix.flag }}
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: coverage-${{ matrix.flag }}-aggregated
path: |
.coverage
include-hidden-files: true