# Fix force_save_fsdp_all_gather and split_fsdp_prefetch #972

	name: Test CUDA

	on:
	pull_request:
	push:
	branches:
	- main
	- release/*

	concurrency:
	group: test-cuda-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number \|\| github.ref }}
	cancel-in-progress: true

	jobs:
	test-cuda:
	name: Test CUDA (cuda12.6-py3.12)
	uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
	strategy:
	fail-fast: true
	matrix:
	include:
	- name: 12xlargegpu
	runs-on: linux.g5.12xlarge.nvidia.gpu
	torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
	gpu-arch-type: "cuda"
	gpu-arch-version: "12.6"
	with:
	timeout: 60
	runner: ${{ matrix.runs-on }}
	gpu-arch-type: ${{ matrix.gpu-arch-type }}
	gpu-arch-version: ${{ matrix.gpu-arch-version }}
	submodules: recursive
	script: \|
	conda create --yes --quiet --name py312 python=3.12
	source $(conda info --base)/etc/profile.d/conda.sh
	conda activate py312

	pip install --quiet -r requirements-test.txt
	# For some reason the spec above isnt working
	pip uninstall -y torch
	pip install --no-input --quiet --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
	pip install --quiet .
	pytest tests
	python examples/example_autoparallel.py
	python examples/example_llama3.py
	python examples/example_dcp.py
	python examples/example_local_map.py
	python examples/example_pp_graph_passes.py
	torchrun --standalone --nproc-per-node 4 examples/example_ds3_local_map.py

# Provide feedback