From a5935e973a5efb252f8f8ee7fa9631461ea628b8 Mon Sep 17 00:00:00 2001
From: feng0w0 <houyufeng4@huawei.com>
Date: Mon, 29 Dec 2025 09:23:59 +0800
Subject: [PATCH] =?UTF-8?q?=E8=AE=AD=E7=BB=83=E5=BF=AB=E9=80=9F=E4=B8=8A?=
 =?UTF-8?q?=E6=89=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/zh/Pipeline_Usage/GPU_support_样例1.md   |  86 ++++++++++++
 docs/zh/Pipeline_Usage/GPU_support_样例2.md   | 129 ++++++++++++++++++
 .../full/Wan2.2-I2V-A14B-NPU.sh               |  42 ++++++
 3 files changed, 257 insertions(+)
 create mode 100644 docs/zh/Pipeline_Usage/GPU_support_样例1.md
 create mode 100644 docs/zh/Pipeline_Usage/GPU_support_样例2.md
 create mode 100644 examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh

diff --git a/docs/zh/Pipeline_Usage/GPU_support_样例1.md b/docs/zh/Pipeline_Usage/GPU_support_样例1.md
new file mode 100644
index 0000000..03138cc
--- /dev/null
+++ b/docs/zh/Pipeline_Usage/GPU_support_样例1.md
@@ -0,0 +1,86 @@
+# GPU/NPU 支持
+
+`DiffSynth-Studio` 支持多种 GPU/NPU，本文介绍如何在这些设备上运行模型推理和训练。
+
+在开始前，请参考[安装依赖](/docs/zh/Pipeline_Usage/Setup.md)安装好 GPU/NPU 相关的依赖包。
+
+## NVIDIA GPU
+
+本项目提供的所有样例代码默认支持 NVIDIA GPU，无需额外修改。
+
+## AMD GPU
+
+AMD 提供了基于 ROCm 的 torch 包，所以大多数模型无需修改代码即可运行，少数模型由于依赖特定的 cuda 指令无法运行。
+
+## Ascend NPU
+
+### 推理
+使用 Ascend NPU 时，需把代码中的 `"cuda"` 改为 `"npu"`，并把 `torch.cuda` 相关接口替换为对应的 `torch.npu` 接口。
+
+例如，Wan2.1-T2V-1.3B 的推理代码：
+
+```diff
+import torch
+from diffsynth.utils.data import save_video, VideoData
+from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
+
+vram_config = {
+    "offload_dtype": "disk",
+    "offload_device": "disk",
+    "onload_dtype": torch.bfloat16,
+    "onload_device": "cpu",
+    "preparing_dtype": torch.bfloat16,
+-   "preparing_device": "cuda",
++   "preparing_device": "npu",
+    "computation_dtype": torch.bfloat16,
+-   "computation_device": "cuda",
++   "computation_device": "npu",
+}
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+-   device="cuda",
++   device="npu",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", **vram_config),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", **vram_config),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", **vram_config),
+    ],
+    tokenizer_config=ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/umt5-xxl/"),
+-   vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 2,
++   vram_limit=torch.npu.mem_get_info("npu:0")[1] / (1024 ** 3) - 2,
+)
+
+video = pipe(
+    prompt="纪实摄影风格画面，一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄，两只耳朵立起，神情专注而欢快。阳光洒在它身上，使得毛发看上去格外柔软而闪亮。背景是一片开阔的草地，偶尔点缀着几朵野花，远处隐约可见蓝天和几片白云。透视感鲜明，捕捉小狗奔跑时的动感和四周草地的生机。中景侧面移动视角。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    seed=0, tiled=True,
+)
+save_video(video, "video.mp4", fps=15, quality=5)
+```
+
+### 训练
+当前已为每类模型添加NPU的启动脚本样例，例如 `examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh`
+
+NPU 训练脚本中添加了用于优化性能的环境变量；针对特定模型，还额外添加了一些参数，说明如下。
+
+#### 环境变量
+```
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+```
+`expandable_segments:<value>`：启用内存池扩展段功能，即虚拟内存特性。
+
+```
+export CPU_AFFINITY_CONF=1
+```
+`CPU_AFFINITY_CONF` 设置为 0 或未设置: 表示不启用绑核功能
+
+1: 表示开启粗粒度绑核
+
+2: 表示开启细粒度绑核
+
+#### 特定模型参数
+| 模型        | 参数 | 备注                |
+|-----------|------|-------------------|
+| Wan 14B系列 | --initialize_model_on_cpu | 单卡 NPU 显存不足以存放 14B 模型，需先在 CPU 上初始化模型 |
+
+
diff --git a/docs/zh/Pipeline_Usage/GPU_support_样例2.md b/docs/zh/Pipeline_Usage/GPU_support_样例2.md
new file mode 100644
index 0000000..615cde6
--- /dev/null
+++ b/docs/zh/Pipeline_Usage/GPU_support_样例2.md
@@ -0,0 +1,129 @@
+# GPU/NPU 支持
+
+`DiffSynth-Studio` 支持多种 GPU/NPU，本文介绍如何在这些设备上运行模型推理和训练。
+
+在开始前，请参考[安装依赖](/docs/zh/Pipeline_Usage/Setup.md)安装好 GPU/NPU 相关的依赖包。
+
+## NVIDIA GPU
+
+本项目提供的所有样例代码默认支持 NVIDIA GPU，无需额外修改。
+
+## AMD GPU
+
+AMD 提供了基于 ROCm 的 torch 包，所以大多数模型无需修改代码即可运行，少数模型由于依赖特定的 cuda 指令无法运行。
+
+## Ascend NPU
+
+### 推理
+使用 Ascend NPU 时，需把代码中的 `"cuda"` 改为 `"npu"`，并把 `torch.cuda` 相关接口替换为对应的 `torch.npu` 接口。
+
+例如，Wan2.1-T2V-1.3B 的推理代码：
+
+```diff
+import torch
+from diffsynth.utils.data import save_video, VideoData
+from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
+
+vram_config = {
+    "offload_dtype": "disk",
+    "offload_device": "disk",
+    "onload_dtype": torch.bfloat16,
+    "onload_device": "cpu",
+    "preparing_dtype": torch.bfloat16,
+-   "preparing_device": "cuda",
++   "preparing_device": "npu",
+    "computation_dtype": torch.bfloat16,
+-   "computation_device": "cuda",
++   "computation_device": "npu",
+}
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+-   device="cuda",
++   device="npu",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", **vram_config),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", **vram_config),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", **vram_config),
+    ],
+    tokenizer_config=ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/umt5-xxl/"),
+-   vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 2,
++   vram_limit=torch.npu.mem_get_info("npu:0")[1] / (1024 ** 3) - 2,
+)
+
+video = pipe(
+    prompt="纪实摄影风格画面，一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄，两只耳朵立起，神情专注而欢快。阳光洒在它身上，使得毛发看上去格外柔软而闪亮。背景是一片开阔的草地，偶尔点缀着几朵野花，远处隐约可见蓝天和几片白云。透视感鲜明，捕捉小狗奔跑时的动感和四周草地的生机。中景侧面移动视角。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    seed=0, tiled=True,
+)
+save_video(video, "video.mp4", fps=15, quality=5)
+```
+
+### 训练
+使用 Ascend NPU 训练时，可以添加用于优化性能的环境变量；针对特定模型，还需额外添加参数。
+
+例如，Wan2.2-I2V-A14B 的训练代码：
+```diff
++ export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
++ export CPU_AFFINITY_CONF=1
+
+accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --num_frames 49 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.2-I2V-A14B:high_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-I2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-I2V-A14B:Wan2.1_VAE.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.2-I2V-A14B_high_noise_full" \
+  --trainable_models "dit" \
+  --extra_inputs "input_image" \
+  --use_gradient_checkpointing_offload \
+  --max_timestep_boundary 0.358 \
+  --min_timestep_boundary 0 \
++  --initialize_model_on_cpu
+# boundary corresponds to timesteps [900, 1000]
+
+accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --num_frames 49 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.2-I2V-A14B:low_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-I2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-I2V-A14B:Wan2.1_VAE.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.2-I2V-A14B_low_noise_full" \
+  --trainable_models "dit" \
+  --extra_inputs "input_image" \
+  --use_gradient_checkpointing_offload \
+  --max_timestep_boundary 1 \
+  --min_timestep_boundary 0.358 \
++  --initialize_model_on_cpu
+# boundary corresponds to timesteps [0, 900)
+```
+#### 环境变量
+```
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+```
+`expandable_segments:<value>`：启用内存池扩展段功能，即虚拟内存特性。
+
+```
+export CPU_AFFINITY_CONF=1
+```
+`CPU_AFFINITY_CONF` 设置为 0 或未设置: 表示不启用绑核功能
+
+1: 表示开启粗粒度绑核
+
+2: 表示开启细粒度绑核
+
+#### 特定模型参数
+| 模型        | 参数 | 备注                |
+|-----------|------|-------------------|
+| Wan 14B系列 | --initialize_model_on_cpu | 单卡 NPU 显存不足以存放 14B 模型，需先在 CPU 上初始化模型 |
+
+
diff --git a/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh b/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh
new file mode 100644
index 0000000..b214af1
--- /dev/null
+++ b/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh
@@ -0,0 +1,42 @@
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export CPU_AFFINITY_CONF=1
+
+accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --num_frames 49 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.2-I2V-A14B:high_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-I2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-I2V-A14B:Wan2.1_VAE.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.2-I2V-A14B_high_noise_full" \
+  --trainable_models "dit" \
+  --extra_inputs "input_image" \
+  --use_gradient_checkpointing_offload \
+  --max_timestep_boundary 0.358 \
+  --min_timestep_boundary 0 \
+  --initialize_model_on_cpu
+# boundary corresponds to timesteps [900, 1000]
+
+accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --num_frames 49 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.2-I2V-A14B:low_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-I2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-I2V-A14B:Wan2.1_VAE.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.2-I2V-A14B_low_noise_full" \
+  --trainable_models "dit" \
+  --extra_inputs "input_image" \
+  --use_gradient_checkpointing_offload \
+  --max_timestep_boundary 1 \
+  --min_timestep_boundary 0.358 \
+  --initialize_model_on_cpu
+# boundary corresponds to timesteps [0, 900)
\ No newline at end of file