From d879d66c62242fc79c3353ea6d53f0c5fb03054c Mon Sep 17 00:00:00 2001
From: feng0w0
Date: Wed, 21 Jan 2026 10:34:09 +0800
Subject: [PATCH] [NPU]: Support USP feature in NPU

---
 docs/zh/Pipeline_Usage/GPU_support_样例1.md        |  86 ------------
 docs/zh/Pipeline_Usage/GPU_support_样例2.md        | 129 ------------------
 docs/zh/Pipeline_Usage/Setup_NPU_样例1.md          |  55 --------
 docs/zh/Pipeline_Usage/Setup_NPU_样例2.md          |  56 --------
 .../full/Wan2.2-I2V-A14B-NPU.sh                    |  42 ------
 pyproject.toml                                     |  12 --
 6 files changed, 380 deletions(-)
 delete mode 100644 docs/zh/Pipeline_Usage/GPU_support_样例1.md
 delete mode 100644 docs/zh/Pipeline_Usage/GPU_support_样例2.md
 delete mode 100644 docs/zh/Pipeline_Usage/Setup_NPU_样例1.md
 delete mode 100644 docs/zh/Pipeline_Usage/Setup_NPU_样例2.md
 delete mode 100644 examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh

diff --git a/docs/zh/Pipeline_Usage/GPU_support_样例1.md b/docs/zh/Pipeline_Usage/GPU_support_样例1.md
deleted file mode 100644
index cfec21c..0000000
--- a/docs/zh/Pipeline_Usage/GPU_support_样例1.md
+++ /dev/null
@@ -1,86 +0,0 @@
-# GPU/NPU Support
-
-`DiffSynth-Studio` supports a variety of GPUs/NPUs. This document describes how to run model inference and training on these devices.
-
-Before starting, please follow [Installation](/docs/zh/Pipeline_Usage/Setup.md) to install the GPU/NPU-related dependency packages.
-
-## NVIDIA GPU
-
-All sample code in this project supports NVIDIA GPUs by default; no changes are required.
-
-## AMD GPU
-
-AMD provides ROCm-based torch packages, so most models run without code changes; a few models cannot run because they depend on CUDA-specific instructions.
-
-## Ascend NPU
-
-### Inference
-When using an Ascend NPU, change `"cuda"` to `"npu"` in the code.
-
-For example, the inference code for Wan2.1-T2V-1.3B:
-
-```diff
-import torch
-from diffsynth.utils.data import save_video, VideoData
-from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
-
-vram_config = {
-    "offload_dtype": "disk",
-    "offload_device": "disk",
-    "onload_dtype": torch.bfloat16,
-    "onload_device": "cpu",
-    "preparing_dtype": torch.bfloat16,
--    "preparing_device": "cuda",
-+    "preparing_device": "npu",
-    "computation_dtype": torch.bfloat16,
--    "computation_device": "cuda",
-+    "computation_device": "npu",
-}
-pipe = WanVideoPipeline.from_pretrained(
-    torch_dtype=torch.bfloat16,
--    device="cuda",
-+    device="npu",
-    model_configs=[
-        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", **vram_config),
-        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", **vram_config),
-        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", **vram_config),
-    ],
-    tokenizer_config=ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/umt5-xxl/"),
--    vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 2,
-+    vram_limit=torch.npu.mem_get_info("npu:0")[1] / (1024 ** 3) - 2,
-)
-
-video = pipe(
-    prompt="纪实摄影风格画面,一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄,两只耳朵立起,神情专注而欢快。阳光洒在它身上,使得毛发看上去格外柔软而闪亮。背景是一片开阔的草地,偶尔点缀着几朵野花,远处隐约可见蓝天和几片白云。透视感鲜明,捕捉小狗奔跑时的动感和四周草地的生机。中景侧面移动视角。",
-    negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
-    seed=0, tiled=True,
-)
-save_video(video, "video.mp4", fps=15, quality=5)
-```
-
-### Training
-NPU launch-script examples have been added for each model family, e.g. `examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh`.
-
-The NPU training scripts set performance-tuning environment variables; certain models also require additional arguments.
-
-#### Environment variables
-```
-export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-```
-expandable_segments: enables the memory pool's expandable-segments feature, i.e. virtual memory.
-
-```
-export CPU_AFFINITY_CONF=1
-```
-0 or unset: CPU core binding disabled
-
-1: coarse-grained core binding enabled
-
-2: fine-grained core binding enabled
-
-#### Model-specific arguments
-| Model | Argument | Notes |
-|-----------|------|-------------------|
-| Wan 14B series | --initialize_model_on_cpu | A 14B model does not fit in a single NPU's memory |
-
-
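The `"cuda"` → `"npu"` substitutions above follow a fixed pattern, so the device choice can also be made once at startup. A minimal sketch of backend-agnostic device selection, assuming `torch_npu` is installed on Ascend hosts; the `pick_device` helper is illustrative, not part of DiffSynth-Studio:

```python
# Minimal sketch: prefer "npu" when torch_npu is present, else "cuda"/"cpu".
import torch

try:
    import torch_npu  # noqa: F401  (importing registers the torch.npu namespace)
    HAS_NPU = torch.npu.is_available()
except ImportError:
    HAS_NPU = False


def pick_device() -> str:
    """Illustrative helper: prefer NPU, then CUDA, then CPU."""
    if HAS_NPU:
        return "npu"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"


device = pick_device()
# e.g. WanVideoPipeline.from_pretrained(..., device=device)
```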
diff --git a/docs/zh/Pipeline_Usage/GPU_support_样例2.md b/docs/zh/Pipeline_Usage/GPU_support_样例2.md
deleted file mode 100644
index 95a94ad..0000000
--- a/docs/zh/Pipeline_Usage/GPU_support_样例2.md
+++ /dev/null
@@ -1,129 +0,0 @@
-# GPU/NPU Support
-
-`DiffSynth-Studio` supports a variety of GPUs/NPUs. This document describes how to run model inference and training on these devices.
-
-Before starting, please follow [Installation](/docs/zh/Pipeline_Usage/Setup.md) to install the GPU/NPU-related dependency packages.
-
-## NVIDIA GPU
-
-All sample code in this project supports NVIDIA GPUs by default; no changes are required.
-
-## AMD GPU
-
-AMD provides ROCm-based torch packages, so most models run without code changes; a few models cannot run because they depend on CUDA-specific instructions.
-
-## Ascend NPU
-
-### Inference
-When using an Ascend NPU, change `"cuda"` to `"npu"` in the code.
-
-For example, the inference code for Wan2.1-T2V-1.3B:
-
-```diff
-import torch
-from diffsynth.utils.data import save_video, VideoData
-from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
-
-vram_config = {
-    "offload_dtype": "disk",
-    "offload_device": "disk",
-    "onload_dtype": torch.bfloat16,
-    "onload_device": "cpu",
-    "preparing_dtype": torch.bfloat16,
--    "preparing_device": "cuda",
-+    "preparing_device": "npu",
-    "computation_dtype": torch.bfloat16,
--    "computation_device": "cuda",
-+    "computation_device": "npu",
-}
-pipe = WanVideoPipeline.from_pretrained(
-    torch_dtype=torch.bfloat16,
--    device="cuda",
-+    device="npu",
-    model_configs=[
-        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", **vram_config),
-        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", **vram_config),
-        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", **vram_config),
-    ],
-    tokenizer_config=ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/umt5-xxl/"),
--    vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 2,
-+    vram_limit=torch.npu.mem_get_info("npu:0")[1] / (1024 ** 3) - 2,
-)
-
-video = pipe(
-    prompt="纪实摄影风格画面,一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄,两只耳朵立起,神情专注而欢快。阳光洒在它身上,使得毛发看上去格外柔软而闪亮。背景是一片开阔的草地,偶尔点缀着几朵野花,远处隐约可见蓝天和几片白云。透视感鲜明,捕捉小狗奔跑时的动感和四周草地的生机。中景侧面移动视角。",
-    negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
-    seed=0, tiled=True,
-)
-save_video(video, "video.mp4", fps=15, quality=5)
-```
-
-### Training
-When using an Ascend NPU, you can set performance-tuning environment variables; certain models also require additional arguments.
-
-For example, the training script for Wan2.2-I2V-A14B:
-```diff
-+ export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-+ export CPU_AFFINITY_CONF=1
-
-accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
-  --dataset_base_path data/example_video_dataset \
-  --dataset_metadata_path data/example_video_dataset/metadata.csv \
-  --height 480 \
-  --width 832 \
-  --num_frames 49 \
-  --dataset_repeat 100 \
-  --model_id_with_origin_paths "Wan-AI/Wan2.2-I2V-A14B:high_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-I2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-I2V-A14B:Wan2.1_VAE.pth" \
-  --learning_rate 1e-5 \
-  --num_epochs 2 \
-  --remove_prefix_in_ckpt "pipe.dit." \
-  --output_path "./models/train/Wan2.2-I2V-A14B_high_noise_full" \
-  --trainable_models "dit" \
-  --extra_inputs "input_image" \
-  --use_gradient_checkpointing_offload \
-  --max_timestep_boundary 0.358 \
-  --min_timestep_boundary 0 \
-+ --initialize_model_on_cpu
-# boundary corresponds to timesteps [900, 1000]
-
-accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
-  --dataset_base_path data/example_video_dataset \
-  --dataset_metadata_path data/example_video_dataset/metadata.csv \
-  --height 480 \
-  --width 832 \
-  --num_frames 49 \
-  --dataset_repeat 100 \
-  --model_id_with_origin_paths "Wan-AI/Wan2.2-I2V-A14B:low_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-I2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-I2V-A14B:Wan2.1_VAE.pth" \
-  --learning_rate 1e-5 \
-  --num_epochs 2 \
-  --remove_prefix_in_ckpt "pipe.dit." \
-  --output_path "./models/train/Wan2.2-I2V-A14B_low_noise_full" \
-  --trainable_models "dit" \
-  --extra_inputs "input_image" \
-  --use_gradient_checkpointing_offload \
-  --max_timestep_boundary 1 \
-  --min_timestep_boundary 0.358 \
-+ --initialize_model_on_cpu
-# boundary corresponds to timesteps [0, 900)
-```
-#### Environment variables
-```
-export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-```
-expandable_segments: enables the memory pool's expandable-segments feature, i.e. virtual memory.
-
-```
-export CPU_AFFINITY_CONF=1
-```
-0 or unset: CPU core binding disabled
-
-1: coarse-grained core binding enabled
-
-2: fine-grained core binding enabled
-
-#### Model-specific arguments
-| Model | Argument | Notes |
-|-----------|------|-------------------|
-| Wan 14B series | --initialize_model_on_cpu | A 14B model does not fit in a single NPU's memory |
-
-
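`PYTORCH_NPU_ALLOC_CONF` is read when the NPU caching allocator initializes, so both variables must be in place before the first NPU allocation. For Python entry points launched without a wrapper shell script, a minimal sketch, assuming the variables are set before `torch_npu` is imported (the script itself is illustrative, not repo code):

```python
# Minimal sketch: export the NPU tuning variables before torch_npu loads.
# setdefault keeps any values already exported by the launching shell.
import os

os.environ.setdefault("PYTORCH_NPU_ALLOC_CONF", "expandable_segments:True")
os.environ.setdefault("CPU_AFFINITY_CONF", "1")  # 1 = coarse-grained core binding

import torch       # noqa: E402
import torch_npu   # noqa: E402,F401  (registers the "npu" backend)

print("NPU available:", torch.npu.is_available())
```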
diff --git a/docs/zh/Pipeline_Usage/Setup_NPU_样例1.md b/docs/zh/Pipeline_Usage/Setup_NPU_样例1.md
deleted file mode 100644
index a4f90ea..0000000
--- a/docs/zh/Pipeline_Usage/Setup_NPU_样例1.md
+++ /dev/null
@@ -1,55 +0,0 @@
-# Installation
-
-Install from source (recommended):
-
-```
-git clone https://github.com/modelscope/DiffSynth-Studio.git
-cd DiffSynth-Studio
-pip install -e .
-```
-
-Install from PyPI (releases lag behind the source; for the latest features, install from source):
-
-```
-pip install diffsynth
-```
-
-## GPU/NPU Support
-
-* NVIDIA GPU
-
-Install as described above.
-
-* AMD GPU
-
-Install a ROCm-enabled `torch` package. Taking ROCm 6.4 on Linux as an example (this document was last updated on December 15, 2025), run:
-
-```shell
-pip install torch torchvision --index-url https://download.pytorch.org/whl/rocm6.4
-```
-
-* Ascend NPU
-
-1. Install [CANN](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/softwareinst/instg/instg_quick.html?Mode=PmIns&InstallType=local&OS=openEuler&Software=cannToolKit) following the official documentation
-
-2. Install diffsynth as described in [Installation](#installation)
-
-3. Install torch-npu. Ascend NPU support is provided through the `torch-npu` package. Taking version `2.7.1` as an example (this document was last updated on December 15, 2025), run:
-   ```shell
-   # aarch64/ARM
-   pip install torch-npu==2.7.1 torchvision==0.22.1
-   # x86
-   pip install torch==2.7.1+cpu torchvision==0.22.1+cpu --extra-index-url "https://download.pytorch.org/whl/cpu"
-   pip install torch-npu==2.7.1
-   ```
-
-When using an Ascend NPU, change `"cuda"` to `"npu"` in the Python code; see [NPU Support](/docs/zh/Pipeline_Usage/GPU_support.md#ascend-npu) for details.
-
-## Other installation issues
-
-If you encounter problems during installation, they may be caused by upstream dependency packages; please consult those packages' documentation:
-
-* [torch](https://pytorch.org/get-started/locally/)
-* [Ascend/pytorch](https://github.com/Ascend/pytorch)
-* [sentencepiece](https://github.com/google/sentencepiece)
-* [cmake](https://cmake.org)
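After installing CANN, diffsynth, and torch-npu, the pairing can be sanity-checked before running a pipeline. A minimal check, assuming a working CANN environment (the exact expected version strings are taken from the pins above):

```python
# Minimal post-install check: torch and torch_npu import cleanly, versions
# match the documented pins, and at least one NPU device is visible.
import torch
import torch_npu  # registers the torch.npu namespace

print("torch:", torch.__version__)          # expected to start with 2.7.1
print("torch_npu:", torch_npu.__version__)  # expected to start with 2.7.1
assert torch.npu.is_available(), "no NPU visible (check the CANN env, e.g. set_env.sh)"
print("NPU device count:", torch.npu.device_count())
```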
diff --git a/docs/zh/Pipeline_Usage/Setup_NPU_样例2.md b/docs/zh/Pipeline_Usage/Setup_NPU_样例2.md
deleted file mode 100644
index e0616cc..0000000
--- a/docs/zh/Pipeline_Usage/Setup_NPU_样例2.md
+++ /dev/null
@@ -1,56 +0,0 @@
-# Installation
-
-Install from source (recommended):
-
-```
-git clone https://github.com/modelscope/DiffSynth-Studio.git
-cd DiffSynth-Studio
-pip install -e .
-```
-
-Install from PyPI (releases lag behind the source; for the latest features, install from source):
-
-```
-pip install diffsynth
-```
-
-## GPU/NPU Support
-
-* NVIDIA GPU
-
-Install as described above.
-
-* AMD GPU
-
-Install a ROCm-enabled `torch` package. Taking ROCm 6.4 on Linux as an example (this document was last updated on December 15, 2025), run:
-
-```shell
-pip install torch torchvision --index-url https://download.pytorch.org/whl/rocm6.4
-```
-
-* Ascend NPU
-
-1. Install [CANN](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/softwareinst/instg/instg_quick.html?Mode=PmIns&InstallType=local&OS=openEuler&Software=cannToolKit) following the official documentation
-
-2. Install from source
-   ```shell
-   git clone https://github.com/modelscope/DiffSynth-Studio.git
-   cd DiffSynth-Studio
-   # aarch64/ARM
-   pip install -e .[npu_aarch64]
-   # x86
-   pip install -e .[npu] --extra-index-url "https://download.pytorch.org/whl/cpu"
-   ```
-
-
-
-When using an Ascend NPU, change `"cuda"` to `"npu"` in the Python code; see [NPU Support](/docs/zh/Pipeline_Usage/GPU_support.md#ascend-npu) for details.
-
-## Other installation issues
-
-If you encounter problems during installation, they may be caused by upstream dependency packages; please consult those packages' documentation:
-
-* [torch](https://pytorch.org/get-started/locally/)
-* [Ascend/pytorch](https://github.com/Ascend/pytorch)
-* [sentencepiece](https://github.com/google/sentencepiece)
-* [cmake](https://cmake.org)
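The choice between the `npu` and `npu_aarch64` extras depends only on the host architecture, so it can be automated. A small illustrative helper; the `npu_install_command` function and its flow are assumptions for illustration, not repo code (the `.[npu]` pin `torch==2.7.1+cpu` lives on the PyTorch CPU index, while the plain `2.7.1` aarch64 wheels are on PyPI):

```python
# Illustrative sketch: pick the DiffSynth NPU extra matching the host arch.
import platform
import subprocess
import sys


def npu_install_command() -> list[str]:
    cmd = [sys.executable, "-m", "pip", "install", "-e"]
    if platform.machine() in ("aarch64", "arm64"):
        cmd += [".[npu_aarch64]"]
    else:
        # +cpu wheels are hosted on the PyTorch CPU index, not PyPI.
        cmd += [".[npu]", "--extra-index-url", "https://download.pytorch.org/whl/cpu"]
    return cmd


if __name__ == "__main__":
    subprocess.run(npu_install_command(), check=True)
```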
diff --git a/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh b/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh
deleted file mode 100644
index b214af1..0000000
--- a/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-export CPU_AFFINITY_CONF=1
-
-accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
-  --dataset_base_path data/example_video_dataset \
-  --dataset_metadata_path data/example_video_dataset/metadata.csv \
-  --height 480 \
-  --width 832 \
-  --num_frames 49 \
-  --dataset_repeat 100 \
-  --model_id_with_origin_paths "Wan-AI/Wan2.2-I2V-A14B:high_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-I2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-I2V-A14B:Wan2.1_VAE.pth" \
-  --learning_rate 1e-5 \
-  --num_epochs 2 \
-  --remove_prefix_in_ckpt "pipe.dit." \
-  --output_path "./models/train/Wan2.2-I2V-A14B_high_noise_full" \
-  --trainable_models "dit" \
-  --extra_inputs "input_image" \
-  --use_gradient_checkpointing_offload \
-  --max_timestep_boundary 0.358 \
-  --min_timestep_boundary 0 \
-  --initialize_model_on_cpu
-# boundary corresponds to timesteps [900, 1000]
-
-accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
-  --dataset_base_path data/example_video_dataset \
-  --dataset_metadata_path data/example_video_dataset/metadata.csv \
-  --height 480 \
-  --width 832 \
-  --num_frames 49 \
-  --dataset_repeat 100 \
-  --model_id_with_origin_paths "Wan-AI/Wan2.2-I2V-A14B:low_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-I2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-I2V-A14B:Wan2.1_VAE.pth" \
-  --learning_rate 1e-5 \
-  --num_epochs 2 \
-  --remove_prefix_in_ckpt "pipe.dit." \
-  --output_path "./models/train/Wan2.2-I2V-A14B_low_noise_full" \
-  --trainable_models "dit" \
-  --extra_inputs "input_image" \
-  --use_gradient_checkpointing_offload \
-  --max_timestep_boundary 1 \
-  --min_timestep_boundary 0.358 \
-  --initialize_model_on_cpu
-# boundary corresponds to timesteps [0, 900)
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index c82697a..059e21d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,18 +32,6 @@ classifiers = [
     "Operating System :: OS Independent",
 ]
 
-[project.optional-dependencies]
-npu_aarch64 = [
-    "torch==2.7.1",
-    "torch-npu==2.7.1",
-    "torchvision==0.22.1"
-]
-npu = [
-    "torch==2.7.1+cpu",
-    "torch-npu==2.7.1",
-    "torchvision==0.22.1+cpu"
-]
-
 [tool.setuptools.packages.find]
 where = ["./"]
 include = ["diffsynth", "diffsynth.*"]
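The `# boundary corresponds to timesteps ...` comments line up with the `--max/--min_timestep_boundary` fractions if one assumes the common flow-matching sigma shift of 5; that shift value is an assumption, not stated anywhere in this patch. A small illustrative calculation:

```python
# Illustrative check: map a diffusion timestep to a boundary fraction under
# an assumed flow-matching shift of 5. With this mapping, timestep 900 gives
# ~0.357, matching the 0.358 split between the high- and low-noise experts.
def timestep_to_boundary(t: int, shift: float = 5.0, num_timesteps: int = 1000) -> float:
    u = (num_timesteps - t) / num_timesteps   # remaining fraction of the schedule
    return shift * u / (1 + (shift - 1) * u)  # shifted sigma value


print(timestep_to_boundary(900))   # ~0.357 -> the 0.358 boundary in the script
print(timestep_to_boundary(0))     # 1.0    -> max boundary of the low-noise run
print(timestep_to_boundary(1000))  # 0.0    -> min boundary of the high-noise run
```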