From d879d66c62242fc79c3353ea6d53f0c5fb03054c Mon Sep 17 00:00:00 2001
From: feng0w0
Date: Wed, 21 Jan 2026 10:34:09 +0800
Subject: [PATCH] [NPU]: Support USP feature in NPU

---
 docs/zh/Pipeline_Usage/GPU_support_样例1.md        |  86 ------------
 docs/zh/Pipeline_Usage/GPU_support_样例2.md        | 129 ------------------
 docs/zh/Pipeline_Usage/Setup_NPU_样例1.md          |  55 --------
 docs/zh/Pipeline_Usage/Setup_NPU_样例2.md          |  56 --------
 .../full/Wan2.2-I2V-A14B-NPU.sh                    |  42 ------
 pyproject.toml                                     |  12 --
 6 files changed, 380 deletions(-)
 delete mode 100644 docs/zh/Pipeline_Usage/GPU_support_样例1.md
 delete mode 100644 docs/zh/Pipeline_Usage/GPU_support_样例2.md
 delete mode 100644 docs/zh/Pipeline_Usage/Setup_NPU_样例1.md
 delete mode 100644 docs/zh/Pipeline_Usage/Setup_NPU_样例2.md
 delete mode 100644 examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh

diff --git a/docs/zh/Pipeline_Usage/GPU_support_样例1.md b/docs/zh/Pipeline_Usage/GPU_support_样例1.md
deleted file mode 100644
index cfec21c..0000000
--- a/docs/zh/Pipeline_Usage/GPU_support_样例1.md
+++ /dev/null
@@ -1,86 +0,0 @@
-# GPU/NPU Support
-
-`DiffSynth-Studio` supports a variety of GPUs/NPUs. This document describes how to run model inference and training on these devices.
-
-Before starting, please follow [Installation](/docs/zh/Pipeline_Usage/Setup.md) to install the GPU/NPU-related dependency packages.
-
-## NVIDIA GPU
-
-All sample code in this project supports NVIDIA GPUs by default; no changes are required.
-
-## AMD GPU
-
-AMD provides ROCm-based torch packages, so most models run without code changes; a few models cannot run because they depend on CUDA-specific instructions.
-
-## Ascend NPU
-
-### Inference
-When using an Ascend NPU, change `"cuda"` to `"npu"` in the code.
-
-For example, the inference code for Wan2.1-T2V-1.3B:
-
-```diff
-import torch
-from diffsynth.utils.data import save_video, VideoData
-from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
-
-vram_config = {
-    "offload_dtype": "disk",
-    "offload_device": "disk",
-    "onload_dtype": torch.bfloat16,
-    "onload_device": "cpu",
-    "preparing_dtype": torch.bfloat16,
--    "preparing_device": "cuda",
-+    "preparing_device": "npu",
-    "computation_dtype": torch.bfloat16,
--    "computation_device": "cuda",
-+    "computation_device": "npu",
-}
-pipe = WanVideoPipeline.from_pretrained(
-    torch_dtype=torch.bfloat16,
--    device="cuda",
-+    device="npu",
-    model_configs=[
-        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", **vram_config),
-        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", **vram_config),
-        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", **vram_config),
-    ],
-    tokenizer_config=ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/umt5-xxl/"),
--    vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 2,
-+    vram_limit=torch.npu.mem_get_info("npu:0")[1] / (1024 ** 3) - 2,
-)
-
-video = pipe(
-    prompt="纪实摄影风格画面,一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄,两只耳朵立起,神情专注而欢快。阳光洒在它身上,使得毛发看上去格外柔软而闪亮。背景是一片开阔的草地,偶尔点缀着几朵野花,远处隐约可见蓝天和几片白云。透视感鲜明,捕捉小狗奔跑时的动感和四周草地的生机。中景侧面移动视角。",
-    negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
-    seed=0, tiled=True,
-)
-save_video(video, "video.mp4", fps=15, quality=5)
-```
-
-### Training
-NPU launch-script examples have been added for each model family, e.g. `examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh`.
-
-The NPU training scripts set performance-tuning environment variables; certain models also require additional arguments.
-
-#### Environment variables
-```
-export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-```
-expandable_segments: enables the memory pool's expandable-segments feature, i.e. virtual memory.
-
-```
-export CPU_AFFINITY_CONF=1
-```
-0 or unset: CPU core binding disabled
-
-1: coarse-grained core binding enabled
-
-2: fine-grained core binding enabled
-
-#### Model-specific arguments
-| Model | Argument | Notes |
-|-----------|------|-------------------|
-| Wan 14B series | --initialize_model_on_cpu | A 14B model does not fit in a single NPU's memory |
-
-
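The `"cuda"` → `"npu"` substitutions above follow a fixed pattern, so the device choice can also be made once at startup. A minimal sketch of backend-agnostic device selection, assuming `torch_npu` is installed on Ascend hosts; the `pick_device` helper is illustrative, not part of DiffSynth-Studio:

```python
# Minimal sketch: prefer "npu" when torch_npu is present, else "cuda"/"cpu".
import torch

try:
    import torch_npu  # noqa: F401  (importing registers the torch.npu namespace)
    HAS_NPU = torch.npu.is_available()
except ImportError:
    HAS_NPU = False


def pick_device() -> str:
    """Illustrative helper: prefer NPU, then CUDA, then CPU."""
    if HAS_NPU:
        return "npu"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"


device = pick_device()
# e.g. WanVideoPipeline.from_pretrained(..., device=device)
```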
diff --git a/docs/zh/Pipeline_Usage/GPU_support_样例2.md b/docs/zh/Pipeline_Usage/GPU_support_样例2.md
deleted file mode 100644
index 95a94ad..0000000
--- a/docs/zh/Pipeline_Usage/GPU_support_样例2.md
+++ /dev/null
@@ -1,129 +0,0 @@
-# GPU/NPU Support
-
-`DiffSynth-Studio` supports a variety of GPUs/NPUs. This document describes how to run model inference and training on these devices.
-
-Before starting, please follow [Installation](/docs/zh/Pipeline_Usage/Setup.md) to install the GPU/NPU-related dependency packages.
-
-## NVIDIA GPU
-
-All sample code in this project supports NVIDIA GPUs by default; no changes are required.
-
-## AMD GPU
-
-AMD provides ROCm-based torch packages, so most models run without code changes; a few models cannot run because they depend on CUDA-specific instructions.
-
-## Ascend NPU
-
-### Inference
-When using an Ascend NPU, change `"cuda"` to `"npu"` in the code.
-
-For example, the inference code for Wan2.1-T2V-1.3B:
-
-```diff
-import torch
-from diffsynth.utils.data import save_video, VideoData
-from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
-
-vram_config = {
-    "offload_dtype": "disk",
-    "offload_device": "disk",
-    "onload_dtype": torch.bfloat16,
-    "onload_device": "cpu",
-    "preparing_dtype": torch.bfloat16,
--    "preparing_device": "cuda",
-+    "preparing_device": "npu",
-    "computation_dtype": torch.bfloat16,
--    "computation_device": "cuda",
-+    "computation_device": "npu",
-}
-pipe = WanVideoPipeline.from_pretrained(
-    torch_dtype=torch.bfloat16,
--    device="cuda",
-+    device="npu",
-    model_configs=[
-        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", **vram_config),
-        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", **vram_config),
-        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", **vram_config),
-    ],
-    tokenizer_config=ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/umt5-xxl/"),
--    vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 2,
-+    vram_limit=torch.npu.mem_get_info("npu:0")[1] / (1024 ** 3) - 2,
-)
-
-video = pipe(
-    prompt="纪实摄影风格画面,一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄,两只耳朵立起,神情专注而欢快。阳光洒在它身上,使得毛发看上去格外柔软而闪亮。背景是一片开阔的草地,偶尔点缀着几朵野花,远处隐约可见蓝天和几片白云。透视感鲜明,捕捉小狗奔跑时的动感和四周草地的生机。中景侧面移动视角。",
-    negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
-    seed=0, tiled=True,
-)
-save_video(video, "video.mp4", fps=15, quality=5)
-```
-
-### Training
-When using an Ascend NPU, you can set performance-tuning environment variables; certain models also require additional arguments.
-
-For example, the training script for Wan2.2-I2V-A14B:
-```diff
-+ export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-+ export CPU_AFFINITY_CONF=1
-
-accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
-  --dataset_base_path data/example_video_dataset \
-  --dataset_metadata_path data/example_video_dataset/metadata.csv \
-  --height 480 \
-  --width 832 \
-  --num_frames 49 \
-  --dataset_repeat 100 \
-  --model_id_with_origin_paths "Wan-AI/Wan2.2-I2V-A14B:high_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-I2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-I2V-A14B:Wan2.1_VAE.pth" \
-  --learning_rate 1e-5 \
-  --num_epochs 2 \
-  --remove_prefix_in_ckpt "pipe.dit." \
-  --output_path "./models/train/Wan2.2-I2V-A14B_high_noise_full" \
-  --trainable_models "dit" \
-  --extra_inputs "input_image" \
-  --use_gradient_checkpointing_offload \
-  --max_timestep_boundary 0.358 \
-  --min_timestep_boundary 0 \
-+ --initialize_model_on_cpu
-# boundary corresponds to timesteps [900, 1000]
-
-accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
-  --dataset_base_path data/example_video_dataset \
-  --dataset_metadata_path data/example_video_dataset/metadata.csv \
-  --height 480 \
-  --width 832 \
-  --num_frames 49 \
-  --dataset_repeat 100 \
-  --model_id_with_origin_paths "Wan-AI/Wan2.2-I2V-A14B:low_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-I2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-I2V-A14B:Wan2.1_VAE.pth" \
-  --learning_rate 1e-5 \
-  --num_epochs 2 \
-  --remove_prefix_in_ckpt "pipe.dit." \
-  --output_path "./models/train/Wan2.2-I2V-A14B_low_noise_full" \
-  --trainable_models "dit" \
-  --extra_inputs "input_image" \
-  --use_gradient_checkpointing_offload \
-  --max_timestep_boundary 1 \
-  --min_timestep_boundary 0.358 \
-+ --initialize_model_on_cpu
-# boundary corresponds to timesteps [0, 900)
-```
-#### Environment variables
-```
-export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-```
-expandable_segments: enables the memory pool's expandable-segments feature, i.e. virtual memory.
-
-```
-export CPU_AFFINITY_CONF=1
-```
-0 or unset: CPU core binding disabled
-
-1: coarse-grained core binding enabled
-
-2: fine-grained core binding enabled
-
-#### Model-specific arguments
-| Model | Argument | Notes |
-|-----------|------|-------------------|
-| Wan 14B series | --initialize_model_on_cpu | A 14B model does not fit in a single NPU's memory |
-
-
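`PYTORCH_NPU_ALLOC_CONF` is read when the NPU caching allocator initializes, so both variables must be in place before the first NPU allocation. For Python entry points launched without a wrapper shell script, a minimal sketch, assuming the variables are set before `torch_npu` is imported (the script itself is illustrative, not repo code):

```python
# Minimal sketch: export the NPU tuning variables before torch_npu loads.
# setdefault keeps any values already exported by the launching shell.
import os

os.environ.setdefault("PYTORCH_NPU_ALLOC_CONF", "expandable_segments:True")
os.environ.setdefault("CPU_AFFINITY_CONF", "1")  # 1 = coarse-grained core binding

import torch       # noqa: E402
import torch_npu   # noqa: E402,F401  (registers the "npu" backend)

print("NPU available:", torch.npu.is_available())
```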
diff --git a/docs/zh/Pipeline_Usage/Setup_NPU_样例1.md b/docs/zh/Pipeline_Usage/Setup_NPU_样例1.md
deleted file mode 100644
index a4f90ea..0000000
--- a/docs/zh/Pipeline_Usage/Setup_NPU_样例1.md
+++ /dev/null
@@ -1,55 +0,0 @@
-# Installation
-
-Install from source (recommended):
-
-```
-git clone https://github.com/modelscope/DiffSynth-Studio.git
-cd DiffSynth-Studio
-pip install -e .
-```
-
-Install from PyPI (releases lag behind the source; for the latest features, install from source):
-
-```
-pip install diffsynth
-```
-
-## GPU/NPU Support
-
-* NVIDIA GPU
-
-Install as described above.
-
-* AMD GPU
-
-Install a ROCm-enabled `torch` package. Taking ROCm 6.4 on Linux as an example (this document was last updated on December 15, 2025), run:
-
-```shell
-pip install torch torchvision --index-url https://download.pytorch.org/whl/rocm6.4
-```
-
-* Ascend NPU
-
-1. Install [CANN](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/softwareinst/instg/instg_quick.html?Mode=PmIns&InstallType=local&OS=openEuler&Software=cannToolKit) following the official documentation
-
-2. Install diffsynth as described in [Installation](#installation)
-
-3. Install torch-npu. Ascend NPU support is provided through the `torch-npu` package. Taking version `2.7.1` as an example (this document was last updated on December 15, 2025), run:
-   ```shell
-   # aarch64/ARM
-   pip install torch-npu==2.7.1 torchvision==0.22.1
-   # x86
-   pip install torch==2.7.1+cpu torchvision==0.22.1+cpu --extra-index-url "https://download.pytorch.org/whl/cpu"
-   pip install torch-npu==2.7.1
-   ```
-
-When using an Ascend NPU, change `"cuda"` to `"npu"` in the Python code; see [NPU Support](/docs/zh/Pipeline_Usage/GPU_support.md#ascend-npu) for details.
-
-## Other installation issues
-
-If you encounter problems during installation, they may be caused by upstream dependency packages; please consult those packages' documentation:
-
-* [torch](https://pytorch.org/get-started/locally/)
-* [Ascend/pytorch](https://github.com/Ascend/pytorch)
-* [sentencepiece](https://github.com/google/sentencepiece)
-* [cmake](https://cmake.org)
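After installing CANN, diffsynth, and torch-npu, the pairing can be sanity-checked before running a pipeline. A minimal check, assuming a working CANN environment (the exact expected version strings are taken from the pins above):

```python
# Minimal post-install check: torch and torch_npu import cleanly, versions
# match the documented pins, and at least one NPU device is visible.
import torch
import torch_npu  # registers the torch.npu namespace

print("torch:", torch.__version__)          # expected to start with 2.7.1
print("torch_npu:", torch_npu.__version__)  # expected to start with 2.7.1
assert torch.npu.is_available(), "no NPU visible (check the CANN env, e.g. set_env.sh)"
print("NPU device count:", torch.npu.device_count())
```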
diff --git a/docs/zh/Pipeline_Usage/Setup_NPU_样例2.md b/docs/zh/Pipeline_Usage/Setup_NPU_样例2.md
deleted file mode 100644
index e0616cc..0000000
--- a/docs/zh/Pipeline_Usage/Setup_NPU_样例2.md
+++ /dev/null
@@ -1,56 +0,0 @@
-# Installation
-
-Install from source (recommended):
-
-```
-git clone https://github.com/modelscope/DiffSynth-Studio.git
-cd DiffSynth-Studio
-pip install -e .
-```
-
-Install from PyPI (releases lag behind the source; for the latest features, install from source):
-
-```
-pip install diffsynth
-```
-
-## GPU/NPU Support
-
-* NVIDIA GPU
-
-Install as described above.
-
-* AMD GPU
-
-Install a ROCm-enabled `torch` package. Taking ROCm 6.4 on Linux as an example (this document was last updated on December 15, 2025), run:
-
-```shell
-pip install torch torchvision --index-url https://download.pytorch.org/whl/rocm6.4
-```
-
-* Ascend NPU
-
-1. Install [CANN](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/softwareinst/instg/instg_quick.html?Mode=PmIns&InstallType=local&OS=openEuler&Software=cannToolKit) following the official documentation
-
-2. Install from source
-   ```shell
-   git clone https://github.com/modelscope/DiffSynth-Studio.git
-   cd DiffSynth-Studio
-   # aarch64/ARM
-   pip install -e .[npu_aarch64]
-   # x86
-   pip install -e .[npu] --extra-index-url "https://download.pytorch.org/whl/cpu"
-   ```
-
-
-
-When using an Ascend NPU, change `"cuda"` to `"npu"` in the Python code; see [NPU Support](/docs/zh/Pipeline_Usage/GPU_support.md#ascend-npu) for details.
-
-## Other installation issues
-
-If you encounter problems during installation, they may be caused by upstream dependency packages; please consult those packages' documentation:
-
-* [torch](https://pytorch.org/get-started/locally/)
-* [Ascend/pytorch](https://github.com/Ascend/pytorch)
-* [sentencepiece](https://github.com/google/sentencepiece)
-* [cmake](https://cmake.org)
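The choice between the `npu` and `npu_aarch64` extras depends only on the host architecture, so it can be automated. A small illustrative helper; the `npu_install_command` function and its flow are assumptions for illustration, not repo code (the `.[npu]` pin `torch==2.7.1+cpu` lives on the PyTorch CPU index, while the plain `2.7.1` aarch64 wheels are on PyPI):

```python
# Illustrative sketch: pick the DiffSynth NPU extra matching the host arch.
import platform
import subprocess
import sys


def npu_install_command() -> list[str]:
    cmd = [sys.executable, "-m", "pip", "install", "-e"]
    if platform.machine() in ("aarch64", "arm64"):
        cmd += [".[npu_aarch64]"]
    else:
        # +cpu wheels are hosted on the PyTorch CPU index, not PyPI.
        cmd += [".[npu]", "--extra-index-url", "https://download.pytorch.org/whl/cpu"]
    return cmd


if __name__ == "__main__":
    subprocess.run(npu_install_command(), check=True)
```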
diff --git a/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh b/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh
deleted file mode 100644
index b214af1..0000000
--- a/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-export CPU_AFFINITY_CONF=1
-
-accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
-  --dataset_base_path data/example_video_dataset \
-  --dataset_metadata_path data/example_video_dataset/metadata.csv \
-  --height 480 \
-  --width 832 \
-  --num_frames 49 \
-  --dataset_repeat 100 \
-  --model_id_with_origin_paths "Wan-AI/Wan2.2-I2V-A14B:high_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-I2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-I2V-A14B:Wan2.1_VAE.pth" \
-  --learning_rate 1e-5 \
-  --num_epochs 2 \
-  --remove_prefix_in_ckpt "pipe.dit." \
-  --output_path "./models/train/Wan2.2-I2V-A14B_high_noise_full" \
-  --trainable_models "dit" \
-  --extra_inputs "input_image" \
-  --use_gradient_checkpointing_offload \
-  --max_timestep_boundary 0.358 \
-  --min_timestep_boundary 0 \
-  --initialize_model_on_cpu
-# boundary corresponds to timesteps [900, 1000]
-
-accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
-  --dataset_base_path data/example_video_dataset \
-  --dataset_metadata_path data/example_video_dataset/metadata.csv \
-  --height 480 \
-  --width 832 \
-  --num_frames 49 \
-  --dataset_repeat 100 \
-  --model_id_with_origin_paths "Wan-AI/Wan2.2-I2V-A14B:low_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-I2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-I2V-A14B:Wan2.1_VAE.pth" \
-  --learning_rate 1e-5 \
-  --num_epochs 2 \
-  --remove_prefix_in_ckpt "pipe.dit." \
-  --output_path "./models/train/Wan2.2-I2V-A14B_low_noise_full" \
-  --trainable_models "dit" \
-  --extra_inputs "input_image" \
-  --use_gradient_checkpointing_offload \
-  --max_timestep_boundary 1 \
-  --min_timestep_boundary 0.358 \
-  --initialize_model_on_cpu
-# boundary corresponds to timesteps [0, 900)
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index c82697a..059e21d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,18 +32,6 @@ classifiers = [
     "Operating System :: OS Independent",
 ]
 
-[project.optional-dependencies]
-npu_aarch64 = [
-    "torch==2.7.1",
-    "torch-npu==2.7.1",
-    "torchvision==0.22.1"
-]
-npu = [
-    "torch==2.7.1+cpu",
-    "torch-npu==2.7.1",
-    "torchvision==0.22.1+cpu"
-]
-
 [tool.setuptools.packages.find]
 where = ["./"]
 include = ["diffsynth", "diffsynth.*"]
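The `# boundary corresponds to timesteps ...` comments line up with the `--max/--min_timestep_boundary` fractions if one assumes the common flow-matching sigma shift of 5; that shift value is an assumption, not stated anywhere in this patch. A small illustrative calculation:

```python
# Illustrative check: map a diffusion timestep to a boundary fraction under
# an assumed flow-matching shift of 5. With this mapping, timestep 900 gives
# ~0.357, matching the 0.358 split between the high- and low-noise experts.
def timestep_to_boundary(t: int, shift: float = 5.0, num_timesteps: int = 1000) -> float:
    u = (num_timesteps - t) / num_timesteps   # remaining fraction of the schedule
    return shift * u / (1 + (shift - 1) * u)  # shifted sigma value


print(timestep_to_boundary(900))   # ~0.357 -> the 0.358 boundary in the script
print(timestep_to_boundary(0))     # 1.0    -> max boundary of the low-noise run
print(timestep_to_boundary(1000))  # 0.0    -> min boundary of the high-noise run
```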