support ascend npu

This commit is contained in:
Artiprocher
2025-12-15 15:48:42 +08:00
parent 78d8842ddf
commit 2883bc1b76
11 changed files with 242 additions and 9 deletions

View File

@@ -3,3 +3,4 @@ from .data import *
from .gradient import *
from .loader import *
from .vram import *
from .device import *

View File

@@ -0,0 +1 @@
from .npu_compatible_device import parse_device_type, parse_nccl_backend, get_available_device_type

View File

@@ -0,0 +1,107 @@
import importlib
import torch
from typing import Any
def is_torch_npu_available():
return importlib.util.find_spec("torch_npu") is not None
IS_CUDA_AVAILABLE = torch.cuda.is_available()
IS_NPU_AVAILABLE = is_torch_npu_available() and torch.npu.is_available()
if IS_NPU_AVAILABLE:
import torch_npu
torch.npu.config.allow_internal_format = False
def get_device_type() -> str:
"""Get device type based on current machine, currently only support CPU, CUDA, NPU."""
if IS_CUDA_AVAILABLE:
device = "cuda"
elif IS_NPU_AVAILABLE:
device = "npu"
else:
device = "cpu"
return device
def get_torch_device() -> Any:
"""Get torch attribute based on device type, e.g. torch.cuda or torch.npu"""
device_name = get_device_type()
try:
return getattr(torch, device_name)
except AttributeError:
print(f"Device namespace '{device_name}' not found in torch, try to load 'torch.cuda'.")
return torch.cuda
def get_device_id() -> int:
"""Get current device id based on device type."""
return get_torch_device().current_device()
def get_device_name() -> str:
"""Get current device name based on device type."""
return f"{get_device_type()}:{get_device_id()}"
def synchronize() -> None:
"""Execute torch synchronize operation."""
get_torch_device().synchronize()
def empty_cache() -> None:
"""Execute torch empty cache operation."""
get_torch_device().empty_cache()
def get_nccl_backend() -> str:
"""Return distributed communication backend type based on device type."""
if IS_CUDA_AVAILABLE:
return "nccl"
elif IS_NPU_AVAILABLE:
return "hccl"
else:
raise RuntimeError(f"No available distributed communication backend found on device type {get_device_type()}.")
def enable_high_precision_for_bf16():
"""
Set high accumulation dtype for matmul and reduction.
"""
if IS_CUDA_AVAILABLE:
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
if IS_NPU_AVAILABLE:
torch.npu.matmul.allow_tf32 = False
torch.npu.matmul.allow_bf16_reduced_precision_reduction = False
def parse_device_type(device):
if isinstance(device, str):
if device.startswith("cuda"):
return "cuda"
elif device.startswith("npu"):
return "npu"
else:
return "cpu"
elif isinstance(device, torch.device):
return device.type
def parse_nccl_backend(device_type):
if device_type == "cuda":
return "nccl"
elif device_type == "npu":
return "hccl"
else:
raise RuntimeError(f"No available distributed communication backend found on device type {device_type}.")
def get_available_device_type():
return get_device_type()

View File

@@ -2,6 +2,7 @@ import torch, copy
from typing import Union
from .initialization import skip_model_initialization
from .disk_map import DiskMap
from ..device import parse_device_type
class AutoTorchModule(torch.nn.Module):
@@ -32,6 +33,7 @@ class AutoTorchModule(torch.nn.Module):
)
self.state = 0
self.name = ""
self.computation_device_type = parse_device_type(self.computation_device)
def set_dtype_and_device(
self,
@@ -61,7 +63,7 @@ class AutoTorchModule(torch.nn.Module):
return r
def check_free_vram(self):
gpu_mem_state = torch.cuda.mem_get_info(self.computation_device)
gpu_mem_state = getattr(torch, self.computation_device_type).mem_get_info(self.computation_device)
used_memory = (gpu_mem_state[1] - gpu_mem_state[0]) / (1024**3)
return used_memory < self.vram_limit

View File

@@ -3,7 +3,7 @@ import torch
import numpy as np
from einops import repeat, reduce
from typing import Union
from ..core import AutoTorchModule, AutoWrappedLinear, load_state_dict, ModelConfig
from ..core import AutoTorchModule, AutoWrappedLinear, load_state_dict, ModelConfig, parse_device_type
from ..utils.lora import GeneralLoRALoader
from ..models.model_loader import ModelPool
from ..utils.controlnet import ControlNetInput
@@ -68,6 +68,7 @@ class BasePipeline(torch.nn.Module):
# The device and torch_dtype is used for the storage of intermediate variables, not models.
self.device = device
self.torch_dtype = torch_dtype
self.device_type = parse_device_type(device)
# The following parameters are used for shape check.
self.height_division_factor = height_division_factor
self.width_division_factor = width_division_factor
@@ -154,7 +155,7 @@ class BasePipeline(torch.nn.Module):
for module in model.modules():
if hasattr(module, "offload"):
module.offload()
torch.cuda.empty_cache()
getattr(torch, self.device_type).empty_cache()
# onload models
for name, model in self.named_children():
if name in model_names:
@@ -176,7 +177,7 @@ class BasePipeline(torch.nn.Module):
def get_vram(self):
return torch.cuda.mem_get_info(self.device)[1] / (1024 ** 3)
return getattr(torch, self.device_type).mem_get_info(self.device)[1] / (1024 ** 3)
def get_module(self, model, name):
if "." in name:

View File

@@ -126,7 +126,7 @@ class WanVideoPipeline(BasePipeline):
pipe = WanVideoPipeline(device=device, torch_dtype=torch_dtype)
if use_usp:
from ..utils.xfuser import initialize_usp
initialize_usp()
initialize_usp(device)
model_pool = pipe.download_and_load_models(model_configs, vram_limit)
# Fetch models

View File

@@ -5,19 +5,20 @@ from xfuser.core.distributed import (get_sequence_parallel_rank,
get_sequence_parallel_world_size,
get_sp_group)
from xfuser.core.long_ctx_attention import xFuserLongContextAttention
from ...core.device import parse_nccl_backend, parse_device_type
def initialize_usp():
def initialize_usp(device_type):
import torch.distributed as dist
from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment
dist.init_process_group(backend="nccl", init_method="env://")
dist.init_process_group(backend=parse_nccl_backend(device_type), init_method="env://")
init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size())
initialize_model_parallel(
sequence_parallel_degree=dist.get_world_size(),
ring_degree=1,
ulysses_degree=dist.get_world_size(),
)
torch.cuda.set_device(dist.get_rank())
getattr(torch, device_type).set_device(dist.get_rank())
def sinusoidal_embedding_1d(dim, position):
@@ -141,5 +142,5 @@ def usp_attn_forward(self, x, freqs):
x = x.flatten(2)
del q, k, v
torch.cuda.empty_cache()
getattr(torch, parse_device_type(x.device)).empty_cache()
return self.o(x)

View File

@@ -0,0 +1,58 @@
# GPU/NPU Support
`DiffSynth-Studio` supports various GPUs and NPUs. This document explains how to run model inference and training on these devices.
Before you begin, please follow the [Installation Guide](/docs/en/Pipeline_Usage/Setup.md) to install the required GPU/NPU dependencies.
## NVIDIA GPU
All sample code provided by this project supports NVIDIA GPUs by default, requiring no additional modifications.
## AMD GPU
AMD provides PyTorch packages based on ROCm, so most models can run without code changes. A small number of models may not be compatible due to their reliance on CUDA-specific instructions.
## Ascend NPU
When using Ascend NPU, you need to replace `"cuda"` with `"npu"` in your code.
For example, here is the inference code for **Wan2.1-T2V-1.3B**, modified for Ascend NPU:
```diff
import torch
from diffsynth.utils.data import save_video, VideoData
from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
vram_config = {
"offload_dtype": "disk",
"offload_device": "disk",
"onload_dtype": torch.bfloat16,
"onload_device": "cpu",
"preparing_dtype": torch.bfloat16,
- "preparing_device": "cuda",
+ "preparing_device": "npu",
"computation_dtype": torch.bfloat16,
- "computation_device": "cuda",
+ "computation_device": "npu",
}
pipe = WanVideoPipeline.from_pretrained(
torch_dtype=torch.bfloat16,
- device="cuda",
+ device="npu",
model_configs=[
ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", **vram_config),
ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", **vram_config),
ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", **vram_config),
],
tokenizer_config=ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/umt5-xxl/"),
- vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 2,
+ vram_limit=torch.npu.mem_get_info("npu")[1] / (1024 ** 3) - 2,
)
video = pipe(
prompt="Documentary-style photography: a lively puppy running swiftly across lush green grass. The puppy has brownish-yellow fur, upright ears, and an alert, joyful expression. Sunlight bathes its body, making the fur appear exceptionally soft and shiny. The background is an open field with occasional wildflowers, and faint blue sky with scattered white clouds in the distance. Strong perspective captures the motion of the running puppy and the vitality of the surrounding grass. Mid-shot, side-moving viewpoint.",
negative_prompt="Overly vibrant colors, overexposed, static, blurry details, subtitles, artistic style, painting, still image, overall grayish tone, worst quality, low quality, JPEG artifacts, ugly, distorted, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, fused fingers, motionless scene, cluttered background, three legs, many people in background, walking backward",
seed=0, tiled=True,
)
save_video(video, "video.mp4", fps=15, quality=5)
```

View File

@@ -36,6 +36,8 @@ Ascend NPU support is provided via the `torch-npu` package. Taking version `2.1.
pip install torch-npu==2.1.0.post17
```
When using Ascend NPU, please replace `"cuda"` with `"npu"` in your Python code. For details, see [NPU Support](/docs/en/Pipeline_Usage/GPU_support.md#ascend-npu).
## Other Installation Issues
If you encounter issues during installation, they may be caused by upstream dependencies. Please refer to the documentation for these packages:

View File

@@ -0,0 +1,58 @@
# GPU/NPU 支持
`DiffSynth-Studio` 支持多种 GPU/NPU本文介绍如何在这些设备上运行模型推理和训练。
在开始前,请参考[安装依赖](/docs/zh/Pipeline_Usage/Setup.md)安装好 GPU/NPU 相关的依赖包。
## NVIDIA GPU
本项目提供的所有样例代码默认支持 NVIDIA GPU无需额外修改。
## AMD GPU
AMD 提供了基于 ROCm 的 torch 包,所以大多数模型无需修改代码即可运行,少数模型由于依赖特定的 cuda 指令无法运行。
## Ascend NPU
使用 Ascend NPU 时,需把代码中的 `"cuda"` 改为 `"npu"`
例如Wan2.1-T2V-1.3B 的推理代码:
```diff
import torch
from diffsynth.utils.data import save_video, VideoData
from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
vram_config = {
"offload_dtype": "disk",
"offload_device": "disk",
"onload_dtype": torch.bfloat16,
"onload_device": "cpu",
"preparing_dtype": torch.bfloat16,
- "preparing_device": "cuda",
+ "preparing_device": "npu",
"computation_dtype": torch.bfloat16,
- "computation_device": "cuda",
+ "preparing_device": "npu",
}
pipe = WanVideoPipeline.from_pretrained(
torch_dtype=torch.bfloat16,
- device="cuda",
+ device="npu",
model_configs=[
ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", **vram_config),
ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", **vram_config),
ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", **vram_config),
],
tokenizer_config=ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/umt5-xxl/"),
- vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 2,
+ vram_limit=torch.npu.mem_get_info("npu")[1] / (1024 ** 3) - 2,
)
video = pipe(
prompt="纪实摄影风格画面,一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄,两只耳朵立起,神情专注而欢快。阳光洒在它身上,使得毛发看上去格外柔软而闪亮。背景是一片开阔的草地,偶尔点缀着几朵野花,远处隐约可见蓝天和几片白云。透视感鲜明,捕捉小狗奔跑时的动感和四周草地的生机。中景侧面移动视角。",
negative_prompt="色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走",
seed=0, tiled=True,
)
save_video(video, "video.mp4", fps=15, quality=5)
```

View File

@@ -36,6 +36,8 @@ Ascend NPU 通过 `torch-npu` 包提供支持,以 `2.1.0.post17` 版本(本
pip install torch-npu==2.1.0.post17
```
使用 Ascend NPU 时,请将 Python 代码中的 `"cuda"` 改为 `"npu"`,详见[NPU 支持](/docs/zh/Pipeline_Usage/GPU_support.md#ascend-npu)。
## 其他安装问题
如果在安装过程中遇到问题,可能是由上游依赖包导致的,请参考这些包的文档: