ace-step train

2026-04-24 15:06:17 +00:00 · 2026-04-22 17:58:10 +08:00
parent b0680ef711
commit c53c813c12
42 changed files with 1235 additions and 30 deletions
--- a/diffsynth/core/data/operators.py
+++ b/diffsynth/core/data/operators.py
@@ -3,6 +3,7 @@ import torch, torchvision, imageio, os
 import imageio.v3 as iio
 from PIL import Image
 import torchaudio
+from diffsynth.utils.data.audio import read_audio


 class DataProcessingPipeline:
@@ -276,3 +277,27 @@ class LoadAudioWithTorchaudio(DataProcessingOperator, FrameSamplerByRateMixin):
        except:
            warnings.warn(f"Cannot load audio in {data}. The audio will be `None`.")
            return None
+
+
+class LoadPureAudioWithTorchaudio(DataProcessingOperator):
+    """Load an audio file with `read_audio`, optionally resampled and cropped/zero-padded to a fixed duration."""
+    def __init__(self, target_sample_rate=None, target_duration=None):
+        self.target_sample_rate = target_sample_rate
+        self.target_duration = target_duration  # seconds; None keeps the loaded length
+        self.resample = target_sample_rate is not None  # resample only when a target rate is given
+
+    def __call__(self, data: str):
+        """Return `(waveform, sample_rate)` for the file at `data`, or `None` (with a warning) on failure."""
+        try:
+            waveform, sample_rate = read_audio(data, resample=self.resample, resample_rate=self.target_sample_rate)
+            if self.target_duration is not None:
+                target_samples = int(self.target_duration * sample_rate)
+                current_samples = waveform.shape[-1]
+                if current_samples > target_samples:
+                    waveform = waveform[..., :target_samples]
+                elif current_samples < target_samples:
+                    waveform = torch.nn.functional.pad(waveform, (0, target_samples - current_samples))
+            return waveform, sample_rate
+        except Exception as e:
+            warnings.warn(f"Cannot load audio in '{data}' due to '{e}'. The audio will be `None`.")
+            return None
--- a/diffsynth/models/ace_step_dit.py
+++ b/diffsynth/models/ace_step_dit.py
@@ -864,20 +864,13 @@ class AceStepDiTModel(nn.Module):
            layer_kwargs = flash_attn_kwargs

            # Use gradient checkpointing if enabled
-            if use_gradient_checkpointing or use_gradient_checkpointing_offload:
-                layer_outputs = gradient_checkpoint_forward(
-                    layer_module,
-                    use_gradient_checkpointing,
-                    use_gradient_checkpointing_offload,
-                    *layer_args,
-                    **layer_kwargs,
-                )
-            else:
-                layer_outputs = layer_module(
-                    *layer_args,
-                    **layer_kwargs,
-                )
-
+            layer_outputs = gradient_checkpoint_forward(
+                layer_module,
+                use_gradient_checkpointing,
+                use_gradient_checkpointing_offload,
+                *layer_args,
+                **layer_kwargs,
+            )
            hidden_states = layer_outputs[0]

            if output_attentions and self.layers[index_block].use_cross_attention:
--- a/diffsynth/models/ace_step_vae.py
+++ b/diffsynth/models/ace_step_vae.py
@@ -191,6 +191,43 @@ class OobleckDecoder(nn.Module):
        return self.conv2(hidden_state)


+class OobleckDiagonalGaussianDistribution(object):
+    """Diagonal Gaussian whose mean and pre-softplus scale are the two halves of `parameters` along dim 1."""
+    def __init__(self, parameters: torch.Tensor, deterministic: bool = False):
+        self.parameters = parameters
+        self.mean, self.scale = parameters.chunk(2, dim=1)
+        self.std = nn.functional.softplus(self.scale) + 1e-4  # softplus keeps std positive; 1e-4 floors it
+        self.var = self.std * self.std
+        self.logvar = torch.log(self.var)
+        self.deterministic = deterministic
+
+    def sample(self, generator: torch.Generator | None = None) -> torch.Tensor:
+        """Draw `mean + std * eps` with `eps ~ N(0, I)`, optionally seeded by `generator`."""
+        # make sure the noise is on the same device as the parameters and has same dtype
+        noise = torch.randn(
+            self.mean.shape,
+            generator=generator,
+            device=self.parameters.device,
+            dtype=self.parameters.dtype,
+        )
+        return self.mean + self.std * noise
+
+    def kl(self, other: "OobleckDiagonalGaussianDistribution" = None) -> torch.Tensor:
+        """Return KL(self || other), summed over dim 1 and averaged; `other=None` means a standard normal.
+
+        As in the `other=None` branch, the conventional factor 1/2 is omitted.
+        """
+        if self.deterministic:
+            return torch.Tensor([0.0])
+        if other is None:
+            return (self.mean * self.mean + self.var - self.logvar - 1.0).sum(1).mean()
+        normalized_diff = torch.pow(self.mean - other.mean, 2) / other.var
+        var_ratio = self.var / other.var
+        # log(var_other / var_self): reduces to `-self.logvar` for a standard-normal `other`
+        logvar_diff = other.logvar - self.logvar
+        return (normalized_diff + var_ratio + logvar_diff - 1).sum(1).mean()
+
+
 class AceStepVAE(nn.Module):
    """Audio VAE for ACE-Step (AutoencoderOobleck architecture).

@@ -229,17 +266,19 @@ class AceStepVAE(nn.Module):
        self.sampling_rate = sampling_rate

    def encode(self, x: torch.Tensor) -> torch.Tensor:
-        """Audio waveform [B, audio_channels, T] → latent [B, encoder_hidden_size, T']."""
-        return self.encoder(x)
+        """Audio waveform [B, audio_channels, T] → latent [B, decoder_input_channels, T']."""
+        h = self.encoder(x)
+        output = OobleckDiagonalGaussianDistribution(h).sample()
+        return output

    def decode(self, z: torch.Tensor) -> torch.Tensor:
-        """Latent [B, encoder_hidden_size, T] → audio waveform [B, audio_channels, T']."""
+        """Latent [B, decoder_input_channels, T] → audio waveform [B, audio_channels, T']."""
        return self.decoder(z)

    def forward(self, sample: torch.Tensor) -> torch.Tensor:
        """Full round-trip: encode → decode."""
        z = self.encode(sample)
-        return self.decoder(z)
+        return self.decode(z)

    def remove_weight_norm(self):
        """Remove weight normalization from all conv layers (for export/inference)."""
--- a/diffsynth/pipelines/ace_step.py
+++ b/diffsynth/pipelines/ace_step.py
@@ -7,6 +7,7 @@ import re
 import torch
 from typing import Optional, Dict, Any, List, Tuple
 from tqdm import tqdm
+import random

 from ..core.device.npu_compatible_device import get_device_type
 from ..diffusion import FlowMatchScheduler
@@ -89,13 +90,14 @@ class AceStepPipeline(BasePipeline):
        self,
        # Prompt
        prompt: str,
-        negative_prompt: str = "",
        cfg_scale: float = 1.0,
        # Lyrics
        lyrics: str = "",
+        # Task type
+        task_type: Optional[str] = "text2music",
        # Reference audio
        reference_audios: List[torch.Tensor] = None,
-        # Src audio
+        # Source audio
        src_audio: torch.Tensor = None,
        denoising_strength: float = 1.0,
        # Audio codes
@@ -126,6 +128,7 @@ class AceStepPipeline(BasePipeline):
        inputs_shared = {
            "cfg_scale": cfg_scale,
            "lyrics": lyrics,
+            "task_type": task_type,
            "reference_audios": reference_audios,
            "src_audio": src_audio,
            "audio_code_string": audio_code_string,
@@ -147,7 +150,7 @@ class AceStepPipeline(BasePipeline):
        self.load_models_to_device(self.in_iteration_models)
        models = {name: getattr(self, name) for name in self.in_iteration_models}
        for progress_id, timestep in enumerate(progress_bar_cmd(self.scheduler.timesteps)):
-            timestep = timestep.to(dtype=self.torch_dtype, device=self.device)
+            timestep = timestep.unsqueeze(0).to(dtype=self.torch_dtype, device=self.device)
            noise_pred = self.cfg_guided_model_fn(
                self.model_fn, cfg_scale,
                inputs_shared, inputs_posi, inputs_nega,
@@ -182,13 +185,14 @@ class AceStepUnit_TaskTypeChecker(PipelineUnit):
    """Check and compute sequence length from duration."""
    def __init__(self):
        super().__init__(
-            input_params=("src_audio", "audio_code_string"),
+            input_params=("audio_code_string",),
            output_params=("task_type",),
        )

-    def process(self, pipe, src_audio, audio_code_string):
+    def process(self, pipe, audio_code_string):
+        if pipe.scheduler.training:
+            return {"task_type": "text2music"}
        if audio_code_string is not None:
-            print("audio_code_string detected, setting task_type to 'cover'")
            task_type = "cover"
        else:
            task_type = "text2music"
@@ -200,7 +204,6 @@ class AceStepUnit_PromptEmbedder(PipelineUnit):
    INSTRUCTION_MAP = {
        "text2music": "Fill the audio semantic mask based on the given conditions:",
        "cover": "Generate audio semantic tokens based on the given conditions:",
-
        "repaint": "Repaint the mask area based on the given conditions:",
        "extract": "Extract the {TRACK_NAME} track from the audio:",
        "extract_default": "Extract the track from the audio:",
@@ -292,6 +295,7 @@ class AceStepUnit_ReferenceAudioEmbedder(PipelineUnit):
    def process(self, pipe, reference_audios):
        pipe.load_models_to_device(['vae'])
        if reference_audios is not None and len(reference_audios) > 0:
+            raise NotImplementedError("Reference audio embedding is not implemented yet.")
            # TODO: implement reference audio embedding using VAE encode, and generate refer_audio_order_mask
            pass
        else:
@@ -299,6 +303,49 @@ class AceStepUnit_ReferenceAudioEmbedder(PipelineUnit):
            reference_latents, refer_audio_order_mask = self.infer_refer_latent(pipe, reference_audios)
        return {"reference_latents": reference_latents, "refer_audio_order_mask": refer_audio_order_mask}

+    # def process_reference_audio(self, reference_audios) -> Optional[torch.Tensor]:
+
+    #     try:
+    #         audio_np, sr = _read_audio_file(audio_file)
+    #         audio = self._numpy_to_channels_first(audio_np)
+
+    #         logger.debug(
+    #             f"[process_reference_audio] Reference audio shape: {audio.shape}"
+    #         )
+    #         logger.debug(f"[process_reference_audio] Reference audio sample rate: {sr}")
+    #         logger.debug(
+    #             f"[process_reference_audio] Reference audio duration: {audio.shape[-1] / sr:.6f} seconds"
+    #         )
+
+    #         audio = self._normalize_audio_to_stereo_48k(audio, sr)
+    #         if self.is_silence(audio):
+    #             return None
+
+    #         target_frames = 30 * 48000
+    #         segment_frames = 10 * 48000
+
+    #         if audio.shape[-1] < target_frames:
+    #             repeat_times = math.ceil(target_frames / audio.shape[-1])
+    #             audio = audio.repeat(1, repeat_times)
+
+    #         total_frames = audio.shape[-1]
+    #         segment_size = total_frames // 3
+
+    #         front_start = random.randint(0, max(0, segment_size - segment_frames))
+    #         front_audio = audio[:, front_start : front_start + segment_frames]
+
+    #         middle_start = segment_size + random.randint(
+    #             0, max(0, segment_size - segment_frames)
+    #         )
+    #         middle_audio = audio[:, middle_start : middle_start + segment_frames]
+
+    #         back_start = 2 * segment_size + random.randint(
+    #             0, max(0, (total_frames - 2 * segment_size) - segment_frames)
+    #         )
+    #         back_audio = audio[:, back_start : back_start + segment_frames]
+
+    #         return torch.cat([front_audio, middle_audio, back_audio], dim=-1)
+
    def infer_refer_latent(self, pipe, refer_audioss: List[List[torch.Tensor]]) -> Tuple[torch.Tensor, torch.Tensor]:
        """Infer packed reference-audio latents and order mask."""
        refer_audio_order_mask = []
@@ -401,8 +448,8 @@ class AceStepUnit_ContextLatentBuilder(PipelineUnit):
            chunk_masks = torch.ones((1, max_latent_length, src_latents.shape[-1]), dtype=torch.bool, device=pipe.device)
            attention_mask = torch.ones((1, max_latent_length), device=src_latents.device, dtype=pipe.torch_dtype)
            context_latents = torch.cat([src_latents, chunk_masks], dim=-1)
-        elif src_audio is not None:
-            raise NotImplementedError("src_audio conditioning is not implemented yet. Please set lm_hints to None.")
+        # elif src_audio is not None:
+        #     raise NotImplementedError("src_audio conditioning is not implemented yet. Please set lm_hints to None.")
        else:
            max_latent_length = duration * pipe.sample_rate  // 1920
            src_latents = self._get_silence_latent_slice(pipe, max_latent_length).unsqueeze(0)
@@ -435,8 +482,16 @@ class AceStepUnit_InputAudioEmbedder(PipelineUnit):
    def process(self, pipe, noise, input_audio):
        if input_audio is None:
            return {"latents": noise}
-        # TODO: support for train
-        return {"latents": noise, "input_latents": None}
+        if not pipe.scheduler.training:
+            # Outside training the audio is not encoded; keep the previous noise-only result instead of returning None.
+            return {"latents": noise, "input_latents": None}
+        pipe.load_models_to_device(['vae'])
+        waveform = torch.clamp(input_audio[0], -1.0, 1.0)  # input_audio is a (waveform, sample_rate) pair
+        if waveform.dim() == 2:
+            waveform = waveform.unsqueeze(0)
+        input_latents = pipe.vae.encode(waveform.to(dtype=pipe.torch_dtype, device=pipe.device)).transpose(1, 2)
+        # prevent potential size mismatch between context_latents and input_latents by cropping input_latents to the same temporal length as noise
+        return {"input_latents": input_latents[:, :noise.shape[1]]}


 class AceStepUnit_AudioCodeDecoder(PipelineUnit):
@@ -494,7 +549,6 @@ def model_fn_ace_step(
    use_gradient_checkpointing_offload=False,
    **kwargs,
 ):
-    timestep = timestep.unsqueeze(0)
    decoder_outputs = dit(
        hidden_states=latents,
        timestep=timestep,
--- a/examples/ace_step/model_inference/Ace-Step1.5.py
+++ b/examples/ace_step/model_inference/Ace-Step1.5.py
@@ -35,6 +35,7 @@ with open("data/diffsynth_example_dataset/ace_step/Ace-Step1.5/audio_codes_input
 audio = pipe(
    prompt=prompt,
    lyrics=lyrics,
+    task_type="cover",
    audio_code_string=audio_code_string,
    duration=160,
    bpm=100,
--- a/examples/ace_step/model_inference_low_vram/Ace-Step1.5.py
+++ b/examples/ace_step/model_inference_low_vram/Ace-Step1.5.py
@@ -55,6 +55,7 @@ with open("data/diffsynth_example_dataset/ace_step/Ace-Step1.5/audio_codes_input
 audio = pipe(
    prompt=prompt,
    lyrics=lyrics,
+    task_type="cover",
    audio_code_string=audio_code_string,
    duration=160,
    bpm=100,
--- a/examples/ace_step/model_training/full/Ace-Step1.5.sh
+++ b/examples/ace_step/model_training/full/Ace-Step1.5.sh
@@ -0,0 +1,18 @@
+# Dataset: data/diffsynth_example_dataset/ace_step/Ace-Step1.5/
+# Download: modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "ace_step/Ace-Step1.5/*" --local_dir ./data/diffsynth_example_dataset
+
+accelerate launch examples/ace_step/model_training/train.py \
+    --learning_rate 1e-5 \
+    --num_epochs 2 \
+    --trainable_models "dit" \
+    --use_gradient_checkpointing \
+    --find_unused_parameters \
+    --dataset_base_path "./data/diffsynth_example_dataset/ace_step/Ace-Step1.5" \
+    --dataset_metadata_path "./data/diffsynth_example_dataset/ace_step/Ace-Step1.5/metadata.json" \
+    --model_id_with_origin_paths "ACE-Step/Ace-Step1.5:acestep-v15-turbo/model.safetensors,ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/model.safetensors,ACE-Step/Ace-Step1.5:vae/diffusion_pytorch_model.safetensors" \
+    --tokenizer_path "ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/" \
+    --silence_latent_path "ACE-Step/Ace-Step1.5:acestep-v15-turbo/silence_latent.pt" \
+    --remove_prefix_in_ckpt "pipe.dit." \
+    --dataset_repeat 50 \
+    --output_path "./models/train/Ace-Step1.5_full" \
+    --data_file_keys "audio"
--- a/examples/ace_step/model_training/full/acestep-v15-base.sh
+++ b/examples/ace_step/model_training/full/acestep-v15-base.sh
@@ -0,0 +1,18 @@
+# Dataset: data/diffsynth_example_dataset/ace_step/acestep-v15-base/
+# Download: modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "ace_step/acestep-v15-base/*" --local_dir ./data/diffsynth_example_dataset
+
+accelerate launch examples/ace_step/model_training/train.py \
+    --learning_rate 1e-5 \
+    --num_epochs 2 \
+    --trainable_models "dit" \
+    --use_gradient_checkpointing \
+    --find_unused_parameters \
+    --dataset_base_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-base" \
+    --dataset_metadata_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-base/metadata.json" \
+    --model_id_with_origin_paths "ACE-Step/acestep-v15-base:model.safetensors,ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/model.safetensors,ACE-Step/Ace-Step1.5:vae/diffusion_pytorch_model.safetensors" \
+    --tokenizer_path "ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/" \
+    --silence_latent_path "ACE-Step/Ace-Step1.5:acestep-v15-turbo/silence_latent.pt" \
+    --remove_prefix_in_ckpt "pipe.dit." \
+    --dataset_repeat 50 \
+    --output_path "./models/train/acestep-v15-base_full" \
+    --data_file_keys "audio"
--- a/examples/ace_step/model_training/full/acestep-v15-sft.sh
+++ b/examples/ace_step/model_training/full/acestep-v15-sft.sh
@@ -0,0 +1,18 @@
+# Dataset: data/diffsynth_example_dataset/ace_step/acestep-v15-sft/
+# Download: modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "ace_step/acestep-v15-sft/*" --local_dir ./data/diffsynth_example_dataset
+
+accelerate launch examples/ace_step/model_training/train.py \
+    --learning_rate 1e-5 \
+    --num_epochs 2 \
+    --trainable_models "dit" \
+    --use_gradient_checkpointing \
+    --find_unused_parameters \
+    --dataset_base_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-sft" \
+    --dataset_metadata_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-sft/metadata.json" \
+    --model_id_with_origin_paths "ACE-Step/acestep-v15-sft:model.safetensors,ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/model.safetensors,ACE-Step/Ace-Step1.5:vae/diffusion_pytorch_model.safetensors" \
+    --tokenizer_path "ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/" \
+    --silence_latent_path "ACE-Step/Ace-Step1.5:acestep-v15-turbo/silence_latent.pt" \
+    --remove_prefix_in_ckpt "pipe.dit." \
+    --dataset_repeat 50 \
+    --output_path "./models/train/acestep-v15-sft_full" \
+    --data_file_keys "audio"
--- a/examples/ace_step/model_training/full/acestep-v15-turbo-continuous.sh
+++ b/examples/ace_step/model_training/full/acestep-v15-turbo-continuous.sh
@@ -0,0 +1,18 @@
+# Dataset: data/diffsynth_example_dataset/ace_step/acestep-v15-turbo-continuous/
+# Download: modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "ace_step/acestep-v15-turbo-continuous/*" --local_dir ./data/diffsynth_example_dataset
+
+accelerate launch examples/ace_step/model_training/train.py \
+    --learning_rate 1e-5 \
+    --num_epochs 2 \
+    --trainable_models "dit" \
+    --use_gradient_checkpointing \
+    --find_unused_parameters \
+    --dataset_base_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-turbo-continuous" \
+    --dataset_metadata_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-turbo-continuous/metadata.json" \
+    --model_id_with_origin_paths "ACE-Step/acestep-v15-turbo-continuous:model.safetensors,ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/model.safetensors,ACE-Step/Ace-Step1.5:vae/diffusion_pytorch_model.safetensors" \
+    --tokenizer_path "ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/" \
+    --silence_latent_path "ACE-Step/Ace-Step1.5:acestep-v15-turbo/silence_latent.pt" \
+    --remove_prefix_in_ckpt "pipe.dit." \
+    --dataset_repeat 50 \
+    --output_path "./models/train/acestep-v15-turbo-continuous_full" \
+    --data_file_keys "audio"
--- a/examples/ace_step/model_training/full/acestep-v15-turbo-shift1.sh
+++ b/examples/ace_step/model_training/full/acestep-v15-turbo-shift1.sh
@@ -0,0 +1,18 @@
+# Dataset: data/diffsynth_example_dataset/ace_step/acestep-v15-turbo-shift1/
+# Download: modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "ace_step/acestep-v15-turbo-shift1/*" --local_dir ./data/diffsynth_example_dataset
+
+accelerate launch examples/ace_step/model_training/train.py \
+    --learning_rate 1e-5 \
+    --num_epochs 2 \
+    --trainable_models "dit" \
+    --use_gradient_checkpointing \
+    --find_unused_parameters \
+    --dataset_base_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-turbo-shift1" \
+    --dataset_metadata_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-turbo-shift1/metadata.json" \
+    --model_id_with_origin_paths "ACE-Step/acestep-v15-turbo-shift1:model.safetensors,ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/model.safetensors,ACE-Step/Ace-Step1.5:vae/diffusion_pytorch_model.safetensors" \
+    --tokenizer_path "ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/" \
+    --silence_latent_path "ACE-Step/Ace-Step1.5:acestep-v15-turbo/silence_latent.pt" \
+    --remove_prefix_in_ckpt "pipe.dit." \
+    --dataset_repeat 50 \
+    --output_path "./models/train/acestep-v15-turbo-shift1_full" \
+    --data_file_keys "audio"
--- a/examples/ace_step/model_training/full/acestep-v15-turbo-shift3.sh
+++ b/examples/ace_step/model_training/full/acestep-v15-turbo-shift3.sh
@@ -0,0 +1,18 @@
+# Dataset: data/diffsynth_example_dataset/ace_step/acestep-v15-turbo-shift3/
+# Download: modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "ace_step/acestep-v15-turbo-shift3/*" --local_dir ./data/diffsynth_example_dataset
+
+accelerate launch examples/ace_step/model_training/train.py \
+    --learning_rate 1e-5 \
+    --num_epochs 2 \
+    --trainable_models "dit" \
+    --use_gradient_checkpointing \
+    --find_unused_parameters \
+    --dataset_base_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-turbo-shift3" \
+    --dataset_metadata_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-turbo-shift3/metadata.json" \
+    --model_id_with_origin_paths "ACE-Step/acestep-v15-turbo-shift3:model.safetensors,ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/model.safetensors,ACE-Step/Ace-Step1.5:vae/diffusion_pytorch_model.safetensors" \
+    --tokenizer_path "ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/" \
+    --silence_latent_path "ACE-Step/Ace-Step1.5:acestep-v15-turbo/silence_latent.pt" \
+    --remove_prefix_in_ckpt "pipe.dit." \
+    --dataset_repeat 50 \
+    --output_path "./models/train/acestep-v15-turbo-shift3_full" \
+    --data_file_keys "audio"
--- a/examples/ace_step/model_training/full/acestep-v15-xl-base.sh
+++ b/examples/ace_step/model_training/full/acestep-v15-xl-base.sh
@@ -0,0 +1,18 @@
+# Dataset: data/diffsynth_example_dataset/ace_step/acestep-v15-xl-base/
+# Download: modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "ace_step/acestep-v15-xl-base/*" --local_dir ./data/diffsynth_example_dataset
+
+accelerate launch examples/ace_step/model_training/train.py \
+    --learning_rate 1e-5 \
+    --num_epochs 2 \
+    --trainable_models "dit" \
+    --use_gradient_checkpointing \
+    --find_unused_parameters \
+    --dataset_base_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-xl-base" \
+    --dataset_metadata_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-xl-base/metadata.json" \
+    --model_id_with_origin_paths "ACE-Step/acestep-v15-xl-base:model-*.safetensors,ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/model.safetensors,ACE-Step/Ace-Step1.5:vae/diffusion_pytorch_model.safetensors" \
+    --tokenizer_path "ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/" \
+    --silence_latent_path "ACE-Step/Ace-Step1.5:acestep-v15-turbo/silence_latent.pt" \
+    --remove_prefix_in_ckpt "pipe.dit." \
+    --dataset_repeat 50 \
+    --output_path "./models/train/acestep-v15-xl-base_full" \
+    --data_file_keys "audio"
--- a/examples/ace_step/model_training/full/acestep-v15-xl-sft.sh
+++ b/examples/ace_step/model_training/full/acestep-v15-xl-sft.sh
@@ -0,0 +1,18 @@
+# Dataset: data/diffsynth_example_dataset/ace_step/acestep-v15-xl-sft/
+# Download: modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "ace_step/acestep-v15-xl-sft/*" --local_dir ./data/diffsynth_example_dataset
+
+accelerate launch examples/ace_step/model_training/train.py \
+    --learning_rate 1e-5 \
+    --num_epochs 2 \
+    --trainable_models "dit" \
+    --use_gradient_checkpointing \
+    --find_unused_parameters \
+    --dataset_base_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-xl-sft" \
+    --dataset_metadata_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-xl-sft/metadata.json" \
+    --model_id_with_origin_paths "ACE-Step/acestep-v15-xl-sft:model-*.safetensors,ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/model.safetensors,ACE-Step/Ace-Step1.5:vae/diffusion_pytorch_model.safetensors" \
+    --tokenizer_path "ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/" \
+    --silence_latent_path "ACE-Step/Ace-Step1.5:acestep-v15-turbo/silence_latent.pt" \
+    --remove_prefix_in_ckpt "pipe.dit." \
+    --dataset_repeat 50 \
+    --output_path "./models/train/acestep-v15-xl-sft_full" \
+    --data_file_keys "audio"
--- a/examples/ace_step/model_training/full/acestep-v15-xl-turbo.sh
+++ b/examples/ace_step/model_training/full/acestep-v15-xl-turbo.sh
@@ -0,0 +1,18 @@
+# Dataset: data/diffsynth_example_dataset/ace_step/acestep-v15-xl-turbo/
+# Download: modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "ace_step/acestep-v15-xl-turbo/*" --local_dir ./data/diffsynth_example_dataset
+
+accelerate launch examples/ace_step/model_training/train.py \
+    --learning_rate 1e-5 \
+    --num_epochs 2 \
+    --trainable_models "dit" \
+    --use_gradient_checkpointing \
+    --find_unused_parameters \
+    --dataset_base_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-xl-turbo" \
+    --dataset_metadata_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-xl-turbo/metadata.json" \
+    --model_id_with_origin_paths "ACE-Step/acestep-v15-xl-turbo:model-*.safetensors,ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/model.safetensors,ACE-Step/Ace-Step1.5:vae/diffusion_pytorch_model.safetensors" \
+    --tokenizer_path "ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/" \
+    --silence_latent_path "ACE-Step/Ace-Step1.5:acestep-v15-turbo/silence_latent.pt" \
+    --remove_prefix_in_ckpt "pipe.dit." \
+    --dataset_repeat 50 \
+    --output_path "./models/train/acestep-v15-xl-turbo_full" \
+    --data_file_keys "audio"
--- a/examples/ace_step/model_training/lora/Ace-Step1.5.sh
+++ b/examples/ace_step/model_training/lora/Ace-Step1.5.sh
@@ -0,0 +1,20 @@
+# Dataset: data/diffsynth_example_dataset/ace_step/Ace-Step1.5/
+# Download: modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "ace_step/Ace-Step1.5/*" --local_dir ./data/diffsynth_example_dataset
+
+accelerate launch examples/ace_step/model_training/train.py \
+    --learning_rate 1e-4 \
+    --num_epochs 20 \
+    --lora_rank 32 \
+    --use_gradient_checkpointing \
+    --find_unused_parameters \
+    --dataset_base_path "./data/diffsynth_example_dataset/ace_step/Ace-Step1.5" \
+    --dataset_metadata_path "./data/diffsynth_example_dataset/ace_step/Ace-Step1.5/metadata.json" \
+    --model_id_with_origin_paths "ACE-Step/Ace-Step1.5:acestep-v15-turbo/model.safetensors,ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/model.safetensors,ACE-Step/Ace-Step1.5:vae/diffusion_pytorch_model.safetensors" \
+    --tokenizer_path "ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/" \
+    --silence_latent_path "ACE-Step/Ace-Step1.5:acestep-v15-turbo/silence_latent.pt" \
+    --lora_base_model "dit" \
+    --remove_prefix_in_ckpt "pipe.dit." \
+    --dataset_repeat 50 \
+    --output_path "./models/train/Ace-Step1.5_lora" \
+    --lora_target_modules "q_proj,k_proj,v_proj,o_proj" \
+    --data_file_keys "audio"
--- a/examples/ace_step/model_training/lora/acestep-v15-base.sh
+++ b/examples/ace_step/model_training/lora/acestep-v15-base.sh
@@ -0,0 +1,20 @@
+# Dataset: data/diffsynth_example_dataset/ace_step/acestep-v15-base/
+# Download: modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "ace_step/acestep-v15-base/*" --local_dir ./data/diffsynth_example_dataset
+
+accelerate launch examples/ace_step/model_training/train.py \
+    --learning_rate 1e-4 \
+    --num_epochs 20 \
+    --lora_rank 32 \
+    --use_gradient_checkpointing \
+    --find_unused_parameters \
+    --dataset_base_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-base" \
+    --dataset_metadata_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-base/metadata.json" \
+    --model_id_with_origin_paths "ACE-Step/acestep-v15-base:model.safetensors,ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/model.safetensors,ACE-Step/Ace-Step1.5:vae/diffusion_pytorch_model.safetensors" \
+    --tokenizer_path "ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/" \
+    --silence_latent_path "ACE-Step/Ace-Step1.5:acestep-v15-turbo/silence_latent.pt" \
+    --lora_base_model "dit" \
+    --remove_prefix_in_ckpt "pipe.dit." \
+    --dataset_repeat 50 \
+    --output_path "./models/train/acestep-v15-base_lora" \
+    --lora_target_modules "q_proj,k_proj,v_proj,o_proj" \
+    --data_file_keys "audio"
--- a/examples/ace_step/model_training/lora/acestep-v15-sft.sh
+++ b/examples/ace_step/model_training/lora/acestep-v15-sft.sh
@@ -0,0 +1,20 @@
+# Dataset: data/diffsynth_example_dataset/ace_step/acestep-v15-sft/
+# Download: modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "ace_step/acestep-v15-sft/*" --local_dir ./data/diffsynth_example_dataset
+
+accelerate launch examples/ace_step/model_training/train.py \
+    --learning_rate 1e-4 \
+    --num_epochs 20 \
+    --lora_rank 32 \
+    --use_gradient_checkpointing \
+    --find_unused_parameters \
+    --dataset_base_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-sft" \
+    --dataset_metadata_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-sft/metadata.json" \
+    --model_id_with_origin_paths "ACE-Step/acestep-v15-sft:model.safetensors,ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/model.safetensors,ACE-Step/Ace-Step1.5:vae/diffusion_pytorch_model.safetensors" \
+    --tokenizer_path "ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/" \
+    --silence_latent_path "ACE-Step/Ace-Step1.5:acestep-v15-turbo/silence_latent.pt" \
+    --lora_base_model "dit" \
+    --remove_prefix_in_ckpt "pipe.dit." \
+    --dataset_repeat 50 \
+    --output_path "./models/train/acestep-v15-sft_lora" \
+    --lora_target_modules "q_proj,k_proj,v_proj,o_proj" \
+    --data_file_keys "audio"
--- a/examples/ace_step/model_training/lora/acestep-v15-turbo-continuous.sh
+++ b/examples/ace_step/model_training/lora/acestep-v15-turbo-continuous.sh
@@ -0,0 +1,20 @@
+# Dataset: data/diffsynth_example_dataset/ace_step/acestep-v15-turbo-continuous/
+# Download: modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "ace_step/acestep-v15-turbo-continuous/*" --local_dir ./data/diffsynth_example_dataset
+
+accelerate launch examples/ace_step/model_training/train.py \
+    --learning_rate 1e-4 \
+    --num_epochs 20 \
+    --lora_rank 32 \
+    --use_gradient_checkpointing \
+    --find_unused_parameters \
+    --dataset_base_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-turbo-continuous" \
+    --dataset_metadata_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-turbo-continuous/metadata.json" \
+    --model_id_with_origin_paths "ACE-Step/acestep-v15-turbo-continuous:model.safetensors,ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/model.safetensors,ACE-Step/Ace-Step1.5:vae/diffusion_pytorch_model.safetensors" \
+    --tokenizer_path "ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/" \
+    --silence_latent_path "ACE-Step/Ace-Step1.5:acestep-v15-turbo/silence_latent.pt" \
+    --lora_base_model "dit" \
+    --remove_prefix_in_ckpt "pipe.dit." \
+    --dataset_repeat 50 \
+    --output_path "./models/train/acestep-v15-turbo-continuous_lora" \
+    --lora_target_modules "q_proj,k_proj,v_proj,o_proj" \
+    --data_file_keys "audio"
--- a/examples/ace_step/model_training/lora/acestep-v15-turbo-shift1.sh
+++ b/examples/ace_step/model_training/lora/acestep-v15-turbo-shift1.sh
@@ -0,0 +1,20 @@
+# Dataset: data/diffsynth_example_dataset/ace_step/acestep-v15-turbo-shift1/
+# Download: modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "ace_step/acestep-v15-turbo-shift1/*" --local_dir ./data/diffsynth_example_dataset
+# Rank-32 LoRA on the "dit" model (q/k/v/o projections), 20 epochs at lr 1e-4; the relative paths assume the repository root as working directory.
+accelerate launch examples/ace_step/model_training/train.py \
+    --learning_rate 1e-4 \
+    --num_epochs 20 \
+    --lora_rank 32 \
+    --use_gradient_checkpointing \
+    --find_unused_parameters \
+    --dataset_base_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-turbo-shift1" \
+    --dataset_metadata_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-turbo-shift1/metadata.json" \
+    --model_id_with_origin_paths "ACE-Step/acestep-v15-turbo-shift1:model.safetensors,ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/model.safetensors,ACE-Step/Ace-Step1.5:vae/diffusion_pytorch_model.safetensors" \
+    --tokenizer_path "ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/" \
+    --silence_latent_path "ACE-Step/Ace-Step1.5:acestep-v15-turbo/silence_latent.pt" \
+    --lora_base_model "dit" \
+    --remove_prefix_in_ckpt "pipe.dit." \
+    --dataset_repeat 50 \
+    --output_path "./models/train/acestep-v15-turbo-shift1_lora" \
+    --lora_target_modules "q_proj,k_proj,v_proj,o_proj" \
+    --data_file_keys "audio"
--- a/examples/ace_step/model_training/lora/acestep-v15-turbo-shift3.sh
+++ b/examples/ace_step/model_training/lora/acestep-v15-turbo-shift3.sh
@@ -0,0 +1,20 @@
+# Dataset: data/diffsynth_example_dataset/ace_step/acestep-v15-turbo-shift3/
+# Download: modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "ace_step/acestep-v15-turbo-shift3/*" --local_dir ./data/diffsynth_example_dataset
+# Rank-32 LoRA on the "dit" model (q/k/v/o projections), 20 epochs at lr 1e-4; the relative paths assume the repository root as working directory.
+accelerate launch examples/ace_step/model_training/train.py \
+    --learning_rate 1e-4 \
+    --num_epochs 20 \
+    --lora_rank 32 \
+    --use_gradient_checkpointing \
+    --find_unused_parameters \
+    --dataset_base_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-turbo-shift3" \
+    --dataset_metadata_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-turbo-shift3/metadata.json" \
+    --model_id_with_origin_paths "ACE-Step/acestep-v15-turbo-shift3:model.safetensors,ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/model.safetensors,ACE-Step/Ace-Step1.5:vae/diffusion_pytorch_model.safetensors" \
+    --tokenizer_path "ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/" \
+    --silence_latent_path "ACE-Step/Ace-Step1.5:acestep-v15-turbo/silence_latent.pt" \
+    --lora_base_model "dit" \
+    --remove_prefix_in_ckpt "pipe.dit." \
+    --dataset_repeat 50 \
+    --output_path "./models/train/acestep-v15-turbo-shift3_lora" \
+    --lora_target_modules "q_proj,k_proj,v_proj,o_proj" \
+    --data_file_keys "audio"
--- a/examples/ace_step/model_training/lora/acestep-v15-xl-base.sh
+++ b/examples/ace_step/model_training/lora/acestep-v15-xl-base.sh
@@ -0,0 +1,20 @@
+# Dataset: data/diffsynth_example_dataset/ace_step/acestep-v15-xl-base/
+# Download: modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "ace_step/acestep-v15-xl-base/*" --local_dir ./data/diffsynth_example_dataset
+# Rank-32 LoRA on the "dit" model (q/k/v/o projections), 20 epochs at lr 1e-4; the relative paths assume the repository root as working directory.
+accelerate launch examples/ace_step/model_training/train.py \
+    --learning_rate 1e-4 \
+    --num_epochs 20 \
+    --lora_rank 32 \
+    --use_gradient_checkpointing \
+    --find_unused_parameters \
+    --dataset_base_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-xl-base" \
+    --dataset_metadata_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-xl-base/metadata.json" \
+    --model_id_with_origin_paths "ACE-Step/acestep-v15-xl-base:model-*.safetensors,ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/model.safetensors,ACE-Step/Ace-Step1.5:vae/diffusion_pytorch_model.safetensors" \
+    --tokenizer_path "ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/" \
+    --silence_latent_path "ACE-Step/Ace-Step1.5:acestep-v15-turbo/silence_latent.pt" \
+    --lora_base_model "dit" \
+    --remove_prefix_in_ckpt "pipe.dit." \
+    --dataset_repeat 50 \
+    --output_path "./models/train/acestep-v15-xl-base_lora" \
+    --lora_target_modules "q_proj,k_proj,v_proj,o_proj" \
+    --data_file_keys "audio"
--- a/examples/ace_step/model_training/lora/acestep-v15-xl-sft.sh
+++ b/examples/ace_step/model_training/lora/acestep-v15-xl-sft.sh
@@ -0,0 +1,20 @@
+# Dataset: data/diffsynth_example_dataset/ace_step/acestep-v15-xl-sft/
+# Download: modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "ace_step/acestep-v15-xl-sft/*" --local_dir ./data/diffsynth_example_dataset
+# Rank-32 LoRA on the "dit" model (q/k/v/o projections), 20 epochs at lr 1e-4; the relative paths assume the repository root as working directory.
+accelerate launch examples/ace_step/model_training/train.py \
+    --learning_rate 1e-4 \
+    --num_epochs 20 \
+    --lora_rank 32 \
+    --use_gradient_checkpointing \
+    --find_unused_parameters \
+    --dataset_base_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-xl-sft" \
+    --dataset_metadata_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-xl-sft/metadata.json" \
+    --model_id_with_origin_paths "ACE-Step/acestep-v15-xl-sft:model-*.safetensors,ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/model.safetensors,ACE-Step/Ace-Step1.5:vae/diffusion_pytorch_model.safetensors" \
+    --tokenizer_path "ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/" \
+    --silence_latent_path "ACE-Step/Ace-Step1.5:acestep-v15-turbo/silence_latent.pt" \
+    --lora_base_model "dit" \
+    --remove_prefix_in_ckpt "pipe.dit." \
+    --dataset_repeat 50 \
+    --output_path "./models/train/acestep-v15-xl-sft_lora" \
+    --lora_target_modules "q_proj,k_proj,v_proj,o_proj" \
+    --data_file_keys "audio"
--- a/examples/ace_step/model_training/lora/acestep-v15-xl-turbo.sh
+++ b/examples/ace_step/model_training/lora/acestep-v15-xl-turbo.sh
@@ -0,0 +1,20 @@
+# Dataset: data/diffsynth_example_dataset/ace_step/acestep-v15-xl-turbo/
+# Download: modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "ace_step/acestep-v15-xl-turbo/*" --local_dir ./data/diffsynth_example_dataset
+# Rank-32 LoRA on the "dit" model (q/k/v/o projections), 20 epochs at lr 1e-4; the relative paths assume the repository root as working directory.
+accelerate launch examples/ace_step/model_training/train.py \
+    --learning_rate 1e-4 \
+    --num_epochs 20 \
+    --lora_rank 32 \
+    --use_gradient_checkpointing \
+    --find_unused_parameters \
+    --dataset_base_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-xl-turbo" \
+    --dataset_metadata_path "./data/diffsynth_example_dataset/ace_step/acestep-v15-xl-turbo/metadata.json" \
+    --model_id_with_origin_paths "ACE-Step/acestep-v15-xl-turbo:model-*.safetensors,ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/model.safetensors,ACE-Step/Ace-Step1.5:vae/diffusion_pytorch_model.safetensors" \
+    --tokenizer_path "ACE-Step/Ace-Step1.5:Qwen3-Embedding-0.6B/" \
+    --silence_latent_path "ACE-Step/Ace-Step1.5:acestep-v15-turbo/silence_latent.pt" \
+    --lora_base_model "dit" \
+    --remove_prefix_in_ckpt "pipe.dit." \
+    --dataset_repeat 50 \
+    --output_path "./models/train/acestep-v15-xl-turbo_lora" \
+    --lora_target_modules "q_proj,k_proj,v_proj,o_proj" \
+    --data_file_keys "audio"
--- a/examples/ace_step/model_training/train.py
+++ b/examples/ace_step/model_training/train.py
@@ -0,0 +1,173 @@
+import torch, os, argparse, accelerate, warnings, torchaudio
+import math
+from diffsynth.core import UnifiedDataset
+from diffsynth.core.data.operators import ToAbsolutePath, RouteByType, DataProcessingOperator, LoadPureAudioWithTorchaudio
+from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
+from diffsynth.diffusion import *
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+class LoadAceStepAudio(DataProcessingOperator):
+    """Load an audio file as a stereo waveform tensor [2, T] at `target_sr` (48kHz by default); return None on failure."""
+    # NOTE(review): the entry point of this script loads audio with LoadPureAudioWithTorchaudio; confirm this class is still needed.
+    def __init__(self, target_sr=48000):
+        self.target_sr = target_sr
+
+    def __call__(self, data: str):
+        try:
+            waveform, sample_rate = torchaudio.load(data)
+            if sample_rate != self.target_sr:
+                waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=self.target_sr)(waveform)
+            # Enforce the documented [2, T] shape: duplicate mono, keep only the first two channels otherwise.
+            waveform = waveform.repeat(2, 1) if waveform.shape[0] == 1 else waveform[:2]
+            return waveform
+        except Exception as e:
+            warnings.warn(f"Cannot load audio from {data}: {e}")
+            return None
+
+
+class AceStepTrainingModule(DiffusionTrainingModule):  # Builds an AceStepPipeline, switches it to training mode and computes the flow-matching SFT loss.
+    def __init__(
+        self,
+        model_paths=None, model_id_with_origin_paths=None,
+        tokenizer_path=None, silence_latent_path=None,
+        trainable_models=None,
+        lora_base_model=None, lora_target_modules="", lora_rank=32, lora_checkpoint=None,
+        preset_lora_path=None, preset_lora_model=None,
+        use_gradient_checkpointing=True,
+        use_gradient_checkpointing_offload=False,
+        extra_inputs=None,
+        fp8_models=None,
+        offload_models=None,
+        device="cpu",
+        task="sft",
+    ):
+        super().__init__()
+        # ===== Parse model configs (boilerplate) =====
+        model_configs = self.parse_model_configs(model_paths, model_id_with_origin_paths, fp8_models=fp8_models, offload_models=offload_models, device=device)
+        # ===== Tokenizer and silence-latent configs (defaults point at ACE-Step/Ace-Step1.5) =====
+        text_tokenizer_config = self.parse_path_or_model_id(tokenizer_path, default_value=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"))
+        silence_latent_config = self.parse_path_or_model_id(silence_latent_path, default_value=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/silence_latent.pt"))
+        # ===== Build the pipeline (bfloat16) =====
+        self.pipe = AceStepPipeline.from_pretrained(torch_dtype=torch.bfloat16, device=device, model_configs=model_configs, text_tokenizer_config=text_tokenizer_config, silence_latent_config=silence_latent_config)
+        # ===== Split pipeline units (boilerplate) =====
+        self.pipe = self.split_pipeline_units(task, self.pipe, trainable_models, lora_base_model)
+
+        # ===== Switch to training mode (boilerplate) =====
+        self.switch_pipe_to_training_mode(
+            self.pipe, trainable_models,
+            lora_base_model, lora_target_modules, lora_rank, lora_checkpoint,
+            preset_lora_path, preset_lora_model,
+            task=task,
+        )
+
+        # ===== Other settings (boilerplate); extra_inputs is a comma-separated string split into a list =====
+        self.use_gradient_checkpointing = use_gradient_checkpointing
+        self.use_gradient_checkpointing_offload = use_gradient_checkpointing_offload
+        self.extra_inputs = extra_inputs.split(",") if extra_inputs is not None else []
+        self.fp8_models = fp8_models
+        self.task = task
+        # ===== Task routing (boilerplate): "sft:data_process" returns the inputs as-is; "sft" and "sft:train" compute FlowMatchSFTLoss =====
+        self.task_to_loss = {
+            "sft:data_process": lambda pipe, *args: args,
+            "sft": lambda pipe, inputs_shared, inputs_posi, inputs_nega: FlowMatchSFTLoss(pipe, **inputs_shared, **inputs_posi),
+            "sft:train": lambda pipe, inputs_shared, inputs_posi, inputs_nega: FlowMatchSFTLoss(pipe, **inputs_shared, **inputs_posi),
+        }
+
+    def get_pipeline_inputs(self, data):  # Map one dataset record to the (shared, positive, negative) pipeline input dicts.
+        inputs_posi = {"prompt": data["prompt"], "positive": True}
+        inputs_nega = {"positive": False}
+        duration = math.floor(data['audio'][0].shape[1] / data['audio'][1]) if data.get("audio") is not None else data.get("duration", 60)  # assumes data["audio"] is (waveform, sample_rate): whole seconds of audio, else the record's "duration" (default 60)
+        # ===== Shared parameters =====
+        inputs_shared = {
+            # ===== Core field mapping =====
+            "input_audio": data["audio"],
+            # ===== Metadata required by the audio generation task =====
+            "lyrics": data["lyrics"],
+            "task_type": "text2music",
+            "duration": duration,
+            "bpm": data.get("bpm", 100),
+            "keyscale": data.get("keyscale", "C major"),
+            "timesignature": data.get("timesignature", "4"),
+            "vocal_language": data.get("vocal_language", "unknown"),
+            # ===== Framework control parameters (boilerplate) =====
+            "cfg_scale": 1,
+            "rand_device": self.pipe.device,
+            "use_gradient_checkpointing": self.use_gradient_checkpointing,
+            "use_gradient_checkpointing_offload": self.use_gradient_checkpointing_offload,
+        }
+        # ===== Extra field injection: dataset column names configured via --extra_inputs (boilerplate) =====
+        inputs_shared = self.parse_extra_inputs(data, self.extra_inputs, inputs_shared)
+        return inputs_shared, inputs_posi, inputs_nega
+
+    def forward(self, data, inputs=None):  # Run every pipeline unit on the inputs, then return the result of the task's routing entry.
+        # ===== Standard implementation, do not modify (boilerplate) =====
+        if inputs is None: inputs = self.get_pipeline_inputs(data)
+        inputs = self.transfer_data_to_device(inputs, self.pipe.device, self.pipe.torch_dtype)
+        for unit in self.pipe.units:
+            inputs = self.pipe.unit_runner(unit, self.pipe, *inputs)
+        loss = self.task_to_loss[self.task](self.pipe, *inputs)
+        return loss
+
+
+def ace_step_parser():
+    """Return the general training parser extended with the ACE-Step tokenizer, silence-latent and CPU-init options."""
+    cli = add_general_config(argparse.ArgumentParser(description="ACE-Step training."))
+    cli.add_argument("--tokenizer_path", default=None, type=str, help="Tokenizer path in format model_id:origin_pattern.")
+    cli.add_argument("--silence_latent_path", default=None, type=str, help="Silence latent path in format model_id:origin_pattern.")
+    cli.add_argument("--initialize_model_on_cpu", action="store_true", default=False, help="Whether to initialize models on CPU.")
+    return cli
+
+
+if __name__ == "__main__":
+    parser = ace_step_parser()
+    args = parser.parse_args()
+    # ===== Accelerator setup (boilerplate) =====
+    accelerator = accelerate.Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        kwargs_handlers=[accelerate.DistributedDataParallelKwargs(find_unused_parameters=args.find_unused_parameters)],
+    )
+    # ===== Dataset definition =====
+    dataset = UnifiedDataset(
+        base_path=args.dataset_base_path,
+        metadata_path=args.dataset_metadata_path,
+        repeat=args.dataset_repeat,
+        data_file_keys=args.data_file_keys.split(","),
+        main_data_operator=None,
+        special_operator_map={
+            "audio": ToAbsolutePath(args.dataset_base_path) >> LoadPureAudioWithTorchaudio(target_sample_rate=48000),  # resolve against the dataset root, then load the audio resampled to 48 kHz
+        },
+    )
+    # ===== TrainingModule =====
+    model = AceStepTrainingModule(
+        model_paths=args.model_paths,
+        model_id_with_origin_paths=args.model_id_with_origin_paths,
+        tokenizer_path=args.tokenizer_path,
+        silence_latent_path=args.silence_latent_path,
+        trainable_models=args.trainable_models,
+        lora_base_model=args.lora_base_model,
+        lora_target_modules=args.lora_target_modules,
+        lora_rank=args.lora_rank,
+        lora_checkpoint=args.lora_checkpoint,
+        preset_lora_path=args.preset_lora_path,
+        preset_lora_model=args.preset_lora_model,
+        use_gradient_checkpointing=args.use_gradient_checkpointing,
+        use_gradient_checkpointing_offload=args.use_gradient_checkpointing_offload,
+        extra_inputs=args.extra_inputs,
+        fp8_models=args.fp8_models,
+        offload_models=args.offload_models,
+        task=args.task,
+        device="cpu" if args.initialize_model_on_cpu else accelerator.device,  # --initialize_model_on_cpu builds the models on CPU instead of the accelerator device
+    )
+    # ===== ModelLogger (boilerplate) =====
+    model_logger = ModelLogger(
+        args.output_path,
+        remove_prefix_in_ckpt=args.remove_prefix_in_ckpt,
+    )
+    # ===== Task routing (boilerplate): pick the launcher matching --task =====
+    launcher_map = {
+        "sft:data_process": launch_data_process_task,
+        "sft": launch_training_task,
+        "sft:train": launch_training_task,
+    }
+    launcher_map[args.task](accelerator, dataset, model, model_logger, args=args)
--- a/examples/ace_step/model_training/validate_full/Ace-Step1.5.py
+++ b/examples/ace_step/model_training/validate_full/Ace-Step1.5.py
@@ -0,0 +1,35 @@
+from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
+from diffsynth.utils.data.audio import save_audio
+from diffsynth import load_state_dict
+import torch
+
+
+pipe = AceStepPipeline.from_pretrained(  # inference pipeline in bfloat16 on CUDA
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+    ],
+    text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
+    silence_latent_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/silence_latent.pt"),
+)
+state_dict = load_state_dict("models/train/Ace-Step1.5_full/epoch-1.safetensors")  # presumably the checkpoint written by the matching full-training run; adjust to your own output path
+pipe.dit.load_state_dict(state_dict)  # replace the DiT weights loaded above with the checkpoint's
+
+prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
+lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
+audio = pipe(  # generate one sample with fixed settings so runs are comparable
+    prompt=prompt,
+    lyrics=lyrics,
+    duration=160,  # presumably seconds — TODO confirm
+    bpm=100,
+    keyscale="B minor",
+    timesignature="4",
+    vocal_language="zh",
+    seed=1,
+    num_inference_steps=30,
+    cfg_scale=4.0,
+)
+save_audio(audio, pipe.vae.sampling_rate, "Ace-Step1.5_full.wav")  # relative output path; saved at the VAE's sampling rate
--- a/examples/ace_step/model_training/validate_full/acestep-v15-base.py
+++ b/examples/ace_step/model_training/validate_full/acestep-v15-base.py
@@ -0,0 +1,35 @@
+from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
+from diffsynth.utils.data.audio import save_audio
+from diffsynth import load_state_dict
+import torch
+
+
+pipe = AceStepPipeline.from_pretrained(  # inference pipeline in bfloat16 on CUDA
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="ACE-Step/acestep-v15-base", origin_file_pattern="model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+    ],
+    text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
+    silence_latent_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/silence_latent.pt"),
+)
+state_dict = load_state_dict("models/train/acestep-v15-base_full/epoch-1.safetensors")  # presumably the checkpoint written by the matching full-training run; adjust to your own output path
+pipe.dit.load_state_dict(state_dict)  # replace the DiT weights loaded above with the checkpoint's
+
+prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
+lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
+audio = pipe(  # generate one sample with fixed settings so runs are comparable
+    prompt=prompt,
+    lyrics=lyrics,
+    duration=160,  # presumably seconds — TODO confirm
+    bpm=100,
+    keyscale="B minor",
+    timesignature="4",
+    vocal_language="zh",
+    seed=42,  # NOTE(review): the sibling validate_full scripts use seed=1 — confirm this difference is intentional
+    num_inference_steps=30,
+    cfg_scale=4.0,
+)
+save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-base_full.wav")  # relative output path; saved at the VAE's sampling rate
--- a/examples/ace_step/model_training/validate_full/acestep-v15-sft.py
+++ b/examples/ace_step/model_training/validate_full/acestep-v15-sft.py
@@ -0,0 +1,35 @@
+from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
+from diffsynth.utils.data.audio import save_audio
+from diffsynth import load_state_dict
+import torch
+
+
+pipe = AceStepPipeline.from_pretrained(  # inference pipeline in bfloat16 on CUDA
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="ACE-Step/acestep-v15-sft", origin_file_pattern="model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+    ],
+    text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
+    silence_latent_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/silence_latent.pt"),
+)
+state_dict = load_state_dict("models/train/acestep-v15-sft_full/epoch-1.safetensors")  # presumably the checkpoint written by the matching full-training run; adjust to your own output path
+pipe.dit.load_state_dict(state_dict)  # replace the DiT weights loaded above with the checkpoint's
+
+prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
+lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
+audio = pipe(  # generate one sample with fixed settings so runs are comparable
+    prompt=prompt,
+    lyrics=lyrics,
+    duration=160,  # presumably seconds — TODO confirm
+    bpm=100,
+    keyscale="B minor",
+    timesignature="4",
+    vocal_language="zh",
+    seed=1,
+    num_inference_steps=30,
+    cfg_scale=4.0,
+)
+save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-sft_full.wav")  # relative output path; saved at the VAE's sampling rate
--- a/examples/ace_step/model_training/validate_full/acestep-v15-turbo-continuous.py
+++ b/examples/ace_step/model_training/validate_full/acestep-v15-turbo-continuous.py
@@ -0,0 +1,35 @@
+from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
+from diffsynth.utils.data.audio import save_audio
+from diffsynth import load_state_dict
+import torch
+
+
+pipe = AceStepPipeline.from_pretrained(  # inference pipeline in bfloat16 on CUDA
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="ACE-Step/acestep-v15-turbo-continuous", origin_file_pattern="model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+    ],
+    text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
+    silence_latent_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/silence_latent.pt"),
+)
+state_dict = load_state_dict("models/train/acestep-v15-turbo-continuous_full/epoch-1.safetensors")  # presumably the checkpoint written by the matching full-training run; adjust to your own output path
+pipe.dit.load_state_dict(state_dict)  # replace the DiT weights loaded above with the checkpoint's
+
+prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
+lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
+audio = pipe(  # generate one sample with fixed settings so runs are comparable
+    prompt=prompt,
+    lyrics=lyrics,
+    duration=160,  # presumably seconds — TODO confirm
+    bpm=100,
+    keyscale="B minor",
+    timesignature="4",
+    vocal_language="zh",
+    seed=1,
+    num_inference_steps=30,
+    cfg_scale=4.0,
+)
+save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo-continuous_full.wav")  # relative output path; saved at the VAE's sampling rate
--- a/examples/ace_step/model_training/validate_full/acestep-v15-turbo-shift1.py
+++ b/examples/ace_step/model_training/validate_full/acestep-v15-turbo-shift1.py
@@ -0,0 +1,35 @@
+from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
+from diffsynth.utils.data.audio import save_audio
+from diffsynth import load_state_dict
+import torch
+
+
+pipe = AceStepPipeline.from_pretrained(  # inference pipeline in bfloat16 on CUDA
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="ACE-Step/acestep-v15-turbo-shift1", origin_file_pattern="model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+    ],
+    text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
+    silence_latent_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/silence_latent.pt"),
+)
+state_dict = load_state_dict("models/train/acestep-v15-turbo-shift1_full/epoch-1.safetensors")  # presumably the checkpoint written by the matching full-training run; adjust to your own output path
+pipe.dit.load_state_dict(state_dict)  # replace the DiT weights loaded above with the checkpoint's
+
+prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
+lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
+audio = pipe(  # generate one sample with fixed settings so runs are comparable
+    prompt=prompt,
+    lyrics=lyrics,
+    duration=160,  # presumably seconds — TODO confirm
+    bpm=100,
+    keyscale="B minor",
+    timesignature="4",
+    vocal_language="zh",
+    seed=1,
+    num_inference_steps=30,
+    cfg_scale=4.0,
+)
+save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo-shift1_full.wav")  # relative output path; saved at the VAE's sampling rate
--- a/examples/ace_step/model_training/validate_full/acestep-v15-turbo-shift3.py
+++ b/examples/ace_step/model_training/validate_full/acestep-v15-turbo-shift3.py
@@ -0,0 +1,35 @@
+from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
+from diffsynth.utils.data.audio import save_audio
+from diffsynth import load_state_dict
+import torch
+
+
+pipe = AceStepPipeline.from_pretrained(  # inference pipeline in bfloat16 on CUDA
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="ACE-Step/acestep-v15-turbo-shift3", origin_file_pattern="model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+    ],
+    text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
+    silence_latent_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/silence_latent.pt"),
+)
+state_dict = load_state_dict("models/train/acestep-v15-turbo-shift3_full/epoch-1.safetensors")  # presumably the checkpoint written by the matching full-training run; adjust to your own output path
+pipe.dit.load_state_dict(state_dict)  # replace the DiT weights loaded above with the checkpoint's
+
+prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
+lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
+audio = pipe(  # generate one sample with fixed settings so runs are comparable
+    prompt=prompt,
+    lyrics=lyrics,
+    duration=160,  # presumably seconds — TODO confirm
+    bpm=100,
+    keyscale="B minor",
+    timesignature="4",
+    vocal_language="zh",
+    seed=1,
+    num_inference_steps=30,
+    cfg_scale=4.0,
+)
+save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo-shift3_full.wav")  # relative output path; saved at the VAE's sampling rate
--- a/examples/ace_step/model_training/validate_full/acestep-v15-xl-base.py
+++ b/examples/ace_step/model_training/validate_full/acestep-v15-xl-base.py
@@ -0,0 +1,35 @@
+from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
+from diffsynth.utils.data.audio import save_audio
+from diffsynth import load_state_dict
+import torch
+# Validation for full training: load a trained state dict into pipe.dit and generate one fixed-seed sample.
+# The pipeline is assembled on CUDA in bfloat16 from three weight files plus tokenizer and silence-latent configs.
+pipe = AceStepPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="ACE-Step/acestep-v15-xl-base", origin_file_pattern="model-*.safetensors"),  # glob pattern; may match several files
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+    ],
+    text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
+    silence_latent_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/silence_latent.pt"),  # taken from the acestep-v15-turbo folder of ACE-Step/Ace-Step1.5
+)
+state_dict = load_state_dict("models/train/acestep-v15-xl-base_full/epoch-1.safetensors")  # relative path; presumably the epoch-1 output of the matching training run - confirm
+pipe.dit.load_state_dict(state_dict)  # replace the weights of pipe.dit with the loaded state dict
+# Text conditioning for the sample: a style description plus lyrics annotated with section tags.
+prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
+lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
+audio = pipe(
+    prompt=prompt,
+    lyrics=lyrics,
+    duration=160,  # presumably seconds - confirm against AceStepPipeline.__call__
+    bpm=100,
+    keyscale="B minor",
+    timesignature="4",
+    vocal_language="zh",
+    seed=1,  # fixed seed for this validation sample
+    num_inference_steps=30,
+    cfg_scale=4.0,
+)
+save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-xl-base_full.wav")  # saved at the VAE's sampling rate, relative to the working directory
--- a/examples/ace_step/model_training/validate_full/acestep-v15-xl-sft.py
+++ b/examples/ace_step/model_training/validate_full/acestep-v15-xl-sft.py
@@ -0,0 +1,35 @@
+from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
+from diffsynth.utils.data.audio import save_audio
+from diffsynth import load_state_dict
+import torch
+# Validation for full training: load a trained state dict into pipe.dit and generate one fixed-seed sample.
+# The pipeline is assembled on CUDA in bfloat16 from three weight files plus tokenizer and silence-latent configs.
+pipe = AceStepPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="ACE-Step/acestep-v15-xl-sft", origin_file_pattern="model-*.safetensors"),  # glob pattern; may match several files
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+    ],
+    text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
+    silence_latent_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/silence_latent.pt"),  # taken from the acestep-v15-turbo folder of ACE-Step/Ace-Step1.5
+)
+state_dict = load_state_dict("models/train/acestep-v15-xl-sft_full/epoch-1.safetensors")  # relative path; presumably the epoch-1 output of the matching training run - confirm
+pipe.dit.load_state_dict(state_dict)  # replace the weights of pipe.dit with the loaded state dict
+# Text conditioning for the sample: a style description plus lyrics annotated with section tags.
+prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
+lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
+audio = pipe(
+    prompt=prompt,
+    lyrics=lyrics,
+    duration=160,  # presumably seconds - confirm against AceStepPipeline.__call__
+    bpm=100,
+    keyscale="B minor",
+    timesignature="4",
+    vocal_language="zh",
+    seed=1,  # fixed seed for this validation sample
+    num_inference_steps=30,
+    cfg_scale=4.0,
+)
+save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-xl-sft_full.wav")  # saved at the VAE's sampling rate, relative to the working directory
--- a/examples/ace_step/model_training/validate_lora/Ace-Step1.5.py
+++ b/examples/ace_step/model_training/validate_lora/Ace-Step1.5.py
@@ -0,0 +1,33 @@
+from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
+from diffsynth.utils.data.audio import save_audio
+import torch
+# Validation for LoRA training: apply a trained LoRA to pipe.dit and generate one fixed-seed sample.
+# The pipeline is assembled on CUDA in bfloat16 from three weight files plus tokenizer and silence-latent configs.
+pipe = AceStepPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+    ],
+    text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
+    silence_latent_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/silence_latent.pt"),  # taken from the acestep-v15-turbo folder of ACE-Step/Ace-Step1.5
+)
+pipe.load_lora(pipe.dit, "models/train/Ace-Step1.5_lora/epoch-9.safetensors", alpha=1)  # relative path; presumably the epoch-9 output of the matching LoRA training run - confirm
+# Text conditioning for the sample: a style description plus lyrics annotated with section tags.
+prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
+lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
+audio = pipe(
+    prompt=prompt,
+    lyrics=lyrics,
+    duration=160,  # presumably seconds - confirm against AceStepPipeline.__call__
+    bpm=100,
+    keyscale="B minor",
+    timesignature="4",
+    vocal_language="zh",
+    seed=1,  # fixed seed for this validation sample
+    num_inference_steps=30,
+    cfg_scale=4.0,
+)
+save_audio(audio, pipe.vae.sampling_rate, "Ace-Step1.5_lora.wav")  # saved at the VAE's sampling rate, relative to the working directory
--- a/examples/ace_step/model_training/validate_lora/acestep-v15-base.py
+++ b/examples/ace_step/model_training/validate_lora/acestep-v15-base.py
@@ -0,0 +1,33 @@
+from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
+from diffsynth.utils.data.audio import save_audio
+import torch
+# Validation for LoRA training: apply a trained LoRA to pipe.dit and generate one fixed-seed sample.
+# The pipeline is assembled on CUDA in bfloat16 from three weight files plus tokenizer and silence-latent configs.
+pipe = AceStepPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="ACE-Step/acestep-v15-base", origin_file_pattern="model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+    ],
+    text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
+    silence_latent_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/silence_latent.pt"),  # taken from the acestep-v15-turbo folder of ACE-Step/Ace-Step1.5
+)
+pipe.load_lora(pipe.dit, "models/train/acestep-v15-base_lora/epoch-9.safetensors", alpha=1)  # relative path; presumably the epoch-9 output of the matching LoRA training run - confirm
+# Text conditioning for the sample: a style description plus lyrics annotated with section tags.
+prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
+lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
+audio = pipe(
+    prompt=prompt,
+    lyrics=lyrics,
+    duration=160,  # presumably seconds - confirm against AceStepPipeline.__call__
+    bpm=100,
+    keyscale="B minor",
+    timesignature="4",
+    vocal_language="zh",
+    seed=1,  # fixed seed for this validation sample
+    num_inference_steps=30,
+    cfg_scale=4.0,
+)
+save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-base_lora.wav")  # saved at the VAE's sampling rate, relative to the working directory
--- a/examples/ace_step/model_training/validate_lora/acestep-v15-sft.py
+++ b/examples/ace_step/model_training/validate_lora/acestep-v15-sft.py
@@ -0,0 +1,33 @@
+from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
+from diffsynth.utils.data.audio import save_audio
+import torch
+# Validation for LoRA training: apply a trained LoRA to pipe.dit and generate one fixed-seed sample.
+# The pipeline is assembled on CUDA in bfloat16 from three weight files plus tokenizer and silence-latent configs.
+pipe = AceStepPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="ACE-Step/acestep-v15-sft", origin_file_pattern="model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+    ],
+    text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
+    silence_latent_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/silence_latent.pt"),  # taken from the acestep-v15-turbo folder of ACE-Step/Ace-Step1.5
+)
+pipe.load_lora(pipe.dit, "models/train/acestep-v15-sft_lora/epoch-9.safetensors", alpha=1)  # relative path; presumably the epoch-9 output of the matching LoRA training run - confirm
+# Text conditioning for the sample: a style description plus lyrics annotated with section tags.
+prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
+lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
+audio = pipe(
+    prompt=prompt,
+    lyrics=lyrics,
+    duration=160,  # presumably seconds - confirm against AceStepPipeline.__call__
+    bpm=100,
+    keyscale="B minor",
+    timesignature="4",
+    vocal_language="zh",
+    seed=1,  # fixed seed for this validation sample
+    num_inference_steps=30,
+    cfg_scale=4.0,
+)
+save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-sft_lora.wav")  # saved at the VAE's sampling rate, relative to the working directory
--- a/examples/ace_step/model_training/validate_lora/acestep-v15-turbo-continuous.py
+++ b/examples/ace_step/model_training/validate_lora/acestep-v15-turbo-continuous.py
@@ -0,0 +1,33 @@
+from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
+from diffsynth.utils.data.audio import save_audio
+import torch
+# Validation for LoRA training: apply a trained LoRA to pipe.dit and generate one fixed-seed sample.
+# The pipeline is assembled on CUDA in bfloat16 from three weight files plus tokenizer and silence-latent configs.
+pipe = AceStepPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="ACE-Step/acestep-v15-turbo-continuous", origin_file_pattern="model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+    ],
+    text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
+    silence_latent_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/silence_latent.pt"),  # taken from the acestep-v15-turbo folder of ACE-Step/Ace-Step1.5
+)
+pipe.load_lora(pipe.dit, "models/train/acestep-v15-turbo-continuous_lora/epoch-9.safetensors", alpha=1)  # relative path; presumably the epoch-9 output of the matching LoRA training run - confirm
+# Text conditioning for the sample: a style description plus lyrics annotated with section tags.
+prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
+lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
+audio = pipe(
+    prompt=prompt,
+    lyrics=lyrics,
+    duration=160,  # presumably seconds - confirm against AceStepPipeline.__call__
+    bpm=100,
+    keyscale="B minor",
+    timesignature="4",
+    vocal_language="zh",
+    seed=1,  # fixed seed for this validation sample
+    num_inference_steps=30,
+    cfg_scale=4.0,
+)
+save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo-continuous_lora.wav")  # saved at the VAE's sampling rate, relative to the working directory
--- a/examples/ace_step/model_training/validate_lora/acestep-v15-turbo-shift1.py
+++ b/examples/ace_step/model_training/validate_lora/acestep-v15-turbo-shift1.py
@@ -0,0 +1,33 @@
+from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
+from diffsynth.utils.data.audio import save_audio
+import torch
+# Validation for LoRA training: apply a trained LoRA to pipe.dit and generate one fixed-seed sample.
+# The pipeline is assembled on CUDA in bfloat16 from three weight files plus tokenizer and silence-latent configs.
+pipe = AceStepPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="ACE-Step/acestep-v15-turbo-shift1", origin_file_pattern="model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+    ],
+    text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
+    silence_latent_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/silence_latent.pt"),  # taken from the acestep-v15-turbo folder of ACE-Step/Ace-Step1.5
+)
+pipe.load_lora(pipe.dit, "models/train/acestep-v15-turbo-shift1_lora/epoch-9.safetensors", alpha=1)  # relative path; presumably the epoch-9 output of the matching LoRA training run - confirm
+# Text conditioning for the sample: a style description plus lyrics annotated with section tags.
+prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
+lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
+audio = pipe(
+    prompt=prompt,
+    lyrics=lyrics,
+    duration=160,  # presumably seconds - confirm against AceStepPipeline.__call__
+    bpm=100,
+    keyscale="B minor",
+    timesignature="4",
+    vocal_language="zh",
+    seed=1,  # fixed seed for this validation sample
+    num_inference_steps=30,
+    cfg_scale=4.0,
+)
+save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo-shift1_lora.wav")  # saved at the VAE's sampling rate, relative to the working directory
--- a/examples/ace_step/model_training/validate_lora/acestep-v15-turbo-shift3.py
+++ b/examples/ace_step/model_training/validate_lora/acestep-v15-turbo-shift3.py
@@ -0,0 +1,33 @@
+from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
+from diffsynth.utils.data.audio import save_audio
+import torch
+# Validation for LoRA training: apply a trained LoRA to pipe.dit and generate one fixed-seed sample.
+# The pipeline is assembled on CUDA in bfloat16 from three weight files plus tokenizer and silence-latent configs.
+pipe = AceStepPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="ACE-Step/acestep-v15-turbo-shift3", origin_file_pattern="model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+    ],
+    text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
+    silence_latent_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/silence_latent.pt"),  # taken from the acestep-v15-turbo folder of ACE-Step/Ace-Step1.5
+)
+pipe.load_lora(pipe.dit, "models/train/acestep-v15-turbo-shift3_lora/epoch-9.safetensors", alpha=1)  # relative path; presumably the epoch-9 output of the matching LoRA training run - confirm
+# Text conditioning for the sample: a style description plus lyrics annotated with section tags.
+prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
+lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
+audio = pipe(
+    prompt=prompt,
+    lyrics=lyrics,
+    duration=160,  # presumably seconds - confirm against AceStepPipeline.__call__
+    bpm=100,
+    keyscale="B minor",
+    timesignature="4",
+    vocal_language="zh",
+    seed=1,  # fixed seed for this validation sample
+    num_inference_steps=30,
+    cfg_scale=4.0,
+)
+save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo-shift3_lora.wav")  # saved at the VAE's sampling rate, relative to the working directory
--- a/examples/ace_step/model_training/validate_lora/acestep-v15-xl-base.py
+++ b/examples/ace_step/model_training/validate_lora/acestep-v15-xl-base.py
@@ -0,0 +1,33 @@
+from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
+from diffsynth.utils.data.audio import save_audio
+import torch
+# Validation for LoRA training: apply a trained LoRA to pipe.dit and generate one fixed-seed sample.
+# The pipeline is assembled on CUDA in bfloat16 from three weight files plus tokenizer and silence-latent configs.
+pipe = AceStepPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="ACE-Step/acestep-v15-xl-base", origin_file_pattern="model-*.safetensors"),  # glob pattern; may match several files
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+    ],
+    text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
+    silence_latent_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/silence_latent.pt"),  # taken from the acestep-v15-turbo folder of ACE-Step/Ace-Step1.5
+)
+pipe.load_lora(pipe.dit, "models/train/acestep-v15-xl-base_lora/epoch-9.safetensors", alpha=1)  # relative path; presumably the epoch-9 output of the matching LoRA training run - confirm
+# Text conditioning for the sample: a style description plus lyrics annotated with section tags.
+prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
+lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
+audio = pipe(
+    prompt=prompt,
+    lyrics=lyrics,
+    duration=160,  # presumably seconds - confirm against AceStepPipeline.__call__
+    bpm=100,
+    keyscale="B minor",
+    timesignature="4",
+    vocal_language="zh",
+    seed=1,  # fixed seed for this validation sample
+    num_inference_steps=30,
+    cfg_scale=4.0,
+)
+save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-xl-base_lora.wav")  # saved at the VAE's sampling rate, relative to the working directory
--- a/examples/ace_step/model_training/validate_lora/acestep-v15-xl-sft.py
+++ b/examples/ace_step/model_training/validate_lora/acestep-v15-xl-sft.py
@@ -0,0 +1,33 @@
+from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
+from diffsynth.utils.data.audio import save_audio
+import torch
+# Validation for LoRA training: apply a trained LoRA to pipe.dit and generate one fixed-seed sample.
+# The pipeline is assembled on CUDA in bfloat16 from three weight files plus tokenizer and silence-latent configs.
+pipe = AceStepPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="ACE-Step/acestep-v15-xl-sft", origin_file_pattern="model-*.safetensors"),  # glob pattern; may match several files
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+    ],
+    text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
+    silence_latent_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/silence_latent.pt"),  # taken from the acestep-v15-turbo folder of ACE-Step/Ace-Step1.5
+)
+pipe.load_lora(pipe.dit, "models/train/acestep-v15-xl-sft_lora/epoch-9.safetensors", alpha=1)  # relative path; presumably the epoch-9 output of the matching LoRA training run - confirm
+# Text conditioning for the sample: a style description plus lyrics annotated with section tags.
+prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
+lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
+audio = pipe(
+    prompt=prompt,
+    lyrics=lyrics,
+    duration=160,  # presumably seconds - confirm against AceStepPipeline.__call__
+    bpm=100,
+    keyscale="B minor",
+    timesignature="4",
+    vocal_language="zh",
+    seed=1,  # fixed seed for this validation sample
+    num_inference_steps=30,
+    cfg_scale=4.0,
+)
+save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-xl-sft_lora.wav")  # saved at the VAE's sampling rate, relative to the working directory
--- a/examples/ace_step/model_training/validate_lora/acestep-v15-xl-turbo.py
+++ b/examples/ace_step/model_training/validate_lora/acestep-v15-xl-turbo.py
@@ -0,0 +1,33 @@
+from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
+from diffsynth.utils.data.audio import save_audio
+import torch
+# Validation for LoRA training: apply a trained LoRA to pipe.dit and generate one fixed-seed sample.
+# The pipeline is assembled on CUDA in bfloat16 from three weight files plus tokenizer and silence-latent configs.
+pipe = AceStepPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="ACE-Step/acestep-v15-xl-turbo", origin_file_pattern="model-*.safetensors"),  # glob pattern; may match several files
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
+        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+    ],
+    text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
+    silence_latent_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/silence_latent.pt"),  # taken from the acestep-v15-turbo folder of ACE-Step/Ace-Step1.5
+)
+pipe.load_lora(pipe.dit, "models/train/acestep-v15-xl-turbo_lora/epoch-9.safetensors", alpha=1)  # relative path; presumably the epoch-9 output of the matching LoRA training run - confirm
+# Text conditioning for the sample: a style description plus lyrics annotated with section tags.
+prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
+lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
+audio = pipe(
+    prompt=prompt,
+    lyrics=lyrics,
+    duration=160,  # presumably seconds - confirm against AceStepPipeline.__call__
+    bpm=100,
+    keyscale="B minor",
+    timesignature="4",
+    vocal_language="zh",
+    seed=1,  # fixed seed for this validation sample
+    num_inference_steps=30,
+    cfg_scale=4.0,
+)
+save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-xl-turbo_lora.wav")  # saved at the VAE's sampling rate, relative to the working directory