This commit is contained in:
mi804
2026-04-21 20:12:15 +08:00
parent 95cfb77881
commit f5a3201d42
9 changed files with 133 additions and 369 deletions

View File

@@ -145,7 +145,6 @@ class AceStepPipeline(BasePipeline):
# 4. Denoise loop # 4. Denoise loop
self.load_models_to_device(self.in_iteration_models) self.load_models_to_device(self.in_iteration_models)
models = {name: getattr(self, name) for name in self.in_iteration_models} models = {name: getattr(self, name) for name in self.in_iteration_models}
self.momentum_buffer = MomentumBuffer()
for progress_id, timestep in enumerate(progress_bar_cmd(self.scheduler.timesteps)): for progress_id, timestep in enumerate(progress_bar_cmd(self.scheduler.timesteps)):
timestep = timestep.to(dtype=self.torch_dtype, device=self.device) timestep = timestep.to(dtype=self.torch_dtype, device=self.device)
noise_pred = self.cfg_guided_model_fn( noise_pred = self.cfg_guided_model_fn(

View File

@@ -1,183 +0,0 @@
"""
Ace-Step 1.5 — Text-to-Music with Simple Mode (LLM expansion).
Uses the ACE-Step LLM to expand a simple description into structured
parameters (caption, lyrics, bpm, keyscale, etc.) AND audio codes,
then feeds them to the DiffSynth Pipeline.
The LLM expansion uses the target library's LLMHandler. If vLLM is
not available, it falls back to using pre-structured parameters.
Usage:
python examples/ace_step/model_inference/Ace-Step1.5-SimpleMode.py
"""
import os
import sys
import json
import torch
import soundfile as sf
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
# ---------------------------------------------------------------------------
# Simple Mode: LLM expansion
# ---------------------------------------------------------------------------
def try_load_llm_handler(checkpoint_dir: str, lm_model_path: str = "acestep-5Hz-lm-1.7B",
                         backend: str = "vllm"):
    """Attempt to construct and initialize the target library's LLMHandler.

    Any failure (missing package, failed initialization, raised exception)
    is reported to stdout and yields ``(None, False)``; on success the live
    handler is returned as ``(handler, True)``.
    """
    try:
        from acestep.llm_inference import LLMHandler

        llm = LLMHandler()
        status, ok = llm.initialize(
            checkpoint_dir=checkpoint_dir,
            lm_model_path=lm_model_path,
            backend=backend,
        )
        # Guard clause: report the failure status and bail out early.
        if not ok:
            print(f"[Simple Mode] LLM init failed: {status}")
            return None, False
        print(f"[Simple Mode] LLM loaded via {backend} backend: {status}")
        return llm, True
    except Exception as e:
        # Covers both a missing acestep package and errors during init.
        print(f"[Simple Mode] LLMHandler not available: {e}")
        return None, False
def expand_with_llm(llm_handler, description: str, duration: float = 30.0):
    """Expand a simple description using LLM Chain-of-Thought.

    Returns (params_dict, audio_codes_string); (None, "") when the LLM
    call reports failure or returns no metadata.
    """
    result = llm_handler.generate_with_stop_condition(
        caption=description,
        lyrics="",
        infer_type="dit",  # metadata + audio codes
        temperature=0.85,
        cfg_scale=1.0,
        use_cot_metas=True,
        use_cot_caption=True,
        use_cot_language=True,
        user_metadata={"duration": int(duration)},
    )
    meta = result.get("metadata") if result.get("success") else None
    if not meta:
        print(f"[Simple Mode] LLM expansion failed: {result.get('error', 'unknown')}")
        return None, ""
    # Per-key fallbacks used when the LLM omits a field from its metadata.
    defaults = {
        "caption": description,
        "lyrics": "",
        "bpm": 100,
        "keyscale": "",
        "language": "en",
        "timesignature": "4",
        "duration": duration,
    }
    params = {key: meta.get(key, fallback) for key, fallback in defaults.items()}
    return params, result.get("audio_codes", "")
def fallback_expand(description: str, duration: float = 30.0):
    """Fallback expansion when the LLM is unavailable.

    Uses *description* verbatim as the caption together with neutral default
    musical parameters, and returns empty audio codes.

    Returns:
        (params_dict, audio_codes_string) — same shape as expand_with_llm.
    """
    # Fix: the original used an f-string with no placeholders (F541);
    # a plain string literal is the correct form.
    print("[Simple Mode] LLM not available. Using description as caption.")
    return {
        "caption": description,
        "lyrics": "",
        "bpm": 100,
        "keyscale": "",
        "language": "en",
        "timesignature": "4",
        "duration": duration,
    }, ""
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """End-to-end Simple Mode demo.

    1. Try to load the target library's LLM handler.
    2. Expand a short description into structured parameters (+ audio codes),
       falling back to defaults when the LLM is unavailable.
    3. Load the DiffSynth AceStepPipeline and generate audio.
    """
    # Target library path (for LLMHandler). Normalized with abspath so the
    # sys.path entry (and the checkpoint_dir handed to the LLM) is a clean
    # absolute path rather than a "../../.."-relative one.
    TARGET_LIB = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "../../../../ACE-Step-1.5")
    )
    if TARGET_LIB not in sys.path:
        sys.path.insert(0, TARGET_LIB)
    description = "a soft Bengali love song for a quiet evening"
    duration = 30.0
    # 1. Try to load LLM
    print("=" * 60)
    print("Ace-Step 1.5 — Simple Mode (LLM expansion)")
    print("=" * 60)
    print(f"\n[Simple Mode] Input: '{description}'")
    llm_handler, llm_ok = try_load_llm_handler(
        checkpoint_dir=TARGET_LIB,
        lm_model_path="acestep-5Hz-lm-1.7B",
    )
    # 2. Expand parameters + audio codes. Even when the LLM loads, expansion
    # itself can fail, so the fallback covers both paths.
    if llm_ok:
        params, audio_codes = expand_with_llm(llm_handler, description, duration=duration)
        if params is None:
            params, audio_codes = fallback_expand(description, duration)
    else:
        params, audio_codes = fallback_expand(description, duration)
    # Fix: the three banner prints below were f-strings with no placeholders
    # (F541); they are plain string literals now.
    print("\n[Simple Mode] Parameters:")
    print(f"  Caption: {params['caption'][:100]}...")
    print(f"  Lyrics: {len(params['lyrics'])} chars")
    print(f"  BPM: {params['bpm']}, Keyscale: {params['keyscale']}")
    print(f"  Language: {params['language']}, Time Sig: {params['timesignature']}")
    print(f"  Duration: {params['duration']}s")
    print(f"  Audio codes: {len(audio_codes)} chars" if audio_codes else "  Audio codes: None (fallback)")
    # 3. Load Pipeline (turbo DiT + Qwen3 text encoder + VAE).
    print("\n[Pipeline] Loading Ace-Step 1.5 (turbo)...")
    pipe = AceStepPipeline.from_pretrained(
        torch_dtype=torch.bfloat16,
        device="cuda",
        model_configs=[
            ModelConfig(
                model_id="ACE-Step/Ace-Step1.5",
                origin_file_pattern="acestep-v15-turbo/model.safetensors"
            ),
            ModelConfig(
                model_id="ACE-Step/Ace-Step1.5",
                origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"
            ),
            ModelConfig(
                model_id="ACE-Step/Ace-Step1.5",
                origin_file_pattern="vae/diffusion_pytorch_model.safetensors"
            ),
        ],
        text_tokenizer_config=ModelConfig(
            model_id="ACE-Step/Ace-Step1.5",
            origin_file_pattern="Qwen3-Embedding-0.6B/"
        ),
    )
    # 4. Generate. Turbo settings: 8 steps, cfg_scale=1.0 (no CFG), shift=3.0.
    print("\n[Generation] Running Pipeline...")
    audio = pipe(
        prompt=params["caption"],
        lyrics=params["lyrics"],
        duration=params["duration"],
        audio_codes=audio_codes if audio_codes else None,
        seed=42,
        num_inference_steps=8,
        cfg_scale=1.0,
        shift=3.0,
    )
    output_path = "Ace-Step1.5-SimpleMode.wav"
    sf.write(output_path, audio.cpu().numpy(), pipe.sample_rate)
    print(f"\n[Done] Saved to {output_path}")
    print(f"  Shape: {audio.shape}, Duration: {audio.shape[-1] / pipe.sample_rate:.1f}s")
if __name__ == "__main__":
main()

View File

@@ -1,52 +1,38 @@
""" """
Ace-Step 1.5 SFT (supervised fine-tuned, 24 layers) — Text-to-Music inference example. Ace-Step 1.5 SFT (supervised fine-tuned) — Text-to-Music inference example.
SFT variant is fine-tuned for specific music styles. SFT variant is fine-tuned for specific music styles.
Non-turbo model: uses num_inference_steps=30, cfg_scale=4.0.
""" """
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
from diffsynth.utils.data.audio import save_audio
import torch import torch
import soundfile as sf
pipe = AceStepPipeline.from_pretrained( pipe = AceStepPipeline.from_pretrained(
torch_dtype=torch.bfloat16, torch_dtype=torch.bfloat16,
device="cuda", device="cuda",
model_configs=[ model_configs=[
ModelConfig( ModelConfig(model_id="ACE-Step/acestep-v15-sft", origin_file_pattern="model.safetensors"),
model_id="ACE-Step/Ace-Step1.5", ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
origin_file_pattern="acestep-v15-sft/model.safetensors" ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
),
ModelConfig(
model_id="ACE-Step/Ace-Step1.5",
origin_file_pattern="acestep-v15-sft/model.safetensors"
),
ModelConfig(
model_id="ACE-Step/Ace-Step1.5",
origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"
),
], ],
tokenizer_config=ModelConfig( text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
model_id="ACE-Step/Ace-Step1.5",
origin_file_pattern="Qwen3-Embedding-0.6B/"
),
vae_config=ModelConfig(
model_id="ACE-Step/Ace-Step1.5",
origin_file_pattern="vae/"
),
) )
prompt = "A jazzy lo-fi beat with smooth saxophone and vinyl crackle, late night vibes" prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
lyrics = "[Intro - Vinyl crackle]\n\n[Verse 1]\nMidnight city, neon glow\nSmooth jazz flowing to and fro\n\n[Chorus]\nLay back, let the music play\nJazzy nights, dreams drift away" lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
audio = pipe( audio = pipe(
prompt=prompt, prompt=prompt,
lyrics=lyrics, lyrics=lyrics,
duration=30.0, duration=160,
bpm=100,
keyscale="B minor",
timesignature="4",
vocal_language="zh",
seed=42, seed=42,
num_inference_steps=20, num_inference_steps=30,
cfg_scale=7.0, cfg_scale=4.0,
shift=3.0,
) )
save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-sft.wav")
sf.write("acestep-v15-sft.wav", audio.cpu().numpy(), pipe.sample_rate)
print(f"Saved, shape: {audio.shape}")

View File

@@ -0,0 +1,36 @@
"""
Ace-Step 1.5 Turbo (continuous, shift 1-5) — Text-to-Music inference example.
Turbo model: no num_inference_steps or cfg_scale (use defaults).
Continuous variant: handles shift range internally, no shift parameter needed.
"""
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
from diffsynth.utils.data.audio import save_audio
import torch
pipe = AceStepPipeline.from_pretrained(
torch_dtype=torch.bfloat16,
device="cuda",
model_configs=[
ModelConfig(model_id="ACE-Step/acestep-v15-turbo-continuous", origin_file_pattern="model.safetensors"),
ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
],
text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
)
prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
audio = pipe(
prompt=prompt,
lyrics=lyrics,
duration=160,
bpm=100,
keyscale="B minor",
timesignature="4",
vocal_language="zh",
seed=42,
)
save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo-continuous.wav")

View File

@@ -1,52 +1,36 @@
""" """
Ace-Step 1.5 Turbo (shift=1) — Text-to-Music inference example. Ace-Step 1.5 Turbo (shift=1) — Text-to-Music inference example.
Uses shift=1.0 (no timestep transformation) for smoother, slower denoising. Turbo model: no num_inference_steps or cfg_scale (use defaults).
shift=1: default value, no need to pass.
""" """
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
from diffsynth.utils.data.audio import save_audio
import torch import torch
import soundfile as sf
pipe = AceStepPipeline.from_pretrained( pipe = AceStepPipeline.from_pretrained(
torch_dtype=torch.bfloat16, torch_dtype=torch.bfloat16,
device="cuda", device="cuda",
model_configs=[ model_configs=[
ModelConfig( ModelConfig(model_id="ACE-Step/acestep-v15-turbo-shift1", origin_file_pattern="model.safetensors"),
model_id="ACE-Step/Ace-Step1.5", ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
origin_file_pattern="acestep-v15-turbo/model.safetensors" ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
),
ModelConfig(
model_id="ACE-Step/Ace-Step1.5",
origin_file_pattern="acestep-v15-turbo/model.safetensors"
),
ModelConfig(
model_id="ACE-Step/Ace-Step1.5",
origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"
),
], ],
tokenizer_config=ModelConfig( text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
model_id="ACE-Step/Ace-Step1.5",
origin_file_pattern="Qwen3-Embedding-0.6B/"
),
vae_config=ModelConfig(
model_id="ACE-Step/Ace-Step1.5",
origin_file_pattern="vae/"
),
) )
prompt = "A gentle acoustic guitar melody with soft piano accompaniment, peaceful and warm atmosphere" prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
lyrics = "[Verse 1]\nSunlight filtering through the trees\nA quiet moment, just the breeze\n\n[Chorus]\nPeaceful heart, open mind\nLeaving all the noise behind" lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
audio = pipe( audio = pipe(
prompt=prompt, prompt=prompt,
lyrics=lyrics, lyrics=lyrics,
duration=30.0, duration=160,
bpm=100,
keyscale="B minor",
timesignature="4",
vocal_language="zh",
seed=42, seed=42,
num_inference_steps=8,
cfg_scale=1.0,
shift=1.0, # shift=1: no timestep transformation
) )
save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo-shift1.wav")
sf.write("acestep-v15-turbo-shift1.wav", audio.cpu().numpy(), pipe.sample_rate)
print(f"Saved, shape: {audio.shape}")

View File

@@ -1,52 +1,37 @@
""" """
Ace-Step 1.5 Turbo (shift=3) — Text-to-Music inference example. Ace-Step 1.5 Turbo (shift=3) — Text-to-Music inference example.
Uses shift=3.0 (default turbo shift) for faster denoising convergence. Turbo model: no num_inference_steps or cfg_scale (use defaults).
shift=3: explicitly passed for this variant.
""" """
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
from diffsynth.utils.data.audio import save_audio
import torch import torch
import soundfile as sf
pipe = AceStepPipeline.from_pretrained( pipe = AceStepPipeline.from_pretrained(
torch_dtype=torch.bfloat16, torch_dtype=torch.bfloat16,
device="cuda", device="cuda",
model_configs=[ model_configs=[
ModelConfig( ModelConfig(model_id="ACE-Step/acestep-v15-turbo-shift3", origin_file_pattern="model.safetensors"),
model_id="ACE-Step/Ace-Step1.5", ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
origin_file_pattern="acestep-v15-turbo/model.safetensors" ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
),
ModelConfig(
model_id="ACE-Step/Ace-Step1.5",
origin_file_pattern="acestep-v15-turbo/model.safetensors"
),
ModelConfig(
model_id="ACE-Step/Ace-Step1.5",
origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"
),
], ],
tokenizer_config=ModelConfig( text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
model_id="ACE-Step/Ace-Step1.5",
origin_file_pattern="Qwen3-Embedding-0.6B/"
),
vae_config=ModelConfig(
model_id="ACE-Step/Ace-Step1.5",
origin_file_pattern="vae/"
),
) )
prompt = "An explosive, high-energy pop-rock track with anime theme song feel" prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
lyrics = "[Intro]\n\n[Verse 1]\nRunning through the neon lights\nChasing dreams across the night\n\n[Chorus]\nFeel the fire in my soul\nMusic takes complete control" lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
audio = pipe( audio = pipe(
prompt=prompt, prompt=prompt,
lyrics=lyrics, lyrics=lyrics,
duration=30.0, duration=160,
bpm=100,
keyscale="B minor",
timesignature="4",
vocal_language="zh",
seed=42, seed=42,
num_inference_steps=8, shift=3,
cfg_scale=1.0,
shift=3.0,
) )
save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo-shift3.wav")
sf.write("acestep-v15-turbo-shift3.wav", audio.cpu().numpy(), pipe.sample_rate)
print(f"Saved, shape: {audio.shape}")

View File

@@ -2,51 +2,37 @@
Ace-Step 1.5 XL Base (32 layers, hidden_size=2560) — Text-to-Music inference example. Ace-Step 1.5 XL Base (32 layers, hidden_size=2560) — Text-to-Music inference example.
XL variant with larger capacity for higher quality generation. XL variant with larger capacity for higher quality generation.
Non-turbo model: uses num_inference_steps=30, cfg_scale=4.0.
""" """
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
from diffsynth.utils.data.audio import save_audio
import torch import torch
import soundfile as sf
pipe = AceStepPipeline.from_pretrained( pipe = AceStepPipeline.from_pretrained(
torch_dtype=torch.bfloat16, torch_dtype=torch.bfloat16,
device="cuda", device="cuda",
model_configs=[ model_configs=[
ModelConfig( ModelConfig(model_id="ACE-Step/acestep-v15-xl-base", origin_file_pattern="model-*.safetensors"),
model_id="ACE-Step/acestep-v15-xl-base", ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
origin_file_pattern="model-*.safetensors" ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
),
ModelConfig(
model_id="ACE-Step/acestep-v15-xl-base",
origin_file_pattern="model-*.safetensors"
),
ModelConfig(
model_id="ACE-Step/acestep-v15-xl-base",
origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"
),
], ],
tokenizer_config=ModelConfig( text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
model_id="ACE-Step/acestep-v15-xl-base",
origin_file_pattern="Qwen3-Embedding-0.6B/"
),
vae_config=ModelConfig(
model_id="ACE-Step/acestep-v15-xl-base",
origin_file_pattern="vae/"
),
) )
prompt = "An epic symphonic metal track with double bass drums and soaring vocals" prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
lyrics = "[Intro - Heavy guitar riff]\n\n[Verse 1]\nSteel and thunder, fire and rain\nBurning through the endless pain\n\n[Chorus]\nRise up, break the chains\nUnleash the fire in your veins" lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
audio = pipe( audio = pipe(
prompt=prompt, prompt=prompt,
lyrics=lyrics, lyrics=lyrics,
duration=30.0, duration=160,
bpm=100,
keyscale="B minor",
timesignature="4",
vocal_language="zh",
seed=42, seed=42,
num_inference_steps=20, num_inference_steps=30,
cfg_scale=7.0, cfg_scale=4.0,
shift=3.0,
) )
save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-xl-base.wav")
sf.write("acestep-v15-xl-base.wav", audio.cpu().numpy(), pipe.sample_rate)
print(f"Saved, shape: {audio.shape}")

View File

@@ -1,50 +1,37 @@
""" """
Ace-Step 1.5 XL SFT (32 layers, supervised fine-tuned) — Text-to-Music inference example. Ace-Step 1.5 XL SFT (32 layers, supervised fine-tuned) — Text-to-Music inference example.
Non-turbo model: uses num_inference_steps=30, cfg_scale=4.0.
""" """
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
from diffsynth.utils.data.audio import save_audio
import torch import torch
import soundfile as sf
pipe = AceStepPipeline.from_pretrained( pipe = AceStepPipeline.from_pretrained(
torch_dtype=torch.bfloat16, torch_dtype=torch.bfloat16,
device="cuda", device="cuda",
model_configs=[ model_configs=[
ModelConfig( ModelConfig(model_id="ACE-Step/acestep-v15-xl-sft", origin_file_pattern="model-*.safetensors"),
model_id="ACE-Step/acestep-v15-xl-sft", ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
origin_file_pattern="model-*.safetensors" ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
),
ModelConfig(
model_id="ACE-Step/acestep-v15-xl-sft",
origin_file_pattern="model-*.safetensors"
),
ModelConfig(
model_id="ACE-Step/acestep-v15-xl-sft",
origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"
),
], ],
tokenizer_config=ModelConfig( text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
model_id="ACE-Step/acestep-v15-xl-sft",
origin_file_pattern="Qwen3-Embedding-0.6B/"
),
vae_config=ModelConfig(
model_id="ACE-Step/acestep-v15-xl-sft",
origin_file_pattern="vae/"
),
) )
prompt = "A beautiful piano ballad with lush strings and emotional vocals, cinematic feel" prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
lyrics = "[Intro - Solo piano]\n\n[Verse 1]\nWhispers of a distant shore\nMemories I hold so dear\n\n[Chorus]\nIn your eyes I see the dawn\nAll my fears are gone" lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
audio = pipe( audio = pipe(
prompt=prompt, prompt=prompt,
lyrics=lyrics, lyrics=lyrics,
duration=30.0, duration=160,
bpm=100,
keyscale="B minor",
timesignature="4",
vocal_language="zh",
seed=42, seed=42,
num_inference_steps=20, num_inference_steps=30,
cfg_scale=7.0, cfg_scale=4.0,
shift=3.0,
) )
save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-xl-sft.wav")
sf.write("acestep-v15-xl-sft.wav", audio.cpu().numpy(), pipe.sample_rate)
print(f"Saved, shape: {audio.shape}")

View File

@@ -1,52 +1,36 @@
""" """
Ace-Step 1.5 XL Turbo (32 layers) — Text-to-Music inference example. Ace-Step 1.5 XL Turbo (32 layers, fast generation) — Text-to-Music inference example.
XL turbo with fast generation (8 steps, shift=3.0, no CFG). Turbo model: no num_inference_steps or cfg_scale (use defaults).
shift=3: explicitly passed for this variant.
""" """
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
from diffsynth.utils.data.audio import save_audio
import torch import torch
import soundfile as sf
pipe = AceStepPipeline.from_pretrained( pipe = AceStepPipeline.from_pretrained(
torch_dtype=torch.bfloat16, torch_dtype=torch.bfloat16,
device="cuda", device="cuda",
model_configs=[ model_configs=[
ModelConfig( ModelConfig(model_id="ACE-Step/acestep-v15-xl-turbo", origin_file_pattern="model-*.safetensors"),
model_id="ACE-Step/acestep-v15-xl-turbo", ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
origin_file_pattern="model-*.safetensors" ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
),
ModelConfig(
model_id="ACE-Step/acestep-v15-xl-turbo",
origin_file_pattern="model-*.safetensors"
),
ModelConfig(
model_id="ACE-Step/acestep-v15-xl-turbo",
origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"
),
], ],
tokenizer_config=ModelConfig( text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
model_id="ACE-Step/acestep-v15-xl-turbo",
origin_file_pattern="Qwen3-Embedding-0.6B/"
),
vae_config=ModelConfig(
model_id="ACE-Step/acestep-v15-xl-turbo",
origin_file_pattern="vae/"
),
) )
prompt = "An upbeat electronic dance track with pulsing synths and driving bassline" prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
lyrics = "[Intro - Synth build]\n\n[Verse 1]\nFeel the rhythm in the air\nElectric beats are everywhere\n\n[Drop]\n\n[Chorus]\nDance until the break of dawn\nMove your body, carry on" lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
audio = pipe( audio = pipe(
prompt=prompt, prompt=prompt,
lyrics=lyrics, lyrics=lyrics,
duration=30.0, duration=160,
bpm=100,
keyscale="B minor",
timesignature="4",
vocal_language="zh",
seed=42, seed=42,
num_inference_steps=8,
cfg_scale=1.0, # turbo: no CFG
shift=3.0,
) )
save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-xl-turbo.wav")
sf.write("acestep-v15-xl-turbo.wav", audio.cpu().numpy(), pipe.sample_rate)
print(f"Saved, shape: {audio.shape}")