acestep t2m

2026-04-24 15:06:17 +00:00 · 2026-04-21 13:16:15 +08:00
parent a604d76339
commit 9d09e0431c
9 changed files with 300 additions and 377 deletions
--- a/examples/ace_step/model_inference/Ace-Step1.5-SimpleMode.py
+++ b/examples/ace_step/model_inference/Ace-Step1.5-SimpleMode.py
@@ -2,8 +2,8 @@
 Ace-Step 1.5 — Text-to-Music with Simple Mode (LLM expansion).

 Uses the ACE-Step LLM to expand a simple description into structured
-parameters (caption, lyrics, bpm, keyscale, etc.), then feeds them
-to the DiffSynth Pipeline.
+parameters (caption, lyrics, bpm, keyscale, etc.) AND audio codes,
+then feeds them to the DiffSynth Pipeline.

 The LLM expansion uses the target library's LLMHandler. If vLLM is
 not available, it falls back to using pre-structured parameters.
@@ -47,11 +47,14 @@ def try_load_llm_handler(checkpoint_dir: str, lm_model_path: str = "acestep-5Hz-


 def expand_with_llm(llm_handler, description: str, duration: float = 30.0):
-    """Expand a simple description using LLM Chain-of-Thought."""
+    """Expand a simple description using LLM Chain-of-Thought.
+
+    Returns (params_dict, audio_codes_string), or (None, "") if expansion fails.
+    """
    result = llm_handler.generate_with_stop_condition(
        caption=description,
        lyrics="",
-        infer_type="dit",  # metadata only
+        infer_type="dit",  # metadata + audio codes
        temperature=0.85,
        cfg_scale=1.0,
        use_cot_metas=True,
@@ -62,7 +65,7 @@ def expand_with_llm(llm_handler, description: str, duration: float = 30.0):

    if result.get("success") and result.get("metadata"):
        meta = result["metadata"]
-        return {
+        params = {
            "caption": meta.get("caption", description),
            "lyrics": meta.get("lyrics", ""),
            "bpm": meta.get("bpm", 100),
@@ -71,9 +74,11 @@ def expand_with_llm(llm_handler, description: str, duration: float = 30.0):
            "timesignature": meta.get("timesignature", "4"),
            "duration": meta.get("duration", duration),
        }
+        audio_codes = result.get("audio_codes", "")
+        return params, audio_codes

    print(f"[Simple Mode] LLM expansion failed: {result.get('error', 'unknown')}")
-    return None
+    return None, ""


 def fallback_expand(description: str, duration: float = 30.0):
@@ -87,7 +92,7 @@ def fallback_expand(description: str, duration: float = 30.0):
        "language": "en",
        "timesignature": "4",
        "duration": duration,
-    }
+    }, ""


 # ---------------------------------------------------------------------------
@@ -114,13 +119,13 @@ def main():
        lm_model_path="acestep-5Hz-lm-1.7B",
    )

-    # 2. Expand parameters
+    # 2. Expand parameters + audio codes
    if llm_ok:
-        params = expand_with_llm(llm_handler, description, duration=duration)
+        params, audio_codes = expand_with_llm(llm_handler, description, duration=duration)
        if params is None:
-            params = fallback_expand(description, duration)
+            params, audio_codes = fallback_expand(description, duration)
    else:
-        params = fallback_expand(description, duration)
+        params, audio_codes = fallback_expand(description, duration)

    print(f"\n[Simple Mode] Parameters:")
    print(f"  Caption: {params['caption'][:100]}...")
@@ -128,6 +133,7 @@ def main():
    print(f"  BPM: {params['bpm']}, Keyscale: {params['keyscale']}")
    print(f"  Language: {params['language']}, Time Sig: {params['timesignature']}")
    print(f"  Duration: {params['duration']}s")
+    print(f"  Audio codes: {len(audio_codes)} chars" if audio_codes else "  Audio codes: None (fallback)")

    # 3. Load Pipeline
    print(f"\n[Pipeline] Loading Ace-Step 1.5 (turbo)...")
@@ -141,21 +147,17 @@ def main():
            ),
            ModelConfig(
                model_id="ACE-Step/Ace-Step1.5",
-                origin_file_pattern="acestep-v15-turbo/model.safetensors"
+                origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"
            ),
            ModelConfig(
                model_id="ACE-Step/Ace-Step1.5",
-                origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"
+                origin_file_pattern="vae/diffusion_pytorch_model.safetensors"
            ),
        ],
-        tokenizer_config=ModelConfig(
+        text_tokenizer_config=ModelConfig(
            model_id="ACE-Step/Ace-Step1.5",
            origin_file_pattern="Qwen3-Embedding-0.6B/"
        ),
-        vae_config=ModelConfig(
-            model_id="ACE-Step/Ace-Step1.5",
-            origin_file_pattern="vae/"
-        ),
    )

    # 4. Generate
@@ -164,6 +166,7 @@ def main():
        prompt=params["caption"],
        lyrics=params["lyrics"],
        duration=params["duration"],
+        audio_codes=audio_codes if audio_codes else None,
        seed=42,
        num_inference_steps=8,
        cfg_scale=1.0,
--- a/examples/ace_step/model_inference/Ace-Step1.5.py
+++ b/examples/ace_step/model_inference/Ace-Step1.5.py
@@ -1,16 +1,6 @@
-"""
-Ace-Step 1.5 — Text-to-Music (Turbo) inference example.
-
-Demonstrates the standard text2music pipeline with structured parameters
-(caption, lyrics, duration, etc.) — no LLM expansion needed.
-
-For Simple Mode (LLM expands a short description), see:
-    - Ace-Step1.5-SimpleMode.py
-"""
 from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
 import torch
-import soundfile as sf
-
+from diffsynth.utils.data.audio import save_audio

 pipe = AceStepPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
@@ -35,29 +25,21 @@ pipe = AceStepPipeline.from_pretrained(
    ),
 )

-prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline."
-lyrics = """[Intro - Synth Brass Fanfare]
-
-[Verse 1]
-黑夜里的风吹过耳畔
-甜蜜时光转瞬即逝
-脚步飘摇在星光上
-
-[Chorus]
-心电感应在震动间
-拥抱未来勇敢冒险
-
-[Outro - Instrumental]"""
+prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
+lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'

 audio = pipe(
    prompt=prompt,
    lyrics=lyrics,
-    duration=30.0,
+    duration=160,
+    bpm=100,
+    keyscale="B minor",
+    timesignature="4",
+    vocal_language="zh",
    seed=42,
    num_inference_steps=8,
    cfg_scale=1.0,
-    shift=3.0,
 )

-sf.write("Ace-Step1.5.wav", audio.cpu().numpy(), pipe.sample_rate)
+save_audio(audio.cpu(), pipe.vae.sampling_rate, "Ace-Step1.5.wav")
 print(f"Saved to Ace-Step1.5.wav, shape: {audio.shape}, duration: {audio.shape[-1] / pipe.sample_rate:.1f}s")