model-code

This commit is contained in:
mi804
2026-04-17 17:06:26 +08:00
parent 079e51c9f3
commit 36c203da57
23 changed files with 4230 additions and 2 deletions

View File

@@ -0,0 +1,180 @@
"""
Ace-Step 1.5 — Text-to-Music with Simple Mode (LLM expansion).
Uses the ACE-Step LLM to expand a simple description into structured
parameters (caption, lyrics, bpm, keyscale, etc.), then feeds them
to the DiffSynth Pipeline.
The LLM expansion uses the target library's LLMHandler. If vLLM is
not available, it falls back to using pre-structured parameters.
Usage:
python examples/ace_step/model_inference/Ace-Step1.5-SimpleMode.py
"""
import os
import sys
import json
import torch
import soundfile as sf
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
# ---------------------------------------------------------------------------
# Simple Mode: LLM expansion
# ---------------------------------------------------------------------------
def try_load_llm_handler(checkpoint_dir: str, lm_model_path: str = "acestep-5Hz-lm-1.7B",
                         backend: str = "vllm"):
    """Attempt to construct and initialize the target library's LLMHandler.

    Returns a ``(handler, ok)`` pair: ``(handler, True)`` when the handler
    imported and initialized successfully, ``(None, False)`` otherwise
    (missing package, failed init, or any exception during setup).
    """
    try:
        from acestep.llm_inference import LLMHandler

        llm = LLMHandler()
        status, ok = llm.initialize(
            checkpoint_dir=checkpoint_dir,
            lm_model_path=lm_model_path,
            backend=backend,
        )
        if not ok:
            print(f"[Simple Mode] LLM init failed: {status}")
            return None, False
        print(f"[Simple Mode] LLM loaded via {backend} backend: {status}")
        return llm, True
    except Exception as e:
        # Covers both the import and initialize() blowing up — Simple Mode
        # degrades gracefully to pre-structured parameters in that case.
        print(f"[Simple Mode] LLMHandler not available: {e}")
        return None, False
def expand_with_llm(llm_handler, description: str, duration: float = 30.0):
    """Expand a short *description* into structured music parameters via the LLM.

    Returns a dict with caption/lyrics/bpm/keyscale/language/timesignature/
    duration (missing metadata fields fall back to defaults), or ``None``
    when the LLM call does not report success with metadata.
    """
    outcome = llm_handler.generate_with_stop_condition(
        caption=description,
        lyrics="",
        infer_type="dit",  # metadata only
        temperature=0.85,
        cfg_scale=1.0,
        use_cot_metas=True,
        use_cot_caption=True,
        use_cot_language=True,
        user_metadata={"duration": int(duration)},
    )
    meta = outcome.get("metadata")
    if not (outcome.get("success") and meta):
        print(f"[Simple Mode] LLM expansion failed: {outcome.get('error', 'unknown')}")
        return None
    # Per-field defaults used when the LLM omitted a key.
    defaults = {
        "caption": description,
        "lyrics": "",
        "bpm": 100,
        "keyscale": "",
        "language": "en",
        "timesignature": "4",
        "duration": duration,
    }
    return {key: meta.get(key, fallback) for key, fallback in defaults.items()}
def fallback_expand(description: str, duration: float = 30.0) -> dict:
    """Fallback when no LLM is available: use *description* verbatim as the
    caption and fill the remaining parameters with sensible defaults.

    Returns the same dict shape as ``expand_with_llm`` so callers can use
    either interchangeably.
    """
    # Plain string: the original used an f-string with no placeholders (F541).
    print("[Simple Mode] LLM not available. Using description as caption.")
    return {
        "caption": description,
        "lyrics": "",
        "bpm": 100,
        "keyscale": "",
        "language": "en",
        "timesignature": "4",
        "duration": duration,
    }
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Run the Simple Mode demo end-to-end: expand -> load pipeline -> generate."""
    # Path to the ACE-Step-1.5 checkout that provides LLMHandler.
    target_lib = os.path.join(os.path.dirname(__file__), "../../../../ACE-Step-1.5")
    if target_lib not in sys.path:
        sys.path.insert(0, target_lib)

    description = "a soft Bengali love song for a quiet evening"
    duration = 30.0

    # 1. Try to load LLM
    banner = "=" * 60
    print(banner)
    print("Ace-Step 1.5 — Simple Mode (LLM expansion)")
    print(banner)
    print(f"\n[Simple Mode] Input: '{description}'")
    llm_handler, llm_ok = try_load_llm_handler(
        checkpoint_dir=target_lib,
        lm_model_path="acestep-5Hz-lm-1.7B",
    )

    # 2. Expand parameters — fall back to defaults when the LLM is unusable
    # or its expansion fails.
    params = expand_with_llm(llm_handler, description, duration=duration) if llm_ok else None
    if params is None:
        params = fallback_expand(description, duration)

    print(f"\n[Simple Mode] Parameters:")
    print(f" Caption: {params['caption'][:100]}...")
    print(f" Lyrics: {len(params['lyrics'])} chars")
    print(f" BPM: {params['bpm']}, Keyscale: {params['keyscale']}")
    print(f" Language: {params['language']}, Time Sig: {params['timesignature']}")
    print(f" Duration: {params['duration']}s")

    # 3. Load Pipeline
    print(f"\n[Pipeline] Loading Ace-Step 1.5 (turbo)...")
    repo = "ACE-Step/Ace-Step1.5"
    # NOTE(review): the turbo DiT weights file is listed twice, matching the
    # other examples in this commit — presumably the loader extracts two
    # sub-modules from it; confirm against AceStepPipeline.from_pretrained.
    pipe = AceStepPipeline.from_pretrained(
        torch_dtype=torch.bfloat16,
        device="cuda",
        model_configs=[
            ModelConfig(model_id=repo, origin_file_pattern="acestep-v15-turbo/model.safetensors"),
            ModelConfig(model_id=repo, origin_file_pattern="acestep-v15-turbo/model.safetensors"),
            ModelConfig(model_id=repo, origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
        ],
        tokenizer_config=ModelConfig(model_id=repo, origin_file_pattern="Qwen3-Embedding-0.6B/"),
        vae_config=ModelConfig(model_id=repo, origin_file_pattern="vae/"),
    )

    # 4. Generate
    print(f"\n[Generation] Running Pipeline...")
    audio = pipe(
        prompt=params["caption"],
        lyrics=params["lyrics"],
        duration=params["duration"],
        seed=42,
        num_inference_steps=8,  # turbo model: few steps, no CFG
        cfg_scale=1.0,
        shift=3.0,
    )

    output_path = "Ace-Step1.5-SimpleMode.wav"
    sf.write(output_path, audio.cpu().numpy(), pipe.sample_rate)
    print(f"\n[Done] Saved to {output_path}")
    print(f" Shape: {audio.shape}, Duration: {audio.shape[-1] / pipe.sample_rate:.1f}s")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,67 @@
"""
Ace-Step 1.5 — Text-to-Music (Turbo) inference example.
Demonstrates the standard text2music pipeline with structured parameters
(caption, lyrics, duration, etc.) — no LLM expansion needed.
For Simple Mode (LLM expands a short description), see:
- Ace-Step1.5-SimpleMode.py
"""
import soundfile as sf
import torch

from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig

# All weights live in one Hugging Face repo.
REPO_ID = "ACE-Step/Ace-Step1.5"

# NOTE(review): the turbo DiT weights file is listed twice, consistent with
# the other examples — presumably the loader builds two sub-modules from it;
# confirm against AceStepPipeline.from_pretrained.
pipe = AceStepPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id=REPO_ID, origin_file_pattern="acestep-v15-turbo/model.safetensors"),
        ModelConfig(model_id=REPO_ID, origin_file_pattern="acestep-v15-turbo/model.safetensors"),
        ModelConfig(model_id=REPO_ID, origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id=REPO_ID, origin_file_pattern="Qwen3-Embedding-0.6B/"),
    vae_config=ModelConfig(model_id=REPO_ID, origin_file_pattern="vae/"),
)

prompt = (
    "An explosive, high-energy pop-rock track with a strong anime theme song feel. "
    "The song kicks off with a catchy, synthesized brass fanfare over a driving rock "
    "beat with punchy drums and a solid bassline."
)
lyrics = """[Intro - Synth Brass Fanfare]
[Verse 1]
黑夜里的风吹过耳畔
甜蜜时光转瞬即逝
脚步飘摇在星光上
[Chorus]
心电感应在震动间
拥抱未来勇敢冒险
[Outro - Instrumental]"""

# Turbo preset: 8 steps, no CFG (cfg_scale=1.0), default shift.
generation_kwargs = dict(
    prompt=prompt,
    lyrics=lyrics,
    duration=30.0,
    seed=42,
    num_inference_steps=8,
    cfg_scale=1.0,
    shift=3.0,
)
audio = pipe(**generation_kwargs)

sf.write("Ace-Step1.5.wav", audio.cpu().numpy(), pipe.sample_rate)
print(f"Saved to Ace-Step1.5.wav, shape: {audio.shape}, duration: {audio.shape[-1] / pipe.sample_rate:.1f}s")

View File

@@ -0,0 +1,52 @@
"""
Ace-Step 1.5 Base (non-turbo, 24 layers) — Text-to-Music inference example.
Uses cfg_scale=7.0 (standard CFG guidance) and more steps for higher quality.
"""
import soundfile as sf
import torch

from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig

_REPO = "ACE-Step/Ace-Step1.5"

# NOTE(review): the base DiT weights file appears twice, mirroring the other
# examples — presumably the loader pulls two sub-modules from it; confirm
# against AceStepPipeline.from_pretrained.
pipe = AceStepPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id=_REPO, origin_file_pattern="acestep-v15-base/model.safetensors"),
        ModelConfig(model_id=_REPO, origin_file_pattern="acestep-v15-base/model.safetensors"),
        ModelConfig(model_id=_REPO, origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id=_REPO, origin_file_pattern="Qwen3-Embedding-0.6B/"),
    vae_config=ModelConfig(model_id=_REPO, origin_file_pattern="vae/"),
)

prompt = "A cinematic orchestral piece with soaring strings and heroic brass"
lyrics = """[Intro - Orchestra]

[Verse 1]
Across the mountains, through the valley
A journey of a thousand miles

[Chorus]
Rise above the stormy skies
Let the music carry you"""

# Base (non-turbo) preset: more steps and standard CFG guidance.
audio = pipe(
    prompt=prompt,
    lyrics=lyrics,
    duration=30.0,
    seed=42,
    num_inference_steps=20,
    cfg_scale=7.0,  # Base model uses CFG
    shift=3.0,
)

sf.write("acestep-v15-base.wav", audio.cpu().numpy(), pipe.sample_rate)
print(f"Saved, shape: {audio.shape}")

View File

@@ -0,0 +1,52 @@
"""
Ace-Step 1.5 SFT (supervised fine-tuned, 24 layers) — Text-to-Music inference example.
SFT variant is fine-tuned for specific music styles.
"""
import soundfile as sf
import torch

from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig


def _repo_file(pattern: str) -> ModelConfig:
    """ModelConfig pointing at a file/dir inside the shared Ace-Step 1.5 repo."""
    return ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern=pattern)


# NOTE(review): the SFT DiT weights file is listed twice, mirroring the other
# examples — presumably intentional for the loader; confirm against
# AceStepPipeline.from_pretrained.
pipe = AceStepPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        _repo_file("acestep-v15-sft/model.safetensors"),
        _repo_file("acestep-v15-sft/model.safetensors"),
        _repo_file("Qwen3-Embedding-0.6B/model.safetensors"),
    ],
    tokenizer_config=_repo_file("Qwen3-Embedding-0.6B/"),
    vae_config=_repo_file("vae/"),
)

prompt = "A jazzy lo-fi beat with smooth saxophone and vinyl crackle, late night vibes"
lyrics = """[Intro - Vinyl crackle]

[Verse 1]
Midnight city, neon glow
Smooth jazz flowing to and fro

[Chorus]
Lay back, let the music play
Jazzy nights, dreams drift away"""

# SFT variant generates like base: 20 steps with standard CFG.
audio = pipe(
    prompt=prompt,
    lyrics=lyrics,
    duration=30.0,
    seed=42,
    num_inference_steps=20,
    cfg_scale=7.0,
    shift=3.0,
)

sf.write("acestep-v15-sft.wav", audio.cpu().numpy(), pipe.sample_rate)
print(f"Saved, shape: {audio.shape}")

View File

@@ -0,0 +1,52 @@
"""
Ace-Step 1.5 Turbo (shift=1) — Text-to-Music inference example.
Uses shift=1.0 (no timestep transformation) for smoother, slower denoising.
"""
import soundfile as sf
import torch

from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig

REPO = "ACE-Step/Ace-Step1.5"

# NOTE(review): turbo DiT weights file listed twice as in the sibling
# examples — presumably intentional; confirm against the pipeline loader.
pipe = AceStepPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id=REPO, origin_file_pattern="acestep-v15-turbo/model.safetensors"),
        ModelConfig(model_id=REPO, origin_file_pattern="acestep-v15-turbo/model.safetensors"),
        ModelConfig(model_id=REPO, origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id=REPO, origin_file_pattern="Qwen3-Embedding-0.6B/"),
    vae_config=ModelConfig(model_id=REPO, origin_file_pattern="vae/"),
)

prompt = "A gentle acoustic guitar melody with soft piano accompaniment, peaceful and warm atmosphere"
lyrics = """[Verse 1]
Sunlight filtering through the trees
A quiet moment, just the breeze

[Chorus]
Peaceful heart, open mind
Leaving all the noise behind"""

generation_kwargs = dict(
    prompt=prompt,
    lyrics=lyrics,
    duration=30.0,
    seed=42,
    num_inference_steps=8,
    cfg_scale=1.0,
    shift=1.0,  # shift=1: no timestep transformation
)
audio = pipe(**generation_kwargs)

sf.write("acestep-v15-turbo-shift1.wav", audio.cpu().numpy(), pipe.sample_rate)
print(f"Saved, shape: {audio.shape}")

View File

@@ -0,0 +1,52 @@
"""
Ace-Step 1.5 Turbo (shift=3) — Text-to-Music inference example.
Uses shift=3.0 (default turbo shift) for faster denoising convergence.
"""
import soundfile as sf
import torch

from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig


def _cfg(pattern: str) -> ModelConfig:
    """ModelConfig for a path inside the shared Ace-Step 1.5 repo."""
    return ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern=pattern)


# NOTE(review): the turbo DiT weights file appears twice, as in the sibling
# examples — presumably intentional; confirm against the pipeline loader.
pipe = AceStepPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        _cfg("acestep-v15-turbo/model.safetensors"),
        _cfg("acestep-v15-turbo/model.safetensors"),
        _cfg("Qwen3-Embedding-0.6B/model.safetensors"),
    ],
    tokenizer_config=_cfg("Qwen3-Embedding-0.6B/"),
    vae_config=_cfg("vae/"),
)

prompt = "An explosive, high-energy pop-rock track with anime theme song feel"
lyrics = """[Intro]

[Verse 1]
Running through the neon lights
Chasing dreams across the night

[Chorus]
Feel the fire in my soul
Music takes complete control"""

# Turbo preset with the default shift=3.0 for faster convergence.
audio = pipe(
    prompt=prompt,
    lyrics=lyrics,
    duration=30.0,
    seed=42,
    num_inference_steps=8,
    cfg_scale=1.0,
    shift=3.0,
)

sf.write("acestep-v15-turbo-shift3.wav", audio.cpu().numpy(), pipe.sample_rate)
print(f"Saved, shape: {audio.shape}")

View File

@@ -0,0 +1,52 @@
"""
Ace-Step 1.5 XL Base (32 layers, hidden_size=2560) — Text-to-Music inference example.
XL variant with larger capacity for higher quality generation.
"""
import soundfile as sf
import torch

from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig

# XL variants live in their own repo; sharded weights matched by glob.
XL_REPO = "ACE-Step/acestep-v15-xl-base"

# NOTE(review): the sharded DiT pattern is listed twice, as in the sibling
# examples — presumably intentional; confirm against the pipeline loader.
pipe = AceStepPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id=XL_REPO, origin_file_pattern="model-*.safetensors"),
        ModelConfig(model_id=XL_REPO, origin_file_pattern="model-*.safetensors"),
        ModelConfig(model_id=XL_REPO, origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id=XL_REPO, origin_file_pattern="Qwen3-Embedding-0.6B/"),
    vae_config=ModelConfig(model_id=XL_REPO, origin_file_pattern="vae/"),
)

prompt = "An epic symphonic metal track with double bass drums and soaring vocals"
lyrics = """[Intro - Heavy guitar riff]

[Verse 1]
Steel and thunder, fire and rain
Burning through the endless pain

[Chorus]
Rise up, break the chains
Unleash the fire in your veins"""

# Base-style generation: 20 steps with standard CFG.
audio = pipe(
    prompt=prompt,
    lyrics=lyrics,
    duration=30.0,
    seed=42,
    num_inference_steps=20,
    cfg_scale=7.0,
    shift=3.0,
)

sf.write("acestep-v15-xl-base.wav", audio.cpu().numpy(), pipe.sample_rate)
print(f"Saved, shape: {audio.shape}")

View File

@@ -0,0 +1,50 @@
"""
Ace-Step 1.5 XL SFT (32 layers, supervised fine-tuned) — Text-to-Music inference example.
"""
import soundfile as sf
import torch

from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig


def _xl(pattern: str) -> ModelConfig:
    """ModelConfig for a path inside the XL-SFT repo."""
    return ModelConfig(model_id="ACE-Step/acestep-v15-xl-sft", origin_file_pattern=pattern)


# NOTE(review): the sharded DiT pattern appears twice, as in the sibling
# examples — presumably intentional; confirm against the pipeline loader.
pipe = AceStepPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        _xl("model-*.safetensors"),
        _xl("model-*.safetensors"),
        _xl("Qwen3-Embedding-0.6B/model.safetensors"),
    ],
    tokenizer_config=_xl("Qwen3-Embedding-0.6B/"),
    vae_config=_xl("vae/"),
)

prompt = "A beautiful piano ballad with lush strings and emotional vocals, cinematic feel"
lyrics = """[Intro - Solo piano]

[Verse 1]
Whispers of a distant shore
Memories I hold so dear

[Chorus]
In your eyes I see the dawn
All my fears are gone"""

# SFT generation settings: 20 steps with standard CFG.
generation_kwargs = dict(
    prompt=prompt,
    lyrics=lyrics,
    duration=30.0,
    seed=42,
    num_inference_steps=20,
    cfg_scale=7.0,
    shift=3.0,
)
audio = pipe(**generation_kwargs)

sf.write("acestep-v15-xl-sft.wav", audio.cpu().numpy(), pipe.sample_rate)
print(f"Saved, shape: {audio.shape}")

View File

@@ -0,0 +1,52 @@
"""
Ace-Step 1.5 XL Turbo (32 layers) — Text-to-Music inference example.
XL turbo with fast generation (8 steps, shift=3.0, no CFG).
"""
import soundfile as sf
import torch

from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig

XL_TURBO_REPO = "ACE-Step/acestep-v15-xl-turbo"

# NOTE(review): the sharded DiT pattern is listed twice, matching the sibling
# examples — presumably intentional; confirm against the pipeline loader.
pipe = AceStepPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id=XL_TURBO_REPO, origin_file_pattern="model-*.safetensors"),
        ModelConfig(model_id=XL_TURBO_REPO, origin_file_pattern="model-*.safetensors"),
        ModelConfig(model_id=XL_TURBO_REPO, origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id=XL_TURBO_REPO, origin_file_pattern="Qwen3-Embedding-0.6B/"),
    vae_config=ModelConfig(model_id=XL_TURBO_REPO, origin_file_pattern="vae/"),
)

prompt = "An upbeat electronic dance track with pulsing synths and driving bassline"
lyrics = """[Intro - Synth build]

[Verse 1]
Feel the rhythm in the air
Electric beats are everywhere

[Drop]

[Chorus]
Dance until the break of dawn
Move your body, carry on"""

# Turbo preset: fast 8-step sampling without classifier-free guidance.
audio = pipe(
    prompt=prompt,
    lyrics=lyrics,
    duration=30.0,
    seed=42,
    num_inference_steps=8,
    cfg_scale=1.0,  # turbo: no CFG
    shift=3.0,
)

sf.write("acestep-v15-xl-turbo.wav", audio.cpu().numpy(), pipe.sample_rate)
print(f"Saved, shape: {audio.shape}")