mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-04-24 15:06:17 +00:00
model-code
This commit is contained in:
180
examples/ace_step/model_inference/Ace-Step1.5-SimpleMode.py
Normal file
180
examples/ace_step/model_inference/Ace-Step1.5-SimpleMode.py
Normal file
@@ -0,0 +1,180 @@
|
||||
"""
|
||||
Ace-Step 1.5 — Text-to-Music with Simple Mode (LLM expansion).
|
||||
|
||||
Uses the ACE-Step LLM to expand a simple description into structured
|
||||
parameters (caption, lyrics, bpm, keyscale, etc.), then feeds them
|
||||
to the DiffSynth Pipeline.
|
||||
|
||||
The LLM expansion uses the target library's LLMHandler. If vLLM is
|
||||
not available, it falls back to using pre-structured parameters.
|
||||
|
||||
Usage:
|
||||
python examples/ace_step/model_inference/Ace-Step1.5-SimpleMode.py
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import torch
|
||||
import soundfile as sf
|
||||
|
||||
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Simple Mode: LLM expansion
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def try_load_llm_handler(checkpoint_dir: str, lm_model_path: str = "acestep-5Hz-lm-1.7B",
                         backend: str = "vllm"):
    """Attempt to construct and initialize the ACE-Step LLMHandler.

    Returns a ``(handler, success)`` pair. ``(None, False)`` is returned both
    when the ``acestep`` package cannot be imported (e.g. vLLM missing) and
    when the handler's own ``initialize`` reports failure.
    """
    try:
        from acestep.llm_inference import LLMHandler

        handler = LLMHandler()
        status, success = handler.initialize(
            checkpoint_dir=checkpoint_dir,
            lm_model_path=lm_model_path,
            backend=backend,
        )
    except Exception as exc:
        # Import errors and any initialization crash land here.
        print(f"[Simple Mode] LLMHandler not available: {exc}")
        return None, False

    if not success:
        print(f"[Simple Mode] LLM init failed: {status}")
        return None, False

    print(f"[Simple Mode] LLM loaded via {backend} backend: {status}")
    return handler, True
|
||||
|
||||
|
||||
def expand_with_llm(llm_handler, description: str, duration: float = 30.0):
    """Expand a plain-text song description into structured parameters.

    Runs the LLM's chain-of-thought generation and maps the returned
    metadata onto the fields the pipeline expects, falling back to
    per-field defaults for anything the LLM omitted. Returns ``None``
    when the generation reports failure or yields no metadata.
    """
    result = llm_handler.generate_with_stop_condition(
        caption=description,
        lyrics="",
        infer_type="dit",  # metadata only
        temperature=0.85,
        cfg_scale=1.0,
        use_cot_metas=True,
        use_cot_caption=True,
        use_cot_language=True,
        user_metadata={"duration": int(duration)},
    )

    if not (result.get("success") and result.get("metadata")):
        print(f"[Simple Mode] LLM expansion failed: {result.get('error', 'unknown')}")
        return None

    meta = result["metadata"]
    # Per-field fallbacks used when the LLM leaves a key unset.
    defaults = {
        "caption": description,
        "lyrics": "",
        "bpm": 100,
        "keyscale": "",
        "language": "en",
        "timesignature": "4",
        "duration": duration,
    }
    return {field: meta.get(field, fallback) for field, fallback in defaults.items()}
|
||||
|
||||
|
||||
def fallback_expand(description: str, duration: float = 30.0):
    """Build default song parameters, using *description* verbatim as caption.

    Used when no LLM is available to expand the description.
    """
    print(f"[Simple Mode] LLM not available. Using description as caption.")
    params = dict(
        caption=description,
        lyrics="",
        bpm=100,
        keyscale="",
        language="en",
        timesignature="4",
        duration=duration,
    )
    return params
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main():
    """Demo entry point: expand a short description, then render it to audio.

    Flow: (1) try to load the ACE-Step LLMHandler, (2) expand the input
    description into structured parameters (or fall back to defaults),
    (3) build the DiffSynth AceStepPipeline, (4) generate audio and save
    it as a wav file in the working directory.
    """
    # Target library path (for LLMHandler)
    TARGET_LIB = os.path.join(os.path.dirname(__file__), "../../../../ACE-Step-1.5")
    if TARGET_LIB not in sys.path:
        # Makes `acestep` importable for try_load_llm_handler.
        sys.path.insert(0, TARGET_LIB)

    description = "a soft Bengali love song for a quiet evening"
    duration = 30.0  # seconds of audio to generate

    # 1. Try to load LLM
    print("=" * 60)
    print("Ace-Step 1.5 — Simple Mode (LLM expansion)")
    print("=" * 60)
    print(f"\n[Simple Mode] Input: '{description}'")

    llm_handler, llm_ok = try_load_llm_handler(
        checkpoint_dir=TARGET_LIB,
        lm_model_path="acestep-5Hz-lm-1.7B",
    )

    # 2. Expand parameters
    if llm_ok:
        params = expand_with_llm(llm_handler, description, duration=duration)
        if params is None:
            # LLM loaded but expansion failed — use structured defaults.
            params = fallback_expand(description, duration)
    else:
        params = fallback_expand(description, duration)

    print(f"\n[Simple Mode] Parameters:")
    print(f" Caption: {params['caption'][:100]}...")
    print(f" Lyrics: {len(params['lyrics'])} chars")
    print(f" BPM: {params['bpm']}, Keyscale: {params['keyscale']}")
    print(f" Language: {params['language']}, Time Sig: {params['timesignature']}")
    print(f" Duration: {params['duration']}s")

    # 3. Load Pipeline
    print(f"\n[Pipeline] Loading Ace-Step 1.5 (turbo)...")
    pipe = AceStepPipeline.from_pretrained(
        torch_dtype=torch.bfloat16,
        device="cuda",
        model_configs=[
            ModelConfig(
                model_id="ACE-Step/Ace-Step1.5",
                origin_file_pattern="acestep-v15-turbo/model.safetensors"
            ),
            # NOTE(review): identical to the entry above — confirm whether the
            # turbo weights are intentionally listed twice (same duplication
            # appears in the sibling example scripts).
            ModelConfig(
                model_id="ACE-Step/Ace-Step1.5",
                origin_file_pattern="acestep-v15-turbo/model.safetensors"
            ),
            ModelConfig(
                model_id="ACE-Step/Ace-Step1.5",
                origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"
            ),
        ],
        tokenizer_config=ModelConfig(
            model_id="ACE-Step/Ace-Step1.5",
            origin_file_pattern="Qwen3-Embedding-0.6B/"
        ),
        vae_config=ModelConfig(
            model_id="ACE-Step/Ace-Step1.5",
            origin_file_pattern="vae/"
        ),
    )

    # 4. Generate
    print(f"\n[Generation] Running Pipeline...")
    audio = pipe(
        prompt=params["caption"],
        lyrics=params["lyrics"],
        duration=params["duration"],
        seed=42,
        num_inference_steps=8,  # turbo variant: few steps
        cfg_scale=1.0,          # turbo variant: no CFG (see sibling examples)
        shift=3.0,
    )

    output_path = "Ace-Step1.5-SimpleMode.wav"
    sf.write(output_path, audio.cpu().numpy(), pipe.sample_rate)
    print(f"\n[Done] Saved to {output_path}")
    print(f" Shape: {audio.shape}, Duration: {audio.shape[-1] / pipe.sample_rate:.1f}s")


if __name__ == "__main__":
    main()
|
||||
67
examples/ace_step/model_inference/Ace-Step1.5.py
Normal file
67
examples/ace_step/model_inference/Ace-Step1.5.py
Normal file
@@ -0,0 +1,67 @@
|
||||
"""
|
||||
Ace-Step 1.5 — Text-to-Music (Turbo) inference example.
|
||||
|
||||
Demonstrates the standard text2music pipeline with structured parameters
|
||||
(caption, lyrics, duration, etc.) — no LLM expansion needed.
|
||||
|
||||
For Simple Mode (LLM expands a short description), see:
|
||||
- Ace-Step1.5-SimpleMode.py
|
||||
"""
|
||||
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
|
||||
import torch
|
||||
import soundfile as sf
|
||||
|
||||
|
||||
pipe = AceStepPipeline.from_pretrained(
|
||||
torch_dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
model_configs=[
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="acestep-v15-turbo/model.safetensors"
|
||||
),
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="acestep-v15-turbo/model.safetensors"
|
||||
),
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"
|
||||
),
|
||||
],
|
||||
tokenizer_config=ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="Qwen3-Embedding-0.6B/"
|
||||
),
|
||||
vae_config=ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="vae/"
|
||||
),
|
||||
)
|
||||
|
||||
prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline."
|
||||
lyrics = """[Intro - Synth Brass Fanfare]
|
||||
|
||||
[Verse 1]
|
||||
黑夜里的风吹过耳畔
|
||||
甜蜜时光转瞬即逝
|
||||
脚步飘摇在星光上
|
||||
|
||||
[Chorus]
|
||||
心电感应在震动间
|
||||
拥抱未来勇敢冒险
|
||||
|
||||
[Outro - Instrumental]"""
|
||||
|
||||
audio = pipe(
|
||||
prompt=prompt,
|
||||
lyrics=lyrics,
|
||||
duration=30.0,
|
||||
seed=42,
|
||||
num_inference_steps=8,
|
||||
cfg_scale=1.0,
|
||||
shift=3.0,
|
||||
)
|
||||
|
||||
sf.write("Ace-Step1.5.wav", audio.cpu().numpy(), pipe.sample_rate)
|
||||
print(f"Saved to Ace-Step1.5.wav, shape: {audio.shape}, duration: {audio.shape[-1] / pipe.sample_rate:.1f}s")
|
||||
52
examples/ace_step/model_inference/acestep-v15-base.py
Normal file
52
examples/ace_step/model_inference/acestep-v15-base.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""
|
||||
Ace-Step 1.5 Base (non-turbo, 24 layers) — Text-to-Music inference example.
|
||||
|
||||
Uses cfg_scale=7.0 (standard CFG guidance) and more steps for higher quality.
|
||||
"""
|
||||
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
|
||||
import torch
|
||||
import soundfile as sf
|
||||
|
||||
|
||||
pipe = AceStepPipeline.from_pretrained(
|
||||
torch_dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
model_configs=[
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="acestep-v15-base/model.safetensors"
|
||||
),
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="acestep-v15-base/model.safetensors"
|
||||
),
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"
|
||||
),
|
||||
],
|
||||
tokenizer_config=ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="Qwen3-Embedding-0.6B/"
|
||||
),
|
||||
vae_config=ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="vae/"
|
||||
),
|
||||
)
|
||||
|
||||
prompt = "A cinematic orchestral piece with soaring strings and heroic brass"
|
||||
lyrics = "[Intro - Orchestra]\n\n[Verse 1]\nAcross the mountains, through the valley\nA journey of a thousand miles\n\n[Chorus]\nRise above the stormy skies\nLet the music carry you"
|
||||
|
||||
audio = pipe(
|
||||
prompt=prompt,
|
||||
lyrics=lyrics,
|
||||
duration=30.0,
|
||||
seed=42,
|
||||
num_inference_steps=20,
|
||||
cfg_scale=7.0, # Base model uses CFG
|
||||
shift=3.0,
|
||||
)
|
||||
|
||||
sf.write("acestep-v15-base.wav", audio.cpu().numpy(), pipe.sample_rate)
|
||||
print(f"Saved, shape: {audio.shape}")
|
||||
52
examples/ace_step/model_inference/acestep-v15-sft.py
Normal file
52
examples/ace_step/model_inference/acestep-v15-sft.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""
|
||||
Ace-Step 1.5 SFT (supervised fine-tuned, 24 layers) — Text-to-Music inference example.
|
||||
|
||||
SFT variant is fine-tuned for specific music styles.
|
||||
"""
|
||||
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
|
||||
import torch
|
||||
import soundfile as sf
|
||||
|
||||
|
||||
pipe = AceStepPipeline.from_pretrained(
|
||||
torch_dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
model_configs=[
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="acestep-v15-sft/model.safetensors"
|
||||
),
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="acestep-v15-sft/model.safetensors"
|
||||
),
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"
|
||||
),
|
||||
],
|
||||
tokenizer_config=ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="Qwen3-Embedding-0.6B/"
|
||||
),
|
||||
vae_config=ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="vae/"
|
||||
),
|
||||
)
|
||||
|
||||
prompt = "A jazzy lo-fi beat with smooth saxophone and vinyl crackle, late night vibes"
|
||||
lyrics = "[Intro - Vinyl crackle]\n\n[Verse 1]\nMidnight city, neon glow\nSmooth jazz flowing to and fro\n\n[Chorus]\nLay back, let the music play\nJazzy nights, dreams drift away"
|
||||
|
||||
audio = pipe(
|
||||
prompt=prompt,
|
||||
lyrics=lyrics,
|
||||
duration=30.0,
|
||||
seed=42,
|
||||
num_inference_steps=20,
|
||||
cfg_scale=7.0,
|
||||
shift=3.0,
|
||||
)
|
||||
|
||||
sf.write("acestep-v15-sft.wav", audio.cpu().numpy(), pipe.sample_rate)
|
||||
print(f"Saved, shape: {audio.shape}")
|
||||
@@ -0,0 +1,52 @@
|
||||
"""
|
||||
Ace-Step 1.5 Turbo (shift=1) — Text-to-Music inference example.
|
||||
|
||||
Uses shift=1.0 (no timestep transformation) for smoother, slower denoising.
|
||||
"""
|
||||
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
|
||||
import torch
|
||||
import soundfile as sf
|
||||
|
||||
|
||||
pipe = AceStepPipeline.from_pretrained(
|
||||
torch_dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
model_configs=[
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="acestep-v15-turbo/model.safetensors"
|
||||
),
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="acestep-v15-turbo/model.safetensors"
|
||||
),
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"
|
||||
),
|
||||
],
|
||||
tokenizer_config=ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="Qwen3-Embedding-0.6B/"
|
||||
),
|
||||
vae_config=ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="vae/"
|
||||
),
|
||||
)
|
||||
|
||||
prompt = "A gentle acoustic guitar melody with soft piano accompaniment, peaceful and warm atmosphere"
|
||||
lyrics = "[Verse 1]\nSunlight filtering through the trees\nA quiet moment, just the breeze\n\n[Chorus]\nPeaceful heart, open mind\nLeaving all the noise behind"
|
||||
|
||||
audio = pipe(
|
||||
prompt=prompt,
|
||||
lyrics=lyrics,
|
||||
duration=30.0,
|
||||
seed=42,
|
||||
num_inference_steps=8,
|
||||
cfg_scale=1.0,
|
||||
shift=1.0, # shift=1: no timestep transformation
|
||||
)
|
||||
|
||||
sf.write("acestep-v15-turbo-shift1.wav", audio.cpu().numpy(), pipe.sample_rate)
|
||||
print(f"Saved, shape: {audio.shape}")
|
||||
@@ -0,0 +1,52 @@
|
||||
"""
|
||||
Ace-Step 1.5 Turbo (shift=3) — Text-to-Music inference example.
|
||||
|
||||
Uses shift=3.0 (default turbo shift) for faster denoising convergence.
|
||||
"""
|
||||
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
|
||||
import torch
|
||||
import soundfile as sf
|
||||
|
||||
|
||||
pipe = AceStepPipeline.from_pretrained(
|
||||
torch_dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
model_configs=[
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="acestep-v15-turbo/model.safetensors"
|
||||
),
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="acestep-v15-turbo/model.safetensors"
|
||||
),
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"
|
||||
),
|
||||
],
|
||||
tokenizer_config=ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="Qwen3-Embedding-0.6B/"
|
||||
),
|
||||
vae_config=ModelConfig(
|
||||
model_id="ACE-Step/Ace-Step1.5",
|
||||
origin_file_pattern="vae/"
|
||||
),
|
||||
)
|
||||
|
||||
prompt = "An explosive, high-energy pop-rock track with anime theme song feel"
|
||||
lyrics = "[Intro]\n\n[Verse 1]\nRunning through the neon lights\nChasing dreams across the night\n\n[Chorus]\nFeel the fire in my soul\nMusic takes complete control"
|
||||
|
||||
audio = pipe(
|
||||
prompt=prompt,
|
||||
lyrics=lyrics,
|
||||
duration=30.0,
|
||||
seed=42,
|
||||
num_inference_steps=8,
|
||||
cfg_scale=1.0,
|
||||
shift=3.0,
|
||||
)
|
||||
|
||||
sf.write("acestep-v15-turbo-shift3.wav", audio.cpu().numpy(), pipe.sample_rate)
|
||||
print(f"Saved, shape: {audio.shape}")
|
||||
52
examples/ace_step/model_inference/acestep-v15-xl-base.py
Normal file
52
examples/ace_step/model_inference/acestep-v15-xl-base.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""
|
||||
Ace-Step 1.5 XL Base (32 layers, hidden_size=2560) — Text-to-Music inference example.
|
||||
|
||||
XL variant with larger capacity for higher quality generation.
|
||||
"""
|
||||
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
|
||||
import torch
|
||||
import soundfile as sf
|
||||
|
||||
|
||||
pipe = AceStepPipeline.from_pretrained(
|
||||
torch_dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
model_configs=[
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/acestep-v15-xl-base",
|
||||
origin_file_pattern="model-*.safetensors"
|
||||
),
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/acestep-v15-xl-base",
|
||||
origin_file_pattern="model-*.safetensors"
|
||||
),
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/acestep-v15-xl-base",
|
||||
origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"
|
||||
),
|
||||
],
|
||||
tokenizer_config=ModelConfig(
|
||||
model_id="ACE-Step/acestep-v15-xl-base",
|
||||
origin_file_pattern="Qwen3-Embedding-0.6B/"
|
||||
),
|
||||
vae_config=ModelConfig(
|
||||
model_id="ACE-Step/acestep-v15-xl-base",
|
||||
origin_file_pattern="vae/"
|
||||
),
|
||||
)
|
||||
|
||||
prompt = "An epic symphonic metal track with double bass drums and soaring vocals"
|
||||
lyrics = "[Intro - Heavy guitar riff]\n\n[Verse 1]\nSteel and thunder, fire and rain\nBurning through the endless pain\n\n[Chorus]\nRise up, break the chains\nUnleash the fire in your veins"
|
||||
|
||||
audio = pipe(
|
||||
prompt=prompt,
|
||||
lyrics=lyrics,
|
||||
duration=30.0,
|
||||
seed=42,
|
||||
num_inference_steps=20,
|
||||
cfg_scale=7.0,
|
||||
shift=3.0,
|
||||
)
|
||||
|
||||
sf.write("acestep-v15-xl-base.wav", audio.cpu().numpy(), pipe.sample_rate)
|
||||
print(f"Saved, shape: {audio.shape}")
|
||||
50
examples/ace_step/model_inference/acestep-v15-xl-sft.py
Normal file
50
examples/ace_step/model_inference/acestep-v15-xl-sft.py
Normal file
@@ -0,0 +1,50 @@
|
||||
"""
|
||||
Ace-Step 1.5 XL SFT (32 layers, supervised fine-tuned) — Text-to-Music inference example.
|
||||
"""
|
||||
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
|
||||
import torch
|
||||
import soundfile as sf
|
||||
|
||||
|
||||
pipe = AceStepPipeline.from_pretrained(
|
||||
torch_dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
model_configs=[
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/acestep-v15-xl-sft",
|
||||
origin_file_pattern="model-*.safetensors"
|
||||
),
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/acestep-v15-xl-sft",
|
||||
origin_file_pattern="model-*.safetensors"
|
||||
),
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/acestep-v15-xl-sft",
|
||||
origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"
|
||||
),
|
||||
],
|
||||
tokenizer_config=ModelConfig(
|
||||
model_id="ACE-Step/acestep-v15-xl-sft",
|
||||
origin_file_pattern="Qwen3-Embedding-0.6B/"
|
||||
),
|
||||
vae_config=ModelConfig(
|
||||
model_id="ACE-Step/acestep-v15-xl-sft",
|
||||
origin_file_pattern="vae/"
|
||||
),
|
||||
)
|
||||
|
||||
prompt = "A beautiful piano ballad with lush strings and emotional vocals, cinematic feel"
|
||||
lyrics = "[Intro - Solo piano]\n\n[Verse 1]\nWhispers of a distant shore\nMemories I hold so dear\n\n[Chorus]\nIn your eyes I see the dawn\nAll my fears are gone"
|
||||
|
||||
audio = pipe(
|
||||
prompt=prompt,
|
||||
lyrics=lyrics,
|
||||
duration=30.0,
|
||||
seed=42,
|
||||
num_inference_steps=20,
|
||||
cfg_scale=7.0,
|
||||
shift=3.0,
|
||||
)
|
||||
|
||||
sf.write("acestep-v15-xl-sft.wav", audio.cpu().numpy(), pipe.sample_rate)
|
||||
print(f"Saved, shape: {audio.shape}")
|
||||
52
examples/ace_step/model_inference/acestep-v15-xl-turbo.py
Normal file
52
examples/ace_step/model_inference/acestep-v15-xl-turbo.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""
|
||||
Ace-Step 1.5 XL Turbo (32 layers) — Text-to-Music inference example.
|
||||
|
||||
XL turbo with fast generation (8 steps, shift=3.0, no CFG).
|
||||
"""
|
||||
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
|
||||
import torch
|
||||
import soundfile as sf
|
||||
|
||||
|
||||
pipe = AceStepPipeline.from_pretrained(
|
||||
torch_dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
model_configs=[
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/acestep-v15-xl-turbo",
|
||||
origin_file_pattern="model-*.safetensors"
|
||||
),
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/acestep-v15-xl-turbo",
|
||||
origin_file_pattern="model-*.safetensors"
|
||||
),
|
||||
ModelConfig(
|
||||
model_id="ACE-Step/acestep-v15-xl-turbo",
|
||||
origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"
|
||||
),
|
||||
],
|
||||
tokenizer_config=ModelConfig(
|
||||
model_id="ACE-Step/acestep-v15-xl-turbo",
|
||||
origin_file_pattern="Qwen3-Embedding-0.6B/"
|
||||
),
|
||||
vae_config=ModelConfig(
|
||||
model_id="ACE-Step/acestep-v15-xl-turbo",
|
||||
origin_file_pattern="vae/"
|
||||
),
|
||||
)
|
||||
|
||||
prompt = "An upbeat electronic dance track with pulsing synths and driving bassline"
|
||||
lyrics = "[Intro - Synth build]\n\n[Verse 1]\nFeel the rhythm in the air\nElectric beats are everywhere\n\n[Drop]\n\n[Chorus]\nDance until the break of dawn\nMove your body, carry on"
|
||||
|
||||
audio = pipe(
|
||||
prompt=prompt,
|
||||
lyrics=lyrics,
|
||||
duration=30.0,
|
||||
seed=42,
|
||||
num_inference_steps=8,
|
||||
cfg_scale=1.0, # turbo: no CFG
|
||||
shift=3.0,
|
||||
)
|
||||
|
||||
sf.write("acestep-v15-xl-turbo.wav", audio.cpu().numpy(), pipe.sample_rate)
|
||||
print(f"Saved, shape: {audio.shape}")
|
||||
Reference in New Issue
Block a user