"""Text-to-music inference example for Ace-Step 1.5 XL Base.

The XL Base variant (32 layers, hidden_size=2560) is the larger-capacity
model intended for higher quality generation. Running this script builds the
pipeline on CUDA in bfloat16, generates a 30-second clip from a text prompt
plus lyrics, and writes the result to a WAV file in the working directory.
"""
from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
import torch
import soundfile as sf

# Every component of this example is pulled from the same model repository.
MODEL_ID = "ACE-Step/acestep-v15-xl-base"
OUTPUT_PATH = "acestep-v15-xl-base.wav"


def _xl_config(file_pattern):
    """Return a ModelConfig for *file_pattern* inside the XL Base repository."""
    return ModelConfig(model_id=MODEL_ID, origin_file_pattern=file_pattern)


# Text conditioning for the generation.
prompt = "An epic symphonic metal track with double bass drums and soaring vocals"
lyrics = (
    "[Intro - Heavy guitar riff]\n\n"
    "[Verse 1]\n"
    "Steel and thunder, fire and rain\n"
    "Burning through the endless pain\n\n"
    "[Chorus]\n"
    "Rise up, break the chains\n"
    "Unleash the fire in your veins"
)

# NOTE(review): the sharded checkpoint pattern is intentionally listed twice,
# exactly as in the original example — presumably two sub-models are loaded
# from the same shards; confirm against the pipeline's model loader.
component_configs = [
    _xl_config("model-*.safetensors"),
    _xl_config("model-*.safetensors"),
    _xl_config("Qwen3-Embedding-0.6B/model.safetensors"),
]

pipe = AceStepPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=component_configs,
    tokenizer_config=_xl_config("Qwen3-Embedding-0.6B/"),
    vae_config=_xl_config("vae/"),
)

# Sampling settings, passed to the pipeline as keyword arguments.
generation_args = {
    "prompt": prompt,
    "lyrics": lyrics,
    "duration": 30.0,
    "seed": 42,
    "num_inference_steps": 20,
    "cfg_scale": 7.0,
    "shift": 3.0,
}
audio = pipe(**generation_args)

# Move the result to host memory as a NumPy array before handing it to soundfile.
waveform = audio.cpu().numpy()
sf.write(OUTPUT_PATH, waveform, pipe.sample_rate)
print(f"Saved, shape: {audio.shape}")