diff --git a/diffsynth/configs/vram_management_module_maps.py b/diffsynth/configs/vram_management_module_maps.py index 8d4800b..299a830 100644 --- a/diffsynth/configs/vram_management_module_maps.py +++ b/diffsynth/configs/vram_management_module_maps.py @@ -295,6 +295,43 @@ VRAM_MANAGEMENT_MODULE_MAPS = { "transformers.models.qwen3_vl.modeling_qwen3_vl.Qwen3VLTextRMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule", "transformers.models.qwen3_vl.modeling_qwen3_vl.Qwen3VLTextRotaryEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule", }, + # ACE-Step module maps + "diffsynth.models.ace_step_dit.AceStepDiTModel": { + "diffsynth.models.ace_step_dit.AceStepDiTLayer": "diffsynth.core.vram.layers.AutoWrappedNonRecurseModule", + "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear", + "torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule", + "torch.nn.ConvTranspose1d": "diffsynth.core.vram.layers.AutoWrappedModule", + "transformers.models.qwen3.modeling_qwen3.Qwen3RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule", + "transformers.models.qwen3.modeling_qwen3.Qwen3MLP": "diffsynth.core.vram.layers.AutoWrappedModule", + "transformers.models.qwen3.modeling_qwen3.Qwen3RotaryEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule", + }, + "diffsynth.models.ace_step_conditioner.AceStepConditionEncoder": { + "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear", + "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule", + "transformers.models.qwen3.modeling_qwen3.Qwen3RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule", + "transformers.models.qwen3.modeling_qwen3.Qwen3MLP": "diffsynth.core.vram.layers.AutoWrappedModule", + "transformers.models.qwen3.modeling_qwen3.Qwen3RotaryEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule", + }, + "diffsynth.models.ace_step_text_encoder.AceStepTextEncoder": { + "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear", + "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule", + "transformers.models.qwen3.modeling_qwen3.Qwen3RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule", + "transformers.models.qwen3.modeling_qwen3.Qwen3MLP": "diffsynth.core.vram.layers.AutoWrappedModule", + "transformers.models.qwen3.modeling_qwen3.Qwen3RotaryEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule", + }, + "diffsynth.models.ace_step_vae.AceStepVAE": { + "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear", + "torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule", + "torch.nn.ConvTranspose1d": "diffsynth.core.vram.layers.AutoWrappedModule", + "diffsynth.models.ace_step_vae.Snake1d": "diffsynth.core.vram.layers.AutoWrappedModule", + }, + "diffsynth.models.ace_step_tokenizer.AceStepTokenizer": { + "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear", + "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule", + "transformers.models.qwen3.modeling_qwen3.Qwen3RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule", + "transformers.models.qwen3.modeling_qwen3.Qwen3MLP": "diffsynth.core.vram.layers.AutoWrappedModule", + "transformers.models.qwen3.modeling_qwen3.Qwen3RotaryEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule", + }, } def QwenImageTextEncoder_Module_Map_Updater(): diff --git a/diffsynth/models/ace_step_dit.py b/diffsynth/models/ace_step_dit.py index d917277..16669dc 100644 --- a/diffsynth/models/ace_step_dit.py +++ b/diffsynth/models/ace_step_dit.py @@ -522,7 +522,7 @@ class AceStepDiTLayer(nn.Module): # Extract scale-shift parameters for adaptive layer norm from timestep embeddings # 6 values: (shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa) shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = ( - self.scale_shift_table + temb + self.scale_shift_table.to(temb.device) + temb ).chunk(6, dim=1) # Step 1: Self-attention with adaptive layer norm (AdaLN) @@ -889,7 +889,7 @@ class AceStepDiTModel(nn.Module): return hidden_states # Extract scale-shift parameters for adaptive output normalization - shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1) + shift, scale = (self.scale_shift_table.to(temb.device) + temb.unsqueeze(1)).chunk(2, dim=1) shift = shift.to(hidden_states.device) scale = scale.to(hidden_states.device) diff --git a/diffsynth/models/ace_step_tokenizer.py b/diffsynth/models/ace_step_tokenizer.py index c01e9d5..5bd0e74 100644 --- a/diffsynth/models/ace_step_tokenizer.py +++ b/diffsynth/models/ace_step_tokenizer.py @@ -594,7 +594,7 @@ class AudioTokenDetokenizer(nn.Module): x = self.embed_tokens(x) x = x.unsqueeze(2).repeat(1, 1, self.pool_window_size, 1) special_tokens = self.special_tokens.expand(B, T, -1, -1) - x = x + special_tokens + x = x + special_tokens.to(x.device) x = rearrange(x, "b t p c -> (b t) p c") cache_position = torch.arange(0, x.shape[1], device=x.device) diff --git a/diffsynth/models/ace_step_vae.py b/diffsynth/models/ace_step_vae.py index 168f851..ae5b501 100644 --- a/diffsynth/models/ace_step_vae.py +++ b/diffsynth/models/ace_step_vae.py @@ -22,7 +22,7 @@ from typing import Optional import torch import torch.nn as nn -from torch.nn.utils import weight_norm +from torch.nn.utils import weight_norm, remove_weight_norm class Snake1d(nn.Module): @@ -240,3 +240,9 @@ class AceStepVAE(nn.Module): """Full round-trip: encode → decode.""" z = self.encode(sample) return self.decoder(z) + + def remove_weight_norm(self): + """Remove weight normalization from all conv layers (for export/inference).""" + for module in self.modules(): + if isinstance(module, nn.Conv1d) or isinstance(module, nn.ConvTranspose1d): + remove_weight_norm(module) diff --git a/diffsynth/pipelines/ace_step.py b/diffsynth/pipelines/ace_step.py index 2f3256c..d369da6 100644 --- a/diffsynth/pipelines/ace_step.py +++ b/diffsynth/pipelines/ace_step.py @@ -69,6 +69,7 @@ class AceStepPipeline(BasePipeline): pipe.conditioner = model_pool.fetch_model("ace_step_conditioner") pipe.dit = model_pool.fetch_model("ace_step_dit") pipe.vae = model_pool.fetch_model("ace_step_vae") + pipe.vae.remove_weight_norm() pipe.tokenizer_model = model_pool.fetch_model("ace_step_tokenizer") if text_tokenizer_config is not None: @@ -372,8 +373,9 @@ class AceStepUnit_ConditionEmbedder(PipelineUnit): ) inputs_posi["encoder_hidden_states"] = encoder_hidden_states inputs_posi["encoder_attention_mask"] = encoder_attention_mask - inputs_nega["encoder_hidden_states"] = pipe.conditioner.null_condition_emb.expand_as(encoder_hidden_states) - inputs_nega["encoder_attention_mask"] = encoder_attention_mask + if inputs_shared["cfg_scale"] != 1.0: + inputs_nega["encoder_hidden_states"] = pipe.conditioner.null_condition_emb.expand_as(encoder_hidden_states).to(dtype=encoder_hidden_states.dtype, device=encoder_hidden_states.device) + inputs_nega["encoder_attention_mask"] = encoder_attention_mask return inputs_shared, inputs_posi, inputs_nega @@ -468,10 +470,15 @@ class AceStepUnit_AudioCodeDecoder(PipelineUnit): return {"lm_hints": None} pipe.load_models_to_device(["tokenizer_model"]) - indices = torch.tensor(code_ids, device=pipe.device, dtype=torch.long) - indices = indices.unsqueeze(0).unsqueeze(-1) # [1, N, 1] - quantized = pipe.tokenizer_model.tokenizer.quantizer.get_output_from_indices(indices).to(pipe.torch_dtype) # [1, N, 2048] - lm_hints = pipe.tokenizer_model.detokenizer(quantized) # [1, N*5, 64] + quantizer = pipe.tokenizer_model.tokenizer.quantizer + detokenizer = pipe.tokenizer_model.detokenizer + + indices = torch.tensor(code_ids, device=quantizer.codebooks.device, dtype=torch.long).unsqueeze(0).unsqueeze(-1) + codes = quantizer.get_codes_from_indices(indices) + quantized = codes.sum(dim=0).to(pipe.torch_dtype).to(pipe.device) + quantized = quantizer.project_out(quantized) + + lm_hints = detokenizer(quantized).to(pipe.device) return {"lm_hints": lm_hints} diff --git a/examples/ace_step/model_inference/Ace-Step1.5.py b/examples/ace_step/model_inference/Ace-Step1.5.py index 219cb31..f098396 100644 --- a/examples/ace_step/model_inference/Ace-Step1.5.py +++ b/examples/ace_step/model_inference/Ace-Step1.5.py @@ -24,8 +24,6 @@ audio = pipe( timesignature="4", vocal_language="zh", seed=42, - num_inference_steps=8, - cfg_scale=1.0, ) save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo.wav") @@ -44,7 +42,5 @@ audio = pipe( timesignature="4", vocal_language="zh", seed=42, - num_inference_steps=8, - cfg_scale=1.0, ) save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo5-with-audio-codes.wav") diff --git a/examples/ace_step/model_inference_low_vram/Ace-Step1.5.py b/examples/ace_step/model_inference_low_vram/Ace-Step1.5.py new file mode 100644 index 0000000..3ccd39d --- /dev/null +++ b/examples/ace_step/model_inference_low_vram/Ace-Step1.5.py @@ -0,0 +1,66 @@ +""" +Ace-Step 1.5 (main model, turbo) — Text-to-Music inference example (Low VRAM). + +Low VRAM version: models are offloaded to CPU and loaded on-demand. +Turbo model: uses num_inference_steps=8, cfg_scale=1.0. +""" +from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig +from diffsynth.utils.data.audio import save_audio +import torch + + +vram_config = { + "offload_dtype": torch.bfloat16, + "offload_device": "cpu", + "onload_dtype": torch.bfloat16, + "onload_device": "cpu", + "preparing_dtype": torch.bfloat16, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} + + +pipe = AceStepPipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="acestep-v15-turbo/model.safetensors", **vram_config), + ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors", **vram_config), + ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), + ], + text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"), + vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +) + +prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating." +lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]' +audio = pipe( + prompt=prompt, + lyrics=lyrics, + duration=160, + bpm=100, + keyscale="B minor", + timesignature="4", + vocal_language="zh", + seed=42, +) + +save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo-low-vram.wav") + +# input audio codes as reference +with open("data/diffsynth_example_dataset/ace_step/Ace-Step1.5/audio_codes_input.txt", "r") as f: + audio_code_string = f.read().strip() + +audio = pipe( + prompt=prompt, + lyrics=lyrics, + audio_code_string=audio_code_string, + duration=160, + bpm=100, + keyscale="B minor", + timesignature="4", + vocal_language="zh", + seed=42, +) +save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo5-with-audio-codes-low-vram.wav") diff --git a/examples/ace_step/model_inference_low_vram/acestep-v15-base.py b/examples/ace_step/model_inference_low_vram/acestep-v15-base.py new file mode 100644 index 0000000..fc997f2 --- /dev/null +++ b/examples/ace_step/model_inference_low_vram/acestep-v15-base.py @@ -0,0 +1,49 @@ +""" +Ace-Step 1.5 Base — Text-to-Music inference example (Low VRAM). + +Low VRAM version: models are offloaded to CPU and loaded on-demand. +""" +from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig +from diffsynth.utils.data.audio import save_audio +import torch + + +vram_config = { + "offload_dtype": torch.bfloat16, + "offload_device": "cpu", + "onload_dtype": torch.bfloat16, + "onload_device": "cpu", + "preparing_dtype": torch.bfloat16, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} + + +pipe = AceStepPipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="ACE-Step/acestep-v15-base", origin_file_pattern="model.safetensors", **vram_config), + ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors", **vram_config), + ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), + ], + text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"), + vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +) + +prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating." +lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]' +audio = pipe( + prompt=prompt, + lyrics=lyrics, + duration=160, + bpm=100, + keyscale="B minor", + timesignature="4", + vocal_language="zh", + seed=42, + num_inference_steps=30, + cfg_scale=4.0, +) +save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-base-low-vram.wav") diff --git a/examples/ace_step/model_inference_low_vram/acestep-v15-sft.py b/examples/ace_step/model_inference_low_vram/acestep-v15-sft.py new file mode 100644 index 0000000..189c26a --- /dev/null +++ b/examples/ace_step/model_inference_low_vram/acestep-v15-sft.py @@ -0,0 +1,51 @@ +""" +Ace-Step 1.5 SFT (supervised fine-tuned) — Text-to-Music inference example (Low VRAM). + +Low VRAM version: models are offloaded to CPU and loaded on-demand. +SFT variant is fine-tuned for specific music styles. +Non-turbo model: uses num_inference_steps=30, cfg_scale=4.0. +""" +from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig +from diffsynth.utils.data.audio import save_audio +import torch + + +vram_config = { + "offload_dtype": torch.bfloat16, + "offload_device": "cpu", + "onload_dtype": torch.bfloat16, + "onload_device": "cpu", + "preparing_dtype": torch.bfloat16, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} + + +pipe = AceStepPipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="ACE-Step/acestep-v15-sft", origin_file_pattern="model.safetensors", **vram_config), + ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors", **vram_config), + ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), + ], + text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"), + vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +) + +prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating." +lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]' +audio = pipe( + prompt=prompt, + lyrics=lyrics, + duration=160, + bpm=100, + keyscale="B minor", + timesignature="4", + vocal_language="zh", + seed=42, + num_inference_steps=30, + cfg_scale=4.0, +) +save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-sft-low-vram.wav") diff --git a/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-continuous.py b/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-continuous.py new file mode 100644 index 0000000..420bc93 --- /dev/null +++ b/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-continuous.py @@ -0,0 +1,49 @@ +""" +Ace-Step 1.5 Turbo (continuous, shift 1-5) — Text-to-Music inference example (Low VRAM). + +Low VRAM version: models are offloaded to CPU and loaded on-demand. +Turbo model: no num_inference_steps or cfg_scale (use defaults). +Continuous variant: handles shift range internally, no shift parameter needed. +""" +from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig +from diffsynth.utils.data.audio import save_audio +import torch + + +vram_config = { + "offload_dtype": torch.bfloat16, + "offload_device": "cpu", + "onload_dtype": torch.bfloat16, + "onload_device": "cpu", + "preparing_dtype": torch.bfloat16, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} + + +pipe = AceStepPipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="ACE-Step/acestep-v15-turbo-continuous", origin_file_pattern="model.safetensors", **vram_config), + ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors", **vram_config), + ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), + ], + text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"), + vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +) + +prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating." +lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]' +audio = pipe( + prompt=prompt, + lyrics=lyrics, + duration=160, + bpm=100, + keyscale="B minor", + timesignature="4", + vocal_language="zh", + seed=42, +) +save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo-continuous-low-vram.wav") diff --git a/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-shift1.py b/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-shift1.py new file mode 100644 index 0000000..cfa1583 --- /dev/null +++ b/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-shift1.py @@ -0,0 +1,49 @@ +""" +Ace-Step 1.5 Turbo (shift=1) — Text-to-Music inference example (Low VRAM). + +Low VRAM version: models are offloaded to CPU and loaded on-demand. +Turbo model: no num_inference_steps or cfg_scale (use defaults). +shift=1: default value, no need to pass. +""" +from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig +from diffsynth.utils.data.audio import save_audio +import torch + + +vram_config = { + "offload_dtype": torch.bfloat16, + "offload_device": "cpu", + "onload_dtype": torch.bfloat16, + "onload_device": "cpu", + "preparing_dtype": torch.bfloat16, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} + + +pipe = AceStepPipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="ACE-Step/acestep-v15-turbo-shift1", origin_file_pattern="model.safetensors", **vram_config), + ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors", **vram_config), + ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), + ], + text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"), + vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +) + +prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating." +lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]' +audio = pipe( + prompt=prompt, + lyrics=lyrics, + duration=160, + bpm=100, + keyscale="B minor", + timesignature="4", + vocal_language="zh", + seed=42, +) +save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo-shift1-low-vram.wav") diff --git a/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-shift3.py b/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-shift3.py new file mode 100644 index 0000000..aa2af9c --- /dev/null +++ b/examples/ace_step/model_inference_low_vram/acestep-v15-turbo-shift3.py @@ -0,0 +1,50 @@ +""" +Ace-Step 1.5 Turbo (shift=3) — Text-to-Music inference example (Low VRAM). + +Low VRAM version: models are offloaded to CPU and loaded on-demand. +Turbo model: no num_inference_steps or cfg_scale (use defaults). +shift=3: explicitly passed for this variant. +""" +from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig +from diffsynth.utils.data.audio import save_audio +import torch + + +vram_config = { + "offload_dtype": torch.bfloat16, + "offload_device": "cpu", + "onload_dtype": torch.bfloat16, + "onload_device": "cpu", + "preparing_dtype": torch.bfloat16, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} + + +pipe = AceStepPipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="ACE-Step/acestep-v15-turbo-shift3", origin_file_pattern="model.safetensors", **vram_config), + ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors", **vram_config), + ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), + ], + text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"), + vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +) + +prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating." +lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]' +audio = pipe( + prompt=prompt, + lyrics=lyrics, + duration=160, + bpm=100, + keyscale="B minor", + timesignature="4", + vocal_language="zh", + seed=42, + shift=3, +) +save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo-shift3-low-vram.wav") diff --git a/examples/ace_step/model_inference_low_vram/acestep-v15-xl-base.py b/examples/ace_step/model_inference_low_vram/acestep-v15-xl-base.py new file mode 100644 index 0000000..dc772ba --- /dev/null +++ b/examples/ace_step/model_inference_low_vram/acestep-v15-xl-base.py @@ -0,0 +1,51 @@ +""" +Ace-Step 1.5 XL Base — Text-to-Music inference example (Low VRAM). + +Low VRAM version: models are offloaded to CPU and loaded on-demand. +Non-turbo model: uses num_inference_steps=30, cfg_scale=4.0. +""" +from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig +from diffsynth.utils.data.audio import save_audio +import torch + +torch.cuda.reset_peak_memory_stats("cuda") + +vram_config = { + "offload_dtype": torch.bfloat16, + "offload_device": "cpu", + "onload_dtype": torch.bfloat16, + "onload_device": "cpu", + "preparing_dtype": torch.bfloat16, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} + + +pipe = AceStepPipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="ACE-Step/acestep-v15-xl-base", origin_file_pattern="model-*.safetensors", **vram_config), + ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors", **vram_config), + ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), + ], + text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"), + vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +) + +prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating." +lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]' +audio = pipe( + prompt=prompt, + lyrics=lyrics, + duration=160, + bpm=100, + keyscale="B minor", + timesignature="4", + vocal_language="zh", + seed=42, + num_inference_steps=30, + cfg_scale=4.0, +) +save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-xl-base-low-vram.wav") diff --git a/examples/ace_step/model_inference_low_vram/acestep-v15-xl-sft.py b/examples/ace_step/model_inference_low_vram/acestep-v15-xl-sft.py new file mode 100644 index 0000000..5ac17b0 --- /dev/null +++ b/examples/ace_step/model_inference_low_vram/acestep-v15-xl-sft.py @@ -0,0 +1,50 @@ +""" +Ace-Step 1.5 XL SFT (supervised fine-tuned) — Text-to-Music inference example (Low VRAM). + +Low VRAM version: models are offloaded to CPU and loaded on-demand. +Non-turbo model: uses num_inference_steps=30, cfg_scale=4.0. +""" +from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig +from diffsynth.utils.data.audio import save_audio +import torch + + +vram_config = { + "offload_dtype": torch.bfloat16, + "offload_device": "cpu", + "onload_dtype": torch.bfloat16, + "onload_device": "cpu", + "preparing_dtype": torch.bfloat16, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} + + +pipe = AceStepPipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="ACE-Step/acestep-v15-xl-sft", origin_file_pattern="model-*.safetensors", **vram_config), + ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors", **vram_config), + ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), + ], + text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"), + vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +) + +prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating." +lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]' +audio = pipe( + prompt=prompt, + lyrics=lyrics, + duration=160, + bpm=100, + keyscale="B minor", + timesignature="4", + vocal_language="zh", + seed=42, + num_inference_steps=30, + cfg_scale=4.0, +) +save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-xl-sft-low-vram.wav") diff --git a/examples/ace_step/model_inference_low_vram/acestep-v15-xl-turbo.py b/examples/ace_step/model_inference_low_vram/acestep-v15-xl-turbo.py new file mode 100644 index 0000000..53a5ec5 --- /dev/null +++ b/examples/ace_step/model_inference_low_vram/acestep-v15-xl-turbo.py @@ -0,0 +1,48 @@ +""" +Ace-Step 1.5 XL Turbo — Text-to-Music inference example (Low VRAM). + +Low VRAM version: models are offloaded to CPU and loaded on-demand. +Turbo model: no num_inference_steps or cfg_scale (use defaults). +""" +from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig +from diffsynth.utils.data.audio import save_audio +import torch + + +vram_config = { + "offload_dtype": torch.bfloat16, + "offload_device": "cpu", + "onload_dtype": torch.bfloat16, + "onload_device": "cpu", + "preparing_dtype": torch.bfloat16, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} + + +pipe = AceStepPipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="ACE-Step/acestep-v15-xl-turbo", origin_file_pattern="model-*.safetensors", **vram_config), + ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors", **vram_config), + ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), + ], + text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"), + vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +) + +prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating." +lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]' +audio = pipe( + prompt=prompt, + lyrics=lyrics, + duration=160, + bpm=100, + keyscale="B minor", + timesignature="4", + vocal_language="zh", + seed=42, +) +save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-xl-turbo-low-vram.wav")