codes

2026-04-24 15:06:17 +00:00 · 2026-04-23 16:52:59 +08:00
parent 1186379139
commit 394db06d86
7 changed files with 212 additions and 20 deletions
--- a/diffsynth/configs/vram_management_module_maps.py
+++ b/diffsynth/configs/vram_management_module_maps.py
@@ -328,6 +328,7 @@ VRAM_MANAGEMENT_MODULE_MAPS = {
    "diffsynth.models.ace_step_tokenizer.AceStepTokenizer": {
        "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
        "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
        "vector_quantize_pytorch.ResidualFSQ": "diffsynth.core.vram.layers.AutoWrappedModule",
        "transformers.models.qwen3.modeling_qwen3.Qwen3RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
        "transformers.models.qwen3.modeling_qwen3.Qwen3MLP": "diffsynth.core.vram.layers.AutoWrappedModule",
        "transformers.models.qwen3.modeling_qwen3.Qwen3RotaryEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
--- a/diffsynth/models/ace_step_tokenizer.py
+++ b/diffsynth/models/ace_step_tokenizer.py
@@ -349,7 +349,7 @@ class AttentionPooler(nn.Module):
    ) -> torch.Tensor:
        B, T, P, D = x.shape
        x = self.embed_tokens(x)
-        special_tokens = self.special_token.expand(B, T, 1, -1)
+        special_tokens = self.special_token.expand(B, T, 1, -1).to(x.device)
        x = torch.cat([special_tokens, x], dim=2)
        x = rearrange(x, "b t p c -> (b t) p c")
--- a/diffsynth/pipelines/ace_step.py
+++ b/diffsynth/pipelines/ace_step.py
@@ -106,6 +106,9 @@ class AceStepPipeline(BasePipeline):
        audio_cover_strength: float = 1.0,
        # Audio codes
        audio_code_string: Optional[str] = None,
        # Inpainting
        repainting_ranges: Optional[List[Tuple[float, float]]] = None,
        repainting_strength: float = 1.0,
        # Shape
        duration: int = 60,
        # Audio Meta
@@ -134,9 +137,8 @@ class AceStepPipeline(BasePipeline):
            "lyrics": lyrics,
            "task_type": task_type,
            "reference_audios": reference_audios,
-            "src_audio": src_audio,
+            "src_audio": src_audio, "audio_cover_strength": audio_cover_strength, "audio_code_string": audio_code_string,
-            "audio_cover_strength": audio_cover_strength,
+            "repainting_ranges": repainting_ranges, "repainting_strength": repainting_strength,
            "audio_code_string": audio_code_string,
            "duration": duration,
            "bpm": bpm, "keyscale": keyscale, "timesignature": timesignature, "vocal_language": vocal_language,
            "seed": seed,
@@ -162,9 +164,8 @@ class AceStepPipeline(BasePipeline):
                inputs_shared, inputs_posi, inputs_nega,
                **models, timestep=timestep, progress_id=progress_id,
            )
-            inputs_shared["latents"] = self.step(
+            inputs_shared["latents"] = self.step(self.scheduler, inpaint_mask=inputs_shared.get("denoise_mask", None), input_latents=inputs_shared.get("src_latents", None),
-                self.scheduler, progress_id=progress_id, noise_pred=noise_pred, **inputs_shared
+                                                 progress_id=progress_id, noise_pred=noise_pred, **inputs_shared)
            )
        # 5. VAE 解码
        self.load_models_to_device(['vae'])
@@ -201,12 +202,17 @@ class AceStepUnit_TaskTypeChecker(PipelineUnit):
    """Validate task_type and check that the inputs required by that task were supplied."""
    def __init__(self):
        super().__init__(
-            input_params=("task_type",),
+            input_params=("task_type", "src_audio", "repainting_ranges", "audio_code_string"),
            output_params=("task_type",),
        )
-    def process(self, pipe, task_type):
+    def process(self, pipe, task_type, src_audio, repainting_ranges, audio_code_string):
        # Only these three task types are accepted.
        assert task_type in ["text2music", "cover", "repaint"], f"Unsupported task_type: {task_type}"
        if task_type == "cover":
            # A cover needs a source: either raw audio or an audio code string.
            assert (src_audio is not None) or (audio_code_string is not None), "For cover task, either src_audio or audio_code_string must be provided."
        elif task_type == "repaint":
            # Repainting needs the source audio and at least one range to repaint.
            assert src_audio is not None, "For repaint task, src_audio must be provided."
            assert repainting_ranges is not None and len(repainting_ranges) > 0, "For repaint task, inpainting_ranges must be provided and non-empty."
        # This unit only validates; it returns an empty dict.
        return {}
@@ -399,7 +405,7 @@ class AceStepUnit_ConditionEmbedder(PipelineUnit):
 class AceStepUnit_ContextLatentBuilder(PipelineUnit):
    def __init__(self):
        super().__init__(
-            input_params=("duration", "src_audio", "audio_code_string"),
+            input_params=("duration", "src_audio", "audio_code_string", "task_type", "repainting_ranges", "repainting_strength"),
            output_params=("context_latents", "src_latents", "chunk_masks", "attention_mask"),
            onload_model_names=("vae", "tokenizer_model",),
        )
@@ -435,9 +441,46 @@ class AceStepUnit_ContextLatentBuilder(PipelineUnit):
            raise ValueError(f"Invalid audio_code_string format: {e}")
        return codes
-    def process(self, pipe, duration, src_audio, audio_code_string):
+    def pad_src_audio(self, pipe, src_audio, task_type, repainting_ranges):
        """Zero-pad src_audio so that every repainting range lies inside it.

        The audio length is taken in whole seconds
        (src_audio.shape[-1] // pipe.vae.sampling_rate). A range starting below 0
        adds padding on the left, a range ending past that length adds padding on
        the right, and every range is then shifted by the left padding.

        Returns:
            (src_audio, repainting_ranges, pad_left, pad_right). The pads are in the
            same unit as the ranges. When task_type is not "repaint" or no ranges
            are given, the inputs are returned unchanged and both pads are None.
        """
        if task_type != "repaint" or repainting_ranges is None:
            return src_audio, repainting_ranges, None, None
        # Overall extent requested across all ranges.
        min_left = min([start for start, end in repainting_ranges])
        max_right = max([end for start, end in repainting_ranges])
        # Floor division: a trailing fraction of a second is not counted.
        total_length = src_audio.shape[-1] // pipe.vae.sampling_rate
        pad_left = max(0, -min_left)
        pad_right = max(0, max_right - total_length)
        if pad_left > 0 or pad_right > 0:
            # NOTE(review): fractional range values give float pad sizes here; F.pad presumably needs ints — confirm.
            padding_frames_left, padding_frames_right = pad_left * pipe.vae.sampling_rate, pad_right * pipe.vae.sampling_rate
            src_audio = F.pad(src_audio, (padding_frames_left, padding_frames_right), value=0.0)
        # Re-express the ranges relative to the start of the (possibly left-padded) audio.
        repainting_ranges = [(start + pad_left, end + pad_left) for start, end in repainting_ranges]
        return src_audio, repainting_ranges, pad_left, pad_right
    def parse_repaint_masks(self, pipe, src_latents, task_type, repainting_ranges, repainting_strength, pad_left, pad_right):
        """Build the repaint mask and replace the masked part of src_latents with silence.

        Args:
            pipe: pipeline object; ``pipe.vae.sampling_rate``, ``pipe.torch_dtype`` and
                ``pipe.device`` are read.
            src_latents: latents indexed as (batch, frame, channel); dim 1 is the frame axis.
            task_type: masks are only built for ``"repaint"``.
            repainting_ranges: iterable of (start, end) pairs in the same time unit as
                ``pad_left`` / ``pad_right``; values may be ints or floats.
            repainting_strength: mask value written inside each range.
            pad_left, pad_right: padding added before/after the source audio; ``None``
                is treated as no padding.

        Returns:
            ``(denoise_mask, src_latents)``. ``denoise_mask`` has shape
            (1, frames, 1). For non-repaint tasks, or when no ranges are given,
            returns ``(None, src_latents)`` with the latents untouched.
        """
        if task_type != "repaint" or repainting_ranges is None:
            return None, src_latents

        # Audio samples per latent frame (the same factor converts duration to latent length).
        samples_per_frame = 1920
        sampling_rate = pipe.vae.sampling_rate

        def to_frame(position):
            # Ranges may be floats; slice bounds must be ints or indexing raises TypeError.
            return int(position * sampling_rate // samples_per_frame)

        max_latent_length = src_latents.shape[1]
        denoise_mask = torch.zeros((1, max_latent_length, 1), dtype=pipe.torch_dtype, device=pipe.device)
        # Repainting area gets repainting_strength, everything else stays 0.0 (hard edges, no blending).
        for start, end in repainting_ranges:
            denoise_mask[:, to_frame(start):to_frame(end), :] = repainting_strength

        # Padded areas are set to 1.0 (full repaint) to avoid artifacts at the boundaries caused by padding.
        pad_left_frames = to_frame(pad_left or 0)
        pad_right_frames = to_frame(pad_right or 0)
        denoise_mask[:, :pad_left_frames, :] = 1
        denoise_mask[:, max_latent_length - pad_right_frames:, :] = 1

        # Blend towards silence in proportion to the mask value.
        silent_latents = self._get_silence_latent_slice(pipe, max_latent_length).unsqueeze(0)
        src_latents = src_latents * (1 - denoise_mask) + silent_latents * denoise_mask
        return denoise_mask, src_latents
    def process(self, pipe, duration, src_audio, audio_code_string, task_type=None, repainting_ranges=None, repainting_strength=None):
        # get src_latents from audio_code_string > src_audio > silence
        source_latents = None
        denoise_mask = None
        if audio_code_string is not None:
            # use audio_code_string to get src_latents.
            pipe.load_models_to_device(self.onload_model_names)
            code_ids = self._parse_audio_code_string(audio_code_string)
            quantizer = pipe.tokenizer_model.tokenizer.quantizer
@@ -448,33 +491,42 @@ class AceStepUnit_ContextLatentBuilder(PipelineUnit):
            src_latents = pipe.tokenizer_model.detokenizer(quantized).to(pipe.device)
            max_latent_length = src_latents.shape[1]
        elif src_audio is not None:
            # use src_audio to get src_latents.
            pipe.load_models_to_device(self.onload_model_names)
            src_audio = src_audio.unsqueeze(0) if src_audio.dim() == 2 else src_audio
            src_audio = torch.clamp(src_audio, -1.0, 1.0)
            src_audio, repainting_ranges, pad_left, pad_right = self.pad_src_audio(pipe, src_audio, task_type, repainting_ranges)
            src_latents = pipe.vae.encode(src_audio.to(dtype=pipe.torch_dtype, device=pipe.device)).transpose(1, 2)
-            lm_hints_5Hz = self.tokenize(pipe.tokenizer_model.tokenizer, src_latents, pipe.silence_latent, pipe.tokenizer_model.tokenizer.pool_window_size)
+            source_latents = src_latents # cache for potential use in audio inpainting tasks
-            src_latents = pipe.tokenizer_model.detokenizer(lm_hints_5Hz)
+            denoise_mask, src_latents = self.parse_repaint_masks(pipe, src_latents, task_type, repainting_ranges, repainting_strength, pad_left, pad_right)
            if task_type == "cover":
                lm_hints_5Hz = self.tokenize(pipe.tokenizer_model.tokenizer, src_latents, pipe.silence_latent, pipe.tokenizer_model.tokenizer.pool_window_size)
                src_latents = pipe.tokenizer_model.detokenizer(lm_hints_5Hz)
            max_latent_length = src_latents.shape[1]
        else:
            # use silence latents.
            max_latent_length = int(duration * pipe.sample_rate  // 1920)
            src_latents = self._get_silence_latent_slice(pipe, max_latent_length).unsqueeze(0)
        chunk_masks = torch.ones((1, max_latent_length, src_latents.shape[-1]), dtype=torch.bool, device=pipe.device)
        attention_mask = torch.ones((1, max_latent_length), device=src_latents.device, dtype=pipe.torch_dtype)
        context_latents = torch.cat([src_latents, chunk_masks], dim=-1)
-        return {"context_latents": context_latents, "attention_mask": attention_mask}
+        return {"context_latents": context_latents, "attention_mask": attention_mask, "src_latents": source_latents, "denoise_mask": denoise_mask}
 class AceStepUnit_NoiseInitializer(PipelineUnit):
    # Produces the initial "noise" tensor from the seed; when src_latents is
    # available it is mixed into that noise at the first scheduler timestep.
    def __init__(self):
        super().__init__(
-            input_params=("context_latents", "seed", "rand_device"),
+            input_params=("context_latents", "seed", "rand_device", "src_latents"),
            output_params=("noise",),
        )
-    def process(self, pipe, context_latents, seed, rand_device):
+    def process(self, pipe, context_latents, seed, rand_device, src_latents):
        # Same first two dims as context_latents, but only half of its last dimension.
        src_latents_shape = (context_latents.shape[0], context_latents.shape[1], context_latents.shape[-1] // 2)
        noise = pipe.generate_noise(src_latents_shape, seed=seed, rand_device=rand_device, rand_torch_dtype=pipe.torch_dtype)
-        noise = pipe.scheduler.add_noise(context_latents[:, :, :src_latents_shape[-1]], noise, timestep=pipe.scheduler.timesteps[0])
+        if src_latents is not None:
            # Start from the source latents noised to the first scheduler timestep.
            noise = pipe.scheduler.add_noise(src_latents, noise, timestep=pipe.scheduler.timesteps[0])
        return {"noise": noise}
@@ -502,7 +554,6 @@ class AceStepUnit_InputAudioEmbedder(PipelineUnit):
            return {"input_latents": input_latents}
 def model_fn_ace_step(
    dit: AceStepDiTModel,
    latents=None,
--- a/examples/ace_step/model_inference/acestep-v15-base-CoverTask.py
+++ b/examples/ace_step/model_inference/acestep-v15-base-CoverTask.py
@@ -16,12 +16,14 @@ pipe = AceStepPipeline.from_pretrained(
 prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
 lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
 src_audio, sr = read_audio("data/diffsynth_example_dataset/ace_step/acestep-v15-base/audio.wav", resample=True, resample_rate=pipe.vae.sampling_rate)
 # audio_cover_strength controls how many steps run in cover mode: steps in [0, num_inference_steps * audio_cover_strength] are cover steps, and the remaining steps are regular text-to-music generation steps.
 # denoising_strength controls how strongly the source audio influences the output audio in cover tasks.
 audio = pipe(
    prompt=prompt,
    lyrics=lyrics,
    task_type="cover",
    src_audio=src_audio,
-    audio_cover_strength=0.6,
+    audio_cover_strength=0.5,
    denoising_strength=0.9,
    duration=160,
    bpm=100,
@@ -32,5 +34,4 @@ audio = pipe(
    num_inference_steps=30,
    cfg_scale=4.0,
 )
 save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-base-cover.wav")
--- a/examples/ace_step/model_inference/acestep-v15-base-RepaintTask.py
+++ b/examples/ace_step/model_inference/acestep-v15-base-RepaintTask.py
@@ -0,0 +1,39 @@
 from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
 from diffsynth.utils.data.audio import save_audio, read_audio
 import torch
 pipe = AceStepPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="ACE-Step/acestep-v15-base", origin_file_pattern="model.safetensors"),
        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors"),
        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
    ],
    text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
 )
 prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
 lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
 src_audio, sr = read_audio("data/diffsynth_example_dataset/ace_step/acestep-v15-base/audio.wav", resample=True, resample_rate=pipe.vae.sampling_rate)
 # repainting_ranges are given in seconds and are converted to frames internally by the pipeline. A negative value in repainting_ranges requests padding before the start of the audio.
 # For example, repainting_ranges=[(-10, 30), (150, 200)] repaints the audio from -10s to 30s (adding 10s of padding before the start) and from 150s to 200s. Any part that lies outside the source audio is padded with silence.
 # repainting_strength sets the intensity of repainting inside those ranges: 0 means no repainting (keep the original audio) and 1 means full repainting.
 audio = pipe(
    prompt=prompt,
    lyrics=lyrics,
    task_type="repaint",
    src_audio=src_audio,
    repainting_ranges=[(-10, 30), (150, 200)],
    repainting_strength=1.0,
    duration=210,
    bpm=100,
    keyscale="B minor",
    timesignature="4",
    vocal_language="zh",
    seed=1,
    num_inference_steps=30,
    cfg_scale=4.0,
 )
 save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-base-repaint.wav")
--- a/examples/ace_step/model_inference_low_vram/acestep-v15-base-CoverTask.py
+++ b/examples/ace_step/model_inference_low_vram/acestep-v15-base-CoverTask.py
@@ -0,0 +1,49 @@
 from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
 from diffsynth.utils.data.audio import save_audio, read_audio
 import torch
 vram_config = {
    "offload_dtype": torch.bfloat16,
    "offload_device": "cpu",
    "onload_dtype": torch.bfloat16,
    "onload_device": "cpu",
    "preparing_dtype": torch.bfloat16,
    "preparing_device": "cuda",
    "computation_dtype": torch.bfloat16,
    "computation_device": "cuda",
 }
 pipe = AceStepPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="ACE-Step/acestep-v15-base", origin_file_pattern="model.safetensors", **vram_config),
        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors", **vram_config),
        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
    ],
    text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
    vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
 )
 prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
 lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
 src_audio, sr = read_audio("data/diffsynth_example_dataset/ace_step/acestep-v15-base/audio.wav", resample=True, resample_rate=pipe.vae.sampling_rate)
 # audio_cover_strength controls how many steps run in cover mode: steps in [0, num_inference_steps * audio_cover_strength] are cover steps, and the remaining steps are regular text-to-music generation steps.
 # denoising_strength controls how strongly the source audio influences the output audio in cover tasks.
 audio = pipe(
    prompt=prompt,
    lyrics=lyrics,
    task_type="cover",
    src_audio=src_audio,
    audio_cover_strength=0.5,
    denoising_strength=0.9,
    duration=160,
    bpm=100,
    keyscale="B minor",
    timesignature="4",
    vocal_language="zh",
    seed=42,
    num_inference_steps=30,
    cfg_scale=4.0,
 )
 save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-base-cover.wav")
--- a/examples/ace_step/model_inference_low_vram/acestep-v15-base-RepaintTask.py
+++ b/examples/ace_step/model_inference_low_vram/acestep-v15-base-RepaintTask.py
@@ -0,0 +1,51 @@
 from diffsynth.pipelines.ace_step import AceStepPipeline, ModelConfig
 from diffsynth.utils.data.audio import save_audio, read_audio
 import torch
 vram_config = {
    "offload_dtype": torch.bfloat16,
    "offload_device": "cpu",
    "onload_dtype": torch.bfloat16,
    "onload_device": "cpu",
    "preparing_dtype": torch.bfloat16,
    "preparing_device": "cuda",
    "computation_dtype": torch.bfloat16,
    "computation_device": "cuda",
 }
 pipe = AceStepPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="ACE-Step/acestep-v15-base", origin_file_pattern="model.safetensors", **vram_config),
        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/model.safetensors", **vram_config),
        ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
    ],
    text_tokenizer_config=ModelConfig(model_id="ACE-Step/Ace-Step1.5", origin_file_pattern="Qwen3-Embedding-0.6B/"),
    vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
 )
 prompt = "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating."
 lyrics = '[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]'
 src_audio, sr = read_audio("data/diffsynth_example_dataset/ace_step/acestep-v15-base/audio.wav", resample=True, resample_rate=pipe.vae.sampling_rate)
 # repainting_ranges are given in seconds and are converted to frames internally by the pipeline. A negative value in repainting_ranges requests padding before the start of the audio.
 # For example, repainting_ranges=[(-10, 30), (150, 200)] repaints the audio from -10s to 30s (adding 10s of padding before the start) and from 150s to 200s. Any part that lies outside the source audio is padded with silence.
 # repainting_strength sets the intensity of repainting inside those ranges: 0 means no repainting (keep the original audio) and 1 means full repainting.
 audio = pipe(
    prompt=prompt,
    lyrics=lyrics,
    task_type="repaint",
    src_audio=src_audio,
    repainting_ranges=[(-10, 30), (150, 200)],
    repainting_strength=1.0,
    duration=210,
    bpm=100,
    keyscale="B minor",
    timesignature="4",
    vocal_language="zh",
    seed=1,
    num_inference_steps=30,
    cfg_scale=4.0,
 )
 save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-base-repaint.wav")