mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-18 22:08:13 +00:00
@@ -768,7 +768,7 @@ ltx2_series = [
|
||||
"model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
|
||||
"model_name": "ltx2_text_encoder_post_modules",
|
||||
"model_class": "diffsynth.models.ltx2_text_encoder.LTX2TextEncoderPostModules",
|
||||
"extra_kwargs": {"seperated_audio_video": True, "embedding_dim_gemma": 3840, "num_layers_gemma": 49, "video_attetion_heads": 32, "video_attention_head_dim": 128, "audio_attention_heads": 32, "audio_attention_head_dim": 64, "num_connetor_layers": 8, "apply_gated_attention": True},
|
||||
"extra_kwargs": {"separated_audio_video": True, "embedding_dim_gemma": 3840, "num_layers_gemma": 49, "video_attention_heads": 32, "video_attention_head_dim": 128, "audio_attention_heads": 32, "audio_attention_head_dim": 64, "num_connector_layers": 8, "apply_gated_attention": True},
|
||||
"state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_text_encoder.LTX2TextEncoderPostModulesStateDictConverter",
|
||||
},
|
||||
{
|
||||
|
||||
@@ -406,36 +406,36 @@ class Embeddings1DConnector(nn.Module):
|
||||
class LTX2TextEncoderPostModules(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
seperated_audio_video: bool = False,
|
||||
separated_audio_video: bool = False,
|
||||
embedding_dim_gemma: int = 3840,
|
||||
num_layers_gemma: int = 49,
|
||||
video_attetion_heads: int = 32,
|
||||
video_attention_heads: int = 32,
|
||||
video_attention_head_dim: int = 128,
|
||||
audio_attention_heads: int = 32,
|
||||
audio_attention_head_dim: int = 64,
|
||||
num_connetor_layers: int = 2,
|
||||
num_connector_layers: int = 2,
|
||||
apply_gated_attention: bool = False,
|
||||
):
|
||||
super().__init__()
|
||||
if not seperated_audio_video:
|
||||
if not separated_audio_video:
|
||||
self.feature_extractor_linear = GemmaFeaturesExtractorProjLinear()
|
||||
self.embeddings_connector = Embeddings1DConnector()
|
||||
self.audio_embeddings_connector = Embeddings1DConnector()
|
||||
else:
|
||||
# LTX-2.3
|
||||
self.feature_extractor_linear = GemmaSeperatedFeaturesExtractorProjLinear(
|
||||
num_layers_gemma, embedding_dim_gemma, video_attetion_heads * video_attention_head_dim,
|
||||
num_layers_gemma, embedding_dim_gemma, video_attention_heads * video_attention_head_dim,
|
||||
audio_attention_heads * audio_attention_head_dim)
|
||||
self.embeddings_connector = Embeddings1DConnector(
|
||||
attention_head_dim=video_attention_head_dim,
|
||||
num_attention_heads=video_attetion_heads,
|
||||
num_layers=num_connetor_layers,
|
||||
num_attention_heads=video_attention_heads,
|
||||
num_layers=num_connector_layers,
|
||||
apply_gated_attention=apply_gated_attention,
|
||||
)
|
||||
self.audio_embeddings_connector = Embeddings1DConnector(
|
||||
attention_head_dim=audio_attention_head_dim,
|
||||
num_attention_heads=audio_attention_heads,
|
||||
num_layers=num_connetor_layers,
|
||||
num_layers=num_connector_layers,
|
||||
apply_gated_attention=apply_gated_attention,
|
||||
)
|
||||
|
||||
|
||||
@@ -413,7 +413,7 @@ class LTX2AudioVideoUnit_InputAudioEmbedder(PipelineUnit):
|
||||
class LTX2AudioVideoUnit_InputImagesEmbedder(PipelineUnit):
|
||||
def __init__(self):
|
||||
super().__init__(
|
||||
input_params=("input_images", "input_images_indexes", "input_images_strength", "video_latents", "height", "width", "num_frames", "tiled", "tile_size_in_pixels", "tile_overlap_in_pixels", "use_two_stage_pipeline"),
|
||||
input_params=("input_images", "input_images_indexes", "input_images_strength", "video_latents", "height", "width", "tiled", "tile_size_in_pixels", "tile_overlap_in_pixels", "use_two_stage_pipeline"),
|
||||
output_params=("video_latents", "denoise_mask_video", "input_latents_video", "stage2_input_latents"),
|
||||
onload_model_names=("video_vae_encoder")
|
||||
)
|
||||
@@ -426,7 +426,7 @@ class LTX2AudioVideoUnit_InputImagesEmbedder(PipelineUnit):
|
||||
latent = pipe.video_vae_encoder.encode(image, tiled, tile_size_in_pixels, tile_overlap_in_pixels).to(pipe.device)
|
||||
return latent
|
||||
|
||||
def process(self, pipe: LTX2AudioVideoPipeline, input_images, input_images_indexes, input_images_strength, video_latents, height, width, num_frames, tiled, tile_size_in_pixels, tile_overlap_in_pixels, use_two_stage_pipeline=False):
|
||||
def process(self, pipe: LTX2AudioVideoPipeline, input_images, input_images_indexes, input_images_strength, video_latents, height, width, tiled, tile_size_in_pixels, tile_overlap_in_pixels, use_two_stage_pipeline=False):
|
||||
if input_images is None or len(input_images) == 0:
|
||||
return {"video_latents": video_latents}
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user