add audio_vae, audio_vocoder, text_encoder, connector and upsampler for ltx2

2026-03-18 22:08:13 +00:00 · 2026-01-28 16:09:22 +08:00
parent 00da4b6c4f
commit 8d303b47e9
8 changed files with 2207 additions and 24 deletions
--- a/diffsynth/utils/state_dict_converters/ltx2_audio_vae.py
+++ b/diffsynth/utils/state_dict_converters/ltx2_audio_vae.py
@@ -0,0 +1,32 @@
+def LTX2AudioEncoderStateDictConverter(state_dict):
+    """Extract the audio-VAE encoder entries from a combined state dict.
+
+    Marked "Not used" by the original author.
+
+    Args:
+        state_dict: Mapping from parameter name to value.
+
+    Returns:
+        A new dict. Keys starting with ``audio_vae.encoder.`` have that prefix
+        removed; keys starting with ``audio_vae.per_channel_statistics.`` have
+        the leading ``audio_vae.`` removed. All other keys are dropped. Values
+        are passed through untouched.
+    """
+    # (prefix to match, text that takes its place)
+    prefix_map = (
+        ("audio_vae.encoder.", ""),
+        ("audio_vae.per_channel_statistics.", "per_channel_statistics."),
+    )
+    state_dict_ = {}
+    for name in state_dict:
+        for old_prefix, new_prefix in prefix_map:
+            if name.startswith(old_prefix):
+                # Cut only the leading prefix. str.replace() would also rewrite
+                # any later occurrence of the same text inside the key.
+                state_dict_[new_prefix + name[len(old_prefix):]] = state_dict[name]
+                break
+    return state_dict_
+
+
+def LTX2AudioDecoderStateDictConverter(state_dict):
+    """Extract the audio-VAE decoder entries from a combined state dict.
+
+    Args:
+        state_dict: Mapping from parameter name to value.
+
+    Returns:
+        A new dict. Keys starting with ``audio_vae.decoder.`` have that prefix
+        removed; keys starting with ``audio_vae.per_channel_statistics.`` have
+        the leading ``audio_vae.`` removed. All other keys are dropped. Values
+        are passed through untouched.
+    """
+    # (prefix to match, text that takes its place)
+    prefix_map = (
+        ("audio_vae.decoder.", ""),
+        ("audio_vae.per_channel_statistics.", "per_channel_statistics."),
+    )
+    state_dict_ = {}
+    for name in state_dict:
+        for old_prefix, new_prefix in prefix_map:
+            if name.startswith(old_prefix):
+                # Cut only the leading prefix. str.replace() would also rewrite
+                # any later occurrence of the same text inside the key.
+                state_dict_[new_prefix + name[len(old_prefix):]] = state_dict[name]
+                break
+    return state_dict_
+
+
+def LTX2VocoderStateDictConverter(state_dict):
+    """Extract the vocoder entries from a combined state dict.
+
+    Args:
+        state_dict: Mapping from parameter name to value.
+
+    Returns:
+        A new dict holding only the entries whose key starts with ``vocoder.``,
+        with that leading prefix removed. Values are passed through untouched.
+    """
+    prefix = "vocoder."
+    # Slice instead of str.replace(): replace() rewrites every occurrence, so a
+    # key such as "vocoder.sub_vocoder.weight" would lose its inner "vocoder."
+    # as well and end up as "sub_weight".
+    return {
+        name[len(prefix):]: state_dict[name]
+        for name in state_dict
+        if name.startswith(prefix)
+    }
--- a/diffsynth/utils/state_dict_converters/ltx2_text_encoder.py
+++ b/diffsynth/utils/state_dict_converters/ltx2_text_encoder.py
@@ -0,0 +1,31 @@
+def LTX2TextEncoderStateDictConverter(state_dict):
+    """Rename text-encoder checkpoint keys to the layout the target model expects.
+
+    Args:
+        state_dict: Mapping from parameter name to value.
+
+    Returns:
+        A new dict in which the leading prefix of each recognised key is
+        rewritten:
+
+        * ``language_model.model.``   -> ``model.language_model.``
+        * ``vision_tower.``           -> ``model.vision_tower.``
+        * ``multi_modal_projector.``  -> ``model.multi_modal_projector.``
+        * ``language_model.lm_head.`` -> ``lm_head.``
+
+        Keys matching none of these prefixes are dropped. When
+        ``model.language_model.embed_tokens.weight`` is present in the result,
+        ``lm_head.weight`` is set to that same value, overriding any
+        ``lm_head.weight`` converted from the input. When it is absent, no
+        ``lm_head.weight`` entry is invented.
+    """
+    # (prefix to match, text that takes its place)
+    prefix_map = (
+        ("language_model.model.", "model.language_model."),
+        ("vision_tower.", "model.vision_tower."),
+        ("multi_modal_projector.", "model.multi_modal_projector."),
+        ("language_model.lm_head.", "lm_head."),
+    )
+    state_dict_ = {}
+    for key in state_dict:
+        for old_prefix, new_prefix in prefix_map:
+            if key.startswith(old_prefix):
+                # Rewrite only the leading prefix. str.replace() would also
+                # rewrite any later occurrence of the same text inside the key.
+                state_dict_[new_prefix + key[len(old_prefix):]] = state_dict[key]
+                break
+    # NOTE(review): presumably the output head shares the input-embedding
+    # weights -- confirm against the model definition.
+    embed_tokens = state_dict_.get("model.language_model.embed_tokens.weight")
+    if embed_tokens is not None:
+        # Guarded so a missing embedding never inserts a None entry or wipes a
+        # converted lm_head weight.
+        state_dict_["lm_head.weight"] = embed_tokens
+    return state_dict_
+
+
+def LTX2TextEncoderPostModulesStateDictConverter(state_dict):
+    """Rename the keys of the modules applied after the text encoder.
+
+    Args:
+        state_dict: Mapping from parameter name to value.
+
+    Returns:
+        A new dict in which the leading prefix of each recognised key is
+        rewritten:
+
+        * ``text_embedding_projection.`` -> ``feature_extractor_linear.``
+        * ``model.diffusion_model.video_embeddings_connector.``
+          -> ``embeddings_connector.``
+        * ``model.diffusion_model.audio_embeddings_connector.``
+          -> ``audio_embeddings_connector.``
+
+        Keys matching none of these prefixes are dropped. Values are passed
+        through untouched.
+    """
+    # (prefix to match, text that takes its place)
+    prefix_map = (
+        ("text_embedding_projection.", "feature_extractor_linear."),
+        ("model.diffusion_model.video_embeddings_connector.", "embeddings_connector."),
+        ("model.diffusion_model.audio_embeddings_connector.", "audio_embeddings_connector."),
+    )
+    state_dict_ = {}
+    for key in state_dict:
+        for old_prefix, new_prefix in prefix_map:
+            if key.startswith(old_prefix):
+                # Rewrite only the leading prefix. str.replace() would also
+                # rewrite any later occurrence of the same text inside the key.
+                state_dict_[new_prefix + key[len(old_prefix):]] = state_dict[key]
+                break
+    return state_dict_
--- a/diffsynth/utils/test/load_model.py
+++ b/diffsynth/utils/test/load_model.py
@@ -1,22 +0,0 @@
-import torch
-from diffsynth.models.model_loader import ModelPool
-from diffsynth.core.loader import ModelConfig
-
-
-def test_model_loading(model_name,
-                       model_config: ModelConfig,
-                       vram_limit: float = None,
-                       device="cpu",
-                       torch_dtype=torch.bfloat16):
-    model_pool = ModelPool()
-    model_config.download_if_necessary()
-    vram_config = model_config.vram_config()
-    vram_config["computation_dtype"] = torch_dtype
-    vram_config["computation_device"] = device
-    model_pool.auto_load_model(
-        model_config.path,
-        vram_config=vram_config,
-        vram_limit=vram_limit,
-        clear_parameters=model_config.clear_parameters,
-    )
-    return model_pool.fetch_model(model_name)