update examples and downloaders

2024-06-27 19:43:50 +08:00
parent 0af60b9c73
commit 0b1704976a
21 changed files with 409 additions and 100 deletions
--- a/diffsynth/models/__init__.py
+++ b/diffsynth/models/__init__.py
@@ -48,23 +48,129 @@ preset_models_on_huggingface = {
    ],
 }
 preset_models_on_modelscope = {
+    # Each preset name maps to a list of 3-tuples. Judging by how `download_models`
+    # unpacks such tuples, the fields are (model id on the website, file path inside
+    # that model, local directory) - TODO confirm that this dict is what
+    # `website_to_preset_models["ModelScope"]` refers to.
+    # Hunyuan DiT
    "HunyuanDiT": [
        ("modelscope/HunyuanDiT", "t2i/clip_text_encoder/pytorch_model.bin", "models/HunyuanDiT/t2i/clip_text_encoder"),
        ("modelscope/HunyuanDiT", "t2i/mt5/pytorch_model.bin", "models/HunyuanDiT/t2i/mt5"),
        ("modelscope/HunyuanDiT", "t2i/model/pytorch_model_ema.pt", "models/HunyuanDiT/t2i/model"),
        ("modelscope/HunyuanDiT", "t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin", "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix"),
    ],
+    # Stable Video Diffusion
    "stable-video-diffusion-img2vid-xt": [
        ("AI-ModelScope/stable-video-diffusion-img2vid-xt", "svd_xt.safetensors", "models/stable_video_diffusion"),
    ],
+    # ExVideo
    "ExVideo-SVD-128f-v1": [
        ("ECNU-CILab/ExVideo-SVD-128f-v1", "model.fp16.safetensors", "models/stable_video_diffusion"),
    ],
+    # Stable Diffusion
+    "StableDiffusion_v15": [
+        ("AI-ModelScope/stable-diffusion-v1-5", "v1-5-pruned-emaonly.safetensors", "models/stable_diffusion"),
+    ],
+    "DreamShaper_8": [
+        ("sd_lora/dreamshaper_8", "dreamshaper_8.safetensors", "models/stable_diffusion"),
+    ],
+    "AingDiffusion_v12": [
+        ("sd_lora/aingdiffusion_v12", "aingdiffusion_v12.safetensors", "models/stable_diffusion"),
+    ],
+    "Flat2DAnimerge_v45Sharp": [
+        ("sd_lora/Flat-2D-Animerge", "flat2DAnimerge_v45Sharp.safetensors", "models/stable_diffusion"),
+    ],
+    # Textual Inversion
+    "TextualInversion_VeryBadImageNegative_v1.3": [
+        ("sd_lora/verybadimagenegative_v1.3", "verybadimagenegative_v1.3.pt", "models/textual_inversion"),
+    ],
+    # Stable Diffusion XL
+    "StableDiffusionXL_v1": [
+        ("AI-ModelScope/stable-diffusion-xl-base-1.0", "sd_xl_base_1.0.safetensors", "models/stable_diffusion_xl"),
+    ],
+    "BluePencilXL_v200": [
+        ("sd_lora/bluePencilXL_v200", "bluePencilXL_v200.safetensors", "models/stable_diffusion_xl"),
+    ],
+    "StableDiffusionXL_Turbo": [
+        ("AI-ModelScope/sdxl-turbo", "sd_xl_turbo_1.0_fp16.safetensors", "models/stable_diffusion_xl_turbo"),
+    ],
+    # ControlNet
+    "ControlNet_v11f1p_sd15_depth": [
+        ("AI-ModelScope/ControlNet-v1-1", "control_v11f1p_sd15_depth.pth", "models/ControlNet"),
+        ("sd_lora/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators")
+    ],
+    "ControlNet_v11p_sd15_softedge": [
+        ("AI-ModelScope/ControlNet-v1-1", "control_v11p_sd15_softedge.pth", "models/ControlNet"),
+        ("sd_lora/Annotators", "ControlNetHED.pth", "models/Annotators")
+    ],
+    "ControlNet_v11f1e_sd15_tile": [
+        ("AI-ModelScope/ControlNet-v1-1", "control_v11f1e_sd15_tile.pth", "models/ControlNet")
+    ],
+    "ControlNet_v11p_sd15_lineart": [
+        ("AI-ModelScope/ControlNet-v1-1", "control_v11p_sd15_lineart.pth", "models/ControlNet"),
+        ("sd_lora/Annotators", "sk_model.pth", "models/Annotators"),
+        ("sd_lora/Annotators", "sk_model2.pth", "models/Annotators")
+    ],
+    # AnimateDiff
+    "AnimateDiff_v2": [
+        ("Shanghai_AI_Laboratory/animatediff", "mm_sd_v15_v2.ckpt", "models/AnimateDiff"),
+    ],
+    "AnimateDiff_xl_beta": [
+        ("Shanghai_AI_Laboratory/animatediff", "mm_sdxl_v10_beta.ckpt", "models/AnimateDiff"),
+    ],
+    # RIFE
+    "RIFE": [
+        ("Damo_XR_Lab/cv_rife_video-frame-interpolation", "flownet.pkl", "models/RIFE"),
+    ],
+    # Beautiful Prompt
+    "BeautifulPrompt": [
+        ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
+        ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "generation_config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
+        ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "model.safetensors", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
+        ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "special_tokens_map.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
+        ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "tokenizer.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
+        ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "tokenizer_config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
+    ],
+    # Translator
+    "opus-mt-zh-en": [
+        ("moxying/opus-mt-zh-en", "config.json", "models/translator/opus-mt-zh-en"),
+        ("moxying/opus-mt-zh-en", "generation_config.json", "models/translator/opus-mt-zh-en"),
+        ("moxying/opus-mt-zh-en", "metadata.json", "models/translator/opus-mt-zh-en"),
+        ("moxying/opus-mt-zh-en", "pytorch_model.bin", "models/translator/opus-mt-zh-en"),
+        ("moxying/opus-mt-zh-en", "source.spm", "models/translator/opus-mt-zh-en"),
+        ("moxying/opus-mt-zh-en", "target.spm", "models/translator/opus-mt-zh-en"),
+        ("moxying/opus-mt-zh-en", "tokenizer_config.json", "models/translator/opus-mt-zh-en"),
+        ("moxying/opus-mt-zh-en", "vocab.json", "models/translator/opus-mt-zh-en"),
+    ],
+    # IP-Adapter
+    "IP-Adapter-SD": [
+        ("AI-ModelScope/IP-Adapter", "models/image_encoder/model.safetensors", "models/IpAdapter/stable_diffusion/image_encoder"),
+        ("AI-ModelScope/IP-Adapter", "models/ip-adapter_sd15.bin", "models/IpAdapter/stable_diffusion"),
+    ],
+    "IP-Adapter-SDXL": [
+        ("AI-ModelScope/IP-Adapter", "sdxl_models/image_encoder/model.safetensors", "models/IpAdapter/stable_diffusion_xl/image_encoder"),
+        ("AI-ModelScope/IP-Adapter", "sdxl_models/ip-adapter_sdxl.bin", "models/IpAdapter/stable_diffusion_xl"),
+    ],
 }
 Preset_model_id: TypeAlias = Literal[
+    # Mirrors the preset names used as keys in `preset_models_on_modelscope`;
+    # extend both together when a preset is added.
    "HunyuanDiT",
    "stable-video-diffusion-img2vid-xt",
-    "ExVideo-SVD-128f-v1"
+    "ExVideo-SVD-128f-v1",
+    "StableDiffusion_v15",
+    "DreamShaper_8",
+    "AingDiffusion_v12",
+    "Flat2DAnimerge_v45Sharp",
+    "TextualInversion_VeryBadImageNegative_v1.3",
+    "StableDiffusionXL_v1",
+    "BluePencilXL_v200",
+    "StableDiffusionXL_Turbo",
+    "ControlNet_v11f1p_sd15_depth",
+    "ControlNet_v11p_sd15_softedge",
+    "ControlNet_v11f1e_sd15_tile",
+    "ControlNet_v11p_sd15_lineart",
+    "AnimateDiff_v2",
+    "AnimateDiff_xl_beta",
+    "RIFE",
+    "BeautifulPrompt",
+    "opus-mt-zh-en",
+    "IP-Adapter-SD",
+    "IP-Adapter-SDXL",
 ]
 Preset_model_website: TypeAlias = Literal[
    "HuggingFace",
@@ -80,6 +186,26 @@ website_to_download_fn = {
 }


+def download_models(
+    model_id_list: List[Preset_model_id] = [],
+    downloading_priority: List[Preset_model_website] = ["ModelScope", "HuggingFace"],
+):
+    """Download preset models and return the local paths of the fetched files.
+
+    For every preset in `model_id_list` the websites in `downloading_priority`
+    are tried in order. A later website is only consulted when an earlier one
+    does not list the preset or did not deliver every file of it.
+
+    Args:
+        model_id_list: Preset names to download. Never mutated.
+        downloading_priority: Websites to try, most preferred first. Never mutated.
+
+    Returns:
+        Local file paths (`local_dir` joined with the file's base name) that
+        exist after downloading, in download order and without duplicates.
+    """
+    downloaded_files = []
+    for model_id in model_id_list:
+        for website in downloading_priority:
+            if model_id not in website_to_preset_models[website]:
+                continue
+            preset_files = website_to_preset_models[website][model_id]
+            all_files_available = len(preset_files) > 0
+            # The repo id gets its own name: rebinding `model_id` here would make the
+            # membership test above fail for every remaining website.
+            for repo_id, origin_file_path, local_dir in preset_files:
+                file_to_download = os.path.join(local_dir, os.path.basename(origin_file_path))
+                # Already fetched for an earlier preset or website.
+                if file_to_download in downloaded_files:
+                    continue
+                website_to_download_fn[website](repo_id, origin_file_path, local_dir)
+                # `os.path.exists` also copes with `local_dir` not having been created.
+                if os.path.exists(file_to_download):
+                    downloaded_files.append(file_to_download)
+                else:
+                    all_files_available = False
+            # Fall back to the next website only if something is still missing.
+            if all_files_available:
+                break
+    return downloaded_files
+
+
 class ModelManager:
    def __init__(
        self,
@@ -94,28 +220,19 @@ class ModelManager:
        self.model = {}
        self.model_path = {}
        self.textual_inversion_dict = {}
-        downloaded_files = self.download_models(model_id_list, downloading_priority)
+        downloaded_files = download_models(model_id_list, downloading_priority)
        self.load_models(downloaded_files + file_path_list)

-    def download_models(
+    def load_model_from_origin(
        self,
-        model_id_list: List[Preset_model_id] = [],
-        downloading_priority: List[Preset_model_website] = ["ModelScope", "HuggingFace"],
+        download_from: Preset_model_website = "ModelScope",
+        model_id = "",
+        origin_file_path = "",
+        local_dir = ""
    ):
-        downloaded_files = []
-        for model_id in model_id_list:
-            for website in downloading_priority:
-                if model_id in website_to_preset_models[website]:
-                    for model_id, origin_file_path, local_dir in website_to_preset_models[website][model_id]:
-                        # Check if the file is downloaded.
-                        file_to_download = os.path.join(local_dir, os.path.basename(origin_file_path))
-                        if file_to_download in downloaded_files:
-                            continue
-                        # Download
-                        website_to_download_fn[website](model_id, origin_file_path, local_dir)
-                        if os.path.basename(origin_file_path) in os.listdir(local_dir):
-                            downloaded_files.append(file_to_download)
-        return downloaded_files
+        """Download a single file and load it into this manager.
+
+        `download_from` selects the downloader in `website_to_download_fn`; it is
+        called with `model_id`, `origin_file_path` and `local_dir`. The file is then
+        expected at `local_dir` joined with the base name of `origin_file_path`,
+        and that path is passed to `self.load_model`.
+        """
+        website_to_download_fn[download_from](model_id, origin_file_path, local_dir)
+        file_to_download = os.path.join(local_dir, os.path.basename(origin_file_path))
+        self.load_model(file_to_download)

    def is_stable_video_diffusion(self, state_dict):
        param_name = "model.diffusion_model.output_blocks.9.1.time_stack.0.norm_in.weight"
@@ -158,7 +275,7 @@ class ModelManager:
    
    def is_translator(self, state_dict):
+        # Recognized by one characteristic parameter name plus the exact number of
+        # entries in the state dict.
        param_name = "model.encoder.layers.5.self_attn_layer_norm.weight"
-        return param_name in state_dict and len(state_dict) == 254
+        return param_name in state_dict and len(state_dict) == 258
    
    def is_ipadapter(self, state_dict):
        return "image_proj" in state_dict and "ip_adapter" in state_dict and state_dict["image_proj"]["proj.weight"].shape == torch.Size([3072, 1024])
--- a/diffsynth/models/sd_ipadapter.py
+++ b/diffsynth/models/sd_ipadapter.py
@@ -29,7 +29,7 @@ class SDIpAdapter(torch.nn.Module):

    def set_less_adapter(self):
        # IP-Adapter for SD v1.5 doesn't support this feature.
+        # Delegate to the full adapter instead.
-        self.set_full_adapter(self)
+        self.set_full_adapter()

    def forward(self, hidden_states, scale=1.0):
        hidden_states = self.image_proj(hidden_states)
--- a/diffsynth/pipelines/stable_diffusion_xl.py
+++ b/diffsynth/pipelines/stable_diffusion_xl.py
@@ -87,6 +87,7 @@ class SDXLImagePipeline(torch.nn.Module):
        input_image=None,
        ipadapter_images=None,
        ipadapter_scale=1.0,
+        ipadapter_use_instant_style=False,
        controlnet_image=None,
        denoising_strength=1.0,
        height=1024,
@@ -134,6 +135,10 @@ class SDXLImagePipeline(torch.nn.Module):

        # IP-Adapter
        if ipadapter_images is not None:
+            if ipadapter_use_instant_style:
+                self.ipadapter.set_less_adapter()
+            else:
+                self.ipadapter.set_full_adapter()
            ipadapter_image_encoding = self.ipadapter_image_encoder(ipadapter_images)
            ipadapter_kwargs_list_posi = self.ipadapter(ipadapter_image_encoding, scale=ipadapter_scale)
            ipadapter_kwargs_list_nega = self.ipadapter(torch.zeros_like(ipadapter_image_encoding))
--- a/diffsynth/prompts/sdxl_prompter.py
+++ b/diffsynth/prompts/sdxl_prompter.py
@@ -41,6 +41,10 @@ class SDXLPrompter(Prompter):
        add_text_embeds, prompt_emb_2 = text_encoder_2(input_ids_2, clip_skip=clip_skip_2)

        # Merge
+        if prompt_emb_1.shape[0] != prompt_emb_2.shape[0]:
+            max_batch_size = min(prompt_emb_1.shape[0], prompt_emb_2.shape[0])
+            prompt_emb_1 = prompt_emb_1[: max_batch_size]
+            prompt_emb_2 = prompt_emb_2[: max_batch_size]
        prompt_emb = torch.concatenate([prompt_emb_1, prompt_emb_2], dim=-1)

        # For very long prompt, we only use the first 77 tokens to compute `add_text_embeds`.