From e88328d152fb0e680cae2ea9143025665f1c4d27 Mon Sep 17 00:00:00 2001 From: mi804 <1576993271@qq.com> Date: Mon, 31 Mar 2025 14:29:15 +0800 Subject: [PATCH] support infiniteyou --- README.md | 2 + diffsynth/configs/model_config.py | 23 +++++ diffsynth/models/flux_controlnet.py | 2 + diffsynth/models/flux_infiniteyou.py | 128 +++++++++++++++++++++++++++ diffsynth/pipelines/flux_image.py | 89 ++++++++++++++++++- examples/InfiniteYou/README.md | 7 ++ examples/InfiniteYou/infiniteyou.py | 54 +++++++++++ 7 files changed, 304 insertions(+), 1 deletion(-) create mode 100644 diffsynth/models/flux_infiniteyou.py create mode 100644 examples/InfiniteYou/README.md create mode 100644 examples/InfiniteYou/infiniteyou.py diff --git a/README.md b/README.md index b9d2214..2765900 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,8 @@ Until now, DiffSynth-Studio has supported the following models: * [Stable Diffusion](https://huggingface.co/runwayml/stable-diffusion-v1-5) ## News +- **March 31, 2025** We support InfiniteYou, an identity preserving method for FLUX. Please refer to [./examples/InfiniteYou/](./examples/InfiniteYou/) for more details. + - **March 25, 2025** 🔥🔥🔥 Our new open-source project, [DiffSynth-Engine](https://github.com/modelscope/DiffSynth-Engine), is now open-sourced! Focused on stable model deployment. Geared towards industry. Offers better engineering support, higher computational performance, and more stable functionality. - **March 13, 2025** We support HunyuanVideo-I2V, the image-to-video generation version of HunyuanVideo open-sourced by Tencent. Please refer to [./examples/HunyuanVideo/](./examples/HunyuanVideo/) for more details. 
diff --git a/diffsynth/configs/model_config.py b/diffsynth/configs/model_config.py index 15dcbed..969afae 100644 --- a/diffsynth/configs/model_config.py +++ b/diffsynth/configs/model_config.py @@ -37,6 +37,7 @@ from ..models.flux_text_encoder import FluxTextEncoder2 from ..models.flux_vae import FluxVAEEncoder, FluxVAEDecoder from ..models.flux_controlnet import FluxControlNet from ..models.flux_ipadapter import FluxIpAdapter +from ..models.flux_infiniteyou import InfiniteYouImageProjector from ..models.cog_vae import CogVAEEncoder, CogVAEDecoder from ..models.cog_dit import CogDiT @@ -104,6 +105,8 @@ model_loader_configs = [ (None, "b001c89139b5f053c715fe772362dd2a", ["flux_controlnet"], [FluxControlNet], "diffusers"), (None, "52357cb26250681367488a8954c271e8", ["flux_controlnet"], [FluxControlNet], "diffusers"), (None, "0cfd1740758423a2a854d67c136d1e8c", ["flux_controlnet"], [FluxControlNet], "diffusers"), + (None, "7f9583eb8ba86642abb9a21a4b2c9e16", ["flux_controlnet"], [FluxControlNet], "diffusers"), + (None, "c07c0f04f5ff55e86b4e937c7a40d481", ["infiniteyou_image_projector"], [InfiniteYouImageProjector], "diffusers"), (None, "4daaa66cc656a8fe369908693dad0a35", ["flux_ipadapter"], [FluxIpAdapter], "diffusers"), (None, "51aed3d27d482fceb5e0739b03060e8f", ["sd3_dit", "sd3_vae_encoder", "sd3_vae_decoder"], [SD3DiT, SD3VAEEncoder, SD3VAEDecoder], "civitai"), (None, "98cc34ccc5b54ae0e56bdea8688dcd5a", ["sd3_text_encoder_2"], [SD3TextEncoder2], "civitai"), @@ -598,6 +601,25 @@ preset_models_on_modelscope = { "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder", ], }, + "InfiniteYou":{ + "file_list":[ + ("ByteDance/InfiniteYou", "infu_flux_v1.0/aes_stage2/InfuseNetModel/diffusion_pytorch_model-00001-of-00002.safetensors", "models/InfiniteYou/InfuseNetModel"), + ("ByteDance/InfiniteYou", "infu_flux_v1.0/aes_stage2/InfuseNetModel/diffusion_pytorch_model-00002-of-00002.safetensors", "models/InfiniteYou/InfuseNetModel"), + ("ByteDance/InfiniteYou", 
"infu_flux_v1.0/aes_stage2/image_proj_model.bin", "models/InfiniteYou"), + ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/1k3d68.onnx", "models/InfiniteYou/insightface/models/antelopev2"), + ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/2d106det.onnx", "models/InfiniteYou/insightface/models/antelopev2"), + ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/genderage.onnx", "models/InfiniteYou/insightface/models/antelopev2"), + ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/glintr100.onnx", "models/InfiniteYou/insightface/models/antelopev2"), + ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/scrfd_10g_bnkps.onnx", "models/InfiniteYou/insightface/models/antelopev2"), + ], + "load_path":[ + [ + "models/InfiniteYou/InfuseNetModel/diffusion_pytorch_model-00001-of-00002.safetensors", + "models/InfiniteYou/InfuseNetModel/diffusion_pytorch_model-00002-of-00002.safetensors" + ], + "models/InfiniteYou/image_proj_model.bin", + ], + }, # ESRGAN "ESRGAN_x4": [ ("AI-ModelScope/Real-ESRGAN", "RealESRGAN_x4.pth", "models/ESRGAN"), @@ -757,6 +779,7 @@ Preset_model_id: TypeAlias = Literal[ "Shakker-Labs/FLUX.1-dev-ControlNet-Depth", "Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro", "InstantX/FLUX.1-dev-IP-Adapter", + "InfiniteYou", "SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0", "QwenPrompt", "OmostPrompt", diff --git a/diffsynth/models/flux_controlnet.py b/diffsynth/models/flux_controlnet.py index d812e6c..7bc3dc0 100644 --- a/diffsynth/models/flux_controlnet.py +++ b/diffsynth/models/flux_controlnet.py @@ -318,6 +318,8 @@ class FluxControlNetStateDictConverter: extra_kwargs = {"num_joint_blocks": 6, "num_single_blocks": 0, "additional_input_dim": 4} elif hash_value == "0cfd1740758423a2a854d67c136d1e8c": extra_kwargs = {"num_joint_blocks": 4, "num_single_blocks": 1} + elif hash_value == "7f9583eb8ba86642abb9a21a4b2c9e16": + extra_kwargs = {"num_joint_blocks": 4, "num_single_blocks": 10} else: 
extra_kwargs = {} return state_dict_, extra_kwargs diff --git a/diffsynth/models/flux_infiniteyou.py b/diffsynth/models/flux_infiniteyou.py new file mode 100644 index 0000000..2015de4 --- /dev/null +++ b/diffsynth/models/flux_infiniteyou.py @@ -0,0 +1,128 @@ +import math +import torch +import torch.nn as nn + + +# FFN +def FeedForward(dim, mult=4): + inner_dim = int(dim * mult) + return nn.Sequential( + nn.LayerNorm(dim), + nn.Linear(dim, inner_dim, bias=False), + nn.GELU(), + nn.Linear(inner_dim, dim, bias=False), + ) + + +def reshape_tensor(x, heads): + bs, length, width = x.shape + #(bs, length, width) --> (bs, length, n_heads, dim_per_head) + x = x.view(bs, length, heads, -1) + # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head) + x = x.transpose(1, 2) + # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head) + x = x.reshape(bs, heads, length, -1) + return x + + +class PerceiverAttention(nn.Module): + + def __init__(self, *, dim, dim_head=64, heads=8): + super().__init__() + self.scale = dim_head**-0.5 + self.dim_head = dim_head + self.heads = heads + inner_dim = dim_head * heads + + self.norm1 = nn.LayerNorm(dim) + self.norm2 = nn.LayerNorm(dim) + + self.to_q = nn.Linear(dim, inner_dim, bias=False) + self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False) + self.to_out = nn.Linear(inner_dim, dim, bias=False) + + def forward(self, x, latents): + """ + Args: + x (torch.Tensor): image features + shape (b, n1, D) + latent (torch.Tensor): latent features + shape (b, n2, D) + """ + x = self.norm1(x) + latents = self.norm2(latents) + + b, l, _ = latents.shape + + q = self.to_q(latents) + kv_input = torch.cat((x, latents), dim=-2) + k, v = self.to_kv(kv_input).chunk(2, dim=-1) + + q = reshape_tensor(q, self.heads) + k = reshape_tensor(k, self.heads) + v = reshape_tensor(v, self.heads) + + # attention + scale = 1 / math.sqrt(math.sqrt(self.dim_head)) + weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with 
f16 than dividing afterwards + weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype) + out = weight @ v + + out = out.permute(0, 2, 1, 3).reshape(b, l, -1) + + return self.to_out(out) + + +class InfiniteYouImageProjector(nn.Module): + + def __init__( + self, + dim=1280, + depth=4, + dim_head=64, + heads=20, + num_queries=8, + embedding_dim=512, + output_dim=4096, + ff_mult=4, + ): + super().__init__() + self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5) + self.proj_in = nn.Linear(embedding_dim, dim) + + self.proj_out = nn.Linear(dim, output_dim) + self.norm_out = nn.LayerNorm(output_dim) + + self.layers = nn.ModuleList([]) + for _ in range(depth): + self.layers.append( + nn.ModuleList([ + PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads), + FeedForward(dim=dim, mult=ff_mult), + ])) + + def forward(self, x): + + latents = self.latents.repeat(x.size(0), 1, 1) + + x = self.proj_in(x) + + for attn, ff in self.layers: + latents = attn(x, latents) + latents + latents = ff(latents) + latents + + latents = self.proj_out(latents) + return self.norm_out(latents) + + @staticmethod + def state_dict_converter(): + return FluxInfiniteYouImageProjectorStateDictConverter() + + +class FluxInfiniteYouImageProjectorStateDictConverter: + + def __init__(self): + pass + + def from_diffusers(self, state_dict): + return state_dict['image_proj'] diff --git a/diffsynth/pipelines/flux_image.py b/diffsynth/pipelines/flux_image.py index 7303dff..866a0c2 100644 --- a/diffsynth/pipelines/flux_image.py +++ b/diffsynth/pipelines/flux_image.py @@ -4,10 +4,12 @@ from ..prompters import FluxPrompter from ..schedulers import FlowMatchScheduler from .base import BasePipeline from typing import List +import math import torch from tqdm import tqdm import numpy as np from PIL import Image +import cv2 from ..models.tiler import FastTileWorker from transformers import SiglipVisionModel from copy import deepcopy @@ -162,6 +164,20 @@ class 
FluxImagePipeline(BasePipeline): self.ipadapter = model_manager.fetch_model("flux_ipadapter") self.ipadapter_image_encoder = model_manager.fetch_model("siglip_vision_model") + # InfiniteYou + self.image_proj_model = model_manager.fetch_model("infiniteyou_image_projector") + if self.image_proj_model is not None: + from facexlib.recognition import init_recognition_model + from insightface.app import FaceAnalysis + insightface_root_path = 'models/InfiniteYou/insightface' + self.app_640 = FaceAnalysis(name='antelopev2', root=insightface_root_path, providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) + self.app_640.prepare(ctx_id=0, det_size=(640, 640)) + self.app_320 = FaceAnalysis(name='antelopev2', root=insightface_root_path, providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) + self.app_320.prepare(ctx_id=0, det_size=(320, 320)) + self.app_160 = FaceAnalysis(name='antelopev2', root=insightface_root_path, providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) + self.app_160.prepare(ctx_id=0, det_size=(160, 160)) + self.arcface_model = init_recognition_model('arcface', device=self.device) + @staticmethod def from_model_manager(model_manager: ModelManager, controlnet_config_units: List[ControlNetConfigUnit]=[], prompt_refiner_classes=[], prompt_extender_classes=[], device=None, torch_dtype=None): @@ -337,6 +353,66 @@ class FluxImagePipeline(BasePipeline): return eligen_kwargs_posi, eligen_kwargs_nega, fg_mask, bg_mask + def draw_kps(image_pil, kps, color_list=[(255,0,0), (0,255,0), (0,0,255), (255,255,0), (255,0,255)]): + stickwidth = 4 + limbSeq = np.array([[0, 2], [1, 2], [3, 2], [4, 2]]) + kps = np.array(kps) + w, h = image_pil.size + out_img = np.zeros([h, w, 3]) + for i in range(len(limbSeq)): + index = limbSeq[i] + color = color_list[index[0]] + x = kps[index][:, 0] + y = kps[index][:, 1] + length = ((x[0] - x[1]) ** 2 + (y[0] - y[1]) ** 2) ** 0.5 + angle = math.degrees(math.atan2(y[0] - y[1], x[0] - x[1])) + polygon = 
cv2.ellipse2Poly((int(np.mean(x)), int(np.mean(y))), (int(length / 2), stickwidth), int(angle), 0, 360, 1) + out_img = cv2.fillConvexPoly(out_img.copy(), polygon, color) + out_img = (out_img * 0.6).astype(np.uint8) + for idx_kp, kp in enumerate(kps): + color = color_list[idx_kp] + out_img = cv2.circle(out_img.copy(), (int(kp[0]), int(kp[1])), 10, color, -1) + out_img_pil = Image.fromarray(out_img.astype(np.uint8)) + return out_img_pil + + + def extract_arcface_bgr_embedding(self, in_image, landmark): + from insightface.utils import face_align + arc_face_image = face_align.norm_crop(in_image, landmark=np.array(landmark), image_size=112) + arc_face_image = torch.from_numpy(arc_face_image).unsqueeze(0).permute(0, 3, 1, 2) / 255. + arc_face_image = 2 * arc_face_image - 1 + arc_face_image = arc_face_image.contiguous().to(self.device) + face_emb = self.arcface_model(arc_face_image)[0] # [512], normalized + return face_emb + + + def _detect_face(self, id_image_cv2): + face_info = self.app_640.get(id_image_cv2) + if len(face_info) > 0: + return face_info + face_info = self.app_320.get(id_image_cv2) + if len(face_info) > 0: + return face_info + face_info = self.app_160.get(id_image_cv2) + return face_info + + + def prepare_infinite_you(self, id_image, controlnet_image, controlnet_guidance, height, width): + if id_image is None: + return {'id_emb': None}, controlnet_image + id_image_cv2 = cv2.cvtColor(np.array(id_image), cv2.COLOR_RGB2BGR) + face_info = self._detect_face(id_image_cv2) + if len(face_info) == 0: + raise ValueError('No face detected in the input ID image') + landmark = sorted(face_info, key=lambda x:(x['bbox'][2]-x['bbox'][0])*(x['bbox'][3]-x['bbox'][1]))[-1]['kps'] # only use the maximum face + id_emb = self.extract_arcface_bgr_embedding(id_image_cv2, landmark) + id_emb = self.image_proj_model(id_emb.unsqueeze(0).reshape([1, -1, 512]).to(dtype=self.torch_dtype)) + if controlnet_image is None: + controlnet_image = Image.fromarray(np.zeros([height, width, 
3]).astype(np.uint8)) + controlnet_guidance = torch.Tensor([controlnet_guidance]).to(device=self.device, dtype=self.torch_dtype) + return {'id_emb': id_emb, 'controlnet_guidance': controlnet_guidance}, controlnet_image + + def prepare_prompts(self, prompt, local_prompts, masks, mask_scales, t5_sequence_length, negative_prompt, cfg_scale): # Extend prompt self.load_models_to_device(['text_encoder_1', 'text_encoder_2']) @@ -374,6 +450,7 @@ class FluxImagePipeline(BasePipeline): controlnet_image=None, controlnet_inpaint_mask=None, enable_controlnet_on_negative=False, + controlnet_guidance=1.0, # IP-Adapter ipadapter_images=None, ipadapter_scale=1.0, @@ -382,6 +459,8 @@ class FluxImagePipeline(BasePipeline): eligen_entity_masks=None, enable_eligen_on_negative=False, enable_eligen_inpaint=False, + # InfiniteYou + id_image=None, # TeaCache tea_cache_l1_thresh=None, # Tile @@ -409,6 +488,9 @@ class FluxImagePipeline(BasePipeline): # Extra input extra_input = self.prepare_extra_input(latents, guidance=embedded_guidance) + # InfiniteYou + infiniteyou_kwargs, controlnet_image = self.prepare_infinite_you(id_image, controlnet_image, controlnet_guidance, height, width) + # Entity control eligen_kwargs_posi, eligen_kwargs_nega, fg_mask, bg_mask = self.prepare_eligen(prompt_emb_nega, eligen_entity_prompts, eligen_entity_masks, width, height, t5_sequence_length, enable_eligen_inpaint, enable_eligen_on_negative, cfg_scale) @@ -430,7 +512,7 @@ class FluxImagePipeline(BasePipeline): inference_callback = lambda prompt_emb_posi, controlnet_kwargs: lets_dance_flux( dit=self.dit, controlnet=self.controlnet, hidden_states=latents, timestep=timestep, - **prompt_emb_posi, **tiler_kwargs, **extra_input, **controlnet_kwargs, **ipadapter_kwargs_list_posi, **eligen_kwargs_posi, **tea_cache_kwargs, + **prompt_emb_posi, **tiler_kwargs, **extra_input, **controlnet_kwargs, **ipadapter_kwargs_list_posi, **eligen_kwargs_posi, **tea_cache_kwargs, **infiniteyou_kwargs ) noise_pred_posi = 
self.control_noise_via_local_prompts( prompt_emb_posi, prompt_emb_locals, masks, mask_scales, inference_callback, @@ -529,6 +611,8 @@ def lets_dance_flux( entity_prompt_emb=None, entity_masks=None, ipadapter_kwargs_list={}, + id_emb=None, + controlnet_guidance=None, tea_cache: TeaCache = None, **kwargs ): @@ -573,6 +657,9 @@ def lets_dance_flux( "tile_size": tile_size, "tile_stride": tile_stride, } + if id_emb is not None: + controlnet_text_ids = torch.zeros(id_emb.shape[0], id_emb.shape[1], 3).to(device=hidden_states.device, dtype=hidden_states.dtype) + controlnet_extra_kwargs.update({"prompt_emb": id_emb, 'text_ids': controlnet_text_ids, 'guidance': controlnet_guidance}) controlnet_res_stack, controlnet_single_res_stack = controlnet( controlnet_frames, **controlnet_extra_kwargs ) diff --git a/examples/InfiniteYou/README.md b/examples/InfiniteYou/README.md new file mode 100644 index 0000000..5ce4946 --- /dev/null +++ b/examples/InfiniteYou/README.md @@ -0,0 +1,7 @@ +# InfiniteYou: Flexible Photo Recrafting While Preserving Your Identity +We support the identity preserving feature of InfiniteYou. See [./infiniteyou.py](./infiniteyou.py) for example. The visualization of the result is shown below. 
+ +|Identity Image|Generated Image| +|-|-| +|![man_id](https://github.com/user-attachments/assets/bbc38a91-966e-49e8-a0d7-c5467582ad1f)|![man](https://github.com/user-attachments/assets/0decd5e1-5f65-437c-98fa-90991b6f23c1)| +|![woman_id](https://github.com/user-attachments/assets/b2894695-690e-465b-929c-61e5dc57feeb)|![woman](https://github.com/user-attachments/assets/67cc7496-c4d3-4de1-a8f1-9eb4991d95e8)| diff --git a/examples/InfiniteYou/infiniteyou.py b/examples/InfiniteYou/infiniteyou.py new file mode 100644 index 0000000..035e738 --- /dev/null +++ b/examples/InfiniteYou/infiniteyou.py @@ -0,0 +1,54 @@ +import importlib +import torch +from diffsynth import ModelManager, FluxImagePipeline, download_models, ControlNetConfigUnit +from modelscope import dataset_snapshot_download +from PIL import Image + +if importlib.util.find_spec("facexlib") is None: + raise ImportError("You are using InfiniteYou. It depends on facexlib, which is not installed. Please install it with `pip install facexlib`.") +if importlib.util.find_spec("insightface") is None: + raise ImportError("You are using InfiniteYou. It depends on insightface, which is not installed. 
Please install it with `pip install insightface`.") + +download_models(["InfiniteYou"]) +model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cuda", model_id_list=["FLUX.1-dev"]) +model_manager.load_models([ + [ + "models/InfiniteYou/InfuseNetModel/diffusion_pytorch_model-00001-of-00002.safetensors", + "models/InfiniteYou/InfuseNetModel/diffusion_pytorch_model-00002-of-00002.safetensors" + ], + "models/InfiniteYou/image_proj_model.bin", +]) + + +pipe = FluxImagePipeline.from_model_manager( + model_manager, + controlnet_config_units=[ + ControlNetConfigUnit(processor_id="none", + model_path=[ + 'models/InfiniteYou/InfuseNetModel/diffusion_pytorch_model-00001-of-00002.safetensors', + 'models/InfiniteYou/InfuseNetModel/diffusion_pytorch_model-00002-of-00002.safetensors' + ], + scale=1.0)]) +dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern=f"data/examples/infiniteyou/*") + +prompt = "A man, portrait, cinematic" +id_image = "data/examples/infiniteyou/man.jpg" +id_image = Image.open(id_image).convert('RGB') +image = pipe( + prompt=prompt, seed=1, + id_image=id_image, controlnet_guidance=1.0, + num_inference_steps=50, embedded_guidance=3.5, + height=1024, width=1024, +) +image.save("man.jpg") + +prompt = "A woman, portrait, cinematic" +id_image = "data/examples/infiniteyou/woman.jpg" +id_image = Image.open(id_image).convert('RGB') +image = pipe( + prompt=prompt, seed=1, + id_image=id_image, controlnet_guidance=1.0, + num_inference_steps=50, embedded_guidance=3.5, + height=1024, width=1024, +) +image.save("woman.jpg")