Add files via upload

再改一次
This commit is contained in:
yrk111222
2024-10-22 09:56:03 +08:00
committed by GitHub
parent 157ba2e426
commit f6e676cdf9
46 changed files with 2525 additions and 0 deletions

View File

@@ -0,0 +1,135 @@
# ControlNet, LoRA, IP-Adapter — Precision Control Technology
Building on a text-to-image model, various adapter-based models can be used to control the generation process.
Let's download the models we'll be using in the upcoming examples:
* A highly praised Stable Diffusion XL architecture anime-style model
* A ControlNet model that supports multiple control modes
* A LoRA model for the Stable Diffusion XL model
* An IP-Adapter model and its corresponding image encoder
Note: the model names below are kept in their original English form, since they are exact model identifiers.
```python
# Download every model used in the examples below:
# the anime-style SDXL base model, the ControlNet union model,
# the Chinese-ink-style LoRA, and the IP-Adapter with its image encoder.
from diffsynth import download_models
download_models([
    "BluePencilXL_v200",
    "ControlNet_union_sdxl_promax",
    "SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0",
    "IP-Adapter-SDXL"
])
```
Use the basic text-to-image functionality to generate a picture.
```python
# Basic text-to-image generation with the anime-style SDXL model.
from diffsynth import ModelManager, SDXLImagePipeline
import torch
# Load the base checkpoint in fp16 on the GPU.
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
model_manager.load_models(["models/stable_diffusion_xl/bluePencilXL_v200.safetensors"])
pipe = SDXLImagePipeline.from_model_manager(model_manager)
# Fixed seed so the example is reproducible.
torch.manual_seed(1)
image = pipe(
    prompt="masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait,",
    negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
    cfg_scale=6, num_inference_steps=60,
)
image.save("image.jpg")
```
![image](https://github.com/user-attachments/assets/cc094e8f-ff6a-4f9e-ba05-7a5c2e0e609f)
Next, let's transform this graceful underwater dancer into a fire mage! We'll activate the ControlNet to maintain the structure of the image while modifying the prompt.
```python
# ControlNet structure transfer: regenerate the picture with a new prompt
# while keeping the composition of the previously generated image.
from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
import torch
from PIL import Image
# Load the base SDXL checkpoint plus the ControlNet union weights.
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
model_manager.load_models([
"models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
"models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors"
])
# "depth" selects the depth control mode of the union model; scale=1 applies full control strength.
pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1)
])
# Fixed seed for reproducibility.
torch.manual_seed(2)
image = pipe(
prompt="masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
cfg_scale=6, num_inference_steps=60,
controlnet_image=Image.open("image.jpg")  # the previous output guides the structure
)
image.save("image_controlnet.jpg")
```
![image_controlnet](https://github.com/user-attachments/assets/d50d173e-e81a-4d7e-93e3-b2787d69953e)
Isn't that cool? There's more! Add a LoRA to make the image closer to the flat style of hand-drawn comics. This LoRA requires certain trigger words to take effect, which is mentioned on the original author's model page. Remember to add the trigger words at the beginning of the prompt.
```python
# Add the Chinese-ink-style LoRA on top of the ControlNet setup to push the
# result toward a flat, hand-drawn comic look.
from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
import torch
from PIL import Image
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
model_manager.load_models([
"models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
"models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors"
])
# lora_alpha controls how strongly the LoRA weights are applied.
model_manager.load_lora("models/lora/zyd232_ChineseInkStyle_SDXL_v1_0.safetensors", lora_alpha=1.0)
pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1.0)
])
torch.manual_seed(3)
# "zydink, ink sketch, flat anime" are the LoRA's trigger words and must
# appear at the beginning of the prompt for the style to take effect.
image = pipe(
prompt="zydink, ink sketch, flat anime, masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
cfg_scale=6, num_inference_steps=60,
controlnet_image=Image.open("image.jpg")
)
image.save("image_lora.jpg")
```
![image_lora](https://github.com/user-attachments/assets/c599b2f8-8351-4be5-a6ae-8380889cb9d8)
Not done yet! Find a Chinese painting with ink-wash style as a style guide, activate the IP-Adapter, and let classical art collide with modern aesthetics!
| Let's use this image as a style guide. |![ink_style](https://github.com/user-attachments/assets/e47c5a03-9c7b-402b-b260-d8bfd56abbc5)|
|-|-|
```python
# Combine ControlNet, LoRA, and IP-Adapter: the IP-Adapter transfers the
# style of a reference image (here an ink-wash painting) into the generation.
from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
import torch
from PIL import Image
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
# Load the SDXL checkpoint, ControlNet union, IP-Adapter, and its image encoder.
model_manager.load_models([
"models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
"models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors",
"models/IpAdapter/stable_diffusion_xl/ip-adapter_sdxl.bin",
"models/IpAdapter/stable_diffusion_xl/image_encoder/model.safetensors",
])
model_manager.load_lora("models/lora/zyd232_ChineseInkStyle_SDXL_v1_0.safetensors", lora_alpha=1.0)
pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1.0)
])
torch.manual_seed(2)
image = pipe(
prompt="zydink, ink sketch, flat anime, masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
cfg_scale=6, num_inference_steps=60,
controlnet_image=Image.open("image.jpg"),
# Style reference; ipadapter_scale balances style transfer against the prompt.
ipadapter_images=[Image.open("ink_style.jpg")],
ipadapter_use_instant_style=True, ipadapter_scale=0.5
)
image.save("image_ipadapter.jpg")
```
![image_ipadapter](https://github.com/user-attachments/assets/e5924aef-03b0-4462-811f-a60e2523fd7f)
The joy of generating images with Diffusion lies in the combination of various ecosystem models, which can realize all kinds of creative ideas.

View File

@@ -0,0 +1,64 @@
# Text-to-Image, Image-to-Image, and High-Resolution Restoration — First Encounter with the Dazzling Diffusion
Load the text-to-image model, here we use an anime-style model from Civitai as an example.
```python
import torch
from diffsynth import ModelManager, SDImagePipeline, download_models
# Fetch the anime-style Stable Diffusion checkpoint, then load it in fp16 on the GPU.
download_models(["AingDiffusion_v12"])
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
model_manager.load_models(["models/stable_diffusion/aingdiffusion_v12.safetensors"])
pipe = SDImagePipeline.from_model_manager(model_manager)
```
Generate a picture to give it a try.
```python
# Text-to-image: fixed seed for a reproducible result.
torch.manual_seed(0)
image = pipe(
prompt="masterpiece, best quality, a girl with long silver hair",
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
height=512, width=512, num_inference_steps=80,
)
image.save("image.jpg")
```
Ah, a lovely young lady.
![image](https://github.com/user-attachments/assets/999100d2-1c39-4f18-b37e-aa9d5b4e519c)
Use the image-to-image feature to turn her hair red, simply by adding `input_image` and `denoising_strength` as parameters. The `denoising_strength` controls the intensity of the noise added: when set to 0, the generated image is identical to the input image; when set to 1, it is completely randomly generated.
```python
# Image-to-image: denoising_strength=0.6 keeps most of the input image's
# composition while letting the prompt change the hair color.
torch.manual_seed(1)
image_edited = pipe(
prompt="masterpiece, best quality, a girl with long red hair",
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
height=512, width=512, num_inference_steps=80,
input_image=image, denoising_strength=0.6,
)
image_edited.save("image_edited.jpg")
```
Ah, a cute girl with red hair.
![image_edited](https://github.com/user-attachments/assets/e3de8bc1-037f-4d4d-aacf-8919143c2375)
Since the model itself was trained at a resolution of 512*512, the image appears a bit blurry. However, we can utilize the model's own capabilities to refine the image and add details. Specifically, this involves increasing the resolution and then using image-to-image generation.
```python
# High-resolution restoration: upscale the edited image to 1024x1024 and run
# image-to-image again so the model fills in detail at the higher resolution.
torch.manual_seed(2)
image_highres = pipe(
prompt="masterpiece, best quality, a girl with long red hair",
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
height=1024, width=1024, num_inference_steps=80,
input_image=image_edited.resize((1024, 1024)), denoising_strength=0.6,
)
image_highres.save("image_highres.jpg")
```
Ah, a clear and lovely girl with red hair.
![image_highres](https://github.com/user-attachments/assets/4466353e-662c-49f5-9211-b11bb0bb7fb7)
It's worth noting that the image-to-image and high-resolution restoration features are globally supported, and currently, all of our image generation pipelines can be used in this way.

View File

@@ -0,0 +1,77 @@
# Translation and Polishing — The Magic of Prompt Words
When generating images, we need to write prompt words to describe the content of the image. Prompt words directly affect the outcome of the generation, but crafting them is also an art. Good prompt words can produce images with a high degree of aesthetic appeal. We offer a range of models to help users handle prompt words effectively.
## Translation
Most text-to-image models currently only support English prompt words, which can be challenging for users who are not native English speakers. To address this, we can use open-source translation models to translate the prompt words into English. In the following example, we take "一个女孩" (a girl) as the prompt word and use the model opus-mt-zh-en (which can be downloaded from [HuggingFace](https://huggingface.co/Helsinki-NLP/opus-mt-zh-en) or [ModelScope](https://modelscope.cn/models/moxying/opus-mt-zh-en)) for translation.
```python
# Automatic prompt translation: the Translator refiner runs the
# opus-mt-zh-en model on the prompt before image generation.
from diffsynth import ModelManager, SDXLImagePipeline, Translator
import torch
model_manager = ModelManager(
torch_dtype=torch.float16, device="cuda",
model_id_list=["BluePencilXL_v200", "opus-mt-zh-en"]
)
pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator])
torch.manual_seed(0)
prompt = "一个女孩"  # Chinese for "a girl"; translated to English by the refiner
image = pipe(
prompt=prompt, negative_prompt="",
height=1024, width=1024, num_inference_steps=30
)
image.save("image_1.jpg")
```
![image_1](https://github.com/user-attachments/assets/c8070a6b-3d2f-4faf-a806-c403b91f1a94)
## Polishing
Detailed prompt words can generate images with richer details. We can use a prompt polishing model such as BeautifulPrompt (which can be downloaded from [HuggingFace](https://huggingface.co/alibaba-pai/pai-bloom-1b1-text2prompt-sd) or [ModelScope](https://modelscope.cn/models/AI-ModelScope/pai-bloom-1b1-text2prompt-sd)) to embellish simple prompt words. This model can make the overall picture style more gorgeous.
This module can be activated simultaneously with the translation module, but please pay attention to the order: translate first, then polish.
```python
# Translation plus polishing: refiners run in list order, so the prompt is
# translated to English first, then embellished by BeautifulPrompt.
from diffsynth import ModelManager, SDXLImagePipeline, Translator, BeautifulPrompt
import torch
model_manager = ModelManager(
torch_dtype=torch.float16, device="cuda",
model_id_list=["BluePencilXL_v200", "opus-mt-zh-en", "BeautifulPrompt"]
)
pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator, BeautifulPrompt])
torch.manual_seed(0)
prompt = "一个女孩"  # Chinese for "a girl"
image = pipe(
prompt=prompt, negative_prompt="",
height=1024, width=1024, num_inference_steps=30
)
image.save("image_2.jpg")
```
![image_2](https://github.com/user-attachments/assets/94f64a7d-b14a-41e2-a013-c9a74635a84d)
We have also integrated a Tongyi Qwen model that can seamlessly complete the translation and polishing of prompt words in one step.
```python
# One-step alternative: the QwenPrompt refiner handles translation and
# polishing together with a single Tongyi Qwen model.
from diffsynth import ModelManager, SDXLImagePipeline, QwenPrompt
import torch
model_manager = ModelManager(
torch_dtype=torch.float16, device="cuda",
model_id_list=["BluePencilXL_v200", "QwenPrompt"]
)
pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[QwenPrompt])
torch.manual_seed(0)
prompt = "一个女孩"  # Chinese for "a girl"
image = pipe(
prompt=prompt, negative_prompt="",
height=1024, width=1024, num_inference_steps=30
)
image.save("image_3.jpg")
```
![image_3](https://github.com/user-attachments/assets/fc1a201d-aef1-4e6a-81d6-2e2249ffa230)

View File

@@ -0,0 +1,95 @@
# When Image Models Meet AnimateDiff — Model Combination Technology
We have already witnessed the powerful image generation capabilities of the Stable Diffusion model and its ecosystem models. Now, we introduce a new module: AnimateDiff, which allows us to transfer the capabilities of image models to videos. In this article, we showcase an anime-style video rendering solution built on DiffSynth-Studio: Diffutoon.
## Download Models
The following examples will use many models, so let's download them first.
* An anime-style Stable Diffusion architecture model
* Two ControlNet models
* A Textual Inversion model
* An AnimateDiff model
```python
# Download the models used by the Diffutoon example: the anime SD checkpoint,
# the AnimateDiff motion module, two ControlNets, and a negative Textual Inversion.
from diffsynth import download_models
download_models([
"AingDiffusion_v12",
"AnimateDiff_v2",
"ControlNet_v11p_sd15_lineart",
"ControlNet_v11f1e_sd15_tile",
"TextualInversion_VeryBadImageNegative_v1.3"
])
```
## Download Video
You can choose any video you like. We use [this video](https://www.bilibili.com/video/BV1iG411a7sQ) as a demonstration. You can download this video file with the following command, but please note, do not use it for commercial purposes without obtaining the commercial copyright from the original video creator.
```
modelscope download --dataset Artiprocher/examples_in_diffsynth data/examples/diffutoon/input_video.mp4 --local_dir ./
```
## Generate Anime
```python
from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video
import torch
# Load models: SD checkpoint, AnimateDiff motion module, and two ControlNets.
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
model_manager.load_models([
"models/stable_diffusion/aingdiffusion_v12.safetensors",
"models/AnimateDiff/mm_sd_v15_v2.ckpt",
"models/ControlNet/control_v11p_sd15_lineart.pth",
"models/ControlNet/control_v11f1e_sd15_tile.pth",
])
# Build pipeline: tile and lineart ControlNets are both applied at half strength.
pipe = SDVideoPipeline.from_model_manager(
model_manager,
[
ControlNetConfigUnit(
processor_id="tile",
model_path="models/ControlNet/control_v11f1e_sd15_tile.pth",
scale=0.5
),
ControlNetConfigUnit(
processor_id="lineart",
model_path="models/ControlNet/control_v11p_sd15_lineart.pth",
scale=0.5
)
]
)
# The Textual Inversion embedding is referenced by name in the negative prompt below.
pipe.prompter.load_textual_inversions(["models/textual_inversion/verybadimagenegative_v1.3.pt"])
# Load video, resized/cropped to 1536x1536 frames.
video = VideoData(
video_file="data/examples/diffutoon/input_video.mp4",
height=1536, width=1536
)
# Render only the first 30 frames of the input video.
input_video = [video[i] for i in range(30)]
# Generate: the same frames serve as both the img2img input and the ControlNet guidance.
torch.manual_seed(0)
output_video = pipe(
prompt="best quality, perfect anime illustration, light, a girl is dancing, smile, solo",
negative_prompt="verybadimagenegative_v1.3",
cfg_scale=7, clip_skip=2,
input_frames=input_video, denoising_strength=1.0,
controlnet_frames=input_video, num_frames=len(input_video),
num_inference_steps=10, height=1536, width=1536,
# NOTE(review): frames appear to be processed in overlapping windows of 16 with
# stride 8 — presumably for temporal consistency; confirm against DiffSynth docs.
animatediff_batch_size=16, animatediff_stride=8,
)
# Save video at 30 fps.
save_video(output_video, "output_video.mp4", fps=30)
```
## Effect Display
<video width="512" height="256" controls>
<source src="https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/b54c05c5-d747-4709-be5e-b39af82404dd" type="video/mp4">
Your browser does not support the Video tag.
</video>