From 4e1cea64addb5581e185e61823bf5d6f102f0e2b Mon Sep 17 00:00:00 2001 From: tc2000731 Date: Fri, 25 Oct 2024 17:32:29 +0800 Subject: [PATCH] update HunyuanDiT.md Kolors.md --- docs/source/model/HunyuanDiT.md | 59 ++++++++++++++++++++++++++++++++- docs/source/model/Kolors.md | 48 ++++++++++++++++++++++++++- 2 files changed, 105 insertions(+), 2 deletions(-) diff --git a/docs/source/model/HunyuanDiT.md b/docs/source/model/HunyuanDiT.md index 4df2413..457cc43 100644 --- a/docs/source/model/HunyuanDiT.md +++ b/docs/source/model/HunyuanDiT.md @@ -1 +1,58 @@ -# Hunyuan-DiT \ No newline at end of file +# Hunyuan-DiT + +## 相关链接 + +* 论文:[Hunyuan-DiT : A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding](https://arxiv.org/pdf/2405.08748) +* 模型 + * HunyuanDiT + * [HuggingFace](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT) + * [ModelScope](https://modelscope.cn/models/modelscope/HunyuanDiT) + * HunyuanDiT-v1.1 + * [HuggingFace](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.1) + * HunyuanDiT-v1.2 + * [HuggingFace](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2) + * Distillation + * [HuggingFace](https://huggingface.co/Tencent-Hunyuan/Distillation) + * Distillation-v1.1 + * [HuggingFace](https://huggingface.co/Tencent-Hunyuan/Distillation-v1.1) + * Distillation-v1.2 + * [HuggingFace](https://huggingface.co/Tencent-Hunyuan/Distillation-v1.2) +* 项目页面: https://dit.hunyuan.tencent.com/ + +## 模型介绍 + +Hunyuan-DiT是一种基于传统DiT架构的扩散模型, 为了加强模型对中文的细粒度(fine-grained)理解能力, Hunyuan-DiT对Transformer在多个方面进行了改进. 在类别条件(class-conditional)的 DiT 中使用的自适应层归一化(Adaptive Layer Norm)在强制执行细粒度文本条件方面表现不好, 为此Hunyuan-DiT采用了与Stable Diffusion 相似的交叉注意力机制. Hunyuan-DiT接受VAE潜在空间的向量作为输入, 将它分割成小块后经过线性层得到后续用于transformer块的标记. 在每个Hunyuan-DiT Block中包含三个模块, 自注意力(self-attention), 交叉注意力(cross-attention), 和前馈网络(feed-forward network, FFN). 
+ +![image](https://github.com/user-attachments/assets/50f3eb1f-855d-4095-88fb-c03711f4c7ae) + +为了加强训练的稳定性, Hunyuan-DiT采用了QK-Norm, 在注意力层计算QKV前加入层归一化, 并且在decoder block的skip module后加入层归一化避免损失爆炸(loss explosion). + +Hunyuan-DiT的生成效果: +![image](https://github.com/user-attachments/assets/4c11be16-c7ac-45a1-a900-b620606eb2c4) + +## 代码样例 + +```python +from diffsynth import ModelManager, HunyuanDiTImagePipeline, download_models +import torch + +download_models(["HunyuanDiT"]) + +model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") +model_manager.load_models([ + "models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin", + "models/HunyuanDiT/t2i/mt5/pytorch_model.bin", + "models/HunyuanDiT/t2i/model/pytorch_model_ema.pt", + "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin" +]) +pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager) + +prompt = "一幅细致的油画描绘了一只年轻獾轻轻嗅着一朵明亮的黄色玫瑰时错综复杂的皮毛。背景是一棵大树干的粗糙纹理,獾的爪子轻轻地挖进树皮。在柔和的背景中,一个宁静的瀑布倾泻而下,它的水在绿色植物中闪烁着蓝色。" + +torch.manual_seed(0) +image = pipe( + prompt=prompt, + num_inference_steps=50, height=1024, width=1024, +) +image.save("image_1024.png") +``` \ No newline at end of file diff --git a/docs/source/model/Kolors.md b/docs/source/model/Kolors.md index e7fa7a9..5eafa0a 100644 --- a/docs/source/model/Kolors.md +++ b/docs/source/model/Kolors.md @@ -1 +1,47 @@ -# Kolors \ No newline at end of file +# Kolors + +## 相关链接 + +* 论文:[Kolors: Effective Training of Diffusion Model for Photorealistic Text-to-Image Synthesis](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf) +* 模型 + * Kolors + * [HuggingFace](https://huggingface.co/Kwai-Kolors/Kolors) + * [ModelScope](https://modelscope.cn/models/Kwai-Kolors/Kolors) + +* 项目页面: https://kwai-kolors.github.io/ + +## 模型介绍 + +Kolors是一种用于文本生成图像的潜在扩散模型, 使用了General Language Model(GLM)作为文本编码器, 增强了它的中英文理解能力. Kolors有两个训练阶段, 包括概念学习阶段(使用广泛的知识)和质量提升阶段(使用精心整理的高美学数据), 并且在质量提升阶段使用1100步的调度器添加噪声, 以达到更低的信噪比. 这些改动使得即使Kolors以U-Net作为骨干模型, 也能达到好的效果. 
+![image](https://github.com/user-attachments/assets/d6b91d41-3d88-4d26-a399-03ca180640cf) + +Kolors的生成效果: +![kolors](https://github.com/user-attachments/assets/f6926507-52e2-471d-87ab-a9351338e4ca) + + +## 代码样例 + +```python +from diffsynth import ModelManager, SDXLImagePipeline, download_models +import torch + +download_models(["Kolors"]) +model_manager = ModelManager(torch_dtype=torch.float16, device="cuda", + file_path_list=[ + "models/kolors/Kolors/text_encoder", + "models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors", + "models/kolors/Kolors/vae/diffusion_pytorch_model.safetensors" + ]) +pipe = SDXLImagePipeline.from_model_manager(model_manager) + +prompt = '一张瓢虫的照片,微距,变焦,高质量,电影,拿着一个牌子,写着"Kolors"' + +torch.manual_seed(7) +image = pipe( + prompt=prompt, + num_inference_steps=50, + cfg_scale=4, +) +image.save(f"image_1024.jpg") + +``` \ No newline at end of file