This commit is contained in:
co63oc
2025-02-26 14:18:36 +08:00
parent bed770248b
commit 4268f5466b
9 changed files with 14 additions and 14 deletions

View File

@@ -1,7 +1,7 @@
# Set web page format # Set web page format
import streamlit as st import streamlit as st
st.set_page_config(layout="wide") st.set_page_config(layout="wide")
# Diasble virtual VRAM on windows system # Disable virtual VRAM on windows system
import torch import torch
torch.cuda.set_per_process_memory_fraction(0.999, 0) torch.cuda.set_per_process_memory_fraction(0.999, 0)

View File

@@ -980,7 +980,7 @@ class Embedding(torch.nn.Module):
# Embeddings. # Embeddings.
words_embeddings = self.word_embeddings(input_ids) words_embeddings = self.word_embeddings(input_ids)
embeddings = words_embeddings embeddings = words_embeddings
# Data format change to avoid explicit tranposes : [b s h] --> [s b h]. # Data format change to avoid explicit transposes : [b s h] --> [s b h].
embeddings = embeddings.transpose(0, 1).contiguous() embeddings = embeddings.transpose(0, 1).contiguous()
# If the input flag for fp32 residual connection is set, convert for float. # If the input flag for fp32 residual connection is set, convert for float.
if self.fp32_residual_connection: if self.fp32_residual_connection:

View File

@@ -398,7 +398,7 @@ class RoPE1D:
* tokens: batch_size x ntokens x nheads x dim * tokens: batch_size x ntokens x nheads x dim
* positions: batch_size x ntokens (t position of each token) * positions: batch_size x ntokens (t position of each token)
output: output:
* tokens after appplying RoPE2D (batch_size x ntokens x nheads x dim) * tokens after applying RoPE1D (batch_size x ntokens x nheads x dim)
""" """
D = tokens.size(3) D = tokens.size(3)
assert positions.ndim == 2 # Batch, Seq assert positions.ndim == 2 # Batch, Seq
@@ -428,7 +428,7 @@ class RoPE3D(RoPE1D):
* tokens: batch_size x ntokens x nheads x dim * tokens: batch_size x ntokens x nheads x dim
* rope_positions: list of (f, h, w) * rope_positions: list of (f, h, w)
output: output:
* tokens after appplying RoPE2D (batch_size x ntokens x nheads x dim) * tokens after applying RoPE2D (batch_size x ntokens x nheads x dim)
""" """
assert sum(ch_split) == tokens.size(-1); assert sum(ch_split) == tokens.size(-1);

View File

@@ -88,7 +88,7 @@ class LLaMaEmbedding(nn.Module):
embeddings = embeddings.to(self.params_dtype) embeddings = embeddings.to(self.params_dtype)
self.word_embeddings = self.word_embeddings.to(self.params_dtype) self.word_embeddings = self.word_embeddings.to(self.params_dtype)
# Data format change to avoid explicit tranposes : [b s h] --> [s b h]. # Data format change to avoid explicit transposes : [b s h] --> [s b h].
embeddings = embeddings.transpose(0, 1).contiguous() embeddings = embeddings.transpose(0, 1).contiguous()
# If the input flag for fp32 residual connection is set, convert for float. # If the input flag for fp32 residual connection is set, convert for float.
@@ -326,7 +326,7 @@ class MultiQueryAttention(nn.Module):
dim=-1, dim=-1,
) )
# gather on 1st dimention # gather on 1st dimension
xq = xq.view(seqlen, bsz, self.n_local_heads, self.head_dim) xq = xq.view(seqlen, bsz, self.n_local_heads, self.head_dim)
xkv = xkv.view(seqlen, bsz, self.n_local_groups, 2 * self.head_dim) xkv = xkv.view(seqlen, bsz, self.n_local_groups, 2 * self.head_dim)
xk, xv = xkv.chunk(2, -1) xk, xv = xkv.chunk(2, -1)
@@ -357,7 +357,7 @@ class MultiQueryAttention(nn.Module):
output = self.core_attention(xq, xk, xv, output = self.core_attention(xq, xk, xv,
cu_seqlens=cu_seqlens, cu_seqlens=cu_seqlens,
max_seq_len=max_seq_len) max_seq_len=max_seq_len)
# reduce-scatter only support first dimention now # reduce-scatter only support first dimension now
output = rearrange(output, "b s h d -> s b (h d)").contiguous() output = rearrange(output, "b s h d -> s b (h d)").contiguous()
else: else:
xq, xk, xv = [ xq, xk, xv = [

View File

@@ -55,7 +55,7 @@ class TileWorker:
def io_scale(self, model_output, tile_size): def io_scale(self, model_output, tile_size):
# Determine the size modification happend in forward_fn # Determine the size modification happened in forward_fn
# We only consider the same scale on height and width. # We only consider the same scale on height and width.
io_scale = model_output.shape[2] / tile_size io_scale = model_output.shape[2] / tile_size
return io_scale return io_scale

View File

@@ -16,7 +16,7 @@ class OmniGenCache(DynamicCache):
def __init__(self, def __init__(self,
num_tokens_for_img: int, offload_kv_cache: bool=False) -> None: num_tokens_for_img: int, offload_kv_cache: bool=False) -> None:
if not torch.cuda.is_available(): if not torch.cuda.is_available():
print("No avaliable GPU, offload_kv_cache wiil be set to False, which will result in large memory usage and time cost when input multiple images!!!") print("No available GPU, offload_kv_cache will be set to False, which will result in large memory usage and time cost when inputting multiple images!!!")
offload_kv_cache = False offload_kv_cache = False
raise RuntimeError("OffloadedCache can only be used with a GPU") raise RuntimeError("OffloadedCache can only be used with a GPU")
super().__init__() super().__init__()

View File

@@ -16,14 +16,14 @@ The IP-Adapter model based on Stable Diffusion XL is more powerful. You have the
* Content controlling (original usage of IP-Adapter) * Content controlling (original usage of IP-Adapter)
|First, we generate a rabbit.|Next, enable IP-Adapter and let the rabbit jump.|For comparision, disable IP-Adapter to see the generated image.| |First, we generate a rabbit.|Next, enable IP-Adapter and let the rabbit jump.|For comparison, disable IP-Adapter to see the generated image.|
|-|-|-| |-|-|-|
|![rabbit](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4b452634-ec57-414f-897a-f8c50c74a650)|![rabbit_to_jumping_rabbit](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/b93c5495-0b77-4d97-bcd3-3942858288f2)|![rabbit_to_jumping_rabbit_without_ipa](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/52f37195-65b3-4a38-8d9b-73df37311c15)| |![rabbit](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4b452634-ec57-414f-897a-f8c50c74a650)|![rabbit_to_jumping_rabbit](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/b93c5495-0b77-4d97-bcd3-3942858288f2)|![rabbit_to_jumping_rabbit_without_ipa](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/52f37195-65b3-4a38-8d9b-73df37311c15)|
* Style controlling (InstantStyle) * Style controlling (InstantStyle)
|First, we generate a rabbit.|Next, enable InstantStyle and convert the rabbit to a cat.|For comparision, disable IP-Adapter to see the generated image.| |First, we generate a rabbit.|Next, enable InstantStyle and convert the rabbit to a cat.|For comparison, disable IP-Adapter to see the generated image.|
|-|-|-| |-|-|-|
|![rabbit](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4b452634-ec57-414f-897a-f8c50c74a650)|![rabbit_to_cat](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/a006b281-f643-4ea9-b0da-712289c96059)|![rabbit_to_cat_without_ipa](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/189bd11e-7a10-4c09-8554-0eebde9150fd)| |![rabbit](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4b452634-ec57-414f-897a-f8c50c74a650)|![rabbit_to_cat](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/a006b281-f643-4ea9-b0da-712289c96059)|![rabbit_to_cat_without_ipa](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/189bd11e-7a10-4c09-8554-0eebde9150fd)|

View File

@@ -45,7 +45,7 @@ file_name,text
04.jpg,a dog 04.jpg,a dog
``` ```
Note that if the model is Chinese model (for example, Hunyuan-DiT and Kolors), we recommand to use Chinese texts in the dataset. For example Note that if the model is a Chinese model (for example, Hunyuan-DiT and Kolors), we recommend using Chinese texts in the dataset. For example
``` ```
file_name,text file_name,text
@@ -526,7 +526,7 @@ models/stable_diffusion_xl
└── sd_xl_base_1.0.safetensors └── sd_xl_base_1.0.safetensors
``` ```
We observed that Stable Diffusion XL is not float16-safe, thus we recommand users to use float32. We observed that Stable Diffusion XL is not float16-safe, thus we recommend that users use float32.
``` ```
CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_xl/train_sdxl_lora.py \ CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_xl/train_sdxl_lora.py \

View File

@@ -41,7 +41,7 @@ def parse_args():
type=str, type=str,
default=None, default=None,
required=True, required=True,
help="Path to pretrained models, seperated by comma. For example, SD3: `models/stable_diffusion_3/sd3_medium_incl_clips_t5xxlfp16.safetensors`, SD3.5-large: `models/stable_diffusion_3/text_encoders/clip_g.safetensors,models/stable_diffusion_3/text_encoders/clip_l.safetensors,models/stable_diffusion_3/text_encoders/t5xxl_fp16.safetensors,models/stable_diffusion_3/sd3.5_large.safetensors`", help="Path to pretrained models, separated by comma. For example, SD3: `models/stable_diffusion_3/sd3_medium_incl_clips_t5xxlfp16.safetensors`, SD3.5-large: `models/stable_diffusion_3/text_encoders/clip_g.safetensors,models/stable_diffusion_3/text_encoders/clip_l.safetensors,models/stable_diffusion_3/text_encoders/t5xxl_fp16.safetensors,models/stable_diffusion_3/sd3.5_large.safetensors`",
) )
parser.add_argument( parser.add_argument(
"--lora_target_modules", "--lora_target_modules",