mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-24 01:48:13 +00:00
Fix
This commit is contained in:
@@ -1,7 +1,7 @@
|
|||||||
# Set web page format
|
# Set web page format
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
st.set_page_config(layout="wide")
|
st.set_page_config(layout="wide")
|
||||||
# Diasble virtual VRAM on windows system
|
# Disable virtual VRAM on windows system
|
||||||
import torch
|
import torch
|
||||||
torch.cuda.set_per_process_memory_fraction(0.999, 0)
|
torch.cuda.set_per_process_memory_fraction(0.999, 0)
|
||||||
|
|
||||||
|
|||||||
@@ -980,7 +980,7 @@ class Embedding(torch.nn.Module):
|
|||||||
# Embeddings.
|
# Embeddings.
|
||||||
words_embeddings = self.word_embeddings(input_ids)
|
words_embeddings = self.word_embeddings(input_ids)
|
||||||
embeddings = words_embeddings
|
embeddings = words_embeddings
|
||||||
# Data format change to avoid explicit tranposes : [b s h] --> [s b h].
|
# Data format change to avoid explicit transposes : [b s h] --> [s b h].
|
||||||
embeddings = embeddings.transpose(0, 1).contiguous()
|
embeddings = embeddings.transpose(0, 1).contiguous()
|
||||||
# If the input flag for fp32 residual connection is set, convert for float.
|
# If the input flag for fp32 residual connection is set, convert for float.
|
||||||
if self.fp32_residual_connection:
|
if self.fp32_residual_connection:
|
||||||
|
|||||||
@@ -398,7 +398,7 @@ class RoPE1D:
|
|||||||
* tokens: batch_size x ntokens x nheads x dim
|
* tokens: batch_size x ntokens x nheads x dim
|
||||||
* positions: batch_size x ntokens (t position of each token)
|
* positions: batch_size x ntokens (t position of each token)
|
||||||
output:
|
output:
|
||||||
* tokens after appplying RoPE2D (batch_size x ntokens x nheads x dim)
|
* tokens after applying RoPE2D (batch_size x ntokens x nheads x dim)
|
||||||
"""
|
"""
|
||||||
D = tokens.size(3)
|
D = tokens.size(3)
|
||||||
assert positions.ndim == 2 # Batch, Seq
|
assert positions.ndim == 2 # Batch, Seq
|
||||||
@@ -428,7 +428,7 @@ class RoPE3D(RoPE1D):
|
|||||||
* tokens: batch_size x ntokens x nheads x dim
|
* tokens: batch_size x ntokens x nheads x dim
|
||||||
* rope_positions: list of (f, h, w)
|
* rope_positions: list of (f, h, w)
|
||||||
output:
|
output:
|
||||||
* tokens after appplying RoPE2D (batch_size x ntokens x nheads x dim)
|
* tokens after applying RoPE2D (batch_size x ntokens x nheads x dim)
|
||||||
"""
|
"""
|
||||||
assert sum(ch_split) == tokens.size(-1);
|
assert sum(ch_split) == tokens.size(-1);
|
||||||
|
|
||||||
|
|||||||
@@ -88,7 +88,7 @@ class LLaMaEmbedding(nn.Module):
|
|||||||
embeddings = embeddings.to(self.params_dtype)
|
embeddings = embeddings.to(self.params_dtype)
|
||||||
self.word_embeddings = self.word_embeddings.to(self.params_dtype)
|
self.word_embeddings = self.word_embeddings.to(self.params_dtype)
|
||||||
|
|
||||||
# Data format change to avoid explicit tranposes : [b s h] --> [s b h].
|
# Data format change to avoid explicit transposes : [b s h] --> [s b h].
|
||||||
embeddings = embeddings.transpose(0, 1).contiguous()
|
embeddings = embeddings.transpose(0, 1).contiguous()
|
||||||
|
|
||||||
# If the input flag for fp32 residual connection is set, convert for float.
|
# If the input flag for fp32 residual connection is set, convert for float.
|
||||||
@@ -326,7 +326,7 @@ class MultiQueryAttention(nn.Module):
|
|||||||
dim=-1,
|
dim=-1,
|
||||||
)
|
)
|
||||||
|
|
||||||
# gather on 1st dimention
|
# gather on 1st dimension
|
||||||
xq = xq.view(seqlen, bsz, self.n_local_heads, self.head_dim)
|
xq = xq.view(seqlen, bsz, self.n_local_heads, self.head_dim)
|
||||||
xkv = xkv.view(seqlen, bsz, self.n_local_groups, 2 * self.head_dim)
|
xkv = xkv.view(seqlen, bsz, self.n_local_groups, 2 * self.head_dim)
|
||||||
xk, xv = xkv.chunk(2, -1)
|
xk, xv = xkv.chunk(2, -1)
|
||||||
@@ -357,7 +357,7 @@ class MultiQueryAttention(nn.Module):
|
|||||||
output = self.core_attention(xq, xk, xv,
|
output = self.core_attention(xq, xk, xv,
|
||||||
cu_seqlens=cu_seqlens,
|
cu_seqlens=cu_seqlens,
|
||||||
max_seq_len=max_seq_len)
|
max_seq_len=max_seq_len)
|
||||||
# reduce-scatter only support first dimention now
|
# reduce-scatter only support first dimension now
|
||||||
output = rearrange(output, "b s h d -> s b (h d)").contiguous()
|
output = rearrange(output, "b s h d -> s b (h d)").contiguous()
|
||||||
else:
|
else:
|
||||||
xq, xk, xv = [
|
xq, xk, xv = [
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ class TileWorker:
|
|||||||
|
|
||||||
|
|
||||||
def io_scale(self, model_output, tile_size):
|
def io_scale(self, model_output, tile_size):
|
||||||
# Determine the size modification happend in forward_fn
|
# Determine the size modification happened in forward_fn
|
||||||
# We only consider the same scale on height and width.
|
# We only consider the same scale on height and width.
|
||||||
io_scale = model_output.shape[2] / tile_size
|
io_scale = model_output.shape[2] / tile_size
|
||||||
return io_scale
|
return io_scale
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ class OmniGenCache(DynamicCache):
|
|||||||
def __init__(self,
|
def __init__(self,
|
||||||
num_tokens_for_img: int, offload_kv_cache: bool=False) -> None:
|
num_tokens_for_img: int, offload_kv_cache: bool=False) -> None:
|
||||||
if not torch.cuda.is_available():
|
if not torch.cuda.is_available():
|
||||||
print("No avaliable GPU, offload_kv_cache wiil be set to False, which will result in large memory usage and time cost when input multiple images!!!")
|
print("No available GPU, offload_kv_cache will be set to False, which will result in large memory usage and time cost when input multiple images!!!")
|
||||||
offload_kv_cache = False
|
offload_kv_cache = False
|
||||||
raise RuntimeError("OffloadedCache can only be used with a GPU")
|
raise RuntimeError("OffloadedCache can only be used with a GPU")
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|||||||
@@ -16,14 +16,14 @@ The IP-Adapter model based on Stable Diffusion XL is more powerful. You have the
|
|||||||
|
|
||||||
* Content controlling (original usage of IP-Adapter)
|
* Content controlling (original usage of IP-Adapter)
|
||||||
|
|
||||||
|First, we generate a rabbit.|Next, enable IP-Adapter and let the rabbit jump.|For comparision, disable IP-Adapter to see the generated image.|
|
|First, we generate a rabbit.|Next, enable IP-Adapter and let the rabbit jump.|For comparison, disable IP-Adapter to see the generated image.|
|
||||||
|-|-|-|
|
|-|-|-|
|
||||||
||||
|
||||
|
||||||
|
|
||||||
|
|
||||||
* Style controlling (InstantStyle)
|
* Style controlling (InstantStyle)
|
||||||
|
|
||||||
|First, we generate a rabbit.|Next, enable InstantStyle and convert the rabbit to a cat.|For comparision, disable IP-Adapter to see the generated image.|
|
|First, we generate a rabbit.|Next, enable InstantStyle and convert the rabbit to a cat.|For comparison, disable IP-Adapter to see the generated image.|
|
||||||
|-|-|-|
|
|-|-|-|
|
||||||
||||
|
||||
|
||||||
|
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ file_name,text
|
|||||||
04.jpg,a dog
|
04.jpg,a dog
|
||||||
```
|
```
|
||||||
|
|
||||||
Note that if the model is Chinese model (for example, Hunyuan-DiT and Kolors), we recommand to use Chinese texts in the dataset. For example
|
Note that if the model is a Chinese model (for example, Hunyuan-DiT and Kolors), we recommend using Chinese texts in the dataset. For example:
|
||||||
|
|
||||||
```
|
```
|
||||||
file_name,text
|
file_name,text
|
||||||
@@ -526,7 +526,7 @@ models/stable_diffusion_xl
|
|||||||
└── sd_xl_base_1.0.safetensors
|
└── sd_xl_base_1.0.safetensors
|
||||||
```
|
```
|
||||||
|
|
||||||
We observed that Stable Diffusion XL is not float16-safe, thus we recommand users to use float32.
|
We observed that Stable Diffusion XL is not float16-safe, so we recommend that users use float32.
|
||||||
|
|
||||||
```
|
```
|
||||||
CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_xl/train_sdxl_lora.py \
|
CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_xl/train_sdxl_lora.py \
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ def parse_args():
|
|||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
required=True,
|
required=True,
|
||||||
help="Path to pretrained models, seperated by comma. For example, SD3: `models/stable_diffusion_3/sd3_medium_incl_clips_t5xxlfp16.safetensors`, SD3.5-large: `models/stable_diffusion_3/text_encoders/clip_g.safetensors,models/stable_diffusion_3/text_encoders/clip_l.safetensors,models/stable_diffusion_3/text_encoders/t5xxl_fp16.safetensors,models/stable_diffusion_3/sd3.5_large.safetensors`",
|
help="Path to pretrained models, separated by comma. For example, SD3: `models/stable_diffusion_3/sd3_medium_incl_clips_t5xxlfp16.safetensors`, SD3.5-large: `models/stable_diffusion_3/text_encoders/clip_g.safetensors,models/stable_diffusion_3/text_encoders/clip_l.safetensors,models/stable_diffusion_3/text_encoders/t5xxl_fp16.safetensors,models/stable_diffusion_3/sd3.5_large.safetensors`",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--lora_target_modules",
|
"--lora_target_modules",
|
||||||
|
|||||||
Reference in New Issue
Block a user