From 4268f5466b75d1b5ab10130cba851c021ea14ce6 Mon Sep 17 00:00:00 2001 From: co63oc Date: Wed, 26 Feb 2025 14:18:36 +0800 Subject: [PATCH] Fix typos in comments, docs, and user-facing messages --- apps/streamlit/DiffSynth_Studio.py | 2 +- diffsynth/models/kolors_text_encoder.py | 2 +- diffsynth/models/stepvideo_dit.py | 4 ++-- diffsynth/models/stepvideo_text_encoder.py | 6 +++--- diffsynth/models/tiler.py | 2 +- diffsynth/pipelines/omnigen_image.py | 2 +- examples/Ip-Adapter/README.md | 4 ++-- examples/train/README.md | 4 ++-- examples/train/stable_diffusion_3/train_sd3_lora.py | 2 +- 9 files changed, 14 insertions(+), 14 deletions(-) diff --git a/apps/streamlit/DiffSynth_Studio.py b/apps/streamlit/DiffSynth_Studio.py index 855e5a5..cfd3856 100644 --- a/apps/streamlit/DiffSynth_Studio.py +++ b/apps/streamlit/DiffSynth_Studio.py @@ -1,7 +1,7 @@ # Set web page format import streamlit as st st.set_page_config(layout="wide") -# Diasble virtual VRAM on windows system +# Disable virtual VRAM on windows system import torch torch.cuda.set_per_process_memory_fraction(0.999, 0) diff --git a/diffsynth/models/kolors_text_encoder.py b/diffsynth/models/kolors_text_encoder.py index 693f72e..ee785e3 100644 --- a/diffsynth/models/kolors_text_encoder.py +++ b/diffsynth/models/kolors_text_encoder.py @@ -980,7 +980,7 @@ class Embedding(torch.nn.Module): # Embeddings. words_embeddings = self.word_embeddings(input_ids) embeddings = words_embeddings - # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + # Data format change to avoid explicit transposes : [b s h] --> [s b h]. embeddings = embeddings.transpose(0, 1).contiguous() # If the input flag for fp32 residual connection is set, convert for float. 
if self.fp32_residual_connection: diff --git a/diffsynth/models/stepvideo_dit.py b/diffsynth/models/stepvideo_dit.py index ccfb8f1..3ca7e06 100644 --- a/diffsynth/models/stepvideo_dit.py +++ b/diffsynth/models/stepvideo_dit.py @@ -398,7 +398,7 @@ class RoPE1D: * tokens: batch_size x ntokens x nheads x dim * positions: batch_size x ntokens (t position of each token) output: - * tokens after appplying RoPE2D (batch_size x ntokens x nheads x dim) + * tokens after applying RoPE2D (batch_size x ntokens x nheads x dim) """ D = tokens.size(3) assert positions.ndim == 2 # Batch, Seq @@ -428,7 +428,7 @@ class RoPE3D(RoPE1D): * tokens: batch_size x ntokens x nheads x dim * rope_positions: list of (f, h, w) output: - * tokens after appplying RoPE2D (batch_size x ntokens x nheads x dim) + * tokens after applying RoPE2D (batch_size x ntokens x nheads x dim) """ assert sum(ch_split) == tokens.size(-1); diff --git a/diffsynth/models/stepvideo_text_encoder.py b/diffsynth/models/stepvideo_text_encoder.py index 46aff0d..598825a 100644 --- a/diffsynth/models/stepvideo_text_encoder.py +++ b/diffsynth/models/stepvideo_text_encoder.py @@ -88,7 +88,7 @@ class LLaMaEmbedding(nn.Module): embeddings = embeddings.to(self.params_dtype) self.word_embeddings = self.word_embeddings.to(self.params_dtype) - # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + # Data format change to avoid explicit transposes : [b s h] --> [s b h]. embeddings = embeddings.transpose(0, 1).contiguous() # If the input flag for fp32 residual connection is set, convert for float. 
@@ -326,7 +326,7 @@ class MultiQueryAttention(nn.Module): dim=-1, ) - # gather on 1st dimention + # gather on 1st dimension xq = xq.view(seqlen, bsz, self.n_local_heads, self.head_dim) xkv = xkv.view(seqlen, bsz, self.n_local_groups, 2 * self.head_dim) xk, xv = xkv.chunk(2, -1) @@ -357,7 +357,7 @@ class MultiQueryAttention(nn.Module): output = self.core_attention(xq, xk, xv, cu_seqlens=cu_seqlens, max_seq_len=max_seq_len) - # reduce-scatter only support first dimention now + # reduce-scatter only supports the first dimension now output = rearrange(output, "b s h d -> s b (h d)").contiguous() else: xq, xk, xv = [ diff --git a/diffsynth/models/tiler.py b/diffsynth/models/tiler.py index 77c443b..dff5ebf 100644 --- a/diffsynth/models/tiler.py +++ b/diffsynth/models/tiler.py @@ -55,7 +55,7 @@ class TileWorker: def io_scale(self, model_output, tile_size): - # Determine the size modification happend in forward_fn + # Determine the size modification that happened in forward_fn # We only consider the same scale on height and width. 
io_scale = model_output.shape[2] / tile_size return io_scale diff --git a/diffsynth/pipelines/omnigen_image.py b/diffsynth/pipelines/omnigen_image.py index 428e7f0..ddb2ae6 100644 --- a/diffsynth/pipelines/omnigen_image.py +++ b/diffsynth/pipelines/omnigen_image.py @@ -16,7 +16,7 @@ class OmniGenCache(DynamicCache): def __init__(self, num_tokens_for_img: int, offload_kv_cache: bool=False) -> None: if not torch.cuda.is_available(): - print("No avaliable GPU, offload_kv_cache wiil be set to False, which will result in large memory usage and time cost when input multiple images!!!") + print("No available GPU, offload_kv_cache will be set to False, which will result in large memory usage and time cost when input multiple images!!!") offload_kv_cache = False raise RuntimeError("OffloadedCache can only be used with a GPU") super().__init__() diff --git a/examples/Ip-Adapter/README.md b/examples/Ip-Adapter/README.md index 86f9dab..05f28be 100644 --- a/examples/Ip-Adapter/README.md +++ b/examples/Ip-Adapter/README.md @@ -16,14 +16,14 @@ The IP-Adapter model based on Stable Diffusion XL is more powerful. 
You have the * Content controlling (original usage of IP-Adapter) -|First, we generate a rabbit.|Next, enable IP-Adapter and let the rabbit jump.|For comparision, disable IP-Adapter to see the generated image.| +|First, we generate a rabbit.|Next, enable IP-Adapter and let the rabbit jump.|For comparison, disable IP-Adapter to see the generated image.| |-|-|-| |![rabbit](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4b452634-ec57-414f-897a-f8c50c74a650)|![rabbit_to_jumping_rabbit](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/b93c5495-0b77-4d97-bcd3-3942858288f2)|![rabbit_to_jumping_rabbit_without_ipa](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/52f37195-65b3-4a38-8d9b-73df37311c15)| * Style controlling (InstantStyle) -|First, we generate a rabbit.|Next, enable InstantStyle and convert the rabbit to a cat.|For comparision, disable IP-Adapter to see the generated image.| +|First, we generate a rabbit.|Next, enable InstantStyle and convert the rabbit to a cat.|For comparison, disable IP-Adapter to see the generated image.| |-|-|-| |![rabbit](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4b452634-ec57-414f-897a-f8c50c74a650)|![rabbit_to_cat](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/a006b281-f643-4ea9-b0da-712289c96059)|![rabbit_to_cat_without_ipa](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/189bd11e-7a10-4c09-8554-0eebde9150fd)| diff --git a/examples/train/README.md b/examples/train/README.md index db90b21..fa99312 100644 --- a/examples/train/README.md +++ b/examples/train/README.md @@ -45,7 +45,7 @@ file_name,text 04.jpg,a dog ``` -Note that if the model is Chinese model (for example, Hunyuan-DiT and Kolors), we recommand to use Chinese texts in the dataset. For example +Note that if the model is a Chinese model (for example, Hunyuan-DiT and Kolors), we recommend using Chinese texts in the dataset. 
For example ``` file_name,text @@ -526,7 +526,7 @@ models/stable_diffusion_xl └── sd_xl_base_1.0.safetensors ``` -We observed that Stable Diffusion XL is not float16-safe, thus we recommand users to use float32. +We observed that Stable Diffusion XL is not float16-safe, thus we recommend that users use float32. ``` CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_xl/train_sdxl_lora.py \ diff --git a/examples/train/stable_diffusion_3/train_sd3_lora.py b/examples/train/stable_diffusion_3/train_sd3_lora.py index c9abf2b..dde1c96 100644 --- a/examples/train/stable_diffusion_3/train_sd3_lora.py +++ b/examples/train/stable_diffusion_3/train_sd3_lora.py @@ -41,7 +41,7 @@ def parse_args(): type=str, default=None, required=True, - help="Path to pretrained models, seperated by comma. For example, SD3: `models/stable_diffusion_3/sd3_medium_incl_clips_t5xxlfp16.safetensors`, SD3.5-large: `models/stable_diffusion_3/text_encoders/clip_g.safetensors,models/stable_diffusion_3/text_encoders/clip_l.safetensors,models/stable_diffusion_3/text_encoders/t5xxl_fp16.safetensors,models/stable_diffusion_3/sd3.5_large.safetensors`", + help="Path to pretrained models, separated by comma. For example, SD3: `models/stable_diffusion_3/sd3_medium_incl_clips_t5xxlfp16.safetensors`, SD3.5-large: `models/stable_diffusion_3/text_encoders/clip_g.safetensors,models/stable_diffusion_3/text_encoders/clip_l.safetensors,models/stable_diffusion_3/text_encoders/t5xxl_fp16.safetensors,models/stable_diffusion_3/sd3.5_large.safetensors`", ) parser.add_argument( "--lora_target_modules",