mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-18 22:08:13 +00:00
@@ -980,7 +980,7 @@ class Embedding(torch.nn.Module):
|
||||
# Embeddings.
|
||||
words_embeddings = self.word_embeddings(input_ids)
|
||||
embeddings = words_embeddings
|
||||
# Data format change to avoid explicit tranposes : [b s h] --> [s b h].
|
||||
# Data format change to avoid explicit transposes : [b s h] --> [s b h].
|
||||
embeddings = embeddings.transpose(0, 1).contiguous()
|
||||
# If the input flag for fp32 residual connection is set, convert for float.
|
||||
if self.fp32_residual_connection:
|
||||
|
||||
@@ -398,7 +398,7 @@ class RoPE1D:
|
||||
* tokens: batch_size x ntokens x nheads x dim
|
||||
* positions: batch_size x ntokens (t position of each token)
|
||||
output:
|
||||
* tokens after appplying RoPE2D (batch_size x ntokens x nheads x dim)
|
||||
* tokens after applying RoPE2D (batch_size x ntokens x nheads x dim)
|
||||
"""
|
||||
D = tokens.size(3)
|
||||
assert positions.ndim == 2 # Batch, Seq
|
||||
@@ -428,7 +428,7 @@ class RoPE3D(RoPE1D):
|
||||
* tokens: batch_size x ntokens x nheads x dim
|
||||
* rope_positions: list of (f, h, w)
|
||||
output:
|
||||
* tokens after appplying RoPE2D (batch_size x ntokens x nheads x dim)
|
||||
* tokens after applying RoPE2D (batch_size x ntokens x nheads x dim)
|
||||
"""
|
||||
assert sum(ch_split) == tokens.size(-1);
|
||||
|
||||
|
||||
@@ -88,7 +88,7 @@ class LLaMaEmbedding(nn.Module):
|
||||
embeddings = embeddings.to(self.params_dtype)
|
||||
self.word_embeddings = self.word_embeddings.to(self.params_dtype)
|
||||
|
||||
# Data format change to avoid explicit tranposes : [b s h] --> [s b h].
|
||||
# Data format change to avoid explicit transposes : [b s h] --> [s b h].
|
||||
embeddings = embeddings.transpose(0, 1).contiguous()
|
||||
|
||||
# If the input flag for fp32 residual connection is set, convert for float.
|
||||
@@ -326,7 +326,7 @@ class MultiQueryAttention(nn.Module):
|
||||
dim=-1,
|
||||
)
|
||||
|
||||
# gather on 1st dimention
|
||||
# gather on 1st dimension
|
||||
xq = xq.view(seqlen, bsz, self.n_local_heads, self.head_dim)
|
||||
xkv = xkv.view(seqlen, bsz, self.n_local_groups, 2 * self.head_dim)
|
||||
xk, xv = xkv.chunk(2, -1)
|
||||
@@ -357,7 +357,7 @@ class MultiQueryAttention(nn.Module):
|
||||
output = self.core_attention(xq, xk, xv,
|
||||
cu_seqlens=cu_seqlens,
|
||||
max_seq_len=max_seq_len)
|
||||
# reduce-scatter only support first dimention now
|
||||
# reduce-scatter only support first dimension now
|
||||
output = rearrange(output, "b s h d -> s b (h d)").contiguous()
|
||||
else:
|
||||
xq, xk, xv = [
|
||||
|
||||
@@ -55,7 +55,7 @@ class TileWorker:
|
||||
|
||||
|
||||
def io_scale(self, model_output, tile_size):
|
||||
# Determine the size modification happend in forward_fn
|
||||
# Determine the size modification happened in forward_fn
|
||||
# We only consider the same scale on height and width.
|
||||
io_scale = model_output.shape[2] / tile_size
|
||||
return io_scale
|
||||
|
||||
@@ -16,7 +16,7 @@ class OmniGenCache(DynamicCache):
|
||||
def __init__(self,
|
||||
num_tokens_for_img: int, offload_kv_cache: bool=False) -> None:
|
||||
if not torch.cuda.is_available():
|
||||
print("No avaliable GPU, offload_kv_cache wiil be set to False, which will result in large memory usage and time cost when input multiple images!!!")
|
||||
print("No available GPU, offload_kv_cache will be set to False, which will result in large memory usage and time cost when input multiple images!!!")
|
||||
offload_kv_cache = False
|
||||
raise RuntimeError("OffloadedCache can only be used with a GPU")
|
||||
super().__init__()
|
||||
|
||||
Reference in New Issue
Block a user