diff --git a/diffsynth/models/qwen_image_dit.py b/diffsynth/models/qwen_image_dit.py index e0d493c..13cb5ca 100644 --- a/diffsynth/models/qwen_image_dit.py +++ b/diffsynth/models/qwen_image_dit.py @@ -94,7 +94,7 @@ class QwenEmbedRope(nn.Module): def _expand_pos_freqs_if_needed(self, video_fhw, txt_seq_lens): if isinstance(video_fhw, list): - video_fhw = video_fhw[0] + video_fhw = tuple(max([i[j] for i in video_fhw]) for j in range(3)) _, height, width = video_fhw if self.scale_rope: max_vid_index = max(height // 2, width // 2)