Merge branch 'main' into qwen

This commit is contained in:
Zhongjie Duan
2024-09-04 17:22:38 +08:00
committed by GitHub
30 changed files with 3100 additions and 124 deletions

View File

@@ -0,0 +1,24 @@
import torch
from diffsynth import download_models, ModelManager, OmostPromter, FluxImagePipeline
# Fetch the checkpoints used by this example, one model ID per call.
for model_id in ("OmostPrompt", "FLUX.1-dev"):
    download_models([model_id])

# Checkpoints handed to the model manager: the Omost prompt model followed by
# the FLUX.1-dev files.
checkpoint_paths = [
    "models/OmostPrompt/omost-llama-3-8b-4bits",
    "models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
    "models/FLUX/FLUX.1-dev/text_encoder_2",
    "models/FLUX/FLUX.1-dev/ae.safetensors",
    "models/FLUX/FLUX.1-dev/flux1-dev.safetensors",
]
model_manager = ModelManager(torch_dtype=torch.bfloat16)
model_manager.load_models(checkpoint_paths)

# Build the FLUX pipeline with OmostPromter registered as a prompt extender.
pipe = FluxImagePipeline.from_model_manager(
    model_manager,
    prompt_extender_classes=[OmostPromter],
)

# Seed the global torch RNG before generating.
torch.manual_seed(0)
image = pipe(
    prompt="an image of a witch who is releasing ice and fire magic",
    num_inference_steps=30,
    embedded_guidance=3.5,
)
image.save("image_omost.jpg")

View File

@@ -1,8 +1,46 @@
# Text to Video
In DiffSynth Studio, we can use AnimateDiff and SVD to generate videos. However, these models usually generate poor-quality content. We do not recommend using these models until a more powerful video model emerges.
In DiffSynth Studio, we can use some video models to generate videos.
### Example 7: Text to Video
### Example: Text-to-Video using CogVideoX-5B (Experimental)
See [cogvideo_text_to_video.py](cogvideo_text_to_video.py).
First, we generate a video using the prompt "an astronaut riding a horse on Mars".
https://github.com/user-attachments/assets/4c91c1cd-e4a0-471a-bd8d-24d761262941
Then, we convert the astronaut to a robot.
https://github.com/user-attachments/assets/225a00a4-2bc8-4740-8e86-a64b460a29ec
Upscale the video using the model itself.
https://github.com/user-attachments/assets/c02cb30c-de60-473c-8242-32c67b3155ad
Make the video look smoother by interpolating frames.
https://github.com/user-attachments/assets/f0e465b4-45df-4435-ab10-7a084ca2b0a0
Here is another example.
First, we generate a video using the prompt "a dog is running".
https://github.com/user-attachments/assets/e3696297-99f5-4d0c-a5ca-1d1566db85b4
Then, we add a blue collar to the dog.
https://github.com/user-attachments/assets/7ff22be7-4390-4d33-ae6c-53f6f056e18d
Upscale the video using the model itself.
https://github.com/user-attachments/assets/a909c32c-0b7d-495c-a53c-d23a99a3d3e9
Make the video look smoother by interpolating frames.
https://github.com/user-attachments/assets/ea37c150-97a0-4858-8003-0c2e5eef3331
### Example: Text-to-Video using AnimateDiff
Generate a video using a Stable Diffusion model and an AnimateDiff model. We can break the limit on the number of frames! See [sd_text_to_video.py](./sd_text_to_video.py).

View File

@@ -0,0 +1,73 @@
from diffsynth import ModelManager, save_video, VideoData, download_models, CogVideoPipeline
from diffsynth.extensions.RIFE import RIFEInterpolater
import torch, os
# Set before any pipeline is built.
# NOTE(review): presumably read by the Hugging Face `tokenizers` library to
# control its internal parallelism — confirm the accepted values.
os.environ["TOKENIZERS_PARALLELISM"] = "True"
def text_to_video(model_manager, prompt, seed, output_path):
    """Generate a 720x480 video from a text prompt and save it.

    Args:
        model_manager: Manager passed to
            ``CogVideoPipeline.from_model_manager`` to build the pipeline.
        prompt: Text prompt forwarded to the pipeline.
        seed: Value given to ``torch.manual_seed`` right before generation.
        output_path: Destination passed to ``save_video``.
    """
    pipeline = CogVideoPipeline.from_model_manager(model_manager)

    # Seed after the pipeline is built, immediately before sampling.
    torch.manual_seed(seed)
    frames = pipeline(
        prompt=prompt,
        height=480,
        width=720,
        cfg_scale=7.0,
        num_inference_steps=200,
    )
    save_video(frames, output_path, fps=8, quality=5)
def edit_video(model_manager, prompt, seed, input_path, output_path):
    """Re-generate an existing video under a new prompt and save the result.

    The video at ``input_path`` is passed to the pipeline as ``input_video``
    with ``denoising_strength=0.7``.

    Args:
        model_manager: Manager passed to
            ``CogVideoPipeline.from_model_manager`` to build the pipeline.
        prompt: Text prompt forwarded to the pipeline.
        seed: Value given to ``torch.manual_seed`` right before generation.
        input_path: Video file opened through ``VideoData``.
        output_path: Destination passed to ``save_video``.
    """
    pipeline = CogVideoPipeline.from_model_manager(model_manager)

    # NOTE(review): the VideoData object itself is passed here, whereas the
    # sibling helpers pass ``.raw_data()`` — confirm the pipeline accepts both.
    source_video = VideoData(video_file=input_path)

    torch.manual_seed(seed)
    frames = pipeline(
        prompt=prompt,
        height=480,
        width=720,
        cfg_scale=7.0,
        num_inference_steps=200,
        input_video=source_video,
        denoising_strength=0.7,
    )
    save_video(frames, output_path, fps=8, quality=5)
def self_upscale(model_manager, prompt, seed, input_path, output_path):
    """Upscale a video to twice the base resolution using the model itself.

    The input is loaded at 960x1440 (2x the 480x720 base size) and run through
    the pipeline again with ``denoising_strength=0.4`` and ``tiled=True``.

    Args:
        model_manager: Manager passed to
            ``CogVideoPipeline.from_model_manager`` to build the pipeline.
        prompt: Text prompt forwarded to the pipeline.
        seed: Value given to ``torch.manual_seed`` right before generation.
        input_path: Video file opened through ``VideoData``.
        output_path: Destination passed to ``save_video``.
    """
    # Doubled base resolution, shared by the loader and the pipeline call.
    target_height = 480 * 2
    target_width = 720 * 2

    pipeline = CogVideoPipeline.from_model_manager(model_manager)
    source_frames = VideoData(
        video_file=input_path,
        height=target_height,
        width=target_width,
    ).raw_data()

    torch.manual_seed(seed)
    frames = pipeline(
        prompt=prompt,
        height=target_height,
        width=target_width,
        cfg_scale=7.0,
        num_inference_steps=30,
        input_video=source_frames,
        denoising_strength=0.4,
        tiled=True,
    )
    save_video(frames, output_path, fps=8, quality=7)
def interpolate_video(model_manager, input_path, output_path):
    """Interpolate frames of a video with RIFE and save it at 32 fps.

    Args:
        model_manager: Manager passed to
            ``RIFEInterpolater.from_model_manager`` to build the interpolater.
        input_path: Video file opened through ``VideoData``.
        output_path: Destination passed to ``save_video``.
    """
    interpolater = RIFEInterpolater.from_model_manager(model_manager)
    source_frames = VideoData(video_file=input_path).raw_data()

    # NOTE(review): presumably each iteration doubles the frame count, which
    # would explain going from 8 fps to 32 fps — confirm against RIFE.
    smoothed_frames = interpolater.interpolate(source_frames, num_iter=2)
    save_video(smoothed_frames, output_path, fps=32, quality=5)
download_models(["CogVideoX-5B", "RIFE"])

# Checkpoints handed to the model manager: the CogVideoX-5b files and the
# RIFE flownet file.
checkpoint_paths = [
    "models/CogVideo/CogVideoX-5b/text_encoder",
    "models/CogVideo/CogVideoX-5b/transformer",
    "models/CogVideo/CogVideoX-5b/vae/diffusion_pytorch_model.safetensors",
    "models/RIFE/flownet.pkl",
]
model_manager = ModelManager(torch_dtype=torch.bfloat16)
model_manager.load_models(checkpoint_paths)

# Each example chains four stages, every stage reading the file written by the
# previous one: generate -> edit -> upscale -> interpolate.

# Example 1
text_to_video(
    model_manager, "an astronaut riding a horse on Mars.", 0, "1_video_1.mp4"
)
edit_video(
    model_manager, "a white robot riding a horse on Mars.", 1,
    "1_video_1.mp4", "1_video_2.mp4"
)
self_upscale(
    model_manager, "a white robot riding a horse on Mars.", 2,
    "1_video_2.mp4", "1_video_3.mp4"
)
interpolate_video(model_manager, "1_video_3.mp4", "1_video_4.mp4")

# Example 2
text_to_video(model_manager, "a dog is running.", 1, "2_video_1.mp4")
edit_video(
    model_manager, "a dog with blue collar.", 2,
    "2_video_1.mp4", "2_video_2.mp4"
)
self_upscale(
    model_manager, "a dog with blue collar.", 3,
    "2_video_2.mp4", "2_video_3.mp4"
)
interpolate_video(model_manager, "2_video_3.mp4", "2_video_4.mp4")