mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-04-13 13:05:45 +00:00
Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9f8c352a15 | ||
|
|
f88b99cb4f |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -2,6 +2,7 @@
|
|||||||
/models
|
/models
|
||||||
/scripts
|
/scripts
|
||||||
/diffusers
|
/diffusers
|
||||||
|
/.vscode
|
||||||
*.pkl
|
*.pkl
|
||||||
*.safetensors
|
*.safetensors
|
||||||
*.pth
|
*.pth
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ from ..utils.lora import GeneralLoRALoader
|
|||||||
from ..models.model_loader import ModelPool
|
from ..models.model_loader import ModelPool
|
||||||
from ..utils.controlnet import ControlNetInput
|
from ..utils.controlnet import ControlNetInput
|
||||||
from ..core.device import get_device_name, IS_NPU_AVAILABLE
|
from ..core.device import get_device_name, IS_NPU_AVAILABLE
|
||||||
|
from .template import load_template_model, load_template_data_processor
|
||||||
|
|
||||||
|
|
||||||
class PipelineUnit:
|
class PipelineUnit:
|
||||||
@@ -319,14 +320,21 @@ class BasePipeline(torch.nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
def cfg_guided_model_fn(self, model_fn, cfg_scale, inputs_shared, inputs_posi, inputs_nega, **inputs_others):
|
def cfg_guided_model_fn(self, model_fn, cfg_scale, inputs_shared, inputs_posi, inputs_nega, **inputs_others):
|
||||||
|
# Positive side forward
|
||||||
if inputs_shared.get("positive_only_lora", None) is not None:
|
if inputs_shared.get("positive_only_lora", None) is not None:
|
||||||
self.clear_lora(verbose=0)
|
|
||||||
self.load_lora(self.dit, state_dict=inputs_shared["positive_only_lora"], verbose=0)
|
self.load_lora(self.dit, state_dict=inputs_shared["positive_only_lora"], verbose=0)
|
||||||
noise_pred_posi = model_fn(**inputs_posi, **inputs_shared, **inputs_others)
|
noise_pred_posi = model_fn(**inputs_posi, **inputs_shared, **inputs_others)
|
||||||
if cfg_scale != 1.0:
|
|
||||||
if inputs_shared.get("positive_only_lora", None) is not None:
|
if inputs_shared.get("positive_only_lora", None) is not None:
|
||||||
self.clear_lora(verbose=0)
|
self.clear_lora(verbose=0)
|
||||||
|
|
||||||
|
if cfg_scale != 1.0:
|
||||||
|
# Negative side forward
|
||||||
|
if inputs_shared.get("negative_only_lora", None) is not None:
|
||||||
|
self.load_lora(self.dit, state_dict=inputs_shared["negative_only_lora"], verbose=0)
|
||||||
noise_pred_nega = model_fn(**inputs_nega, **inputs_shared, **inputs_others)
|
noise_pred_nega = model_fn(**inputs_nega, **inputs_shared, **inputs_others)
|
||||||
|
if inputs_shared.get("negative_only_lora", None) is not None:
|
||||||
|
self.clear_lora(verbose=0)
|
||||||
|
|
||||||
if isinstance(noise_pred_posi, tuple):
|
if isinstance(noise_pred_posi, tuple):
|
||||||
# Separately handling different output types of latents, eg. video and audio latents.
|
# Separately handling different output types of latents, eg. video and audio latents.
|
||||||
noise_pred = tuple(
|
noise_pred = tuple(
|
||||||
@@ -340,6 +348,14 @@ class BasePipeline(torch.nn.Module):
|
|||||||
return noise_pred
|
return noise_pred
|
||||||
|
|
||||||
|
|
||||||
|
def load_training_template_model(self, model_config: ModelConfig = None):
    """Load the Template model and its optional data processor for training.

    Does nothing when ``model_config`` is ``None``. Otherwise the model files
    are downloaded if necessary, the Template model is loaded with
    ``self.torch_dtype`` on ``self.device`` and stored as
    ``self.template_model``, and the data processor declared by the
    Template's ``model.py`` is instantiated and stored as
    ``self.template_data_processor`` (``None`` when none is declared).
    """
    if model_config is None:
        return
    model_config.download_if_necessary()
    self.template_model = load_template_model(model_config.path, torch_dtype=self.torch_dtype, device=self.device)
    # `load_template_data_processor` returns None when `model.py` has no
    # `TEMPLATE_DATA_PROCESSOR`; calling that result unconditionally raised
    # `TypeError: 'NoneType' object is not callable`.
    processor_cls = load_template_data_processor(model_config.path)
    self.template_data_processor = processor_cls() if processor_cls is not None else None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class PipelineUnitGraph:
|
class PipelineUnitGraph:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
pass
|
pass
|
||||||
|
|||||||
@@ -60,6 +60,10 @@ def add_gradient_config(parser: argparse.ArgumentParser):
|
|||||||
parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Gradient accumulation steps.")
|
parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Gradient accumulation steps.")
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
def add_template_model_config(parser: argparse.ArgumentParser):
    """Register the Template-model command-line option on ``parser``.

    Adds ``--template_model_id_or_path`` (string, default ``None``) and
    returns the same parser so the ``add_*_config`` helpers can be chained.
    """
    parser.add_argument(
        "--template_model_id_or_path",
        type=str,
        default=None,
        # Fixed typo: the option accepts a model ID *or* a path.
        help="Model ID or path of template models.",
    )
    return parser
|
||||||
|
|
||||||
def add_general_config(parser: argparse.ArgumentParser):
|
def add_general_config(parser: argparse.ArgumentParser):
|
||||||
parser = add_dataset_base_config(parser)
|
parser = add_dataset_base_config(parser)
|
||||||
parser = add_model_config(parser)
|
parser = add_model_config(parser)
|
||||||
@@ -67,4 +71,5 @@ def add_general_config(parser: argparse.ArgumentParser):
|
|||||||
parser = add_output_config(parser)
|
parser = add_output_config(parser)
|
||||||
parser = add_lora_config(parser)
|
parser = add_lora_config(parser)
|
||||||
parser = add_gradient_config(parser)
|
parser = add_gradient_config(parser)
|
||||||
|
parser = add_template_model_config(parser)
|
||||||
return parser
|
return parser
|
||||||
|
|||||||
176
diffsynth/diffusion/template.py
Normal file
176
diffsynth/diffusion/template.py
Normal file
@@ -0,0 +1,176 @@
|
|||||||
|
import torch, os, importlib, warnings, json, inspect
|
||||||
|
from typing import Dict, List, Tuple, Union
|
||||||
|
from ..core import ModelConfig, load_model
|
||||||
|
from ..core.device.npu_compatible_device import get_device_type
|
||||||
|
|
||||||
|
|
||||||
|
KVCache = Dict[str, Tuple[torch.Tensor, torch.Tensor]]
|
||||||
|
|
||||||
|
|
||||||
|
class TemplateModel(torch.nn.Module):
    """Base class for Template models.

    Subclasses are expected to override `process_inputs` and `forward`;
    both take keyword arguments only.
    """

    # No explicit `__init__`: the inherited `torch.nn.Module.__init__` is all
    # the base class needs.

    @torch.no_grad()
    def process_inputs(self, **kwargs):
        """Prepare inputs with gradients disabled; the base version returns an empty dict."""
        return dict()

    def forward(self, **kwargs):
        """Compute the Template output; the base version always raises NotImplementedError."""
        raise NotImplementedError()
|
||||||
|
|
||||||
|
|
||||||
|
def check_template_model_format(model):
    """Validate that ``model`` exposes the Template model interface.

    ``process_inputs`` and ``forward`` must both exist and must both accept
    arbitrary keyword arguments (a ``**``-parameter of any name), because the
    callers pass extra keyword arguments such as ``pipe`` to them.

    Raises:
        NotImplementedError: if a method is missing or has no ``**``-parameter.
    """
    for method_name in ("process_inputs", "forward"):
        if not hasattr(model, method_name):
            raise NotImplementedError(f"`{method_name}` is not implemented in the Template model.")
        parameters = inspect.signature(getattr(model, method_name)).parameters.values()
        # Check the parameter *kind*, not its name: `**extra` is valid, while a
        # plain parameter that merely happens to be called `kwargs` is not.
        if not any(p.kind == inspect.Parameter.VAR_KEYWORD for p in parameters):
            raise NotImplementedError(f"`**kwargs` is not included in `{method_name}`.")
|
||||||
|
|
||||||
|
|
||||||
|
def load_template_model(path, torch_dtype=torch.bfloat16, device="cuda", verbose=1):
    """Load a Template model from the directory ``path``.

    ``<path>/model.py`` is executed as a module and must define
    ``TEMPLATE_MODEL``. It may also define ``TEMPLATE_MODEL_PATH`` (weights
    location relative to ``path``) and ``TEMPLATE_MODEL_CONFIG``.

    Args:
        path: Directory containing ``model.py``.
        torch_dtype: dtype passed to ``load_model`` / ``model.to``.
        device: Device passed to ``load_model`` / ``model.to``.
        verbose: When greater than 0, print a JSON summary of what was loaded.

    Returns:
        The loaded model, after it passed ``check_template_model_format``.

    Raises:
        AttributeError: if ``model.py`` does not define ``TEMPLATE_MODEL``.
        NotImplementedError: if the model fails the format check.
    """
    # `import importlib` alone does not guarantee that the `importlib.util`
    # submodule is loaded, so import it explicitly.
    import importlib.util

    code_path = os.path.join(path, "model.py")
    spec = importlib.util.spec_from_file_location("template_model", code_path)
    module = importlib.util.module_from_spec(spec)
    # NOTE: this executes arbitrary code from `model.py`; only load trusted Templates.
    spec.loader.exec_module(module)

    model_class = module.TEMPLATE_MODEL
    template_model_path = getattr(module, "TEMPLATE_MODEL_PATH", None)
    if template_model_path is not None:
        # With `TEMPLATE_MODEL_PATH`, a pretrained model will be loaded.
        model = load_model(
            model_class=model_class,
            config=getattr(module, "TEMPLATE_MODEL_CONFIG", None),
            path=os.path.join(path, template_model_path),
            torch_dtype=torch_dtype,
            device=device,
        )
    else:
        # Without `TEMPLATE_MODEL_PATH`, a randomly initialized model or a non-model module will be loaded.
        model = model_class()
        if hasattr(model, "to"):
            model = model.to(dtype=torch_dtype, device=device)
        if hasattr(model, "eval"):
            model = model.eval()
    check_template_model_format(model)
    if verbose > 0:
        metadata = {
            "model_architecture": model_class.__name__,
            "code_path": code_path,
            "weight_path": template_model_path,
        }
        print(f"Template model loaded: {json.dumps(metadata, indent=4)}")
    return model
|
||||||
|
|
||||||
|
|
||||||
|
def load_template_data_processor(path):
    """Return the data processor declared by the Template in ``path``.

    ``<path>/model.py`` is executed as a module; the value of its
    ``TEMPLATE_DATA_PROCESSOR`` attribute is returned as-is (not called).

    Returns:
        The ``TEMPLATE_DATA_PROCESSOR`` object, or ``None`` when ``model.py``
        does not define one.
    """
    # `import importlib` alone does not guarantee that the `importlib.util`
    # submodule is loaded, so import it explicitly.
    import importlib.util

    spec = importlib.util.spec_from_file_location("template_model", os.path.join(path, "model.py"))
    module = importlib.util.module_from_spec(spec)
    # NOTE: this executes arbitrary code from `model.py`; only load trusted Templates.
    spec.loader.exec_module(module)
    return getattr(module, "TEMPLATE_DATA_PROCESSOR", None)
|
||||||
|
|
||||||
|
|
||||||
|
class TemplatePipeline(torch.nn.Module):
    """Run a set of Template models and feed their merged outputs into a pipeline.

    Every entry of an input list is a dict of keyword arguments for one
    Template model; its optional ``"model_id"`` key (default 0) selects the
    model by index in ``self.models``.
    """

    def __init__(self, models: List[TemplateModel]):
        super().__init__()
        self.models = torch.nn.ModuleList(models)

    def merge_kv_cache(self, kv_cache_list: List[KVCache]) -> KVCache:
        """Merge several KV caches into one.

        Entries sharing a name are concatenated along dim 1 in the order of
        ``kv_cache_list``; entries whose value is ``None`` are skipped.
        """
        # dict.fromkeys drops duplicate names while keeping first-seen order.
        names = dict.fromkeys(name for kv_cache in kv_cache_list for name in kv_cache)
        kv_cache_merged = {}
        for name in names:
            kv_list = [kv_cache.get(name) for kv_cache in kv_cache_list]
            kv_list = [kv for kv in kv_list if kv is not None]
            if kv_list:
                k = torch.concat([kv[0] for kv in kv_list], dim=1)
                v = torch.concat([kv[1] for kv in kv_list], dim=1)
                kv_cache_merged[name] = (k, v)
        return kv_cache_merged

    def merge_template_cache(self, template_cache_list):
        """Merge the output dicts of several Template models into one dict.

        ``"kv_cache"`` values are merged with `merge_kv_cache`. For any other
        key produced by more than one model, a conflict message is printed and
        the first value is kept. Keys are processed in sorted order.
        """
        params = sorted({param for template_cache in template_cache_list for param in template_cache})
        template_cache_merged = {}
        for param in params:
            data = [template_cache[param] for template_cache in template_cache_list if param in template_cache]
            if param == "kv_cache":
                template_cache_merged[param] = self.merge_kv_cache(data)
                continue
            if len(data) > 1:
                print(f"Conflict detected: `{param}` appears in the outputs of multiple Template models. Only the first one will be retained.")
            template_cache_merged[param] = data[0]
        return template_cache_merged

    @staticmethod
    def check_vram_config(model_config: ModelConfig):
        """Warn (once per config) if any VRAM-management field of ``model_config`` is set."""
        vram_settings = (
            model_config.offload_device, model_config.offload_dtype,
            model_config.onload_device, model_config.onload_dtype,
            model_config.preparing_device, model_config.preparing_dtype,
            model_config.computation_device, model_config.computation_dtype,
        )
        # One warning is enough; previously one was emitted per non-None field.
        if any(setting is not None for setting in vram_settings):
            warnings.warn("TemplatePipeline doesn't support VRAM management. VRAM config will be ignored.")

    @staticmethod
    def from_pretrained(
        torch_dtype: torch.dtype = torch.bfloat16,
        device: Union[str, torch.device] = get_device_type(),
        model_configs: list[ModelConfig] = None,
    ):
        """Build a pipeline by loading one Template model per entry of ``model_configs``.

        Each config is checked for unsupported VRAM settings, downloaded if
        necessary, and loaded with ``load_template_model``. ``model_configs``
        defaults to no models.
        """
        # `None` instead of a shared mutable `[]` default.
        models = []
        for model_config in model_configs or []:
            TemplatePipeline.check_vram_config(model_config)
            model_config.download_if_necessary()
            models.append(load_template_model(model_config.path, torch_dtype=torch_dtype, device=device))
        return TemplatePipeline(models)

    @torch.no_grad()
    def process_inputs(self, inputs: List[Dict], pipe=None, **kwargs):
        """Run each model's ``process_inputs`` with gradients disabled.

        Returns a list of ``(model_id, processed_inputs)`` tuples in input
        order. The whole input dict, including ``"model_id"`` when present, is
        forwarded to the model as keyword arguments.
        """
        processed = []
        for model_inputs in inputs:
            model_id = model_inputs.get("model_id", 0)
            processed.append((model_id, self.models[model_id].process_inputs(pipe=pipe, **model_inputs)))
        return processed

    def forward(self, inputs: List[Tuple[int, Dict]], pipe=None, **kwargs):
        """Call the selected Template model on each ``(model_id, inputs)`` pair and collect the outputs."""
        return [self.models[model_id](pipe=pipe, **model_inputs) for model_id, model_inputs in inputs]

    def call_single_side(self, pipe=None, inputs: List[Dict] = None):
        """Process inputs, run the Template models, and merge their outputs into one dict."""
        processed_inputs = self.process_inputs(pipe=pipe, inputs=inputs)
        template_cache_list = self.forward(pipe=pipe, inputs=processed_inputs)
        return self.merge_template_cache(template_cache_list)

    @torch.no_grad()
    def __call__(
        self,
        pipe=None,
        template_inputs: List[Dict] = None,
        negative_template_inputs: List[Dict] = None,
        **kwargs,
    ):
        """Run the Template models, then call ``pipe`` with their outputs added to ``kwargs``.

        Outputs computed from ``template_inputs`` are passed under their own
        names; outputs computed from ``negative_template_inputs`` are passed
        with a ``negative_`` prefix. Outputs that ``pipe.__call__`` does not
        accept are reported with a printed message and dropped.

        Raises:
            ValueError: if ``pipe`` is ``None``.
        """
        if pipe is None:
            # Fail before running any Template model; previously this surfaced
            # later as an AttributeError on `None.__call__`.
            raise ValueError("`pipe` must be provided to TemplatePipeline.__call__.")
        template_cache = self.call_single_side(pipe=pipe, inputs=template_inputs or [])
        negative_template_cache = self.call_single_side(pipe=pipe, inputs=negative_template_inputs or [])
        accepted_params = set(inspect.signature(pipe.__call__).parameters)
        for param, value in template_cache.items():
            if param in accepted_params:
                kwargs[param] = value
            else:
                print(f"`{param}` is not included in the inputs of `{pipe.__class__.__name__}`. This parameter will be ignored.")
        for param, value in negative_template_cache.items():
            negative_param = "negative_" + param
            if negative_param in accepted_params:
                kwargs[negative_param] = value
            else:
                print(f"`{negative_param}` is not included in the inputs of `{pipe.__class__.__name__}`. This parameter will be ignored.")
        return pipe(**kwargs)
|
||||||
@@ -6,6 +6,7 @@ from peft import LoraConfig, inject_adapter_in_model
|
|||||||
|
|
||||||
|
|
||||||
class GeneralUnit_RemoveCache(PipelineUnit):
|
class GeneralUnit_RemoveCache(PipelineUnit):
|
||||||
|
# Only used for training
|
||||||
def __init__(self, required_params=tuple(), force_remove_params_shared=tuple(), force_remove_params_posi=tuple(), force_remove_params_nega=tuple()):
|
def __init__(self, required_params=tuple(), force_remove_params_shared=tuple(), force_remove_params_posi=tuple(), force_remove_params_nega=tuple()):
|
||||||
super().__init__(take_over=True)
|
super().__init__(take_over=True)
|
||||||
self.required_params = required_params
|
self.required_params = required_params
|
||||||
@@ -27,6 +28,47 @@ class GeneralUnit_RemoveCache(PipelineUnit):
|
|||||||
return inputs_shared, inputs_posi, inputs_nega
|
return inputs_shared, inputs_posi, inputs_nega
|
||||||
|
|
||||||
|
|
||||||
|
class GeneralUnit_TemplateProcessInputs(PipelineUnit):
    """Pipeline unit that preprocesses `template_inputs` (only used for training).

    The unit reads `template_inputs` and writes the processed value back
    under the same name.
    """

    def __init__(self, data_processor):
        super().__init__(
            input_params=("template_inputs",),
            output_params=("template_inputs",),
        )
        # Optional callable applied before the Template model's own
        # `process_inputs`; `None` disables that step.
        self.data_processor = data_processor

    def process(self, pipe, template_inputs):
        """Return the processed `template_inputs`, or `{}` when `pipe` has no `template_model`."""
        if hasattr(pipe, "template_model"):
            if self.data_processor is None:
                prepared = template_inputs
            else:
                prepared = self.data_processor(**template_inputs)
            processed = pipe.template_model.process_inputs(pipe=pipe, **prepared)
            return {"template_inputs": processed}
        return {}
|
||||||
|
|
||||||
|
|
||||||
|
class GeneralUnit_TemplateForward(PipelineUnit):
    """Pipeline unit that runs the Template model's `forward` (only used for training).

    Declares `template_inputs` as input, `kv_cache` as output, and
    `template_model` as the model to onload.
    """

    def __init__(self, use_gradient_checkpointing=False, use_gradient_checkpointing_offload=False):
        super().__init__(
            input_params=("template_inputs",),
            output_params=("kv_cache",),
            onload_model_names=("template_model",)
        )
        # Both flags are forwarded unchanged to `template_model.forward`.
        self.use_gradient_checkpointing_offload = use_gradient_checkpointing_offload
        self.use_gradient_checkpointing = use_gradient_checkpointing

    def process(self, pipe, template_inputs):
        """Return whatever `template_model.forward` returns, or `{}` when `pipe` has no `template_model`."""
        if hasattr(pipe, "template_model"):
            return pipe.template_model.forward(
                pipe=pipe,
                use_gradient_checkpointing=self.use_gradient_checkpointing,
                use_gradient_checkpointing_offload=self.use_gradient_checkpointing_offload,
                **template_inputs,
            )
        return {}
|
||||||
|
|
||||||
|
|
||||||
class DiffusionTrainingModule(torch.nn.Module):
|
class DiffusionTrainingModule(torch.nn.Module):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
@@ -211,6 +253,16 @@ class DiffusionTrainingModule(torch.nn.Module):
|
|||||||
return lora_target_modules
|
return lora_target_modules
|
||||||
|
|
||||||
|
|
||||||
|
def load_training_template_model(self, pipe, path_or_model_id, use_gradient_checkpointing=False, use_gradient_checkpointing_offload=False):
    """Attach a Template model and its two training units to `pipe`.

    When `path_or_model_id` is `None`, `pipe` is returned untouched.
    Otherwise the ID/path is parsed into a model config, the pipeline loads
    the Template model from it, and a `GeneralUnit_TemplateProcessInputs`
    followed by a `GeneralUnit_TemplateForward` are appended to `pipe.units`.
    The same `pipe` object is returned in both cases.
    """
    if path_or_model_id is not None:
        template_config = self.parse_path_or_model_id(path_or_model_id)
        pipe.load_training_template_model(template_config)
        # Order matters: inputs are processed first, then the forward unit runs.
        pipe.units.append(
            GeneralUnit_TemplateProcessInputs(pipe.template_data_processor)
        )
        pipe.units.append(
            GeneralUnit_TemplateForward(use_gradient_checkpointing, use_gradient_checkpointing_offload)
        )
    return pipe
|
||||||
|
|
||||||
|
|
||||||
def switch_pipe_to_training_mode(
|
def switch_pipe_to_training_mode(
|
||||||
self,
|
self,
|
||||||
pipe,
|
pipe,
|
||||||
|
|||||||
@@ -364,78 +364,7 @@ class Flux2FeedForward(nn.Module):
|
|||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
class Flux2AttnProcessor:
|
|
||||||
_attention_backend = None
|
|
||||||
_parallel_config = None
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
if not hasattr(F, "scaled_dot_product_attention"):
|
|
||||||
raise ImportError(f"{self.__class__.__name__} requires PyTorch 2.0. Please upgrade your pytorch version.")
|
|
||||||
|
|
||||||
def __call__(
|
|
||||||
self,
|
|
||||||
attn: "Flux2Attention",
|
|
||||||
hidden_states: torch.Tensor,
|
|
||||||
encoder_hidden_states: torch.Tensor = None,
|
|
||||||
attention_mask: Optional[torch.Tensor] = None,
|
|
||||||
image_rotary_emb: Optional[torch.Tensor] = None,
|
|
||||||
) -> torch.Tensor:
|
|
||||||
query, key, value, encoder_query, encoder_key, encoder_value = _get_qkv_projections(
|
|
||||||
attn, hidden_states, encoder_hidden_states
|
|
||||||
)
|
|
||||||
|
|
||||||
query = query.unflatten(-1, (attn.heads, -1))
|
|
||||||
key = key.unflatten(-1, (attn.heads, -1))
|
|
||||||
value = value.unflatten(-1, (attn.heads, -1))
|
|
||||||
|
|
||||||
query = attn.norm_q(query)
|
|
||||||
key = attn.norm_k(key)
|
|
||||||
|
|
||||||
if attn.added_kv_proj_dim is not None:
|
|
||||||
encoder_query = encoder_query.unflatten(-1, (attn.heads, -1))
|
|
||||||
encoder_key = encoder_key.unflatten(-1, (attn.heads, -1))
|
|
||||||
encoder_value = encoder_value.unflatten(-1, (attn.heads, -1))
|
|
||||||
|
|
||||||
encoder_query = attn.norm_added_q(encoder_query)
|
|
||||||
encoder_key = attn.norm_added_k(encoder_key)
|
|
||||||
|
|
||||||
query = torch.cat([encoder_query, query], dim=1)
|
|
||||||
key = torch.cat([encoder_key, key], dim=1)
|
|
||||||
value = torch.cat([encoder_value, value], dim=1)
|
|
||||||
|
|
||||||
if image_rotary_emb is not None:
|
|
||||||
query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
|
|
||||||
key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)
|
|
||||||
|
|
||||||
query, key, value = query.to(hidden_states.dtype), key.to(hidden_states.dtype), value.to(hidden_states.dtype)
|
|
||||||
hidden_states = attention_forward(
|
|
||||||
query,
|
|
||||||
key,
|
|
||||||
value,
|
|
||||||
q_pattern="b s n d", k_pattern="b s n d", v_pattern="b s n d", out_pattern="b s n d",
|
|
||||||
)
|
|
||||||
hidden_states = hidden_states.flatten(2, 3)
|
|
||||||
hidden_states = hidden_states.to(query.dtype)
|
|
||||||
|
|
||||||
if encoder_hidden_states is not None:
|
|
||||||
encoder_hidden_states, hidden_states = hidden_states.split_with_sizes(
|
|
||||||
[encoder_hidden_states.shape[1], hidden_states.shape[1] - encoder_hidden_states.shape[1]], dim=1
|
|
||||||
)
|
|
||||||
encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
|
|
||||||
|
|
||||||
hidden_states = attn.to_out[0](hidden_states)
|
|
||||||
hidden_states = attn.to_out[1](hidden_states)
|
|
||||||
|
|
||||||
if encoder_hidden_states is not None:
|
|
||||||
return hidden_states, encoder_hidden_states
|
|
||||||
else:
|
|
||||||
return hidden_states
|
|
||||||
|
|
||||||
|
|
||||||
class Flux2Attention(torch.nn.Module):
|
class Flux2Attention(torch.nn.Module):
|
||||||
_default_processor_cls = Flux2AttnProcessor
|
|
||||||
_available_processors = [Flux2AttnProcessor]
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
query_dim: int,
|
query_dim: int,
|
||||||
@@ -449,7 +378,6 @@ class Flux2Attention(torch.nn.Module):
|
|||||||
eps: float = 1e-5,
|
eps: float = 1e-5,
|
||||||
out_dim: int = None,
|
out_dim: int = None,
|
||||||
elementwise_affine: bool = True,
|
elementwise_affine: bool = True,
|
||||||
processor=None,
|
|
||||||
):
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
@@ -485,59 +413,45 @@ class Flux2Attention(torch.nn.Module):
|
|||||||
self.add_v_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
|
self.add_v_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
|
||||||
self.to_add_out = torch.nn.Linear(self.inner_dim, query_dim, bias=out_bias)
|
self.to_add_out = torch.nn.Linear(self.inner_dim, query_dim, bias=out_bias)
|
||||||
|
|
||||||
if processor is None:
|
|
||||||
processor = self._default_processor_cls()
|
|
||||||
self.processor = processor
|
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
encoder_hidden_states: Optional[torch.Tensor] = None,
|
encoder_hidden_states: Optional[torch.Tensor] = None,
|
||||||
attention_mask: Optional[torch.Tensor] = None,
|
attention_mask: Optional[torch.Tensor] = None,
|
||||||
image_rotary_emb: Optional[torch.Tensor] = None,
|
image_rotary_emb: Optional[torch.Tensor] = None,
|
||||||
|
kv_cache = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())
|
query, key, value, encoder_query, encoder_key, encoder_value = _get_qkv_projections(
|
||||||
kwargs = {k: w for k, w in kwargs.items() if k in attn_parameters}
|
self, hidden_states, encoder_hidden_states
|
||||||
return self.processor(self, hidden_states, encoder_hidden_states, attention_mask, image_rotary_emb, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class Flux2ParallelSelfAttnProcessor:
|
|
||||||
_attention_backend = None
|
|
||||||
_parallel_config = None
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
if not hasattr(F, "scaled_dot_product_attention"):
|
|
||||||
raise ImportError(f"{self.__class__.__name__} requires PyTorch 2.0. Please upgrade your pytorch version.")
|
|
||||||
|
|
||||||
def __call__(
|
|
||||||
self,
|
|
||||||
attn: "Flux2ParallelSelfAttention",
|
|
||||||
hidden_states: torch.Tensor,
|
|
||||||
attention_mask: Optional[torch.Tensor] = None,
|
|
||||||
image_rotary_emb: Optional[torch.Tensor] = None,
|
|
||||||
) -> torch.Tensor:
|
|
||||||
# Parallel in (QKV + MLP in) projection
|
|
||||||
hidden_states = attn.to_qkv_mlp_proj(hidden_states)
|
|
||||||
qkv, mlp_hidden_states = torch.split(
|
|
||||||
hidden_states, [3 * attn.inner_dim, attn.mlp_hidden_dim * attn.mlp_mult_factor], dim=-1
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Handle the attention logic
|
query = query.unflatten(-1, (self.heads, -1))
|
||||||
query, key, value = qkv.chunk(3, dim=-1)
|
key = key.unflatten(-1, (self.heads, -1))
|
||||||
|
value = value.unflatten(-1, (self.heads, -1))
|
||||||
|
|
||||||
query = query.unflatten(-1, (attn.heads, -1))
|
query = self.norm_q(query)
|
||||||
key = key.unflatten(-1, (attn.heads, -1))
|
key = self.norm_k(key)
|
||||||
value = value.unflatten(-1, (attn.heads, -1))
|
|
||||||
|
|
||||||
query = attn.norm_q(query)
|
if self.added_kv_proj_dim is not None:
|
||||||
key = attn.norm_k(key)
|
encoder_query = encoder_query.unflatten(-1, (self.heads, -1))
|
||||||
|
encoder_key = encoder_key.unflatten(-1, (self.heads, -1))
|
||||||
|
encoder_value = encoder_value.unflatten(-1, (self.heads, -1))
|
||||||
|
|
||||||
|
encoder_query = self.norm_added_q(encoder_query)
|
||||||
|
encoder_key = self.norm_added_k(encoder_key)
|
||||||
|
|
||||||
|
query = torch.cat([encoder_query, query], dim=1)
|
||||||
|
key = torch.cat([encoder_key, key], dim=1)
|
||||||
|
value = torch.cat([encoder_value, value], dim=1)
|
||||||
|
|
||||||
if image_rotary_emb is not None:
|
if image_rotary_emb is not None:
|
||||||
query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
|
query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
|
||||||
key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)
|
key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)
|
||||||
|
|
||||||
query, key, value = query.to(hidden_states.dtype), key.to(hidden_states.dtype), value.to(hidden_states.dtype)
|
if kv_cache is not None:
|
||||||
|
key = torch.concat([key, kv_cache[0]], dim=1)
|
||||||
|
value = torch.concat([value, kv_cache[1]], dim=1)
|
||||||
hidden_states = attention_forward(
|
hidden_states = attention_forward(
|
||||||
query,
|
query,
|
||||||
key,
|
key,
|
||||||
@@ -547,30 +461,22 @@ class Flux2ParallelSelfAttnProcessor:
|
|||||||
hidden_states = hidden_states.flatten(2, 3)
|
hidden_states = hidden_states.flatten(2, 3)
|
||||||
hidden_states = hidden_states.to(query.dtype)
|
hidden_states = hidden_states.to(query.dtype)
|
||||||
|
|
||||||
# Handle the feedforward (FF) logic
|
if encoder_hidden_states is not None:
|
||||||
mlp_hidden_states = attn.mlp_act_fn(mlp_hidden_states)
|
encoder_hidden_states, hidden_states = hidden_states.split_with_sizes(
|
||||||
|
[encoder_hidden_states.shape[1], hidden_states.shape[1] - encoder_hidden_states.shape[1]], dim=1
|
||||||
|
)
|
||||||
|
encoder_hidden_states = self.to_add_out(encoder_hidden_states)
|
||||||
|
|
||||||
# Concatenate and parallel output projection
|
hidden_states = self.to_out[0](hidden_states)
|
||||||
hidden_states = torch.cat([hidden_states, mlp_hidden_states], dim=-1)
|
hidden_states = self.to_out[1](hidden_states)
|
||||||
hidden_states = attn.to_out(hidden_states)
|
|
||||||
|
|
||||||
|
if encoder_hidden_states is not None:
|
||||||
|
return hidden_states, encoder_hidden_states
|
||||||
|
else:
|
||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
class Flux2ParallelSelfAttention(torch.nn.Module):
|
class Flux2ParallelSelfAttention(torch.nn.Module):
|
||||||
"""
|
|
||||||
Flux 2 parallel self-attention for the Flux 2 single-stream transformer blocks.
|
|
||||||
|
|
||||||
This implements a parallel transformer block, where the attention QKV projections are fused to the feedforward (FF)
|
|
||||||
input projections, and the attention output projections are fused to the FF output projections. See the [ViT-22B
|
|
||||||
paper](https://arxiv.org/abs/2302.05442) for a visual depiction of this type of transformer block.
|
|
||||||
"""
|
|
||||||
|
|
||||||
_default_processor_cls = Flux2ParallelSelfAttnProcessor
|
|
||||||
_available_processors = [Flux2ParallelSelfAttnProcessor]
|
|
||||||
# Does not support QKV fusion as the QKV projections are always fused
|
|
||||||
_supports_qkv_fusion = False
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
query_dim: int,
|
query_dim: int,
|
||||||
@@ -614,20 +520,54 @@ class Flux2ParallelSelfAttention(torch.nn.Module):
|
|||||||
# Fused attention output projection + MLP output projection
|
# Fused attention output projection + MLP output projection
|
||||||
self.to_out = torch.nn.Linear(self.inner_dim + self.mlp_hidden_dim, self.out_dim, bias=out_bias)
|
self.to_out = torch.nn.Linear(self.inner_dim + self.mlp_hidden_dim, self.out_dim, bias=out_bias)
|
||||||
|
|
||||||
if processor is None:
|
|
||||||
processor = self._default_processor_cls()
|
|
||||||
self.processor = processor
|
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
attention_mask: Optional[torch.Tensor] = None,
|
attention_mask: Optional[torch.Tensor] = None,
|
||||||
image_rotary_emb: Optional[torch.Tensor] = None,
|
image_rotary_emb: Optional[torch.Tensor] = None,
|
||||||
|
kv_cache = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())
|
# Parallel in (QKV + MLP in) projection
|
||||||
kwargs = {k: w for k, w in kwargs.items() if k in attn_parameters}
|
hidden_states = self.to_qkv_mlp_proj(hidden_states)
|
||||||
return self.processor(self, hidden_states, attention_mask, image_rotary_emb, **kwargs)
|
qkv, mlp_hidden_states = torch.split(
|
||||||
|
hidden_states, [3 * self.inner_dim, self.mlp_hidden_dim * self.mlp_mult_factor], dim=-1
|
||||||
|
)
|
||||||
|
|
||||||
|
# Handle the attention logic
|
||||||
|
query, key, value = qkv.chunk(3, dim=-1)
|
||||||
|
|
||||||
|
query = query.unflatten(-1, (self.heads, -1))
|
||||||
|
key = key.unflatten(-1, (self.heads, -1))
|
||||||
|
value = value.unflatten(-1, (self.heads, -1))
|
||||||
|
|
||||||
|
query = self.norm_q(query)
|
||||||
|
key = self.norm_k(key)
|
||||||
|
|
||||||
|
if image_rotary_emb is not None:
|
||||||
|
query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
|
||||||
|
key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)
|
||||||
|
|
||||||
|
if kv_cache is not None:
|
||||||
|
key = torch.concat([key, kv_cache[0]], dim=1)
|
||||||
|
value = torch.concat([value, kv_cache[1]], dim=1)
|
||||||
|
hidden_states = attention_forward(
|
||||||
|
query,
|
||||||
|
key,
|
||||||
|
value,
|
||||||
|
q_pattern="b s n d", k_pattern="b s n d", v_pattern="b s n d", out_pattern="b s n d",
|
||||||
|
)
|
||||||
|
hidden_states = hidden_states.flatten(2, 3)
|
||||||
|
hidden_states = hidden_states.to(query.dtype)
|
||||||
|
|
||||||
|
# Handle the feedforward (FF) logic
|
||||||
|
mlp_hidden_states = self.mlp_act_fn(mlp_hidden_states)
|
||||||
|
|
||||||
|
# Concatenate and parallel output projection
|
||||||
|
hidden_states = torch.cat([hidden_states, mlp_hidden_states], dim=-1)
|
||||||
|
hidden_states = self.to_out(hidden_states)
|
||||||
|
|
||||||
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
class Flux2SingleTransformerBlock(nn.Module):
|
class Flux2SingleTransformerBlock(nn.Module):
|
||||||
@@ -657,7 +597,6 @@ class Flux2SingleTransformerBlock(nn.Module):
|
|||||||
eps=eps,
|
eps=eps,
|
||||||
mlp_ratio=mlp_ratio,
|
mlp_ratio=mlp_ratio,
|
||||||
mlp_mult_factor=2,
|
mlp_mult_factor=2,
|
||||||
processor=Flux2ParallelSelfAttnProcessor(),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
@@ -669,6 +608,7 @@ class Flux2SingleTransformerBlock(nn.Module):
|
|||||||
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
|
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||||
split_hidden_states: bool = False,
|
split_hidden_states: bool = False,
|
||||||
text_seq_len: Optional[int] = None,
|
text_seq_len: Optional[int] = None,
|
||||||
|
kv_cache = None,
|
||||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||||
# If encoder_hidden_states is None, hidden_states is assumed to have encoder_hidden_states already
|
# If encoder_hidden_states is None, hidden_states is assumed to have encoder_hidden_states already
|
||||||
# concatenated
|
# concatenated
|
||||||
@@ -685,6 +625,7 @@ class Flux2SingleTransformerBlock(nn.Module):
|
|||||||
attn_output = self.attn(
|
attn_output = self.attn(
|
||||||
hidden_states=norm_hidden_states,
|
hidden_states=norm_hidden_states,
|
||||||
image_rotary_emb=image_rotary_emb,
|
image_rotary_emb=image_rotary_emb,
|
||||||
|
kv_cache=kv_cache,
|
||||||
**joint_attention_kwargs,
|
**joint_attention_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -725,7 +666,6 @@ class Flux2TransformerBlock(nn.Module):
|
|||||||
added_proj_bias=bias,
|
added_proj_bias=bias,
|
||||||
out_bias=bias,
|
out_bias=bias,
|
||||||
eps=eps,
|
eps=eps,
|
||||||
processor=Flux2AttnProcessor(),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
|
self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
|
||||||
@@ -742,6 +682,7 @@ class Flux2TransformerBlock(nn.Module):
|
|||||||
temb_mod_params_txt: Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...],
|
temb_mod_params_txt: Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...],
|
||||||
image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
|
image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
|
||||||
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
|
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||||
|
kv_cache = None,
|
||||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||||
joint_attention_kwargs = joint_attention_kwargs or {}
|
joint_attention_kwargs = joint_attention_kwargs or {}
|
||||||
|
|
||||||
@@ -762,6 +703,7 @@ class Flux2TransformerBlock(nn.Module):
|
|||||||
hidden_states=norm_hidden_states,
|
hidden_states=norm_hidden_states,
|
||||||
encoder_hidden_states=norm_encoder_hidden_states,
|
encoder_hidden_states=norm_encoder_hidden_states,
|
||||||
image_rotary_emb=image_rotary_emb,
|
image_rotary_emb=image_rotary_emb,
|
||||||
|
kv_cache=kv_cache,
|
||||||
**joint_attention_kwargs,
|
**joint_attention_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -969,6 +911,7 @@ class Flux2DiT(torch.nn.Module):
|
|||||||
txt_ids: torch.Tensor = None,
|
txt_ids: torch.Tensor = None,
|
||||||
guidance: torch.Tensor = None,
|
guidance: torch.Tensor = None,
|
||||||
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
|
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||||
|
kv_cache = None,
|
||||||
use_gradient_checkpointing=False,
|
use_gradient_checkpointing=False,
|
||||||
use_gradient_checkpointing_offload=False,
|
use_gradient_checkpointing_offload=False,
|
||||||
):
|
):
|
||||||
@@ -1013,7 +956,7 @@ class Flux2DiT(torch.nn.Module):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# 4. Double Stream Transformer Blocks
|
# 4. Double Stream Transformer Blocks
|
||||||
for index_block, block in enumerate(self.transformer_blocks):
|
for block_id, block in enumerate(self.transformer_blocks):
|
||||||
encoder_hidden_states, hidden_states = gradient_checkpoint_forward(
|
encoder_hidden_states, hidden_states = gradient_checkpoint_forward(
|
||||||
block,
|
block,
|
||||||
use_gradient_checkpointing=use_gradient_checkpointing,
|
use_gradient_checkpointing=use_gradient_checkpointing,
|
||||||
@@ -1024,12 +967,13 @@ class Flux2DiT(torch.nn.Module):
|
|||||||
temb_mod_params_txt=double_stream_mod_txt,
|
temb_mod_params_txt=double_stream_mod_txt,
|
||||||
image_rotary_emb=concat_rotary_emb,
|
image_rotary_emb=concat_rotary_emb,
|
||||||
joint_attention_kwargs=joint_attention_kwargs,
|
joint_attention_kwargs=joint_attention_kwargs,
|
||||||
|
kv_cache=None if kv_cache is None else kv_cache.get(f"double_{block_id}"),
|
||||||
)
|
)
|
||||||
# Concatenate text and image streams for single-block inference
|
# Concatenate text and image streams for single-block inference
|
||||||
hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
|
hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
|
||||||
|
|
||||||
# 5. Single Stream Transformer Blocks
|
# 5. Single Stream Transformer Blocks
|
||||||
for index_block, block in enumerate(self.single_transformer_blocks):
|
for block_id, block in enumerate(self.single_transformer_blocks):
|
||||||
hidden_states = gradient_checkpoint_forward(
|
hidden_states = gradient_checkpoint_forward(
|
||||||
block,
|
block,
|
||||||
use_gradient_checkpointing=use_gradient_checkpointing,
|
use_gradient_checkpointing=use_gradient_checkpointing,
|
||||||
@@ -1039,6 +983,7 @@ class Flux2DiT(torch.nn.Module):
|
|||||||
temb_mod_params=single_stream_mod,
|
temb_mod_params=single_stream_mod,
|
||||||
image_rotary_emb=concat_rotary_emb,
|
image_rotary_emb=concat_rotary_emb,
|
||||||
joint_attention_kwargs=joint_attention_kwargs,
|
joint_attention_kwargs=joint_attention_kwargs,
|
||||||
|
kv_cache=None if kv_cache is None else kv_cache.get(f"single_{block_id}"),
|
||||||
)
|
)
|
||||||
# Remove text tokens from concatenated stream
|
# Remove text tokens from concatenated stream
|
||||||
hidden_states = hidden_states[:, num_txt_tokens:, ...]
|
hidden_states = hidden_states[:, num_txt_tokens:, ...]
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ class Flux2ImagePipeline(BasePipeline):
|
|||||||
Flux2Unit_InputImageEmbedder(),
|
Flux2Unit_InputImageEmbedder(),
|
||||||
Flux2Unit_EditImageEmbedder(),
|
Flux2Unit_EditImageEmbedder(),
|
||||||
Flux2Unit_ImageIDs(),
|
Flux2Unit_ImageIDs(),
|
||||||
|
Flux2Unit_Inpaint(),
|
||||||
]
|
]
|
||||||
self.model_fn = model_fn_flux2
|
self.model_fn = model_fn_flux2
|
||||||
|
|
||||||
@@ -93,6 +94,16 @@ class Flux2ImagePipeline(BasePipeline):
|
|||||||
initial_noise: torch.Tensor = None,
|
initial_noise: torch.Tensor = None,
|
||||||
# Steps
|
# Steps
|
||||||
num_inference_steps: int = 30,
|
num_inference_steps: int = 30,
|
||||||
|
# KV Cache
|
||||||
|
kv_cache = None,
|
||||||
|
negative_kv_cache = None,
|
||||||
|
# LoRA
|
||||||
|
lora = None,
|
||||||
|
negative_lora = None,
|
||||||
|
# Inpaint
|
||||||
|
inpaint_mask: Image.Image = None,
|
||||||
|
inpaint_blur_size: int = None,
|
||||||
|
inpaint_blur_sigma: float = None,
|
||||||
# Progress bar
|
# Progress bar
|
||||||
progress_bar_cmd = tqdm,
|
progress_bar_cmd = tqdm,
|
||||||
):
|
):
|
||||||
@@ -101,9 +112,11 @@ class Flux2ImagePipeline(BasePipeline):
|
|||||||
# Parameters
|
# Parameters
|
||||||
inputs_posi = {
|
inputs_posi = {
|
||||||
"prompt": prompt,
|
"prompt": prompt,
|
||||||
|
"kv_cache": kv_cache,
|
||||||
}
|
}
|
||||||
inputs_nega = {
|
inputs_nega = {
|
||||||
"negative_prompt": negative_prompt,
|
"negative_prompt": negative_prompt,
|
||||||
|
"kv_cache": negative_kv_cache,
|
||||||
}
|
}
|
||||||
inputs_shared = {
|
inputs_shared = {
|
||||||
"cfg_scale": cfg_scale, "embedded_guidance": embedded_guidance,
|
"cfg_scale": cfg_scale, "embedded_guidance": embedded_guidance,
|
||||||
@@ -112,6 +125,9 @@ class Flux2ImagePipeline(BasePipeline):
|
|||||||
"height": height, "width": width,
|
"height": height, "width": width,
|
||||||
"seed": seed, "rand_device": rand_device, "initial_noise": initial_noise,
|
"seed": seed, "rand_device": rand_device, "initial_noise": initial_noise,
|
||||||
"num_inference_steps": num_inference_steps,
|
"num_inference_steps": num_inference_steps,
|
||||||
|
"positive_only_lora": lora,
|
||||||
|
"negative_only_lora": negative_lora,
|
||||||
|
"inpaint_mask": inpaint_mask, "inpaint_blur_size": inpaint_blur_size, "inpaint_blur_sigma": inpaint_blur_sigma,
|
||||||
}
|
}
|
||||||
for unit in self.units:
|
for unit in self.units:
|
||||||
inputs_shared, inputs_posi, inputs_nega = self.unit_runner(unit, self, inputs_shared, inputs_posi, inputs_nega)
|
inputs_shared, inputs_posi, inputs_nega = self.unit_runner(unit, self, inputs_shared, inputs_posi, inputs_nega)
|
||||||
@@ -560,6 +576,26 @@ class Flux2Unit_ImageIDs(PipelineUnit):
|
|||||||
return {"image_ids": image_ids}
|
return {"image_ids": image_ids}
|
||||||
|
|
||||||
|
|
||||||
|
class Flux2Unit_Inpaint(PipelineUnit):
    """Pipeline unit that converts an inpaint mask image into a flattened mask tensor.

    Reads ``inpaint_mask``, ``height``, ``width``, ``inpaint_blur_size`` and
    ``inpaint_blur_sigma`` from the pipeline inputs and overwrites
    ``inpaint_mask`` with the processed tensor.
    """

    def __init__(self):
        super().__init__(
            input_params=("inpaint_mask", "height", "width", "inpaint_blur_size", "inpaint_blur_sigma"),
            output_params=("inpaint_mask",),
        )

    def process(self, pipe: Flux2ImagePipeline, inpaint_mask, height, width, inpaint_blur_size, inpaint_blur_sigma):
        """Return ``{"inpaint_mask": tensor}``, or ``{}`` when no mask was supplied.

        The mask image is converted to RGB and resized to
        ``(width // 16, height // 16)`` before being handed to
        ``pipe.preprocess_image`` with ``min_value=0, max_value=1``.
        """
        # Nothing to contribute when the caller did not request inpainting.
        if inpaint_mask is None:
            return {}

        # Downscale the mask image by a factor of 16 in both directions.
        grid_size = (width // 16, height // 16)
        resized_image = inpaint_mask.convert("RGB").resize(grid_size)
        mask = pipe.preprocess_image(resized_image, min_value=0, max_value=1)

        # Average over dim 1 (the "C" axis of the "B C H W" layout used below).
        mask = mask.mean(dim=1, keepdim=True)

        # Blur is applied only when BOTH the size and the sigma are given.
        blur_requested = not (inpaint_blur_size is None or inpaint_blur_sigma is None)
        if blur_requested:
            # Imported lazily so torchvision is needed only when blurring.
            from torchvision.transforms import GaussianBlur
            # Kernel size 2 * size + 1 is always odd.
            mask = GaussianBlur(kernel_size=inpaint_blur_size * 2 + 1, sigma=inpaint_blur_sigma)(mask)

        # Flatten the spatial grid into a sequence axis.
        return {"inpaint_mask": rearrange(mask, "B C H W -> B (H W) C")}
|
||||||
|
|
||||||
|
|
||||||
def model_fn_flux2(
|
def model_fn_flux2(
|
||||||
dit: Flux2DiT,
|
dit: Flux2DiT,
|
||||||
latents=None,
|
latents=None,
|
||||||
@@ -570,6 +606,7 @@ def model_fn_flux2(
|
|||||||
image_ids=None,
|
image_ids=None,
|
||||||
edit_latents=None,
|
edit_latents=None,
|
||||||
edit_image_ids=None,
|
edit_image_ids=None,
|
||||||
|
kv_cache=None,
|
||||||
use_gradient_checkpointing=False,
|
use_gradient_checkpointing=False,
|
||||||
use_gradient_checkpointing_offload=False,
|
use_gradient_checkpointing_offload=False,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
@@ -587,6 +624,7 @@ def model_fn_flux2(
|
|||||||
encoder_hidden_states=prompt_embeds,
|
encoder_hidden_states=prompt_embeds,
|
||||||
txt_ids=text_ids,
|
txt_ids=text_ids,
|
||||||
img_ids=image_ids,
|
img_ids=image_ids,
|
||||||
|
kv_cache=kv_cache,
|
||||||
use_gradient_checkpointing=use_gradient_checkpointing,
|
use_gradient_checkpointing=use_gradient_checkpointing,
|
||||||
use_gradient_checkpointing_offload=use_gradient_checkpointing_offload,
|
use_gradient_checkpointing_offload=use_gradient_checkpointing_offload,
|
||||||
)
|
)
|
||||||
|
|||||||
256
examples/flux2/model_inference/Template-KleinBase4B.py
Normal file
256
examples/flux2/model_inference/Template-KleinBase4B.py
Normal file
@@ -0,0 +1,256 @@
|
|||||||
|
from diffsynth.diffusion.template import TemplatePipeline
|
||||||
|
from diffsynth.pipelines.flux2_image import Flux2ImagePipeline, ModelConfig
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
def load_template_pipeline(model_ids):
    """Create a ``TemplatePipeline`` from one ``ModelConfig`` per entry of ``model_ids``.

    The pipeline is requested with ``torch.bfloat16`` weights on the
    ``"cuda"`` device.
    """
    template_configs = [ModelConfig(model_id=model_id) for model_id in model_ids]
    return TemplatePipeline.from_pretrained(
        torch_dtype=torch.bfloat16,
        device="cuda",
        model_configs=template_configs,
    )
|
||||||
|
|
||||||
|
# Base Model
|
||||||
|
# The transformer comes from the "klein-base-4B" repository; the text encoder,
# VAE and tokenizer come from the "klein-4B" repository.
pipe = Flux2ImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id=repo_id, origin_file_pattern=file_pattern)
        for repo_id, file_pattern in (
            ("black-forest-labs/FLUX.2-klein-base-4B", "transformer/*.safetensors"),
            ("black-forest-labs/FLUX.2-klein-4B", "text_encoder/*.safetensors"),
            ("black-forest-labs/FLUX.2-klein-4B", "vae/diffusion_pytorch_model.safetensors"),
        )
    ],
    tokenizer_config=ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="tokenizer/"),
)
|
||||||
|
# image = pipe(
|
||||||
|
# prompt="A cat is sitting on a stone.",
|
||||||
|
# seed=0, cfg_scale=4, num_inference_steps=50,
|
||||||
|
# )
|
||||||
|
# image.save("image_base.jpg")
|
||||||
|
|
||||||
|
# template = load_template_pipeline(["DiffSynth-Studio/Template-KleinBase4B-Brightness"])
|
||||||
|
# image = template(
|
||||||
|
# pipe,
|
||||||
|
# prompt="A cat is sitting on a stone.",
|
||||||
|
# seed=0, cfg_scale=4, num_inference_steps=50,
|
||||||
|
# template_inputs = [{"scale": 0.7}],
|
||||||
|
# negative_template_inputs = [{"scale": 0.5}]
|
||||||
|
# )
|
||||||
|
# image.save("image_Brightness_light.jpg")
|
||||||
|
# image = template(
|
||||||
|
# pipe,
|
||||||
|
# prompt="A cat is sitting on a stone.",
|
||||||
|
# seed=0, cfg_scale=4, num_inference_steps=50,
|
||||||
|
# template_inputs = [{"scale": 0.5}],
|
||||||
|
# negative_template_inputs = [{"scale": 0.5}]
|
||||||
|
# )
|
||||||
|
# image.save("image_Brightness_normal.jpg")
|
||||||
|
# image = template(
|
||||||
|
# pipe,
|
||||||
|
# prompt="A cat is sitting on a stone.",
|
||||||
|
# seed=0, cfg_scale=4, num_inference_steps=50,
|
||||||
|
# template_inputs = [{"scale": 0.3}],
|
||||||
|
# negative_template_inputs = [{"scale": 0.5}]
|
||||||
|
# )
|
||||||
|
# image.save("image_Brightness_dark.jpg")
|
||||||
|
|
||||||
|
# template = load_template_pipeline(["DiffSynth-Studio/Template-KleinBase4B-ControlNet"])
|
||||||
|
# image = template(
|
||||||
|
# pipe,
|
||||||
|
# prompt="A cat is sitting on a stone, bathed in bright sunshine.",
|
||||||
|
# seed=0, cfg_scale=4, num_inference_steps=50,
|
||||||
|
# template_inputs = [{
|
||||||
|
# "image": Image.open("data/assets/image_depth.jpg"),
|
||||||
|
# "prompt": "A cat is sitting on a stone, bathed in bright sunshine.",
|
||||||
|
# }],
|
||||||
|
# negative_template_inputs = [{
|
||||||
|
# "image": Image.open("data/assets/image_depth.jpg"),
|
||||||
|
# "prompt": "",
|
||||||
|
# }],
|
||||||
|
# )
|
||||||
|
# image.save("image_ControlNet_sunshine.jpg")
|
||||||
|
# image = template(
|
||||||
|
# pipe,
|
||||||
|
# prompt="A cat is sitting on a stone, surrounded by colorful magical particles.",
|
||||||
|
# seed=0, cfg_scale=4, num_inference_steps=50,
|
||||||
|
# template_inputs = [{
|
||||||
|
# "image": Image.open("data/assets/image_depth.jpg"),
|
||||||
|
# "prompt": "A cat is sitting on a stone, surrounded by colorful magical particles.",
|
||||||
|
# }],
|
||||||
|
# negative_template_inputs = [{
|
||||||
|
# "image": Image.open("data/assets/image_depth.jpg"),
|
||||||
|
# "prompt": "",
|
||||||
|
# }],
|
||||||
|
# )
|
||||||
|
# image.save("image_ControlNet_magic.jpg")
|
||||||
|
|
||||||
|
# template = load_template_pipeline(["DiffSynth-Studio/Template-KleinBase4B-Edit"])
|
||||||
|
# image = template(
|
||||||
|
# pipe,
|
||||||
|
# prompt="Put a hat on this cat.",
|
||||||
|
# seed=0, cfg_scale=4, num_inference_steps=50,
|
||||||
|
# template_inputs = [{
|
||||||
|
# "image": Image.open("data/assets/image_reference.jpg"),
|
||||||
|
# "prompt": "Put a hat on this cat.",
|
||||||
|
# }],
|
||||||
|
# negative_template_inputs = [{
|
||||||
|
# "image": Image.open("data/assets/image_reference.jpg"),
|
||||||
|
# "prompt": "",
|
||||||
|
# }],
|
||||||
|
# )
|
||||||
|
# image.save("image_Edit_hat.jpg")
|
||||||
|
# image = template(
|
||||||
|
# pipe,
|
||||||
|
# prompt="Make the cat turn its head to look to the right.",
|
||||||
|
# seed=0, cfg_scale=4, num_inference_steps=50,
|
||||||
|
# template_inputs = [{
|
||||||
|
# "image": Image.open("data/assets/image_reference.jpg"),
|
||||||
|
# "prompt": "Make the cat turn its head to look to the right.",
|
||||||
|
# }],
|
||||||
|
# negative_template_inputs = [{
|
||||||
|
# "image": Image.open("data/assets/image_reference.jpg"),
|
||||||
|
# "prompt": "",
|
||||||
|
# }],
|
||||||
|
# )
|
||||||
|
# image.save("image_Edit_head.jpg")
|
||||||
|
|
||||||
|
# template = load_template_pipeline(["DiffSynth-Studio/Template-KleinBase4B-Upscaler"])
|
||||||
|
# image = template(
|
||||||
|
# pipe,
|
||||||
|
# prompt="A cat is sitting on a stone.",
|
||||||
|
# seed=0, cfg_scale=4, num_inference_steps=50,
|
||||||
|
# template_inputs = [{
|
||||||
|
# "image": Image.open("data/assets/image_lowres_512.jpg"),
|
||||||
|
# "prompt": "A cat is sitting on a stone.",
|
||||||
|
# }],
|
||||||
|
# negative_template_inputs = [{
|
||||||
|
# "image": Image.open("data/assets/image_lowres_512.jpg"),
|
||||||
|
# "prompt": "",
|
||||||
|
# }],
|
||||||
|
# )
|
||||||
|
# image.save("image_Upscaler_1.png")
|
||||||
|
# image = template(
|
||||||
|
# pipe,
|
||||||
|
# prompt="A cat is sitting on a stone.",
|
||||||
|
# seed=0, cfg_scale=4, num_inference_steps=50,
|
||||||
|
# template_inputs = [{
|
||||||
|
# "image": Image.open("data/assets/image_lowres_100.jpg"),
|
||||||
|
# "prompt": "A cat is sitting on a stone.",
|
||||||
|
# }],
|
||||||
|
# negative_template_inputs = [{
|
||||||
|
# "image": Image.open("data/assets/image_lowres_100.jpg"),
|
||||||
|
# "prompt": "",
|
||||||
|
# }],
|
||||||
|
# )
|
||||||
|
# image.save("image_Upscaler_2.png")
|
||||||
|
|
||||||
|
# template = load_template_pipeline(["DiffSynth-Studio/Template-KleinBase4B-SoftRGB"])
|
||||||
|
# image = template(
|
||||||
|
# pipe,
|
||||||
|
# prompt="A cat is sitting on a stone.",
|
||||||
|
# seed=0, cfg_scale=4, num_inference_steps=50,
|
||||||
|
# template_inputs = [{
|
||||||
|
# "R": 128/255,
|
||||||
|
# "G": 128/255,
|
||||||
|
# "B": 128/255
|
||||||
|
# }],
|
||||||
|
# )
|
||||||
|
# image.save("image_rgb_normal.jpg")
|
||||||
|
# image = template(
|
||||||
|
# pipe,
|
||||||
|
# prompt="A cat is sitting on a stone.",
|
||||||
|
# seed=0, cfg_scale=4, num_inference_steps=50,
|
||||||
|
# template_inputs = [{
|
||||||
|
# "R": 208/255,
|
||||||
|
# "G": 185/255,
|
||||||
|
# "B": 138/255
|
||||||
|
# }],
|
||||||
|
# )
|
||||||
|
# image.save("image_rgb_warm.jpg")
|
||||||
|
# image = template(
|
||||||
|
# pipe,
|
||||||
|
# prompt="A cat is sitting on a stone.",
|
||||||
|
# seed=0, cfg_scale=4, num_inference_steps=50,
|
||||||
|
# template_inputs = [{
|
||||||
|
# "R": 94/255,
|
||||||
|
# "G": 163/255,
|
||||||
|
# "B": 174/255
|
||||||
|
# }],
|
||||||
|
# )
|
||||||
|
# image.save("image_rgb_cold.jpg")
|
||||||
|
|
||||||
|
# template = load_template_pipeline(["DiffSynth-Studio/Template-KleinBase4B-PandaMeme"])
|
||||||
|
# image = template(
|
||||||
|
# pipe,
|
||||||
|
# prompt="A meme with a sleepy expression.",
|
||||||
|
# seed=0, cfg_scale=4, num_inference_steps=50,
|
||||||
|
# template_inputs = [{}],
|
||||||
|
# negative_template_inputs = [{}],
|
||||||
|
# )
|
||||||
|
# image.save("image_PandaMeme_sleepy.jpg")
|
||||||
|
# image = template(
|
||||||
|
# pipe,
|
||||||
|
# prompt="A meme with a happy expression.",
|
||||||
|
# seed=0, cfg_scale=4, num_inference_steps=50,
|
||||||
|
# template_inputs = [{}],
|
||||||
|
# negative_template_inputs = [{}],
|
||||||
|
# )
|
||||||
|
# image.save("image_PandaMeme_happy.jpg")
|
||||||
|
# image = template(
|
||||||
|
# pipe,
|
||||||
|
# prompt="A meme with a surprised expression.",
|
||||||
|
# seed=0, cfg_scale=4, num_inference_steps=50,
|
||||||
|
# template_inputs = [{}],
|
||||||
|
# negative_template_inputs = [{}],
|
||||||
|
# )
|
||||||
|
# image.save("image_PandaMeme_surprised.jpg")
|
||||||
|
|
||||||
|
# template = load_template_pipeline(["DiffSynth-Studio/Template-KleinBase4B-Sharpness"])
|
||||||
|
# image = template(
|
||||||
|
# pipe,
|
||||||
|
# prompt="A cat is sitting on a stone.",
|
||||||
|
# seed=0, cfg_scale=4, num_inference_steps=50,
|
||||||
|
# template_inputs = [{"scale": 0.1}],
|
||||||
|
# negative_template_inputs = [{"scale": 0.5}],
|
||||||
|
# )
|
||||||
|
# image.save("image_Sharpness_0.1.jpg")
|
||||||
|
# image = template(
|
||||||
|
# pipe,
|
||||||
|
# prompt="A cat is sitting on a stone.",
|
||||||
|
# seed=0, cfg_scale=4, num_inference_steps=50,
|
||||||
|
# template_inputs = [{"scale": 0.8}],
|
||||||
|
# negative_template_inputs = [{"scale": 0.5}],
|
||||||
|
# )
|
||||||
|
# image.save("image_Sharpness_0.8.jpg")
|
||||||
|
|
||||||
|
# template = load_template_pipeline(["DiffSynth-Studio/Template-KleinBase4B-Inpaint"])
|
||||||
|
# image = template(
|
||||||
|
# pipe,
|
||||||
|
# prompt="An orange cat is sitting on a stone.",
|
||||||
|
# seed=0, cfg_scale=4, num_inference_steps=50,
|
||||||
|
# template_inputs = [{
|
||||||
|
# "image": Image.open("data/assets/image_reference.jpg"),
|
||||||
|
# "mask": Image.open("data/assets/image_mask_1.jpg"),
|
||||||
|
# "force_inpaint": True,
|
||||||
|
# }],
|
||||||
|
# negative_template_inputs = [{
|
||||||
|
# "image": Image.open("data/assets/image_reference.jpg"),
|
||||||
|
# "mask": Image.open("data/assets/image_mask_1.jpg"),
|
||||||
|
# }],
|
||||||
|
# )
|
||||||
|
# image.save("image_Inpaint_1.jpg")
|
||||||
|
# image = template(
|
||||||
|
# pipe,
|
||||||
|
# prompt="A cat wearing sunglasses is sitting on a stone.",
|
||||||
|
# seed=0, cfg_scale=4, num_inference_steps=50,
|
||||||
|
# template_inputs = [{
|
||||||
|
# "image": Image.open("data/assets/image_reference.jpg"),
|
||||||
|
# "mask": Image.open("data/assets/image_mask_2.jpg"),
|
||||||
|
# }],
|
||||||
|
# negative_template_inputs = [{
|
||||||
|
# "image": Image.open("data/assets/image_reference.jpg"),
|
||||||
|
# "mask": Image.open("data/assets/image_mask_2.jpg"),
|
||||||
|
# }],
|
||||||
|
# )
|
||||||
|
# image.save("image_Inpaint_2.jpg")
|
||||||
17
examples/flux2/model_training/full/Template-KleinBase4B.sh
Normal file
17
examples/flux2/model_training/full/Template-KleinBase4B.sh
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
# Launch examples/flux2/model_training/train.py through `accelerate`, with
# "template_model" as the only trainable model and "template_inputs" passed
# as an extra input.
# NOTE(review): the `xxx` values (dataset paths, template model id/path) look
# like placeholders that must be replaced before running - confirm.
# Checkpoints are written under ./models/train/Template-KleinBase4B_full with
# the "pipe.template_model." prefix removed from parameter names.
accelerate launch examples/flux2/model_training/train.py \
--dataset_base_path xxx \
--dataset_metadata_path xxx/metadata.jsonl \
--extra_inputs "template_inputs" \
--max_pixels 1048576 \
--dataset_repeat 1 \
--model_id_with_origin_paths "black-forest-labs/FLUX.2-klein-4B:text_encoder/*.safetensors,black-forest-labs/FLUX.2-klein-base-4B:transformer/*.safetensors,black-forest-labs/FLUX.2-klein-4B:vae/diffusion_pytorch_model.safetensors" \
--template_model_id_or_path "xxx" \
--tokenizer_path "black-forest-labs/FLUX.2-klein-4B:tokenizer/" \
--learning_rate 1e-4 \
--num_epochs 999 \
--remove_prefix_in_ckpt "pipe.template_model." \
--output_path "./models/train/Template-KleinBase4B_full" \
--trainable_models "template_model" \
--save_steps 1000 \
--use_gradient_checkpointing \
--find_unused_parameters
|
||||||
@@ -0,0 +1,60 @@
|
|||||||
|
from diffsynth import load_state_dict
|
||||||
|
from safetensors.torch import save_file
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
def Flux2DiTStateDictConverter(state_dict):
    """Return a new state dict whose keys are remapped and whose Q/K/V weights are fused.

    * A few top-level embedder keys are renamed through a fixed table.
    * For keys starting with ``transformer_blocks``, the separate
      ``to_q/to_k/to_v`` and ``add_q_proj/add_k_proj/add_v_proj`` weights are
      concatenated along dim 0 into ``img_to_qkv`` and ``txt_to_qkv``, and
      the remaining attention / feed-forward keys get ``img_`` / ``txt_``
      names.
    * For keys starting with ``single_transformer_blocks``, the ``.attn.``
      path component is dropped.
    * Every other key is copied unchanged.

    The input mapping is not modified; output keys follow the input order.
    """
    top_level_renames = {
        "time_guidance_embed.timestep_embedder.linear_1.weight": "time_guidance_embed.timestep_embedder.0.weight",
        "time_guidance_embed.timestep_embedder.linear_2.weight": "time_guidance_embed.timestep_embedder.2.weight",
        "x_embedder.weight": "img_embedder.weight",
        "context_embedder.weight": "txt_embedder.weight",
    }
    # (suffix of the source key, text that replaces it) for double-stream blocks.
    suffix_renames = (
        ("attn.to_out.0.weight", "img_to_out.weight"),
        ("attn.norm_q.weight", "img_norm_q.weight"),
        ("attn.norm_k.weight", "img_norm_k.weight"),
        ("attn.norm_added_q.weight", "txt_norm_q.weight"),
        ("attn.norm_added_k.weight", "txt_norm_k.weight"),
        ("attn.to_add_out.weight", "txt_to_out.weight"),
    )

    def fuse_qkv(query_key, q_tag, k_tag, v_tag):
        # Stack the query, key and value projections along the output dimension.
        return torch.concat([
            state_dict[query_key],
            state_dict[query_key.replace(q_tag, k_tag)],
            state_dict[query_key.replace(q_tag, v_tag)],
        ], dim=0)

    converted = {}
    for key, tensor in state_dict.items():
        if key in top_level_renames:
            converted[top_level_renames[key]] = tensor
            continue
        if key.startswith("single_transformer_blocks"):
            converted[key.replace(".attn.", ".")] = tensor
            continue
        if not key.startswith("transformer_blocks"):
            converted[key] = tensor
            continue

        # From here on the key belongs to a double-stream transformer block.
        if key.endswith("attn.to_q.weight"):
            fused_key = key.replace("to_q", "img_to_qkv").replace(".attn.", ".")
            converted[fused_key] = fuse_qkv(key, "to_q", "to_k", "to_v")
            continue
        if key.endswith(("attn.to_k.weight", "attn.to_v.weight")):
            # Already folded into img_to_qkv when the matching to_q key is handled.
            continue

        renamed = next(
            (key.replace(old, new) for old, new in suffix_renames if key.endswith(old)),
            None,
        )
        if renamed is not None:
            converted[renamed] = tensor
            continue

        if key.endswith("attn.add_q_proj.weight"):
            fused_key = key.replace("add_q_proj", "txt_to_qkv").replace(".attn.", ".")
            converted[fused_key] = fuse_qkv(key, "add_q_proj", "add_k_proj", "add_v_proj")
        elif ".ff." in key:
            converted[key.replace(".ff.", ".img_ff.")] = tensor
        elif ".ff_context." in key:
            converted[key.replace(".ff_context.", ".txt_ff.")] = tensor
        elif key.endswith(("attn.add_k_proj.weight", "attn.add_v_proj.weight")):
            # Already folded into txt_to_qkv when the matching add_q_proj key is handled.
            continue
        else:
            converted[key] = tensor
    return converted
|
||||||
|
|
||||||
|
|
||||||
|
# Load the source checkpoint, remap its keys with Flux2DiTStateDictConverter,
# and write the converted weights out. Without the conversion call the script
# would only copy the checkpoint with its original key names.
# NOTE(review): "xxx.safetensors" / "yyy.safetensors" look like placeholder
# paths to be replaced by the user - confirm.
state_dict = load_state_dict("xxx.safetensors")
state_dict = Flux2DiTStateDictConverter(state_dict)
save_file(state_dict, "yyy.safetensors")
|
||||||
@@ -18,6 +18,7 @@ class Flux2ImageTrainingModule(DiffusionTrainingModule):
|
|||||||
extra_inputs=None,
|
extra_inputs=None,
|
||||||
fp8_models=None,
|
fp8_models=None,
|
||||||
offload_models=None,
|
offload_models=None,
|
||||||
|
template_model_id_or_path=None,
|
||||||
device="cpu",
|
device="cpu",
|
||||||
task="sft",
|
task="sft",
|
||||||
):
|
):
|
||||||
@@ -26,6 +27,7 @@ class Flux2ImageTrainingModule(DiffusionTrainingModule):
|
|||||||
model_configs = self.parse_model_configs(model_paths, model_id_with_origin_paths, fp8_models=fp8_models, offload_models=offload_models, device=device)
|
model_configs = self.parse_model_configs(model_paths, model_id_with_origin_paths, fp8_models=fp8_models, offload_models=offload_models, device=device)
|
||||||
tokenizer_config = self.parse_path_or_model_id(tokenizer_path, default_value=ModelConfig(model_id="black-forest-labs/FLUX.2-dev", origin_file_pattern="tokenizer/"))
|
tokenizer_config = self.parse_path_or_model_id(tokenizer_path, default_value=ModelConfig(model_id="black-forest-labs/FLUX.2-dev", origin_file_pattern="tokenizer/"))
|
||||||
self.pipe = Flux2ImagePipeline.from_pretrained(torch_dtype=torch.bfloat16, device=device, model_configs=model_configs, tokenizer_config=tokenizer_config)
|
self.pipe = Flux2ImagePipeline.from_pretrained(torch_dtype=torch.bfloat16, device=device, model_configs=model_configs, tokenizer_config=tokenizer_config)
|
||||||
|
self.pipe = self.load_training_template_model(self.pipe, template_model_id_or_path, args.use_gradient_checkpointing, args.use_gradient_checkpointing_offload)
|
||||||
self.pipe = self.split_pipeline_units(task, self.pipe, trainable_models, lora_base_model)
|
self.pipe = self.split_pipeline_units(task, self.pipe, trainable_models, lora_base_model)
|
||||||
|
|
||||||
# Training mode
|
# Training mode
|
||||||
@@ -126,6 +128,7 @@ if __name__ == "__main__":
|
|||||||
extra_inputs=args.extra_inputs,
|
extra_inputs=args.extra_inputs,
|
||||||
fp8_models=args.fp8_models,
|
fp8_models=args.fp8_models,
|
||||||
offload_models=args.offload_models,
|
offload_models=args.offload_models,
|
||||||
|
template_model_id_or_path=args.template_model_id_or_path,
|
||||||
task=args.task,
|
task=args.task,
|
||||||
device="cpu" if args.initialize_model_on_cpu else accelerator.device,
|
device="cpu" if args.initialize_model_on_cpu else accelerator.device,
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user