Support ERNIE-Image (#1389)

* ernie-image pipeline

* ernie-image inference and training

* style fix

* ernie docs

* lowvram

* final style fix

* pr-review

* pr-fix round2

* set uniform training weight

* fix

* update lowvram docs
This commit is contained in:
Hong Zhang
2026-04-13 14:57:10 +08:00
committed by GitHub
parent f77b6357c5
commit 960d8c62c0
21 changed files with 1461 additions and 2 deletions

View File

@@ -0,0 +1,21 @@
def ErnieImageTextEncoderStateDictConverter(state_dict):
"""
Maps checkpoint keys from multimodal Mistral3Model format
to text-only Ministral3Model format.
Checkpoint keys (Mistral3Model):
language_model.model.layers.0.input_layernorm.weight
language_model.model.norm.weight
Model keys (ErnieImageTextEncoder → self.model = Ministral3Model):
model.layers.0.input_layernorm.weight
model.norm.weight
Mapping: language_model. → model.
"""
new_state_dict = {}
for key in state_dict:
if key.startswith("language_model.model."):
new_key = key.replace("language_model.model.", "model.", 1)
new_state_dict[new_key] = state_dict[key]
return new_state_dict