Z-Image-Turbo full training: launch through an explicit accelerate config.

Summary of the changes carried by this patch:
  * Z-Image-Turbo.sh now passes --config_file pointing at the new
    accelerate_config.yaml, notes that the example is tested on 8*A100,
    and raises --dataset_repeat from 50 to 400.
  * accelerate_config.yaml (new file) selects DEEPSPEED with zero_stage 2,
    bf16 mixed precision, 1 machine and 8 processes, with no optimizer or
    parameter offload.

NOTE(review): indentation inside the hunks (2 spaces for the continuation
lines of the script and for the keys nested under deepspeed_config) was
reconstructed -- confirm against the original files before applying.

diff --git a/examples/z_image/model_training/full/Z-Image-Turbo.sh b/examples/z_image/model_training/full/Z-Image-Turbo.sh
index 21dba65..7bbe909 100644
--- a/examples/z_image/model_training/full/Z-Image-Turbo.sh
+++ b/examples/z_image/model_training/full/Z-Image-Turbo.sh
@@ -1,8 +1,9 @@
-accelerate launch examples/z_image/model_training/train.py \
+# This example is tested on 8*A100
+accelerate launch --config_file examples/z_image/model_training/full/accelerate_config.yaml examples/z_image/model_training/train.py \
   --dataset_base_path data/example_image_dataset \
   --dataset_metadata_path data/example_image_dataset/metadata.csv \
   --max_pixels 1048576 \
-  --dataset_repeat 50 \
+  --dataset_repeat 400 \
   --model_id_with_origin_paths "Tongyi-MAI/Z-Image-Turbo:transformer/*.safetensors,Tongyi-MAI/Z-Image-Turbo:text_encoder/*.safetensors,Tongyi-MAI/Z-Image-Turbo:vae/diffusion_pytorch_model.safetensors" \
   --learning_rate 1e-5 \
   --num_epochs 2 \
diff --git a/examples/z_image/model_training/full/accelerate_config.yaml b/examples/z_image/model_training/full/accelerate_config.yaml
new file mode 100644
index 0000000..83280f7
--- /dev/null
+++ b/examples/z_image/model_training/full/accelerate_config.yaml
@@ -0,0 +1,22 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  gradient_accumulation_steps: 1
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: false
+  zero_stage: 2
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false