rwkv6 lora finetune support (https://github.com/JL-er/RWKV-LORA)
0  finetune/lora/v6/src/__init__.py  vendored  Normal file
303  finetune/lora/v6/src/binidx.py  vendored  Normal file
@@ -0,0 +1,303 @@
from lib2to3.pgen2 import token
import os
import torch
import numpy as np
import shutil
import struct
from functools import lru_cache
from itertools import accumulate


def print_rank_0(*message):
    pass
    # """If distributed is initialized print only on rank 0."""
    # if torch.distributed.is_initialized():
    #     if torch.distributed.get_rank() == 0:
    #         print(*message, flush=True)
    # else:
    #     print(*message, flush=True)


def _warmup_mmap_file(path):
    pass
    # with open(path, "rb") as stream:
    #     while stream.read(100 * 1024 * 1024):
    #         pass


dtypes = {
    1: np.uint8,
    2: np.int8,
    3: np.int16,
    4: np.int32,
    5: np.int64,
    6: float,
    7: np.double,
    8: np.uint16,
}


def code(dtype):
    for k in dtypes.keys():
        if dtypes[k] == dtype:
            return k
    raise ValueError(dtype)


def index_file_path(prefix_path):
    return prefix_path + ".idx"


def data_file_path(prefix_path):
    return prefix_path + ".bin"


class MMapIndexedDataset(torch.utils.data.Dataset):
    class Index(object):
        _HDR_MAGIC = b"MMIDIDX\x00\x00"

        @classmethod
        def writer(cls, path, dtype):
            class _Writer(object):
                def __enter__(self):
                    self._file = open(path, "wb")

                    # Write magic string so we can check the file format when opening it again.
                    self._file.write(cls._HDR_MAGIC)
                    # Write version number
                    # Little endian unsigned 64 Bit integer
                    self._file.write(struct.pack("<Q", 1))
                    # Little endian unsigned 8 Bit integer
                    self._file.write(struct.pack("<B", code(dtype)))

                    return self

                @staticmethod
                def _get_pointers(sizes):
                    dtype_size = dtype().itemsize
                    address = 0
                    pointers = []

                    for size in sizes:
                        pointers.append(address)
                        address += size * dtype_size

                    return pointers

                def write(self, sizes, doc_idx):
                    pointers = self._get_pointers(sizes)

                    # Little endian unsigned 64 Bit integer
                    self._file.write(struct.pack("<Q", len(sizes)))
                    # Little endian unsigned 64 Bit integer
                    self._file.write(struct.pack("<Q", len(doc_idx)))

                    sizes = np.array(sizes, dtype=np.int32)
                    self._file.write(sizes.tobytes(order="C"))
                    del sizes

                    pointers = np.array(pointers, dtype=np.int64)
                    self._file.write(pointers.tobytes(order="C"))
                    del pointers

                    doc_idx = np.array(doc_idx, dtype=np.int64)
                    self._file.write(doc_idx.tobytes(order="C"))

                def __exit__(self, exc_type, exc_val, exc_tb):
                    self._file.close()

            return _Writer()

        def __init__(self, path, skip_warmup=False):
            with open(path, "rb") as stream:
                magic_test = stream.read(9)
                assert self._HDR_MAGIC == magic_test, (
                    "Index file doesn't match expected format. "
                    "Make sure that --dataset-impl is configured properly."
                )
                # Little endian unsigned 64 Bit integer
                version = struct.unpack("<Q", stream.read(8))
                assert (1,) == version

                # Little endian unsigned 8 Bit integer
                (dtype_code,) = struct.unpack("<B", stream.read(1))
                self._dtype = dtypes[dtype_code]
                self._dtype_size = self._dtype().itemsize

                self._len = struct.unpack("<Q", stream.read(8))[0]
                self._doc_count = struct.unpack("<Q", stream.read(8))[0]
                offset = stream.tell()

            if not skip_warmup:
                print_rank_0(" warming up index mmap file...")
                _warmup_mmap_file(path)

            self._bin_buffer_mmap = np.memmap(path, mode="r", order="C")
            self._bin_buffer = memoryview(self._bin_buffer_mmap)
            print_rank_0(" reading sizes...")
            self._sizes = np.frombuffer(
                self._bin_buffer, dtype=np.int32, count=self._len, offset=offset
            )
            print_rank_0(" reading pointers...")
            self._pointers = np.frombuffer(
                self._bin_buffer,
                dtype=np.int64,
                count=self._len,
                offset=offset + self._sizes.nbytes,
            )
            print_rank_0(" reading document index...")
            self._doc_idx = np.frombuffer(
                self._bin_buffer,
                dtype=np.int64,
                count=self._doc_count,
                offset=offset + self._sizes.nbytes + self._pointers.nbytes,
            )

        def __del__(self):
            self._bin_buffer_mmap._mmap.close()
            del self._bin_buffer_mmap

        @property
        def dtype(self):
            return self._dtype

        @property
        def sizes(self):
            return self._sizes

        @property
        def doc_idx(self):
            return self._doc_idx

        @lru_cache(maxsize=8)
        def __getitem__(self, i):
            return self._pointers[i], self._sizes[i]

        def __len__(self):
            return self._len

    def __init__(self, path, skip_warmup=False):
        super().__init__()

        self._path = None
        self._index = None
        self._bin_buffer = None

        self._do_init(path, skip_warmup)

    def __getstate__(self):
        return self._path

    def __setstate__(self, state):
        self._do_init(state)

    def _do_init(self, path, skip_warmup):
        self._path = path
        self._index = self.Index(index_file_path(self._path), skip_warmup)

        if not skip_warmup:
            print_rank_0(" warming up data mmap file...")
            _warmup_mmap_file(data_file_path(self._path))
        print_rank_0(" creating numpy buffer of mmap...")
        self._bin_buffer_mmap = np.memmap(
            data_file_path(self._path), mode="r", order="C"
        )
        print_rank_0(" creating memory view of numpy buffer...")
        self._bin_buffer = memoryview(self._bin_buffer_mmap)

    def __del__(self):
        self._bin_buffer_mmap._mmap.close()
        del self._bin_buffer_mmap
        del self._index

    def __len__(self):
        return len(self._index)

    # @lru_cache(maxsize=8)
    def __getitem__(self, idx):
        if isinstance(idx, int):
            ptr, size = self._index[idx]
            np_array = np.frombuffer(
                self._bin_buffer, dtype=self._index.dtype, count=size, offset=ptr
            )
            return np_array
        elif isinstance(idx, slice):
            start, stop, step = idx.indices(len(self))
            if step != 1:
                raise ValueError("Slices into indexed_dataset must be contiguous")
            ptr = self._index._pointers[start]
            sizes = self._index._sizes[idx]
            offsets = list(accumulate(sizes))
            total_size = sum(sizes)
            np_array = np.frombuffer(
                self._bin_buffer, dtype=self._index.dtype, count=total_size, offset=ptr
            )
            sents = np.split(np_array, offsets[:-1])
            return sents

    def get(self, idx, offset=0, length=None):
        """Retrieves a single item from the dataset with the option to only
        return a portion of the item.

        get(idx) is the same as [idx] but get() does not support slicing.
        """
        ptr, size = self._index[idx]
        if length is None:
            length = size - offset
        ptr += offset * np.dtype(self._index.dtype).itemsize
        np_array = np.frombuffer(
            self._bin_buffer, dtype=self._index.dtype, count=length, offset=ptr
        )
        return np_array

    def pad(self, idx, length=None):
        ptr, size = self._index[idx]
        try:
            np_array = np.frombuffer(
                self._bin_buffer, dtype=self._index.dtype, count=length, offset=ptr
            )
        except:
            np_array = np.frombuffer(
                self._bin_buffer, dtype=self._index.dtype, count=size, offset=ptr
            )
            ptr0, _ = self._index[0]
            np_array0 = np.frombuffer(
                self._bin_buffer,
                dtype=self._index.dtype,
                count=length - size,
                offset=ptr0,
            )
            np_array = np.append(np_array, np_array0)
        return np_array

    def only(self, idx):
        ptr, size = self._index[idx]
        np_array = np.frombuffer(
            self._bin_buffer, dtype=self._index.dtype, count=size, offset=ptr
        )

        return np_array

    @property
    def sizes(self):
        return self._index.sizes

    @property
    def doc_idx(self):
        return self._index.doc_idx

    def get_doc_idx(self):
        return self._index._doc_idx

    def set_doc_idx(self, doc_idx_):
        self._index._doc_idx = doc_idx_

    @property
    def supports_prefetch(self):
        return False

    @staticmethod
    def exists(path):
        return os.path.exists(index_file_path(path)) and os.path.exists(
            data_file_path(path)
        )
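The .idx layout written above is: the MMIDIDX magic, a u64 version, a u8 dtype code, u64 counts for sizes and doc_idx, then the int32 sizes array, the int64 byte-pointer array, and the int64 doc_idx array. A minimal usage sketch (illustrative only, not part of this commit; the dataset prefix is a hypothetical path to a .bin/.idx pair):

from src.binidx import MMapIndexedDataset

ds = MMapIndexedDataset("data/sample_text_document")  # opens <prefix>.idx and <prefix>.bin
print(len(ds))                                  # number of stored sequences
tokens = ds[0]                                  # token ids of item 0, as a numpy array
window = ds.get(idx=0, offset=128, length=512)  # 512 tokens starting at offset 128

This get(idx=0, offset=..., length=...) pattern is exactly how MyDataset in dataset.py below reads fixed-size training windows.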
242  finetune/lora/v6/src/dataset.py  vendored  Normal file
@@ -0,0 +1,242 @@
########################################################################################################
# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM
########################################################################################################

import json, math, random, os, sys
import numpy as np
import torch
from torch.utils.data import Dataset
from pytorch_lightning.utilities import rank_zero_info
from .binidx import MMapIndexedDataset
from .utils import MaybeIsPrime


class MyDataset(Dataset):
    def __init__(self, args):
        self.args = args

        if args.data_type == "binidx":
            self.vocab_size = args.vocab_size
            rank_zero_info(
                f"Current vocab size = {self.vocab_size} (make sure it's correct)"
            )

            if args.my_pile_version == 1:
                self.data = MMapIndexedDataset(args.data_file)
                self.data_size = (
                    len(self.data._bin_buffer) // self.data._index._dtype_size
                )
                rank_zero_info(f"Data has {self.data_size} tokens.")
            elif args.my_pile_version == 2:
                data_list = (
                    open(args.data_file, "r", encoding="utf-8")
                    .read()
                    .strip()
                    .split("\n")
                )
                data_list = [i.strip().split(" ") for i in data_list]
                self.data = []
                self.data_size = int(data_list[-1][-1])
                rank_zero_info(f"Data has {self.data_size} chunks.")
                for d in data_list:
                    data = MMapIndexedDataset(d[0])
                    data_size = len(data._bin_buffer) // data._index._dtype_size
                    assert (data_size - args.ctx_len) == int(d[1])
                    self.data += [[int(d[-1]), int(d[1]), data]]
                # rank_zero_info(self.data)

            if args.my_qa_mask > 0:
                # self.data_pile = MMapIndexedDataset('/fsx/pile/pile_20B_tokenizer_text_document')
                self.data_pile = MMapIndexedDataset(
                    "/fsx/pile_deduped/pile_0.87_deduped_text_document"
                )
                self.data_pile_size = (
                    len(self.data_pile._bin_buffer) // self.data._index._dtype_size
                )
            else:
                self.data_pile = None
                self.data_pile_size = 0

            if args.my_pile_stage > 0:
                # assert self.data_size == 332115325534 and self.vocab_size == 50277
                self.samples_per_epoch = args.epoch_steps * args.real_bsz
                assert self.samples_per_epoch == 40320
                rank_zero_info(
                    f"########## Pile 20b-tokenized stage {args.my_pile_stage} ##########"
                )
                dataset_slot = self.data_size // args.ctx_len
                if args.my_pile_stage != 4:
                    assert MaybeIsPrime(args.magic_prime)
                    assert args.magic_prime % 3 == 2
                    assert (
                        args.magic_prime / dataset_slot > 0.99
                        and args.magic_prime / dataset_slot <= 1
                    )
        elif args.data_type == "numpy":
            self.data = np.load(args.data_file).astype("int")
            self.vocab_size = args.vocab_size
            rank_zero_info(
                f"Current vocab size = {self.vocab_size} (make sure it's correct)"
            )
            self.data_size = len(self.data)
            rank_zero_info(f"Data has {self.data_size} tokens.")
        elif args.data_type == "uint16":
            self.data = (
                np.fromfile(args.data_file, dtype=np.uint16)
                .astype("int32")
                .reshape(-1, args.my_sample_len)
            )
            self.vocab_size = args.vocab_size
            rank_zero_info(
                f"Current vocab size = {self.vocab_size} (make sure it's correct)"
            )
            self.data_size = self.data.shape[0]
            rank_zero_info(f"Data has {self.data_size} samples.")
        else:
            if args.data_type == "dummy":
                rank_zero_info("Building dummy data...")
                self.data = ""
                for i in range(100000):
                    aa = (i) % 10000
                    bb = (i * i) % 10000
                    cc = aa + bb
                    self.data += f".{aa}+{bb}={cc}."
            else:
                self.data = open(args.data_file, "r", encoding=args.data_type).read()
            rank_zero_info("Building token list...")
            unique = sorted(list(set(self.data)))
            self.vocab_size = len(unique)
            # rank_zero_info()
            # for u in unique:
            #     print(u, end=' ')
            # rank_zero_info('\n\n')
            xx = 0
            xxObj = {}
            for u in unique:
                xxObj[xx] = u
                xx += 1
            with open(
                f"{args.proj_dir}/vocab.json", "w", encoding="utf-8"
            ) as vocab_file:
                vocab_file.write(json.dumps(xxObj, ensure_ascii=False))
            self.data_size = len(self.data)
            rank_zero_info(
                f"Data has {self.data_size} tokens, {self.vocab_size} vocab size."
            )
            self.stoi = {ch: i for i, ch in enumerate(unique)}
            self.itos = {i: ch for i, ch in enumerate(unique)}

    def __len__(self):
        return self.args.epoch_steps * self.args.micro_bsz

    def __getitem__(self, idx):
        args = self.args
        rank = self.global_rank
        epoch = self.real_epoch
        world_size = self.world_size
        # print(f"epoch {epoch} idx {idx} rank {rank}/{world_size}")

        if args.data_type == "uint16":
            i = np.random.randint(0, self.data_size - 1)
            dix = self.data[i]
            x = torch.tensor(dix[:-1], dtype=torch.long)
            y = torch.tensor(dix[1:], dtype=torch.long)
        else:
            ctx_len = args.ctx_len
            req_len = ctx_len + 1
            magic_prime = args.magic_prime
            data = self.data

            if args.my_pile_stage > 0:
                ii = 1 + epoch * self.samples_per_epoch + (idx * world_size) + rank

                if args.my_qa_mask > 0:
                    ii_orig = ii
                    if ii % 2 == 0:
                        ii = -1
                        data = self.data_pile
                    else:
                        ii = ii // 2
                if data == self.data_pile:
                    i = np.random.randint(0, self.data_pile_size - req_len)
                else:
                    if args.my_pile_stage == 4 or ii < args.my_random_steps:
                        # cheat: pick a random spot in dataset
                        if args.my_pile_version == 1:
                            i = np.random.randint(0, self.data_size - req_len)
                        else:
                            i = np.random.randint(0, self.data_size)
                    else:
                        ii = ii - args.my_random_steps
                        factor = (math.sqrt(5) - 1) / 2
                        factor = int(magic_prime * factor)
                        i = ((factor * ii * ii * ii) % magic_prime) * ctx_len
                        i = i + args.my_pile_shift
                        # print(f"epoch {epoch} idx {idx} rank {rank}/{world_size} ii {ii} pos {round(i / self.data_size, 3)}")
            else:
                # cheat: pick a random spot in dataset
                i = np.random.randint(0, self.data_size - req_len)

            if args.data_type == "binidx":
                if args.my_pile_version == 1:
                    dix = data.get(idx=0, offset=i, length=req_len).astype(int)
                    # dix = data.pad(idx=idx, length=req_len).astype(int)
                else:
                    # self.data : cutoff, chunk_count, data
                    for j in range(len(data)):
                        if i < data[j][0]:
                            ii = i
                            i = (i - (data[j - 1][0] if j > 0 else 0)) % data[j][1]
                            dix = (
                                data[j][2]
                                .get(idx=0, offset=i, length=req_len)
                                .astype(int)
                            )
                            # print(ii, j, i)
                            break
            elif args.data_type == "numpy":
                dix = data[i : i + req_len]
            else:
                dix = [self.stoi[s] for s in data[i : i + req_len]]

            if args.my_qa_mask == 1:
                if data == self.data_pile:
                    z = [1] * ctx_len
                else:
                    z = [0] * ctx_len
                    z_sum = 0
                    isGood = False
                    for i in range(3, ctx_len):
                        if (
                            dix[i] == 27
                            and dix[i - 1] == 34
                            and dix[i - 2] == 187
                            and dix[i - 3] == 187
                        ):
                            isGood = True
                        if dix[i] == 0:
                            isGood = False
                        if isGood:
                            z[i] = 1
                            z_sum += 1
                    if z_sum == 0:
                        z = [1] * ctx_len
                        i = np.random.randint(0, self.data_pile_size - req_len)
                        dix = self.data_pile.get(
                            idx=0, offset=i, length=req_len
                        ).astype(int)
                z = torch.tensor(z, dtype=torch.bfloat16)

            x = torch.tensor(dix[:-1], dtype=torch.long)
            y = torch.tensor(dix[1:], dtype=torch.long)

            # if ii_orig < 50:
            #     # if rank == 1:
            #     print('rank', rank, 'i', ii_orig, ii, i, 'x', x[:5], '...', x[-5:])
            # else:
            #     exit(0)

            if args.my_qa_mask == 1:
                return x, y, z

        return x, y
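The magic_prime branch above is a deterministic shuffle: because magic_prime is a prime p with p % 3 == 2 (asserted in __init__), the map x -> x**3 mod p is a bijection on 0..p-1, and multiplying by a fixed nonzero factor keeps it one, so every ctx_len-aligned slot is visited exactly once per pass. A small self-check of that property (illustrative, not part of this commit; p = 17 is a toy value):

import math

p = 17                                    # toy "magic prime": prime, and 17 % 3 == 2
factor = int(p * (math.sqrt(5) - 1) / 2)  # same golden-ratio factor as __getitem__
slots = sorted((factor * ii**3) % p for ii in range(p))
assert slots == list(range(p))            # a permutation: every slot hit exactly once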
1086  finetune/lora/v6/src/model.py  vendored  Normal file
File diff suppressed because it is too large
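model.py (the source of LORA_CONFIG imported by trainer.py below) is suppressed here. As a rough sketch of the LoRA mechanism such a file injects — illustrative only, not the file's actual contents — a linear layer gains a trainable low-rank update A/B while the base weight stays frozen; the ".lora_" name prefix is what the checkpoint filter in trainer.py matches:

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class LoraLinear(nn.Module):
    # Illustrative LoRA layer: y = x W^T + (alpha / r) * dropout(x) A^T B^T
    def __init__(self, in_features, out_features, r=8, alpha=16, dropout=0.01):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))  # frozen base weight in real use
        self.lora_A = nn.Parameter(torch.zeros(r, in_features))
        self.lora_B = nn.Parameter(torch.zeros(out_features, r))  # zero init: update is a no-op at start
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        self.scaling = alpha / r
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        base = F.linear(x, self.weight)
        update = F.linear(F.linear(self.dropout(x), self.lora_A), self.lora_B)
        return base + self.scaling * update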
310  finetune/lora/v6/src/trainer.py  vendored  Normal file
@@ -0,0 +1,310 @@
import os, math, time, datetime, subprocess
import torch
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from pytorch_lightning.utilities import rank_zero_info, rank_zero_only
from .model import LORA_CONFIG


def my_save(args, trainer, dd, ff):
    if "14b-run1" in ff:
        fn = ff.split("/")[-1]
        fff = "/dev/shm/" + fn
        torch.save(dd, fff)
        subprocess.Popen(f" aws s3 mv {fff} s3://rwkv-14b-4k/{fn} --quiet", shell=True)
    elif ("world/14b" in ff) or ("world/7b" in ff):
        aa = ff.split("/")[1]
        fn = ff.split("/")[-1]
        fff = f"/dev/shm/{aa}-{fn}"
        torch.save(dd, fff)
        subprocess.Popen(
            f" aws s3 mv {fff} s3://rwkv-world/{aa}-{fn} --quiet", shell=True
        )
    else:
        if "deepspeed_stage_3" in args.strategy:
            trainer.save_checkpoint(ff, weights_only=True)
        else:
            torch.save(dd, ff)


class train_callback(pl.Callback):
    def __init__(self, args):
        super().__init__()
        self.args = args

    def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
        args = self.args
        # if args.cuda_cleanup > 0:
        #     torch.cuda.empty_cache()
        real_step = trainer.global_step + args.epoch_begin * args.epoch_steps

        # LR schedule
        w_step = args.warmup_steps
        if args.lr_final == args.lr_init or args.epoch_count == 0:
            lr = args.lr_init
        else:
            decay_step = real_step - args.my_pile_edecay * args.epoch_steps
            decay_total = (args.epoch_count - args.my_pile_edecay) * args.epoch_steps
            progress = (decay_step - w_step + 1) / (decay_total - w_step)
            progress = min(1, max(0, progress))

            if args.lr_final == 0 or args.lr_init == 0:  # linear decay
                lr = args.lr_init + (args.lr_final - args.lr_init) * progress
            else:  # exp decay
                lr = args.lr_init * math.exp(
                    math.log(args.lr_final / args.lr_init) * pow(progress, 1)
                )
            # if trainer.is_global_zero:
            #     print(trainer.global_step, decay_step, decay_total, w_step, progress, lr)

        if args.my_exit_tokens != 0:  # cosine decay
            real_tokens = real_step * args.ctx_len * args.real_bsz
            warmup_tokens = w_step * args.ctx_len * args.real_bsz
            progress = (real_tokens - warmup_tokens) / (
                abs(args.my_exit_tokens) - warmup_tokens
            )
            progress = max(0, min(1, progress))
            lr_final_factor = args.lr_final / args.lr_init
            lr_mult = (0.5 + lr_final_factor / 2) + (
                0.5 - lr_final_factor / 2
            ) * math.cos(math.pi * progress)
            if args.my_exit_tokens > 0:
                lr = args.lr_init * lr_mult
            else:
                lr = (lr + args.lr_init * lr_mult) / 2
            if progress >= 1:
                if (trainer.is_global_zero) or ("deepspeed_stage_3" in args.strategy):
                    my_save(
                        args,
                        trainer,
                        pl_module.state_dict(),
                        f"{args.proj_dir}/rwkv-final.pth",
                    )
                    exit(0)
        if trainer.global_step < w_step:
            lr = lr * (0.2 + 0.8 * trainer.global_step / w_step)

        if args.weight_decay_final > 0:
            wd_now = args.weight_decay * math.exp(
                math.log(args.weight_decay_final / args.weight_decay) * progress
            )
        else:
            wd_now = args.weight_decay

        for param_group in trainer.optimizers[0].param_groups:
            if param_group["weight_decay"] > 0:
                param_group["weight_decay"] = wd_now
            if args.layerwise_lr > 0:
                param_group["lr"] = lr * param_group["my_lr_scale"]
                # print(param_group["lr"], param_group["my_lr_scale"])
            else:
                param_group["lr"] = lr

        trainer.my_lr = lr
        trainer.my_wd = wd_now
        # rank_zero_info(f"{real_step} {lr}")

        if trainer.global_step == 0:
            if trainer.is_global_zero:  # logging
                trainer.my_loss_sum = 0
                trainer.my_loss_count = 0
                trainer.my_log = open(args.proj_dir + "/train_log.txt", "a")
                trainer.my_log.write(
                    f"NEW RUN {args.my_timestamp}\n{vars(self.args)}\n"
                )
                try:
                    print(f"\n{trainer.strategy.config}\n")
                    trainer.my_log.write(f"{trainer.strategy.config}\n")
                except:
                    pass
                trainer.my_log.flush()
                if len(args.wandb) > 0:
                    print("Login to wandb...")
                    import wandb

                    wandb.init(
                        project=args.wandb,
                        name=args.run_name + " " + args.my_timestamp,
                        config=args,
                        save_code=False,
                    )
                    trainer.my_wandb = wandb

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        args = self.args
        token_per_step = args.ctx_len * args.real_bsz
        real_step = trainer.global_step + args.epoch_begin * args.epoch_steps
        if trainer.is_global_zero:  # logging
            t_now = time.time_ns()
            kt_s = 0
            try:
                t_cost = (t_now - trainer.my_time_ns) / 1e9
                kt_s = token_per_step / t_cost / 1000
                self.log("REAL it/s", 1.0 / t_cost, prog_bar=True, on_step=True)
                self.log("Kt/s", kt_s, prog_bar=True, on_step=True)
            except:
                pass
            trainer.my_time_ns = t_now
            if pl.__version__[0] == "2":
                trainer.my_loss = outputs["loss"]
            else:
                trainer.my_loss = trainer.my_loss_all.float().mean().item()
            trainer.my_loss_sum += trainer.my_loss
            trainer.my_loss_count += 1
            trainer.my_epoch_loss = trainer.my_loss_sum / trainer.my_loss_count
            self.log("lr", trainer.my_lr, prog_bar=True, on_step=True)
            self.log("loss", trainer.my_epoch_loss, prog_bar=True, on_step=True)
            # self.log("s", real_step, prog_bar=True, on_step=True)

            if len(args.wandb) > 0:
                lll = {
                    "loss": trainer.my_loss,
                    "lr": trainer.my_lr,
                    "wd": trainer.my_wd,
                    "Gtokens": real_step * token_per_step / 1e9,
                }
                if kt_s > 0:
                    lll["kt/s"] = kt_s
                trainer.my_wandb.log(lll, step=int(real_step))
        if (trainer.is_global_zero) or (
            "deepspeed_stage_3" in args.strategy
        ):  # save pth
            if args.magic_prime > 0:
                expand_factor = 2 if args.my_qa_mask > 0 else 1
                if int(real_step) == int(
                    args.magic_prime * expand_factor // args.real_bsz
                ) - 1 + int(args.my_random_steps):
                    to_save_dict = pl_module.state_dict()
                    my_save(
                        args,
                        trainer,
                        to_save_dict,
                        f"{args.proj_dir}/rwkv-final.pth",
                    )
        # if args.batch_save==batch_idx :
        #     to_save_dict = pl_module.state_dict()
        #     for name, state in to_save_dict.items():
        #         if 'img' in name:
        #             to_save_dict[name] = state
        #     try:
        #         my_save(
        #             args, trainer,
        #             to_save_dict,
        #             f"{args.proj_dir}/rwkv-{args.epoch_begin + trainer.current_epoch}-{batch_idx}.pth",
        #         )
        #     except Exception as e:
        #         print('Error\n\n', e, '\n\n')

    def on_train_epoch_start(self, trainer, pl_module):
        args = self.args
        if pl.__version__[0] == "2":
            dataset = trainer.train_dataloader.dataset
        else:
            dataset = trainer.train_dataloader.dataset.datasets
        assert "MyDataset" in str(dataset)
        dataset.global_rank = trainer.global_rank
        dataset.real_epoch = int(args.epoch_begin + trainer.current_epoch)
        dataset.world_size = trainer.world_size
        # print(f'########## world_size {dataset.world_size} global_rank {dataset.global_rank} real_epoch {dataset.real_epoch} ##########')

    def on_train_epoch_end(self, trainer, pl_module):
        args = self.args
        to_save_dict = {}
        if (trainer.is_global_zero) or (
            "deepspeed_stage_3" in args.strategy
        ):  # save pth
            if (
                args.epoch_save > 0 and trainer.current_epoch % args.epoch_save == 0
            ) or (trainer.current_epoch == args.epoch_count - 1):
                if args.data_type == "wds_img":
                    raw_dict = pl_module.state_dict()
                    for k in raw_dict:
                        if k.startswith("encoder.") or k.startswith("decoder."):
                            to_save_dict[k] = raw_dict[k]
                else:
                    to_save_dict = pl_module.state_dict()

                if args.data_type == "img" and not args.lora:
                    for name, state in to_save_dict.items():
                        if "img" in name:
                            to_save_dict[name] = state

                if args.lora:
                    enable_time_finetune = "time" in LORA_CONFIG["parts"]
                    enable_ln_finetune = "ln" in LORA_CONFIG["parts"]
                    lora_dict = {}
                    for name, state in to_save_dict.items():
                        if "img" in name:
                            lora_dict[name] = state
                        if (
                            ".lora_" in name
                            or (enable_time_finetune and ".time_" in name)
                            or (enable_ln_finetune and ".ln" in name)
                        ):
                            lora_dict[name] = state
                    to_save_dict = lora_dict

                try:
                    my_save(
                        args,
                        trainer,
                        to_save_dict,
                        f"{args.proj_dir}/rwkv-{args.epoch_begin + trainer.current_epoch}.pth",
                    )
                except Exception as e:
                    print("Error\n\n", e, "\n\n")

        if trainer.is_global_zero:  # logging
            trainer.my_log.write(
                f"{args.epoch_begin + trainer.current_epoch} {trainer.my_epoch_loss:.6f} {math.exp(trainer.my_epoch_loss):.4f} {trainer.my_lr:.8f} {datetime.datetime.now()} {trainer.current_epoch}\n"
            )
            trainer.my_log.flush()

            trainer.my_loss_sum = 0
            trainer.my_loss_count = 0
            if (args.epoch_begin + trainer.current_epoch) >= args.my_exit:
                exit(0)


@rank_zero_only
def generate_init_weight(model, init_weight_name):
    mm = model.generate_init_weight()

    if model.args.my_pile_stage == 1:
        if len(model.args.load_model) > 0:
            print(f"Combine weights from {model.args.load_model}...")
            load_dict = torch.load(model.args.load_model, map_location="cpu")
            for k in load_dict:
                try:
                    assert k in mm
                except:
                    print("missing", k)
                    exit(0)
                src = load_dict[k]
                try:
                    mm[k] = src.reshape(mm[k].shape)
                except:
                    tmp = mm[k].squeeze().clone()
                    print(k, src.shape, "-->", mm[k].shape)
                    ss = src.shape[0]
                    dd = tmp.shape[0]
                    for i in range(dd):
                        pos = i / dd * ss
                        if pos >= ss - 1:
                            tmp[i] = src[ss - 1]
                        else:
                            p0 = int(math.floor(pos))
                            ii = pos - p0
                            tmp[i] = src[p0] * (1 - ii) + src[p0 + 1] * (ii)
                    mm[k] = tmp.reshape(mm[k].shape)
                    sss = src.squeeze().float().cpu().numpy()
                    print(sss[:10], "...", sss[-10:])
                    mmm = mm[k].squeeze().float().cpu().numpy()
                    print(mmm[:10], "...", mmm[-10:])

    print(f"Save to {init_weight_name}...")
    torch.save(mm, init_weight_name)

    if model.args.my_pile_stage == 1:
        print("Done. Now go for stage 2.")
        exit(0)
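The token-based cosine schedule in on_train_batch_start reduces to lr(progress) = lr_init * ((0.5 + f/2) + (0.5 - f/2) * cos(pi * progress)) with f = lr_final / lr_init, so the curve starts at lr_init and lands exactly on lr_final. A standalone restatement (illustrative, not part of this commit):

import math

def cosine_lr(lr_init, lr_final, progress):
    f = lr_final / lr_init
    lr_mult = (0.5 + f / 2) + (0.5 - f / 2) * math.cos(math.pi * progress)
    return lr_init * lr_mult

assert abs(cosine_lr(6e-4, 6e-5, 0.0) - 6e-4) < 1e-12  # starts at lr_init
assert abs(cosine_lr(6e-4, 6e-5, 1.0) - 6e-5) < 1e-12  # ends at lr_final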
139  finetune/lora/v6/src/utils.py  vendored  Normal file
@@ -0,0 +1,139 @@
import json, time, random, os
import numpy as np
import torch
from torch.nn import functional as F

time_slot = {}
time_ref = time.time_ns()


def record_time(name):
    if name not in time_slot:
        time_slot[name] = 1e20
    tt = (time.time_ns() - time_ref) / 1e9
    if tt < time_slot[name]:
        time_slot[name] = tt


class TOKENIZER:
    def __init__(self, WORD_NAME, UNKNOWN_CHAR="\ue083"):
        if "list" in str(type(WORD_NAME)):
            self.charMode = False
            if WORD_NAME[0] == WORD_NAME[1]:
                from transformers import PreTrainedTokenizerFast

                self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=WORD_NAME[0])
            else:
                from transformers import GPT2TokenizerFast

                self.tokenizer = GPT2TokenizerFast(WORD_NAME[0], WORD_NAME[1])
            self.vocab_size = len(self.tokenizer)
        else:
            self.charMode = True
            with open(WORD_NAME + ".json", "r", encoding="utf-16") as result_file:
                self.word_table = json.load(result_file)

            self.vocab_size = len(self.word_table)

            self.stoi = {v: int(k) for k, v in self.word_table.items()}
            self.itos = {int(k): v for k, v in self.word_table.items()}

            self.UNKNOWN_CHAR = self.stoi[UNKNOWN_CHAR]

    def refine_context(self, context):
        context = context.strip().split("\n")
        for c in range(len(context)):
            context[c] = context[c].strip().strip("\u3000").strip("\r")
        context = list(filter(lambda c: c != "", context))
        context = "\n" + ("\n".join(context)).strip()
        if context == "":
            context = "\n"
        return context

    def sample_logits(
        self, out, x, ctx_len, temperature=1.0, top_p_usual=None, top_p_newline=None
    ):
        # out[self.UNKNOWN_CHAR] = -float('Inf')
        lastChar = int(x[-1])

        probs = F.softmax(out, dim=-1)

        if self.charMode:
            if self.itos[lastChar] == "\n":
                top_p = top_p_newline
            else:
                top_p = top_p_usual
        else:
            top_p = top_p_usual

        if os.environ["RWKV_RUN_DEVICE"] == "cpu":
            probs = probs.numpy()
            sorted_probs = np.sort(probs)[::-1]
            cumulative_probs = np.cumsum(sorted_probs)
            cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)])
            probs[probs < cutoff] = 0
            if temperature != 1.0:
                # probs is a numpy array here, so use np.power
                # (the tensor-style probs.pow(...) would raise AttributeError)
                probs = np.power(probs, 1.0 / temperature)
            probs = probs / np.sum(probs)
            out = np.random.choice(a=len(probs), p=probs)
            return out
        else:
            sorted_probs = torch.sort(probs, descending=True)[0]
            cumulative_probs = torch.cumsum(sorted_probs, dim=-1).cpu().numpy()
            cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)])
            probs[probs < cutoff] = 0
            if temperature != 1.0:
                probs = probs.pow(1.0 / temperature)
            out = torch.multinomial(probs, num_samples=1)[0]
            return out


def MaybeIsPrime(number):
    if FermatPrimalityTest(number) and MillerRabinPrimalityTest(number):
        return True
    else:
        return False


def FermatPrimalityTest(number):
    if number > 1:
        for time in range(3):
            randomNumber = random.randint(2, number) - 1
            if pow(randomNumber, number - 1, number) != 1:
                return False
        return True
    else:
        return False


def MillerRabinPrimalityTest(number):
    if number == 2:
        return True
    elif number == 1 or number % 2 == 0:
        return False
    oddPartOfNumber = number - 1
    timesTwoDividNumber = 0
    while oddPartOfNumber % 2 == 0:
        oddPartOfNumber = oddPartOfNumber // 2
        timesTwoDividNumber = timesTwoDividNumber + 1

    for time in range(3):
        while True:
            randomNumber = random.randint(2, number) - 1
            if randomNumber != 0 and randomNumber != 1:
                break

        randomNumberWithPower = pow(randomNumber, oddPartOfNumber, number)

        if (randomNumberWithPower != 1) and (randomNumberWithPower != number - 1):
            iterationNumber = 1

            while (iterationNumber <= timesTwoDividNumber - 1) and (
                randomNumberWithPower != number - 1
            ):
                randomNumberWithPower = pow(randomNumberWithPower, 2, number)
                iterationNumber = iterationNumber + 1
            if randomNumberWithPower != (number - 1):
                return False

    return True
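The cutoff logic in TOKENIZER.sample_logits is standard top-p (nucleus) sampling: keep the smallest prefix of probability-sorted tokens whose cumulative mass exceeds top_p, zero the rest, renormalize, and sample. A compact numpy restatement (illustrative, not part of this commit):

import numpy as np

def top_p_sample(probs, top_p=0.85, rng=np.random.default_rng(0)):
    sorted_probs = np.sort(probs)[::-1]
    cutoff = sorted_probs[np.argmax(np.cumsum(sorted_probs) > top_p)]
    probs = np.where(probs < cutoff, 0.0, probs)  # drop tokens below the cutoff
    return rng.choice(len(probs), p=probs / probs.sum())

print(top_p_sample(np.array([0.5, 0.3, 0.15, 0.05])))  # samples only among indices 0..2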