rwkv6 lora finetune support (https://github.com/JL-er/RWKV-LORA)
0  finetune/lora/v6/src/__init__.py  vendored  Normal file
303  finetune/lora/v6/src/binidx.py  vendored  Normal file
@@ -0,0 +1,303 @@
from lib2to3.pgen2 import token
import os
import torch
import numpy as np
import shutil
import struct
from functools import lru_cache
from itertools import accumulate


def print_rank_0(*message):
    pass
    # """If distributed is initialized print only on rank 0."""
    # if torch.distributed.is_initialized():
    #     if torch.distributed.get_rank() == 0:
    #         print(*message, flush=True)
    # else:
    #     print(*message, flush=True)


def _warmup_mmap_file(path):
    pass
    # with open(path, "rb") as stream:
    #     while stream.read(100 * 1024 * 1024):
    #         pass


dtypes = {
    1: np.uint8,
    2: np.int8,
    3: np.int16,
    4: np.int32,
    5: np.int64,
    6: float,
    7: np.double,
    8: np.uint16,
}


def code(dtype):
    for k in dtypes.keys():
        if dtypes[k] == dtype:
            return k
    raise ValueError(dtype)


def index_file_path(prefix_path):
    return prefix_path + ".idx"


def data_file_path(prefix_path):
    return prefix_path + ".bin"


class MMapIndexedDataset(torch.utils.data.Dataset):
    class Index(object):
        _HDR_MAGIC = b"MMIDIDX\x00\x00"

        @classmethod
        def writer(cls, path, dtype):
            class _Writer(object):
                def __enter__(self):
                    self._file = open(path, "wb")

                    # Write magic string so we can check the file format when opening it again.
                    self._file.write(cls._HDR_MAGIC)
                    # Write version number
                    # Little endian unsigned 64 Bit integer
                    self._file.write(struct.pack("<Q", 1))
                    # Little endian unsigned 8 Bit integer
                    self._file.write(struct.pack("<B", code(dtype)))

                    return self

                @staticmethod
                def _get_pointers(sizes):
                    dtype_size = dtype().itemsize
                    address = 0
                    pointers = []

                    for size in sizes:
                        pointers.append(address)
                        address += size * dtype_size

                    return pointers

                def write(self, sizes, doc_idx):
                    pointers = self._get_pointers(sizes)

                    # Little endian unsigned 64 Bit integer
                    self._file.write(struct.pack("<Q", len(sizes)))
                    # Little endian unsigned 64 Bit integer
                    self._file.write(struct.pack("<Q", len(doc_idx)))

                    sizes = np.array(sizes, dtype=np.int32)
                    self._file.write(sizes.tobytes(order="C"))
                    del sizes

                    pointers = np.array(pointers, dtype=np.int64)
                    self._file.write(pointers.tobytes(order="C"))
                    del pointers

                    doc_idx = np.array(doc_idx, dtype=np.int64)
                    self._file.write(doc_idx.tobytes(order="C"))

                def __exit__(self, exc_type, exc_val, exc_tb):
                    self._file.close()

            return _Writer()

        def __init__(self, path, skip_warmup=False):
            with open(path, "rb") as stream:
                magic_test = stream.read(9)
                assert self._HDR_MAGIC == magic_test, (
                    "Index file doesn't match expected format. "
                    "Make sure that --dataset-impl is configured properly."
                )
                # Little endian unsigned 64 Bit integer
                version = struct.unpack("<Q", stream.read(8))
                assert (1,) == version

                # Little endian unsigned 8 Bit integer
                (dtype_code,) = struct.unpack("<B", stream.read(1))
                self._dtype = dtypes[dtype_code]
                self._dtype_size = self._dtype().itemsize

                self._len = struct.unpack("<Q", stream.read(8))[0]
                self._doc_count = struct.unpack("<Q", stream.read(8))[0]
                offset = stream.tell()

            if not skip_warmup:
                print_rank_0(" warming up index mmap file...")
                _warmup_mmap_file(path)

            self._bin_buffer_mmap = np.memmap(path, mode="r", order="C")
            self._bin_buffer = memoryview(self._bin_buffer_mmap)
            print_rank_0(" reading sizes...")
            self._sizes = np.frombuffer(
                self._bin_buffer, dtype=np.int32, count=self._len, offset=offset
            )
            print_rank_0(" reading pointers...")
            self._pointers = np.frombuffer(
                self._bin_buffer,
                dtype=np.int64,
                count=self._len,
                offset=offset + self._sizes.nbytes,
            )
            print_rank_0(" reading document index...")
            self._doc_idx = np.frombuffer(
                self._bin_buffer,
                dtype=np.int64,
                count=self._doc_count,
                offset=offset + self._sizes.nbytes + self._pointers.nbytes,
            )

        def __del__(self):
            self._bin_buffer_mmap._mmap.close()
            del self._bin_buffer_mmap

        @property
        def dtype(self):
            return self._dtype

        @property
        def sizes(self):
            return self._sizes

        @property
        def doc_idx(self):
            return self._doc_idx

        @lru_cache(maxsize=8)
        def __getitem__(self, i):
            return self._pointers[i], self._sizes[i]

        def __len__(self):
            return self._len

    def __init__(self, path, skip_warmup=False):
        super().__init__()

        self._path = None
        self._index = None
        self._bin_buffer = None

        self._do_init(path, skip_warmup)

    def __getstate__(self):
        return self._path

    def __setstate__(self, state):
        self._do_init(state)

    def _do_init(self, path, skip_warmup):
        self._path = path
        self._index = self.Index(index_file_path(self._path), skip_warmup)

        if not skip_warmup:
            print_rank_0(" warming up data mmap file...")
            _warmup_mmap_file(data_file_path(self._path))
        print_rank_0(" creating numpy buffer of mmap...")
        self._bin_buffer_mmap = np.memmap(
            data_file_path(self._path), mode="r", order="C"
        )
        print_rank_0(" creating memory view of numpy buffer...")
        self._bin_buffer = memoryview(self._bin_buffer_mmap)

    def __del__(self):
        self._bin_buffer_mmap._mmap.close()
        del self._bin_buffer_mmap
        del self._index

    def __len__(self):
        return len(self._index)

    # @lru_cache(maxsize=8)
    def __getitem__(self, idx):
        if isinstance(idx, int):
            ptr, size = self._index[idx]
            np_array = np.frombuffer(
                self._bin_buffer, dtype=self._index.dtype, count=size, offset=ptr
            )
            return np_array
        elif isinstance(idx, slice):
            start, stop, step = idx.indices(len(self))
            if step != 1:
                raise ValueError("Slices into indexed_dataset must be contiguous")
            ptr = self._index._pointers[start]
            sizes = self._index._sizes[idx]
            offsets = list(accumulate(sizes))
            total_size = sum(sizes)
            np_array = np.frombuffer(
                self._bin_buffer, dtype=self._index.dtype, count=total_size, offset=ptr
            )
            sents = np.split(np_array, offsets[:-1])
            return sents

    def get(self, idx, offset=0, length=None):
        """Retrieves a single item from the dataset with the option to only
        return a portion of the item.

        get(idx) is the same as [idx] but get() does not support slicing.
        """
        ptr, size = self._index[idx]
        if length is None:
            length = size - offset
        ptr += offset * np.dtype(self._index.dtype).itemsize
        np_array = np.frombuffer(
            self._bin_buffer, dtype=self._index.dtype, count=length, offset=ptr
        )
        return np_array

    def pad(self, idx, length=None):
        ptr, size = self._index[idx]
        try:
            np_array = np.frombuffer(
                self._bin_buffer, dtype=self._index.dtype, count=length, offset=ptr
            )
        except:
            np_array = np.frombuffer(
                self._bin_buffer, dtype=self._index.dtype, count=size, offset=ptr
            )
            ptr0, _ = self._index[0]
            np_array0 = np.frombuffer(
                self._bin_buffer,
                dtype=self._index.dtype,
                count=length - size,
                offset=ptr0,
            )
            np_array = np.append(np_array, np_array0)
        return np_array

    def only(self, idx):
        ptr, size = self._index[idx]
        np_array = np.frombuffer(
            self._bin_buffer, dtype=self._index.dtype, count=size, offset=ptr
        )

        return np_array

    @property
    def sizes(self):
        return self._index.sizes

    @property
    def doc_idx(self):
        return self._index.doc_idx

    def get_doc_idx(self):
        return self._index._doc_idx

    def set_doc_idx(self, doc_idx_):
        self._index._doc_idx = doc_idx_

    @property
    def supports_prefetch(self):
        return False

    @staticmethod
    def exists(path):
        return os.path.exists(index_file_path(path)) and os.path.exists(
            data_file_path(path)
        )
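The .idx layout written above is: the MMIDIDX magic, a u64 version, a u8 dtype code, u64 counts for sizes and doc_idx, then the int32 sizes array, the int64 byte-pointer array, and the int64 doc_idx array. A minimal usage sketch (illustrative only, not part of this commit; the dataset prefix is a hypothetical path to a .bin/.idx pair):

from src.binidx import MMapIndexedDataset

ds = MMapIndexedDataset("data/sample_text_document")  # opens <prefix>.idx and <prefix>.bin
print(len(ds))                                  # number of stored sequences
tokens = ds[0]                                  # token ids of item 0, as a numpy array
window = ds.get(idx=0, offset=128, length=512)  # 512 tokens starting at offset 128

This get(idx=0, offset=..., length=...) pattern is exactly how MyDataset in dataset.py below reads fixed-size training windows.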
242  finetune/lora/v6/src/dataset.py  vendored  Normal file
@@ -0,0 +1,242 @@
########################################################################################################
# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM
########################################################################################################

import json, math, random, os, sys
import numpy as np
import torch
from torch.utils.data import Dataset
from pytorch_lightning.utilities import rank_zero_info
from .binidx import MMapIndexedDataset
from .utils import MaybeIsPrime


class MyDataset(Dataset):
    def __init__(self, args):
        self.args = args

        if args.data_type == "binidx":
            self.vocab_size = args.vocab_size
            rank_zero_info(
                f"Current vocab size = {self.vocab_size} (make sure it's correct)"
            )

            if args.my_pile_version == 1:
                self.data = MMapIndexedDataset(args.data_file)
                self.data_size = (
                    len(self.data._bin_buffer) // self.data._index._dtype_size
                )
                rank_zero_info(f"Data has {self.data_size} tokens.")
            elif args.my_pile_version == 2:
                data_list = (
                    open(args.data_file, "r", encoding="utf-8")
                    .read()
                    .strip()
                    .split("\n")
                )
                data_list = [i.strip().split(" ") for i in data_list]
                self.data = []
                self.data_size = int(data_list[-1][-1])
                rank_zero_info(f"Data has {self.data_size} chunks.")
                for d in data_list:
                    data = MMapIndexedDataset(d[0])
                    data_size = len(data._bin_buffer) // data._index._dtype_size
                    assert (data_size - args.ctx_len) == int(d[1])
                    self.data += [[int(d[-1]), int(d[1]), data]]
                # rank_zero_info(self.data)

            if args.my_qa_mask > 0:
                # self.data_pile = MMapIndexedDataset('/fsx/pile/pile_20B_tokenizer_text_document')
                self.data_pile = MMapIndexedDataset(
                    "/fsx/pile_deduped/pile_0.87_deduped_text_document"
                )
                self.data_pile_size = (
                    len(self.data_pile._bin_buffer) // self.data._index._dtype_size
                )
            else:
                self.data_pile = None
                self.data_pile_size = 0

            if args.my_pile_stage > 0:
                # assert self.data_size == 332115325534 and self.vocab_size == 50277
                self.samples_per_epoch = args.epoch_steps * args.real_bsz
                assert self.samples_per_epoch == 40320
                rank_zero_info(
                    f"########## Pile 20b-tokenized stage {args.my_pile_stage} ##########"
                )
                dataset_slot = self.data_size // args.ctx_len
                if args.my_pile_stage != 4:
                    assert MaybeIsPrime(args.magic_prime)
                    assert args.magic_prime % 3 == 2
                    assert (
                        args.magic_prime / dataset_slot > 0.99
                        and args.magic_prime / dataset_slot <= 1
                    )
        elif args.data_type == "numpy":
            self.data = np.load(args.data_file).astype("int")
            self.vocab_size = args.vocab_size
            rank_zero_info(
                f"Current vocab size = {self.vocab_size} (make sure it's correct)"
            )
            self.data_size = len(self.data)
            rank_zero_info(f"Data has {self.data_size} tokens.")
        elif args.data_type == "uint16":
            self.data = (
                np.fromfile(args.data_file, dtype=np.uint16)
                .astype("int32")
                .reshape(-1, args.my_sample_len)
            )
            self.vocab_size = args.vocab_size
            rank_zero_info(
                f"Current vocab size = {self.vocab_size} (make sure it's correct)"
            )
            self.data_size = self.data.shape[0]
            rank_zero_info(f"Data has {self.data_size} samples.")
        else:
            if args.data_type == "dummy":
                rank_zero_info("Building dummy data...")
                self.data = ""
                for i in range(100000):
                    aa = (i) % 10000
                    bb = (i * i) % 10000
                    cc = aa + bb
                    self.data += f".{aa}+{bb}={cc}."
            else:
                self.data = open(args.data_file, "r", encoding=args.data_type).read()
            rank_zero_info("Building token list...")
            unique = sorted(list(set(self.data)))
            self.vocab_size = len(unique)
            # rank_zero_info()
            # for u in unique:
            #     print(u, end=' ')
            # rank_zero_info('\n\n')
            xx = 0
            xxObj = {}
            for u in unique:
                xxObj[xx] = u
                xx += 1
            with open(
                f"{args.proj_dir}/vocab.json", "w", encoding="utf-8"
            ) as vocab_file:
                vocab_file.write(json.dumps(xxObj, ensure_ascii=False))
            self.data_size = len(self.data)
            rank_zero_info(
                f"Data has {self.data_size} tokens, {self.vocab_size} vocab size."
            )
            self.stoi = {ch: i for i, ch in enumerate(unique)}
            self.itos = {i: ch for i, ch in enumerate(unique)}

    def __len__(self):
        return self.args.epoch_steps * self.args.micro_bsz

    def __getitem__(self, idx):
        args = self.args
        rank = self.global_rank
        epoch = self.real_epoch
        world_size = self.world_size
        # print(f"epoch {epoch} idx {idx} rank {rank}/{world_size}")

        if args.data_type == "uint16":
            i = np.random.randint(0, self.data_size - 1)
            dix = self.data[i]
            x = torch.tensor(dix[:-1], dtype=torch.long)
            y = torch.tensor(dix[1:], dtype=torch.long)
        else:
            ctx_len = args.ctx_len
            req_len = ctx_len + 1
            magic_prime = args.magic_prime
            data = self.data

            if args.my_pile_stage > 0:
                ii = 1 + epoch * self.samples_per_epoch + (idx * world_size) + rank

                if args.my_qa_mask > 0:
                    ii_orig = ii
                    if ii % 2 == 0:
                        ii = -1
                        data = self.data_pile
                    else:
                        ii = ii // 2
                if data == self.data_pile:
                    i = np.random.randint(0, self.data_pile_size - req_len)
                else:
                    if args.my_pile_stage == 4 or ii < args.my_random_steps:
                        # cheat: pick a random spot in dataset
                        if args.my_pile_version == 1:
                            i = np.random.randint(0, self.data_size - req_len)
                        else:
                            i = np.random.randint(0, self.data_size)
                    else:
                        ii = ii - args.my_random_steps
                        factor = (math.sqrt(5) - 1) / 2
                        factor = int(magic_prime * factor)
                        i = ((factor * ii * ii * ii) % magic_prime) * ctx_len
                        i = i + args.my_pile_shift
                        # print(f"epoch {epoch} idx {idx} rank {rank}/{world_size} ii {ii} pos {round(i / self.data_size, 3)}")
            else:
                # cheat: pick a random spot in dataset
                i = np.random.randint(0, self.data_size - req_len)

            if args.data_type == "binidx":
                if args.my_pile_version == 1:
                    dix = data.get(idx=0, offset=i, length=req_len).astype(int)
                    # dix = data.pad(idx=idx, length=req_len).astype(int)
                else:
                    # self.data : cutoff, chunk_count, data
                    for j in range(len(data)):
                        if i < data[j][0]:
                            ii = i
                            i = (i - (data[j - 1][0] if j > 0 else 0)) % data[j][1]
                            dix = (
                                data[j][2]
                                .get(idx=0, offset=i, length=req_len)
                                .astype(int)
                            )
                            # print(ii, j, i)
                            break
            elif args.data_type == "numpy":
                dix = data[i : i + req_len]
            else:
                dix = [self.stoi[s] for s in data[i : i + req_len]]

            if args.my_qa_mask == 1:
                if data == self.data_pile:
                    z = [1] * ctx_len
                else:
                    z = [0] * ctx_len
                    z_sum = 0
                    isGood = False
                    for i in range(3, ctx_len):
                        if (
                            dix[i] == 27
                            and dix[i - 1] == 34
                            and dix[i - 2] == 187
                            and dix[i - 3] == 187
                        ):
                            isGood = True
                        if dix[i] == 0:
                            isGood = False
                        if isGood:
                            z[i] = 1
                            z_sum += 1
                    if z_sum == 0:
                        z = [1] * ctx_len
                        i = np.random.randint(0, self.data_pile_size - req_len)
                        dix = self.data_pile.get(
                            idx=0, offset=i, length=req_len
                        ).astype(int)
                z = torch.tensor(z, dtype=torch.bfloat16)

            x = torch.tensor(dix[:-1], dtype=torch.long)
            y = torch.tensor(dix[1:], dtype=torch.long)

            # if ii_orig < 50:
            #     # if rank == 1:
            #     print('rank', rank, 'i', ii_orig, ii, i, 'x', x[:5], '...', x[-5:])
            # else:
            #     exit(0)

            if args.my_qa_mask == 1:
                return x, y, z

        return x, y
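The magic_prime branch above is a deterministic shuffle: because magic_prime is a prime p with p % 3 == 2 (asserted in __init__), the map x -> x**3 mod p is a bijection on 0..p-1, and multiplying by a fixed nonzero factor keeps it one, so every ctx_len-aligned slot is visited exactly once per pass. A small self-check of that property (illustrative, not part of this commit; p = 17 is a toy value):

import math

p = 17                                    # toy "magic prime": prime, and 17 % 3 == 2
factor = int(p * (math.sqrt(5) - 1) / 2)  # same golden-ratio factor as __getitem__
slots = sorted((factor * ii**3) % p for ii in range(p))
assert slots == list(range(p))            # a permutation: every slot hit exactly once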
1086  finetune/lora/v6/src/model.py  vendored  Normal file
File diff suppressed because it is too large
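model.py (the source of LORA_CONFIG imported by trainer.py below) is suppressed here. As a rough sketch of the LoRA mechanism such a file injects — illustrative only, not the file's actual contents — a linear layer gains a trainable low-rank update A/B while the base weight stays frozen; the ".lora_" name prefix is what the checkpoint filter in trainer.py matches:

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class LoraLinear(nn.Module):
    # Illustrative LoRA layer: y = x W^T + (alpha / r) * dropout(x) A^T B^T
    def __init__(self, in_features, out_features, r=8, alpha=16, dropout=0.01):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))  # frozen base weight in real use
        self.lora_A = nn.Parameter(torch.zeros(r, in_features))
        self.lora_B = nn.Parameter(torch.zeros(out_features, r))  # zero init: update is a no-op at start
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        self.scaling = alpha / r
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        base = F.linear(x, self.weight)
        update = F.linear(F.linear(self.dropout(x), self.lora_A), self.lora_B)
        return base + self.scaling * update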
310  finetune/lora/v6/src/trainer.py  vendored  Normal file
@@ -0,0 +1,310 @@
import os, math, time, datetime, subprocess
import torch
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from pytorch_lightning.utilities import rank_zero_info, rank_zero_only
from .model import LORA_CONFIG


def my_save(args, trainer, dd, ff):
    if "14b-run1" in ff:
        fn = ff.split("/")[-1]
        fff = "/dev/shm/" + fn
        torch.save(dd, fff)
        subprocess.Popen(f" aws s3 mv {fff} s3://rwkv-14b-4k/{fn} --quiet", shell=True)
    elif ("world/14b" in ff) or ("world/7b" in ff):
        aa = ff.split("/")[1]
        fn = ff.split("/")[-1]
        fff = f"/dev/shm/{aa}-{fn}"
        torch.save(dd, fff)
        subprocess.Popen(
            f" aws s3 mv {fff} s3://rwkv-world/{aa}-{fn} --quiet", shell=True
        )
    else:
        if "deepspeed_stage_3" in args.strategy:
            trainer.save_checkpoint(ff, weights_only=True)
        else:
            torch.save(dd, ff)


class train_callback(pl.Callback):
    def __init__(self, args):
        super().__init__()
        self.args = args

    def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
        args = self.args
        # if args.cuda_cleanup > 0:
        #     torch.cuda.empty_cache()
        real_step = trainer.global_step + args.epoch_begin * args.epoch_steps

        # LR schedule
        w_step = args.warmup_steps
        if args.lr_final == args.lr_init or args.epoch_count == 0:
            lr = args.lr_init
        else:
            decay_step = real_step - args.my_pile_edecay * args.epoch_steps
            decay_total = (args.epoch_count - args.my_pile_edecay) * args.epoch_steps
            progress = (decay_step - w_step + 1) / (decay_total - w_step)
            progress = min(1, max(0, progress))

            if args.lr_final == 0 or args.lr_init == 0:  # linear decay
                lr = args.lr_init + (args.lr_final - args.lr_init) * progress
            else:  # exp decay
                lr = args.lr_init * math.exp(
                    math.log(args.lr_final / args.lr_init) * pow(progress, 1)
                )
            # if trainer.is_global_zero:
            #     print(trainer.global_step, decay_step, decay_total, w_step, progress, lr)

        if args.my_exit_tokens != 0:  # cosine decay
            real_tokens = real_step * args.ctx_len * args.real_bsz
            warmup_tokens = w_step * args.ctx_len * args.real_bsz
            progress = (real_tokens - warmup_tokens) / (
                abs(args.my_exit_tokens) - warmup_tokens
            )
            progress = max(0, min(1, progress))
            lr_final_factor = args.lr_final / args.lr_init
            lr_mult = (0.5 + lr_final_factor / 2) + (
                0.5 - lr_final_factor / 2
            ) * math.cos(math.pi * progress)
            if args.my_exit_tokens > 0:
                lr = args.lr_init * lr_mult
            else:
                lr = (lr + args.lr_init * lr_mult) / 2
            if progress >= 1:
                if (trainer.is_global_zero) or ("deepspeed_stage_3" in args.strategy):
                    my_save(
                        args,
                        trainer,
                        pl_module.state_dict(),
                        f"{args.proj_dir}/rwkv-final.pth",
                    )
                    exit(0)
        if trainer.global_step < w_step:
            lr = lr * (0.2 + 0.8 * trainer.global_step / w_step)

        if args.weight_decay_final > 0:
            wd_now = args.weight_decay * math.exp(
                math.log(args.weight_decay_final / args.weight_decay) * progress
            )
        else:
            wd_now = args.weight_decay

        for param_group in trainer.optimizers[0].param_groups:
            if param_group["weight_decay"] > 0:
                param_group["weight_decay"] = wd_now
            if args.layerwise_lr > 0:
                param_group["lr"] = lr * param_group["my_lr_scale"]
                # print(param_group["lr"], param_group["my_lr_scale"])
            else:
                param_group["lr"] = lr

        trainer.my_lr = lr
        trainer.my_wd = wd_now
        # rank_zero_info(f"{real_step} {lr}")

        if trainer.global_step == 0:
            if trainer.is_global_zero:  # logging
                trainer.my_loss_sum = 0
                trainer.my_loss_count = 0
                trainer.my_log = open(args.proj_dir + "/train_log.txt", "a")
                trainer.my_log.write(
                    f"NEW RUN {args.my_timestamp}\n{vars(self.args)}\n"
                )
                try:
                    print(f"\n{trainer.strategy.config}\n")
                    trainer.my_log.write(f"{trainer.strategy.config}\n")
                except:
                    pass
                trainer.my_log.flush()
                if len(args.wandb) > 0:
                    print("Login to wandb...")
                    import wandb

                    wandb.init(
                        project=args.wandb,
                        name=args.run_name + " " + args.my_timestamp,
                        config=args,
                        save_code=False,
                    )
                    trainer.my_wandb = wandb

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        args = self.args
        token_per_step = args.ctx_len * args.real_bsz
        real_step = trainer.global_step + args.epoch_begin * args.epoch_steps
        if trainer.is_global_zero:  # logging
            t_now = time.time_ns()
            kt_s = 0
            try:
                t_cost = (t_now - trainer.my_time_ns) / 1e9
                kt_s = token_per_step / t_cost / 1000
                self.log("REAL it/s", 1.0 / t_cost, prog_bar=True, on_step=True)
                self.log("Kt/s", kt_s, prog_bar=True, on_step=True)
            except:
                pass
            trainer.my_time_ns = t_now
            if pl.__version__[0] == "2":
                trainer.my_loss = outputs["loss"]
            else:
                trainer.my_loss = trainer.my_loss_all.float().mean().item()
            trainer.my_loss_sum += trainer.my_loss
            trainer.my_loss_count += 1
            trainer.my_epoch_loss = trainer.my_loss_sum / trainer.my_loss_count
            self.log("lr", trainer.my_lr, prog_bar=True, on_step=True)
            self.log("loss", trainer.my_epoch_loss, prog_bar=True, on_step=True)
            # self.log("s", real_step, prog_bar=True, on_step=True)

            if len(args.wandb) > 0:
                lll = {
                    "loss": trainer.my_loss,
                    "lr": trainer.my_lr,
                    "wd": trainer.my_wd,
                    "Gtokens": real_step * token_per_step / 1e9,
                }
                if kt_s > 0:
                    lll["kt/s"] = kt_s
                trainer.my_wandb.log(lll, step=int(real_step))
        if (trainer.is_global_zero) or (
            "deepspeed_stage_3" in args.strategy
        ):  # save pth
            if args.magic_prime > 0:
                expand_factor = 2 if args.my_qa_mask > 0 else 1
                if int(real_step) == int(
                    args.magic_prime * expand_factor // args.real_bsz
                ) - 1 + int(args.my_random_steps):
                    to_save_dict = pl_module.state_dict()
                    my_save(
                        args,
                        trainer,
                        to_save_dict,
                        f"{args.proj_dir}/rwkv-final.pth",
                    )
        # if args.batch_save==batch_idx :
        #     to_save_dict = pl_module.state_dict()
        #     for name, state in to_save_dict.items():
        #         if 'img' in name:
        #             to_save_dict[name] = state
        #     try:
        #         my_save(
        #             args, trainer,
        #             to_save_dict,
        #             f"{args.proj_dir}/rwkv-{args.epoch_begin + trainer.current_epoch}-{batch_idx}.pth",
        #         )
        #     except Exception as e:
        #         print('Error\n\n', e, '\n\n')

    def on_train_epoch_start(self, trainer, pl_module):
        args = self.args
        if pl.__version__[0] == "2":
            dataset = trainer.train_dataloader.dataset
        else:
            dataset = trainer.train_dataloader.dataset.datasets
        assert "MyDataset" in str(dataset)
        dataset.global_rank = trainer.global_rank
        dataset.real_epoch = int(args.epoch_begin + trainer.current_epoch)
        dataset.world_size = trainer.world_size
        # print(f'########## world_size {dataset.world_size} global_rank {dataset.global_rank} real_epoch {dataset.real_epoch} ##########')

    def on_train_epoch_end(self, trainer, pl_module):
        args = self.args
        to_save_dict = {}
        if (trainer.is_global_zero) or (
            "deepspeed_stage_3" in args.strategy
        ):  # save pth
            if (
                args.epoch_save > 0 and trainer.current_epoch % args.epoch_save == 0
            ) or (trainer.current_epoch == args.epoch_count - 1):
                if args.data_type == "wds_img":
                    raw_dict = pl_module.state_dict()
                    for k in raw_dict:
                        if k.startswith("encoder.") or k.startswith("decoder."):
                            to_save_dict[k] = raw_dict[k]
                else:
                    to_save_dict = pl_module.state_dict()

                if args.data_type == "img" and not args.lora:
                    for name, state in to_save_dict.items():
                        if "img" in name:
                            to_save_dict[name] = state

                if args.lora:
                    enable_time_finetune = "time" in LORA_CONFIG["parts"]
                    enable_ln_finetune = "ln" in LORA_CONFIG["parts"]
                    lora_dict = {}
                    for name, state in to_save_dict.items():
                        if "img" in name:
                            lora_dict[name] = state
                        if (
                            ".lora_" in name
                            or (enable_time_finetune and ".time_" in name)
                            or (enable_ln_finetune and ".ln" in name)
                        ):
                            lora_dict[name] = state
                    to_save_dict = lora_dict

                try:
                    my_save(
                        args,
                        trainer,
                        to_save_dict,
                        f"{args.proj_dir}/rwkv-{args.epoch_begin + trainer.current_epoch}.pth",
                    )
                except Exception as e:
                    print("Error\n\n", e, "\n\n")

        if trainer.is_global_zero:  # logging
            trainer.my_log.write(
                f"{args.epoch_begin + trainer.current_epoch} {trainer.my_epoch_loss:.6f} {math.exp(trainer.my_epoch_loss):.4f} {trainer.my_lr:.8f} {datetime.datetime.now()} {trainer.current_epoch}\n"
            )
            trainer.my_log.flush()

            trainer.my_loss_sum = 0
            trainer.my_loss_count = 0
            if (args.epoch_begin + trainer.current_epoch) >= args.my_exit:
                exit(0)


@rank_zero_only
def generate_init_weight(model, init_weight_name):
    mm = model.generate_init_weight()

    if model.args.my_pile_stage == 1:
        if len(model.args.load_model) > 0:
            print(f"Combine weights from {model.args.load_model}...")
            load_dict = torch.load(model.args.load_model, map_location="cpu")
            for k in load_dict:
                try:
                    assert k in mm
                except:
                    print("missing", k)
                    exit(0)
                src = load_dict[k]
                try:
                    mm[k] = src.reshape(mm[k].shape)
                except:
                    tmp = mm[k].squeeze().clone()
                    print(k, src.shape, "-->", mm[k].shape)
                    ss = src.shape[0]
                    dd = tmp.shape[0]
                    for i in range(dd):
                        pos = i / dd * ss
                        if pos >= ss - 1:
                            tmp[i] = src[ss - 1]
                        else:
                            p0 = int(math.floor(pos))
                            ii = pos - p0
                            tmp[i] = src[p0] * (1 - ii) + src[p0 + 1] * (ii)
                    mm[k] = tmp.reshape(mm[k].shape)
                    sss = src.squeeze().float().cpu().numpy()
                    print(sss[:10], "...", sss[-10:])
                    mmm = mm[k].squeeze().float().cpu().numpy()
                    print(mmm[:10], "...", mmm[-10:])

    print(f"Save to {init_weight_name}...")
    torch.save(mm, init_weight_name)

    if model.args.my_pile_stage == 1:
        print("Done. Now go for stage 2.")
        exit(0)
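The token-based cosine schedule in on_train_batch_start reduces to lr(progress) = lr_init * ((0.5 + f/2) + (0.5 - f/2) * cos(pi * progress)) with f = lr_final / lr_init, so the curve starts at lr_init and lands exactly on lr_final. A standalone restatement (illustrative, not part of this commit):

import math

def cosine_lr(lr_init, lr_final, progress):
    f = lr_final / lr_init
    lr_mult = (0.5 + f / 2) + (0.5 - f / 2) * math.cos(math.pi * progress)
    return lr_init * lr_mult

assert abs(cosine_lr(6e-4, 6e-5, 0.0) - 6e-4) < 1e-12  # starts at lr_init
assert abs(cosine_lr(6e-4, 6e-5, 1.0) - 6e-5) < 1e-12  # ends at lr_final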
139  finetune/lora/v6/src/utils.py  vendored  Normal file
@@ -0,0 +1,139 @@
import json, time, random, os
import numpy as np
import torch
from torch.nn import functional as F

time_slot = {}
time_ref = time.time_ns()


def record_time(name):
    if name not in time_slot:
        time_slot[name] = 1e20
    tt = (time.time_ns() - time_ref) / 1e9
    if tt < time_slot[name]:
        time_slot[name] = tt


class TOKENIZER:
    def __init__(self, WORD_NAME, UNKNOWN_CHAR="\ue083"):
        if "list" in str(type(WORD_NAME)):
            self.charMode = False
            if WORD_NAME[0] == WORD_NAME[1]:
                from transformers import PreTrainedTokenizerFast

                self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=WORD_NAME[0])
            else:
                from transformers import GPT2TokenizerFast

                self.tokenizer = GPT2TokenizerFast(WORD_NAME[0], WORD_NAME[1])
            self.vocab_size = len(self.tokenizer)
        else:
            self.charMode = True
            with open(WORD_NAME + ".json", "r", encoding="utf-16") as result_file:
                self.word_table = json.load(result_file)

            self.vocab_size = len(self.word_table)

            self.stoi = {v: int(k) for k, v in self.word_table.items()}
            self.itos = {int(k): v for k, v in self.word_table.items()}

            self.UNKNOWN_CHAR = self.stoi[UNKNOWN_CHAR]

    def refine_context(self, context):
        context = context.strip().split("\n")
        for c in range(len(context)):
            context[c] = context[c].strip().strip("\u3000").strip("\r")
        context = list(filter(lambda c: c != "", context))
        context = "\n" + ("\n".join(context)).strip()
        if context == "":
            context = "\n"
        return context

    def sample_logits(
        self, out, x, ctx_len, temperature=1.0, top_p_usual=None, top_p_newline=None
    ):
        # out[self.UNKNOWN_CHAR] = -float('Inf')
        lastChar = int(x[-1])

        probs = F.softmax(out, dim=-1)

        if self.charMode:
            if self.itos[lastChar] == "\n":
                top_p = top_p_newline
            else:
                top_p = top_p_usual
        else:
            top_p = top_p_usual

        if os.environ["RWKV_RUN_DEVICE"] == "cpu":
            probs = probs.numpy()
            sorted_probs = np.sort(probs)[::-1]
            cumulative_probs = np.cumsum(sorted_probs)
            cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)])
            probs[probs < cutoff] = 0
            if temperature != 1.0:
                # probs is a numpy array here, so use np.power
                # (the tensor-style probs.pow(...) would raise AttributeError)
                probs = np.power(probs, 1.0 / temperature)
            probs = probs / np.sum(probs)
            out = np.random.choice(a=len(probs), p=probs)
            return out
        else:
            sorted_probs = torch.sort(probs, descending=True)[0]
            cumulative_probs = torch.cumsum(sorted_probs, dim=-1).cpu().numpy()
            cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)])
            probs[probs < cutoff] = 0
            if temperature != 1.0:
                probs = probs.pow(1.0 / temperature)
            out = torch.multinomial(probs, num_samples=1)[0]
            return out


def MaybeIsPrime(number):
    if FermatPrimalityTest(number) and MillerRabinPrimalityTest(number):
        return True
    else:
        return False


def FermatPrimalityTest(number):
    if number > 1:
        for time in range(3):
            randomNumber = random.randint(2, number) - 1
            if pow(randomNumber, number - 1, number) != 1:
                return False
        return True
    else:
        return False


def MillerRabinPrimalityTest(number):
    if number == 2:
        return True
    elif number == 1 or number % 2 == 0:
        return False
    oddPartOfNumber = number - 1
    timesTwoDividNumber = 0
    while oddPartOfNumber % 2 == 0:
        oddPartOfNumber = oddPartOfNumber // 2
        timesTwoDividNumber = timesTwoDividNumber + 1

    for time in range(3):
        while True:
            randomNumber = random.randint(2, number) - 1
            if randomNumber != 0 and randomNumber != 1:
                break

        randomNumberWithPower = pow(randomNumber, oddPartOfNumber, number)

        if (randomNumberWithPower != 1) and (randomNumberWithPower != number - 1):
            iterationNumber = 1

            while (iterationNumber <= timesTwoDividNumber - 1) and (
                randomNumberWithPower != number - 1
            ):
                randomNumberWithPower = pow(randomNumberWithPower, 2, number)
                iterationNumber = iterationNumber + 1
            if randomNumberWithPower != (number - 1):
                return False

    return True
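The cutoff logic in TOKENIZER.sample_logits is standard top-p (nucleus) sampling: keep the smallest prefix of probability-sorted tokens whose cumulative mass exceeds top_p, zero the rest, renormalize, and sample. A compact numpy restatement (illustrative, not part of this commit):

import numpy as np

def top_p_sample(probs, top_p=0.85, rng=np.random.default_rng(0)):
    sorted_probs = np.sort(probs)[::-1]
    cutoff = sorted_probs[np.argmax(np.cumsum(sorted_probs) > top_p)]
    probs = np.where(probs < cutoff, 0.0, probs)  # drop tokens below the cutoff
    return rng.choice(len(probs), p=probs / probs.sum())

print(top_p_sample(np.array([0.5, 0.3, 0.15, 0.05])))  # samples only among indices 0..2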