lora finetune (need to be refactored)
This commit is contained in:
133
finetune/lora/cuda/wkv_cuda.cu
vendored
Normal file
133
finetune/lora/cuda/wkv_cuda.cu
vendored
Normal file
@@ -0,0 +1,133 @@
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
|
||||
#define MIN_VALUE (-1e38)
|
||||
|
||||
template <typename F>
|
||||
__global__ void kernel_forward(const int B, const int T, const int C,
|
||||
const F *__restrict__ const _w, const F *__restrict__ const _u, const F *__restrict__ const _k, const F *__restrict__ const _v,
|
||||
F *__restrict__ const _y) {
|
||||
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int _b = idx / C;
|
||||
const int _c = idx % C;
|
||||
const int _offset = _b * T * C + _c;
|
||||
|
||||
F u = _u[_c];
|
||||
F w = _w[_c];
|
||||
const F *__restrict__ const k = _k + _offset;
|
||||
const F *__restrict__ const v = _v + _offset;
|
||||
F *__restrict__ const y = _y + _offset;
|
||||
|
||||
// aa and bb are running sums divided by exp(pp) (to avoid overflow)
|
||||
F aa = 0, bb = 0, pp = MIN_VALUE;
|
||||
for (int i = 0; i < T; i++) {
|
||||
const int ii = i * C;
|
||||
const F kk = k[ii];
|
||||
const F vv = v[ii];
|
||||
|
||||
F ww = u + kk;
|
||||
F p = max(pp, ww);
|
||||
F e1 = exp(pp - p);
|
||||
F e2 = exp(ww - p);
|
||||
y[ii] = (e1 * aa + e2 * vv) / (e1 * bb + e2);
|
||||
|
||||
ww = w + pp;
|
||||
p = max(ww, kk);
|
||||
e1 = exp(ww - p);
|
||||
e2 = exp(kk - p);
|
||||
aa = e1 * aa + e2 * vv;
|
||||
bb = e1 * bb + e2;
|
||||
pp = p;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
__global__ void kernel_backward(const int B, const int T, const int C,
|
||||
const F *__restrict__ const _w, const F *__restrict__ const _u, const F *__restrict__ const _k, const F *__restrict__ const _v,
|
||||
const F *__restrict__ const _y, const F *__restrict__ const _gy,
|
||||
F *__restrict__ const _gw, F *__restrict__ const _gu, F *__restrict__ const _gk, F *__restrict__ const _gv) {
|
||||
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int _b = idx / C;
|
||||
const int _c = idx % C;
|
||||
const int _offset = _b * T * C + _c;
|
||||
|
||||
F u = _u[_c];
|
||||
F w = _w[_c];
|
||||
const F *__restrict__ const k = _k + _offset;
|
||||
const F *__restrict__ const v = _v + _offset;
|
||||
const F *__restrict__ const y = _y + _offset;
|
||||
const F *__restrict__ const gy = _gy + _offset;
|
||||
F *__restrict__ const gk = _gk + _offset;
|
||||
F *__restrict__ const gv = _gv + _offset;
|
||||
|
||||
F q[Tmax], r[Tmax];
|
||||
|
||||
F gw = 0, gu = 0, aa = 0, bb = 0, ga = 0, gb = 0, pp = MIN_VALUE;
|
||||
for (int i = 0; i < T; i++) {
|
||||
const int ii = i * C;
|
||||
const F kk = k[ii];
|
||||
const F vv = v[ii];
|
||||
const F yy = y[ii];
|
||||
|
||||
F ww = u + kk;
|
||||
F p = max(pp, ww);
|
||||
F e1 = exp(pp - p);
|
||||
F e2 = exp(ww - p);
|
||||
const F qq = gy[ii] / (e1 * bb + e2);
|
||||
gw += (ga - gb * yy) * e1 * qq;
|
||||
gu += (vv - yy) * e2 * qq;
|
||||
q[i] = qq;
|
||||
r[i] = ww - p;
|
||||
|
||||
ww = w + pp;
|
||||
p = max(ww, kk);
|
||||
e1 = exp(ww - p);
|
||||
e2 = exp(kk - p);
|
||||
ga = e1 * (aa + ga);
|
||||
gb = e1 * (bb + gb);
|
||||
aa = e1 * aa + e2 * vv;
|
||||
bb = e1 * bb + e2;
|
||||
pp = p;
|
||||
}
|
||||
const int _offsetBC = _b * C + _c;
|
||||
_gw[_offsetBC] = gw * _w[_c]; // multiply by w because of w -> -exp(w) in python forward()
|
||||
_gu[_offsetBC] = gu;
|
||||
|
||||
aa = 0, bb = 0, pp = MIN_VALUE;
|
||||
for (int i = T - 1; i >= 0; i--) {
|
||||
const int ii = i * C;
|
||||
const F kk = k[ii];
|
||||
const F vv = v[ii];
|
||||
const F yy = y[ii];
|
||||
const F qq = q[i];
|
||||
const F rr = r[i];
|
||||
|
||||
F e1 = qq * exp(rr);
|
||||
F e2 = exp(kk + pp);
|
||||
gk[ii] = e1 * (vv - yy) + e2 * (aa * vv + bb);
|
||||
gv[ii] = e1 + e2 * aa;
|
||||
|
||||
const F ww = w + pp;
|
||||
const F www = rr - u - kk;
|
||||
const F p = max(ww, www);
|
||||
e1 = exp(ww - p);
|
||||
e2 = qq * exp(www - p);
|
||||
aa = e1 * aa + e2;
|
||||
bb = e1 * bb - e2 * yy;
|
||||
pp = p;
|
||||
}
|
||||
}
|
||||
|
||||
void cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y) {
|
||||
dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance
|
||||
assert(B * C % threadsPerBlock.x == 0);
|
||||
dim3 numBlocks(B * C / threadsPerBlock.x);
|
||||
kernel_forward<<<numBlocks, threadsPerBlock>>>(B, T, C, w, u, k, v, y);
|
||||
}
|
||||
|
||||
void cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *gy, float *gw, float *gu, float *gk, float *gv) {
|
||||
dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance
|
||||
assert(B * C % threadsPerBlock.x == 0);
|
||||
dim3 numBlocks(B * C / threadsPerBlock.x);
|
||||
kernel_backward<<<numBlocks, threadsPerBlock>>>(B, T, C, w, u, k, v, y, gy, gw, gu, gk, gv);
|
||||
}
|
||||
132
finetune/lora/cuda/wkv_cuda_bf16.cu
vendored
Normal file
132
finetune/lora/cuda/wkv_cuda_bf16.cu
vendored
Normal file
@@ -0,0 +1,132 @@
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
#include "ATen/ATen.h"
|
||||
#define MIN_VALUE (-1e38)
|
||||
typedef at::BFloat16 bf16;
|
||||
|
||||
__global__ void kernel_forward(const int B, const int T, const int C,
|
||||
const float *__restrict__ const _w, const bf16 *__restrict__ const _u, const bf16 *__restrict__ const _k, const bf16 *__restrict__ const _v,
|
||||
bf16 *__restrict__ const _y) {
|
||||
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int _b = idx / C;
|
||||
const int _c = idx % C;
|
||||
const int _offset = _b * T * C + _c;
|
||||
|
||||
float u = float(_u[_c]);
|
||||
float w = _w[_c];
|
||||
const bf16 *__restrict__ const k = _k + _offset;
|
||||
const bf16 *__restrict__ const v = _v + _offset;
|
||||
bf16 *__restrict__ const y = _y + _offset;
|
||||
|
||||
// aa and bb are running sums divided by exp(pp) (to avoid overflow)
|
||||
float aa = 0, bb = 0, pp = MIN_VALUE;
|
||||
for (int i = 0; i < T; i++) {
|
||||
const int ii = i * C;
|
||||
const float kk = float(k[ii]);
|
||||
const float vv = float(v[ii]);
|
||||
|
||||
float ww = u + kk;
|
||||
float p = max(pp, ww);
|
||||
float e1 = exp(pp - p);
|
||||
float e2 = exp(ww - p);
|
||||
y[ii] = bf16((e1 * aa + e2 * vv) / (e1 * bb + e2));
|
||||
|
||||
ww = w + pp;
|
||||
p = max(ww, kk);
|
||||
e1 = exp(ww - p);
|
||||
e2 = exp(kk - p);
|
||||
aa = e1 * aa + e2 * vv;
|
||||
bb = e1 * bb + e2;
|
||||
pp = p;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void kernel_backward(const int B, const int T, const int C,
|
||||
const float *__restrict__ const _w, const bf16 *__restrict__ const _u, const bf16 *__restrict__ const _k, const bf16 *__restrict__ const _v,
|
||||
const bf16 *__restrict__ const _y, const bf16 *__restrict__ const _gy,
|
||||
bf16 *__restrict__ const _gw, bf16 *__restrict__ const _gu, bf16 *__restrict__ const _gk, bf16 *__restrict__ const _gv) {
|
||||
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int _b = idx / C;
|
||||
const int _c = idx % C;
|
||||
const int _offset = _b * T * C + _c;
|
||||
|
||||
float u = float(_u[_c]);
|
||||
float w = _w[_c];
|
||||
const bf16 *__restrict__ const k = _k + _offset;
|
||||
const bf16 *__restrict__ const v = _v + _offset;
|
||||
const bf16 *__restrict__ const y = _y + _offset;
|
||||
const bf16 *__restrict__ const gy = _gy + _offset;
|
||||
bf16 *__restrict__ const gk = _gk + _offset;
|
||||
bf16 *__restrict__ const gv = _gv + _offset;
|
||||
|
||||
float q[Tmax], r[Tmax];
|
||||
|
||||
float gw = 0, gu = 0, aa = 0, bb = 0, ga = 0, gb = 0, pp = MIN_VALUE;
|
||||
for (int i = 0; i < T; i++) {
|
||||
const int ii = i * C;
|
||||
const float kk = float(k[ii]);
|
||||
const float vv = float(v[ii]);
|
||||
const float yy = float(y[ii]);
|
||||
|
||||
float ww = u + kk;
|
||||
float p = max(pp, ww);
|
||||
float e1 = exp(pp - p);
|
||||
float e2 = exp(ww - p);
|
||||
const float qq = float(gy[ii]) / (e1 * bb + e2);
|
||||
gw += (ga - gb * yy) * e1 * qq;
|
||||
gu += (vv - yy) * e2 * qq;
|
||||
q[i] = qq;
|
||||
r[i] = ww - p;
|
||||
|
||||
ww = w + pp;
|
||||
p = max(ww, kk);
|
||||
e1 = exp(ww - p);
|
||||
e2 = exp(kk - p);
|
||||
ga = e1 * (aa + ga);
|
||||
gb = e1 * (bb + gb);
|
||||
aa = e1 * aa + e2 * vv;
|
||||
bb = e1 * bb + e2;
|
||||
pp = p;
|
||||
}
|
||||
const int _offsetBC = _b * C + _c;
|
||||
_gw[_offsetBC] = bf16(gw * _w[_c]); // multiply by w because of w -> -exp(w) in python forward()
|
||||
_gu[_offsetBC] = bf16(gu);
|
||||
|
||||
aa = 0, bb = 0, pp = MIN_VALUE;
|
||||
for (int i = T - 1; i >= 0; i--) {
|
||||
const int ii = i * C;
|
||||
const float kk = float(k[ii]);
|
||||
const float vv = float(v[ii]);
|
||||
const float yy = float(y[ii]);
|
||||
const float qq = q[i];
|
||||
const float rr = r[i];
|
||||
|
||||
float e1 = qq * exp(rr);
|
||||
float e2 = exp(kk + pp);
|
||||
gk[ii] = bf16(e1 * (vv - yy) + e2 * (aa * vv + bb));
|
||||
gv[ii] = bf16(e1 + e2 * aa);
|
||||
|
||||
const float ww = w + pp;
|
||||
const float www = rr - u - kk;
|
||||
const float p = max(ww, www);
|
||||
e1 = exp(ww - p);
|
||||
e2 = qq * exp(www - p);
|
||||
aa = e1 * aa + e2;
|
||||
bb = e1 * bb - e2 * yy;
|
||||
pp = p;
|
||||
}
|
||||
}
|
||||
|
||||
void cuda_forward(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y) {
|
||||
dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance
|
||||
assert(B * C % threadsPerBlock.x == 0);
|
||||
dim3 numBlocks(B * C / threadsPerBlock.x);
|
||||
kernel_forward<<<numBlocks, threadsPerBlock>>>(B, T, C, w, u, k, v, y);
|
||||
}
|
||||
|
||||
void cuda_backward(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, bf16 *gy, bf16 *gw, bf16 *gu, bf16 *gk, bf16 *gv) {
|
||||
dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance
|
||||
assert(B * C % threadsPerBlock.x == 0);
|
||||
dim3 numBlocks(B * C / threadsPerBlock.x);
|
||||
kernel_backward<<<numBlocks, threadsPerBlock>>>(B, T, C, w, u, k, v, y, gy, gw, gu, gk, gv);
|
||||
}
|
||||
21
finetune/lora/cuda/wkv_op.cpp
vendored
Normal file
21
finetune/lora/cuda/wkv_op.cpp
vendored
Normal file
@@ -0,0 +1,21 @@
|
||||
#include <torch/extension.h>
|
||||
|
||||
void cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y);
|
||||
void cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *gy, float *gw, float *gu, float *gk, float *gv);
|
||||
|
||||
void forward(int64_t B, int64_t T, int64_t C, torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y) {
|
||||
cuda_forward(B, T, C, w.data_ptr<float>(), u.data_ptr<float>(), k.data_ptr<float>(), v.data_ptr<float>(), y.data_ptr<float>());
|
||||
}
|
||||
void backward(int64_t B, int64_t T, int64_t C, torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &gy, torch::Tensor &gw, torch::Tensor &gu, torch::Tensor &gk, torch::Tensor &gv) {
|
||||
cuda_backward(B, T, C, w.data_ptr<float>(), u.data_ptr<float>(), k.data_ptr<float>(), v.data_ptr<float>(), y.data_ptr<float>(), gy.data_ptr<float>(), gw.data_ptr<float>(), gu.data_ptr<float>(), gk.data_ptr<float>(), gv.data_ptr<float>());
|
||||
}
|
||||
|
||||
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
|
||||
m.def("forward", &forward, "wkv forward");
|
||||
m.def("backward", &backward, "wkv backward");
|
||||
}
|
||||
|
||||
TORCH_LIBRARY(wkv, m) {
|
||||
m.def("forward", forward);
|
||||
m.def("backward", backward);
|
||||
}
|
||||
25
finetune/lora/cuda/wkv_op_bf16.cpp
vendored
Normal file
25
finetune/lora/cuda/wkv_op_bf16.cpp
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
#include <torch/extension.h>
|
||||
#include "ATen/ATen.h"
|
||||
typedef at::BFloat16 bf16;
|
||||
|
||||
void cuda_forward(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y);
|
||||
void cuda_backward(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, bf16 *gy, bf16 *gw, bf16 *gu, bf16 *gk, bf16 *gv);
|
||||
|
||||
void forward(int64_t B, int64_t T, int64_t C, torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y) {
|
||||
cuda_forward(B, T, C, w.data_ptr<float>(), u.data_ptr<bf16>(), k.data_ptr<bf16>(), v.data_ptr<bf16>(), y.data_ptr<bf16>());
|
||||
}
|
||||
void backward(int64_t B, int64_t T, int64_t C, torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y,
|
||||
torch::Tensor &gy, torch::Tensor &gw, torch::Tensor &gu, torch::Tensor &gk, torch::Tensor &gv) {
|
||||
cuda_backward(B, T, C, w.data_ptr<float>(), u.data_ptr<bf16>(), k.data_ptr<bf16>(), v.data_ptr<bf16>(), y.data_ptr<bf16>(),
|
||||
gy.data_ptr<bf16>(), gw.data_ptr<bf16>(), gu.data_ptr<bf16>(), gk.data_ptr<bf16>(), gv.data_ptr<bf16>());
|
||||
}
|
||||
|
||||
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
|
||||
m.def("forward", &forward, "wkv forward");
|
||||
m.def("backward", &backward, "wkv backward");
|
||||
}
|
||||
|
||||
TORCH_LIBRARY(wkv, m) {
|
||||
m.def("forward", forward);
|
||||
m.def("backward", backward);
|
||||
}
|
||||
Reference in New Issue
Block a user