Add pre-compiled beta CUDA kernel (rwkv-beta==0.8.5, 40%+ faster for fp16). Thanks to #180; a pre-compiled kernel for the RTX 40 Series will be included later.

2023-09-18 23:02:49 +08:00
parent 5e5e1e9651
commit d7abe5f0d1
3 changed files with 3 additions and 0 deletions
--- a/backend-python/rwkv_pip/beta/model.py
+++ b/backend-python/rwkv_pip/beta/model.py
@@ -94,6 +94,7 @@ if os.environ.get("RWKV_CUDA_ON") == "1":
                f"{current_path}/cuda/att_one_v5.cu",
            ],
            verbose=True,
+            extra_ldflags=["cublas.lib"],
            extra_cuda_cflags=[
                "-t 4",
                "-std=c++17",
--- a/backend-python/rwkv_pip/beta/wkv_cuda.pyd
+++ b/backend-python/rwkv_pip/beta/wkv_cuda.pyd