add WebGPU Python Mode (https://github.com/cryscan/web-rwkv-py)
This commit is contained in:
27
backend-python/convert_safetensors.py
vendored
27
backend-python/convert_safetensors.py
vendored
@@ -30,6 +30,33 @@ def convert_file(pt_filename: str, sf_filename: str, rename={}, transpose_names=
|
||||
if "state_dict" in loaded:
|
||||
loaded = loaded["state_dict"]
|
||||
|
||||
kk = list(loaded.keys())
|
||||
version = 4
|
||||
for x in kk:
|
||||
if "ln_x" in x:
|
||||
version = max(5, version)
|
||||
if "gate.weight" in x:
|
||||
version = max(5.1, version)
|
||||
if int(version) == 5 and "att.time_decay" in x:
|
||||
if len(loaded[x].shape) > 1:
|
||||
if loaded[x].shape[1] > 1:
|
||||
version = max(5.2, version)
|
||||
if "time_maa" in x:
|
||||
version = max(6, version)
|
||||
|
||||
if version == 5.1 and "midi" in pt_filename.lower():
|
||||
import numpy as np
|
||||
|
||||
np.set_printoptions(precision=4, suppress=True, linewidth=200)
|
||||
kk = list(loaded.keys())
|
||||
_, n_emb = loaded["emb.weight"].shape
|
||||
for k in kk:
|
||||
if "time_decay" in k or "time_faaaa" in k:
|
||||
# print(k, mm[k].shape)
|
||||
loaded[k] = (
|
||||
loaded[k].unsqueeze(1).repeat(1, n_emb // loaded[k].shape[0])
|
||||
)
|
||||
|
||||
loaded = {k: v.clone().half() for k, v in loaded.items()}
|
||||
# for k, v in loaded.items():
|
||||
# print(f'{k}\t{v.shape}\t{v.dtype}')
|
||||
|
||||
@@ -37,6 +37,11 @@ def get_args(args: Union[Sequence[str], None] = None):
|
||||
action="store_true",
|
||||
help="whether to use rwkv.cpp (default: False)",
|
||||
)
|
||||
group.add_argument(
|
||||
"--webgpu",
|
||||
action="store_true",
|
||||
help="whether to use webgpu (default: False)",
|
||||
)
|
||||
args = parser.parse_args(args)
|
||||
|
||||
return args
|
||||
|
||||
@@ -8,7 +8,6 @@ import base64
|
||||
from fastapi import APIRouter, Request, status, HTTPException
|
||||
from sse_starlette.sse import EventSourceResponse
|
||||
from pydantic import BaseModel, Field
|
||||
import numpy as np
|
||||
import tiktoken
|
||||
from utils.rwkv import *
|
||||
from utils.log import quick_log
|
||||
@@ -396,6 +395,8 @@ class EmbeddingsBody(BaseModel):
|
||||
|
||||
|
||||
def embedding_base64(embedding: List[float]) -> str:
    """Return *embedding* serialized as a base64 string.

    The values are first converted to a float32 NumPy array; the array's
    raw buffer is then base64-encoded and the resulting ASCII bytes are
    decoded to ``str``.

    Args:
        embedding: The embedding values to serialize.

    Returns:
        The base64 text of the float32 representation of ``embedding``.
    """
    # numpy is imported at call time rather than at module import time.
    import numpy as np

    vector = np.array(embedding).astype(np.float32)
    encoded = base64.b64encode(vector)
    return encoded.decode("utf-8")
|
||||
|
||||
|
||||
|
||||
@@ -87,18 +87,34 @@ def add_state(body: AddStateBody):
|
||||
raise HTTPException(status.HTTP_400_BAD_REQUEST, "trie not loaded")
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
try:
|
||||
devices: List[torch.device] = []
|
||||
state: Union[Any, None] = None
|
||||
|
||||
if body.state is not None:
|
||||
if type(body.state) == list or type(body.state) == np.ndarray:
|
||||
devices = [
|
||||
(
|
||||
tensor.device
|
||||
if hasattr(tensor, "device")
|
||||
else torch.device("cpu")
|
||||
)
|
||||
for tensor in body.state
|
||||
]
|
||||
state = (
|
||||
[tensor.cpu() for tensor in body.state]
|
||||
if hasattr(body.state[0], "device")
|
||||
else copy.deepcopy(body.state)
|
||||
)
|
||||
else:
|
||||
pass # WebGPU
|
||||
|
||||
id: int = trie.insert(body.prompt)
|
||||
devices: List[torch.device] = [
|
||||
(tensor.device if hasattr(tensor, "device") else torch.device("cpu"))
|
||||
for tensor in body.state
|
||||
]
|
||||
dtrie[id] = {
|
||||
"tokens": copy.deepcopy(body.tokens),
|
||||
"state": [tensor.cpu() for tensor in body.state]
|
||||
if hasattr(body.state[0], "device")
|
||||
else copy.deepcopy(body.state),
|
||||
"state": state,
|
||||
"logits": copy.deepcopy(body.logits),
|
||||
"devices": devices,
|
||||
}
|
||||
@@ -174,6 +190,7 @@ def longest_prefix_state(body: LongestPrefixStateBody, request: Request):
|
||||
raise HTTPException(status.HTTP_400_BAD_REQUEST, "trie not loaded")
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
id = -1
|
||||
try:
|
||||
@@ -185,14 +202,16 @@ def longest_prefix_state(body: LongestPrefixStateBody, request: Request):
|
||||
v = dtrie[id]
|
||||
devices: List[torch.device] = v["devices"]
|
||||
prompt: str = trie[id]
|
||||
state: Union[Any, None] = v["state"]
|
||||
|
||||
if state is not None and type(state) == list and hasattr(state[0], "device"):
|
||||
state = [tensor.to(devices[i]) for i, tensor in enumerate(state)]
|
||||
|
||||
quick_log(request, body, "Hit:\n" + prompt)
|
||||
return {
|
||||
"prompt": prompt,
|
||||
"tokens": v["tokens"],
|
||||
"state": [tensor.to(devices[i]) for i, tensor in enumerate(v["state"])]
|
||||
if hasattr(v["state"][0], "device")
|
||||
else v["state"],
|
||||
"state": state,
|
||||
"logits": v["logits"],
|
||||
}
|
||||
else:
|
||||
|
||||
2
backend-python/rwkv_pip/utils.py
vendored
2
backend-python/rwkv_pip/utils.py
vendored
@@ -84,6 +84,8 @@ class PIPELINE:
|
||||
return e / e.sum(axis=axis, keepdims=True)
|
||||
|
||||
def sample_logits(self, logits, temperature=1.0, top_p=0.85, top_k=0):
|
||||
if type(logits) == list:
|
||||
logits = np.array(logits)
|
||||
np_logits = type(logits) == np.ndarray
|
||||
if np_logits:
|
||||
probs = self.np_softmax(logits, axis=-1)
|
||||
|
||||
21
backend-python/rwkv_pip/webgpu/model.py
vendored
Normal file
21
backend-python/rwkv_pip/webgpu/model.py
vendored
Normal file
@@ -0,0 +1,21 @@
|
||||
from typing import Any, List, Union
|
||||
|
||||
try:
|
||||
import web_rwkv_py as wrp
|
||||
except ModuleNotFoundError:
|
||||
try:
|
||||
from . import web_rwkv_py as wrp
|
||||
except ImportError:
|
||||
raise ModuleNotFoundError(
|
||||
"web_rwkv_py not found, install it from https://github.com/cryscan/web-rwkv-py"
|
||||
)
|
||||
|
||||
|
||||
class RWKV:
    """Thin wrapper around a ``web_rwkv_py`` (WebGPU) v5 model.

    Exposes ``model``, a ``w`` weight mapping and ``forward`` on top of the
    ``wrp`` module imported at the top of this file.
    """

    def __init__(self, model_path: str, strategy=None):
        """Load the model stored at *model_path*.

        Args:
            model_path: Path handed to ``wrp.v5.Model`` and ``wrp.peek_info``.
            strategy: Accepted but not used by this class.
        """
        self.model = wrp.v5.Model(model_path, turbo=False)
        model_info = wrp.peek_info(model_path)
        # Fake weight: a placeholder list with one zero per vocabulary entry.
        # NOTE(review): presumably callers only read its length to learn the
        # vocabulary size -- confirm against the pipeline code.
        self.w = {"emb.weight": [0] * model_info.num_vocab}

    def forward(self, tokens: List[int], state: Union[Any, None] = None):
        """Run *tokens* through the model and return ``wrp.v5.run_one``'s result.

        Args:
            tokens: Token ids to feed to the model.
            state: Previous state to continue from, or ``None``.
        """
        result = wrp.v5.run_one(self.model, tokens, state)
        return result
|
||||
@@ -8,7 +8,6 @@ from typing import Dict, Iterable, List, Tuple, Union, Type
|
||||
from utils.log import quick_log
|
||||
from fastapi import HTTPException
|
||||
from pydantic import BaseModel, Field
|
||||
import numpy as np
|
||||
from routes import state_cache
|
||||
import global_var
|
||||
|
||||
@@ -68,6 +67,8 @@ class AbstractRWKV(ABC):
|
||||
pass
|
||||
|
||||
def get_embedding(self, input: str, fast_mode: bool) -> Tuple[List[float], int]:
|
||||
import numpy as np
|
||||
|
||||
if fast_mode:
|
||||
embedding, token_len = self.__fast_embedding(
|
||||
self.fix_tokens(self.pipeline.encode(input)), None
|
||||
@@ -222,6 +223,8 @@ class AbstractRWKV(ABC):
|
||||
def generate(
|
||||
self, prompt: str, stop: Union[str, List[str], None] = None
|
||||
) -> Iterable[Tuple[str, str, int, int]]:
|
||||
import numpy as np
|
||||
|
||||
quick_log(None, None, "Generation Prompt:\n" + prompt)
|
||||
cache = None
|
||||
delta_prompt = prompt
|
||||
@@ -231,7 +234,7 @@ class AbstractRWKV(ABC):
|
||||
)
|
||||
except HTTPException:
|
||||
pass
|
||||
if cache is None or cache["prompt"] == "":
|
||||
if cache is None or cache["prompt"] == "" or cache["state"] is None:
|
||||
self.model_state = None
|
||||
self.model_tokens = []
|
||||
else:
|
||||
@@ -511,6 +514,7 @@ def get_tokenizer(tokenizer_len: int):
|
||||
def RWKV(model: str, strategy: str, tokenizer: Union[str, None]) -> AbstractRWKV:
|
||||
rwkv_beta = global_var.get(global_var.Args).rwkv_beta
|
||||
rwkv_cpp = getattr(global_var.get(global_var.Args), "rwkv.cpp")
|
||||
webgpu = global_var.get(global_var.Args).webgpu
|
||||
|
||||
if "midi" in model.lower() or "abc" in model.lower():
|
||||
os.environ["RWKV_RESCALE_LAYER"] = "999"
|
||||
@@ -526,6 +530,11 @@ def RWKV(model: str, strategy: str, tokenizer: Union[str, None]) -> AbstractRWKV
|
||||
from rwkv_pip.cpp.model import (
|
||||
RWKV as Model,
|
||||
)
|
||||
elif webgpu:
|
||||
print("Using webgpu")
|
||||
from rwkv_pip.webgpu.model import (
|
||||
RWKV as Model,
|
||||
)
|
||||
else:
|
||||
from rwkv_pip.model import (
|
||||
RWKV as Model,
|
||||
|
||||
Reference in New Issue
Block a user