# RWKV-Runner/backend-python/utils/rwkv.py

from abc import ABC, abstractmethod
from enum import Enum, auto
import os
import pathlib
import copy
import re
from typing import Dict, Iterable, List, Tuple, Union, Type
from utils.log import quick_log
from fastapi import HTTPException
from pydantic import BaseModel, Field
from routes import state_cache
import global_var


# Token id treated as end-of-text: AbstractRWKV.generate stops when it is sampled.
END_OF_TEXT = 0
# Token id that TextRWKV.fix_tokens replaces, when it is the last token of a
# non-World prompt, with two single end-of-line tokens.
END_OF_LINE_DOUBLE = 535


# Import-time side effect: point TORCH_EXTENSIONS_DIR at the directory two
# levels above this file (resolved to an absolute path).
os.environ["TORCH_EXTENSIONS_DIR"] = f"{pathlib.Path(__file__).parent.parent.resolve()}"


class RWKVType(Enum):
    """Model families this module distinguishes.

    The members carry the consecutive integers 1..4, in declaration order.
    """

    NoneType = 1  # default set by AbstractRWKV.__init__ before a subclass decides
    Raven = 2
    World = 3
    Music = 4


class AbstractRWKV(ABC):
    def __init__(self, model, pipeline):
        self.name = "rwkv"
        self.model = model
        self.pipeline = pipeline
        self.model_state = None
        self.model_tokens = []
        self.rwkv_type: RWKVType = RWKVType.NoneType
        self.tokenizer_len = len(model.w["emb.weight"])

        self.max_tokens_per_generation = 500
        self.temperature = 1
        self.top_p = 0.3
        self.top_k = 0
        self.penalty_alpha_presence = 0
        self.penalty_alpha_frequency = 1

    @abstractmethod
    def adjust_occurrence(self, occurrence: Dict, token: int):
        """Record that ``token`` was just sampled in the ``occurrence`` table.

        ``generate`` calls this once per sampled token, before the token is
        fed to ``run_rnn``. Subclasses must provide the implementation.
        """

    @abstractmethod
    def adjust_forward_logits(self, logits: List[float], occurrence: Dict, i: int):
        """Modify ``logits`` in place before the next token is sampled.

        ``generate`` calls this at the start of every step; ``i`` is the
        zero-based step index and ``occurrence`` is the table maintained by
        ``adjust_occurrence``. Subclasses must provide the implementation.
        """

    @abstractmethod
    def fix_tokens(self, tokens) -> List[int]:
        """Return ``tokens`` adjusted to what the model expects.

        Background: the model only saw '\\n\\n' as [187, 187] before, but the
        tokenizer outputs [535] for it at the end. Subclasses decide whether
        and how to rewrite the encoded prompt.
        """

    @abstractmethod
    def run_rnn(
        self, _tokens: List[str], newline_adj: int = 0
    ) -> Tuple[List[float], int]:
        """Feed ``_tokens`` to the model and return ``(logits, token_count)``.

        Callers in this class rely on the implementation updating
        ``self.model_state`` and ``self.model_tokens``. Subclasses must
        provide the implementation.
        """

    @abstractmethod
    def delta_postprocess(self, delta: str) -> str:
        """Transform a freshly decoded text fragment before it is emitted.

        ``generate`` passes every decoded fragment through this hook.
        Subclasses must provide the implementation.
        """

    def get_embedding(self, input: str, fast_mode: bool) -> Tuple[List[float], int]:
        import numpy as np

        if fast_mode:
            embedding, token_len = self.__fast_embedding(
                self.fix_tokens(self.pipeline.encode(input)), None
            )
        else:
            self.model_state = None
            self.model_tokens = []
            _, token_len = self.run_rnn(self.fix_tokens(self.pipeline.encode(input)))
            embedding = self.model_state[-11].tolist()
        embedding = (embedding / np.linalg.norm(embedding)).tolist()
        return embedding, token_len

    def __fast_embedding(self, tokens: List[str], state):
        import torch

        tokens = [int(x) for x in tokens]
        token_len = len(tokens)
        self = self.model

        with torch.no_grad():
            w = self.w
            args = self.args

            if state == None:
                state = [None] * args.n_layer * 5
                for i in range(
                    args.n_layer
                ):  # state: 0=att_xx 1=att_aa 2=att_bb 3=att_pp 4=ffn_xx
                    dd = self.strategy[i]
                    dev = dd.device
                    atype = dd.atype
                    state[i * 5 + 0] = torch.zeros(
                        args.n_embd, dtype=atype, requires_grad=False, device=dev
                    ).contiguous()
                    state[i * 5 + 1] = torch.zeros(
                        args.n_embd, dtype=torch.float, requires_grad=False, device=dev
                    ).contiguous()
                    state[i * 5 + 2] = torch.zeros(
                        args.n_embd, dtype=torch.float, requires_grad=False, device=dev
                    ).contiguous()
                    state[i * 5 + 3] = (
                        torch.zeros(
                            args.n_embd,
                            dtype=torch.float,
                            requires_grad=False,
                            device=dev,
                        ).contiguous()
                        - 1e30
                    )
                    state[i * 5 + 4] = torch.zeros(
                        args.n_embd, dtype=atype, requires_grad=False, device=dev
                    ).contiguous()

                    break

            seq_mode = len(tokens) > 1

            x = w["emb.weight"][tokens if seq_mode else tokens[0]]

            for i in range(args.n_layer):
                bbb = f"blocks.{i}."
                att = f"blocks.{i}.att."
                ffn = f"blocks.{i}.ffn."
                dd = self.strategy[i]
                dev = dd.device
                atype = dd.atype
                wtype = dd.wtype
                if seq_mode:
                    if "cuda" in str(dev) and os.environ["RWKV_CUDA_ON"] == "1":
                        ATT = (
                            self.cuda_att_seq
                            if wtype != torch.uint8
                            else self.cuda_att_seq_i8
                        )
                    else:
                        ATT = self.att_seq if wtype != torch.uint8 else self.att_seq_i8
                    FFN = self.ffn_seq if wtype != torch.uint8 else self.ffn_seq_i8
                else:
                    ATT = self.att_one if wtype != torch.uint8 else self.att_one_i8
                    FFN = self.ffn_one if wtype != torch.uint8 else self.ffn_one_i8

                x = x.to(dtype=atype, device=dev)

                kw = w[f"{att}key.weight"]
                vw = w[f"{att}value.weight"]
                rw = w[f"{att}receptance.weight"]
                ow = w[f"{att}output.weight"]
                if dd.stream:
                    kw = kw.to(device=dev, non_blocking=True)
                    vw = vw.to(device=dev, non_blocking=True)
                    rw = rw.to(device=dev, non_blocking=True)
                    ow = ow.to(device=dev, non_blocking=True)
                kmx = w[f"{att}key.weight_mx"] if wtype == torch.uint8 else x
                krx = w[f"{att}key.weight_rx"] if wtype == torch.uint8 else x
                kmy = w[f"{att}key.weight_my"] if wtype == torch.uint8 else x
                kry = w[f"{att}key.weight_ry"] if wtype == torch.uint8 else x
                vmx = w[f"{att}value.weight_mx"] if wtype == torch.uint8 else x
                vrx = w[f"{att}value.weight_rx"] if wtype == torch.uint8 else x
                vmy = w[f"{att}value.weight_my"] if wtype == torch.uint8 else x
                vry = w[f"{att}value.weight_ry"] if wtype == torch.uint8 else x
                rmx = w[f"{att}receptance.weight_mx"] if wtype == torch.uint8 else x
                rrx = w[f"{att}receptance.weight_rx"] if wtype == torch.uint8 else x
                rmy = w[f"{att}receptance.weight_my"] if wtype == torch.uint8 else x
                rry = w[f"{att}receptance.weight_ry"] if wtype == torch.uint8 else x
                omx = w[f"{att}output.weight_mx"] if wtype == torch.uint8 else x
                orx = w[f"{att}output.weight_rx"] if wtype == torch.uint8 else x
                omy = w[f"{att}output.weight_my"] if wtype == torch.uint8 else x
                ory = w[f"{att}output.weight_ry"] if wtype == torch.uint8 else x
                (
                    x,
                    state[i * 5 + 0],
                    state[i * 5 + 1],
                    state[i * 5 + 2],
                    state[i * 5 + 3],
                ) = ATT(
                    x,
                    state[i * 5 + 0],
                    state[i * 5 + 1],
                    state[i * 5 + 2],
                    state[i * 5 + 3],
                    w[f"{bbb}ln1.weight"],
                    w[f"{bbb}ln1.bias"],
                    w[f"{att}time_mix_k"],
                    w[f"{att}time_mix_v"],
                    w[f"{att}time_mix_r"],
                    w[f"{att}time_decay"],
                    w[f"{att}time_first"],
                    kw,
                    vw,
                    rw,
                    ow,
                    kmx,
                    krx,
                    kmy,
                    kry,
                    vmx,
                    vrx,
                    vmy,
                    vry,
                    rmx,
                    rrx,
                    rmy,
                    rry,
                    omx,
                    orx,
                    omy,
                    ory,
                )

                return state[0].tolist(), token_len

    def generate(
        self, prompt: str, stop: Union[str, List[str], None] = None
    ) -> Iterable[Tuple[str, str, int, int]]:
        import numpy as np

        quick_log(None, None, "Generation Prompt:\n" + prompt)
        cache = None
        delta_prompt = prompt
        try:
            cache = state_cache.longest_prefix_state(
                state_cache.LongestPrefixStateBody(prompt=prompt), None
            )
        except HTTPException:
            pass
        if cache is None or cache["prompt"] == "" or cache["state"] is None:
            self.model_state = None
            self.model_tokens = []
        else:
            delta_prompt = prompt[len(cache["prompt"]) :]
            self.model_state = cache["state"]
            self.model_tokens = cache["tokens"]
            logits = cache["logits"]

        prompt_token_len = 0
        if delta_prompt != "":
            logits, prompt_token_len = self.run_rnn(
                self.fix_tokens(self.pipeline.encode(delta_prompt))
            )
            try:
                state_cache.add_state(
                    state_cache.AddStateBody(
                        prompt=prompt,
                        tokens=self.model_tokens,
                        state=self.model_state,
                        logits=logits,
                    )
                )
            except HTTPException:
                pass

        begin = len(self.model_tokens)
        out_last = begin

        occurrence: Dict = {}

        completion_token_len = 0
        response = ""
        for i in range(self.max_tokens_per_generation):
            self.adjust_forward_logits(logits, occurrence, i)

            token = self.pipeline.sample_logits(
                logits, temperature=self.temperature, top_p=self.top_p, top_k=self.top_k
            )

            if token == END_OF_TEXT:
                yield response, "", prompt_token_len, completion_token_len
                break

            self.adjust_occurrence(occurrence, token)

            logits, _ = self.run_rnn([token])
            completion_token_len = completion_token_len + 1
            delta: str = self.delta_postprocess(
                self.pipeline.decode(self.model_tokens[out_last:])
            )
            if "\ufffd" not in delta:  # avoid utf-8 display issues
                response += delta
                if stop is not None:
                    if type(stop) == str:
                        if stop in response:
                            try:
                                state_cache.add_state(
                                    state_cache.AddStateBody(
                                        prompt=prompt + response,
                                        tokens=self.model_tokens,
                                        state=self.model_state,
                                        logits=logits,
                                    )
                                )
                            except HTTPException:
                                pass
                            response = response.split(stop)[0]
                            yield response, "", prompt_token_len, completion_token_len
                            break
                    elif type(stop) == list:
                        stop_exist_regex = "|".join(stop)
                        matched = re.search(stop_exist_regex, response)
                        if matched:
                            try:
                                state_cache.add_state(
                                    state_cache.AddStateBody(
                                        prompt=prompt + response,
                                        tokens=self.model_tokens,
                                        state=self.model_state,
                                        logits=logits,
                                    )
                                )
                            except HTTPException:
                                pass
                            response = response.split(matched.group())[0]
                            yield response, "", prompt_token_len, completion_token_len
                            break
                out_last = begin + i + 1
                if i == self.max_tokens_per_generation - 1:
                    try:
                        state_cache.add_state(
                            state_cache.AddStateBody(
                                prompt=prompt + response,
                                tokens=self.model_tokens,
                                state=self.model_state,
                                logits=logits,
                            )
                        )
                    except HTTPException:
                        pass
                yield response, delta, prompt_token_len, completion_token_len


class TextRWKV(AbstractRWKV):
    def __init__(self, model, pipeline) -> None:
        """Configure a text model and preload its preset system prompt.

        The vocabulary size decides the flavour: fewer than 65536 entries
        selects the Raven settings, anything else the World settings.
        """
        super().__init__(model, pipeline)

        # run_rnn feeds tokens to the model in slices of at most this length.
        self.CHUNK_LEN = 256

        # Sampling defaults for text generation.
        self.max_tokens_per_generation = 500
        self.temperature = 1
        self.top_p, self.top_k = 0.3, 0
        self.penalty_alpha_presence, self.penalty_alpha_frequency = 0, 1

        self.interface = ":"
        is_raven = self.tokenizer_len < 65536
        self.rwkv_type = RWKVType.Raven if is_raven else RWKVType.World
        self.user, self.bot = ("Bob", "Alice") if is_raven else ("User", "Assistant")
        self.END_OF_LINE = 187 if is_raven else 11

        # Token ids that run_rnn forbids from immediately repeating; each
        # character must encode to exactly one token.
        self.AVOID_REPEAT_TOKENS = []
        for punctuation in "，：？！":
            encoded = self.pipeline.encode(punctuation)
            assert len(encoded) == 1
            self.AVOID_REPEAT_TOKENS += encoded

        self.__preload()

    def adjust_occurrence(self, occurrence: Dict, token: int):
        """Decay every stored count by 0.996, then add one for ``token``."""
        for seen in occurrence:
            occurrence[seen] *= 0.996
        occurrence[token] = occurrence.get(token, 0) + 1

    def adjust_forward_logits(self, logits: List[float], occurrence: Dict, i: int):
        """Apply presence/frequency penalties to ``logits`` in place.

        On the first step (``i == 0``) the occurrence table is additionally
        seeded from ``self.model_tokens`` after the penalties were applied,
        using the same decay-then-increment rule as ``adjust_occurrence``.
        """
        presence = self.penalty_alpha_presence
        frequency = self.penalty_alpha_frequency
        for token_id, count in occurrence.items():
            logits[token_id] -= presence + count * frequency

        if i != 0:
            return

        for raw_token in self.model_tokens:
            token_id = int(raw_token)
            for seen in occurrence:
                occurrence[seen] *= 0.996
            occurrence[token_id] = occurrence.get(token_id, 0) + 1

    # Model only saw '\n\n' as [187, 187] before, but the tokenizer outputs [535] for it at the end
    def fix_tokens(self, tokens) -> List[int]:
        """Replace a trailing double-newline token with two single ones.

        World models are returned unchanged, as is any token list that is
        empty or does not end in ``END_OF_LINE_DOUBLE``.
        """
        needs_rewrite = (
            self.rwkv_type != RWKVType.World
            and len(tokens) > 0
            and tokens[-1] == END_OF_LINE_DOUBLE
        )
        if not needs_rewrite:
            return tokens
        return tokens[:-1] + [self.END_OF_LINE, self.END_OF_LINE]

    def run_rnn(
        self, _tokens: List[str], newline_adj: int = 0
    ) -> Tuple[List[float], int]:
        """Feed tokens to the model and return the logits for the next token.

        The tokens are appended to ``self.model_tokens`` and passed to
        ``self.model.forward`` in slices of at most ``CHUNK_LEN`` tokens,
        threading ``self.model_state`` through every call.

        Args:
            _tokens: Token ids (anything ``int()`` accepts); must not be empty.
            newline_adj: Value added to the end-of-line logit.

        Returns:
            ``(logits, number_of_tokens_fed)`` where ``logits`` is the output
            of the last forward call after the adjustments below.

        Raises:
            ValueError: if ``_tokens`` is empty. There would be no forward
                call and therefore no logits to return; this is raised before
                any state is modified.
        """
        tokens = [int(x) for x in _tokens]
        if not tokens:
            raise ValueError("run_rnn requires at least one token")
        token_len = len(tokens)
        self.model_tokens += tokens

        for start in range(0, token_len, self.CHUNK_LEN):
            out, self.model_state = self.model.forward(
                tokens[start : start + self.CHUNK_LEN], self.model_state
            )

        out[self.END_OF_LINE] += newline_adj  # adjust \n probability

        # Never let one of the AVOID_REPEAT tokens follow itself directly.
        last_token = self.model_tokens[-1]
        if last_token in self.AVOID_REPEAT_TOKENS:
            out[last_token] = -999999999
        return out, token_len

    def delta_postprocess(self, delta: str) -> str:
        """Return the decoded fragment unchanged; text needs no post-processing."""
        return delta

    def __preload(self):
        interface = self.interface
        user = self.user
        bot = self.bot
        preset_system = (
            f"""
The following is a coherent verbose detailed conversation between a girl named {bot} and her friend {user}. \
{bot} is very intelligent, creative and friendly. \
{bot} is unlikely to disagree with {user}, and {bot} doesn't like to ask {user} questions. \
{bot} likes to tell {user} a lot about herself and her opinions. \
{bot} usually gives {user} kind, helpful and informative advices.\n
"""
            if self.rwkv_type == RWKVType.Raven
            else (
                f"{user}{interface} hi\n\n{bot}{interface} Hi. "
                + "I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.\n\n"
            )
        )
        logits, _ = self.run_rnn(self.fix_tokens(self.pipeline.encode(preset_system)))
        try:
            state_cache.add_state(
                state_cache.AddStateBody(
                    prompt=preset_system,
                    tokens=self.model_tokens,
                    state=self.model_state,
                    logits=logits,
                )
            )
        except HTTPException:
            pass


class MusicRWKV(AbstractRWKV):
    def __init__(self, model, pipeline):
        """Configure a music model with its own sampling defaults."""
        super().__init__(model, pipeline)

        self.rwkv_type = RWKVType.Music

        # Sampling defaults for music generation.
        self.max_tokens_per_generation = 500
        self.temperature = 1
        self.top_p, self.top_k = 0.8, 8

    def adjust_occurrence(self, occurrence: Dict, token: int):
        """Decay every stored count by 0.997, then add weight for ``token``.

        Tokens 127 and above add 1; smaller token ids add 0.3.
        """
        for seen in occurrence:
            occurrence[seen] *= 0.997  # decay repetition penalty

        weight = 1 if (token >= 128 or token == 127) else 0.3
        occurrence[token] = weight + occurrence.get(token, 0)

    def adjust_forward_logits(self, logits: List[float], occurrence: Dict, i: int):
        """Apply the repetition penalty and fixed biases to ``logits`` in place."""
        presence_penalty = 0
        frequency_penalty = 0.5
        for token_id, count in occurrence.items():
            logits[token_id] -= presence_penalty + count * frequency_penalty

        # try not to be too short or too long
        logits[0] += (i - 2000) / 500
        # avoid "t125"
        logits[127] -= 1

    def fix_tokens(self, tokens) -> List[int]:
        """Return ``tokens`` as given; music prompts need no token rewriting."""
        return tokens

    def run_rnn(
        self, _tokens: List[str], newline_adj: int = 0
    ) -> Tuple[List[float], int]:
        """Feed all tokens to the model in a single forward call.

        The tokens are appended to ``self.model_tokens`` and
        ``self.model_state`` is replaced by the state the model returns.
        ``newline_adj`` is accepted for interface compatibility and not used.
        Returns ``(logits, number_of_tokens_fed)``.
        """
        token_ids = list(map(int, _tokens))
        self.model_tokens += token_ids
        logits, self.model_state = self.model.forward(token_ids, self.model_state)
        return logits, len(token_ids)

    def delta_postprocess(self, delta: str) -> str:
        """Prefix the decoded fragment with a single space."""
        return f" {delta}"


def get_tokenizer(tokenizer_len: int):
    """Pick the tokenizer that matches a vocabulary size.

    Sizes below 50277 map to the bundled MIDI tokenizer file, sizes below
    65536 to the bundled 20B tokenizer file (both under ``rwkv_pip/`` two
    directories above this file); anything larger returns the name
    ``"rwkv_vocab_v20230424"``.
    """
    backend_root = pathlib.Path(__file__).parent.parent.resolve()
    bundled_files = (
        (50277, "tokenizer-midi.json"),
        (65536, "20B_tokenizer.json"),
    )
    for upper_bound, file_name in bundled_files:
        if tokenizer_len < upper_bound:
            return f"{backend_root}/rwkv_pip/{file_name}"
    return "rwkv_vocab_v20230424"


def RWKV(model: str, strategy: str, tokenizer: Union[str, None]) -> AbstractRWKV:
    """Load a model file and wrap it in the matching ``AbstractRWKV`` subclass.

    The backend implementation is chosen from the global arguments
    (rwkv-beta, rwkv.cpp, webgpu, or the default). When ``tokenizer`` is
    falsy it is derived from the model's vocabulary size. The wrapper class
    is selected by the tokenizer's base file name, falling back to
    ``TextRWKV``; the wrapper's ``name`` is the model file name without its
    extension.
    """
    cli_args = global_var.get(global_var.Args)
    rwkv_beta = cli_args.rwkv_beta
    rwkv_cpp = getattr(cli_args, "rwkv.cpp")
    webgpu = cli_args.webgpu

    lowered_path = model.lower()
    if any(tag in lowered_path for tag in ("midi", "abc")):
        os.environ["RWKV_RESCALE_LAYER"] = "999"

    # dynamic import to make RWKV_CUDA_ON work
    if rwkv_beta:
        print("Using rwkv-beta")
        from rwkv_pip.beta.model import RWKV as Model
    elif rwkv_cpp:
        print("Using rwkv.cpp, strategy is ignored")
        from rwkv_pip.cpp.model import RWKV as Model
    elif webgpu:
        print("Using webgpu")
        from rwkv_pip.webgpu.model import RWKV as Model
    else:
        from rwkv_pip.model import RWKV as Model
    from rwkv_pip.utils import PIPELINE

    # Remember the display name before ``model`` is rebound to the loaded object.
    filename = os.path.splitext(os.path.basename(model))[0]
    model = Model(model, strategy)
    tokenizer = tokenizer or get_tokenizer(len(model.w["emb.weight"]))
    pipeline = PIPELINE(model, tokenizer)

    wrappers: Dict[str, Type[AbstractRWKV]] = {
        "20B_tokenizer": TextRWKV,
        "rwkv_vocab_v20230424": TextRWKV,
        "tokenizer-midi": MusicRWKV,
    }
    tokenizer_name = os.path.splitext(os.path.basename(tokenizer))[0]
    wrapper_cls = wrappers.get(tokenizer_name, TextRWKV)

    rwkv: AbstractRWKV = wrapper_cls(model, pipeline)
    rwkv.name = filename
    return rwkv


class ModelConfigBody(BaseModel):
    # Schema for the sampling options exchanged with set_rwkv_config and
    # get_rwkv_config. Every field defaults to None; set_rwkv_config leaves
    # the corresponding model attribute untouched when a field is None.
    # Documented with comments rather than a docstring so that nothing is
    # added to the model class itself.
    # NOTE(review): the annotations are non-optional although the defaults
    # are None - confirm whether an explicit null should be accepted.

    # Copied to model.max_tokens_per_generation.
    max_tokens: int = Field(default=None, gt=0, le=102400)
    # Copied to model.temperature; set_rwkv_config raises values below 0.1 to 0.1.
    temperature: float = Field(default=None, ge=0, le=2)
    # Copied to model.top_p.
    top_p: float = Field(default=None, ge=0, le=1)
    # Copied to model.penalty_alpha_presence.
    presence_penalty: float = Field(default=None, ge=-2, le=2)
    # Copied to model.penalty_alpha_frequency.
    frequency_penalty: float = Field(default=None, ge=-2, le=2)

    # Example payload attached to the generated JSON schema.
    model_config = {
        "json_schema_extra": {
            "example": {
                "max_tokens": 1000,
                "temperature": 1.2,
                "top_p": 0.5,
                "presence_penalty": 0.4,
                "frequency_penalty": 0.4,
            }
        }
    }


def set_rwkv_config(model: AbstractRWKV, body: ModelConfigBody):
    """Copy every option that is set on ``body`` onto ``model``.

    Fields that are ``None`` are skipped. The temperature is never set
    below 0.1.
    """
    if body.max_tokens is not None:
        model.max_tokens_per_generation = body.max_tokens

    if body.temperature is not None:
        # Clamp from below: smaller values are replaced by 0.1.
        model.temperature = max(body.temperature, 0.1)

    direct_copies = (
        ("top_p", "top_p"),
        ("presence_penalty", "penalty_alpha_presence"),
        ("frequency_penalty", "penalty_alpha_frequency"),
    )
    for field_name, attribute in direct_copies:
        value = getattr(body, field_name)
        if value is not None:
            setattr(model, attribute, value)


def get_rwkv_config(model: AbstractRWKV) -> ModelConfigBody:
    """Snapshot the model's current sampling options as a ``ModelConfigBody``."""
    current_options = {
        "max_tokens": model.max_tokens_per_generation,
        "temperature": model.temperature,
        "top_p": model.top_p,
        "presence_penalty": model.penalty_alpha_presence,
        "frequency_penalty": model.penalty_alpha_frequency,
    }
    return ModelConfigBody(**current_options)
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
+								from abc import ABC, abstractmethod
-												RWKVType

											
										
										
											2023-07-31 22:46:13 +08:00
+								from enum import Enum, auto
-												fixed torch version; CUDA acceleration utils

											
										
										
											2023-05-23 11:19:39 +08:00
+								import os
 								import pathlib
-												feat: use model state cache to achieve 5x - 50x faster preparation time for generation

											
										
										
											2023-05-28 23:52:38 +08:00
+								import copy
-												support for `stop` array

											
										
										
											2023-07-25 16:10:22 +08:00
+								import re
-												RWKVType now no longer relies on the file name

											
										
										
											2023-10-26 16:55:33 +08:00
+								from typing import Dict, Iterable, List, Tuple, Union, Type
-												log Generation Prompt

											
										
										
											2023-06-12 13:41:51 +08:00
+								from utils.log import quick_log
-												feat: use model state cache to achieve 5x - 50x faster preparation time for generation

											
										
										
											2023-05-28 23:52:38 +08:00
+								from fastapi import HTTPException
-												safe ModelConfigBody

											
										
										
											2023-05-30 23:13:27 +08:00
+								from pydantic import BaseModel, Field
-												feat: use model state cache to achieve 5x - 50x faster preparation time for generation

											
										
										
											2023-05-28 23:52:38 +08:00
+								from routes import state_cache
-												add rwkv-cuda-beta support (faster)

											
										
										
											2023-08-14 22:07:15 +08:00
+								import global_var
-												support for rwkv-4-world

											
										
										
											2023-05-28 12:53:14 +08:00
 								END_OF_TEXT = 0
-												fix_tokens

											
										
										
											2023-05-31 14:55:13 +08:00
+								END_OF_LINE_DOUBLE = 535
-												support for rwkv-4-world

											
										
										
											2023-05-28 12:53:14 +08:00
 								os.environ["TORCH_EXTENSIONS_DIR"] = f"{pathlib.Path(__file__).parent.parent.resolve()}"
-												RWKVType

											
										
										
											2023-07-31 22:46:13 +08:00
+								class RWKVType(Enum):
-												RWKVType now no longer relies on the file name

											
										
										
											2023-10-26 16:55:33 +08:00
+								    NoneType = auto()
-												RWKVType

											
										
										
											2023-07-31 22:46:13 +08:00
+								    Raven = auto()
 								    World = auto()
 								    Music = auto()
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
+								class AbstractRWKV(ABC):
-												RWKVType now no longer relies on the file name

											
										
										
											2023-10-26 16:55:33 +08:00
+								    def __init__(self, model, pipeline):
 								        self.name = "rwkv"
 								        self.model = model
 								        self.pipeline = pipeline
-												support for rwkv-4-world

											
										
										
											2023-05-28 12:53:14 +08:00
+								        self.model_state = None
 								        self.model_tokens = []
-												RWKVType now no longer relies on the file name

											
										
										
											2023-10-26 16:55:33 +08:00
+								        self.rwkv_type: RWKVType = RWKVType.NoneType
 								        self.tokenizer_len = len(model.w["emb.weight"])
-												support for rwkv-4-world

											
										
										
											2023-05-28 12:53:14 +08:00
 								        self.max_tokens_per_generation = 500
 								        self.temperature = 1
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
+								        self.top_p = 0.3
 								        self.top_k = 0
 								        self.penalty_alpha_presence = 0
 								        self.penalty_alpha_frequency = 1
-												support for rwkv-4-world

											
										
										
											2023-05-28 12:53:14 +08:00
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
+								    @abstractmethod
 								    def adjust_occurrence(self, occurrence: Dict, token: int):
 								        pass
-												support for rwkv-4-world

											
										
										
											2023-05-28 12:53:14 +08:00
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
+								    @abstractmethod
 								    def adjust_forward_logits(self, logits: List[float], occurrence: Dict, i: int):
 								        pass
-												feat: preload preset_system

											
										
										
											2023-05-29 00:08:13 +08:00
-												fix_tokens

											
										
										
											2023-05-31 14:55:13 +08:00
+								    # Model only saw '\n\n' as [187, 187] before, but the tokenizer outputs [535] for it at the end
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
+								    @abstractmethod
 								    def fix_tokens(self, tokens) -> List[int]:
 								        pass
-												fix_tokens

											
										
										
											2023-05-31 14:55:13 +08:00
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
+								    @abstractmethod
 								    def run_rnn(
 								        self, _tokens: List[str], newline_adj: int = 0
 								    ) -> Tuple[List[float], int]:
 								        pass
-												support for rwkv-4-world

											
										
										
											2023-05-28 12:53:14 +08:00
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
+								    @abstractmethod
 								    def delta_postprocess(self, delta: str) -> str:
 								        pass
-												support for rwkv-4-world

											
										
										
											2023-05-28 12:53:14 +08:00
-												add usage

											
										
										
											2023-06-20 15:55:52 +08:00
+								    def get_embedding(self, input: str, fast_mode: bool) -> Tuple[List[float], int]:
-												add WebGPU Python Mode (https://github.com/cryscan/web-rwkv-py)

											
										
										
											2023-12-14 18:37:07 +08:00
+								        import numpy as np
-												embeddings api compatible with openai api and langchain(sdk)

											
										
										
											2023-06-19 22:51:06 +08:00
+								        if fast_mode:
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
+								            embedding, token_len = self.__fast_embedding(
-												embeddings api compatible with openai api and langchain(sdk)

											
										
										
											2023-06-19 22:51:06 +08:00
+								                self.fix_tokens(self.pipeline.encode(input)), None
 								            )
 								        else:
 								            self.model_state = None
 								            self.model_tokens = []
-												add usage

											
										
										
											2023-06-20 15:55:52 +08:00
+								            _, token_len = self.run_rnn(self.fix_tokens(self.pipeline.encode(input)))
-												improve embeddings API results

											
										
										
											2023-07-25 20:30:43 +08:00
+								            embedding = self.model_state[-11].tolist()
-												embeddings api compatible with openai api and langchain(sdk)

											
										
										
											2023-06-19 22:51:06 +08:00
+								        embedding = (embedding / np.linalg.norm(embedding)).tolist()
-												add usage

											
										
										
											2023-06-20 15:55:52 +08:00
+								        return embedding, token_len
-												embeddings api compatible with openai api and langchain(sdk)

											
										
										
											2023-06-19 22:51:06 +08:00
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
+								    def __fast_embedding(self, tokens: List[str], state):
-												improve python backend startup speed

											
										
										
											2023-07-25 16:14:29 +08:00
+								        import torch
-												embeddings api compatible with openai api and langchain(sdk)

											
										
										
											2023-06-19 22:51:06 +08:00
+								        tokens = [int(x) for x in tokens]
-												add usage

											
										
										
											2023-06-20 15:55:52 +08:00
+								        token_len = len(tokens)
-												embeddings api compatible with openai api and langchain(sdk)

											
										
										
											2023-06-19 22:51:06 +08:00
+								        self = self.model
 								        with torch.no_grad():
 								            w = self.w
 								            args = self.args
 								            if state == None:
 								                state = [None] * args.n_layer * 5
 								                for i in range(
 								                    args.n_layer
 								                ):  # state: 0=att_xx 1=att_aa 2=att_bb 3=att_pp 4=ffn_xx
 								                    dd = self.strategy[i]
 								                    dev = dd.device
 								                    atype = dd.atype
 								                    state[i * 5 + 0] = torch.zeros(
 								                        args.n_embd, dtype=atype, requires_grad=False, device=dev
 								                    ).contiguous()
 								                    state[i * 5 + 1] = torch.zeros(
 								                        args.n_embd, dtype=torch.float, requires_grad=False, device=dev
 								                    ).contiguous()
 								                    state[i * 5 + 2] = torch.zeros(
 								                        args.n_embd, dtype=torch.float, requires_grad=False, device=dev
 								                    ).contiguous()
 								                    state[i * 5 + 3] = (
 								                        torch.zeros(
 								                            args.n_embd,
 								                            dtype=torch.float,
 								                            requires_grad=False,
 								                            device=dev,
 								                        ).contiguous()
 								                        - 1e30
 								                    )
 								                    state[i * 5 + 4] = torch.zeros(
 								                        args.n_embd, dtype=atype, requires_grad=False, device=dev
 								                    ).contiguous()
 								                    break
 								            seq_mode = len(tokens) > 1
 								            x = w["emb.weight"][tokens if seq_mode else tokens[0]]
 								            for i in range(args.n_layer):
 								                bbb = f"blocks.{i}."
 								                att = f"blocks.{i}.att."
 								                ffn = f"blocks.{i}.ffn."
 								                dd = self.strategy[i]
 								                dev = dd.device
 								                atype = dd.atype
 								                wtype = dd.wtype
 								                if seq_mode:
 								                    if "cuda" in str(dev) and os.environ["RWKV_CUDA_ON"] == "1":
 								                        ATT = (
 								                            self.cuda_att_seq
 								                            if wtype != torch.uint8
 								                            else self.cuda_att_seq_i8
 								                        )
 								                    else:
 								                        ATT = self.att_seq if wtype != torch.uint8 else self.att_seq_i8
 								                    FFN = self.ffn_seq if wtype != torch.uint8 else self.ffn_seq_i8
 								                else:
 								                    ATT = self.att_one if wtype != torch.uint8 else self.att_one_i8
 								                    FFN = self.ffn_one if wtype != torch.uint8 else self.ffn_one_i8
 								                x = x.to(dtype=atype, device=dev)
 								                kw = w[f"{att}key.weight"]
 								                vw = w[f"{att}value.weight"]
 								                rw = w[f"{att}receptance.weight"]
 								                ow = w[f"{att}output.weight"]
 								                if dd.stream:
 								                    kw = kw.to(device=dev, non_blocking=True)
 								                    vw = vw.to(device=dev, non_blocking=True)
 								                    rw = rw.to(device=dev, non_blocking=True)
 								                    ow = ow.to(device=dev, non_blocking=True)
 								                kmx = w[f"{att}key.weight_mx"] if wtype == torch.uint8 else x
 								                krx = w[f"{att}key.weight_rx"] if wtype == torch.uint8 else x
 								                kmy = w[f"{att}key.weight_my"] if wtype == torch.uint8 else x
 								                kry = w[f"{att}key.weight_ry"] if wtype == torch.uint8 else x
 								                vmx = w[f"{att}value.weight_mx"] if wtype == torch.uint8 else x
 								                vrx = w[f"{att}value.weight_rx"] if wtype == torch.uint8 else x
 								                vmy = w[f"{att}value.weight_my"] if wtype == torch.uint8 else x
 								                vry = w[f"{att}value.weight_ry"] if wtype == torch.uint8 else x
 								                rmx = w[f"{att}receptance.weight_mx"] if wtype == torch.uint8 else x
 								                rrx = w[f"{att}receptance.weight_rx"] if wtype == torch.uint8 else x
 								                rmy = w[f"{att}receptance.weight_my"] if wtype == torch.uint8 else x
 								                rry = w[f"{att}receptance.weight_ry"] if wtype == torch.uint8 else x
 								                omx = w[f"{att}output.weight_mx"] if wtype == torch.uint8 else x
 								                orx = w[f"{att}output.weight_rx"] if wtype == torch.uint8 else x
 								                omy = w[f"{att}output.weight_my"] if wtype == torch.uint8 else x
 								                ory = w[f"{att}output.weight_ry"] if wtype == torch.uint8 else x
 								                (
 								                    x,
 								                    state[i * 5 + 0],
 								                    state[i * 5 + 1],
 								                    state[i * 5 + 2],
 								                    state[i * 5 + 3],
 								                ) = ATT(
 								                    x,
 								                    state[i * 5 + 0],
 								                    state[i * 5 + 1],
 								                    state[i * 5 + 2],
 								                    state[i * 5 + 3],
 								                    w[f"{bbb}ln1.weight"],
 								                    w[f"{bbb}ln1.bias"],
 								                    w[f"{att}time_mix_k"],
 								                    w[f"{att}time_mix_v"],
 								                    w[f"{att}time_mix_r"],
 								                    w[f"{att}time_decay"],
 								                    w[f"{att}time_first"],
 								                    kw,
 								                    vw,
 								                    rw,
 								                    ow,
 								                    kmx,
 								                    krx,
 								                    kmy,
 								                    kry,
 								                    vmx,
 								                    vrx,
 								                    vmy,
 								                    vry,
 								                    rmx,
 								                    rrx,
 								                    rmy,
 								                    rry,
 								                    omx,
 								                    orx,
 								                    omy,
 								                    ory,
 								                )
-												add usage

											
										
										
											2023-06-20 15:55:52 +08:00
+								                return state[0].tolist(), token_len
-												embeddings api compatible with openai api and langchain(sdk)

											
										
										
											2023-06-19 22:51:06 +08:00
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
+								    def generate(
-												add rwkv-cuda-beta support (faster)

											
										
										
											2023-08-14 22:07:15 +08:00
+								        self, prompt: str, stop: Union[str, List[str], None] = None
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
+								    ) -> Iterable[Tuple[str, str, int, int]]:
-												add WebGPU Python Mode (https://github.com/cryscan/web-rwkv-py)

											
										
										
											2023-12-14 18:37:07 +08:00
+								        import numpy as np
-												log Generation Prompt

											
										
										
											2023-06-12 13:41:51 +08:00
+								        quick_log(None, None, "Generation Prompt:\n" + prompt)
-												feat: use model state cache to achieve 5x - 50x faster preparation time for generation

											
										
										
											2023-05-28 23:52:38 +08:00
+								        cache = None
 								        delta_prompt = prompt
 								        try:
 								            cache = state_cache.longest_prefix_state(
-												add logs for state cache and switch-model

											
										
										
											2023-06-09 20:46:19 +08:00
+								                state_cache.LongestPrefixStateBody(prompt=prompt), None
-												feat: use model state cache to achieve 5x - 50x faster preparation time for generation

											
										
										
											2023-05-28 23:52:38 +08:00
+								            )
 								        except HTTPException:
 								            pass
-												add WebGPU Python Mode (https://github.com/cryscan/web-rwkv-py)

											
										
										
											2023-12-14 18:37:07 +08:00
+								        if cache is None or cache["prompt"] == "" or cache["state"] is None:
-												feat: use model state cache to achieve 5x - 50x faster preparation time for generation

											
										
										
											2023-05-28 23:52:38 +08:00
+								            self.model_state = None
 								            self.model_tokens = []
 								        else:
 								            delta_prompt = prompt[len(cache["prompt"]) :]
-												improve state cache performance

											
										
										
											2023-12-28 22:15:31 +08:00
+								            self.model_state = cache["state"]
 								            self.model_tokens = cache["tokens"]
 								            logits = cache["logits"]
-												feat: use model state cache to achieve 5x - 50x faster preparation time for generation

											
										
										
											2023-05-28 23:52:38 +08:00
-												add usage

											
										
										
											2023-06-20 15:55:52 +08:00
+								        prompt_token_len = 0
-												feat: use model state cache to achieve 5x - 50x faster preparation time for generation

											
										
										
											2023-05-28 23:52:38 +08:00
+								        if delta_prompt != "":
-												add usage

											
										
										
											2023-06-20 15:55:52 +08:00
+								            logits, prompt_token_len = self.run_rnn(
 								                self.fix_tokens(self.pipeline.encode(delta_prompt))
 								            )
-												feat: use model state cache to achieve 5x - 50x faster preparation time for generation

											
										
										
											2023-05-28 23:52:38 +08:00
+								            try:
 								                state_cache.add_state(
 								                    state_cache.AddStateBody(
 								                        prompt=prompt,
 								                        tokens=self.model_tokens,
 								                        state=self.model_state,
 								                        logits=logits,
 								                    )
 								                )
 								            except HTTPException:
 								                pass
-												support for rwkv-4-world

											
										
										
											2023-05-28 12:53:14 +08:00
+								        begin = len(self.model_tokens)
 								        out_last = begin
 								        occurrence: Dict = {}
-												add usage

											
										
										
											2023-06-20 15:55:52 +08:00
+								        completion_token_len = 0
-												support for rwkv-4-world

											
										
										
											2023-05-28 12:53:14 +08:00
+								        response = ""
 								        for i in range(self.max_tokens_per_generation):
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
+								            self.adjust_forward_logits(logits, occurrence, i)
-												support for rwkv-4-world

											
										
										
											2023-05-28 12:53:14 +08:00
+								            token = self.pipeline.sample_logits(
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
+								                logits, temperature=self.temperature, top_p=self.top_p, top_k=self.top_k
-												support for rwkv-4-world

											
										
										
											2023-05-28 12:53:14 +08:00
+								            )
 								            if token == END_OF_TEXT:
-												add usage

											
										
										
											2023-06-20 15:55:52 +08:00
+								                yield response, "", prompt_token_len, completion_token_len
-												support for rwkv-4-world

											
										
										
											2023-05-28 12:53:14 +08:00
+								                break
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
 								            self.adjust_occurrence(occurrence, token)
-												support for rwkv-4-world

											
										
										
											2023-05-28 12:53:14 +08:00
-												add usage

											
										
										
											2023-06-20 15:55:52 +08:00
+								            logits, _ = self.run_rnn([token])
 								            completion_token_len = completion_token_len + 1
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
+								            delta: str = self.delta_postprocess(
 								                self.pipeline.decode(self.model_tokens[out_last:])
 								            )
-												support for rwkv-4-world

											
										
										
											2023-05-28 12:53:14 +08:00
+								            if "\ufffd" not in delta:  # avoid utf-8 display issues
 								                response += delta
 								                if stop is not None:
-												support for `stop` array

											
										
										
											2023-07-25 16:10:22 +08:00
+								                    if type(stop) == str:
 								                        if stop in response:
 								                            try:
 								                                state_cache.add_state(
 								                                    state_cache.AddStateBody(
 								                                        prompt=prompt + response,
 								                                        tokens=self.model_tokens,
 								                                        state=self.model_state,
 								                                        logits=logits,
 								                                    )
-												feat: use model state cache to achieve 5x - 50x faster preparation time for generation

											
										
										
											2023-05-28 23:52:38 +08:00
+								                                )
-												support for `stop` array

											
										
										
											2023-07-25 16:10:22 +08:00
+								                            except HTTPException:
 								                                pass
 								                            response = response.split(stop)[0]
 								                            yield response, "", prompt_token_len, completion_token_len
 								                            break
 								                    elif type(stop) == list:
 								                        stop_exist_regex = "|".join(stop)
 								                        matched = re.search(stop_exist_regex, response)
 								                        if matched:
 								                            try:
 								                                state_cache.add_state(
 								                                    state_cache.AddStateBody(
 								                                        prompt=prompt + response,
 								                                        tokens=self.model_tokens,
 								                                        state=self.model_state,
 								                                        logits=logits,
 								                                    )
 								                                )
 								                            except HTTPException:
 								                                pass
 								                            response = response.split(matched.group())[0]
 								                            yield response, "", prompt_token_len, completion_token_len
 								                            break
-												support for rwkv-4-world

											
										
										
											2023-05-28 12:53:14 +08:00
+								                out_last = begin + i + 1
-												feat: use model state cache to achieve 5x - 50x faster preparation time for generation

											
										
										
											2023-05-28 23:52:38 +08:00
+								                if i == self.max_tokens_per_generation - 1:
 								                    try:
 								                        state_cache.add_state(
 								                            state_cache.AddStateBody(
 								                                prompt=prompt + response,
 								                                tokens=self.model_tokens,
 								                                state=self.model_state,
 								                                logits=logits,
 								                            )
 								                        )
 								                    except HTTPException:
 								                        pass
-												add usage

											
										
										
											2023-06-20 15:55:52 +08:00
+								                yield response, delta, prompt_token_len, completion_token_len
-												preliminary usable features

											
										
										
											2023-05-17 11:39:00 +08:00
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
class TextRWKV(AbstractRWKV):
    """RWKV wrapper for text models (Raven and World vocabularies)."""

    def __init__(self, model, pipeline) -> None:
        super().__init__(model, pipeline)

        # Prompt tokens are fed to the model in slices of this many tokens.
        self.CHUNK_LEN = 256

        self.max_tokens_per_generation = 500
        self.temperature = 1
        self.top_p = 0.3
        self.top_k = 0
        self.penalty_alpha_presence = 0
        self.penalty_alpha_frequency = 1

        self.interface = ":"
        # The vocabulary size decides the model family and its chat defaults.
        if self.tokenizer_len >= 65536:
            self.rwkv_type = RWKVType.World
            self.user, self.bot = "User", "Assistant"
            self.END_OF_LINE = 11
        else:
            self.rwkv_type = RWKVType.Raven
            self.user, self.bot = "Bob", "Alice"
            self.END_OF_LINE = 187

        # Token ids that must not be sampled twice in a row (see run_rnn).
        self.AVOID_REPEAT_TOKENS = []
        for char in "，：？！":
            encoded = self.pipeline.encode(char)
            assert len(encoded) == 1
            self.AVOID_REPEAT_TOKENS += encoded

        self.__preload()

    def adjust_occurrence(self, occurrence: Dict, token: int):
        """Decay every recorded count by 0.996, then count ``token`` once more."""
        for seen in occurrence:
            occurrence[seen] *= 0.996
        occurrence[token] = occurrence.get(token, 0) + 1

    def adjust_forward_logits(self, logits: List[float], occurrence: Dict, i: int):
        """Apply presence/frequency penalties to ``logits`` in place.

        On the first step (``i == 0``) the occurrence table is additionally
        seeded from all tokens currently in ``self.model_tokens``; this happens
        after the penalties for this step have been applied.
        """
        for token_id, count in occurrence.items():
            logits[token_id] -= (
                self.penalty_alpha_presence + count * self.penalty_alpha_frequency
            )

        if i != 0:
            return

        for token_id in map(int, self.model_tokens):
            for seen in occurrence:
                occurrence[seen] *= 0.996
            occurrence[token_id] = occurrence.get(token_id, 0) + 1

    # Model only saw '\n\n' as [187, 187] before, but the tokenizer outputs [535] for it at the end
    def fix_tokens(self, tokens) -> List[int]:
        """Replace a trailing END_OF_LINE_DOUBLE token by two END_OF_LINE tokens.

        World models are returned unchanged.
        """
        if self.rwkv_type == RWKVType.World:
            return tokens
        ends_with_double_newline = (
            len(tokens) > 0 and tokens[-1] == END_OF_LINE_DOUBLE
        )
        if not ends_with_double_newline:
            return tokens
        return tokens[:-1] + [self.END_OF_LINE, self.END_OF_LINE]

    def run_rnn(
        self, _tokens: List[str], newline_adj: int = 0
    ) -> Tuple[List[float], int]:
        """Feed ``_tokens`` to the model and return ``(logits, token count)``.

        The tokens are appended to ``self.model_tokens`` and evaluated in
        slices of ``CHUNK_LEN``, updating ``self.model_state`` after each slice.
        """
        tokens = [int(x) for x in _tokens]
        token_len = len(tokens)
        self.model_tokens += tokens

        for start in range(0, token_len, self.CHUNK_LEN):
            out, self.model_state = self.model.forward(
                tokens[start : start + self.CHUNK_LEN], self.model_state
            )

        out[self.END_OF_LINE] += newline_adj  # adjust \n probability

        last_token = self.model_tokens[-1]
        if last_token in self.AVOID_REPEAT_TOKENS:
            # Effectively forbid sampling the same token again right away.
            out[last_token] = -999999999
        return out, token_len

    def delta_postprocess(self, delta: str) -> str:
        """Return ``delta`` unchanged."""
        return delta

    def __preload(self):
        """Run the default system preset through the model and cache the state."""
        interface = self.interface
        user = self.user
        bot = self.bot

        if self.rwkv_type == RWKVType.Raven:
            preset_system = f"""
The following is a coherent verbose detailed conversation between a girl named {bot} and her friend {user}. \
{bot} is very intelligent, creative and friendly. \
{bot} is unlikely to disagree with {user}, and {bot} doesn't like to ask {user} questions. \
{bot} likes to tell {user} a lot about herself and her opinions. \
{bot} usually gives {user} kind, helpful and informative advices.\n
"""
        else:
            preset_system = (
                f"{user}{interface} hi\n\n{bot}{interface} Hi. "
                + "I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.\n\n"
            )

        encoded_preset = self.fix_tokens(self.pipeline.encode(preset_system))
        logits, _ = self.run_rnn(encoded_preset)

        # Best-effort: a rejected cache entry is ignored.
        try:
            state_cache.add_state(
                state_cache.AddStateBody(
                    prompt=preset_system,
                    tokens=self.model_tokens,
                    state=self.model_state,
                    logits=logits,
                )
            )
        except HTTPException:
            pass
 								class MusicRWKV(AbstractRWKV):
-												RWKVType now no longer relies on the file name

											
										
										
											2023-10-26 16:55:33 +08:00
+								    def __init__(self, model, pipeline):
 								        super().__init__(model, pipeline)
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
 								        self.max_tokens_per_generation = 500
 								        self.temperature = 1
 								        self.top_p = 0.8
 								        self.top_k = 8
-												RWKVType

											
										
										
											2023-07-31 22:46:13 +08:00
+								        self.rwkv_type = RWKVType.Music
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
+								    def adjust_occurrence(self, occurrence: Dict, token: int):
 								        for n in occurrence:
 								            occurrence[n] *= 0.997  #### decay repetition penalty
 								        if token >= 128 or token == 127:
 								            occurrence[token] = 1 + (occurrence[token] if token in occurrence else 0)
 								        else:
 								            occurrence[token] = 0.3 + (occurrence[token] if token in occurrence else 0)
 								    def adjust_forward_logits(self, logits: List[float], occurrence: Dict, i: int):
 								        for n in occurrence:
 								            logits[n] -= 0 + occurrence[n] * 0.5
 								        logits[0] += (i - 2000) / 500  # try not to be too short or too long
 								        logits[127] -= 1  # avoid "t125"
 								    def fix_tokens(self, tokens) -> List[int]:
 								        return tokens
 								    def run_rnn(
 								        self, _tokens: List[str], newline_adj: int = 0
 								    ) -> Tuple[List[float], int]:
 								        tokens = [int(x) for x in _tokens]
 								        token_len = len(tokens)
 								        self.model_tokens += tokens
 								        out, self.model_state = self.model.forward(tokens, self.model_state)
 								        return out, token_len
 								    def delta_postprocess(self, delta: str) -> str:
 								        return " " + delta
-												RWKVType now no longer relies on the file name

											
										
										
											2023-10-26 16:55:33 +08:00
def get_tokenizer(tokenizer_len: int):
    """Pick the tokenizer that matches a vocabulary size.

    Returns the path of a JSON tokenizer inside the ``rwkv_pip`` directory
    (located two directory levels above this file) for sizes below 65536, and
    the name ``"rwkv_vocab_v20230424"`` otherwise.
    """
    if tokenizer_len >= 65536:
        return "rwkv_vocab_v20230424"

    base_dir = pathlib.Path(__file__).parent.parent.resolve()
    if tokenizer_len < 50277:
        json_name = "tokenizer-midi.json"
    else:
        json_name = "20B_tokenizer.json"
    return f"{base_dir}/rwkv_pip/{json_name}"
 								def RWKV(model: str, strategy: str, tokenizer: Union[str, None]) -> AbstractRWKV:
 								    rwkv_beta = global_var.get(global_var.Args).rwkv_beta
-												rwkv.cpp(ggml) support

											
										
										
											2023-12-12 20:29:55 +08:00
+								    rwkv_cpp = getattr(global_var.get(global_var.Args), "rwkv.cpp")
-												add WebGPU Python Mode (https://github.com/cryscan/web-rwkv-py)

											
										
										
											2023-12-14 18:37:07 +08:00
+								    webgpu = global_var.get(global_var.Args).webgpu
-												RWKVType now no longer relies on the file name

											
										
										
											2023-10-26 16:55:33 +08:00
-												RWKV_RESCALE_LAYER 999 for music model

											
										
										
											2023-12-04 17:51:21 +08:00
+								    if "midi" in model.lower() or "abc" in model.lower():
 								        os.environ["RWKV_RESCALE_LAYER"] = "999"
-												RWKVType now no longer relies on the file name

											
										
										
											2023-10-26 16:55:33 +08:00
+								    # dynamic import to make RWKV_CUDA_ON work
 								    if rwkv_beta:
-												rwkv.cpp(ggml) support

											
										
										
											2023-12-12 20:29:55 +08:00
+								        print("Using rwkv-beta")
-												RWKVType now no longer relies on the file name

											
										
										
											2023-10-26 16:55:33 +08:00
+								        from rwkv_pip.beta.model import (
 								            RWKV as Model,
 								        )
-												rwkv.cpp(ggml) support

											
										
										
											2023-12-12 20:29:55 +08:00
+								    elif rwkv_cpp:
 								        print("Using rwkv.cpp, strategy is ignored")
 								        from rwkv_pip.cpp.model import (
 								            RWKV as Model,
 								        )
-												add WebGPU Python Mode (https://github.com/cryscan/web-rwkv-py)

											
										
										
											2023-12-14 18:37:07 +08:00
+								    elif webgpu:
 								        print("Using webgpu")
 								        from rwkv_pip.webgpu.model import (
 								            RWKV as Model,
 								        )
-												RWKVType now no longer relies on the file name

											
										
										
											2023-10-26 16:55:33 +08:00
+								    else:
 								        from rwkv_pip.model import (
 								            RWKV as Model,
 								        )
 								    from rwkv_pip.utils import PIPELINE
 								    filename, _ = os.path.splitext(os.path.basename(model))
 								    model = Model(model, strategy)
 								    if not tokenizer:
 								        tokenizer = get_tokenizer(len(model.w["emb.weight"]))
 								    pipeline = PIPELINE(model, tokenizer)
 								    rwkv_map: dict[str, Type[AbstractRWKV]] = {
 								        "20B_tokenizer": TextRWKV,
 								        "rwkv_vocab_v20230424": TextRWKV,
 								        "tokenizer-midi": MusicRWKV,
 								    }
 								    tokenizer_name = os.path.splitext(os.path.basename(tokenizer))[0]
 								    rwkv: AbstractRWKV
 								    if tokenizer_name in rwkv_map:
 								        rwkv = rwkv_map[tokenizer_name](model, pipeline)
 								    else:
 								        rwkv = TextRWKV(model, pipeline)
 								    rwkv.name = filename
 								    return rwkv
-												preliminary usable features

											
										
										
											2023-05-17 11:39:00 +08:00
class ModelConfigBody(BaseModel):
    """Sampling parameters that can be overridden per request.

    Every field is optional: ``None`` (the default) means "leave the model's
    current value unchanged". The annotations are declared as ``Union[T, None]``
    so that an explicit ``null`` is treated the same as an omitted field.
    """

    # Maximum number of tokens to generate.
    max_tokens: Union[int, None] = Field(default=None, gt=0, le=102400)
    temperature: Union[float, None] = Field(default=None, ge=0, le=2)
    top_p: Union[float, None] = Field(default=None, ge=0, le=1)
    presence_penalty: Union[float, None] = Field(default=None, ge=-2, le=2)
    frequency_penalty: Union[float, None] = Field(default=None, ge=-2, le=2)

    # Example payload added to the generated JSON schema.
    model_config = {
        "json_schema_extra": {
            "example": {
                "max_tokens": 1000,
                "temperature": 1.2,
                "top_p": 0.5,
                "presence_penalty": 0.4,
                "frequency_penalty": 0.4,
            }
        }
    }
-												improve api docs

											
										
										
											2023-06-15 21:52:22 +08:00
-												preliminary usable features

											
										
										
											2023-05-17 11:39:00 +08:00
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
def set_rwkv_config(model: AbstractRWKV, body: ModelConfigBody):
    """Copy every non-``None`` setting of ``body`` onto ``model``.

    A temperature below 0.1 is stored as 0.1; all other values are copied
    as given. Fields that are ``None`` leave the model untouched.
    """
    temperature = body.temperature
    if temperature is not None and temperature < 0.1:
        temperature = 0.1

    # model attribute -> requested value (None means "keep current value")
    requested = {
        "max_tokens_per_generation": body.max_tokens,
        "temperature": temperature,
        "top_p": body.top_p,
        "penalty_alpha_presence": body.presence_penalty,
        "penalty_alpha_frequency": body.frequency_penalty,
    }
    for attribute, value in requested.items():
        if value is not None:
            setattr(model, attribute, value)
-												add support for MIDI RWKV

											
										
										
											2023-07-25 16:09:31 +08:00
def get_rwkv_config(model: AbstractRWKV) -> ModelConfigBody:
    """Build a ``ModelConfigBody`` from the current settings on ``model``.

    Args:
        model: Object whose ``max_tokens_per_generation``, ``temperature``,
            ``top_p``, ``penalty_alpha_presence`` and
            ``penalty_alpha_frequency`` attributes are read.

    Returns:
        A new ``ModelConfigBody`` populated from those attributes.
    """
    return ModelConfigBody(
        max_tokens=model.max_tokens_per_generation,
        temperature=model.temperature,
        top_p=model.top_p,
        presence_penalty=model.penalty_alpha_presence,
        frequency_penalty=model.penalty_alpha_frequency,
    )