RWKV-Runner/backend-python/routes/state_cache.py

from typing import Any, Dict, List, Union
from utils.log import quick_log
from fastapi import APIRouter, HTTPException, Request, Response, status
from pydantic import BaseModel
import gc
import copy

# Router that collects every "State Cache" endpoint declared in this module.
router = APIRouter()

# Prefix trie keyed by prompt text. It stays None until init() or
# enable_state_cache() manages to create one, and the endpoints below answer
# "trie not loaded" while it is None.
trie = None
# Maps the id returned by trie.insert() to the cached entry for that prompt:
# a dict with the keys "tokens", "state", "logits" and "device".
dtrie: Dict = {}
# Once len(trie) reaches this value, add_state evicts one entry per insertion.
max_trie_len = 3000
loop_start_id = 1  # to prevent preloaded prompts from being deleted
# Id of the next entry add_state will evict; it wraps back to loop_start_id
# once it reaches max_trie_len.
loop_del_trie_id = loop_start_id


def init():
    """Create the module-level prompt trie if ``cyac`` is available.

    On success the global ``trie`` is bound to a fresh, empty ``cyac.Trie``.
    When ``cyac`` is not installed the function prints "cyac not found" and
    leaves ``trie`` untouched, so the endpoints keep reporting that the trie
    is not loaded.
    """
    global trie
    try:
        # Imported here so that a missing optional dependency is reported
        # instead of breaking the import of this module.
        import cyac

        # NOTE: restoring a previously saved trie from "state_cache.trie"
        # is not implemented; the cache always starts out empty.
        trie = cyac.Trie()
    except ModuleNotFoundError:
        print("cyac not found")


@router.post("/disable-state-cache", tags=["State Cache"])
def disable_state_cache():
    """Turn the state cache off and drop everything it holds.

    The global trie is unset (so the other endpoints answer
    "trie not loaded") and the entry table is replaced by an empty dict.
    A garbage-collection pass is requested right after the references
    are dropped.

    Returns:
        The literal string "success".
    """
    global trie, dtrie

    trie, dtrie = None, {}
    gc.collect()

    return "success"


@router.post("/enable-state-cache", tags=["State Cache"])
def enable_state_cache():
    """Turn the state cache on, starting from an empty trie and entry table.

    Returns:
        The literal string "success".

    Raises:
        HTTPException: 400 with detail "cyac not found" when the ``cyac``
            package cannot be imported.
    """
    global trie, dtrie
    try:
        import cyac

        # Both globals are rebound together; any previous cache content is
        # dropped and a garbage-collection pass is requested.
        trie, dtrie = cyac.Trie(), {}
        gc.collect()

        return "success"
    except ModuleNotFoundError:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "cyac not found")


# Payload accepted by add_state: one prompt together with the data to cache
# for it.
class AddStateBody(BaseModel):
    # Text inserted into the trie; it is the key later matched by prefix.
    prompt: str
    # Deep-copied into the cache entry under "tokens".
    tokens: List[Union[str, int]]
    # add_state indexes and iterates this value and calls .device / .cpu()
    # on its elements — presumably a list of torch tensors; confirm with the
    # caller.
    state: Any
    # Deep-copied into the cache entry under "logits" as-is.
    logits: Any


@router.post("/add-state", tags=["State Cache"])
def add_state(body: AddStateBody):
    """Store the state, tokens and logits that belong to ``body.prompt``.

    The prompt is inserted into the global trie and the id returned by the
    trie becomes the key of the new entry in ``dtrie``. The entry holds a
    CPU-resident copy of the state plus the device the state originally
    lived on, so that a later lookup can move it back.

    When the trie has reached ``max_trie_len`` keys, the entry whose id is
    ``loop_del_trie_id`` is evicted and that counter advances, wrapping back
    to ``loop_start_id``.

    Args:
        body: Prompt, tokens, state and logits to cache.

    Returns:
        The literal string "success".

    Raises:
        HTTPException: 400 "trie not loaded" when the cache is disabled, or
            400 "insert failed, bad prompt." followed by the underlying
            error when anything goes wrong while storing the entry.
    """
    global trie, dtrie, loop_del_trie_id
    if trie is None:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "trie not loaded")

    # Function-scope import: torch is only loaded once a state is cached.
    import torch

    try:
        # Build the complete entry BEFORE touching the trie. If the state is
        # malformed (empty, not tensors, ...) we fail here and the trie never
        # receives a key that has no matching dtrie entry.
        device: torch.device = body.state[0].device
        if device != torch.device("cpu"):
            # .cpu() already yields new tensors for non-CPU inputs.
            cached_state = [tensor.cpu() for tensor in body.state]
        else:
            # Already on the CPU: copy so the cache does not alias the
            # caller's tensors.
            cached_state = copy.deepcopy(body.state)
        entry = {
            "tokens": copy.deepcopy(body.tokens),
            "state": cached_state,
            "logits": copy.deepcopy(body.logits),
            "device": device,
        }

        trie_id: int = trie.insert(body.prompt)
        dtrie[trie_id] = entry

        if len(trie) >= max_trie_len:
            # Evict one entry per insertion once the cache is full, walking
            # the ids in a loop that never goes below loop_start_id.
            del_prompt = trie[loop_del_trie_id]
            trie.remove(del_prompt)
            dtrie[loop_del_trie_id] = None
            loop_del_trie_id = loop_del_trie_id + 1
            if loop_del_trie_id >= max_trie_len:
                loop_del_trie_id = loop_start_id

        # Size is computed from the local entry rather than dtrie[trie_id],
        # which the eviction above may just have set to None.
        quick_log(
            None,
            None,
            f"New Trie Id: {trie_id}\nTrie Len: {len(trie)}\nTrie Buff Size: {trie.buff_size()}\nDtrie Buff Size Of Id: {_get_a_dtrie_buff_size(entry)}",
        )
        return "success"
    except Exception as e:
        raise HTTPException(
            status.HTTP_400_BAD_REQUEST, f"insert failed, bad prompt.\n{e}"
        ) from e


@router.post("/reset-state", tags=["State Cache"])
def reset_state():
    """Empty the state cache while keeping it enabled.

    Returns:
        The literal string "success".

    Raises:
        HTTPException: 400 "trie not loaded" when the cache is disabled;
            in that case nothing is created.
    """
    global trie, dtrie
    if trie is None:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "trie not loaded")

    import cyac

    # Swap in a brand-new trie and entry table, then request a
    # garbage-collection pass for the dropped ones.
    trie, dtrie = cyac.Trie(), {}
    gc.collect()

    return "success"


# Payload accepted by longest_prefix_state.
class LongestPrefixStateBody(BaseModel):
    # Text whose cached prefixes are looked up in the trie.
    prompt: str


def _get_a_dtrie_buff_size(dtrie_v):
    # print(sys.getsizeof(dtrie_v["tokens"][0]))  # str
    # print(sys.getsizeof(dtrie_v["tokens"][0]) * len(dtrie_v["tokens"]))
    # print(dtrie_v["state"][0][0].element_size())
    # print(dtrie_v["state"][0].nelement())
    # print(len(dtrie_v["state"]))
    # print(
    #     len(dtrie_v["state"])
    #     * dtrie_v["state"][0].nelement()
    #     * dtrie_v["state"][0][0].element_size()
    # )
    # print(dtrie_v["logits"][0].element_size())
    # print(dtrie_v["logits"].nelement())
    # print(dtrie_v["logits"][0].element_size() * dtrie_v["logits"].nelement())
    return 54 * len(dtrie_v["tokens"]) + 491520 + 262144 + 28  # TODO


@router.post("/longest-prefix-state", tags=["State Cache"])
def longest_prefix_state(body: LongestPrefixStateBody, request: Request):
    """Return the cached entry for the last prefix match of ``body.prompt``.

    The trie is asked for every stored prompt that is a prefix of
    ``body.prompt`` and the last match it yields is used (presumably the
    longest one — confirm against cyac's ``Trie.prefix`` ordering).

    Args:
        body: Carries the prompt to look up.
        request: Incoming request, forwarded to the logger on a hit.

    Returns:
        On a hit, a dict with the matched ``prompt``, its ``tokens``, the
        ``state`` (moved back to the device it was cached from when that is
        not the CPU), the ``logits`` and the device type string. On a miss,
        the same keys with empty/None values.

    Raises:
        HTTPException: 400 "trie not loaded" when the cache is disabled.
    """
    if trie is None:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "trie not loaded")

    # Function-scope import: torch is only loaded once a lookup happens.
    import torch

    match_id = -1
    try:
        # Exhaust the iterator; only the last yielded id is kept.
        for match_id, _match_len in trie.prefix(body.prompt):
            pass
    except Exception:
        # Best effort: a prompt the trie cannot process is treated as a
        # miss (or as whatever match was found before the failure).
        pass

    # A trie key without a live entry (missing or evicted to None) is
    # reported as a miss instead of failing with KeyError/TypeError.
    entry = dtrie.get(match_id) if match_id != -1 else None
    if entry is None:
        return {
            "prompt": "",
            "tokens": [],
            "state": None,
            "logits": None,
            "device": None,
        }

    device: torch.device = entry["device"]
    prompt: str = trie[match_id]

    quick_log(request, body, "Hit:\n" + prompt)
    if device != torch.device("cpu"):
        # The state is cached on the CPU; move it back to where it came from.
        state = [tensor.to(device) for tensor in entry["state"]]
    else:
        state = entry["state"]
    return {
        "prompt": prompt,
        "tokens": entry["tokens"],
        "state": state,
        "logits": entry["logits"],
        "device": device.type,
    }


@router.post("/save-state", tags=["State Cache"])
def save_state():
    """Placeholder endpoint for persisting the state cache.

    Nothing is written; saving the trie to "state_cache.trie" is not
    implemented.

    Returns:
        The literal string "not implemented".

    Raises:
        HTTPException: 400 "trie not loaded" when the cache is disabled.
    """
    if trie is None:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "trie not loaded")

    return "not implemented"
chore (AddStateBody class) 2023-08-13 21:27:29 +08:00			`from typing import Any, Dict, List, Union`
add logs for state cache and switch-model 2023-06-09 20:46:19 +08:00			`from utils.log import quick_log`
			`from fastapi import APIRouter, HTTPException, Request, Response, status`
feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00			`from pydantic import BaseModel`
			`import gc`
			`import copy`

			`router = APIRouter()`

			`trie = None`
			`dtrie: Dict = {}`
max_trie_len 2023-06-12 15:22:17 +08:00			`max_trie_len = 3000`
			`loop_start_id = 1 # to prevent preloaded prompts from being deleted`
			`loop_del_trie_id = loop_start_id`
feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00

			`def init():`
			`global trie`
			`try:`
			`import cyac`
avoid misoperations of state_cache 2023-06-12 12:32:50 +08:00
			`# import mmap`
			`# import os`
			`#`
			`# if os.path.exists("state_cache.trie"):`
			`# with open("state_cache.trie", "r") as bf:`
			`# buff_object = mmap.mmap(bf.fileno(), 0, access=mmap.ACCESS_READ)`
			`# trie = cyac.Trie.from_buff(buff_object, copy=False)`
			`# else:`
			`trie = cyac.Trie()`
feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00			`except ModuleNotFoundError:`
			`print("cyac not found")`


chore 2023-07-26 22:24:26 +08:00			`@router.post("/disable-state-cache", tags=["State Cache"])`
fix cross-device state cache exception 2023-07-11 11:20:12 +08:00			`def disable_state_cache():`
			`global trie, dtrie`

			`trie = None`
			`dtrie = {}`
			`gc.collect()`

			`return "success"`


chore 2023-07-26 22:24:26 +08:00			`@router.post("/enable-state-cache", tags=["State Cache"])`
fix cross-device state cache exception 2023-07-11 11:20:12 +08:00			`def enable_state_cache():`
			`global trie, dtrie`
			`try:`
			`import cyac`

			`trie = cyac.Trie()`
			`dtrie = {}`
			`gc.collect()`

			`return "success"`
			`except ModuleNotFoundError:`
			`raise HTTPException(status.HTTP_400_BAD_REQUEST, "cyac not found")`


feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00			`class AddStateBody(BaseModel):`
			`prompt: str`
chore (AddStateBody class) 2023-08-13 21:27:29 +08:00			`tokens: List[Union[str, int]]`
feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00			`state: Any`
			`logits: Any`


chore 2023-07-26 22:24:26 +08:00			`@router.post("/add-state", tags=["State Cache"])`
feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00			`def add_state(body: AddStateBody):`
max_trie_len 2023-06-12 15:22:17 +08:00			`global trie, dtrie, loop_del_trie_id`
feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00			`if trie is None:`
			`raise HTTPException(status.HTTP_400_BAD_REQUEST, "trie not loaded")`

improve python backend startup speed 2023-07-25 16:14:29 +08:00			`import torch`

fix the state cache crash caused by bad prompts 2023-06-15 22:37:00 +08:00			`try:`
type 2023-06-19 22:32:02 +08:00			`id: int = trie.insert(body.prompt)`
			`device: torch.device = body.state[0].device`
fix the state cache crash caused by bad prompts 2023-06-15 22:37:00 +08:00			`dtrie[id] = {`
			`"tokens": copy.deepcopy(body.tokens),`
			`"state": [tensor.cpu() for tensor in body.state]`
			`if device != torch.device("cpu")`
			`else copy.deepcopy(body.state),`
			`"logits": copy.deepcopy(body.logits),`
			`"device": device,`
			`}`
max_trie_len 2023-06-12 15:22:17 +08:00
fix the state cache crash caused by bad prompts 2023-06-15 22:37:00 +08:00			`if len(trie) >= max_trie_len:`
			`del_prompt = trie[loop_del_trie_id]`
			`trie.remove(del_prompt)`
			`dtrie[loop_del_trie_id] = None`
			`loop_del_trie_id = loop_del_trie_id + 1`
			`if loop_del_trie_id >= max_trie_len:`
			`loop_del_trie_id = loop_start_id`

			`quick_log(`
			`None,`
			`None,`
			`f"New Trie Id: {id}\nTrie Len: {len(trie)}\nTrie Buff Size: {trie.buff_size()}\nDtrie Buff Size Of Id: {_get_a_dtrie_buff_size(dtrie[id])}",`
			`)`
			`return "success"`
			`except Exception as e:`
			`raise HTTPException(`
			`status.HTTP_400_BAD_REQUEST, f"insert failed, bad prompt.\n{e}"`
			`)`
feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00

chore 2023-07-26 22:24:26 +08:00			`@router.post("/reset-state", tags=["State Cache"])`
feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00			`def reset_state():`
avoid misoperations of state_cache 2023-06-12 12:32:50 +08:00			`global trie, dtrie`
feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00			`if trie is None:`
			`raise HTTPException(status.HTTP_400_BAD_REQUEST, "trie not loaded")`

fix cross-device state cache exception 2023-07-11 11:20:12 +08:00			`import cyac`

feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00			`trie = cyac.Trie()`
avoid misoperations of state_cache 2023-06-12 12:32:50 +08:00			`dtrie = {}`
feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00			`gc.collect()`

			`return "success"`


			`class LongestPrefixStateBody(BaseModel):`
			`prompt: str`


max_trie_len 2023-06-12 15:22:17 +08:00			`def _get_a_dtrie_buff_size(dtrie_v):`
			`# print(sys.getsizeof(dtrie_v["tokens"][0])) # str`
			`# print(sys.getsizeof(dtrie_v["tokens"][0]) * len(dtrie_v["tokens"]))`
			`# print(dtrie_v["state"][0][0].element_size())`
			`# print(dtrie_v["state"][0].nelement())`
			`# print(len(dtrie_v["state"]))`
			`# print(`
			`# len(dtrie_v["state"])`
			`# * dtrie_v["state"][0].nelement()`
			`# * dtrie_v["state"][0][0].element_size()`
			`# )`
			`# print(dtrie_v["logits"][0].element_size())`
			`# print(dtrie_v["logits"].nelement())`
			`# print(dtrie_v["logits"][0].element_size() * dtrie_v["logits"].nelement())`
type 2023-06-19 22:32:02 +08:00			`return 54 * len(dtrie_v["tokens"]) + 491520 + 262144 + 28 # TODO`
max_trie_len 2023-06-12 15:22:17 +08:00

chore 2023-07-26 22:24:26 +08:00			`@router.post("/longest-prefix-state", tags=["State Cache"])`
add logs for state cache and switch-model 2023-06-09 20:46:19 +08:00			`def longest_prefix_state(body: LongestPrefixStateBody, request: Request):`
feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00			`global trie`
			`if trie is None:`
			`raise HTTPException(status.HTTP_400_BAD_REQUEST, "trie not loaded")`

improve python backend startup speed 2023-07-25 16:14:29 +08:00			`import torch`

feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00			`id = -1`
fix the state cache crash caused by bad prompts 2023-06-15 22:37:00 +08:00			`try:`
			`for id, len in trie.prefix(body.prompt):`
			`pass`
			`except:`
feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00			`pass`
			`if id != -1:`
			`v = dtrie[id]`
type 2023-06-19 22:32:02 +08:00			`device: torch.device = v["device"]`
			`prompt: str = trie[id]`

log Generation Prompt 2023-06-12 13:41:51 +08:00			`quick_log(request, body, "Hit:\n" + prompt)`
feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00			`return {`
add logs for state cache and switch-model 2023-06-09 20:46:19 +08:00			`"prompt": prompt,`
feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00			`"tokens": v["tokens"],`
move state cache to memory (todo: state cache db) 2023-06-02 21:33:57 +08:00			`"state": [tensor.to(device) for tensor in v["state"]]`
			`if device != torch.device("cpu")`
			`else v["state"],`
feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00			`"logits": v["logits"],`
type 2023-06-19 22:32:02 +08:00			`"device": device.type,`
feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00			`}`
			`else:`
avoid misoperations of state_cache 2023-06-12 12:32:50 +08:00			`return {`
			`"prompt": "",`
			`"tokens": [],`
			`"state": None,`
			`"logits": None,`
			`"device": None,`
			`}`
feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00

chore 2023-07-26 22:24:26 +08:00			`@router.post("/save-state", tags=["State Cache"])`
feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00			`def save_state():`
			`global trie`
			`if trie is None:`
			`raise HTTPException(status.HTTP_400_BAD_REQUEST, "trie not loaded")`

avoid misoperations of state_cache 2023-06-12 12:32:50 +08:00			`# trie.save("state_cache.trie")`
feat: use model state cache to achieve 5x - 50x faster preparation time for generation 2023-05-28 23:52:38 +08:00
avoid misoperations of state_cache 2023-06-12 12:32:50 +08:00			`return "not implemented"`