# RWKV-Runner/backend-python/routes/completion.py

import asyncio
import json
from threading import Lock
from typing import List, Union
from enum import Enum
import base64

from fastapi import APIRouter, Request, status, HTTPException
from sse_starlette.sse import EventSourceResponse
from pydantic import BaseModel, Field
import numpy as np
import tiktoken
from utils.rwkv import *
from utils.log import quick_log
import global_var

# Router on which the completion, chat-completion and embeddings endpoints
# of this module are registered.
router = APIRouter()


class Role(Enum):
    # Author of a chat message. The values are the strings accepted in the
    # "role" field of each entry of a chat request's "messages" list.
    User = "user"
    Assistant = "assistant"
    System = "system"


class Message(BaseModel):
    # One entry of the conversation sent to the chat-completions endpoint.

    # Who wrote the message (user, assistant or system).
    role: Role
    # Message text; validation rejects an empty string.
    content: str = Field(min_length=1)


class ChatCompletionBody(ModelConfigBody):
    # Request body of the chat-completions endpoint. Sampling parameters are
    # inherited from ModelConfigBody (declared outside this file).

    # Conversation so far; the handler answers 400 when it is None or empty.
    messages: Union[List[Message], None]
    # Model identifier sent by the client; not referenced directly in this file.
    model: str = "rwkv"
    # When True the answer is delivered as a server-sent event stream.
    stream: bool = False
    # Stop sequence(s). The handler additionally appends "\n\n<user name>"
    # and "\n\n<assistant name>" before generating.
    stop: Union[str, List[str], None] = [
        "\n\nUser",
        "\n\nQuestion",
        "\n\nQ",
        "\n\nHuman",
        "\n\nBob",
    ]
    # Optional overrides for the speaker names taken from the loaded model.
    user_name: Union[str, None] = None
    assistant_name: Union[str, None] = None

    class Config:
        # Example payload attached to the generated schema / API docs.
        schema_extra = {
            "example": {
                "messages": [{"role": Role.User.value, "content": "hello"}],
                "model": "rwkv",
                "stream": False,
                "stop": None,
                "user_name": None,
                "assistant_name": None,
                "max_tokens": 1000,
                "temperature": 1.2,
                "top_p": 0.5,
                "presence_penalty": 0.4,
                "frequency_penalty": 0.4,
            }
        }


class CompletionBody(ModelConfigBody):
    # Request body of the plain text-completions endpoint. Sampling
    # parameters are inherited from ModelConfigBody (declared outside this file).

    # Text to continue. A list is accepted, but the handler currently only
    # uses its first element; None or empty is answered with 400.
    prompt: Union[str, List[str], None]
    # Model identifier sent by the client; not referenced directly in this file.
    model: str = "rwkv"
    # When True the answer is delivered as a server-sent event stream.
    stream: bool = False
    # Optional stop sequence(s), forwarded unchanged to the generation call.
    stop: Union[str, List[str], None] = None

    class Config:
        # Example payload attached to the generated schema / API docs.
        schema_extra = {
            "example": {
                "prompt": "The following is an epic science fiction masterpiece that is immortalized, "
                + "with delicate descriptions and grand depictions of interstellar civilization wars.\nChapter 1.\n",
                "model": "rwkv",
                "stream": False,
                "stop": None,
                "max_tokens": 100,
                "temperature": 1.2,
                "top_p": 0.5,
                "presence_penalty": 0.4,
                "frequency_penalty": 0.4,
            }
        }


# Lock taken by eval_rwkv and embeddings around their use of the model, so
# that only one request at a time runs generation or embedding.
completion_lock = Lock()

# Number of requests currently waiting for or using the model. In this file
# it is only reported in log messages.
requests_num = 0


def _stream_chunk(model_name: str, chat_mode: bool, delta: str, finished: bool) -> str:
    """Serialize one streaming chunk as a JSON string.

    While generating (``finished`` is False) the chunk carries ``delta`` and a
    null ``finish_reason``; the closing chunk (``finished`` is True) carries an
    empty payload and ``finish_reason`` "stop". ``chat_mode`` selects between
    the chat ("delta") and the text-completion ("text") choice layout.
    """
    finish_reason = "stop" if finished else None
    if chat_mode:
        choice = {
            "delta": {} if finished else {"content": delta},
            "index": 0,
            "finish_reason": finish_reason,
        }
    else:
        choice = {
            "text": "" if finished else delta,
            "index": 0,
            "finish_reason": finish_reason,
        }
    return json.dumps(
        {
            "object": "chat.completion.chunk" if chat_mode else "text_completion",
            # "response": response,
            "model": model_name,
            "choices": [choice],
        }
    )


async def eval_rwkv(
    model: AbstractRWKV,
    request: Request,
    body: ModelConfigBody,
    prompt: str,
    stream: bool,
    stop: Union[str, List[str], None],
    chat_mode: bool,
):
    """Run one generation on ``model`` and yield the response payload(s).

    The coroutine first polls until ``completion_lock`` is free, then holds it
    for the whole generation so requests are served one at a time.

    Args:
        model: Loaded model; ``model.generate(prompt, stop=stop)`` is iterated
            and ``model.name`` is echoed in every payload.
        request: Incoming request, used for logging and disconnect detection.
        body: Per-request sampling configuration, applied on top of the
            globally stored model configuration.
        prompt: Fully assembled prompt text.
        stream: When True, one JSON string is yielded per generated delta,
            followed by a closing chunk and the literal ``"[DONE]"``. When
            False, a single dict with the full text and token usage is yielded.
        stop: Stop sequence(s) forwarded to ``model.generate``.
        chat_mode: Selects chat-completion versus text-completion payloads.

    Yields:
        JSON strings (streaming) or one dict (non-streaming). Nothing is
        yielded after the client is found to be disconnected.
    """
    global requests_num
    requests_num = requests_num + 1
    # True while this request is still included in requests_num; the finally
    # clause uses it so the counter is released exactly once even when the
    # generation raises or the generator is closed mid-stream.
    counted = True
    try:
        quick_log(request, None, "Start Waiting. RequestsNum: " + str(requests_num))
        while completion_lock.locked():
            if await request.is_disconnected():
                requests_num = requests_num - 1
                counted = False
                print(f"{request.client} Stop Waiting (Lock)")
                quick_log(
                    request,
                    None,
                    "Stop Waiting (Lock). RequestsNum: " + str(requests_num),
                )
                return
            await asyncio.sleep(0.1)

        with completion_lock:
            if await request.is_disconnected():
                requests_num = requests_num - 1
                counted = False
                print(f"{request.client} Stop Waiting (Lock)")
                quick_log(
                    request,
                    None,
                    "Stop Waiting (Lock). RequestsNum: " + str(requests_num),
                )
                return
            # Global defaults first, then the per-request overrides.
            set_rwkv_config(model, global_var.get(global_var.Model_Config))
            set_rwkv_config(model, body)

            response, prompt_tokens, completion_tokens = "", 0, 0
            for response, delta, prompt_tokens, completion_tokens in model.generate(
                prompt,
                stop=stop,
            ):
                if await request.is_disconnected():
                    break
                if stream:
                    yield _stream_chunk(model.name, chat_mode, delta, False)
            # torch_gc()
            requests_num = requests_num - 1
            counted = False
            if await request.is_disconnected():
                print(f"{request.client} Stop Waiting")
                quick_log(
                    request,
                    body,
                    response + "\nStop Waiting. RequestsNum: " + str(requests_num),
                )
                return
            quick_log(
                request,
                body,
                response + "\nFinished. RequestsNum: " + str(requests_num),
            )

        # The lock is released before the final payload is yielded: a caller
        # that takes only the first item would otherwise leave this generator
        # suspended while still holding completion_lock.
        if stream:
            yield _stream_chunk(model.name, chat_mode, "", True)
            yield "[DONE]"
        else:
            yield {
                "object": "chat.completion" if chat_mode else "text_completion",
                # "response": response,
                "model": model.name,
                "usage": {
                    "prompt_tokens": prompt_tokens,
                    "completion_tokens": completion_tokens,
                    "total_tokens": prompt_tokens + completion_tokens,
                },
                "choices": [
                    {
                        "message": {
                            "role": Role.Assistant.value,
                            "content": response,
                        },
                        "index": 0,
                        "finish_reason": "stop",
                    }
                    if chat_mode
                    else {
                        "text": response,
                        "index": 0,
                        "finish_reason": "stop",
                    }
                ],
            }
    finally:
        if counted:
            requests_num = requests_num - 1


@router.post("/v1/chat/completions", tags=["Completions"])
@router.post("/chat/completions", tags=["Completions"])
async def chat_completions(body: ChatCompletionBody, request: Request):
    """Answer a chat-completion request.

    The message list is flattened into a single prompt: an introduction
    (built-in, or derived from a leading system message), then every message
    prefixed with its speaker name, then the assistant prefix for the model to
    continue from. Generation is delegated to ``eval_rwkv``.

    Args:
        body: Validated request payload.
        request: Incoming request, forwarded for logging/disconnect checks.

    Returns:
        An ``EventSourceResponse`` when ``body.stream`` is true; otherwise the
        completion dict, or ``None`` when generation produced nothing (for
        example because the client disconnected).

    Raises:
        HTTPException: 400 when no model is loaded or ``messages`` is
            missing/empty.
    """
    model: TextRWKV = global_var.get(global_var.Model)
    if model is None:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "model not loaded")

    if body.messages is None or body.messages == []:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "messages not found")

    # Only a system message in first position replaces the introduction;
    # later system messages are inserted verbatim in the loop below.
    basic_system: str = ""
    if body.messages[0].role == Role.System:
        basic_system = body.messages[0].content

    interface = model.interface
    user = model.user if body.user_name is None else body.user_name
    bot = model.bot if body.assistant_name is None else body.assistant_name

    is_raven = model.rwkv_type == RWKVType.Raven

    completion_text: str = ""
    if basic_system == "":
        completion_text = (
            f"""
The following is a coherent verbose detailed conversation between a girl named {bot} and her friend {user}. \
{bot} is very intelligent, creative and friendly. \
{bot} is unlikely to disagree with {user}, and {bot} doesn't like to ask {user} questions. \
{bot} likes to tell {user} a lot about herself and her opinions. \
{bot} usually gives {user} kind, helpful and informative advices.\n
"""
            if is_raven
            else (
                f"{user}{interface} hi\n\n{bot}{interface} Hi. "
                + "I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.\n\n"
            )
        )
    else:
        # Collapse the system prompt onto one line ...
        system_text = (
            basic_system.replace("\r\n", "\n")
            .replace("\r", "\n")
            .replace("\n\n", "\n")
            .replace("\n", " ")
            .strip()
        )
        # ... and rewrite second-person wording so it reads as a statement
        # about (Raven) or by (other models) the assistant. Longer patterns
        # come first: replacing "You" before "Your" would corrupt the
        # possessive (e.g. "Your" -> "Ir") and make its rule unreachable.
        if is_raven:
            rewrites = (
                ("You are", f"{bot} is"),
                ("you are", f"{bot} is"),
                ("You're", f"{bot} is"),
                ("you're", f"{bot} is"),
                ("Your", f"{bot}'s"),
                ("your", f"{bot}'s"),
                ("You", f"{bot}"),
                ("you", f"{bot}"),
                ("你", f"{bot}"),
            )
            intro = f"The following is a coherent verbose detailed conversation between a girl named {bot} and her friend {user}. "
        else:
            rewrites = (
                ("You are", "I am"),
                ("you are", "I am"),
                ("You're", "I'm"),
                ("you're", "I'm"),
                ("Your", "My"),
                ("your", "my"),
                ("You", "I"),
                ("you", "I"),
                ("你", "我"),
            )
            intro = f"{user}{interface} hi\n\n{bot}{interface} Hi. "
        for old, new in rewrites:
            system_text = system_text.replace(old, new)
        completion_text = intro + system_text + "\n\n"

    for message in body.messages[(0 if basic_system == "" else 1) :]:
        append_message: str = ""
        if message.role == Role.User:
            append_message = f"{user}{interface} " + message.content
        elif message.role == Role.Assistant:
            append_message = f"{bot}{interface} " + message.content
        elif message.role == Role.System:
            append_message = message.content
        # Blank lines separate turns in the prompt, so they are squeezed out
        # of the message text itself.
        completion_text += (
            append_message.replace("\r\n", "\n")
            .replace("\r", "\n")
            .replace("\n\n", "\n")
            .strip()
            + "\n\n"
        )
    completion_text += f"{bot}{interface}"

    # Always stop when the model starts a new turn for either speaker.
    # ``stop`` may legitimately be None (explicit null in the request).
    if body.stop is None:
        body.stop = []
    elif isinstance(body.stop, str):
        body.stop = [body.stop]
    body.stop.append(f"\n\n{user}")
    body.stop.append(f"\n\n{bot}")

    if body.stream:
        return EventSourceResponse(
            eval_rwkv(
                model, request, body, completion_text, body.stream, body.stop, True
            )
        )

    generator = eval_rwkv(
        model, request, body, completion_text, body.stream, body.stop, True
    )
    try:
        return await generator.__anext__()
    except StopAsyncIteration:
        return None
    finally:
        # Close explicitly so the generator's cleanup runs now instead of
        # whenever the abandoned generator happens to be finalized.
        await generator.aclose()


@router.post("/v1/completions", tags=["Completions"])
@router.post("/completions", tags=["Completions"])
async def completions(body: CompletionBody, request: Request):
    """Answer a plain text-completion request.

    Args:
        body: Validated request payload; when ``prompt`` is a list only its
            first element is used.
        request: Incoming request, forwarded for logging/disconnect checks.

    Returns:
        An ``EventSourceResponse`` when ``body.stream`` is true; otherwise the
        completion dict, or ``None`` when generation produced nothing (for
        example because the client disconnected).

    Raises:
        HTTPException: 400 when no model is loaded or the prompt is
            missing/empty.
    """
    model: AbstractRWKV = global_var.get(global_var.Model)
    if model is None:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "model not loaded")

    if body.prompt is None or body.prompt == "" or body.prompt == []:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "prompt not found")

    if isinstance(body.prompt, list):
        body.prompt = body.prompt[0]  # TODO: support multiple prompts

    if body.stream:
        return EventSourceResponse(
            eval_rwkv(model, request, body, body.prompt, body.stream, body.stop, False)
        )

    generator = eval_rwkv(
        model, request, body, body.prompt, body.stream, body.stop, False
    )
    try:
        return await generator.__anext__()
    except StopAsyncIteration:
        return None
    finally:
        # Close explicitly so the generator's cleanup runs now instead of
        # whenever the abandoned generator happens to be finalized.
        await generator.aclose()


class EmbeddingsBody(BaseModel):
    # Request body of the embeddings endpoints.

    # Text, list of texts, or list of token-id lists to embed; None or empty
    # is answered with 400 by the handler.
    input: Union[str, List[str], List[List[int]], None]
    # Model identifier sent by the client; not referenced directly in this file.
    model: str = "rwkv"
    # "base64" makes the handler return each embedding as a base64 string;
    # any other value (including None) returns the raw embedding. Declared
    # nullable explicitly, matching the None default and the example below.
    encoding_format: Union[str, None] = None
    # Forwarded unchanged as the second argument of model.get_embedding.
    fast_mode: bool = False

    class Config:
        # Example payload attached to the generated schema / API docs.
        schema_extra = {
            "example": {
                "input": "a big apple",
                "model": "rwkv",
                "encoding_format": None,
                "fast_mode": False,
            }
        }


def embedding_base64(embedding: List[float]) -> str:
    """Return ``embedding`` as base64 text of its float32 byte representation."""
    as_float32 = np.array(embedding).astype(np.float32)
    encoded = base64.b64encode(as_float32)
    return encoded.decode("utf-8")


@router.post("/v1/embeddings", tags=["Embeddings"])
@router.post("/embeddings", tags=["Embeddings"])
@router.post("/v1/engines/text-embedding-ada-002/embeddings", tags=["Embeddings"])
@router.post("/engines/text-embedding-ada-002/embeddings", tags=["Embeddings"])
async def embeddings(body: EmbeddingsBody, request: Request):
    """Compute embeddings for the text(s) in ``body.input``.

    The handler polls until ``completion_lock`` is free and holds it while the
    model is used. Token-id inputs are decoded back to text with the
    "text-embedding-ada-002" tiktoken encoding before being embedded.

    Args:
        body: Validated request payload.
        request: Incoming request, used for logging and disconnect detection.

    Returns:
        A dict with one "embedding" entry per processed input plus token
        usage, or ``None`` when the client disconnected.

    Raises:
        HTTPException: 400 when no model is loaded or the input is
            missing/empty.
    """
    global requests_num

    model: AbstractRWKV = global_var.get(global_var.Model)
    if model is None:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "model not loaded")

    if body.input is None or body.input == "" or body.input == [] or body.input == [[]]:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "input not found")

    requests_num = requests_num + 1
    # True while this request is still included in requests_num; the finally
    # clause uses it so the counter is released exactly once even when the
    # tokenizer or the model raises.
    counted = True
    try:
        quick_log(request, None, "Start Waiting. RequestsNum: " + str(requests_num))
        while completion_lock.locked():
            if await request.is_disconnected():
                requests_num = requests_num - 1
                counted = False
                print(f"{request.client} Stop Waiting (Lock)")
                quick_log(
                    request,
                    None,
                    "Stop Waiting (Lock). RequestsNum: " + str(requests_num),
                )
                return
            await asyncio.sleep(0.1)

        with completion_lock:
            if await request.is_disconnected():
                requests_num = requests_num - 1
                counted = False
                print(f"{request.client} Stop Waiting (Lock)")
                quick_log(
                    request,
                    None,
                    "Stop Waiting (Lock). RequestsNum: " + str(requests_num),
                )
                return

            base64_format = body.encoding_format == "base64"

            # Normalize the three accepted input shapes to a list of items
            # plus an optional token-ids -> text decoder.
            decode = None
            if isinstance(body.input, list):
                items = body.input
                if isinstance(body.input[0], list):
                    decode = tiktoken.model.encoding_for_model(
                        "text-embedding-ada-002"
                    ).decode
            else:
                items = [body.input]

            vectors = []
            prompt_tokens = 0
            for item in items:
                if await request.is_disconnected():
                    break
                text = item if decode is None else decode(item)
                embedding, token_len = model.get_embedding(text, body.fast_mode)
                prompt_tokens = prompt_tokens + token_len
                if base64_format:
                    embedding = embedding_base64(embedding)
                vectors.append(embedding)

            requests_num = requests_num - 1
            counted = False
            if await request.is_disconnected():
                print(f"{request.client} Stop Waiting")
                quick_log(
                    request,
                    None,
                    "Stop Waiting. RequestsNum: " + str(requests_num),
                )
                return
            quick_log(
                request,
                None,
                "Finished. RequestsNum: " + str(requests_num),
            )

            ret_data = [
                {
                    "object": "embedding",
                    "index": i,
                    "embedding": embedding,
                }
                for i, embedding in enumerate(vectors)
            ]

            return {
                "object": "list",
                "data": ret_data,
                "model": model.name,
                "usage": {
                    "prompt_tokens": prompt_tokens,
                    "total_tokens": prompt_tokens,
                },
            }
    finally:
        if counted:
            requests_num = requests_num - 1
preliminary usable features 2023-05-17 11:39:00 +08:00			`import asyncio`
backend api 2023-05-07 17:27:54 +08:00			`import json`
preliminary usable features 2023-05-17 11:39:00 +08:00			`from threading import Lock`
fix input with array type (#96, #107) 2023-07-17 12:59:45 +08:00			`from typing import List, Union`
allow multiple systems 2023-08-04 22:27:55 +08:00			`from enum import Enum`
embeddings api compatible with openai api and langchain(sdk) 2023-06-19 22:51:06 +08:00			`import base64`
backend api 2023-05-07 17:27:54 +08:00
			`from fastapi import APIRouter, Request, status, HTTPException`
			`from sse_starlette.sse import EventSourceResponse`
allow multiple systems 2023-08-04 22:27:55 +08:00			`from pydantic import BaseModel, Field`
embeddings api compatible with openai api and langchain(sdk) 2023-06-19 22:51:06 +08:00			`import numpy as np`
			`import tiktoken`
backend api 2023-05-07 17:27:54 +08:00			`from utils.rwkv import *`
add logs 2023-06-03 17:12:59 +08:00			`from utils.log import quick_log`
backend api 2023-05-07 17:27:54 +08:00			`import global_var`

			`router = APIRouter()`


allow multiple systems 2023-08-04 22:27:55 +08:00			`class Role(Enum):`
			`User = "user"`
			`Assistant = "assistant"`
			`System = "system"`


backend api 2023-05-07 17:27:54 +08:00			`class Message(BaseModel):`
allow multiple systems 2023-08-04 22:27:55 +08:00			`role: Role`
			`content: str = Field(min_length=1)`
backend api 2023-05-07 17:27:54 +08:00

add compatible /v1/completions API 2023-05-22 11:18:37 +08:00			`class ChatCompletionBody(ModelConfigBody):`
allow completions input to be null 2023-08-04 22:22:59 +08:00			`messages: Union[List[Message], None]`
update 2023-05-17 11:47:45 +08:00			`model: str = "rwkv"`
			`stream: bool = False`
allow completions input to be null 2023-08-04 22:22:59 +08:00			`stop: Union[str, List[str], None] = [`
improve default ChatCompletion `stop` 2023-07-29 19:19:38 +08:00			`"\n\nUser",`
			`"\n\nQuestion",`
			`"\n\nQ",`
			`"\n\nHuman",`
			`"\n\nBob",`
			`]`
allow completions input to be null 2023-08-04 22:22:59 +08:00			`user_name: Union[str, None] = None`
			`assistant_name: Union[str, None] = None`
preliminary usable features 2023-05-17 11:39:00 +08:00
improve api docs 2023-06-15 21:52:22 +08:00			`class Config:`
			`schema_extra = {`
			`"example": {`
allow multiple systems 2023-08-04 22:27:55 +08:00			`"messages": [{"role": Role.User.value, "content": "hello"}],`
improve api docs 2023-06-15 21:52:22 +08:00			`"model": "rwkv",`
			`"stream": False,`
			`"stop": None,`
allow custom user_name and assistant_name (`/chat/completions` API) 2023-07-31 22:48:54 +08:00			`"user_name": None,`
			`"assistant_name": None,`
improve api docs 2023-06-15 21:52:22 +08:00			`"max_tokens": 1000,`
			`"temperature": 1.2,`
			`"top_p": 0.5,`
			`"presence_penalty": 0.4,`
			`"frequency_penalty": 0.4,`
			`}`
			`}`

preliminary usable features 2023-05-17 11:39:00 +08:00
refactor completions api 2023-06-18 20:16:52 +08:00			`class CompletionBody(ModelConfigBody):`
allow completions input to be null 2023-08-04 22:22:59 +08:00			`prompt: Union[str, List[str], None]`
refactor completions api 2023-06-18 20:16:52 +08:00			`model: str = "rwkv"`
			`stream: bool = False`
allow completions input to be null 2023-08-04 22:22:59 +08:00			`stop: Union[str, List[str], None] = None`
refactor completions api 2023-06-18 20:16:52 +08:00
			`class Config:`
			`schema_extra = {`
			`"example": {`
			`"prompt": "The following is an epic science fiction masterpiece that is immortalized, "`
			`+ "with delicate descriptions and grand depictions of interstellar civilization wars.\nChapter 1.\n",`
			`"model": "rwkv",`
			`"stream": False,`
			`"stop": None,`
			`"max_tokens": 100,`
			`"temperature": 1.2,`
			`"top_p": 0.5,`
			`"presence_penalty": 0.4,`
			`"frequency_penalty": 0.4,`
			`}`
			`}`


preliminary usable features 2023-05-17 11:39:00 +08:00			`completion_lock = Lock()`
backend api 2023-05-07 17:27:54 +08:00
add logs 2023-06-03 17:12:59 +08:00			`requests_num = 0`

backend api 2023-05-07 17:27:54 +08:00
refactor completions api 2023-06-18 20:16:52 +08:00			`async def eval_rwkv(`
add support for MIDI RWKV 2023-07-25 16:09:31 +08:00			`model: AbstractRWKV,`
refactor completions api 2023-06-18 20:16:52 +08:00			`request: Request,`
			`body: ModelConfigBody,`
			`prompt: str,`
			`stream: bool,`
allow completions input to be null 2023-08-04 22:22:59 +08:00			`stop: Union[str, List[str], None],`
refactor completions api 2023-06-18 20:16:52 +08:00			`chat_mode: bool,`
			`):`
			`global requests_num`
			`requests_num = requests_num + 1`
			`quick_log(request, None, "Start Waiting. RequestsNum: " + str(requests_num))`
			`while completion_lock.locked():`
			`if await request.is_disconnected():`
			`requests_num = requests_num - 1`
			`print(f"{request.client} Stop Waiting (Lock)")`
			`quick_log(`
			`request,`
			`None,`
			`"Stop Waiting (Lock). RequestsNum: " + str(requests_num),`
			`)`
			`return`
			`await asyncio.sleep(0.1)`
			`else:`
lora finetune (need to be refactored) 2023-07-03 17:41:47 +08:00			`with completion_lock:`
			`if await request.is_disconnected():`
			`requests_num = requests_num - 1`
			`print(f"{request.client} Stop Waiting (Lock)")`
			`quick_log(`
			`request,`
			`None,`
			`"Stop Waiting (Lock). RequestsNum: " + str(requests_num),`
			`)`
			`return`
			`set_rwkv_config(model, global_var.get(global_var.Model_Config))`
			`set_rwkv_config(model, body)`

			`response, prompt_tokens, completion_tokens = "", 0, 0`
			`for response, delta, prompt_tokens, completion_tokens in model.generate(`
			`prompt,`
			`stop=stop,`
			`):`
			`if await request.is_disconnected():`
			`break`
			`if stream:`
			`yield json.dumps(`
			`{`
			`"object": "chat.completion.chunk"`
			`if chat_mode`
			`else "text_completion",`
remove `response` field of completions api 2023-07-29 19:20:43 +08:00			`# "response": response,`
lora finetune (need to be refactored) 2023-07-03 17:41:47 +08:00			`"model": model.name,`
			`"choices": [`
			`{`
			`"delta": {"content": delta},`
			`"index": 0,`
			`"finish_reason": None,`
			`}`
			`if chat_mode`
			`else {`
			`"text": delta,`
			`"index": 0,`
			`"finish_reason": None,`
			`}`
			`],`
			`}`
			`)`
			`# torch_gc()`
refactor completions api 2023-06-18 20:16:52 +08:00			`requests_num = requests_num - 1`
lora finetune (need to be refactored) 2023-07-03 17:41:47 +08:00			`if await request.is_disconnected():`
			`print(f"{request.client} Stop Waiting")`
			`quick_log(`
			`request,`
			`body,`
			`response + "\nStop Waiting. RequestsNum: " + str(requests_num),`
			`)`
			`return`
refactor completions api 2023-06-18 20:16:52 +08:00			`quick_log(`
			`request,`
lora finetune (need to be refactored) 2023-07-03 17:41:47 +08:00			`body,`
			`response + "\nFinished. RequestsNum: " + str(requests_num),`
refactor completions api 2023-06-18 20:16:52 +08:00			`)`
			`if stream:`
			`yield json.dumps(`
			`{`
embeddings api compatible with openai api and langchain(sdk) 2023-06-19 22:51:06 +08:00			`"object": "chat.completion.chunk"`
			`if chat_mode`
			`else "text_completion",`
remove `response` field of completions api 2023-07-29 19:20:43 +08:00			`# "response": response,`
exact model name 2023-06-19 22:30:49 +08:00			`"model": model.name,`
refactor completions api 2023-06-18 20:16:52 +08:00			`"choices": [`
			`{`
lora finetune (need to be refactored) 2023-07-03 17:41:47 +08:00			`"delta": {},`
refactor completions api 2023-06-18 20:16:52 +08:00			`"index": 0,`
lora finetune (need to be refactored) 2023-07-03 17:41:47 +08:00			`"finish_reason": "stop",`
refactor completions api 2023-06-18 20:16:52 +08:00			`}`
			`if chat_mode`
			`else {`
lora finetune (need to be refactored) 2023-07-03 17:41:47 +08:00			`"text": "",`
refactor completions api 2023-06-18 20:16:52 +08:00			`"index": 0,`
lora finetune (need to be refactored) 2023-07-03 17:41:47 +08:00			`"finish_reason": "stop",`
refactor completions api 2023-06-18 20:16:52 +08:00			`}`
			`],`
			`}`
			`)`
lora finetune (need to be refactored) 2023-07-03 17:41:47 +08:00			`yield "[DONE]"`
			`else:`
			`yield {`
			`"object": "chat.completion" if chat_mode else "text_completion",`
remove `response` field of completions api 2023-07-29 19:20:43 +08:00			`# "response": response,`
exact model name 2023-06-19 22:30:49 +08:00			`"model": model.name,`
lora finetune (need to be refactored) 2023-07-03 17:41:47 +08:00			`"usage": {`
			`"prompt_tokens": prompt_tokens,`
			`"completion_tokens": completion_tokens,`
			`"total_tokens": prompt_tokens + completion_tokens,`
			`},`
refactor completions api 2023-06-18 20:16:52 +08:00			`"choices": [`
			`{`
lora finetune (need to be refactored) 2023-07-03 17:41:47 +08:00			`"message": {`
allow multiple systems 2023-08-04 22:27:55 +08:00			`"role": Role.Assistant.value,`
lora finetune (need to be refactored) 2023-07-03 17:41:47 +08:00			`"content": response,`
			`},`
refactor completions api 2023-06-18 20:16:52 +08:00			`"index": 0,`
			`"finish_reason": "stop",`
			`}`
			`if chat_mode`
			`else {`
lora finetune (need to be refactored) 2023-07-03 17:41:47 +08:00			`"text": response,`
refactor completions api 2023-06-18 20:16:52 +08:00			`"index": 0,`
			`"finish_reason": "stop",`
			`}`
			`],`
			`}`


chore 2023-07-26 22:24:26 +08:00			`@router.post("/v1/chat/completions", tags=["Completions"])`
			`@router.post("/chat/completions", tags=["Completions"])`
add compatible /v1/completions API 2023-05-22 11:18:37 +08:00			`async def chat_completions(body: ChatCompletionBody, request: Request):`
add support for MIDI RWKV 2023-07-25 16:09:31 +08:00			`model: TextRWKV = global_var.get(global_var.Model)`
preliminary usable features 2023-05-17 11:39:00 +08:00			`if model is None:`
update 2023-05-07 22:48:52 +08:00			`raise HTTPException(status.HTTP_400_BAD_REQUEST, "model not loaded")`
backend api 2023-05-07 17:27:54 +08:00
allow multiple systems 2023-08-04 22:27:55 +08:00			`if body.messages is None or body.messages == []:`
			`raise HTTPException(status.HTTP_400_BAD_REQUEST, "messages not found")`

			`basic_system: str = ""`
			`if body.messages[0].role == Role.System:`
			`basic_system = body.messages[0].content`
backend api 2023-05-07 17:27:54 +08:00
support for rwkv-4-world 2023-05-28 12:53:14 +08:00			`interface = model.interface`
allow custom user_name and assistant_name (`/chat/completions` API) 2023-07-31 22:48:54 +08:00			`user = model.user if body.user_name is None else body.user_name`
			`bot = model.bot if body.assistant_name is None else body.assistant_name`

			`is_raven = model.rwkv_type == RWKVType.Raven`
support for rwkv-4-world 2023-05-28 12:53:14 +08:00
allow multiple systems 2023-08-04 22:27:55 +08:00			`completion_text: str = ""`
			`if basic_system == "":`
			`completion_text = (`
			`f"""`
add `role: "system"` support 2023-05-24 14:01:22 +08:00			`The following is a coherent verbose detailed conversation between a girl named {bot} and her friend {user}. \`
			`{bot} is very intelligent, creative and friendly. \`
			`{bot} is unlikely to disagree with {user}, and {bot} doesn't like to ask {user} questions. \`
			`{bot} likes to tell {user} a lot about herself and her opinions. \`
			`{bot} usually gives {user} kind, helpful and informative advices.\n`
			`"""`
allow multiple systems 2023-08-04 22:27:55 +08:00			`if is_raven`
			`else (`
			`f"{user}{interface} hi\n\n{bot}{interface} Hi. "`
			`+ "I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.\n\n"`
			`)`
			`)`
			`elif basic_system != "":`
			`completion_text = (`
			`(`
add `role: "system"` support 2023-05-24 14:01:22 +08:00			`f"The following is a coherent verbose detailed conversation between a girl named {bot} and her friend {user}. "`
allow custom user_name and assistant_name (`/chat/completions` API) 2023-07-31 22:48:54 +08:00			`if is_raven`
improve system for rwkv-4-world 2023-05-31 12:46:06 +08:00			`else f"{user}{interface} hi\n\n{bot}{interface} Hi. "`
fix completion_text 2023-05-21 23:25:58 +08:00			`)`
allow multiple systems 2023-08-04 22:27:55 +08:00			`+ basic_system.replace("\r\n", "\n")`
			`.replace("\r", "\n")`
			`.replace("\n\n", "\n")`
			`.replace("\n", " ")`
			`.strip()`
			`.replace("You are", f"{bot} is" if is_raven else "I am")`
			`.replace("you are", f"{bot} is" if is_raven else "I am")`
			`.replace("You're", f"{bot} is" if is_raven else "I'm")`
			`.replace("you're", f"{bot} is" if is_raven else "I'm")`
			`.replace("You", f"{bot}" if is_raven else "I")`
			`.replace("you", f"{bot}" if is_raven else "I")`
			`.replace("Your", f"{bot}'s" if is_raven else "My")`
			`.replace("your", f"{bot}'s" if is_raven else "my")`
			`.replace("你", f"{bot}" if is_raven else "我")`
			`+ "\n\n"`
			`)`

			`for message in body.messages[(0 if basic_system == "" else 1) :]:`
			`append_message: str = ""`
			`if message.role == Role.User:`
			`append_message = f"{user}{interface} " + message.content`
			`elif message.role == Role.Assistant:`
			`append_message = f"{bot}{interface} " + message.content`
			`elif message.role == Role.System:`
			`append_message = message.content`
			`completion_text += (`
			`append_message.replace("\r\n", "\n")`
			`.replace("\r", "\n")`
			`.replace("\n\n", "\n")`
			`.strip()`
			`+ "\n\n"`
			`)`
add `role: "system"` support 2023-05-24 14:01:22 +08:00			`completion_text += f"{bot}{interface}"`
backend api 2023-05-07 17:27:54 +08:00
allow custom user_name and assistant_name (`/chat/completions` API) 2023-07-31 22:48:54 +08:00			`if type(body.stop) == str:`
			`body.stop = [body.stop, f"\n\n{user}", f"\n\n{bot}"]`
			`else:`
			`body.stop.append(f"\n\n{user}")`
			`body.stop.append(f"\n\n{bot}")`

preliminary usable features 2023-05-17 11:39:00 +08:00			`if body.stream:`
refactor completions api 2023-06-18 20:16:52 +08:00			`return EventSourceResponse(`
improve default ChatCompletion `stop` 2023-07-29 19:19:38 +08:00			`eval_rwkv(`
			`model, request, body, completion_text, body.stream, body.stop, True`
			`)`
refactor completions api 2023-06-18 20:16:52 +08:00			`)`
preliminary usable features 2023-05-17 11:39:00 +08:00			`else:`
add logs 2023-06-03 17:12:59 +08:00			`try:`
refactor completions api 2023-06-18 20:16:52 +08:00			`return await eval_rwkv(`
improve default ChatCompletion `stop` 2023-07-29 19:19:38 +08:00			`model, request, body, completion_text, body.stream, body.stop, True`
refactor completions api 2023-06-18 20:16:52 +08:00			`).__anext__()`
add logs 2023-06-03 17:12:59 +08:00			`except StopAsyncIteration:`
			`return None`
add compatible /v1/completions API 2023-05-22 11:18:37 +08:00

@router.post("/v1/completions", tags=["Completions"])
@router.post("/completions", tags=["Completions"])
async def completions(body: CompletionBody, request: Request):
    """Handle a text-completion request.

    Args:
        body: Parsed request body. ``prompt`` may be a string or a list of
            strings; only the first list entry is used for now.
        request: The incoming request, forwarded to ``eval_rwkv``.

    Returns:
        An ``EventSourceResponse`` wrapping the ``eval_rwkv`` async generator
        when ``body.stream`` is true. Otherwise the first item produced by
        that generator, or ``None`` if it finishes without producing one.

    Raises:
        HTTPException: 400 if no model is loaded, or if the prompt is missing
            or empty.
    """
    model: AbstractRWKV = global_var.get(global_var.Model)
    if model is None:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "model not loaded")

    # Reduce a list prompt to its first entry *before* the emptiness check so
    # that [""] is rejected the same way "" is.
    if isinstance(body.prompt, list) and body.prompt:
        body.prompt = body.prompt[0]  # TODO: support multiple prompts

    # Covers None, "" and [].
    if not body.prompt:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "prompt not found")

    generator = eval_rwkv(
        model, request, body, body.prompt, body.stream, body.stop, False
    )
    if body.stream:
        return EventSourceResponse(generator)

    try:
        # Non-streaming mode: only the first yielded item is returned.
        return await generator.__anext__()
    except StopAsyncIteration:
        return None
embeddings api compatible with openai api and langchain(sdk) 2023-06-19 22:51:06 +08:00

			`class EmbeddingsBody(BaseModel):`
allow completions input to be null 2023-08-04 22:22:59 +08:00			`input: Union[str, List[str], List[List[int]], None]`
embeddings api compatible with openai api and langchain(sdk) 2023-06-19 22:51:06 +08:00			`model: str = "rwkv"`
			`encoding_format: str = None`
			`fast_mode: bool = False`

			`class Config:`
			`schema_extra = {`
			`"example": {`
			`"input": "a big apple",`
			`"model": "rwkv",`
			`"encoding_format": None,`
			`"fast_mode": False,`
			`}`
			`}`


			`def embedding_base64(embedding: List[float]) -> str:`
			`return base64.b64encode(np.array(embedding).astype(np.float32)).decode("utf-8")`


@router.post("/v1/embeddings", tags=["Embeddings"])
@router.post("/embeddings", tags=["Embeddings"])
@router.post("/v1/engines/text-embedding-ada-002/embeddings", tags=["Embeddings"])
@router.post("/engines/text-embedding-ada-002/embeddings", tags=["Embeddings"])
async def embeddings(body: EmbeddingsBody, request: Request):
    """Compute embeddings for the given input.

    The handler increments the module-level ``requests_num`` counter, polls
    ``completion_lock`` every 0.1 s until it is free (giving up if the client
    disconnects), then computes the embeddings while holding the lock.

    Args:
        body: Parsed request body. ``input`` may be a string, a list of
            strings, or a list of token-id lists. Token-id lists are decoded
            to text with the tiktoken encoding for "text-embedding-ada-002"
            before being embedded.
        request: The incoming request, used for disconnect checks and logging.

    Returns:
        A dict with ``object``, ``data``, ``model`` and ``usage`` keys, or
        ``None`` if the client disconnected before the response was ready.

    Raises:
        HTTPException: 400 if no model is loaded or the input is missing/empty.
    """
    model: AbstractRWKV = global_var.get(global_var.Model)
    if model is None:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "model not loaded")

    # Covers None, "", [] and [[]].
    if not body.input or body.input == [[]]:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "input not found")

    global requests_num
    requests_num = requests_num + 1
    quick_log(request, None, "Start Waiting. RequestsNum: " + str(requests_num))

    # Poll instead of blocking on the lock so the event loop keeps running and
    # a disconnected client can stop waiting.
    while completion_lock.locked():
        if await request.is_disconnected():
            requests_num = requests_num - 1
            print(f"{request.client} Stop Waiting (Lock)")
            quick_log(
                request,
                None,
                "Stop Waiting (Lock). RequestsNum: " + str(requests_num),
            )
            return
        await asyncio.sleep(0.1)

    with completion_lock:
        if await request.is_disconnected():
            requests_num = requests_num - 1
            print(f"{request.client} Stop Waiting (Lock)")
            quick_log(
                request,
                None,
                "Stop Waiting (Lock). RequestsNum: " + str(requests_num),
            )
            return

        base64_format = body.encoding_format == "base64"
        vectors = []
        prompt_tokens = 0
        try:
            if isinstance(body.input, list):
                # The first element decides how the whole list is treated:
                # token-id lists are decoded to text, strings are used as-is.
                encoding = None
                if isinstance(body.input[0], list):
                    encoding = tiktoken.model.encoding_for_model(
                        "text-embedding-ada-002"
                    )
                for item in body.input:
                    if await request.is_disconnected():
                        break
                    text = item if encoding is None else encoding.decode(item)
                    embedding, token_len = model.get_embedding(text, body.fast_mode)
                    prompt_tokens = prompt_tokens + token_len
                    if base64_format:
                        embedding = embedding_base64(embedding)
                    vectors.append(embedding)
            else:
                embedding, prompt_tokens = model.get_embedding(
                    body.input, body.fast_mode
                )
                if base64_format:
                    embedding = embedding_base64(embedding)
                vectors.append(embedding)
        finally:
            # Always release this request's slot in the counter, even when
            # decoding or the model call raises; otherwise it drifts upward.
            requests_num = requests_num - 1

        if await request.is_disconnected():
            print(f"{request.client} Stop Waiting")
            quick_log(
                request,
                None,
                "Stop Waiting. RequestsNum: " + str(requests_num),
            )
            return
        quick_log(
            request,
            None,
            "Finished. RequestsNum: " + str(requests_num),
        )

        ret_data = [
            {
                "object": "embedding",
                "index": index,
                "embedding": vector,
            }
            for index, vector in enumerate(vectors)
        ]

        return {
            "object": "list",
            "data": ret_data,
            "model": model.name,
            "usage": {
                "prompt_tokens": prompt_tokens,
                "total_tokens": prompt_tokens,
            },
        }