RWKV-Runner/backend-python/routes/completion.py

453 lines
15 KiB
Python
Raw Normal View History

2023-05-17 11:39:00 +08:00
import asyncio
2023-05-07 17:27:54 +08:00
import json
2023-05-17 11:39:00 +08:00
from threading import Lock
2023-05-07 17:27:54 +08:00
from typing import List
import base64
2023-05-07 17:27:54 +08:00
from fastapi import APIRouter, Request, status, HTTPException
from sse_starlette.sse import EventSourceResponse
from pydantic import BaseModel
import numpy as np
import tiktoken
2023-05-07 17:27:54 +08:00
from utils.rwkv import *
2023-06-03 17:12:59 +08:00
from utils.log import quick_log
2023-05-07 17:27:54 +08:00
import global_var
# Router on which the completion, chat-completion and embedding endpoints
# defined in this module are registered via @router.post(...).
router = APIRouter()
class Message(BaseModel):
    # One entry of a chat conversation as sent by the client.

    # Who produced the message. The chat endpoint in this module recognises
    # "user", "assistant" and "system"; other values are ignored when the
    # prompt is assembled.
    role: str

    # Raw text of the message.
    content: str
2023-05-22 11:18:37 +08:00
class ChatCompletionBody(ModelConfigBody):
    # Request payload of the chat-completion endpoints. Sampling parameters
    # (max_tokens, temperature, ...) come from ModelConfigBody.

    # Conversation history, oldest message first.
    messages: List[Message]
    # Model name sent by the client; defaults to "rwkv".
    model: str = "rwkv"
    # When true the answer is delivered as server-sent events.
    stream: bool = False
    # Optional stop sequence. None is a legal value (see the example below),
    # so the annotation says so explicitly instead of relying on an implicit
    # Optional derived from the default.
    stop: str | None = None

    class Config:
        schema_extra = {
            "example": {
                "messages": [{"role": "user", "content": "hello"}],
                "model": "rwkv",
                "stream": False,
                "stop": None,
                "max_tokens": 1000,
                "temperature": 1.2,
                "top_p": 0.5,
                "presence_penalty": 0.4,
                "frequency_penalty": 0.4,
            }
        }
2023-05-17 11:39:00 +08:00
2023-06-18 20:16:52 +08:00
class CompletionBody(ModelConfigBody):
    # Request payload of the plain text-completion endpoints. Sampling
    # parameters (max_tokens, temperature, ...) come from ModelConfigBody.

    # Text the model should continue.
    prompt: str
    # Model name sent by the client; defaults to "rwkv".
    model: str = "rwkv"
    # When true the answer is delivered as server-sent events.
    stream: bool = False
    # Optional stop sequence. None is a legal value (see the example below),
    # so the annotation says so explicitly instead of relying on an implicit
    # Optional derived from the default.
    stop: str | None = None

    class Config:
        schema_extra = {
            "example": {
                "prompt": "The following is an epic science fiction masterpiece that is immortalized, "
                + "with delicate descriptions and grand depictions of interstellar civilization wars.\nChapter 1.\n",
                "model": "rwkv",
                "stream": False,
                "stop": None,
                "max_tokens": 100,
                "temperature": 1.2,
                "top_p": 0.5,
                "presence_penalty": 0.4,
                "frequency_penalty": 0.4,
            }
        }
2023-05-17 11:39:00 +08:00
# Guards the loaded model: the handlers in this module acquire this lock
# before calling into the model and poll `locked()` while waiting for it.
completion_lock = Lock()
2023-05-07 17:27:54 +08:00
2023-06-03 17:12:59 +08:00
# Count of requests currently waiting for, or running on, the model.
# In this file it is only read to build log messages.
requests_num = 0
2023-05-07 17:27:54 +08:00
2023-06-18 20:16:52 +08:00
def _stream_chunk(model_name, response: str, delta, chat_mode: bool, finished: bool) -> str:
    """Serialise one streaming payload as a JSON string.

    Args:
        model_name: Value reported under the "model" key.
        response: Full text generated so far.
        delta: Text produced by the latest generation step; ignored when
            ``finished`` is true.
        chat_mode: Selects the chat-completion layout (``delta`` object)
            instead of the text-completion layout (``text`` field).
        finished: True for the terminating chunk, which carries no new text
            and ``finish_reason == "stop"``.
    """
    finish_reason = "stop" if finished else None
    if chat_mode:
        choice = {
            "delta": {} if finished else {"content": delta},
            "index": 0,
            "finish_reason": finish_reason,
        }
    else:
        choice = {
            "text": "" if finished else delta,
            "index": 0,
            "finish_reason": finish_reason,
        }
    return json.dumps(
        {
            "object": "chat.completion.chunk" if chat_mode else "text_completion",
            "response": response,
            "model": model_name,
            "choices": [choice],
        }
    )


async def eval_rwkv(
    model: RWKV,
    request: Request,
    body: ModelConfigBody,
    prompt: str,
    stream: bool,
    stop: str,
    chat_mode: bool,
):
    """Generate a completion for ``prompt`` while holding ``completion_lock``.

    This is an async generator.

    * With ``stream`` true it yields one JSON string per generation step,
      then a final JSON chunk with ``finish_reason == "stop"``, then the
      literal ``"[DONE]"``.
    * With ``stream`` false it yields exactly one dict holding the whole
      response and token usage.
    * If the client disconnects while waiting or generating, it finishes
      without yielding a final result.

    Args:
        model: Loaded model; ``generate`` and ``name`` are used.
        request: Incoming request, polled for client disconnects.
        body: Per-request sampling configuration, applied on top of the
            globally stored configuration.
        prompt: Text handed to ``model.generate``.
        stream: Whether to emit incremental chunks.
        stop: Stop sequence forwarded to ``model.generate`` (may be None).
        chat_mode: Selects chat-completion vs. text-completion payloads.
    """
    global requests_num
    requests_num = requests_num + 1
    quick_log(request, None, "Start Waiting. RequestsNum: " + str(requests_num))

    # Poll instead of blocking on acquire() so the event loop keeps running
    # and a waiting client that goes away can be dropped from the queue.
    while completion_lock.locked():
        if await request.is_disconnected():
            requests_num = requests_num - 1
            print(f"{request.client} Stop Waiting (Lock)")
            quick_log(
                request,
                None,
                "Stop Waiting (Lock). RequestsNum: " + str(requests_num),
            )
            return
        await asyncio.sleep(0.1)
    # No await between the locked() check and acquire(), so no other
    # coroutine on this loop can take the lock in between.
    completion_lock.acquire()

    started = False
    response, prompt_tokens, completion_tokens = "", 0, 0
    try:
        if not await request.is_disconnected():
            started = True
            set_rwkv_config(model, global_var.get(global_var.Model_Config))
            set_rwkv_config(model, body)

            for response, delta, prompt_tokens, completion_tokens in model.generate(
                prompt,
                stop=stop,
            ):
                if await request.is_disconnected():
                    break
                if stream:
                    yield _stream_chunk(model.name, response, delta, chat_mode, False)
        # torch_gc()
    finally:
        # Runs on normal completion, on an exception from the model and when
        # the generator is closed or cancelled at a yield/await. Without it
        # the lock would stay held and every later request would wait forever.
        requests_num = requests_num - 1
        completion_lock.release()

    if not started:
        print(f"{request.client} Stop Waiting (Lock)")
        quick_log(
            request,
            None,
            "Stop Waiting (Lock). RequestsNum: " + str(requests_num),
        )
        return

    if await request.is_disconnected():
        print(f"{request.client} Stop Waiting")
        quick_log(
            request,
            body,
            response + "\nStop Waiting. RequestsNum: " + str(requests_num),
        )
        return

    quick_log(
        request,
        body,
        response + "\nFinished. RequestsNum: " + str(requests_num),
    )

    if stream:
        yield _stream_chunk(model.name, response, None, chat_mode, True)
        yield "[DONE]"
    else:
        if chat_mode:
            choice = {
                "message": {
                    "role": "assistant",
                    "content": response,
                },
                "index": 0,
                "finish_reason": "stop",
            }
        else:
            choice = {
                "text": response,
                "index": 0,
                "finish_reason": "stop",
            }
        yield {
            "object": "chat.completion" if chat_mode else "text_completion",
            "response": response,
            "model": model.name,
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
            "choices": [choice],
        }
2023-05-07 17:27:54 +08:00
def _normalize_message(content: str) -> str:
    """Normalise line breaks of a user/assistant message for the prompt.

    Literal backslash-n sequences and CRLF become newlines, doubled newlines
    are collapsed once, and surrounding whitespace is stripped.
    """
    return (
        content.replace("\\n", "\n")
        .replace("\r\n", "\n")
        .replace("\n\n", "\n")
        .strip()
    )


def _rewrite_system_prompt(content: str, user: str, bot: str) -> str:
    """Flatten a system prompt to one line and rewrite its second-person wording.

    With the "Bob" persona the prompt is turned into a third-person
    description of ``bot``; otherwise it is turned into a first-person
    statement by the assistant.
    """
    roleplay = user == "Bob"
    text = (
        content.replace("\\n", "\n")
        .replace("\r\n", "\n")
        .replace("\n\n", "\n")
        .replace("\n", " ")
        .strip()
    )
    # Order matters: longer patterns must run before their prefixes. In
    # particular "Your"/"your" must be handled before "You"/"you", otherwise
    # "Your" is rewritten to "<bot>r" and the possessive rule never matches.
    substitutions = (
        ("You are", f"{bot} is" if roleplay else "I am"),
        ("you are", f"{bot} is" if roleplay else "I am"),
        ("You're", f"{bot} is" if roleplay else "I'm"),
        ("you're", f"{bot} is" if roleplay else "I'm"),
        ("Your", f"{bot}'s" if roleplay else "My"),
        ("your", f"{bot}'s" if roleplay else "my"),
        ("You", bot if roleplay else "I"),
        ("you", bot if roleplay else "I"),
        # NOTE(review): this rule previously read replace("", ...), which makes
        # str.replace insert the bot name between every character. The
        # literals look like non-ASCII text lost in transit; restored here as
        # the Chinese pronouns "you" -> "I" -- confirm against upstream.
        ("你", bot if roleplay else "我"),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text


@router.post("/v1/chat/completions")
@router.post("/chat/completions")
async def chat_completions(body: ChatCompletionBody, request: Request):
    # Chat endpoint: turns the message list into a single RWKV prompt and
    # returns either a server-sent-event stream or one completion object.
    # Responds with 400 when no model is loaded or when the conversation does
    # not end with a user message (optionally followed by one system message).
    model: RWKV = global_var.get(global_var.Model)
    if model is None:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "model not loaded")

    # Slices instead of indexing so that an empty list or a lone system
    # message yields a 400 rather than an IndexError (HTTP 500).
    roles = [message.role for message in body.messages]
    if roles[-1:] != ["user"] and roles[-2:] != ["user", "system"]:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "no question found")

    interface = model.interface
    user = model.user
    bot = model.bot
    roleplay = user == "Bob"

    # Default preamble used when the client sends no system message.
    if roleplay:
        completion_text = (
            f"\nThe following is a coherent verbose detailed conversation between a girl named {bot} and her friend {user}. "
            f"{bot} is very intelligent, creative and friendly. "
            f"{bot} is unlikely to disagree with {user}, and {bot} doesn't like to ask {user} questions. "
            f"{bot} likes to tell {user} a lot about herself and her opinions. "
            f"{bot} usually gives {user} kind, helpful and informative advices.\n\n"
        )
    else:
        completion_text = (
            f"{user}{interface} hi\n\n{bot}{interface} Hi. "
            + "I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.\n\n"
        )

    # Only the first system message is honoured; it replaces the default
    # preamble. The persona prefix is chosen first and the rewritten system
    # text appended afterwards, so the system message is kept for the "Bob"
    # persona as well (a bare conditional expression would bind the
    # concatenation to the else-branch only and drop it).
    system_message = next(
        (message for message in body.messages if message.role == "system"), None
    )
    if system_message is not None:
        if roleplay:
            prefix = f"The following is a coherent verbose detailed conversation between a girl named {bot} and her friend {user}. "
        else:
            prefix = f"{user}{interface} hi\n\n{bot}{interface} Hi. "
        completion_text = (
            prefix + _rewrite_system_prompt(system_message.content, user, bot) + "\n\n"
        )

    # Append the dialogue turns; messages with any other role are skipped.
    speakers = {"user": user, "assistant": bot}
    turns = [
        f"{speakers[message.role]}{interface} "
        + _normalize_message(message.content)
        + "\n\n"
        for message in body.messages
        if message.role in speakers
    ]
    # Finish with the bot's tag so the model continues as the bot.
    completion_text += "".join(turns) + f"{bot}{interface}"

    # By default stop as soon as the model starts writing the user's next turn.
    stop = f"\n\n{user}" if body.stop is None else body.stop

    generator = eval_rwkv(
        model, request, body, completion_text, body.stream, stop, True
    )
    if body.stream:
        return EventSourceResponse(generator)
    try:
        # Non-streaming mode yields exactly one item: the full response.
        return await generator.__anext__()
    except StopAsyncIteration:
        # The generator ended without a result (client disconnected).
        return None
2023-05-22 11:18:37 +08:00
@router.post("/v1/completions")
@router.post("/completions")
async def completions(body: CompletionBody, request: Request):
    # Plain text-completion endpoint: forwards the prompt unchanged to
    # eval_rwkv and returns either an event stream or one completion object.
    # Responds with 400 when no model is loaded or the prompt is empty.
    model: RWKV = global_var.get(global_var.Model)
    if model is None:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "model not loaded")

    if not body.prompt:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "prompt not found")

    generator = eval_rwkv(
        model, request, body, body.prompt, body.stream, body.stop, False
    )
    if body.stream:
        return EventSourceResponse(generator)

    try:
        # Non-streaming mode yields exactly one item: the full response.
        result = await generator.__anext__()
    except StopAsyncIteration:
        # The generator ended without a result (client disconnected).
        result = None
    return result
class EmbeddingsBody(BaseModel):
    # Request payload of the embeddings endpoints.

    # Text to embed: one string, a list of strings, or a list of token-id
    # lists (decoded with the text-embedding-ada-002 tokenizer by the handler).
    input: str | List[str] | List[List[int]]
    # Model name sent by the client; defaults to "rwkv".
    model: str = "rwkv"
    # "base64" returns each embedding as a base64 string; any other value
    # (including None) returns the embedding as produced by the model.
    # None is a legal value, so the annotation states it explicitly.
    encoding_format: str | None = None
    # Forwarded unchanged to model.get_embedding.
    fast_mode: bool = False

    class Config:
        schema_extra = {
            "example": {
                "input": "a big apple",
                "model": "rwkv",
                "encoding_format": None,
                "fast_mode": False,
            }
        }
def embedding_base64(embedding: List[float]) -> str:
    """Return *embedding* as base64 text of its values converted to float32."""
    as_float32 = np.array(embedding).astype(np.float32)
    encoded = base64.b64encode(as_float32)
    return encoded.decode("utf-8")
@router.post("/v1/embeddings")
@router.post("/embeddings")
@router.post("/v1/engines/text-embedding-ada-002/embeddings")
@router.post("/engines/text-embedding-ada-002/embeddings")
async def embeddings(body: EmbeddingsBody, request: Request):
    # Embeddings endpoint: computes one embedding per input item while holding
    # completion_lock and returns them with token usage.
    # Responds with 400 when no model is loaded or the input is empty; returns
    # None when the client disconnects before the result is ready.
    global requests_num

    model: RWKV = global_var.get(global_var.Model)
    if model is None:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "model not loaded")

    if body.input is None or body.input == "" or body.input == [] or body.input == [[]]:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "input not found")

    requests_num = requests_num + 1
    quick_log(request, None, "Start Waiting. RequestsNum: " + str(requests_num))

    # Poll instead of blocking on acquire() so the event loop keeps running
    # and a waiting client that goes away can be dropped from the queue.
    while completion_lock.locked():
        if await request.is_disconnected():
            requests_num = requests_num - 1
            print(f"{request.client} Stop Waiting (Lock)")
            quick_log(
                request,
                None,
                "Stop Waiting (Lock). RequestsNum: " + str(requests_num),
            )
            return
        await asyncio.sleep(0.1)
    # No await between the locked() check and acquire(), so no other
    # coroutine on this loop can take the lock in between.
    completion_lock.acquire()

    started = False
    vectors = []
    prompt_tokens = 0
    try:
        if not await request.is_disconnected():
            started = True
            base64_format = body.encoding_format == "base64"

            if isinstance(body.input, list):
                # Token-id lists are decoded back to text before embedding.
                decode = None
                if isinstance(body.input[0], list):
                    decode = tiktoken.model.encoding_for_model(
                        "text-embedding-ada-002"
                    ).decode
                for item in body.input:
                    if await request.is_disconnected():
                        break
                    text = item if decode is None else decode(item)
                    embedding, token_len = model.get_embedding(text, body.fast_mode)
                    prompt_tokens = prompt_tokens + token_len
                    if base64_format:
                        embedding = embedding_base64(embedding)
                    vectors.append(embedding)
            else:
                embedding, prompt_tokens = model.get_embedding(
                    body.input, body.fast_mode
                )
                if base64_format:
                    embedding = embedding_base64(embedding)
                vectors.append(embedding)
    finally:
        # Runs on success, on an exception from the tokenizer/model and on
        # cancellation. Without it the lock would stay held and every later
        # request would wait forever.
        requests_num = requests_num - 1
        completion_lock.release()

    if not started:
        print(f"{request.client} Stop Waiting (Lock)")
        quick_log(
            request,
            None,
            "Stop Waiting (Lock). RequestsNum: " + str(requests_num),
        )
        return

    if await request.is_disconnected():
        print(f"{request.client} Stop Waiting")
        quick_log(
            request,
            None,
            "Stop Waiting. RequestsNum: " + str(requests_num),
        )
        return

    quick_log(
        request,
        None,
        "Finished. RequestsNum: " + str(requests_num),
    )

    ret_data = [
        {
            "object": "embedding",
            "index": i,
            "embedding": embedding,
        }
        for i, embedding in enumerate(vectors)
    ]

    return {
        "object": "list",
        "data": ret_data,
        "model": model.name,
        "usage": {"prompt_tokens": prompt_tokens, "total_tokens": prompt_tokens},
    }