OpenAI-compatible API
When building applications, many companies design their APIs to work with the OpenAI client. A classic OpenAI call looks like this:
```python
from openai import OpenAI

client = OpenAI(
    api_key='xxxx'
)

completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": "Write a limerick about python exceptions"}
    ]
)

print(completion.choices[0].message.content)
print(completion)
```
How do we build a very simple server that works with the OpenAI client? That is the topic of this blog post.
The OpenAI client's main request goes to /v1/chat/completions, so to answer that request we need a server like this:
```python
from fastapi import FastAPI

app = FastAPI(title="OpenAI-compatible API")

@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    pass
```
where ChatCompletionRequest is the request schema:
```python
from typing import List, Optional
from pydantic import BaseModel

class ChatMessage(BaseModel):
    role: str
    content: str

class ChatCompletionRequest(BaseModel):
    model: str = "mock-gpt-model"
    messages: List[ChatMessage]
    max_tokens: Optional[int] = 512
    temperature: Optional[float] = 0.1
    stream: Optional[bool] = False
```
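As a quick illustration (not part of the server itself), FastAPI validates each incoming JSON body against this model and fills in the defaults. The dict below is a hypothetical body shaped like what the OpenAI client sends:

```python
# Hypothetical request body, shaped like what the OpenAI client sends.
body = {
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "hi"}],
}
req = ChatCompletionRequest(**body)  # FastAPI does this for us on every request
print(req.max_tokens)  # -> 512  (default applied)
print(req.stream)      # -> False (default applied)
```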
Putting it together, the full code:
```python
import time
from typing import List, Optional

from fastapi import FastAPI
from pydantic import BaseModel

class ChatMessage(BaseModel):
    role: str
    content: str

class ChatCompletionRequest(BaseModel):
    model: str = "mock-gpt-model"
    messages: List[ChatMessage]
    max_tokens: Optional[int] = 512
    temperature: Optional[float] = 0.1
    stream: Optional[bool] = False

app = FastAPI(title="OpenAI-compatible API")

@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    # mock behavior: just echo the user's last message back
    if request.messages and request.messages[0].role == 'user':
        resp_content = "As a mock AI Assistant, I can only echo your last message: " + request.messages[-1].content
    else:
        resp_content = "As a mock AI Assistant, I can only echo your last message, but there were no messages!"
    return {
        "id": "1337",
        "object": "chat.completion",
        "created": time.time(),
        "model": request.model,
        "choices": [{
            "message": ChatMessage(role="assistant", content=resp_content)
        }]
    }
```
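To sanity-check it, run the server and point the stock OpenAI client at it by overriding base_url. A minimal sketch, assuming the code above is saved as main.py and started with `uvicorn main:app --port 8000`:

```python
from openai import OpenAI

client = OpenAI(
    api_key='anything',                    # no auth yet, any non-empty key works
    base_url='http://localhost:8000/v1',   # point the client at our mock server
)

completion = client.chat.completions.create(
    model="mock-gpt-model",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(completion.choices[0].message.content)
# -> As a mock AI Assistant, I can only echo your last message: Hello!
```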
At this point simple requests already work, but the OpenAI client also supports authentication, so we can add the following code for some basic security:
```python
import time
from typing import List, Optional

from fastapi import FastAPI, Depends, HTTPException, status
from fastapi.security import HTTPBearer
from pydantic import BaseModel

class ChatMessage(BaseModel):
    role: str
    content: str

class ChatCompletionRequest(BaseModel):
    model: str = "mock-gpt-model"
    messages: List[ChatMessage]
    max_tokens: Optional[int] = 512
    temperature: Optional[float] = 0.1
    stream: Optional[bool] = False

app = FastAPI(title="OpenAI-compatible API")
bearer_scheme = HTTPBearer(auto_error=False)

async def credentials(authorization=Depends(bearer_scheme)):
    if authorization and authorization.credentials == '123':
        # api key is valid
        return authorization.credentials
    # otherwise reject with HTTP 401
    raise HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Invalid API key",
    )

@app.post("/v1/chat/completions", dependencies=[Depends(credentials)])
async def chat_completions(request: ChatCompletionRequest):
    if request.messages and request.messages[0].role == 'user':
        resp_content = "As a mock AI Assistant, I can only echo your last message: " + request.messages[-1].content
    else:
        resp_content = "As a mock AI Assistant, I can only echo your last message, but there were no messages!"
    return {
        "id": "1337",
        "object": "chat.completion",
        "created": time.time(),
        "model": request.model,
        "choices": [{
            "message": ChatMessage(role="assistant", content=resp_content)
        }]
    }
```
If the api_key is anything other than 123, the request fails. With that, a simple but complete server is done ✅, and the OpenAI client can switch over to it seamlessly without any other changes.
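A quick check of the auth path from the client side (again assuming the server runs on localhost:8000): with the right key the call succeeds, and with any other key the 401 surfaces as an openai.AuthenticationError:

```python
import openai
from openai import OpenAI

# correct key: the request goes through
client = OpenAI(api_key='123', base_url='http://localhost:8000/v1')
print(client.chat.completions.create(
    model="mock-gpt-model",
    messages=[{"role": "user", "content": "ping"}],
).choices[0].message.content)

# wrong key: the server answers 401, which the client raises as an exception
bad = OpenAI(api_key='wrong-key', base_url='http://localhost:8000/v1')
try:
    bad.chat.completions.create(
        model="mock-gpt-model",
        messages=[{"role": "user", "content": "ping"}],
    )
except openai.AuthenticationError as e:
    print("rejected:", e)
```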
Going further
Since LLMs need a lot of compute, streaming the generated tokens out one at a time makes for a much better user experience. The OpenAI client consumes server-sent events: each chunk is a `data: {json}` line followed by a blank line, and the stream ends with `data: [DONE]`.
We modify the code accordingly:
```python
import asyncio
import json
import time
from typing import List, Optional

from fastapi import FastAPI, Depends, HTTPException, status
from fastapi.security import HTTPBearer
from pydantic import BaseModel
from starlette.responses import StreamingResponse

class ChatMessage(BaseModel):
    role: str
    content: str

class ChatCompletionRequest(BaseModel):
    model: str = "mock-gpt-model"
    messages: List[ChatMessage]
    max_tokens: Optional[int] = 512
    temperature: Optional[float] = 0.1
    stream: Optional[bool] = False

app = FastAPI(title="OpenAI-compatible API")
bearer_scheme = HTTPBearer(auto_error=False)

async def credentials(authorization=Depends(bearer_scheme)):
    if authorization and authorization.credentials == '123':
        # api key is valid
        return authorization.credentials
    # otherwise reject with HTTP 401
    raise HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Invalid API key",
    )

async def _resp_async_generator(text_resp: str):
    # let's pretend every word is a token and return it over time
    tokens = text_resp.split(" ")
    for i, token in enumerate(tokens):
        chunk = {
            "id": i,
            "object": "chat.completion.chunk",
            "created": time.time(),
            "model": "blah",
            "choices": [{"delta": {"content": token + " "}}],
        }
        yield f"data: {json.dumps(chunk)}\n\n"
        await asyncio.sleep(0.5)
    yield "data: [DONE]\n\n"

@app.post("/v1/chat/completions", dependencies=[Depends(credentials)])
async def chat_completions(request: ChatCompletionRequest):
    if request.messages and request.messages[0].role == 'user':
        resp_content = "As a mock AI Assistant, I can only echo your last message: " + request.messages[-1].content
    else:
        resp_content = "As a mock AI Assistant, I can only echo your last message, but there were no messages!"
    if request.stream:
        return StreamingResponse(_resp_async_generator(resp_content), media_type="application/x-ndjson")
    return {
        "id": "1337",
        "object": "chat.completion",
        "created": time.time(),
        "model": request.model,
        "choices": [{
            "message": ChatMessage(role="assistant", content=resp_content)
        }]
    }
```
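On the client side, passing stream=True makes the OpenAI client parse the chunks as they arrive. A minimal sketch, again assuming the server runs on localhost:8000:

```python
from openai import OpenAI

client = OpenAI(api_key='123', base_url='http://localhost:8000/v1')

# stream=True: the client yields one parsed chunk per "data: ..." event
stream = client.chat.completions.create(
    model="mock-gpt-model",
    messages=[{"role": "user", "content": "Hello there, server!"}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()
```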
References
- https://towardsdatascience.com/how-to-build-an-openai-compatible-api-87c8edea2f06
- vllm