OpenAI-compatible API

在开发应用的时候，不少公司提供的 API 是兼容 OpenAI 客户端的，例如下面是一个经典的 OpenAI 调用：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
from openai import OpenAI

# Instantiate the SDK client with a (placeholder) API key.
api = OpenAI(api_key='xxxx')

# Ask the chat-completions endpoint for a single reply.
prompt = [
    {"role": "user", "content": "Write a limerick about python exceptions"}
]
response = api.chat.completions.create(model="gpt-3.5-turbo", messages=prompt)

# Show the assistant's text first, then the full response object.
print(response.choices[0].message.content)
print(response)

如何开发一个非常简单的服务端来配合 OpenAI 的客户端使用呢？这就是本篇 blog 的内容。

openai 客户端的主要请求路径是 /v1/chat/completions，那么为了能够响应该请求，我们需要设定这样的服务器

1
2
3
4
5
6
7
from fastapi import FastAPI

app = FastAPI(title="OpenAI-compatible API")

# Stub endpoint on the exact path the OpenAI client calls.
@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    # NOTE(review): placeholder only — ChatCompletionRequest is defined
    # further down in the article; the handler body is filled in later.
    pass

其中ChatCompletionRequest是一个请求格式

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
from typing import List, Optional

from pydantic import BaseModel


class ChatMessage(BaseModel):
    """One chat message, mirroring the OpenAI message object."""

    # Speaker role — "user"/"assistant"/"system" by OpenAI convention,
    # though any string is accepted here.
    role: str
    # The message text.
    content: str

class ChatCompletionRequest(BaseModel):
    """Request body for /v1/chat/completions in the OpenAI schema."""

    # Model name; defaults to a mock identifier and is echoed back.
    model: str = "mock-gpt-model"
    # Conversation history; the last message is the one to answer.
    messages: List[ChatMessage]
    # Accepted for OpenAI-client compatibility; not otherwise used here.
    max_tokens: Optional[int] = 512
    temperature: Optional[float] = 0.1
    # When true the client expects a streamed (chunked) response.
    stream: Optional[bool] = False

那么完整的代码

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import time
from typing import List, Optional

from pydantic import BaseModel


class ChatMessage(BaseModel):
    """One chat message, mirroring the OpenAI message object."""

    # Speaker role — "user"/"assistant"/"system" by OpenAI convention,
    # though any string is accepted here.
    role: str
    # The message text.
    content: str


class ChatCompletionRequest(BaseModel):
    """Request body for /v1/chat/completions in the OpenAI schema."""

    # Model name; defaults to a mock identifier and is echoed back.
    model: str = "mock-gpt-model"
    # Conversation history; the last message is the one to echo.
    messages: List[ChatMessage]
    # Accepted for OpenAI-client compatibility; ignored by this mock server.
    max_tokens: Optional[int] = 512
    temperature: Optional[float] = 0.1
    stream: Optional[bool] = False


from fastapi import FastAPI

app = FastAPI(title="OpenAI-compatible API")

@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    """Mock OpenAI chat-completions endpoint: echoes the last user message.

    Returns a dict shaped like the OpenAI `chat.completion` object so the
    official client can parse it unchanged.
    """
    if request.messages and request.messages[0].role == 'user':
        resp_content = "As a mock AI Assistant, I can only echo your last message:" + request.messages[-1].content
    else:
        resp_content = "As a mock AI Assistant, I can only echo your last message, but there were no messages!"

    return {
        "id": "1337",
        "object": "chat.completion",
        # The OpenAI schema defines `created` as a unix timestamp in
        # seconds (an integer); bare time.time() would yield a float.
        "created": int(time.time()),
        "model": request.model,
        "choices": [{
            # `index` and `finish_reason` are part of the OpenAI choice
            # object; clients may rely on them being present.
            "index": 0,
            "message": ChatMessage(role="assistant", content=resp_content),
            "finish_reason": "stop",
        }]
    }

到这里已经完成了简单的请求处理，但是 openai 的 client 还支持鉴权，我们可以添加以下代码来增加安全性管理

import time
from typing import List, Optional

from pydantic import BaseModel


class ChatMessage(BaseModel):
    """One chat message, mirroring the OpenAI message object."""

    # Speaker role — "user"/"assistant"/"system" by OpenAI convention,
    # though any string is accepted here.
    role: str
    # The message text.
    content: str


class ChatCompletionRequest(BaseModel):
    """Request body for /v1/chat/completions in the OpenAI schema."""

    # Model name; defaults to a mock identifier and is echoed back.
    model: str = "mock-gpt-model"
    # Conversation history; the last message is the one to echo.
    messages: List[ChatMessage]
    # Accepted for OpenAI-client compatibility; ignored by this mock server.
    max_tokens: Optional[int] = 512
    temperature: Optional[float] = 0.1
    stream: Optional[bool] = False


from fastapi import FastAPI, Depends, HTTPException, status
from fastapi.security import HTTPBearer

app = FastAPI(title="OpenAI-compatible API")

# auto_error=False lets requests without an Authorization header reach
# `credentials`, so we can raise a consistent 401 ourselves instead of
# FastAPI's default 403.
bearer_scheme = HTTPBearer(auto_error=False)


async def credentials(authorization=Depends(bearer_scheme)):
    """Validate the Bearer token sent by the OpenAI client.

    Returns the token when it matches the expected key ('123'); otherwise
    raises HTTP 401 so the client sees an authentication failure.
    """
    if authorization and authorization.credentials == '123':
        # api key is valid
        return authorization.credentials
    # Missing header or wrong key: reject with 401. The WWW-Authenticate
    # header tells compliant clients which auth scheme is expected
    # (RFC 6750); the original response omitted it.
    raise HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Invalid API key",
        headers={"WWW-Authenticate": "Bearer"},
    )


@app.post("/v1/chat/completions", dependencies=[Depends(credentials)])
async def chat_completions(request: ChatCompletionRequest):
    """Authenticated mock chat-completions endpoint: echoes the last message.

    The `credentials` dependency rejects requests without a valid API key
    before this handler runs.
    """
    if request.messages and request.messages[0].role == 'user':
        resp_content = "As a mock AI Assistant, I can only echo your last message:" + request.messages[-1].content
    else:
        resp_content = "As a mock AI Assistant, I can only echo your last message, but there were no messages!"

    return {
        "id": "1337",
        "object": "chat.completion",
        # The OpenAI schema defines `created` as an integer unix timestamp;
        # bare time.time() would yield a float.
        "created": int(time.time()),
        "model": request.model,
        "choices": [{
            # `index` and `finish_reason` are part of the OpenAI choice
            # object; clients may rely on them being present.
            "index": 0,
            "message": ChatMessage(role="assistant", content=resp_content),
            "finish_reason": "stop",
        }]
    }

如果 api_key 不是 123 的话就会返回 401 错误。到这里，一个简单而完整的服务器就完成✅了，能够无缝切换使用 openai 的 client 客户端而不需要其他修改

进阶

由于 llm 推理需要大量的计算资源，将生成的 token 一个一个地流式返回给用户能带来更好的用户体验

对代码进行修改

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import time
from typing import List, Optional

from pydantic import BaseModel


class ChatMessage(BaseModel):
    """One chat message, mirroring the OpenAI message object."""

    # Speaker role — "user"/"assistant"/"system" by OpenAI convention,
    # though any string is accepted here.
    role: str
    # The message text.
    content: str


class ChatCompletionRequest(BaseModel):
    """Request body for /v1/chat/completions in the OpenAI schema."""

    # Model name; defaults to a mock identifier and is echoed back.
    model: str = "mock-gpt-model"
    # Conversation history; the last message is the one to echo.
    messages: List[ChatMessage]
    # Accepted for OpenAI-client compatibility; not used by the mock reply.
    max_tokens: Optional[int] = 512
    temperature: Optional[float] = 0.1
    # When true the endpoint streams the reply word by word.
    stream: Optional[bool] = False


from fastapi import FastAPI, Depends, HTTPException, status
from fastapi.security import HTTPBearer

app = FastAPI(title="OpenAI-compatible API")

# auto_error=False lets requests without an Authorization header reach
# `credentials`, so we can raise a consistent 401 ourselves instead of
# FastAPI's default 403.
bearer_scheme = HTTPBearer(auto_error=False)


async def credentials(authorization=Depends(bearer_scheme)):
    """Validate the Bearer token sent by the OpenAI client.

    Returns the token when it matches the expected key ('123'); otherwise
    raises HTTP 401 so the client sees an authentication failure.
    """
    if authorization and authorization.credentials == '123':
        # api key is valid
        return authorization.credentials
    # Missing header or wrong key: reject with 401. The WWW-Authenticate
    # header tells compliant clients which auth scheme is expected
    # (RFC 6750); the original response omitted it.
    raise HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Invalid API key",
        headers={"WWW-Authenticate": "Bearer"},
    )


import asyncio
import json


async def _resp_async_generator(text_resp: str):
    # let's pretend every word is a token and return it over time
    tokens = text_resp.split(" ")

    for i, token in enumerate(tokens):
        chunk = {
            "id": i,
            "object": "chat.completion.chunk",
            "created": time.time(),
            "model": "blah",
            "choices": [{"delta": {"content": token + " "}}],
        }
        yield f"data: {json.dumps(chunk)}\n\n"
        await asyncio.sleep(0.5)
    yield "data: [DONE]\n\n"


from starlette.responses import StreamingResponse


@app.post("/v1/chat/completions", dependencies=[Depends(credentials)])
async def chat_completions(request: ChatCompletionRequest):
    """Mock chat-completions endpoint with optional streaming.

    When `request.stream` is true the reply is sent as Server-Sent Events
    chunks; otherwise a single OpenAI-shaped `chat.completion` object is
    returned.
    """
    if request.messages and request.messages[0].role == 'user':
        resp_content = "As a mock AI Assistant, I can only echo your last message:" + request.messages[-1].content
    else:
        resp_content = "As a mock AI Assistant, I can only echo your last message, but there were no messages!"

    if request.stream:
        # The generator yields `data: ...\n\n` frames, i.e. Server-Sent
        # Events — advertise them as such so clients parse the stream
        # correctly (the original "application/x-ndjson" mislabeled it).
        return StreamingResponse(_resp_async_generator(resp_content), media_type="text/event-stream")

    return {
        "id": "1337",
        "object": "chat.completion",
        # OpenAI defines `created` as an integer unix timestamp;
        # bare time.time() would yield a float.
        "created": int(time.time()),
        "model": request.model,
        "choices": [{
            # `index` and `finish_reason` are part of the OpenAI choice
            # object; clients may rely on them being present.
            "index": 0,
            "message": ChatMessage(role="assistant", content=resp_content),
            "finish_reason": "stop",
        }]
    }

参考

  1. https://towardsdatascience.com/how-to-build-an-openai-compatible-api-87c8edea2f06

  2. vllm