43 changes: 43 additions & 0 deletions deploy/paddleocr_vl_docker/hps/README.md
@@ -0,0 +1,43 @@
# PaddleOCR-VL High-Performance Serving Deployment (Beta)

This directory provides a high-performance serving deployment solution for PaddleOCR-VL with support for concurrent request processing.

## Requirements

- x64 CPU
- NVIDIA GPU with Compute Capability >= 8.0 and < 12.0 (see the check below)
- NVIDIA driver with CUDA 12.6 support
- Docker >= 19.03

## Quick Start

Clone the PaddleOCR repository and switch to this directory:

```shell
git clone https://github.com/PaddlePaddle/PaddleOCR.git
cd PaddleOCR/deploy/paddleocr_vl_docker/hps
```

Download and copy the required files into the current directory:

```shell
bash prepare.sh
```

Start the services:

```shell
docker compose up
```

The command above starts three containers in sequence, each running one service:

- **`paddleocr-vlm-server`**: the vLLM-based VLM inference service.
- **`paddleocr-vl-tritonserver`**: the PaddleOCR-VL pipeline inference service, based on Triton Inference Server.
- **`paddleocr-vl-api`**: a gateway service implemented with FastAPI that forwards HTTP requests to the Triton Inference Server and wraps the returned results, simplifying client-side calls. **This service is the external entry point**; clients call it directly over HTTP (see the example below).

> The first startup downloads and builds the images automatically, which takes a while; subsequent startups reuse the local images and start much faster.
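
Once all services are healthy, requests can be sent to the gateway. The following is a minimal sketch, assuming the pipeline exposes the standard PaddleX `/layout-parsing` operation; `sample.png` and the `fileType` value of `1` (image input) are placeholders to adapt to your own input:

```shell
curl -s http://localhost:8080/layout-parsing \
  -H "Content-Type: application/json" \
  -d "{\"file\": \"$(base64 -w 0 sample.png)\", \"fileType\": 1}"
```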

## Adjusting the Service Configuration

tbd
51 changes: 51 additions & 0 deletions deploy/paddleocr_vl_docker/hps/compose.yaml
@@ -0,0 +1,51 @@
services:
paddleocr-vl-api:
build:
context: .
dockerfile: gateway.Dockerfile
container_name: paddleocr-vl-api
ports:
- 8080:8080
depends_on:
paddleocr-vl-tritonserver:
condition: service_healthy
restart: unless-stopped
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:8080/health || exit 1"]

paddleocr-vl-tritonserver:
build:
context: .
dockerfile: tritonserver.Dockerfile
container_name: paddleocr-vl-tritonserver
depends_on:
paddleocr-vlm-server:
condition: service_healthy
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ["0"]
capabilities: [gpu]
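              # Note: this service and paddleocr-vlm-server both reserve GPU 0,
              # so the two inference services share a single device.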
shm_size: 4gb
restart: unless-stopped
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:8000/v2/health/ready || exit 1"]

paddleocr-vlm-server:
image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleocr-genai-vllm-server:latest-offline
container_name: paddleocr-vlm-server
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ["0"]
capabilities: [gpu]
# TODO: Allow using a regular user
user: root
restart: unless-stopped
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:8080/health || exit 1"]
start_period: 300s
11 changes: 11 additions & 0 deletions deploy/paddleocr_vl_docker/hps/gateway.Dockerfile
@@ -0,0 +1,11 @@
FROM python:3.10-slim
RUN apt-get update \
&& apt-get install -y --no-install-recommends curl libgl1 \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY gateway .
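# Install the gateway dependencies plus the PaddleX HPS client SDK, which
# prepare.sh unpacks into the build context (bind-mounted here at build time).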
RUN --mount=type=bind,source=paddlex_hps_PaddleOCR-VL_sdk/client,target=/tmp/sdk \
python -m pip install -r requirements.txt \
&& python -m pip install -r /tmp/sdk/requirements.txt \
&& python -m pip install /tmp/sdk/paddlex_hps_client-*.whl
CMD ["uvicorn", "--host", "0.0.0.0", "--port", "8080", "app:app"]
172 changes: 172 additions & 0 deletions deploy/paddleocr_vl_docker/hps/gateway/app.py
@@ -0,0 +1,172 @@
#!/usr/bin/env python

# TODO:
# 1. Concurrency control
# 2. Timeout control
# 3. Separate infer and non-infer operations
# 4. Fix FastAPI encoding bug
# 5. Add exception handlers for a standardized error response

import logging
from typing import Optional

import fastapi
from fastapi.responses import JSONResponse
from paddlex_hps_client import triton_request
from paddlex.inference.serving.infra.models import AIStudioNoResultResponse
from paddlex.inference.serving.infra.utils import generate_log_id
from paddlex.inference.serving.schemas import paddleocr_vl as schema
from tritonclient import grpc as triton_grpc

TRITONSERVER_URL = "paddleocr-vl-tritonserver:8001"

logger = logging.getLogger(__name__)


def _configure_logger(logger: logging.Logger):
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter(
"%(asctime)s - %(funcName)s - %(levelname)s - %(message)s"
)
handler.setFormatter(formatter)
logger.addHandler(handler)


_configure_logger(logger)


def _create_aistudio_output_without_result(
    error_code: int, error_msg: str, *, log_id: Optional[str] = None
) -> dict:
resp = AIStudioNoResultResponse(
logId=log_id if log_id is not None else generate_log_id(),
errorCode=error_code,
errorMsg=error_msg,
)
return resp.model_dump()


def _add_primary_operations(app: fastapi.FastAPI) -> None:
def _create_handler(model_name: str):
def _handler(request: dict):
request_log_id = request.get("logId", generate_log_id())
logger.info(
"Gateway server starts processing %r request %s",
model_name,
request_log_id,
)
if "logId" in request:
logger.warning(
"Duplicate 'logId' field found in %r request %s",
model_name,
request_log_id,
)
request["logId"] = request_log_id

try:
output = triton_request(
triton_client,
model_name,
request,
request_kwargs=dict(
timeout=600,
client_timeout=600,
),
)
            except triton_grpc.InferenceServerException as e:
                if e.message() == "Deadline Exceeded":
                    logger.warning(
                        "Timeout when processing %r request %s",
                        model_name,
                        request_log_id,
                    )
                    status_code = 504
                    # `_create_aistudio_output_without_result` already returns
                    # a plain dict, so no further serialization is needed.
                    output = _create_aistudio_output_without_result(
                        504,
                        "Gateway timeout",
                        log_id=request_log_id,
                    )
                else:
                    logger.error(
                        "Failed to process %r request %s due to `InferenceServerException`: %s",
                        model_name,
                        request_log_id,
                        e,
                    )
                    status_code = 500
                    output = _create_aistudio_output_without_result(
                        500,
                        "Internal server error",
                        log_id=request_log_id,
                    )
                return JSONResponse(status_code=status_code, content=output)
            except Exception:
                logger.error(
                    "Failed to process %r request %s",
                    model_name,
                    request_log_id,
                    exc_info=True,
                )
                output = _create_aistudio_output_without_result(
                    500,
                    "Internal server error",
                    log_id=request_log_id,
                )
                return JSONResponse(status_code=500, content=output)
            if output["errorCode"] != 0:
                # The pipeline reported an error; return it without a result
                # payload.
                status_code = 500
                output = _create_aistudio_output_without_result(
                    output["errorCode"],
                    output["errorMsg"],
                    log_id=request_log_id,
                )
            else:
                status_code = 200
            return JSONResponse(status_code=status_code, content=output)

return _handler

for operation_name, (endpoint, _, _) in schema.PRIMARY_OPERATIONS.items():
# TODO: API docs
app.post(
endpoint,
operation_id=operation_name,
)(
_create_handler(endpoint[1:]),
)


app = fastapi.FastAPI()


@app.get(
"/health",
operation_id="checkHealth",
)
def check_health():
return _create_aistudio_output_without_result(0, "Healthy")


_add_primary_operations(app)


# HACK: keep the Docker health check's polling of /health out of the access log.
# https://github.com/encode/starlette/issues/864
class _EndpointFilter(logging.Filter):
def filter(self, record: logging.LogRecord) -> bool:
return record.getMessage().find("/health") == -1


logging.getLogger("uvicorn.access").addFilter(_EndpointFilter())

# HACK: a single module-level client is shared by all request handlers; the
# keepalive timeout matches the 600 s request timeout used by the handlers.
triton_client: triton_grpc.InferenceServerClient = triton_grpc.InferenceServerClient(
TRITONSERVER_URL,
keepalive_options=triton_grpc.KeepAliveOptions(keepalive_timeout_ms=600000),
)
3 changes: 3 additions & 0 deletions deploy/paddleocr_vl_docker/hps/gateway/requirements.txt
@@ -0,0 +1,3 @@
fastapi == 0.123.6
paddlex[serving] >= 3.3.10
uvicorn == 0.35.0
5 changes: 5 additions & 0 deletions deploy/paddleocr_vl_docker/hps/prepare.sh
@@ -0,0 +1,5 @@
#!/usr/bin/env bash

set -e

# Download the PaddleX HPS PaddleOCR-VL SDK and install the vLLM pipeline
# configuration expected by the server.
wget https://paddle-model-ecology.bj.bcebos.com/paddlex/PaddleX3.0/deploy/paddlex_hps/public/sdks/v3.3/paddlex_hps_PaddleOCR-VL_sdk.tar.gz
tar -xf paddlex_hps_PaddleOCR-VL_sdk.tar.gz
cp ../pipeline_config_vllm.yaml paddlex_hps_PaddleOCR-VL_sdk/server/pipeline_config.yaml
8 changes: 8 additions & 0 deletions deploy/paddleocr_vl_docker/hps/tritonserver.Dockerfile
@@ -0,0 +1,8 @@
FROM ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:paddlex3.3-gpu
RUN apt-get update \
&& apt-get install -y --no-install-recommends curl \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY paddlex_hps_PaddleOCR-VL_sdk/server .
ENV PADDLEX_HPS_DEVICE_TYPE=gpu
CMD ["/bin/bash", "server.sh"]