43 changes: 43 additions & 0 deletions deploy/paddleocr_vl_docker/hps/README.md
@@ -0,0 +1,43 @@
# PaddleOCR-VL High-Performance Serving Deployment (Beta)

This directory provides a high-performance serving deployment solution for PaddleOCR-VL with support for concurrent request processing.

## Requirements

- x64 CPU
- NVIDIA GPU with Compute Capability >= 8.0 and < 12.0 (see the check below)
- NVIDIA driver with CUDA 12.6 support
- Docker >= 19.03

## Quick Start

Clone the PaddleOCR repository and switch to this directory:

```shell
git clone https://github.com/PaddlePaddle/PaddleOCR.git
cd PaddleOCR/deploy/paddleocr_vl_docker/hps
```

Download and copy the required files into the current directory:

```shell
bash prepare.sh
```

Start the services:

```shell
docker compose up
```

The command above starts three containers in sequence, each running one service:

- **`paddleocr-vlm-server`**: the vLLM-based VLM inference service.
- **`paddleocr-vl-tritonserver`**: the PaddleOCR-VL pipeline inference service, based on Triton Inference Server.
- **`paddleocr-vl-api`**: a gateway service implemented with FastAPI that forwards HTTP requests to the Triton Inference Server and wraps the returned results, simplifying client-side calls. **This service is the external entry point**; clients call it directly over HTTP (see the example below).

> The first startup downloads and builds the images automatically, which takes a while; subsequent startups reuse the local images and start much faster.
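
Once all services are healthy, requests can be sent to the gateway. The following is a minimal sketch, assuming the pipeline exposes the standard PaddleX `/layout-parsing` operation; `sample.png` and the `fileType` value of `1` (image input) are placeholders to adapt to your own input:

```shell
curl -s http://localhost:8080/layout-parsing \
  -H "Content-Type: application/json" \
  -d "{\"file\": \"$(base64 -w 0 sample.png)\", \"fileType\": 1}"
```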

## Adjusting the Service Configuration

tbd
51 changes: 51 additions & 0 deletions deploy/paddleocr_vl_docker/hps/compose.yaml
@@ -0,0 +1,51 @@
services:
paddleocr-vl-api:
build:
context: .
dockerfile: gateway.Dockerfile
container_name: paddleocr-vl-api
ports:
- 8080:8080
depends_on:
paddleocr-vl-tritonserver:
condition: service_healthy
restart: unless-stopped
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:8080/health || exit 1"]

paddleocr-vl-tritonserver:
build:
context: .
dockerfile: tritonserver.Dockerfile
container_name: paddleocr-vl-tritonserver
depends_on:
paddleocr-vlm-server:
condition: service_healthy
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ["0"]
capabilities: [gpu]
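              # Note: this service and paddleocr-vlm-server both reserve GPU 0,
              # so the two inference services share a single device.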
shm_size: 4gb
restart: unless-stopped
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:8000/v2/health/ready || exit 1"]

paddleocr-vlm-server:
image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleocr-genai-vllm-server:latest-offline
container_name: paddleocr-vlm-server
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ["0"]
capabilities: [gpu]
# TODO: Allow using a regular user
user: root
restart: unless-stopped
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:8080/health || exit 1"]
start_period: 300s
11 changes: 11 additions & 0 deletions deploy/paddleocr_vl_docker/hps/gateway.Dockerfile
@@ -0,0 +1,11 @@
FROM python:3.10-slim
RUN apt-get update \
&& apt-get install -y --no-install-recommends curl libgl1 \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY gateway .
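# Install the gateway dependencies plus the PaddleX HPS client SDK, which
# prepare.sh unpacks into the build context (bind-mounted here at build time).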
RUN --mount=type=bind,source=paddlex_hps_PaddleOCR-VL_sdk/client,target=/tmp/sdk \
python -m pip install -r requirements.txt \
&& python -m pip install -r /tmp/sdk/requirements.txt \
&& python -m pip install /tmp/sdk/paddlex_hps_client-*.whl
CMD ["uvicorn", "--host", "0.0.0.0", "--port", "8080", "app:app"]
172 changes: 172 additions & 0 deletions deploy/paddleocr_vl_docker/hps/gateway/app.py
@@ -0,0 +1,172 @@
#!/usr/bin/env python

# TODO:
# 1. Concurrency control
# 2. Timeout control
# 3. Separate infer and non-infer operations
# 4. Fix FastAPI encoding bug
# 5. Add exception handlers for a standardized error response

import logging
from typing import Optional

import fastapi
from fastapi.responses import JSONResponse
from paddlex_hps_client import triton_request
from paddlex.inference.serving.infra.models import AIStudioNoResultResponse
from paddlex.inference.serving.infra.utils import generate_log_id
from paddlex.inference.serving.schemas import paddleocr_vl as schema
from tritonclient import grpc as triton_grpc

TRITONSERVER_URL = "paddleocr-vl-tritonserver:8001"

logger = logging.getLogger(__name__)


def _configure_logger(logger: logging.Logger):
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter(
"%(asctime)s - %(funcName)s - %(levelname)s - %(message)s"
)
handler.setFormatter(formatter)
logger.addHandler(handler)


_configure_logger(logger)


def _create_aistudio_output_without_result(
    error_code: int, error_msg: str, *, log_id: Optional[str] = None
) -> dict:
resp = AIStudioNoResultResponse(
logId=log_id if log_id is not None else generate_log_id(),
errorCode=error_code,
errorMsg=error_msg,
)
return resp.model_dump()


def _add_primary_operations(app: fastapi.FastAPI) -> None:
def _create_handler(model_name: str):
def _handler(request: dict):
request_log_id = request.get("logId", generate_log_id())
logger.info(
"Gateway server starts processing %r request %s",
model_name,
request_log_id,
)
if "logId" in request:
logger.warning(
"Duplicate 'logId' field found in %r request %s",
model_name,
request_log_id,
)
request["logId"] = request_log_id

try:
output = triton_request(
triton_client,
model_name,
request,
request_kwargs=dict(
timeout=600,
client_timeout=600,
),
)
            except triton_grpc.InferenceServerException as e:
                if e.message() == "Deadline Exceeded":
                    logger.warning(
                        "Timeout when processing %r request %s",
                        model_name,
                        request_log_id,
                    )
                    status_code = 504
                    # `_create_aistudio_output_without_result` already returns
                    # a plain dict, so no further serialization is needed.
                    output = _create_aistudio_output_without_result(
                        504,
                        "Gateway timeout",
                        log_id=request_log_id,
                    )
                else:
                    logger.error(
                        "Failed to process %r request %s due to `InferenceServerException`: %s",
                        model_name,
                        request_log_id,
                        e,
                    )
                    status_code = 500
                    output = _create_aistudio_output_without_result(
                        500,
                        "Internal server error",
                        log_id=request_log_id,
                    )
                return JSONResponse(status_code=status_code, content=output)
            except Exception:
                logger.error(
                    "Failed to process %r request %s",
                    model_name,
                    request_log_id,
                    exc_info=True,
                )
                output = _create_aistudio_output_without_result(
                    500,
                    "Internal server error",
                    log_id=request_log_id,
                )
                return JSONResponse(status_code=500, content=output)
            if output["errorCode"] != 0:
                # The pipeline reported an error; return it without a result
                # payload.
                status_code = 500
                output = _create_aistudio_output_without_result(
                    output["errorCode"],
                    output["errorMsg"],
                    log_id=request_log_id,
                )
            else:
                status_code = 200
            return JSONResponse(status_code=status_code, content=output)

return _handler

for operation_name, (endpoint, _, _) in schema.PRIMARY_OPERATIONS.items():
# TODO: API docs
app.post(
endpoint,
operation_id=operation_name,
)(
_create_handler(endpoint[1:]),
)


app = fastapi.FastAPI()


@app.get(
"/health",
operation_id="checkHealth",
)
def check_health():
return _create_aistudio_output_without_result(0, "Healthy")


_add_primary_operations(app)


# HACK: keep the Docker health check's polling of /health out of the access log.
# https://github.com/encode/starlette/issues/864
class _EndpointFilter(logging.Filter):
def filter(self, record: logging.LogRecord) -> bool:
return record.getMessage().find("/health") == -1


logging.getLogger("uvicorn.access").addFilter(_EndpointFilter())

# HACK: a single module-level client is shared by all request handlers; the
# keepalive timeout matches the 600 s request timeout used by the handlers.
triton_client: triton_grpc.InferenceServerClient = triton_grpc.InferenceServerClient(
TRITONSERVER_URL,
keepalive_options=triton_grpc.KeepAliveOptions(keepalive_timeout_ms=600000),
)
3 changes: 3 additions & 0 deletions deploy/paddleocr_vl_docker/hps/gateway/requirements.txt
@@ -0,0 +1,3 @@
fastapi == 0.123.6
paddlex[serving] >= 3.3.10
uvicorn == 0.35.0
5 changes: 5 additions & 0 deletions deploy/paddleocr_vl_docker/hps/prepare.sh
@@ -0,0 +1,5 @@
#!/usr/bin/env bash

set -e

# Download the PaddleX HPS PaddleOCR-VL SDK and install the vLLM pipeline
# configuration expected by the server.
wget https://paddle-model-ecology.bj.bcebos.com/paddlex/PaddleX3.0/deploy/paddlex_hps/public/sdks/v3.3/paddlex_hps_PaddleOCR-VL_sdk.tar.gz
tar -xf paddlex_hps_PaddleOCR-VL_sdk.tar.gz
cp ../pipeline_config_vllm.yaml paddlex_hps_PaddleOCR-VL_sdk/server/pipeline_config.yaml
8 changes: 8 additions & 0 deletions deploy/paddleocr_vl_docker/hps/tritonserver.Dockerfile
@@ -0,0 +1,8 @@
FROM ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:paddlex3.3-gpu
RUN apt-get update \
&& apt-get install -y --no-install-recommends curl \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY paddlex_hps_PaddleOCR-VL_sdk/server .
ENV PADDLEX_HPS_DEVICE_TYPE=gpu
CMD ["/bin/bash", "server.sh"]