Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions docs/version3.x/pipeline_usage/seal_recognition.en.md
Original file line number Diff line number Diff line change
Expand Up @@ -699,6 +699,23 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">Inference Model</a>/<a href="https://p
<br />
<b>If you are more concerned with model accuracy, please choose a model with higher accuracy. If you are more concerned with inference speed, please choose a model with faster inference speed. If you are more concerned with model storage size, please choose a model with smaller storage size</b>.

## 1.1 Known Issues

⚠️ **Multi-page PDF Processing Issue (PaddleX v3.2.0/v3.2.1)**

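If you are using PaddleX v3.2.0 or v3.2.1, you may encounter an `IndexError: list index out of range` error when processing multi-page PDF files. This is a known bug in PaddleX. It has been fixed on the `release/3.2` branch, but the fix is not yet included in a published release, so it currently has to be installed from GitHub as shown below.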

**Solutions:**

1. **Recommended**: Install the PaddleX version with the fix
```bash
pip install 'git+https://github.com/PaddlePaddle/PaddleX.git@release/3.2#egg=paddlex[ocr-core]'
```

2. **Temporary workaround**: Split multi-page PDFs into single-page PDFs and process them separately

Related link: [PaddleX Fix Commit](https://github.com/PaddlePaddle/PaddleX/commit/bdcc1f7dc)


## 2. Quick Start

Expand Down
19 changes: 18 additions & 1 deletion docs/version3.x/pipeline_usage/seal_recognition.md
Original file line number Diff line number Diff line change
Expand Up @@ -700,7 +700,24 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">推理模型</a>/<a href="https://padd
</details>

<br />
<b>如您更考虑模型精度,请选择精度较高的模型,如您更考虑模型推理速度,请选择推理速度较快的模型,如您更考虑模型存储大小,请选择存储大小较小的模型</b>。
<b>如您更考虑模型精度,请选择精度较高的模型,如您更考虑模型推理速度,请选择推理速度较快的模型,如您更考虑模型存储大小,请选择存储大小较小的模型</b>。

## 1.1 已知问题

⚠️ **多页PDF处理问题 (PaddleX v3.2.0/v3.2.1)**

如果您使用 PaddleX v3.2.0 或 v3.2.1 版本,在处理多页 PDF 文件时可能会遇到 `IndexError: list index out of range` 错误。这是 PaddleX 的一个已知 bug,修复已合入 `release/3.2` 分支,但尚未包含在正式发布的版本中,因此目前需要按下文方式从 GitHub 安装。

**解决方案:**

1. **推荐方式**:安装包含修复的 PaddleX 版本
```bash
pip install 'git+https://github.com/PaddlePaddle/PaddleX.git@release/3.2#egg=paddlex[ocr-core]'
```

2. **临时方案**:将多页PDF拆分为单页PDF分别处理

相关链接:[PaddleX修复提交](https://github.com/PaddlePaddle/PaddleX/commit/bdcc1f7dc)

## 2. 快速开始

Expand Down
86 changes: 68 additions & 18 deletions paddleocr/_pipelines/seal_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import warnings

from .._utils.cli import (
add_simple_inference_args,
get_subcommand_args,
Expand Down Expand Up @@ -51,6 +53,8 @@ def __init__(
seal_rec_score_thresh=None,
**kwargs,
):
# Check for known PaddleX bugs and warn users
self._check_paddlex_version()

self._params = {
"doc_orientation_classify_model_name": doc_orientation_classify_model_name,
Expand Down Expand Up @@ -80,6 +84,33 @@ def __init__(
}
super().__init__(**kwargs)

def _check_paddlex_version(self):
    """Warn when the installed PaddleX version has the multi-page PDF bug.

    PaddleX 3.2.0 through 3.2.1 can raise
    ``IndexError: list index out of range`` when seal recognition processes
    a multi-page PDF. When such a version is detected, a ``UserWarning``
    listing the available workarounds is emitted.

    Version detection is best-effort: if PaddleX or ``packaging`` cannot be
    imported, or the version attribute is missing, is not a string, or
    cannot be parsed, the check is skipped silently so that an advisory
    message can never prevent the pipeline from being constructed.
    """
    try:
        import paddlex
        from packaging.version import parse

        installed = parse(paddlex.__version__)
        # Check for the multi-page PDF bug (fixed in commit bdcc1f7dc, not yet released)
        affected = parse("3.2.0") <= installed <= parse("3.2.1")
    except (ImportError, AttributeError, TypeError, ValueError):
        # ImportError/AttributeError: PaddleX (or packaging) is not installed,
        # or no version is exposed. TypeError/ValueError: the version is not
        # a string or is not a valid version (packaging's InvalidVersion is a
        # ValueError subclass).
        return

    if not affected:
        return

    # stacklevel=3 skips this helper and the __init__ that calls it, so the
    # warning is attributed to the code that constructs the pipeline.
    warnings.warn(
        f"\nDetected PaddleX version {paddlex.__version__} which contains a known bug "
        "that causes 'IndexError: list index out of range' when processing multi-page PDFs "
        "with seal recognition enabled.\n\n"
        "This bug has been fixed in PaddleX but not yet released. "
        "If you encounter this error, you have two options:\n"
        "1. Install the fixed version from GitHub:\n"
        "   pip install 'git+https://github.com/PaddlePaddle/PaddleX.git@release/3.2#egg=paddlex[ocr-core]'\n"
        "2. Process single-page PDFs only, or extract pages individually.\n\n"
        "For more details, see: https://github.com/PaddlePaddle/PaddleX/commit/bdcc1f7dc",
        UserWarning,
        stacklevel=3,
    )

@property
def _paddlex_pipeline_name(self):
return "seal_recognition"
Expand All @@ -104,24 +135,43 @@ def predict_iter(
seal_rec_score_thresh=None,
**kwargs,
):
return self.paddlex_pipeline.predict(
input,
use_doc_orientation_classify=use_doc_orientation_classify,
use_doc_unwarping=use_doc_unwarping,
use_layout_detection=use_layout_detection,
layout_det_res=layout_det_res,
layout_threshold=layout_threshold,
layout_nms=layout_nms,
layout_unclip_ratio=layout_unclip_ratio,
layout_merge_bboxes_mode=layout_merge_bboxes_mode,
seal_det_limit_side_len=seal_det_limit_side_len,
seal_det_limit_type=seal_det_limit_type,
seal_det_thresh=seal_det_thresh,
seal_det_box_thresh=seal_det_box_thresh,
seal_det_unclip_ratio=seal_det_unclip_ratio,
seal_rec_score_thresh=seal_rec_score_thresh,
**kwargs,
)
try:
yield from self.paddlex_pipeline.predict(
input,
use_doc_orientation_classify=use_doc_orientation_classify,
use_doc_unwarping=use_doc_unwarping,
use_layout_detection=use_layout_detection,
layout_det_res=layout_det_res,
layout_threshold=layout_threshold,
layout_nms=layout_nms,
layout_unclip_ratio=layout_unclip_ratio,
layout_merge_bboxes_mode=layout_merge_bboxes_mode,
seal_det_limit_side_len=seal_det_limit_side_len,
seal_det_limit_type=seal_det_limit_type,
seal_det_thresh=seal_det_thresh,
seal_det_box_thresh=seal_det_box_thresh,
seal_det_unclip_ratio=seal_det_unclip_ratio,
seal_rec_score_thresh=seal_rec_score_thresh,
**kwargs,
)
except IndexError as e:
# Check if this is the known multi-page PDF bug
if "list index out of range" in str(e):
import paddlex
from packaging.version import parse

paddlex_version = parse(paddlex.__version__)
if parse("3.2.0") <= paddlex_version <= parse("3.2.1"):
raise RuntimeError(
f"Encountered a known bug in PaddleX {paddlex.__version__} when processing multi-page PDFs "
"with seal recognition.\n\n"
"To fix this issue, please install the fixed version:\n"
" pip install 'git+https://github.com/PaddlePaddle/PaddleX.git@release/3.2#egg=paddlex[ocr-core]'\n\n"
"Alternatively, process single-page PDFs only.\n\n"
"For more details, see: https://github.com/PaddlePaddle/PaddleX/commit/bdcc1f7dc"
) from e
# Re-raise if it's a different error
raise

def predict(
self,
Expand Down
35 changes: 35 additions & 0 deletions tests/pipelines/test_seal_rec.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,38 @@ def test_predict_params(
"dummy_path",
params,
)


def test_paddlex_version_warning(monkeypatch) -> None:
    """
    Constructing SealRecognition must warn about the multi-page PDF bug
    when the importable ``paddlex`` reports an affected version.
    """
    import sys
    import warnings

    # Stand-in for the real package: only the version attribute is provided.
    class MockPaddleX:
        __version__ = "3.2.0"

    monkeypatch.setitem(sys.modules, "paddlex", MockPaddleX())

    with warnings.catch_warnings(record=True) as recorded:
        # Record every warning, including ones already reported elsewhere.
        warnings.simplefilter("always")
        # Imported inside the recording context, after the stand-in is installed.
        from paddleocr._pipelines.seal_recognition import SealRecognition

        try:
            SealRecognition()
        except Exception:
            # Construction may fail because the stand-in is not a real PaddleX;
            # only the warnings recorded up to that point matter here.
            pass

    messages = [str(entry.message) for entry in recorded]

    # At least one warning must have been recorded at all.
    assert len(recorded) >= 1

    found = False
    for text in messages:
        if "PaddleX version" in text and "multi-page PDFs" in text:
            found = True
            break
    assert found, f"Expected warning about PaddleX version bug, got: {messages}"
Loading