Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docling/datamodel/pipeline_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ class OcrOptions(BaseOptions):
bitmap_area_threshold: float = (
0.05 # percentage of the area for a bitmap to be processed with OCR
)
force_low_confidence_ocr: bool = False # If enabled, low-confidence programmatic cells are processed with OCR


class OcrAutoOptions(OcrOptions):
Expand Down
25 changes: 22 additions & 3 deletions docling/models/base_ocr_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,11 +90,23 @@ def find_ocr_rects(size, bitmap_rects):
bitmap_rects = page._backend.get_bitmap_rects()
else:
bitmap_rects = []

force_ocr_on_low_confidence_cells = False

if self.options.force_low_confidence_ocr:
for cell in page.cells:
if cell.confidence == 0.0:
force_ocr_on_low_confidence_cells = True
break

coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)

# return full-page rectangle if full-page OCR is forced, the page is dominantly covered with bitmaps, or low-confidence cells force OCR
if self.options.force_full_page_ocr or coverage > max(
BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
if (
self.options.force_full_page_ocr
or coverage
> max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold)
or force_ocr_on_low_confidence_cells
):
return [
BoundingBox(
Expand Down Expand Up @@ -145,6 +157,13 @@ def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
# Get existing cells from the read-only property
existing_cells = page.cells

force_ocr_on_low_confidence_cells = False
if self.options.force_low_confidence_ocr:
existing_cells_length = len(existing_cells)
existing_cells = [cell for cell in existing_cells if cell.confidence != 0.0]
if len(existing_cells) < existing_cells_length:
force_ocr_on_low_confidence_cells = True

# Combine existing and OCR cells with overlap filtering
final_cells = self._combine_cells(existing_cells, ocr_cells)

Expand All @@ -158,7 +177,7 @@ def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
# unreliable. Filter out cells where from_ocr=False, keeping any OCR-
# generated cells. This ensures downstream components (e.g., table
# structure model) fall back to OCR-extracted textline cells.
if self.options.force_full_page_ocr:
if self.options.force_full_page_ocr or force_ocr_on_low_confidence_cells:
page.parsed_page.word_cells = [
c for c in page.parsed_page.word_cells if c.from_ocr
]
Expand Down
3 changes: 2 additions & 1 deletion docling/models/page_preprocessing_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def __init__(self, options: PagePreprocessingOptions):
self.options = options

# Pre-compiled regex patterns for efficiency
self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
self.GLYPH_RE = re.compile(r"GLYPH<[^>]+>") # anything between < and >
self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
self.SLASH_NUMBER_GARBAGE_RE = re.compile(
Expand Down Expand Up @@ -76,6 +76,7 @@ def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
text_scores = []
for c in page.cells:
score = self.rate_text_quality(c.text)
c.confidence = score
text_scores.append(score)

with warnings.catch_warnings():
Expand Down