docling-project · arceushui · Dec 17, 2025
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
@@ -84,6 +84,7 @@ class OcrOptions(BaseOptions):
     bitmap_area_threshold: float = (
         0.05  # percentage of the area for a bitmap to processed with OCR
     )
+    force_low_confidence_ocr: bool = False  # If enabled, low-confidence programmatic cells are processed with OCR
 
 
 class OcrAutoOptions(OcrOptions):

diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py
@@ -90,11 +90,23 @@ def find_ocr_rects(size, bitmap_rects):
             bitmap_rects = page._backend.get_bitmap_rects()
         else:
             bitmap_rects = []
+
+        force_ocr_on_low_confidence_cells = False
+
+        if self.options.force_low_confidence_ocr:
+            for cell in page.cells:
+                if cell.confidence == 0.0:
+                    force_ocr_on_low_confidence_cells = True
+                    break
+
         coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
 
         # return full-page rectangle if page is dominantly covered with bitmaps
-        if self.options.force_full_page_ocr or coverage > max(
-            BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
+        if (
+            self.options.force_full_page_ocr
+            or coverage
+            > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold)
+            or force_ocr_on_low_confidence_cells
         ):
             return [
                 BoundingBox(
@@ -145,6 +157,13 @@ def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
         # Get existing cells from the read-only property
         existing_cells = page.cells
 
+        force_ocr_on_low_confidence_cells = False
+        if self.options.force_low_confidence_ocr:
+            existing_cells_length = len(existing_cells)
+            existing_cells = [cell for cell in existing_cells if cell.confidence != 0.0]
+            if len(existing_cells) < existing_cells_length:
+                force_ocr_on_low_confidence_cells = True
+
         # Combine existing and OCR cells with overlap filtering
         final_cells = self._combine_cells(existing_cells, ocr_cells)
 
@@ -158,7 +177,7 @@ def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
         # unreliable. Filter out cells where from_ocr=False, keeping any OCR-
         # generated cells. This ensures downstream components (e.g., table
         # structure model) fall back to OCR-extracted textline cells.
-        if self.options.force_full_page_ocr:
+        if self.options.force_full_page_ocr or force_ocr_on_low_confidence_cells:
             page.parsed_page.word_cells = [
                 c for c in page.parsed_page.word_cells if c.from_ocr
             ]

diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py
@@ -27,7 +27,7 @@ def __init__(self, options: PagePreprocessingOptions):
         self.options = options
 
         # Pre-compiled regex patterns for efficiency
-        self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
+        self.GLYPH_RE = re.compile(r"GLYPH<[^>]+>")  # anything between < and >
         self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
         self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
         self.SLASH_NUMBER_GARBAGE_RE = re.compile(
@@ -76,6 +76,7 @@ def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
         text_scores = []
         for c in page.cells:
             score = self.rate_text_quality(c.text)
+            c.confidence = score
             text_scores.append(score)
 
         with warnings.catch_warnings():