ispras · oksidgy · Aug 21, 2025 · Aug 12, 2025 · Aug 20, 2025
diff --git a/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py b/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py
@@ -1,6 +1,7 @@
 from collections import namedtuple
 from typing import List, Optional
 
+import numpy as np
 from dedocutils.data_structures import BBox
 from numpy import ndarray
 
@@ -79,7 +80,7 @@ def parse_contours_to_tree(contours: List, hierarchy: List, *, config: dict) ->
         if len(contours) == 0:
             return table_tree
 
-        bbox = [cv2.boundingRect(c) for c in contours[0]][0]   # [x_begin, y_begin, width, height]
+        bbox = cv2.boundingRect(contours[0].astype(np.int32))   # [x_begin, y_begin, width, height]
         table_tree.cell_box = BBox(x_top_left=bbox[0], y_top_left=bbox[1], width=bbox[2], height=bbox[3])
 
         table_tree = table_tree.__build_childs(table_tree, hierarchy, contours)
@@ -101,7 +102,7 @@ def __build_childs(self, cur: "TableTree", hierarchy: List, contours: List) -> "
         list_childs = []
         for i, h in enumerate(hierarchy[0]):
             if h[3] == cur.id_contours:
-                bbox = cv2.boundingRect(contours[i])  # [x_begin, y_begin, width, height]
+                bbox = cv2.boundingRect(contours[i].astype(np.int32))  # [x_begin, y_begin, width, height]
-                # Эвристика №1 на ячейку
+                # Heuristic 1 for a cell: minimal width and height
                 if bbox[2] < self.min_w_cell or bbox[3] < self.min_h_cell:
                     if self.config.get("debug_mode", False):

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
@@ -53,6 +53,7 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
                                                                           config=self.config)
         self.binarizer = AdaptiveBinarizer()
         self.ocr = OCRLineExtractor(config=self.config)
+        self.page_number = None
 
     def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
         return super().read(file_path, parameters)
@@ -68,6 +69,7 @@ def _process_one_page(self,
         from dedoc.utils.parameter_utils import get_path_param
 
         #  --- Step 1: correct orientation and detect column count ---
+        self.page_number = page_number
         rotated_image, is_one_column_document, angle = self._detect_column_count_and_orientation(image, parameters)
         if self.config.get("debug_mode", False):
             self.logger.info(f"Angle page rotation = {angle}")
@@ -105,7 +107,6 @@ def _detect_column_count_and_orientation(self, image: ndarray, parameters: Param
         Return: rotated_image and indicator if the page is one-column
         """
         import os
-        from datetime import datetime
         import cv2
         from dedoc.utils.parameter_utils import get_path_param
 
@@ -124,7 +125,7 @@ def _detect_column_count_and_orientation(self, image: ndarray, parameters: Param
 
         if self.config.get("debug_mode", False):
             debug_dir = get_path_param(self.config, "path_debug")
-            img_path = os.path.join(debug_dir, f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg")
+            img_path = os.path.join(debug_dir, f"page-{self.page_number}_result_orientation.jpg")
             self.logger.info(f"Save image to {img_path}")
             cv2.imwrite(img_path, rotated_image)
 

diff --git a/...reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py b/...reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py
@@ -112,19 +112,29 @@ def __delete_ref_table(self, lines: List[LineWithMeta], table_name: str) -> None
     @staticmethod
     def __get_width_cell_wo_separating(row: List[Cell]) -> List[int]:
+        """
+        Return the widths of the cells of the row, from left to right.
+
+        Consecutive columns with the same uuid are treated as one cell: its width is measured
+        from the left edge of the first column to the right edge of the last one.
+        An empty row gives an empty list.
+        """
         widths = []
-        prev_uid = None
-        start = None
-        end = None
-        for cell_id, cell in enumerate(row):
-            if prev_uid is None:
-                start = cell.bbox.x_top_left
-                prev_uid = cell.uuid
-            elif prev_uid != cell.uuid:
-                widths.append(end - start)
-                start = cell.bbox.x_top_left
-            end = cell.bbox.x_bottom_right
-            if cell_id == len(row) - 1:
-                widths.append(end - start)
+        prev_cell_uuid = None
+        cell_x_left = None
+        cell_x_right = None
+        for column_num, cell in enumerate(row):
+            # a cell starts at the first column and wherever the uuid differs from the previous column
+            if column_num == 0 or prev_cell_uuid != cell.uuid:
+                if column_num > 0:  # close the previous cell
+                    widths.append(cell_x_right - cell_x_left)
+                cell_x_left = cell.bbox.x_top_left
+                prev_cell_uuid = cell.uuid  # compare with the previous column, not with the first one
+
+            cell_x_right = cell.bbox.x_bottom_right
+
+        if row:  # close the last cell, this also covers a row with a single column
+            widths.append(cell_x_right - cell_x_left)
+
         return widths
 
     def __is_equal_width_cells(self, table_part_1: List[List[Cell]], table_part_2: List[List[Cell]]) -> bool:

diff --git a/...e_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/...e_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py
@@ -21,13 +21,13 @@ class OnePageTableExtractor(BaseTableExtractor):
     def __init__(self, *, config: dict, logger: logging.Logger) -> None:
         super().__init__(config=config, logger=logger)
 
+        self.language = "rus"
+        self.page_number = None
         self.image = None
-        self.page_number = 0
         self.table_header_extractor = TableHeaderExtractor(logger=self.logger)
         self.count_vertical_extended = 0
         self.splitter = CellSplitter()
         self.table_options = TableTypeAdditionalOptions()
-        self.language = "rus"
 
     def extract_onepage_tables_from_image(self, image: np.ndarray, page_number: int, language: str, table_type: str) -> List[ScanTable]:
         """
@@ -89,20 +89,18 @@ def __build_structure_table_from_tree(self, tables_tree: TableTree, table_type:
                 tables.append(table)
             except Exception as ex:
                 self.logger.warning(f"Warning: unrecognized table into page {self.page_number}. {ex}")
-                if self.config.get("debug_mode", False):
-                    raise ex
         return tables
 
     def handle_cells(self, cells: List[List[Cell]], table_type: str = "") -> List[List[Cell]]:
-        # Эвристика 1: Таблица должна состоять из 1 строк и более
+        # Heuristic 1: The table must have 1 or more rows.
         if len(cells) < 1:
-            raise RecognizeError("Invalid recognized table")
+            raise RecognizeError("Invalid recognized table. Heuristic 1: The table must have 1 or more rows.")
 
         cells = self.splitter.split(cells=cells)
 
-        # Эвристика 2: таблица должна иметь больше одного столбца
+        # Heuristic 2: The table must have more than one column.
         if cells[0] == [] or (len(cells[0]) <= 1 and self.table_options.detect_one_cell_table not in table_type):
-            raise RecognizeError("Invalid recognized table")
+            raise RecognizeError("Invalid recognized table. Heuristic 2: The table must have more than one column.")
 
         # Postprocess table
         if self.table_options.split_last_column in table_type:

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import time
+import traceback
 from typing import List, Optional, Tuple
 
 import cv2
@@ -37,9 +38,9 @@ def recognize_tables_from_image(self, image: np.ndarray, page_number: int, langu
             cleaned_image, scan_tables = self.__rec_tables_from_img(image, page_num=page_number, language=language, table_type=table_type)
             return cleaned_image, scan_tables
         except Exception as ex:
-            logging.warning(ex)
-            if self.config.get("debug_mode", False):
-                raise ex
+            traceback_message = "".join(traceback.format_exception(type(ex), value=ex, tb=ex.__traceback__))
+            logging.warning(traceback_message)
+
             return image, []
 
     def __rec_tables_from_img(self, src_image: np.ndarray, page_num: int, language: str, table_type: str) -> Tuple[np.ndarray, List[ScanTable]]:
@@ -95,11 +96,11 @@ def __clean_image(image: np.ndarray, bbox: BBox, color: int = 255) -> np.ndarray
     def __filter_bad_tables(self, tables: List[ScanTable], image: np.ndarray) -> List[ScanTable]:
         filtered = []
         for table in tables:
-            if not self.__if_not_table(table, image):
+            if not self.__is_not_table(table, image):
                 filtered.append(table)
         return filtered
 
-    def __if_not_table(self, table: ScanTable, image: np.ndarray) -> bool:
+    def __is_not_table(self, table: ScanTable, image: np.ndarray) -> bool:
         bbox = table.location.bbox
         height, width = image.shape
         table_image = image[max(bbox.y_top_left, 0): min(bbox.y_bottom_right, height), max(bbox.x_top_left, 0): min(bbox.x_bottom_right, width)]

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py
@@ -123,7 +123,7 @@ def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contou
     filtered_cont_id = []
     for i, c in enumerate(table_contours):
         # Returns the location and width,height for table contour
-        x, y, w, h = cv2.boundingRect(c)
+        x, y, w, h = cv2.boundingRect(c.astype(np.int32))
         table_image = img[y:y + h, x:x + w]
 
         # filter contours which not similar a table contour
@@ -134,7 +134,7 @@ def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contou
         return contours, hierarchy
 
     for c in contours[np.array(filtered_cont_id)]:
-        x, y, w, h = cv2.boundingRect(c)
+        x, y, w, h = cv2.boundingRect(c.astype(np.int32))
         cv2.rectangle(img_with_contours, (x, y), (x + w, y + h), color=(0, 0, 0), thickness=5)
 
     if config.get("debug_mode", False):
@@ -152,6 +152,10 @@ def __filter_table(image: np.ndarray, table_image: np.ndarray) -> bool:
     table_area = table_image.shape[0] * table_image.shape[1]
     image_area = image.shape[0] * image.shape[1]
 
+    config = get_config()
+    if config.get("debug_mode", False):
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "table_image.jpg"), table_image)
+
     res = (white_mean < 0.5) or (black_mean > 0.3) or (std < 30) or (mean < 150) or (mean < 200 and std < 80) or (table_area < image_area * 0.2)
     return res
 

diff --git a/tests/api_tests/test_api_module_table_recognizer.py b/tests/api_tests/test_api_module_table_recognizer.py
@@ -273,3 +273,23 @@ def test_multipage_gost_table_with_text_layer_and_pages_param(self) -> None:
                 "page_height": 841
             }
             self._test_bbox_annotations(result["content"]["tables"][0]["cells"][-1][0]["lines"][0], target_bbox_dict_2)
+
+    def test_with_table_wo_external_bounds_1(self) -> None:
+        file_name = os.path.join("..", "lising", "russian_invoice.jpg")
+        result = self._send_request(file_name, data=dict(table_type="wo_external_bounds", language="rus+eng"))
+
+        self.assertEqual(1, len(result["content"]["tables"]))
+        table = result["content"]["tables"][0]["cells"]
+        row0 = self._get_text_of_row(table[0])
+
+        self.assertEqual(row0[:2], ["Сумма\nпрописью\nв валюте", "Две тысячи сто шестьдесят евро 00 евроцентов"])
+
+    def test_with_table_wo_external_bounds_2(self) -> None:
+        result = self._send_request("2312.pdf", data=dict(table_type="wo_external_bounds", language="rus", pages=":12"))
+
+        self.assertEqual(2, len(result["content"]["tables"]))
+
+        table = result["content"]["tables"][0]["cells"]
+        row0 = self._get_text_of_row(table[0])
+
+        self.assertEqual(row0[:2], ["Номер", "Извещения\nмореплавателям"])
diff --git a/tests/data/lising/platezhka.jpg → tests/data/lising/russian_invoice.jpg b/tests/data/lising/platezhka.jpg → tests/data/lising/russian_invoice.jpg
diff --git a/tests/data/tables/2312.pdf b/tests/data/tables/2312.pdf
diff --git a/tests/unit_tests/test_module_table_detection.py b/tests/unit_tests/test_module_table_detection.py
@@ -24,7 +24,7 @@ def get_table(self, image: np.ndarray, language: str = "rus", table_type: str =
         return tables
 
     def test_table_wo_external_bounds(self) -> None:
-        path_image = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "lising", "platezhka.jpg"))
+        path_image = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "lising", "russian_invoice.jpg"))
 
         image = cv2.imread(path_image, 0)
 
@@ -39,7 +39,7 @@ def test_table_wo_external_bounds(self) -> None:
         self.assertTrue(equal_with_eps(bbox.height, 754, 10))
 
     def test_table_split_right_column(self) -> None:
-        path_image = get_full_path("data/lising/platezhka.jpg")
+        path_image = get_full_path("data/lising/russian_invoice.jpg")
 
         image = cv2.imread(path_image, 0)
 
@@ -53,7 +53,7 @@ def test_table_split_right_column(self) -> None:
-        self.assertTrue(tables[0].cells[10][-1].get_text(), "30110978700000070815")
+        self.assertEqual("30110978700000070815", tables[0].cells[10][-1].get_text())
 
     def test_table_extract_one_cell_and_one_cell_tables(self) -> None:
-        path_image = get_full_path("data/lising/platezhka.jpg")
+        path_image = get_full_path("data/lising/russian_invoice.jpg")
         image = cv2.imread(path_image, 0)
 
         tables = self.get_table(image, "rus+eng", table_type="table_wo_external_bounds+one_cell_table")