Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions dedoc/readers/pdf_reader/data_classes/tables/table_tree.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from collections import namedtuple
from typing import List, Optional

import numpy as np
from dedocutils.data_structures import BBox
from numpy import ndarray

Expand Down Expand Up @@ -79,7 +80,7 @@ def parse_contours_to_tree(contours: List, hierarchy: List, *, config: dict) ->
if len(contours) == 0:
return table_tree

bbox = [cv2.boundingRect(c) for c in contours[0]][0] # [x_begin, y_begin, width, height]
bbox = cv2.boundingRect(contours[0].astype(np.int32)) # [x_begin, y_begin, width, height]
table_tree.cell_box = BBox(x_top_left=bbox[0], y_top_left=bbox[1], width=bbox[2], height=bbox[3])

table_tree = table_tree.__build_childs(table_tree, hierarchy, contours)
Expand All @@ -101,7 +102,7 @@ def __build_childs(self, cur: "TableTree", hierarchy: List, contours: List) -> "
list_childs = []
for i, h in enumerate(hierarchy[0]):
if h[3] == cur.id_contours:
bbox = cv2.boundingRect(contours[i]) # [x_begin, y_begin, width, height]
bbox = cv2.boundingRect(contours[i].astype(np.int32)) # [x_begin, y_begin, width, height]
# Heuristic #1 for a cell
if bbox[2] < self.min_w_cell or bbox[3] < self.min_h_cell:
if self.config.get("debug_mode", False):
Expand Down
5 changes: 3 additions & 2 deletions dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
config=self.config)
self.binarizer = AdaptiveBinarizer()
self.ocr = OCRLineExtractor(config=self.config)
self.page_number = None

def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
    """
    Read the document by delegating to the parent reader's ``read``.

    :param file_path: path to the file, passed through unchanged
    :param parameters: optional reading parameters, passed through unchanged
    :return: the document produced by the parent implementation
    """
    document = super().read(file_path, parameters)
    return document
Expand All @@ -68,6 +69,7 @@ def _process_one_page(self,
from dedoc.utils.parameter_utils import get_path_param

# --- Step 1: correct orientation and detect column count ---
self.page_number = page_number
rotated_image, is_one_column_document, angle = self._detect_column_count_and_orientation(image, parameters)
if self.config.get("debug_mode", False):
self.logger.info(f"Angle page rotation = {angle}")
Expand Down Expand Up @@ -105,7 +107,6 @@ def _detect_column_count_and_orientation(self, image: ndarray, parameters: Param
Return: rotated_image and indicator if the page is one-column
"""
import os
from datetime import datetime
import cv2
from dedoc.utils.parameter_utils import get_path_param

Expand All @@ -124,7 +125,7 @@ def _detect_column_count_and_orientation(self, image: ndarray, parameters: Param

if self.config.get("debug_mode", False):
debug_dir = get_path_param(self.config, "path_debug")
img_path = os.path.join(debug_dir, f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg")
img_path = os.path.join(debug_dir, f"page-{self.page_number}_result_orientation.jpg")
self.logger.info(f"Save image to {img_path}")
cv2.imwrite(img_path, rotated_image)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,19 +112,25 @@ def __delete_ref_table(self, lines: List[LineWithMeta], table_name: str) -> None
@staticmethod
def __get_width_cell_wo_separating(row: "List[Cell]") -> List[int]:
    """
    Compute the widths of the cells of one table row, counting consecutive columns that
    share the same ``uuid`` as a single (merged) cell.

    :param row: the row's cells, one item per column; every item must expose ``uuid`` and
        ``bbox.x_top_left`` / ``bbox.x_bottom_right``
    :return: one width per distinct cell, ordered left to right; an empty list for an empty row
    """
    widths = []
    if not row:
        return widths

    first_cell = row[0]
    current_uuid = first_cell.uuid
    cell_x_left = first_cell.bbox.x_top_left
    cell_x_right = first_cell.bbox.x_bottom_right

    for cell in row[1:]:
        if cell.uuid != current_uuid:  # a new cell starts, so close the previous one
            widths.append(cell_x_right - cell_x_left)
            cell_x_left = cell.bbox.x_top_left
            # Track the cell being accumulated: later columns must be compared with it,
            # not with the first cell of the row, otherwise merged cells get split.
            current_uuid = cell.uuid

        cell_x_right = cell.bbox.x_bottom_right

    # Close the last open cell; this also handles a single-column row.
    widths.append(cell_x_right - cell_x_left)

    return widths

def __is_equal_width_cells(self, table_part_1: List[List[Cell]], table_part_2: List[List[Cell]]) -> bool:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ class OnePageTableExtractor(BaseTableExtractor):
def __init__(self, *, config: dict, logger: logging.Logger) -> None:
    """
    Set up the per-page state and the helper objects used by the extractor.

    :param config: configuration dictionary, forwarded to the base-class initializer
    :param logger: logger, forwarded to the base-class initializer
    """
    super().__init__(config=config, logger=logger)

    # Per-page state; each attribute is assigned exactly once here.
    self.image = None
    self.page_number = 0
    self.language = "rus"
    self.count_vertical_extended = 0

    # Helper objects.
    # NOTE(review): self.logger is presumably set by the base-class initializer - confirm.
    self.table_header_extractor = TableHeaderExtractor(logger=self.logger)
    self.splitter = CellSplitter()
    self.table_options = TableTypeAdditionalOptions()

def extract_onepage_tables_from_image(self, image: np.ndarray, page_number: int, language: str, table_type: str) -> List[ScanTable]:
"""
Expand Down Expand Up @@ -89,20 +89,18 @@ def __build_structure_table_from_tree(self, tables_tree: TableTree, table_type:
tables.append(table)
except Exception as ex:
self.logger.warning(f"Warning: unrecognized table into page {self.page_number}. {ex}")
if self.config.get("debug_mode", False):
raise ex
return tables

def handle_cells(self, cells: List[List[Cell]], table_type: str = "") -> List[List[Cell]]:
# Эвристика 1: Таблица должна состоять из 1 строк и более
# Heuristic 1: The table must have 1 or more rows.
if len(cells) < 1:
raise RecognizeError("Invalid recognized table")
raise RecognizeError("Invalid recognized table. Heuristic 1: The table must have 1 or more rows.")

cells = self.splitter.split(cells=cells)

# Эвристика 2: таблица должна иметь больше одного столбца
# Heuristic 2: The table must have more than one column.
if cells[0] == [] or (len(cells[0]) <= 1 and self.table_options.detect_one_cell_table not in table_type):
raise RecognizeError("Invalid recognized table")
raise RecognizeError("Invalid recognized table. Heuristic 2: The table must have more than one column.")

# Postprocess table
if self.table_options.split_last_column in table_type:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import os
import time
import traceback
from typing import List, Optional, Tuple

import cv2
Expand Down Expand Up @@ -37,9 +38,9 @@ def recognize_tables_from_image(self, image: np.ndarray, page_number: int, langu
cleaned_image, scan_tables = self.__rec_tables_from_img(image, page_num=page_number, language=language, table_type=table_type)
return cleaned_image, scan_tables
except Exception as ex:
logging.warning(ex)
if self.config.get("debug_mode", False):
raise ex
traceback_message = "".join(traceback.format_exception(type(ex), value=ex, tb=ex.__traceback__))
logging.warning(traceback_message)

return image, []

def __rec_tables_from_img(self, src_image: np.ndarray, page_num: int, language: str, table_type: str) -> Tuple[np.ndarray, List[ScanTable]]:
Expand Down Expand Up @@ -95,11 +96,11 @@ def __clean_image(image: np.ndarray, bbox: BBox, color: int = 255) -> np.ndarray
def __filter_bad_tables(self, tables: List[ScanTable], image: np.ndarray) -> List[ScanTable]:
    """
    Keep only the tables that ``__is_not_table`` does not reject.

    :param tables: candidate tables
    :param image: image passed to ``__is_not_table`` together with each table
    :return: the tables for which ``__is_not_table`` returned a falsy value, in their original order
    """
    return [table for table in tables if not self.__is_not_table(table, image)]

def __if_not_table(self, table: ScanTable, image: np.ndarray) -> bool:
def __is_not_table(self, table: ScanTable, image: np.ndarray) -> bool:
bbox = table.location.bbox
height, width = image.shape
table_image = image[max(bbox.y_top_left, 0): min(bbox.y_bottom_right, height), max(bbox.x_top_left, 0): min(bbox.x_bottom_right, width)]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contou
filtered_cont_id = []
for i, c in enumerate(table_contours):
# Returns the location and width,height for table contour
x, y, w, h = cv2.boundingRect(c)
x, y, w, h = cv2.boundingRect(c.astype(np.int32))
table_image = img[y:y + h, x:x + w]

# filter out contours that are not similar to a table contour
Expand All @@ -134,7 +134,7 @@ def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contou
return contours, hierarchy

for c in contours[np.array(filtered_cont_id)]:
x, y, w, h = cv2.boundingRect(c)
x, y, w, h = cv2.boundingRect(c.astype(np.int32))
cv2.rectangle(img_with_contours, (x, y), (x + w, y + h), color=(0, 0, 0), thickness=5)

if config.get("debug_mode", False):
Expand All @@ -152,6 +152,10 @@ def __filter_table(image: np.ndarray, table_image: np.ndarray) -> bool:
table_area = table_image.shape[0] * table_image.shape[1]
image_area = image.shape[0] * image.shape[1]

config = get_config()
if config.get("debug_mode", False):
cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "table_image.jpg"), table_image)

res = (white_mean < 0.5) or (black_mean > 0.3) or (std < 30) or (mean < 150) or (mean < 200 and std < 80) or (table_area < image_area * 0.2)
return res

Expand Down
20 changes: 20 additions & 0 deletions tests/api_tests/test_api_module_table_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,3 +273,23 @@ def test_multipage_gost_table_with_text_layer_and_pages_param(self) -> None:
"page_height": 841
}
self._test_bbox_annotations(result["content"]["tables"][0]["cells"][-1][0]["lines"][0], target_bbox_dict_2)

def test_with_table_wo_external_bounds_1(self) -> None:
    """
    Send the invoice image with ``table_type="wo_external_bounds"`` and check that exactly
    one table is returned and that its first row starts with the expected two cell texts.
    """
    image_path = os.path.join("..", "lising", "russian_invoice.jpg")
    request_data = dict(table_type="wo_external_bounds", language="rus+eng")
    response = self._send_request(image_path, data=request_data)

    tables = response["content"]["tables"]
    self.assertEqual(1, len(tables))

    first_row_text = self._get_text_of_row(tables[0]["cells"][0])
    expected_prefix = ["Сумма\nпрописью\nв валюте", "Две тысячи сто шестьдесят евро 00 евроцентов"]
    self.assertEqual(first_row_text[:2], expected_prefix)

def test_with_table_wo_external_bounds_2(self) -> None:
    """
    Send ``2312.pdf`` with ``table_type="wo_external_bounds"`` and ``pages=":12"`` and check
    that two tables are returned and that the first row of the first table starts with the
    expected two cell texts.
    """
    request_data = dict(table_type="wo_external_bounds", language="rus", pages=":12")
    response = self._send_request("2312.pdf", data=request_data)

    tables = response["content"]["tables"]
    self.assertEqual(2, len(tables))

    first_row_text = self._get_text_of_row(tables[0]["cells"][0])
    expected_prefix = ["Номер", "Извещения\nмореплавателям"]
    self.assertEqual(first_row_text[:2], expected_prefix)
Binary file added tests/data/tables/2312.pdf
Binary file not shown.
6 changes: 3 additions & 3 deletions tests/unit_tests/test_module_table_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def get_table(self, image: np.ndarray, language: str = "rus", table_type: str =
return tables

def test_table_wo_external_bounds(self) -> None:
path_image = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "lising", "platezhka.jpg"))
path_image = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "lising", "russian_invoice.jpg"))

image = cv2.imread(path_image, 0)

Expand All @@ -39,7 +39,7 @@ def test_table_wo_external_bounds(self) -> None:
self.assertTrue(equal_with_eps(bbox.height, 754, 10))

def test_table_split_right_column(self) -> None:
path_image = get_full_path("data/lising/platezhka.jpg")
path_image = get_full_path("data/lising/russian_invoice.jpg")

image = cv2.imread(path_image, 0)

Expand All @@ -53,7 +53,7 @@ def test_table_split_right_column(self) -> None:
self.assertTrue(tables[0].cells[10][-1].get_text(), "30110978700000070815")

def test_table_extract_one_cell_and_one_cell_tables(self) -> None:
path_image = get_full_path("data/lising/platezhka.jpg")
path_image = get_full_path("data/lising/russian_invoice.jpg")
image = cv2.imread(path_image, 0)

tables = self.get_table(image, "rus+eng", table_type="table_wo_external_bounds+one_cell_table")
Expand Down