Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.4
2.5
3 changes: 1 addition & 2 deletions dedoc/api/api_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,7 @@ class QueryParameters:
# pdf handling
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby", "bad_encoding"],
description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
fast_textual_layer_detection: str = Form("false", enum=["true", "false"],
description="Use non-ML solution to detect textual layer. Much faster but less accurate.")
textual_layer_classifier: str = Form("ml", enum=["ml", "simple", "letter"], description="Type of classifier for PDF textual layer detection")
each_page_textual_layer_detection: str = Form("false", enum=["true", "false"], description="Detect textual layer on each page. Slower but more accurate.")
language: str = Form("rus+eng", description="Recognition language ('rus+eng', 'rus', 'eng', 'fra', 'spa')")
pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
Expand Down
18 changes: 14 additions & 4 deletions dedoc/api/web/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
details > summary {font-style: italic; cursor: pointer; display: list-item;}
.child.max {padding-left: 5px; flex: 1}
.parent {display: flex}
details { padding-left: 24px;}
</style>
</head>

Expand Down Expand Up @@ -100,7 +101,7 @@ <h4>Attachments handling</h4>

<div class="parameters">
<h4>PDF handling</h4>
<details><summary>pdf_with_text_layer, need_pdf_table_analysis, fast_textual_layer_detection, each_page_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis</summary>
<details><summary>pdf_with_text_layer, need_pdf_table_analysis, textual_layer_classifier, each_page_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis</summary>
<br>
<p>
<label>
Expand All @@ -116,7 +117,13 @@ <h4>PDF handling</h4>
</p>

<p>
<label><input name="fast_textual_layer_detection" type="checkbox" value="true"> fast_textual_layer_detection</label>
<label>
<select name="textual_layer_classifier">
<option value="ml">ml</option>
<option value="simple">simple</option>
<option value="letter">letter</option>
</select> textual_layer_classifier
</label>
</p>

<p>
Expand All @@ -136,15 +143,18 @@ <h4>PDF handling</h4>
</label>
</p>

<details><summary>need_pdf_table_analysis</summary>
<details><summary>need_pdf_table_analysis, table_type</summary>
<br>
<p>
<label>
<input type="hidden" name="need_pdf_table_analysis" value="false">
<input type="checkbox" name="need_pdf_table_analysis" value="true" checked> need_pdf_table_analysis</label>
</p>
<p>
<label>table_type <input name="table_type" type="text" size="20" value=""></label>
</p>
</details>

<br>
<p>
<label>pages <input name="pages" type="text" size="8" value=":"></label>
</p>
Expand Down
4 changes: 4 additions & 0 deletions dedoc/readers/pdf_reader/data_classes/tables/scantable.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@


class ScanTable(Table):
"""
Utility class for storing recognized tables from document images. The class
:class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer` works with this class.
"""
def __init__(self, page_number: int, cells: List[List[CellWithMeta]], bbox: BBox, order: int = -1) -> None:

super().__init__(cells, TableMetadata(page_id=page_number))
Expand Down
5 changes: 3 additions & 2 deletions dedoc/readers/pdf_reader/data_classes/tables/table_tree.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from collections import namedtuple
from typing import List, Optional

import numpy as np
from dedocutils.data_structures import BBox
from numpy import ndarray

Expand Down Expand Up @@ -79,7 +80,7 @@ def parse_contours_to_tree(contours: List, hierarchy: List, *, config: dict) ->
if len(contours) == 0:
return table_tree

bbox = [cv2.boundingRect(c) for c in contours[0]][0] # [x_begin, y_begin, width, height]
bbox = cv2.boundingRect(contours[0].astype(np.int32)) # [x_begin, y_begin, width, height]
table_tree.cell_box = BBox(x_top_left=bbox[0], y_top_left=bbox[1], width=bbox[2], height=bbox[3])

table_tree = table_tree.__build_childs(table_tree, hierarchy, contours)
Expand All @@ -101,7 +102,7 @@ def __build_childs(self, cur: "TableTree", hierarchy: List, contours: List) -> "
list_childs = []
for i, h in enumerate(hierarchy[0]):
if h[3] == cur.id_contours:
bbox = cv2.boundingRect(contours[i]) # [x_begin, y_begin, width, height]
bbox = cv2.boundingRect(contours[i].astype(np.int32)) # [x_begin, y_begin, width, height]
# Эвристика №1 на ячейку
if bbox[2] < self.min_w_cell or bbox[3] < self.min_h_cell:
if self.config.get("debug_mode", False):
Expand Down
57 changes: 57 additions & 0 deletions dedoc/readers/pdf_reader/data_classes/tables/table_type.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,61 @@
class TableTypeAdditionalOptions:
"""
Setting up the table recognizer. The value of the parameter specifies the type of tables recognized when processed by
class :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer`.

* Parameter `table_type=wo_external_bounds` - recognize tables without external bounds;

Example of a table of type `wo_external_bounds`::

text | text | text
--------+------+------
text | text | text
--------+------+------
text | text | text
--------+------+------
text | text | text


* Parameter `table_type=one_cell_table` - if a document contains a bounding box with text, it will be considered a table;

Example of a page with a table of type `one_cell_table`::

_________________________
Header of document
text text text +------+
text | text | <--- it is a table
+------+
________________________

* Parameter `table_type=split_last_column` - split the merged last column of the table into separate cells, one per row;

Example of a table of type `split_last_column`::

+--------+------+-------+
| text | text | text1 |
+--------+------+ |
| text0 | text | text2 |
| | -----| |
| | text | text3 |
+--------+------+ |
| text | text | text4 |
+--------+------+-------+
|
Recognition
|
V
+--------+------+-------+
| text | text | text1 |
+--------+------+-------|
| text0 | text | text2 |
|--------+ -----+------ |
| text0 | text | text3 |
+--------+------+------ |
| text | text | text4 |
+--------+------+-------+

"""

def __init__(self) -> None:
self.table_wo_external_bounds = "wo_external_bounds"
self.detect_one_cell_table = "one_cell_table"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from typing import Dict

from .abstract_txtlayer_classifier import AbstractTxtlayerClassifier
from .letter_txtlayer_classifier import LetterTxtlayerClassifier
from .ml_txtlayer_classifier import MlTxtlayerClassifier
from .simple_txtlayer_classifier import SimpleTxtlayerClassifier


def get_classifiers(config: dict) -> Dict[str, AbstractTxtlayerClassifier]:
    """
    Build the registry of available textual layer classifiers.

    :param config: configuration dictionary passed unchanged to the constructor of every classifier
    :return: dictionary with a classifier instance for each supported name: "ml", "simple" and "letter"
    """
    # Name -> class pairs; the instances are created in this order.
    classifier_types = (
        ("ml", MlTxtlayerClassifier),
        ("simple", SimpleTxtlayerClassifier),
        ("letter", LetterTxtlayerClassifier),
    )
    return {name: classifier_type(config=config) for name, classifier_type in classifier_types}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from typing import List

import numpy as np

from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier.abstract_txtlayer_classifier import AbstractTxtlayerClassifier


class LetterTxtlayerClassifier(AbstractTxtlayerClassifier):
    """
    Simple multilingual textual layer correctness classification.
    Textual layer is considered as correct if percent of letters in the text > 50%.
    Dots and ellipsis characters are removed from the text before the percent is computed.
    """
    def __init__(self, *, config: dict) -> None:
        """
        :param config: configuration dictionary, passed to the base classifier
        """
        super().__init__(config=config)
        # Minimal share of alphabetic symbols for the text to be considered correct.
        self.__symbol_threshold = 0.5

    def predict(self, lines: List[List[LineWithMeta]]) -> np.ndarray:
        """
        Classify each group of lines as having a correct or an incorrect textual layer.

        :param lines: list of line groups; the text of each group is the concatenation of its lines
            (NOTE(review): presumably one group per page or per document - confirm against the caller)
        :return: boolean array with one element per group, True if the textual layer is considered correct;
            groups with empty or whitespace-only text are always False
        """
        # Explicit bool dtype: an empty input must still give a boolean array (np.array([]) would be float64).
        result = np.zeros(len(lines), dtype=bool)

        for idx, line_list in enumerate(lines):
            # Plain str is used on purpose: a NumPy unicode array pads each text to the longest one.
            text = "".join(line.line for line in line_list)
            if not text.strip():
                continue

            text = text.replace(".", "").replace("…", "")
            letters_number = sum(1 for symbol in text if symbol.isalpha())
            # max(..., 1) protects from division by zero when the text consisted only of dots.
            result[idx] = letters_number / max(len(text), 1) > self.__symbol_threshold

        return result
20 changes: 11 additions & 9 deletions dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@
import numpy as np

from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier import get_classifiers
from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier.abstract_txtlayer_classifier import AbstractTxtlayerClassifier
from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier.ml_txtlayer_classifier import MlTxtlayerClassifier
from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier.simple_txtlayer_classifier import SimpleTxtlayerClassifier
from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_result import TxtLayerResult
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
from dedoc.utils.parameter_utils import get_bool_parameter, get_param_page_slice
Expand All @@ -22,8 +21,7 @@ def __init__(self, pdf_reader: PdfTabbyReader, *, config: dict) -> None:
self.config = config
self.logger = config.get("logger", logging.getLogger())

self.ml_txtlayer_classifier = MlTxtlayerClassifier(config=config)
self.simple_txtlayer_classifier = SimpleTxtlayerClassifier(config=config)
self.classifiers = get_classifiers(config=config)
self.pdf_reader = pdf_reader

def detect_txtlayer(self, path: str, parameters: dict) -> List[TxtLayerResult]:
Expand All @@ -34,10 +32,11 @@ def detect_txtlayer(self, path: str, parameters: dict) -> List[TxtLayerResult]:
:param parameters: parameters for the txtlayer classifier
:return: information about a textual layer in the PDF document
"""
if get_bool_parameter(parameters, "fast_textual_layer_detection", False):
txtlayer_classifier = self.simple_txtlayer_classifier
else:
txtlayer_classifier = self.ml_txtlayer_classifier
classifier_name = str(parameters.get("textual_layer_classifier", "ml")).lower()
txtlayer_classifier = self.classifiers.get(classifier_name)
if txtlayer_classifier is None:
raise ValueError(f"Unknown textual layer classifier `{classifier_name}`")

classify_each_page = get_bool_parameter(parameters, "each_page_textual_layer_detection", False)
detect_function = self.__classify_each_page if classify_each_page else self.__classify_all_pages
try:
Expand Down Expand Up @@ -110,7 +109,10 @@ def __classify_each_page(self, path: str, parameters: dict, txtlayer_classifier:
prev_idx = 0
for transition_idx in transitions:
chunk_lines = list(chain.from_iterable(lines_for_predict[prev_idx:transition_idx]))
chunk_document = UnstructuredDocument(lines=chunk_lines, tables=document.tables, attachments=document.attachments)
if is_correct:
chunk_document = UnstructuredDocument(lines=chunk_lines, tables=document.tables, attachments=document.attachments)
else:
chunk_document = None
chunk_result = TxtLayerResult(start=prev_idx + fisrt_page_id + 1, end=transition_idx + fisrt_page_id, correct=is_correct, document=chunk_document)
result.append(chunk_result)
is_correct = not is_correct
Expand Down
5 changes: 3 additions & 2 deletions dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
config=self.config)
self.binarizer = AdaptiveBinarizer()
self.ocr = OCRLineExtractor(config=self.config)
self.page_number = None

def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
return super().read(file_path, parameters)
Expand All @@ -68,6 +69,7 @@ def _process_one_page(self,
from dedoc.utils.parameter_utils import get_path_param

# --- Step 1: correct orientation and detect column count ---
self.page_number = page_number
rotated_image, is_one_column_document, angle = self._detect_column_count_and_orientation(image, parameters)
if self.config.get("debug_mode", False):
self.logger.info(f"Angle page rotation = {angle}")
Expand Down Expand Up @@ -105,7 +107,6 @@ def _detect_column_count_and_orientation(self, image: ndarray, parameters: Param
Return: rotated_image and indicator if the page is one-column
"""
import os
from datetime import datetime
import cv2
from dedoc.utils.parameter_utils import get_path_param

Expand All @@ -124,7 +125,7 @@ def _detect_column_count_and_orientation(self, image: ndarray, parameters: Param

if self.config.get("debug_mode", False):
debug_dir = get_path_param(self.config, "path_debug")
img_path = os.path.join(debug_dir, f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg")
img_path = os.path.join(debug_dir, f"page-{self.page_number}_result_orientation.jpg")
self.logger.info(f"Save image to {img_path}")
cv2.imwrite(img_path, rotated_image)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,19 +112,25 @@ def __delete_ref_table(self, lines: List[LineWithMeta], table_name: str) -> None
@staticmethod
def __get_width_cell_wo_separating(row: List[Cell]) -> List[int]:
    """
    Compute the widths of the cells of one table row, counting a merged cell only once.

    Neighbouring elements of the row with the same ``uuid`` are treated as a single cell that spans
    several columns: its width is the distance from the left border of its first column
    to the right border of its last column.

    :param row: elements of one table row in column order
    :return: list of cell widths, one item per run of equal uuids (empty list for an empty row)
    """
    widths = []
    current_cell_uuid = None
    cell_x_left = None
    cell_x_right = None

    for column_num, cell in enumerate(row):
        if column_num == 0 or cell.uuid != current_cell_uuid:  # a new cell starts
            if column_num > 0:
                # close the previous cell using the right border of its last column
                widths.append(cell_x_right - cell_x_left)
            cell_x_left = cell.bbox.x_top_left
            # the uuid must follow the current cell, otherwise every column is compared with the first cell only
            current_cell_uuid = cell.uuid

        cell_x_right = cell.bbox.x_bottom_right

    if row:  # close the last cell, this also covers a row with a single column
        widths.append(cell_x_right - cell_x_left)

    return widths

def __is_equal_width_cells(self, table_part_1: List[List[Cell]], table_part_2: List[List[Cell]]) -> bool:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ class OnePageTableExtractor(BaseTableExtractor):
def __init__(self, *, config: dict, logger: logging.Logger) -> None:
super().__init__(config=config, logger=logger)

self.language = "rus"
self.page_number = None
self.image = None
self.page_number = 0
self.table_header_extractor = TableHeaderExtractor(logger=self.logger)
self.count_vertical_extended = 0
self.splitter = CellSplitter()
self.table_options = TableTypeAdditionalOptions()
self.language = "rus"

def extract_onepage_tables_from_image(self, image: np.ndarray, page_number: int, language: str, table_type: str) -> List[ScanTable]:
"""
Expand Down Expand Up @@ -89,20 +89,18 @@ def __build_structure_table_from_tree(self, tables_tree: TableTree, table_type:
tables.append(table)
except Exception as ex:
self.logger.warning(f"Warning: unrecognized table into page {self.page_number}. {ex}")
if self.config.get("debug_mode", False):
raise ex
return tables

def handle_cells(self, cells: List[List[Cell]], table_type: str = "") -> List[List[Cell]]:
# Эвристика 1: Таблица должна состоять из 1 строк и более
# Heuristic 1: The table must have 1 or more rows.
if len(cells) < 1:
raise RecognizeError("Invalid recognized table")
raise RecognizeError("Invalid recognized table. Heuristic 1: The table must have 1 or more rows.")

cells = self.splitter.split(cells=cells)

# Эвристика 2: таблица должна иметь больше одного столбца
# Heuristic 2: The table must have more than one column.
if cells[0] == [] or (len(cells[0]) <= 1 and self.table_options.detect_one_cell_table not in table_type):
raise RecognizeError("Invalid recognized table")
raise RecognizeError("Invalid recognized table. Heuristic 2: The table must have more than one column.")

# Postprocess table
if self.table_options.split_last_column in table_type:
Expand Down
Loading