# Source code for ocr_utils.pdf_to_svg

import os
import shutil
import tempfile
from typing import IO, List, Optional, Tuple, Union

import pytesseract
from pdf2image import convert_from_path, pdfinfo_from_path
from svgwrite.drawing import Drawing
from tqdm import tqdm

from ocr_utils.alto_to_svg import alto_pages_and_cells_to_svg
from ocr_utils.table import DetectedCell, extract_and_hide_cells


def _decode(content: Union[str, bytes]) -> str:
    return content.decode() if isinstance(content, bytes) else content


def _run_tesseract(page: str, lang: str) -> str:
    """Run Tesseract on *page* and return its ALTO XML output as text.

    Args:
        page: Value forwarded as the image argument of
            ``pytesseract.image_to_alto_xml`` (callers in this module pass
            a file path).
        lang: Language code forwarded to Tesseract.
    """
    raw_alto = pytesseract.image_to_alto_xml(page, lang=lang)
    # The result is normalised to ``str`` in case bytes were returned.
    return _decode(raw_alto)


# Result of OCR on one page: the ALTO XML text produced by Tesseract, paired
# with the table cells detected on that page. The second element may be None,
# which `_unzip_ocr_outputs` treats as "no cells".
_OCROutput = Tuple[str, Optional[List[DetectedCell]]]


def _ocr_on_image_file(file: IO, detect_tables: bool, lang: str) -> _OCROutput:
    """Run OCR on an image file, optionally extracting table cells first.

    Args:
        file: Open file object for the image; only its ``name`` (path) is
            used.
        detect_tables: When true, cells are obtained from
            ``extract_and_hide_cells``; otherwise the image is copied as-is
            and the cell list is empty.
        lang: Language code passed to the cell extraction and to Tesseract.

    Returns:
        A tuple of the ALTO XML text and the list of cells (empty when
        *detect_tables* is false).
    """
    with tempfile.NamedTemporaryFile(suffix='.png') as scratch:
        source_path = file.name
        ocr_input_path = scratch.name
        cells: Optional[List[DetectedCell]]
        if detect_tables:
            # NOTE(review): presumably writes a version of the image with the
            # table cells masked out to ocr_input_path - confirm in ocr_utils.table.
            cells = extract_and_hide_cells(source_path, ocr_input_path, lang)
        else:
            shutil.copyfile(source_path, ocr_input_path)
            cells = []
        # Tesseract must run while the scratch file still exists, i.e. inside
        # the ``with`` block.
        alto_xml = _run_tesseract(ocr_input_path, lang)
        return alto_xml, cells


def _ocr_on_page(path: str, page_nb: int, detect_tables: bool, lang: str) -> _OCROutput:
    """Render a single PDF page to a temporary PNG and run OCR on it.

    Args:
        path: Path of the PDF file.
        page_nb: Index of the page; it is offset by one before being passed
            to ``convert_from_path`` as both first and last page.
        detect_tables: Forwarded to ``_ocr_on_image_file``.
        lang: Language code forwarded to ``_ocr_on_image_file``.

    Returns:
        The OCR output of ``_ocr_on_image_file`` for the rendered page.
    """
    pdf_page = page_nb + 1
    with tempfile.NamedTemporaryFile(suffix='.png') as image_file:
        # Exactly one page is requested, so the rendered image is the first item.
        rendered = convert_from_path(path, first_page=pdf_page, last_page=pdf_page, dpi=300)
        rendered[0].save(image_file)
        return _ocr_on_image_file(image_file, detect_tables, lang)


def _nb_pages_in_pdf(filename: str) -> int:
    """Return the ``'Pages'`` entry reported by ``pdfinfo_from_path`` for *filename*."""
    pdf_info = pdfinfo_from_path(filename)
    return pdf_info['Pages']


def _ocr_pdf(filename: str, detect_tables: bool, lang: str) -> List[_OCROutput]:
    """Run OCR on every page of a PDF, in page order.

    Args:
        filename: Path of the input PDF.
        detect_tables: Forwarded to ``_ocr_on_page`` for each page.
        lang: Language code forwarded to ``_ocr_on_page`` for each page.

    Returns:
        One OCR output per page.

    Raises:
        ValueError: If no file exists at *filename*.
    """
    if not os.path.exists(filename):
        raise ValueError(f'Input pdf not found at path {filename}.')
    # tqdm wraps the page indices to show progress; leave=False clears the
    # bar once the loop is done.
    page_indices = tqdm(range(_nb_pages_in_pdf(filename)), 'Performing OCR.', leave=False)
    return [_ocr_on_page(filename, index, detect_tables, lang) for index in page_indices]


def _unzip_ocr_outputs(ocr_outputs: List[_OCROutput]) -> Tuple[List[str], List[List[DetectedCell]]]:
    """Split per-page OCR outputs into a list of pages and a list of cell lists.

    Whether cells are present is decided from the first output only: if its
    cells are ``None``, every page gets an empty cell list. Otherwise every
    output is expected to carry cells, which is checked with an assertion.

    Returns:
        ``(pages, cells)`` with one entry per input; two empty lists when
        *ocr_outputs* is empty.
    """
    if not ocr_outputs:
        return [], []

    first_cells = ocr_outputs[0][1]
    if first_cells is None:
        alto_pages = [alto for alto, _ in ocr_outputs]
        return alto_pages, [[] for _ in ocr_outputs]

    alto_pages: List[str] = []
    cells_per_page: List[List[DetectedCell]] = []
    for alto, page_cells in ocr_outputs:
        alto_pages.append(alto)
        assert page_cells is not None, 'Either all cells_ must be None or none'
        cells_per_page.append(page_cells)
    return alto_pages, cells_per_page


def _generate_svg(ocr_outputs: List[_OCROutput]) -> Drawing:
    """Build the SVG drawing for the given per-page OCR outputs.

    The outputs are split into pages and cells by ``_unzip_ocr_outputs`` and
    handed, in that order, to ``alto_pages_and_cells_to_svg``.
    """
    return alto_pages_and_cells_to_svg(*_unzip_ocr_outputs(ocr_outputs))


def _generate_and_write_svg(ocr_outputs: List[_OCROutput], output_filename: str) -> None:
    """Generate the SVG for *ocr_outputs* and save it as *output_filename*."""
    _generate_svg(ocr_outputs).saveas(output_filename)


def pdf_to_svg(input_filename: str, output_filename: str, detect_tables: bool, lang: str) -> None:
    """Run OCR on a PDF and write the result as an SVG file.

    Every page of the input PDF is OCRed, then a single SVG is generated
    from all page outputs and saved.

    Args:
        input_filename: Path of the PDF to read.
        output_filename: Path the SVG is saved to.
        detect_tables: Whether table cells are extracted before OCR.
        lang: Language code passed to the OCR step.

    Raises:
        ValueError: If no file exists at *input_filename*.
    """
    ocr_outputs = _ocr_pdf(input_filename, detect_tables, lang)
    _generate_and_write_svg(ocr_outputs, output_filename)