from docling.document_converter import DocumentConverter
import logging
import os
import re
from uuid import uuid4
from typing import List, Optional, Generator, Set
from functools import partial, reduce
from itertools import chain
from PyPDF2 import PdfReader, PdfWriter

tag_list = ["Sources:", "Source:", "Tags-", "Tags:", "CONTENTS", "ANNEX", "EXERCISES", "Project/Activity"]

logger = logging.getLogger(__name__)

try:
    converter = DocumentConverter()
except Exception as e:
    # Keep the name defined so later references fail with a clear error instead of a NameError
    converter = None
    logger.error(f"Error initializing Docling DocumentConverter: {e}")
def split_pdf(input_pdf, output_pdf, start_page, end_page):
    """
    Writes pages start_page..end_page (0-based, inclusive) of input_pdf to output_pdf.
    """
    reader = PdfReader(input_pdf)
    writer = PdfWriter()
    for i in range(start_page, end_page + 1):
        writer.add_page(reader.pages[i])
    with open(output_pdf, "wb") as output_file:
        writer.write(output_file)
    print(f"PDF split successfully: {output_pdf}")
def get_texts(res):
    """
    Collects the text items of the parsed document, concatenated per page number.
    """
    page_texts = {pg: "" for pg in res['pages'].keys()}
    texts = res.get('texts')
    for item in texts:
        for prov in item['prov']:
            page_no = prov['page_no']
            text = item['text']
            page_key = f'{page_no}'
            if page_key not in page_texts:
                page_texts[page_key] = text
            else:
                page_texts[page_key] += ' ' + text
    return page_texts
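# Shape assumed for the docling export_to_dict() result, inferred from how get_texts/get_tables
# index into it (not taken from the docling documentation):
#   {"pages": {"1": {...}, "2": {...}},
#    "texts": [{"text": "...", "prov": [{"page_no": 1, ...}], ...}, ...],
#    "tables": [{"prov": [{"page_no": 1, ...}], "data": {"grid": [[{"text": "cell"}, ...], ...]}}, ...]}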
def clean_the_text(text):
    """
    Cleans the extracted text by removing unnecessary characters and formatting issues.
    Args:
        text (str): The extracted text.
    Returns:
        str: The cleaned text.
    """
    try:
        text = re.sub(r'\n\s*\n', '\n', text)
        text = text.replace("\t", " ")
        text = text.replace("\f", " ")
        text = re.sub(r'\b(\w+\s*)\1{1,}', '\\1', text)
        text = re.sub(r'[^a-zA-Z0-9\s@\-/,.\\]', ' ', text)
        return text.strip()
    except Exception as e:
        logger.error(f"Error cleaning text: {e}")
        return text
def get_tables(res_json):
    """
    Collects the table grids of the parsed document, grouped per page number.
    """
    page_tables = {pg: [] for pg in res_json['pages'].keys()}
    try:
        tables = res_json.get('tables', [])
        if not isinstance(tables, list):
            raise ValueError("Expected 'tables' to be a list.")
        for table in tables:
            try:
                # Ensure 'prov' exists and has the necessary structure
                prov = table.get('prov', [])
                if not prov or not isinstance(prov, list):
                    raise ValueError("Missing or invalid 'prov' structure in table.")
                page_no = prov[0].get('page_no')
                if page_no is None:
                    raise ValueError("Missing or invalid 'page_no' in 'prov'.")
                # Ensure 'data' and 'grid' exist
                data = table.get('data', {})
                grid = data.get('grid', [])
                if not isinstance(grid, list):
                    raise ValueError("Missing or invalid 'grid' structure in 'data'.")
                # Add the grid to the tables collected for this page
                page_tables[f'{page_no}'].append(grid)
            except Exception as table_error:
                logger.error(f"Error processing table: {table_error}")
    except Exception as e:
        logger.error(f"Error processing tables: {e}")
    return page_tables
def table_to_text_or_json(table, rtrn_type="text"):
    """
    Converts a table grid to a single string.
    Args:
        table (list): The table grid (a list of rows, each a list of cell dicts) to convert.
        rtrn_type (str): The return type; only "text" is currently implemented.
    Returns:
        str: The table converted to a comma-separated text block.
    """
    table_text = "Here is a Table : \n"
    for row in table:
        for col in row:
            val = col.get('text')
            table_text += f'{val} ,'
        table_text += '\n'
    return table_text
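# Example (illustrative grid): [[{"text": "Name"}, {"text": "Age"}], [{"text": "Alice"}, {"text": "30"}]]
# yields:
#   Here is a Table :
#   Name ,Age ,
#   Alice ,30 ,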
def clean_file_name(text: str):
    """
    Cleans the file name by removing any special characters.
    Args:
        text (str): The original file name.
    Returns:
        str: The cleaned file name.
    """
    try:
        text = re.sub(r'[^a-zA-Z0-9 \n.]', ' ', text)
        return text
    except Exception as e:
        logger.error(f"Error cleaning file name: {e}")
        return text
def find_and_remove_header_footer(
    text: str, n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
) -> str:
    """
    Heuristic to find footers and headers across different pages by searching for the longest common string.
    For headers we only search in the first n_chars characters (for footers: the last n_chars).
    Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
    but won't detect "Page 3 of 4" or similar.
    :param text: document text with pages separated by form feed characters
    :param n_chars: number of first/last characters where the header/footer shall be searched in
    :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
    :param n_last_pages_to_ignore: number of last pages to ignore
    :return: str, the cleaned text with pages re-joined by form feeds
    """
    pages = text.split("\f")
    # Use an explicit end index so that n_last_pages_to_ignore == 0 keeps the last pages
    # (a slice like pages[0:-0] would otherwise be empty).
    last_index = len(pages) - n_last_pages_to_ignore
    # header
    start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:last_index]]
    found_header = find_longest_common_ngram(start_of_pages)
    if found_header:
        pages = [page.replace(found_header, "") for page in pages]
    # footer
    end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:last_index]]
    found_footer = find_longest_common_ngram(end_of_pages)
    if found_footer:
        pages = [page.replace(found_footer, "") for page in pages]
    logger.debug(f"Removed header '{found_header}' and footer '{found_footer}' in document")
    text = "\f".join(pages)
    return text
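# Example (illustrative): for a document whose pages all begin with "ACME Corp Confidential"
# and end with "Copyright 2019 by ACME", calling
#   find_and_remove_header_footer(text, n_chars=30, n_first_pages_to_ignore=0, n_last_pages_to_ignore=0)
# strips both strings from every page while keeping the form-feed page breaks intact.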
def ngram(seq: str, n: int) -> Generator[str, None, None]:
    """
    Return ngrams (of tokens - currently split by whitespace).
    :param seq: str, string from which the ngrams shall be created
    :param n: int, n of ngram
    :return: generator of ngram strings
    """
    # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
    # we add a space here and remove it after creation of the ngrams again (see below)
    seq = seq.replace("\n", " \n")
    seq = seq.replace("\t", " \t")
    words = seq.split(" ")
    ngrams = (
        " ".join(words[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
    )
    return ngrams
def allngram(seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
    """Return the set of all ngrams of seq with token lengths in [min_ngram, max_ngram)."""
    lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
    ngrams = map(partial(ngram, seq), lengths)
    res = set(chain.from_iterable(ngrams))
    return res
def find_longest_common_ngram(
    sequences: List[str], max_ngram: int = 30, min_ngram: int = 3
) -> Optional[str]:
    """
    Find the longest common ngram across different text sequences (e.g. start of pages).
    Considers all ngrams between the specified range. Helpful for finding footers, headers etc.
    :param sequences: list[str], list of strings that shall be searched for common n_grams
    :param max_ngram: int, maximum length of ngram to consider
    :param min_ngram: minimum length of ngram to consider
    :return: str, common string of all sections
    """
    sequences = [s for s in sequences if s]  # filter empty sequences
    if not sequences:
        return None
    seqs_ngrams = map(partial(allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
    intersection = reduce(set.intersection, seqs_ngrams)
    try:
        longest = max(intersection, key=len)
    except ValueError:
        # no common sequence found
        longest = ""
    return longest if longest.strip() else None
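# Example (illustrative): given page starts such as
#   ["Annual Report 2020 Introduction", "Annual Report 2020 Methods", "Annual Report 2020 Results"]
# find_longest_common_ngram returns "Annual Report 2020", which the caller can then strip as a header.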
class PdfToSectionConverter:
    def __init__(self):
        """
        Initializes the PdfToSectionConverter class.
        """
        pass
    def convert(self, downloaded_pdf_path: str, file_title: str, doc_id: str = None, start_page_no: int = 0,
                end_page_no: int = 0):
        """
        Converts a PDF document to sections with metadata.
        Args:
            downloaded_pdf_path (str): Path to the downloaded PDF file.
            file_title (str): The title of the file.
            doc_id (str, optional): The document ID. Defaults to None.
            start_page_no (int, optional): The starting page number (0-based). Defaults to 0.
            end_page_no (int, optional): The ending page number (0-based, inclusive). Defaults to 0.
        Returns:
            list: A list of dictionaries containing sections and metadata.
        """
        try:
            print(f"Splitting pdf from page {start_page_no + 1} to {end_page_no + 1}")
            output_path = "/tmp/splitted.pdf"
            split_pdf(downloaded_pdf_path, output_path, start_page_no, end_page_no)
            print("OCR Started ....")
            result = converter.convert(output_path)
            json_objects = result.document.export_to_dict()
            pages = list(json_objects['pages'].keys())
            texts = get_texts(json_objects)
            tables = get_tables(json_objects)
        except Exception as e:
            logger.error(f"Error getting JSON result from parser: {e}")
            return []
        output_doc_lst = []
        page_no = start_page_no
        try:
            for page in pages:
                if page_no > end_page_no:
                    break
                page_no += 1
                print(f"Page Number to be processed: {page_no}")
                meta = {"doc_id": doc_id, "page_no": page_no, "img_count": 0, "img_lst": []}
                meta_table = {"doc_id": doc_id, "page_no": page_no, "img_count": 0, "img_lst": "[]"}
                # Extract text from the page
                text_to_append = texts[page]
                text_to_append = clean_the_text(text_to_append)
                # Detect and extract tables
                tables_to_append = tables[page]
                if tables_to_append:
                    tables_to_append = [table_to_text_or_json(table=i, rtrn_type="text") for i in tables_to_append]
                # Add the processed section to the output list
                output_doc_lst.append(
                    {"doc_id": doc_id, "text": text_to_append, "vector_id": str(uuid4()),
                     "meta": meta, "content_type": 'text'})
                for table in tables_to_append:
                    output_doc_lst.append(
                        {"doc_id": doc_id, "text": table, "vector_id": str(uuid4()),
                         "meta": meta_table, "content_type": 'table'})
            # Post-process the collected sections to remove repeated headers and footers
            text_to_append_list = "\f".join([i['text'] for i in output_doc_lst])
            text_to_append_list = find_and_remove_header_footer(text=text_to_append_list, n_chars=10,
                                                                n_first_pages_to_ignore=0,
                                                                n_last_pages_to_ignore=0).split("\f")
            for i in range(len(output_doc_lst)):
                output_doc_lst[i]['text'] = clean_file_name(file_title) + "\n" + text_to_append_list[i]
        except Exception as e:
            logger.error(f"Error converting PDF to sections: {e}")
        return output_doc_lst
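# Minimal usage sketch ("sample.pdf", the title, doc id and page range below are placeholder values,
# not part of the original pipeline):
if __name__ == "__main__":
    pdf_converter = PdfToSectionConverter()
    sections = pdf_converter.convert(
        downloaded_pdf_path="sample.pdf",
        file_title="Sample Document",
        doc_id="doc-001",
        start_page_no=0,
        end_page_no=4,
    )
    for section in sections:
        print(section["content_type"], section["meta"]["page_no"], section["text"][:80])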