from docling.document_converter import DocumentConverter
from paddleocr import PaddleOCR
from llama_parse import LlamaParse
from pdf2image import convert_from_path
import numpy as np
import os

llama_key = os.getenv('LLAMA_INDEX_API_KEY')


def process_text(res):
    """Collect plain text from a Docling document dict, grouped by page number."""
    page_texts = {}
    texts = res.get('texts', [])
    for item in texts:
        for prov in item['prov']:
            page_no = prov['page_no']
            text = item['text']
            page_key = f'{page_no}'
            if page_key not in page_texts:
                page_texts[page_key] = text
            else:
                page_texts[page_key] += ' ' + text
    return page_texts


def get_table_text(grids):
    """Flatten a Docling table grid into a comma-separated text block."""
    table_text = "Here is a Table : \n"
    for row in grids:
        for col in row:
            val = col.get('text')
            table_text += f'{val} ,'
        table_text += '\n'
    return table_text


def process_tables(res, page_texts=None):
    """Append flattened table text to the per-page dict built by process_text."""
    if page_texts is None:
        page_texts = {}
    try:
        tables = res.get('tables', [])
        if not isinstance(tables, list):
            raise ValueError("Expected 'tables' to be a list.")

        for table in tables:
            try:
                # Ensure 'prov' exists and has the necessary structure
                prov = table.get('prov', [])
                if not prov or not isinstance(prov, list):
                    raise ValueError("Missing or invalid 'prov' structure in table.")

                page_no = str(prov[0].get('page_no'))
                if not page_no:
                    raise ValueError("Missing or invalid 'page_no' in 'prov'.")

                # Ensure 'data' and 'grid' exist
                data = table.get('data', {})
                grids = data.get('grid', [])
                if not isinstance(grids, list):
                    raise ValueError("Missing or invalid 'grid' structure in 'data'.")

                # Flatten the grid into text
                text = get_table_text(grids)
                if not isinstance(text, str):
                    raise ValueError("get_table_text did not return a string.")

                # Append the table text to the page's entry
                if page_no not in page_texts:
                    page_texts[page_no] = text
                else:
                    page_texts[page_no] += '\n' + text

                print(f"Processed page {page_no}")
            except Exception as table_error:
                print(f"Error processing table: {table_error}")

        return page_texts
    except Exception as e:
        print(f"Error processing tables: {e}")
        return page_texts


def process_docs(doc_path):
    """
    Process the uploaded PDF document with LlamaParse, PaddleOCR, and Docling.

    Args:
        doc_path (str): Path to the uploaded PDF document.

    Returns:
        docs, docs2, docs3: Extracted page texts from LlamaParse (list of
        markdown strings), PaddleOCR (list of strings, one per page), and
        Docling (dict mapping page number to text, including flattened tables).
    """
    ## LlamaParse
    parser = LlamaParse(
        api_key=llama_key,
        result_type='markdown',
        verbose=True,
        language='en',
        num_workers=2
    )
    documents = parser.load_data(doc_path)
    docs = [doc.text for doc in documents]

    ## PaddleOCR
    ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
    images_pdf = convert_from_path(doc_path, 300)
    docs2 = []
    for image in images_pdf:
        result = ocr.ocr(np.array(image), cls=True)
        text = " ".join([line[1][0] for line in result[0]])
        docs2.append(text)

    ## Docling
    converter = DocumentConverter()
    result = converter.convert(doc_path)
    res = result.document.export_to_dict()
    docs3 = process_text(res)
    docs3 = process_tables(res, docs3)

    return docs, docs2, docs3
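

# A minimal usage sketch (assumptions: 'sample.pdf' is a hypothetical local file
# and LLAMA_INDEX_API_KEY is set in the environment).
if __name__ == '__main__':
    llama_docs, paddle_docs, docling_docs = process_docs('sample.pdf')
    # LlamaParse and PaddleOCR return one entry per parsed page/document chunk;
    # Docling returns a dict keyed by page number.
    print(f"LlamaParse pages: {len(llama_docs)}")
    print(f"PaddleOCR pages: {len(paddle_docs)}")
    print(f"Docling pages: {len(docling_docs)}")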