Gopal2002 committed on
Commit
dfcf27a
·
verified ·
1 Parent(s): 7d46793

Create helper.py

Browse files
Files changed (1) hide show
  1. helper.py +93 -0
helper.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from docling.document_converter import DocumentConverter
2
+ from paddleocr import PaddleOCR
3
+ from pdf2image import convert_from_path
4
+ import numpy as np
5
+
6
def process_text(res):
    """Group the text items of an exported document by page number.

    Args:
        res (dict): Document dictionary. Its 'texts' entry is read as a list
            of items, each holding a 'text' string and a 'prov' list whose
            entries carry a 'page_no'.

    Returns:
        dict: Maps each page number (as a string) to the texts found on that
        page, joined with single spaces in the order they were encountered.
    """
    # 'texts' may be absent or None; treat both as "no text items" instead
    # of failing when iterating over None.
    texts = res.get('texts') or []
    page_texts = {}
    for item in texts:
        for prov in item['prov']:
            text = item['text']
            page_key = f"{prov['page_no']}"
            if page_key in page_texts:
                page_texts[page_key] += ' ' + text
            else:
                page_texts[page_key] = text
    return page_texts
19
+
20
def get_table_text(grids):
    """Render a table grid as plain text, one line per row.

    Args:
        grids: Iterable of rows, where each row is an iterable of cell dicts
            whose 'text' entry holds the cell content. ``None`` is treated
            as an empty grid.

    Returns:
        str: The header line "Here is a Table : " followed by one line per
        row in which every cell is written as "<text> ," (a cell without a
        'text' entry renders as "None").
    """
    lines = ["Here is a Table : \n"]
    for row in grids or ():
        # Each cell keeps the original "<text> ," formatting, including the
        # trailing separator after the last cell of the row.
        cells = "".join(f"{cell.get('text')} ," for cell in row)
        lines.append(cells + "\n")
    return "".join(lines)
28
def process_tables(res, page_texts: dict = None):
    """Append the text of every table in ``res`` to the text of its page.

    Errors are reported with ``print`` rather than raised: a malformed table
    is skipped, and a malformed ``res`` leaves ``page_texts`` as it was.

    Args:
        res (dict): Document dictionary. Its 'tables' entry is read as a list
            of tables; each table has a 'prov' list (the first entry's
            'page_no' selects the page) and a 'data' dict holding a 'grid'.
        page_texts (dict, optional): Existing mapping of page number (string)
            to text. It is extended in place. A new dict is created when the
            argument is omitted.

    Returns:
        dict: ``page_texts`` with each table's text stored under its page
        key, appended after a newline when the page already has text.
    """
    # A fresh dict per call: a `{}` default would be shared between calls
    # and leak table text from one document into the next.
    if page_texts is None:
        page_texts = {}

    try:
        tables = res.get('tables', [])
        if not isinstance(tables, list):
            raise ValueError("Expected 'tables' to be a list.")

        for table in tables:
            try:
                # Ensure 'prov' exists and has the necessary structure
                prov = table.get('prov', [])
                if not prov or not isinstance(prov, list):
                    raise ValueError("Missing or invalid 'prov' structure in table.")

                # Check for a missing page number *before* converting to
                # str: str(None) is the truthy string "None".
                raw_page_no = prov[0].get('page_no')
                if raw_page_no is None:
                    raise ValueError("Missing or invalid 'page_no' in 'prov'.")
                page_no = str(raw_page_no)

                # Ensure 'data' and 'grid' exist
                data = table.get('data') or {}
                grids = data.get('grid', [])
                if not isinstance(grids, list):
                    raise ValueError("Missing or invalid 'grid' structure in 'data'.")

                # Process grid data into text
                text = get_table_text(grids)
                if not isinstance(text, str):
                    raise ValueError("get_table_text did not return a string.")

                # Add text to page_texts
                if page_no in page_texts:
                    page_texts[page_no] += '\n' + text
                else:
                    page_texts[page_no] = text
                print(f"Processed page {page_no}")
            except Exception as table_error:
                # Best effort: one bad table must not drop the others.
                print(f"Error processing table: {table_error}")
        return page_texts
    except Exception as e:
        print(f"Error processing tables: {e}")
        return page_texts
64
+
65
def process_docs(doc_path):
    """Extract per-page text from a PDF with PaddleOCR and with Docling.

    Args:
        doc_path (str): Path to the PDF document.

    Returns:
        tuple: ``(docs2, docs3)``. ``docs2`` is a list with one string per
        rendered page, holding the OCR'd lines joined by newlines. ``docs3``
        is the dict produced by ``process_text`` and then extended by
        ``process_tables`` from the Docling export.
    """

    ## Paddle OCR
    ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
    images_pdf = convert_from_path(doc_path, 300)  # second argument is the DPI
    docs2 = []
    for image in images_pdf:
        ocr_result = ocr.ocr(np.array(image), cls=True)
        # PaddleOCR can return None in place of the line list when it
        # detects no text on a page; record such pages as empty strings.
        page_lines = (ocr_result[0] if ocr_result else None) or []
        docs2.append("\n".join(line[1][0] for line in page_lines))

    ## Docling
    converter = DocumentConverter()
    conversion = converter.convert(doc_path)
    res = conversion.document.export_to_dict()
    docs3 = process_text(res)
    docs3 = process_tables(res, docs3)

    return docs2, docs3