Spaces:
Running
Running
Sakshi
committed on
Commit
·
a327219
1
Parent(s):
0106d5f
removed azure layout ocr; added requirements.txt
Browse files
- policy_analyser/analyse.py +1 -1
- policy_analyser/llm.py +0 -2
- policy_analyser/ocr.py +1 -113
- requirements.txt +4 -0
policy_analyser/analyse.py
CHANGED
@@ -8,7 +8,7 @@ from time import time
|
|
8 |
from datetime import datetime
|
9 |
|
10 |
from policy_analyser import ACKO_POLICY, ANALYSIS_PROMPT, SUGGESTION_PROMPT
|
11 |
-
from policy_analyser.ocr import
|
12 |
from policy_analyser.extraction import extract
|
13 |
from policy_analyser.rules import prepare_payload, rules
|
14 |
from policy_analyser.llm import call_openai
|
|
|
8 |
from datetime import datetime
|
9 |
|
10 |
from policy_analyser import ACKO_POLICY, ANALYSIS_PROMPT, SUGGESTION_PROMPT
|
11 |
+
from policy_analyser.ocr import PyMuPDF4LLMOCR
|
12 |
from policy_analyser.extraction import extract
|
13 |
from policy_analyser.rules import prepare_payload, rules
|
14 |
from policy_analyser.llm import call_openai
|
policy_analyser/llm.py
CHANGED
@@ -4,8 +4,6 @@
|
|
4 |
"""
|
5 |
|
6 |
# Imports
|
7 |
-
import os
|
8 |
-
|
9 |
from openai import AzureOpenAI
|
10 |
|
11 |
from policy_analyser import GPT_ENGINE, GPT_API_BASE, GPT_KEY, GPT_VERSION
|
|
|
4 |
"""
|
5 |
|
6 |
# Imports
|
|
|
|
|
7 |
from openai import AzureOpenAI
|
8 |
|
9 |
from policy_analyser import GPT_ENGINE, GPT_API_BASE, GPT_KEY, GPT_VERSION
|
policy_analyser/ocr.py
CHANGED
@@ -4,120 +4,8 @@
|
|
4 |
"""
|
5 |
|
6 |
# Imports
|
7 |
-
import json
|
8 |
-
|
9 |
-
from azure.core.credentials import AzureKeyCredential
|
10 |
-
from azure.ai.formrecognizer import DocumentAnalysisClient
|
11 |
import pymupdf4llm, pymupdf
|
12 |
|
13 |
-
from policy_analyser import CREDENTIALS
|
14 |
-
|
15 |
-
def convert_nested_complex_obj_to_json(result):
|
16 |
-
result = json.loads(json.dumps(result, default = lambda o : o.__dict__))
|
17 |
-
return result
|
18 |
-
|
19 |
-
class AzureLayoutOCR:
|
20 |
-
def __init__(self):
|
21 |
-
self.client = self._authenticate()
|
22 |
-
self.engine = 'azure/layout'
|
23 |
-
|
24 |
-
def _authenticate(self):
|
25 |
-
client = DocumentAnalysisClient(
|
26 |
-
endpoint=CREDENTIALS['azure']['layout']['endpoint'],
|
27 |
-
credential=AzureKeyCredential(CREDENTIALS['azure']['layout']['key']),
|
28 |
-
connection_verify=False
|
29 |
-
)
|
30 |
-
return client
|
31 |
-
|
32 |
-
def _table2md(self, table, **kwargs):
|
33 |
-
row_count, column_count = table['row_count'], table['column_count']
|
34 |
-
cells = table['cells']
|
35 |
-
|
36 |
-
markdown_table = []
|
37 |
-
table_offsets = (table['spans'][0]['offset'], table['spans'][-1]['offset'] + table['spans'][-1]['length'])
|
38 |
-
|
39 |
-
for _ in range(row_count + 1):
|
40 |
-
row = [''] * column_count
|
41 |
-
markdown_table.append(row)
|
42 |
-
|
43 |
-
header_row_idx = [0]
|
44 |
-
for cell in cells:
|
45 |
-
row_index = cell['row_index']
|
46 |
-
if cell['kind'] == 'columnHeader':
|
47 |
-
# Headers are in the first row of markdown_table, which is row_index 0
|
48 |
-
markdown_table[row_index + 1][cell['column_index']] = '**' + cell['content'].replace('|', '') + '**'
|
49 |
-
header_row_idx.append(row_index + 1)
|
50 |
-
else:
|
51 |
-
# Content cells are offset by 1 due to headers
|
52 |
-
markdown_table[row_index + 1][cell['column_index']] = cell['content'].replace('|', '')
|
53 |
-
|
54 |
-
markdown_output = ''
|
55 |
-
for row in markdown_table:
|
56 |
-
markdown_output += '| ' + ' | '.join(row) + ' |\n'
|
57 |
-
if markdown_table.index(row) in header_row_idx:
|
58 |
-
# if markdown_table.index(row) == 0:
|
59 |
-
# Add a separator after the header
|
60 |
-
markdown_output += '| ' + ' | '.join(['---'] * column_count) + ' |\n'
|
61 |
-
|
62 |
-
return markdown_output, table_offsets
|
63 |
-
|
64 |
-
def _paragraphs2md(self, paragraph, element_offsets, **kwargs):
|
65 |
-
paragraph_offsets = (
|
66 |
-
paragraph['spans'][0]['offset'], paragraph['spans'][-1]['offset'] + paragraph['spans'][-1]['length'])
|
67 |
-
for offset in element_offsets:
|
68 |
-
if paragraph_offsets[0] >= offset[0] and paragraph['spans'][0]['offset'] <= offset[1]:
|
69 |
-
return None, None
|
70 |
-
|
71 |
-
markdown_text = ''
|
72 |
-
|
73 |
-
if paragraph['role'] == 'title':
|
74 |
-
markdown_text += f'# {paragraph["content"]}'
|
75 |
-
elif paragraph == "sectionHeading":
|
76 |
-
markdown_text += f'## {paragraph["content"]}'
|
77 |
-
else:
|
78 |
-
markdown_text += f'{paragraph["content"]}'
|
79 |
-
return markdown_text, paragraph_offsets
|
80 |
-
|
81 |
-
def _stitch_paragraphs_elements(self, paragraphs, elements, **kwargs):
|
82 |
-
new_list = paragraphs + elements
|
83 |
-
sorted_new_list = sorted(new_list, key=lambda x: x['offset'][0])
|
84 |
-
return sorted_new_list
|
85 |
-
|
86 |
-
def _convert2md(self, result, **kwargs):
|
87 |
-
paragraphs, tables = result['paragraphs'], result['tables']
|
88 |
-
md_tables = []
|
89 |
-
for table in tables:
|
90 |
-
md, offset = self._table2md(table, requestId=kwargs.get('requestId'))
|
91 |
-
md_tables.append({'content': md, 'offset': offset})
|
92 |
-
|
93 |
-
table_offsets = [element['offset'] for element in md_tables]
|
94 |
-
md_paragraphs = []
|
95 |
-
|
96 |
-
for para in paragraphs:
|
97 |
-
md, offset = self._paragraphs2md(para, table_offsets, requestId=kwargs.get('requestId'))
|
98 |
-
if md is not None:
|
99 |
-
md_paragraphs.append({'content': md, 'offset': offset})
|
100 |
-
|
101 |
-
all_md_elements = self._stitch_paragraphs_elements(md_paragraphs, md_tables, requestId=kwargs.get('requestId'))
|
102 |
-
full_md = '\n\n'.join([record['content'] for record in all_md_elements])
|
103 |
-
return full_md
|
104 |
-
|
105 |
-
def _call_engine(self, image_reader, **kwargs):
|
106 |
-
poller = self.client.begin_analyze_document(
|
107 |
-
CREDENTIALS['azure']['layout']['model'],
|
108 |
-
image_reader
|
109 |
-
)
|
110 |
-
result = poller.result()
|
111 |
-
|
112 |
-
result = convert_nested_complex_obj_to_json(result)
|
113 |
-
md_text = self._convert2md(result, requestId=kwargs.get('requestId'))
|
114 |
-
|
115 |
-
return md_text, result
|
116 |
-
|
117 |
-
def __call__(self, file_bytes):
|
118 |
-
text, raw_response = self._call_engine(file_bytes)
|
119 |
-
return text, raw_response
|
120 |
-
|
121 |
class PyMuPDF4LLMOCR:
|
122 |
def __init__(self):
|
123 |
self.engine = 'open-source/pymupdf4llm'
|
@@ -135,6 +23,6 @@ if __name__ == '__main__':
|
|
135 |
import sys
|
136 |
filepath = sys.argv[1]
|
137 |
file_bytes = open(filepath, 'rb').read()
|
138 |
-
ocr =
|
139 |
text, raw_response = ocr(file_bytes)
|
140 |
print(text)
|
|
|
4 |
"""
|
5 |
|
6 |
# Imports
|
|
|
|
|
|
|
|
|
7 |
import pymupdf4llm, pymupdf
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
class PyMuPDF4LLMOCR:
|
10 |
def __init__(self):
|
11 |
self.engine = 'open-source/pymupdf4llm'
|
|
|
23 |
import sys
|
24 |
filepath = sys.argv[1]
|
25 |
file_bytes = open(filepath, 'rb').read()
|
26 |
+
ocr = PyMuPDF4LLMOCR()
|
27 |
text, raw_response = ocr(file_bytes)
|
28 |
print(text)
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
python-dotenv
|
2 |
+
pymupdf
|
3 |
+
pymupdf4llm
|
4 |
+
openai==1.58.1
|