Spaces:
Running
Running
Sakshi
committed on
Commit
·
d960853
1
Parent(s):
b5a66cc
added azure document intelligence ocr
Browse files- app.py +4 -1
- policy_analyser/__init__.py +3 -0
- policy_analyser/analyse.py +3 -1
- policy_analyser/ocr.py +32 -0
app.py
CHANGED
@@ -12,8 +12,11 @@ from policy_analyser.analyse import Health
|
|
12 |
if 'GPT_KEY' not in os.environ or os.environ.get('GPT_KEY') in [None, '']:
|
13 |
os.environ['GPT_KEY'] = st.secrets['GPT_KEY']
|
14 |
|
|
|
|
|
|
|
15 |
if 'health_analyser' not in st.session_state:
|
16 |
-
st.session_state.health_analyser = Health()
|
17 |
|
18 |
def markdown_table_to_json(markdown):
|
19 |
lines = markdown.strip().split("\n")
|
|
|
12 |
if 'GPT_KEY' not in os.environ or os.environ.get('GPT_KEY') in [None, '']:
|
13 |
os.environ['GPT_KEY'] = st.secrets['GPT_KEY']
|
14 |
|
15 |
+
# Fall back to the Streamlit secret when AZURE_LAYOUT_KEY is unset or empty.
# Membership is tested on os.environ itself: os.environ.get() returns None for
# a missing variable (making `in` raise TypeError) and, for a present one,
# `in` would be a substring test against the key's value.
if 'AZURE_LAYOUT_KEY' not in os.environ or os.environ.get('AZURE_LAYOUT_KEY') in [None, '']:
    os.environ['AZURE_LAYOUT_KEY'] = st.secrets['AZURE_LAYOUT_KEY']
|
17 |
+
|
18 |
if 'health_analyser' not in st.session_state:
|
19 |
+
st.session_state.health_analyser = Health(ocr_engine = 'azure/layout')
|
20 |
|
21 |
def markdown_table_to_json(markdown):
|
22 |
lines = markdown.strip().split("\n")
|
policy_analyser/__init__.py
CHANGED
@@ -31,6 +31,9 @@ GPT_ENGINE = 'o3-mini'
|
|
31 |
GPT_KEY = os.environ.get('GPT_KEY', '')
|
32 |
GPT_VERSION = '2024-12-01-preview'
|
33 |
GPT_API_BASE = 'https://ai-ackods910341544474.openai.azure.com/'
|
|
|
|
|
|
|
34 |
|
35 |
# EXTRACTION_PROMPT = open(os.path.join(PROMPTS_DIR, 'extraction.txt')).read()
|
36 |
# entities = json.load(open(os.path.join(DATA_DIR, 'policy_analyser_entities.json')))
|
|
|
31 |
GPT_KEY = os.environ.get('GPT_KEY', '')
|
32 |
GPT_VERSION = '2024-12-01-preview'
|
33 |
GPT_API_BASE = 'https://ai-ackods910341544474.openai.azure.com/'
|
34 |
+
AZURE_LAYOUT_ENDPOINT = 'https://acko-document-intelligence.cognitiveservices.azure.com/'
|
35 |
+
AZURE_LAYOUT_KEY = os.environ.get('AZURE_LAYOUT_KEY', '')
|
36 |
+
AZURE_LAYOUT_MODEL = 'prebuilt-layout'
|
37 |
|
38 |
# EXTRACTION_PROMPT = open(os.path.join(PROMPTS_DIR, 'extraction.txt')).read()
|
39 |
# entities = json.load(open(os.path.join(DATA_DIR, 'policy_analyser_entities.json')))
|
policy_analyser/analyse.py
CHANGED
@@ -9,7 +9,7 @@ from time import time
|
|
9 |
from datetime import datetime
|
10 |
|
11 |
from policy_analyser import PROMPTS_DIR, DATA_DIR
|
12 |
-
from policy_analyser.ocr import PyMuPDF4LLMOCR
|
13 |
from policy_analyser.llm import call_openai
|
14 |
from policy_analyser.utils import markdown_table_to_json
|
15 |
|
@@ -17,6 +17,8 @@ class LOB:
|
|
17 |
def __init__(self, ocr_engine = 'open-source/pymupdf4llm'):
|
18 |
if ocr_engine == 'open-source/pymupdf4llm':
|
19 |
self.engine = PyMuPDF4LLMOCR()
|
|
|
|
|
20 |
self.file_type = 'pdf'
|
21 |
with open(os.path.join(PROMPTS_DIR, 'analysis.txt'), 'r') as f:
|
22 |
self.analysis_prompt = f.read()
|
|
|
9 |
from datetime import datetime
|
10 |
|
11 |
from policy_analyser import PROMPTS_DIR, DATA_DIR
|
12 |
+
from policy_analyser.ocr import PyMuPDF4LLMOCR, AzureDocumentIntelligenceOCR
|
13 |
from policy_analyser.llm import call_openai
|
14 |
from policy_analyser.utils import markdown_table_to_json
|
15 |
|
|
|
17 |
def __init__(self, ocr_engine = 'open-source/pymupdf4llm'):
|
18 |
if ocr_engine == 'open-source/pymupdf4llm':
|
19 |
self.engine = PyMuPDF4LLMOCR()
|
20 |
+
elif ocr_engine == 'azure/layout':
|
21 |
+
self.engine = AzureDocumentIntelligenceOCR()
|
22 |
self.file_type = 'pdf'
|
23 |
with open(os.path.join(PROMPTS_DIR, 'analysis.txt'), 'r') as f:
|
24 |
self.analysis_prompt = f.read()
|
policy_analyser/ocr.py
CHANGED
@@ -5,6 +5,11 @@
|
|
5 |
|
6 |
# Imports
|
7 |
import pymupdf4llm, pymupdf
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
class PyMuPDF4LLMOCR:
|
10 |
def __init__(self):
|
@@ -19,6 +24,33 @@ class PyMuPDF4LLMOCR:
|
|
19 |
response = pymupdf4llm.to_markdown(document)
|
20 |
return response, None
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
if __name__ == '__main__':
|
23 |
import sys
|
24 |
filepath = sys.argv[1]
|
|
|
5 |
|
6 |
# Imports
|
7 |
import pymupdf4llm, pymupdf
|
8 |
+
from azure.core.credentials import AzureKeyCredential
|
9 |
+
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
10 |
+
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, ContentFormat, AnalyzeResult
|
11 |
+
|
12 |
+
from policy_analyser import AZURE_LAYOUT_ENDPOINT, AZURE_LAYOUT_KEY, AZURE_LAYOUT_MODEL
|
13 |
|
14 |
class PyMuPDF4LLMOCR:
|
15 |
def __init__(self):
|
|
|
24 |
response = pymupdf4llm.to_markdown(document)
|
25 |
return response, None
|
26 |
|
27 |
+
class AzureDocumentIntelligenceOCR:
    """OCR engine that sends a document to Azure Document Intelligence.

    The service client is created lazily, on the first call, so constructing
    this object performs no network or credential work.
    """

    def __init__(self):
        self.engine = 'azure/layout'
        # Created on demand by _authenticate(); stays None until that succeeds.
        self.client = None

    def _authenticate(self):
        """Create the Document Intelligence client if it does not exist yet.

        On failure the reason is printed and ``self.client`` is left as None,
        so a later call can try again.
        """
        if self.client is None:
            import os

            # Prefer the current environment value: AZURE_LAYOUT_KEY was bound
            # when the package was first imported, which can be before the
            # application has populated the environment variable.
            key = os.environ.get('AZURE_LAYOUT_KEY') or AZURE_LAYOUT_KEY
            try:
                self.client = DocumentIntelligenceClient(
                    endpoint = AZURE_LAYOUT_ENDPOINT,
                    credential = AzureKeyCredential(key)
                )
            except Exception as e:
                # Report why client creation failed instead of hiding it.
                print(f'Failed to create Azure Document Intelligence client: {e}')
                self.client = None

    def __call__(self, file_bytes):
        """Analyse a document and return its content as Markdown.

        Args:
            file_bytes: Raw bytes of the document, passed as ``bytes_source``.

        Returns:
            ``(content, None)`` where ``content`` is the analysis result's
            ``content`` requested in Markdown format, or ``None`` when no
            client could be created.
        """
        # Without this the client is never created and every call would take
        # the "not authenticated" branch below.
        self._authenticate()
        if self.client is not None:
            poller = self.client.begin_analyze_document(
                AZURE_LAYOUT_MODEL,
                AnalyzeDocumentRequest(bytes_source = file_bytes),
                output_content_format = ContentFormat.MARKDOWN
            )
            result = poller.result()
            return result.content, None
        else:
            print('Client is not authenticated or reachable')
            # NOTE(review): the success path returns a 2-tuple but this path
            # returns None; callers that unpack the result should confirm how
            # they want this case reported.
            return None
|
53 |
+
|
54 |
if __name__ == '__main__':
|
55 |
import sys
|
56 |
filepath = sys.argv[1]
|