Sakshi committed on
Commit
d960853
·
1 Parent(s): b5a66cc

added azure document intelligence ocr

Browse files
app.py CHANGED
@@ -12,8 +12,11 @@ from policy_analyser.analyse import Health
12
  if 'GPT_KEY' not in os.environ or os.environ.get('GPT_KEY') in [None, '']:
13
  os.environ['GPT_KEY'] = st.secrets['GPT_KEY']
14
 
 
 
 
15
  if 'health_analyser' not in st.session_state:
16
- st.session_state.health_analyser = Health()
17
 
18
  def markdown_table_to_json(markdown):
19
  lines = markdown.strip().split("\n")
 
12
  if 'GPT_KEY' not in os.environ or os.environ.get('GPT_KEY') in [None, '']:
13
  os.environ['GPT_KEY'] = st.secrets['GPT_KEY']
14
 
15
# Populate AZURE_LAYOUT_KEY from the Streamlit secrets when the environment
# does not already provide a non-empty value. The membership test must be made
# against os.environ itself (as done for GPT_KEY): testing against
# os.environ.get(...) raises TypeError when the variable is unset and turns
# into a substring check on the key's value when it is set.
if 'AZURE_LAYOUT_KEY' not in os.environ or os.environ.get('AZURE_LAYOUT_KEY') in [None, '']:
    os.environ['AZURE_LAYOUT_KEY'] = st.secrets['AZURE_LAYOUT_KEY']
17
+
18
  if 'health_analyser' not in st.session_state:
19
+ st.session_state.health_analyser = Health(ocr_engine = 'azure/layout')
20
 
21
  def markdown_table_to_json(markdown):
22
  lines = markdown.strip().split("\n")
policy_analyser/__init__.py CHANGED
@@ -31,6 +31,9 @@ GPT_ENGINE = 'o3-mini'
31
  GPT_KEY = os.environ.get('GPT_KEY', '')
32
  GPT_VERSION = '2024-12-01-preview'
33
  GPT_API_BASE = 'https://ai-ackods910341544474.openai.azure.com/'
 
 
 
34
 
35
  # EXTRACTION_PROMPT = open(os.path.join(PROMPTS_DIR, 'extraction.txt')).read()
36
  # entities = json.load(open(os.path.join(DATA_DIR, 'policy_analyser_entities.json')))
 
31
  GPT_KEY = os.environ.get('GPT_KEY', '')
32
  GPT_VERSION = '2024-12-01-preview'
33
  GPT_API_BASE = 'https://ai-ackods910341544474.openai.azure.com/'
34
+ AZURE_LAYOUT_ENDPOINT = 'https://acko-document-intelligence.cognitiveservices.azure.com/'
35
+ AZURE_LAYOUT_KEY = os.environ.get('AZURE_LAYOUT_KEY', '')
36
+ AZURE_LAYOUT_MODEL = 'prebuilt-layout'
37
 
38
  # EXTRACTION_PROMPT = open(os.path.join(PROMPTS_DIR, 'extraction.txt')).read()
39
  # entities = json.load(open(os.path.join(DATA_DIR, 'policy_analyser_entities.json')))
policy_analyser/analyse.py CHANGED
@@ -9,7 +9,7 @@ from time import time
9
  from datetime import datetime
10
 
11
  from policy_analyser import PROMPTS_DIR, DATA_DIR
12
- from policy_analyser.ocr import PyMuPDF4LLMOCR
13
  from policy_analyser.llm import call_openai
14
  from policy_analyser.utils import markdown_table_to_json
15
 
@@ -17,6 +17,8 @@ class LOB:
17
  def __init__(self, ocr_engine = 'open-source/pymupdf4llm'):
18
  if ocr_engine == 'open-source/pymupdf4llm':
19
  self.engine = PyMuPDF4LLMOCR()
 
 
20
  self.file_type = 'pdf'
21
  with open(os.path.join(PROMPTS_DIR, 'analysis.txt'), 'r') as f:
22
  self.analysis_prompt = f.read()
 
9
  from datetime import datetime
10
 
11
  from policy_analyser import PROMPTS_DIR, DATA_DIR
12
+ from policy_analyser.ocr import PyMuPDF4LLMOCR, AzureDocumentIntelligenceOCR
13
  from policy_analyser.llm import call_openai
14
  from policy_analyser.utils import markdown_table_to_json
15
 
 
17
  def __init__(self, ocr_engine = 'open-source/pymupdf4llm'):
18
  if ocr_engine == 'open-source/pymupdf4llm':
19
  self.engine = PyMuPDF4LLMOCR()
20
+ elif ocr_engine == 'azure/layout':
21
+ self.engine = AzureDocumentIntelligenceOCR()
22
  self.file_type = 'pdf'
23
  with open(os.path.join(PROMPTS_DIR, 'analysis.txt'), 'r') as f:
24
  self.analysis_prompt = f.read()
policy_analyser/ocr.py CHANGED
@@ -5,6 +5,11 @@
5
 
6
  # Imports
7
  import pymupdf4llm, pymupdf
 
 
 
 
 
8
 
9
  class PyMuPDF4LLMOCR:
10
  def __init__(self):
@@ -19,6 +24,33 @@ class PyMuPDF4LLMOCR:
19
  response = pymupdf4llm.to_markdown(document)
20
  return response, None
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  if __name__ == '__main__':
23
  import sys
24
  filepath = sys.argv[1]
 
5
 
6
  # Imports
7
  import pymupdf4llm, pymupdf
8
+ from azure.core.credentials import AzureKeyCredential
9
+ from azure.ai.documentintelligence import DocumentIntelligenceClient
10
+ from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, ContentFormat, AnalyzeResult
11
+
12
+ from policy_analyser import AZURE_LAYOUT_ENDPOINT, AZURE_LAYOUT_KEY, AZURE_LAYOUT_MODEL
13
 
14
  class PyMuPDF4LLMOCR:
15
  def __init__(self):
 
24
  response = pymupdf4llm.to_markdown(document)
25
  return response, None
26
 
27
class AzureDocumentIntelligenceOCR:
    """OCR engine that sends a document to Azure Document Intelligence.

    The service client is created lazily on the first call, so constructing
    this object performs no network or credential work.
    """

    def __init__(self):
        self.engine = 'azure/layout'
        # Created on first use by _authenticate(); stays None if that fails.
        self.client = None

    def _authenticate(self):
        """Create the Document Intelligence client if it does not exist yet.

        Any error raised while building the client is printed and leaves
        ``self.client`` as None; it is not re-raised.
        """
        if self.client is not None:
            return
        # Local import so this block does not depend on the module's import list.
        import os

        try:
            # Prefer the live environment value: AZURE_LAYOUT_KEY is captured
            # when the policy_analyser package is first imported, which can be
            # before the application has populated the environment variable.
            key = os.environ.get('AZURE_LAYOUT_KEY') or AZURE_LAYOUT_KEY
            self.client = DocumentIntelligenceClient(
                endpoint = AZURE_LAYOUT_ENDPOINT,
                credential = AzureKeyCredential(key)
            )
        except Exception as e:
            # Report the cause instead of discarding it silently.
            print(f'Azure Document Intelligence authentication failed: {e}')
            self.client = None

    def __call__(self, file_bytes):
        """Analyse a document and return its content as markdown.

        Args:
            file_bytes: Raw bytes of the document, passed to the service as
                ``bytes_source``.

        Returns:
            A ``(content, None)`` tuple, where ``content`` is the analysis
            result's ``content`` attribute (markdown output is requested).
            If no client could be created, a message is printed and
            ``(None, None)`` is returned so the result keeps its two-element
            shape.
        """
        # Without this call the client would never be created and every
        # request would fall through to the "not authenticated" branch.
        self._authenticate()
        if self.client is None:
            print('Client is not authenticated or reachable')
            return None, None

        poller = self.client.begin_analyze_document(
            AZURE_LAYOUT_MODEL,
            AnalyzeDocumentRequest(bytes_source = file_bytes),
            output_content_format = ContentFormat.MARKDOWN
        )
        result = poller.result()
        return result.content, None
53
+
54
  if __name__ == '__main__':
55
  import sys
56
  filepath = sys.argv[1]