Sakshi committed on
Commit
a327219
·
1 Parent(s): 0106d5f

Removed Azure Layout OCR; added requirements.txt

Browse files
policy_analyser/analyse.py CHANGED
@@ -8,7 +8,7 @@ from time import time
8
  from datetime import datetime
9
 
10
  from policy_analyser import ACKO_POLICY, ANALYSIS_PROMPT, SUGGESTION_PROMPT
11
- from policy_analyser.ocr import AzureLayoutOCR, PyMuPDF4LLMOCR
12
  from policy_analyser.extraction import extract
13
  from policy_analyser.rules import prepare_payload, rules
14
  from policy_analyser.llm import call_openai
 
8
  from datetime import datetime
9
 
10
  from policy_analyser import ACKO_POLICY, ANALYSIS_PROMPT, SUGGESTION_PROMPT
11
+ from policy_analyser.ocr import PyMuPDF4LLMOCR
12
  from policy_analyser.extraction import extract
13
  from policy_analyser.rules import prepare_payload, rules
14
  from policy_analyser.llm import call_openai
policy_analyser/llm.py CHANGED
@@ -4,8 +4,6 @@
4
  """
5
 
6
  # Imports
7
- import os
8
-
9
  from openai import AzureOpenAI
10
 
11
  from policy_analyser import GPT_ENGINE, GPT_API_BASE, GPT_KEY, GPT_VERSION
 
4
  """
5
 
6
  # Imports
 
 
7
  from openai import AzureOpenAI
8
 
9
  from policy_analyser import GPT_ENGINE, GPT_API_BASE, GPT_KEY, GPT_VERSION
policy_analyser/ocr.py CHANGED
@@ -4,120 +4,8 @@
4
  """
5
 
6
  # Imports
7
- import json
8
-
9
- from azure.core.credentials import AzureKeyCredential
10
- from azure.ai.formrecognizer import DocumentAnalysisClient
11
  import pymupdf4llm, pymupdf
12
 
13
- from policy_analyser import CREDENTIALS
14
-
15
- def convert_nested_complex_obj_to_json(result):
16
- result = json.loads(json.dumps(result, default = lambda o : o.__dict__))
17
- return result
18
-
19
- class AzureLayoutOCR:
20
- def __init__(self):
21
- self.client = self._authenticate()
22
- self.engine = 'azure/layout'
23
-
24
- def _authenticate(self):
25
- client = DocumentAnalysisClient(
26
- endpoint=CREDENTIALS['azure']['layout']['endpoint'],
27
- credential=AzureKeyCredential(CREDENTIALS['azure']['layout']['key']),
28
- connection_verify=False
29
- )
30
- return client
31
-
32
- def _table2md(self, table, **kwargs):
33
- row_count, column_count = table['row_count'], table['column_count']
34
- cells = table['cells']
35
-
36
- markdown_table = []
37
- table_offsets = (table['spans'][0]['offset'], table['spans'][-1]['offset'] + table['spans'][-1]['length'])
38
-
39
- for _ in range(row_count + 1):
40
- row = [''] * column_count
41
- markdown_table.append(row)
42
-
43
- header_row_idx = [0]
44
- for cell in cells:
45
- row_index = cell['row_index']
46
- if cell['kind'] == 'columnHeader':
47
- # Headers are in the first row of markdown_table, which is row_index 0
48
- markdown_table[row_index + 1][cell['column_index']] = '**' + cell['content'].replace('|', '') + '**'
49
- header_row_idx.append(row_index + 1)
50
- else:
51
- # Content cells are offset by 1 due to headers
52
- markdown_table[row_index + 1][cell['column_index']] = cell['content'].replace('|', '')
53
-
54
- markdown_output = ''
55
- for row in markdown_table:
56
- markdown_output += '| ' + ' | '.join(row) + ' |\n'
57
- if markdown_table.index(row) in header_row_idx:
58
- # if markdown_table.index(row) == 0:
59
- # Add a separator after the header
60
- markdown_output += '| ' + ' | '.join(['---'] * column_count) + ' |\n'
61
-
62
- return markdown_output, table_offsets
63
-
64
- def _paragraphs2md(self, paragraph, element_offsets, **kwargs):
65
- paragraph_offsets = (
66
- paragraph['spans'][0]['offset'], paragraph['spans'][-1]['offset'] + paragraph['spans'][-1]['length'])
67
- for offset in element_offsets:
68
- if paragraph_offsets[0] >= offset[0] and paragraph['spans'][0]['offset'] <= offset[1]:
69
- return None, None
70
-
71
- markdown_text = ''
72
-
73
- if paragraph['role'] == 'title':
74
- markdown_text += f'# {paragraph["content"]}'
75
- elif paragraph == "sectionHeading":
76
- markdown_text += f'## {paragraph["content"]}'
77
- else:
78
- markdown_text += f'{paragraph["content"]}'
79
- return markdown_text, paragraph_offsets
80
-
81
- def _stitch_paragraphs_elements(self, paragraphs, elements, **kwargs):
82
- new_list = paragraphs + elements
83
- sorted_new_list = sorted(new_list, key=lambda x: x['offset'][0])
84
- return sorted_new_list
85
-
86
- def _convert2md(self, result, **kwargs):
87
- paragraphs, tables = result['paragraphs'], result['tables']
88
- md_tables = []
89
- for table in tables:
90
- md, offset = self._table2md(table, requestId=kwargs.get('requestId'))
91
- md_tables.append({'content': md, 'offset': offset})
92
-
93
- table_offsets = [element['offset'] for element in md_tables]
94
- md_paragraphs = []
95
-
96
- for para in paragraphs:
97
- md, offset = self._paragraphs2md(para, table_offsets, requestId=kwargs.get('requestId'))
98
- if md is not None:
99
- md_paragraphs.append({'content': md, 'offset': offset})
100
-
101
- all_md_elements = self._stitch_paragraphs_elements(md_paragraphs, md_tables, requestId=kwargs.get('requestId'))
102
- full_md = '\n\n'.join([record['content'] for record in all_md_elements])
103
- return full_md
104
-
105
- def _call_engine(self, image_reader, **kwargs):
106
- poller = self.client.begin_analyze_document(
107
- CREDENTIALS['azure']['layout']['model'],
108
- image_reader
109
- )
110
- result = poller.result()
111
-
112
- result = convert_nested_complex_obj_to_json(result)
113
- md_text = self._convert2md(result, requestId=kwargs.get('requestId'))
114
-
115
- return md_text, result
116
-
117
- def __call__(self, file_bytes):
118
- text, raw_response = self._call_engine(file_bytes)
119
- return text, raw_response
120
-
121
  class PyMuPDF4LLMOCR:
122
  def __init__(self):
123
  self.engine = 'open-source/pymupdf4llm'
@@ -135,6 +23,6 @@ if __name__ == '__main__':
135
  import sys
136
  filepath = sys.argv[1]
137
  file_bytes = open(filepath, 'rb').read()
138
- ocr = AzureLayoutOCR()
139
  text, raw_response = ocr(file_bytes)
140
  print(text)
 
4
  """
5
 
6
  # Imports
 
 
 
 
7
  import pymupdf4llm, pymupdf
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  class PyMuPDF4LLMOCR:
10
  def __init__(self):
11
  self.engine = 'open-source/pymupdf4llm'
 
23
  import sys
24
  filepath = sys.argv[1]
25
  file_bytes = open(filepath, 'rb').read()
26
+ ocr = PyMuPDF4LLMOCR()
27
  text, raw_response = ocr(file_bytes)
28
  print(text)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ python-dotenv
2
+ pymupdf
3
+ pymupdf4llm
4
+ openai==1.58.1