Spaces:
Sleeping
Sleeping
""" | |
Run analysis | |
@author : Sakshi Tantak | |
""" | |
# Imports | |
import os | |
from time import time | |
from datetime import datetime | |
from policy_analyser import PROMPTS_DIR, DATA_DIR | |
from policy_analyser.ocr import PyMuPDF4LLMOCR, AzureDocumentIntelligenceOCR | |
from policy_analyser.llm import call_openai | |
from policy_analyser.utils import markdown_table_to_json | |
class LOB:
    """Base pipeline for one line of business: OCR -> analyse -> suggest.

    Subclasses provide ``_analyse`` and ``_suggest``; this class wires them
    together, times each stage and collects the per-stage results.
    """

    def __init__(self, ocr_engine = 'open-source/pymupdf4llm'):
        """Create the OCR engine and load the shared analysis prompt.

        Args:
            ocr_engine: ``'open-source/pymupdf4llm'`` or ``'azure/layout'``.

        Raises:
            ValueError: if ``ocr_engine`` is not one of the supported names.
        """
        if ocr_engine == 'open-source/pymupdf4llm':
            self.engine = PyMuPDF4LLMOCR()
        elif ocr_engine == 'azure/layout':
            self.engine = AzureDocumentIntelligenceOCR()
        else:
            # Fail fast: previously an unknown name left `self.engine` unset
            # and only surfaced later as an AttributeError during OCR.
            raise ValueError(f'Unsupported OCR engine : {ocr_engine}')
        self.file_type = 'pdf'
        with open(os.path.join(PROMPTS_DIR, 'analysis.txt'), 'r') as f:
            self.analysis_prompt = f.read()

    def __call__(self, file_bytes):
        """Run the three stages on a document and report each stage's output.

        Args:
            file_bytes: either already-extracted text (``str``), which skips
                the OCR engine, or raw document content (``bytes`` /
                ``bytearray``), which is passed to ``self.engine``.

        Returns:
            A list of three dicts, in order ``OCR``, ``ANALYSE``, ``SUGGEST``,
            each with keys ``stage``, ``response`` and ``time``. A stage that
            failed, produced nothing, or was never reached keeps
            ``response == ''`` and ``time == 0``. Exceptions are printed, not
            raised.
        """
        response = [
            {'stage' : stage, 'response' : '', 'time' : 0}
            for stage in ('OCR', 'ANALYSE', 'SUGGEST')
        ]

        # Stage 1 : OCR
        try:
            print('OCR Started ...')
            ocr_start = time()
            if isinstance(file_bytes, str):
                text = file_bytes
            elif isinstance(file_bytes, (bytearray, bytes)):
                # The engine returns a pair; only the first item (text) is used.
                text, _ = self.engine(file_bytes)
            else:
                # Explicit error instead of an accidental unbound-variable failure.
                raise TypeError(
                    f'Expected str, bytes or bytearray, got {type(file_bytes).__name__}'
                )
            ocr_end = time()
            print(f'OCR done [{ocr_end - ocr_start}]')
        except Exception as ocr_e:
            print(f'Exception while OCR : {ocr_e}')
            return response
        if not text:
            return response
        response[0].update({'response' : text, 'time' : ocr_end - ocr_start})

        # Stage 2 : analysis of the extracted text
        try:
            print('Analysing ...')
            analysis_start = time()
            raw_response = self._analyse(text = text)
            analysis_end = time()
            print('Analysis : ', raw_response)
            print(f'Analysed [{analysis_end - analysis_start}]')
        except Exception as analysis_e:
            print(f'Exception while analysing : {analysis_e}')
            return response
        if not raw_response:
            return response
        response[1].update({'response' : raw_response, 'time' : analysis_end - analysis_start})

        # Stage 3 : suggestion based on the analysis
        try:
            print('Suggesting our policy ...')
            suggestion_start = time()
            suggestion = self._suggest(analysis = raw_response)
            suggestion_end = time()
            print(f'Suggested [{suggestion_end - suggestion_start}]')
        except Exception as sugg_e:
            print(f'Exception while suggesting : {sugg_e}')
            return response
        if suggestion:
            response[2].update({'response' : suggestion, 'time' : suggestion_end - suggestion_start})
        return response

    def _analyse(self, **kwargs):
        """Analyse the extracted text (keyword ``text``); subclasses must override."""
        # NotImplemented is a comparison sentinel, not an exception; raising it
        # produces a confusing TypeError. NotImplementedError is the right type.
        raise NotImplementedError

    def _suggest(self, **kwargs):
        """Produce a suggestion from the analysis (keyword ``analysis``); subclasses must override."""
        raise NotImplementedError
class Health(LOB):
    """Health-insurance pipeline: analyses a policy and suggests a product."""

    # Substring looked for (case-insensitively) in factor names when deciding
    # whether a super top-up is the better suggestion.
    _SUM_INSURED = 'sum insured'
    _TOPUP_NOTE = '\nNote : Super Top up is an additional policy the customer can buy to enhance the benefits along with their current policy'

    def __init__(self, ocr_engine = 'open-source/pymupdf4llm'):
        """Load the health-specific prompts and the reference policy documents.

        Args:
            ocr_engine: forwarded unchanged to ``LOB.__init__``.
        """
        super().__init__(ocr_engine)
        health_prompts = os.path.join(PROMPTS_DIR, 'health')
        self.analysis_output_format = self._read_text(os.path.join(health_prompts, 'analysis_output_format.txt'))
        self.rules = self._read_text(os.path.join(health_prompts, 'rules.txt'))
        self.suggest_prompt = self._read_text(os.path.join(health_prompts, 'suggest.txt'))
        self.acko_policy = self._read_text(os.path.join(DATA_DIR, 'health_policy.md'))
        self.acko_super_topup = self._read_text(os.path.join(DATA_DIR, 'health_super_topup.md'))

    @staticmethod
    def _read_text(path):
        """Return the full contents of the text file at ``path``."""
        with open(path, 'r') as f:
            return f.read()

    @staticmethod
    def _extract_factors(analysis, tag, heading):
        """Parse the markdown table found between ``<tag>`` and ``</tag>``.

        ``heading`` is stripped from the section before it is handed to
        ``markdown_table_to_json``.
        """
        section = analysis.split(f'<{tag}>')[-1].split(f'</{tag}>')[0]
        return markdown_table_to_json(section.replace(heading, ''))

    def _mentions_sum_insured(self, factor_names):
        """Return True if any factor name contains 'sum insured' (case-insensitive)."""
        return any(self._SUM_INSURED in name.lower() for name in factor_names)

    def _analyse(self, **kwargs):
        """Build the analysis prompt for ``text`` and return the LLM response.

        Args:
            text (keyword): policy text to analyse.

        Returns:
            The response from ``call_openai``, or ``''`` when ``text`` or the
            response is empty.
        """
        text = kwargs.get('text')
        if not text:
            return ''
        prompt = (
            self.analysis_prompt
            .replace('{{lob}}', 'health')
            .replace('{{rules}}', self.rules)
            .replace('{{output_format}}', self.analysis_output_format)
        )
        # Read the clock once so day/month/year always come from the same instant.
        today = datetime.today()
        prompt += 'Policy : ' + text + f"\n\nConsider today's date as {today.day}/{today.month}/{today.year} for your analysis on waiting periods and dates"
        response = call_openai(prompt)
        return response if response else ''

    def _suggest(self, **kwargs):
        """Choose which product to pitch from the analysis and ask the LLM for a suggestion.

        The base policy is suggested when more than three bad or more than
        three average factors were found. Otherwise the super top-up is
        suggested when a short (< 3) bad or average list mentions the sum
        insured. In every remaining case the base policy is suggested.

        Args:
            analysis (keyword): analysis text containing ``<BAD>`` and
                ``<AVERAGE>`` sections.

        Returns:
            The response from ``call_openai``, or ``''`` when ``analysis`` or
            the response is empty.
        """
        analysis = kwargs.get('analysis')
        if not analysis:
            return ''

        bad_factors = self._extract_factors(analysis, 'BAD', '## Bad Factors')
        avg_factors = self._extract_factors(analysis, 'AVERAGE', '## Average Factors')
        bad_factor_names = [factor['Factor'] for factor in bad_factors]
        # Fixed: these names were previously read from `bad_factors`.
        avg_factor_names = [factor['Factor'] for factor in avg_factors]

        prompt_head = self.suggest_prompt + "\nCustomer Policy Analysis : " + analysis
        policy_prompt = prompt_head + "\nAcko's Policy : " + self.acko_policy + self._TOPUP_NOTE
        topup_prompt = prompt_head + "\nAcko's Super Top-up Policy : " + self.acko_super_topup

        many_issues = len(bad_factors) > 3 or len(avg_factors) > 3
        topup_fits = (
            (len(bad_factors) < 3 and self._mentions_sum_insured(bad_factor_names))
            or (len(avg_factors) < 3 and self._mentions_sum_insured(avg_factor_names))
        )

        if many_issues:
            prompt = policy_prompt
            print('selected policy')
        elif topup_fits:
            prompt = topup_prompt
            print('selected super topup')
        else:
            prompt = policy_prompt

        response = call_openai(prompt)
        return response if response else ''

    def __call__(self, file_bytes):
        """Run the shared OCR -> analyse -> suggest pipeline on ``file_bytes``."""
        return super().__call__(file_bytes)
if __name__ == '__main__':
    # CLI entry point : run the health pipeline on every path given on the
    # command line. PDFs are read as bytes (and OCR'd), .txt/.md files are
    # read as text and skip OCR.
    import sys

    from tqdm import tqdm

    filepaths = sys.argv[1:]
    health = Health()
    for filepath in tqdm(filepaths):
        print(filepath)
        # Context managers close each file promptly instead of leaking handles.
        if filepath.endswith('.pdf'):
            with open(filepath, 'rb') as f:
                file_bytes = f.read()
        elif filepath.endswith(('.txt', '.md')):
            with open(filepath) as f:
                file_bytes = f.read()
        else:
            # Previously an unsupported extension reused the previous file's
            # contents (or crashed with NameError on the first file).
            print(f'Skipping unsupported file type : {filepath}')
            continue
        # NOTE: results are currently only reported through the pipeline's own
        # prints; dumping them to '<basename>.analysis.json' was disabled.
        analysis = health(file_bytes)