Spaces:
Running
Running
""" | |
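Run the policy analysis pipeline: OCR, then either rule-based extraction,
analysis and summarisation, or a single end-to-end LLM analysis.
@author : Sakshi Tantak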
""" | |
# Imports
from time import time
from datetime import datetime

from policy_analyser import ACKO_POLICY, ANALYSIS_PROMPT, SUGGESTION_PROMPT
from policy_analyser.ocr import PyMuPDF4LLMOCR
from policy_analyser.extraction import extract
from policy_analyser.rules import prepare_payload, rules
from policy_analyser.llm import call_openai
# Module-level OCR engine, created once at import time and reused by every
# analyse() call. It is called as OCR(file_bytes) and its result is unpacked
# as a 2-tuple whose first item is the document text.
# An alternative engine is kept below, disabled:
# OCR = AzureLayoutOCR()
OCR = PyMuPDF4LLMOCR()
def _stage_result(stage, response, elapsed):
    """Build one entry of the per-stage result list returned by analyse()."""
    return {
        'stage' : stage,
        'response' : response,
        'time' : elapsed
    }


def _summarise_verdicts(analysis):
    """Ask the LLM for one customer-facing summary per verdict bucket.

    `analysis` is iterated as a collection of dicts, reading each one's
    'verdict' and 'reason' keys. A bucket with no matching reasons gets an
    empty string and no LLM call is made for it.
    """
    summary = {}
    for verdict in ['Good', 'Average', 'Bad']:
        descriptions = '\n'.join(
            [factor['reason'] for factor in analysis if factor['verdict'] == verdict]
        )
        if descriptions:
            prompt = (
                f"Given the following analysis on the {verdict} factors of a customer's policy that they have bought, "
                "generate a crisp and catchy summary of the factors for a customer. "
                "Try to make it factor-wise with bullet points\n"
                "NOTE : THE POLICY WAS NOT SOLD BY US\n"
                f"analysis : {descriptions}\n"
                "summary : "
            )
            response = call_openai(prompt)
            print(response)
        else:
            response = ''
        summary[verdict] = response
    return summary


def _run_rule_based(text, ocr_time):
    """Run extraction -> payload preparation -> rules -> summary on `text`.

    Returns the five-stage result list. When extraction yields no entities
    an all-empty skeleton with zeroed timings is returned instead.
    """
    print('Extraction Started ...')
    ext_start = time()
    raw_response, entities = extract(text)
    ext_end = time()
    print(f'Extraction done [{ext_end - ext_start}]')

    if len(entities) == 0:
        # NOTE: this skeleton deliberately mirrors the original output, which
        # reports zeroed timings (including OCR) and drops the raw extraction
        # response when nothing was extracted.
        return [
            _stage_result('OCR', text, 0),
            _stage_result('EXTRACTION', {'raw' : '', 'processed' : []}, 0),
            _stage_result('POST_PROCESS', {}, 0),
            _stage_result('ANALYSE', [], 0),
            _stage_result('ANALYSIS_SUMMARY', {'Good' : '', 'Average' : '', 'Bad' : ''}, 0),
        ]

    print('Preparing payload for analysis ...')
    payload = prepare_payload(entities)
    print('Payload prepared for analysis')

    print('Analysing ...')
    analysis_start = time()
    analysis = rules(payload)
    analysis_end = time()
    print(f'Analysed [{analysis_end - analysis_start}]')

    print('Summarising ...')
    summary_start = time()
    summary = _summarise_verdicts(analysis)
    summary_end = time()

    # NOTE: a SUGGEST stage (comparison against ACKO_POLICY) is currently
    # disabled for the rule-based path; only the end-to-end path emits it.
    return [
        _stage_result('OCR', text, ocr_time),
        _stage_result(
            'EXTRACTION',
            {'raw' : raw_response, 'processed' : entities},
            ext_end - ext_start
        ),
        # Payload preparation is not timed; it is reported as 0.
        _stage_result('POST_PROCESS', payload, 0),
        _stage_result('ANALYSE', analysis, analysis_end - analysis_start),
        _stage_result('ANALYSIS_SUMMARY', summary, summary_end - summary_start),
    ]


def _run_end2end(text, ocr_time):
    """Analyse `text` with a single LLM prompt, then ask for a suggestion.

    Returns [OCR, ANALYSE, SUGGEST] stage entries. If the analysis call
    returns None or anything raises, the stages that were not produced are
    padded with empty placeholders (response '' and time 0). A suggestion
    call that returns None without raising simply omits the SUGGEST stage.
    """
    response = [_stage_result('OCR', text, ocr_time)]
    try:
        print('Analysing ...')
        # Read the clock once so day/month/year always describe the same date.
        today = datetime.today()
        analysis_start = time()
        raw_response = call_openai(
            ANALYSIS_PROMPT + 'Policy : ' + text
            + f"\n\nConsider today's date as {today.day}/{today.month}/{today.year} for your analysis on waiting periods and dates"
        )
        analysis_end = time()
        print('Analysis : ', raw_response)
        print(f'Analysed [{analysis_end - analysis_start}]')

        # Without an analysis there is nothing to base a suggestion on;
        # fall through to the placeholder padding below.
        if raw_response is not None:
            response.append(
                _stage_result('ANALYSE', raw_response, analysis_end - analysis_start)
            )
            print('Suggesting our policy ...')
            suggestion_start = time()
            suggestion = call_openai(
                SUGGESTION_PROMPT + "\nCustomer Policy Analysis : " + raw_response
                + "\nAcko's Policy : " + ACKO_POLICY
            )
            suggestion_end = time()
            print(f'Suggested [{suggestion_end - suggestion_start}]')
            if suggestion is not None:
                response.append(
                    _stage_result('SUGGEST', suggestion, suggestion_end - suggestion_start)
                )
            return response
    except Exception as e:
        # Best-effort boundary: an LLM failure must not abort the caller, so
        # report it and return placeholders for the stages that are missing.
        print(e)

    # Pad only the stages that were not produced, so a successful ANALYSE
    # entry is never followed by a duplicate empty one.
    produced = {entry['stage'] for entry in response}
    for stage in ('ANALYSE', 'SUGGEST'):
        if stage not in produced:
            response.append(_stage_result(stage, '', 0))
    return response


def analyse(file_bytes, end2end = False):
    """Analyse an insurance policy document and return per-stage results.

    Parameters
    ----------
    file_bytes : str | bytes | bytearray
        Either the policy text itself (str, OCR is skipped) or the raw
        document bytes, which are passed to the module-level OCR engine.
    end2end : bool
        False (default): rule-based pipeline (extract -> prepare_payload ->
        rules -> LLM summary per verdict). True: a single LLM analysis prompt
        followed by a suggestion prompt.

    Returns
    -------
    list[dict] | None
        A list of {'stage', 'response', 'time'} dicts, one per pipeline
        stage, or None when the document text is empty.

    Raises
    ------
    TypeError
        If `file_bytes` is not a str, bytes or bytearray.
    """
    print('OCR Started ...')
    ocr_start = time()
    if isinstance(file_bytes, str):
        text = file_bytes
    elif isinstance(file_bytes, (bytearray, bytes)):
        # The OCR engine returns a pair; only the text part is used here.
        text, _ = OCR(file_bytes)
    else:
        raise TypeError(
            f'file_bytes must be str, bytes or bytearray, got {type(file_bytes).__name__}'
        )
    ocr_end = time()
    print(f'OCR done [{ocr_end - ocr_start}]')

    if len(text) == 0:
        # Nothing to analyse; callers receive None for empty documents.
        return None

    ocr_time = ocr_end - ocr_start
    if not end2end:
        return _run_rule_based(text, ocr_time)
    return _run_end2end(text, ocr_time)
if __name__ == '__main__':
    # Batch driver: analyse every path given on the command line and write
    # the results next to each input file.
    import os
    import json
    import sys
    from tqdm import tqdm

    filepaths = sys.argv[1:]
    # Pipeline mode used for every file in this run.
    end2end = True

    for filepath in tqdm(filepaths):
        # Skip files that are themselves outputs of a previous run.
        if '.analysis' in filepath or '.e2e-analysis' in filepath:
            continue
        print(filepath)

        if filepath.endswith('.pdf'):
            with open(filepath, 'rb') as f:
                file_bytes = f.read()
        elif filepath.endswith(('.txt', '.md')):
            with open(filepath) as f:
                file_bytes = f.read()
        else:
            # Without this guard an unsupported file would silently reuse
            # the previous file's contents (or fail on the first iteration).
            print(f'Skipping unsupported file type : {filepath}')
            continue

        analysis = analyse(file_bytes, end2end)
        if analysis is None:
            # analyse() returns None when the document has no text.
            print(f'No text found, skipping : {filepath}')
            continue

        basepath = os.path.splitext(filepath)[0]
        if not end2end:
            with open(basepath + '.analysis.json', 'w') as f:
                json.dump(analysis, f, indent = 4)
        else:
            with open(basepath + '.e2e-analysis.json', 'w') as f:
                json.dump(analysis, f, indent = 4)
            # The markdown file carries the ANALYSE stage's response; look
            # the stage up by name rather than assuming it sits at index 1.
            analyse_stage = next(
                (entry for entry in analysis if entry['stage'] == 'ANALYSE'),
                None
            )
            if analyse_stage is not None:
                with open(basepath + '.e2e-analysis.md', 'w') as f:
                    f.write(analyse_stage['response'])