# Sakshi
# arch lob agnostic
# bef8e94
# raw
# history blame
# 9.56 kB
"""
Run analysis
@author : Sakshi Tantak
"""
# Imports
from time import time
from datetime import datetime
from policy_analyser import ACKO_POLICY, ANALYSIS_PROMPT, SUGGESTION_PROMPT
from policy_analyser.ocr import PyMuPDF4LLMOCR
from policy_analyser.extraction import extract
from policy_analyser.rules import prepare_payload, rules
from policy_analyser.llm import call_openai
# Module-level OCR engine shared by every analyse() call. analyse() invokes it
# with the raw file bytes and keeps the first element of the returned pair as
# the document text.
# Alternative backend, currently disabled (AzureLayoutOCR is not imported above):
# OCR = AzureLayoutOCR()
OCR = PyMuPDF4LLMOCR()
# Verdict labels the summary step groups factors by, in reporting order.
_VERDICTS = ('Good', 'Average', 'Bad')


def _stage(name, response, elapsed=0):
    """Build one stage record in the shape returned by ``analyse``."""
    return {'stage': name, 'response': response, 'time': elapsed}


def _read_text(file_bytes):
    """Return the policy text: a ``str`` is used as-is, raw bytes go through ``OCR``."""
    if isinstance(file_bytes, str):
        return file_bytes
    if isinstance(file_bytes, (bytearray, bytes)):
        text, _ = OCR(file_bytes)
        return text
    # Previously this case fell through and crashed later with an
    # UnboundLocalError on ``text``; fail early with a clear message instead.
    raise TypeError(
        f'file_bytes must be str, bytes or bytearray, not {type(file_bytes).__name__}'
    )


def _summarise(analysis):
    """Ask the LLM for one customer-facing summary per verdict bucket."""
    summary = {}
    for verdict in _VERDICTS:
        descriptions = '\n'.join(
            factor['reason'] for factor in analysis if factor['verdict'] == verdict
        )
        if descriptions:
            prompt = (
                f"Given the following analysis on the {verdict} factors of a customer's policy that they have bought, generate a crisp and catchy summary of the factors for a customer. Try to make it factor-wise with bullet points\n"
                "NOTE : THE POLICY WAS NOT SOLD BY US\n"
                f"analysis : {descriptions}\n"
                "summary : "
            )
            verdict_summary = call_openai(prompt)
            print(verdict_summary)
        else:
            # No factor carries this verdict, so there is nothing to summarise.
            verdict_summary = ''
        summary[verdict] = verdict_summary
    return summary


def _rule_based_stages(text, ocr_elapsed):
    """Run extraction -> payload -> rules -> summary and return the stage list."""
    print('Extraction Started ...')
    ext_start = time()
    raw_response, entities = extract(text)
    ext_elapsed = time() - ext_start
    print(f'Extraction done [{ext_elapsed}]')

    if not entities:
        # Nothing was extracted: return every stage with empty placeholders so
        # consumers still see the full stage list. Timings are reported as 0
        # here, matching the existing behaviour of this fallback.
        return [
            _stage('OCR', text),
            _stage('EXTRACTION', {'raw': '', 'processed': []}),
            _stage('POST_PROCESS', {}),
            _stage('ANALYSE', []),
            _stage('ANALYSIS_SUMMARY', {verdict: '' for verdict in _VERDICTS}),
        ]

    print('Preparing payload for analysis ...')
    payload = prepare_payload(entities)
    print('Payload prepared for analysis')

    print('Analysing ...')
    analysis_start = time()
    analysis = rules(payload)
    analysis_elapsed = time() - analysis_start
    print(f'Analysed [{analysis_elapsed}]')

    print('Summarising ...')
    summary_start = time()
    summary = _summarise(analysis)
    summary_elapsed = time() - summary_start

    # NOTE: the SUGGEST stage is intentionally not produced by this pipeline;
    # only the end-to-end pipeline emits it.
    return [
        _stage('OCR', text, ocr_elapsed),
        _stage('EXTRACTION', {'raw': raw_response, 'processed': entities}, ext_elapsed),
        _stage('POST_PROCESS', payload, 0),  # payload preparation is not timed
        _stage('ANALYSE', analysis, analysis_elapsed),
        _stage('ANALYSIS_SUMMARY', summary, summary_elapsed),
    ]


def _end_to_end_stages(text, ocr_elapsed):
    """Run the prompt-only pipeline (analysis, then suggestion) and return the stage list."""
    stages = [_stage('OCR', text, ocr_elapsed)]
    try:
        print('Analysing ...')
        # Read the clock once so day/month/year always describe the same date.
        today = datetime.today()
        analysis_start = time()
        raw_response = call_openai(
            ANALYSIS_PROMPT + 'Policy : ' + text
            + f"\n\nConsider today's date as {today.day}/{today.month}/{today.year} for your analysis on waiting periods and dates"
        )
        analysis_elapsed = time() - analysis_start
        print('Analysis : ', raw_response)
        print(f'Analysed [{analysis_elapsed}]')
        if raw_response is None:
            # Without an analysis there is nothing to base a suggestion on;
            # route to the placeholder fallback below.
            raise ValueError('analysis call returned no response')
        stages.append(_stage('ANALYSE', raw_response, analysis_elapsed))

        print('Suggesting our policy ...')
        suggestion_start = time()
        suggestion = call_openai(
            SUGGESTION_PROMPT + "\nCustomer Policy Analysis : " + raw_response
            + "\nAcko's Policy : " + ACKO_POLICY
        )
        suggestion_elapsed = time() - suggestion_start
        print(f'Suggested [{suggestion_elapsed}]')
        if suggestion is not None:
            stages.append(_stage('SUGGEST', suggestion, suggestion_elapsed))
    except Exception as e:
        # Best-effort boundary: report the failure and pad the result with
        # empty placeholders. Only stages that are still missing are added, so
        # a failure during the suggestion step no longer yields a second,
        # empty ANALYSE entry after the real one.
        print(e)
        recorded = {stage['stage'] for stage in stages}
        stages.extend(
            _stage(name, '', 0)
            for name in ('ANALYSE', 'SUGGEST')
            if name not in recorded
        )
    return stages


def analyse(file_bytes, end2end = False):
    """Analyse an insurance policy document and return per-stage results.

    Args:
        file_bytes: Either the policy text itself (``str``) or the raw
            document bytes (``bytes`` / ``bytearray``), which are passed to
            the module-level ``OCR`` engine.
        end2end: When false (default), run the rule-based pipeline
            (extraction, payload preparation, rules, LLM summary). When true,
            run the prompt-only pipeline (LLM analysis, then LLM suggestion).

    Returns:
        A list of ``{'stage', 'response', 'time'}`` dicts, one per pipeline
        stage, or ``None`` when the document text is empty.

    Raises:
        TypeError: If ``file_bytes`` is not ``str``, ``bytes`` or ``bytearray``.
    """
    print('OCR Started ...')
    ocr_start = time()
    text = _read_text(file_bytes)
    ocr_elapsed = time() - ocr_start
    print(f'OCR done [{ocr_elapsed}]')

    if not text:
        # NOTE(review): empty text has always produced None (the original fell
        # off the end of the function); kept for backward compatibility, so
        # callers must handle a None result.
        return None

    if end2end:
        return _end_to_end_stages(text, ocr_elapsed)
    return _rule_based_stages(text, ocr_elapsed)
# Command-line entry point: analyse every path given on the command line and
# write the results next to each input file.
if __name__ == '__main__':
    import os
    import json
    import sys
    from tqdm import tqdm

    # Pipeline selector for this batch run; flip to False for the rule-based
    # pipeline (outputs then go to '<name>.analysis.json').
    end2end = True

    for filepath in tqdm(sys.argv[1:]):
        # Skip files this script produced on an earlier run.
        if '.analysis' in filepath or '.e2e-analysis' in filepath:
            continue
        print(filepath)

        if filepath.endswith('.pdf'):
            with open(filepath, 'rb') as f:
                file_bytes = f.read()
        elif filepath.endswith(('.txt', '.md')):
            with open(filepath, encoding='utf-8') as f:
                file_bytes = f.read()
        else:
            # Previously an unsupported extension either crashed (first file)
            # or silently re-analysed the previous file's content.
            print(f'Skipping unsupported file type: {filepath}')
            continue

        analysis = analyse(file_bytes, end2end)
        basepath = os.path.splitext(filepath)[0]

        if not end2end:
            with open(basepath + '.analysis.json', 'w', encoding='utf-8') as f:
                json.dump(analysis, f, indent = 4)
            continue

        with open(basepath + '.e2e-analysis.json', 'w', encoding='utf-8') as f:
            json.dump(analysis, f, indent = 4)

        # analyse() returns None for empty text and may omit stages, so look
        # the ANALYSE stage up by name instead of indexing position 1 blindly.
        analysis_text = next(
            (stage['response'] for stage in analysis or [] if stage['stage'] == 'ANALYSE'),
            None,
        )
        if analysis_text is None:
            print(f'No analysis produced for {filepath}; skipping markdown output')
            continue
        with open(basepath + '.e2e-analysis.md', 'w', encoding='utf-8') as f:
            f.write(analysis_text)