""" Run analysis @author : Sakshi Tantak """ # Imports from time import time from datetime import datetime from policy_analyser import ACKO_POLICY, ANALYSIS_PROMPT, SUGGESTION_PROMPT from policy_analyser.ocr import PyMuPDF4LLMOCR from policy_analyser.extraction import extract from policy_analyser.rules import prepare_payload, rules from policy_analyser.llm import call_openai # OCR = AzureLayoutOCR() OCR = PyMuPDF4LLMOCR() def analyse(file_bytes, end2end = False): print('OCR Started ...') ocr_start = time() if isinstance(file_bytes, str): text = file_bytes elif isinstance(file_bytes, (bytearray, bytes)): text, _ = OCR(file_bytes) ocr_end = time() print(f'OCR done [{ocr_end - ocr_start}]') if len(text) > 0: if not end2end: print('Extraction Started ...') ext_start = time() raw_response, entities = extract(text) ext_end = time() print(f'Extraction done [{ext_end - ext_start}]') if len(entities) > 0: print('Preparing payload for analysis ...') payload = prepare_payload(entities) print('Payload prepared for analysis') print('Analysing ...') analysis_start = time() analysis = rules(payload) analysis_end = time() print(f'Analysed [{analysis_end - analysis_start}]') print('Summarising ...') summary = {} summary_start = time() for verdict in ['Good', 'Average', 'Bad']: descriptions = '\n'.join([factor['reason'] for factor in analysis if factor['verdict'] == verdict]) if len(descriptions) > 0: prompt = f"""Given the following analysis on the {verdict} factors of a customer's policy that they have bought, generate a crisp and catchy summary of the factors for a customer. Try to make it factor-wise with bullet points NOTE : THE POLICY WAS NOT SOLD BY US analysis : {descriptions} summary : """ response = call_openai(prompt) print(response) else: response = '' summary[verdict] = response summary_end = time() # print(f'Summarised [{summary_end - summary_start}]') # factors_str = '' # for verdict in ['Good', 'Average', 'Bad']: # factors_str += verdict + ' Factors:' # factors_str += '\n' + '\n'.join([f"{factor['factor']}: {factor['reason']}" for factor in analysis if factor['verdict'] == verdict]) # print('Suggesting ...') # suggestion_start = time() # suggestion = call_openai(f"""Given the following main factors and their values of a customer's health insurance policy, use these factors to compare with given Acko's health policy and suggest to the customer how the Average and Bad factors maybe covered better by Acko's policy. # Format response in less than 50 words and make it factor-wise. Try to format in points. Include emojis to make it catchy. # Customer Poliocy Factors: # {factors_str} # Acko Policy : {ACKO_POLICY} # Customer Suggestion : """) # suggestion_end = time() # print(f'Suggested [{suggestion_end - suggestion_start}]') response = [ { 'stage' : 'OCR', 'response' : text, 'time' : ocr_end - ocr_start }, { 'stage' : 'EXTRACTION', 'response' : { 'raw' : raw_response, 'processed' : entities }, 'time' : ext_end - ext_start }, { 'stage' : 'POST_PROCESS', 'response' : payload, 'time' : 0 }, { 'stage' : 'ANALYSE', 'response' : analysis, 'time' : analysis_end - analysis_start }, { 'stage' : 'ANALYSIS_SUMMARY', 'response' : summary, 'time' : summary_end - summary_start }, # { # 'stage' : 'SUGGEST', # 'response' : suggestion, # 'time' : suggestion_end - suggestion_start # } ] return response response = [ { 'stage' : 'OCR', 'response' : text, 'time' : 0 }, { 'stage' : 'EXTRACTION', 'response' : { 'raw' : '', 'processed' : [] }, 'time' : 0 }, { 'stage' : 'POST_PROCESS', 'response' : {}, 'time' : 0 }, { 'stage' : 'ANALYSE', 'response' : [], 'time' : 0 }, { 'stage' : 'ANALYSIS_SUMMARY', 'response' : {'Good' : '', 'Average' : '', 'Bad' : ''}, 'time' : 0 }, # { # 'stage' : 'SUGGEST', # 'response' : '', # 'time' : 0 # } ] return response else: response = [ { 'stage' : 'OCR', 'response' : text, 'time' : ocr_end - ocr_start } ] try: print('Analysing ...') analysis_start = time() raw_response = call_openai(ANALYSIS_PROMPT + 'Policy : ' + text + f"\n\nConsider today's date as {datetime.today().day}/{datetime.today().month}/{datetime.today().year} for your analysis on waiting periods and dates") analysis_end = time() print('Analysis : ', raw_response) print(f'Analysed [{analysis_end - analysis_start}]') if raw_response is not None: response.append( { 'stage' : 'ANALYSE', 'response' : raw_response, 'time' : analysis_end - analysis_start } ) print('Suggesting our policy ...') suggestion_start = time() suggestion = call_openai(SUGGESTION_PROMPT + "\nCustomer Policy Analysis : " + raw_response + "\nAcko's Policy : " + ACKO_POLICY) suggestion_end = time() print(f'Suggested [{suggestion_end - suggestion_start}]') if suggestion is not None: response.append({ 'stage' : 'SUGGEST', 'response' : suggestion, 'time' : suggestion_end - suggestion_start } ) return response except Exception as e: print(e) response.extend( [ { 'stage' : 'ANALYSE', 'response' : '', 'time' : 0 }, { 'stage' : 'SUGGEST', 'response' : '', 'time' : 0 } ] ) return response if __name__ == '__main__': import os import json import sys from tqdm import tqdm filepaths = sys.argv[1:] for filepath in tqdm(filepaths): # if os.path.isfile(filepath.replace('.pdf', '.analysis.json')): # continue if '.analysis' in filepath or '.e2e-analysis' in filepath: continue print(filepath) if filepath.endswith('.pdf'): file_bytes = open(filepath, 'rb').read() elif filepath.endswith(('.txt', '.md')): file_bytes = open(filepath).read() end2end = True analysis = analyse(file_bytes, True) # print(analysis) basepath = os.path.splitext(filepath)[0] if not end2end: with open(os.path.splitext(filepath)[0] + '.analysis.json', 'w') as f: json.dump(analysis, f, indent = 4) else: with open(os.path.splitext(filepath)[0] + '.e2e-analysis.json', 'w') as f: json.dump(analysis, f, indent = 4) with open(os.path.splitext(filepath)[0] + '.e2e-analysis.md', 'w') as f: f.write(analysis[1]['response'])