""" Run analysis @author : Sakshi Tantak """ # Imports import os from time import time from datetime import datetime from policy_analyser import PROMPTS_DIR, DATA_DIR from policy_analyser.ocr import PyMuPDF4LLMOCR, AzureDocumentIntelligenceOCR from policy_analyser.llm import call_openai from policy_analyser.utils import markdown_table_to_json class LOB: def __init__(self, ocr_engine = 'open-source/pymupdf4llm'): if ocr_engine == 'open-source/pymupdf4llm': self.engine = PyMuPDF4LLMOCR() elif ocr_engine == 'azure/layout': self.engine = AzureDocumentIntelligenceOCR() self.file_type = 'pdf' with open(os.path.join(PROMPTS_DIR, 'analysis.txt'), 'r') as f: self.analysis_prompt = f.read() def __call__(self, file_bytes): response = [ { 'stage' : 'OCR', 'response' : '', 'time' : 0 }, { 'stage' : 'ANALYSE', 'response' : '', 'time' : 0 }, { 'stage' : 'SUGGEST', 'response' : '', 'time' : 0 } ] try: print('OCR Started ...') ocr_start = time() if isinstance(file_bytes, str): text = file_bytes elif isinstance(file_bytes, (bytearray, bytes)): text, _ = self.engine(file_bytes) ocr_end = time() print(f'OCR done [{ocr_end - ocr_start}]') if len(text) > 0: response[0].update({'response' : text, 'time' : ocr_end - ocr_start}) try: print('Analysing ...') analysis_start = time() raw_response = self._analyse(text = text) analysis_end = time() print('Analysis : ', raw_response) print(f'Analysed [{analysis_end - analysis_start}]') if raw_response is not None and len(raw_response) > 0: response[1].update({'response' : raw_response, 'time' : analysis_end - analysis_start}) try: print('Suggesting our policy ...') suggestion_start = time() suggestion = self._suggest(analysis = raw_response) suggestion_end = time() print(f'Suggested [{suggestion_end - suggestion_start}]') if suggestion is not None and len(suggestion) > 0: response[2].update({'response' : suggestion, 'time' : suggestion_end - suggestion_start}) except Exception as sugg_e: print(f'Exception while suggesting : {sugg_e}') except Exception as analysis_e: print(f'Exception while analysing : {analysis_e}') except Exception as ocr_e: print(f'Exception while OCR : {ocr_e}') return response def _analyse(self, **kwargs): raise NotImplemented def _suggest(self, **kwargs): raise NotImplemented class Health(LOB): def __init__(self, ocr_engine = 'open-source/pymupdf4llm'): super().__init__(ocr_engine) with open(os.path.join(PROMPTS_DIR, 'health', 'analysis_output_format.txt'), 'r') as f: self.analysis_output_format = f.read() with open(os.path.join(PROMPTS_DIR, 'health', 'rules.txt'), 'r') as f: self.rules = f.read() with open(os.path.join(PROMPTS_DIR, 'health', 'suggest.txt'), 'r') as f: self.suggest_prompt = f.read() with open(os.path.join(DATA_DIR, 'health_policy.md'), 'r') as f: self.acko_policy = f.read() with open(os.path.join(DATA_DIR, 'health_super_topup.md'), 'r') as f: self.acko_super_topup = f.read() def _analyse(self, **kwargs): text = kwargs.get('text') if len(text) > 0: prompt = self.analysis_prompt.replace('{{lob}}', 'health').replace('{{rules}}', self.rules).replace('{{output_format}}', self.analysis_output_format) prompt += 'Policy : ' + text + f"\n\nConsider today's date as {datetime.today().day}/{datetime.today().month}/{datetime.today().year} for your analysis on waiting periods and dates" response = call_openai(prompt) if len(response) > 0: return response return '' def _suggest(self, **kwargs): analysis = kwargs.get('analysis') if len(analysis) > 0: bad_factors = markdown_table_to_json(analysis.split(f'')[-1].split(f'')[0].replace('## Bad Factors', '')) bad_factor_names = [factor['Factor'] for factor in bad_factors] avg_factors = markdown_table_to_json(analysis.split(f'')[-1].split(f'')[0].replace('## Average Factors', '')) avg_factor_names = [factor['Factor'] for factor in bad_factors] if len(bad_factors) > 3: prompt = self.suggest_prompt + "\nCustomer Policy Analysis : " + analysis + "\nAcko's Policy : " + self.acko_policy + '\nNote : Super Top up is an additional policy the customer can buy to enhance the benefits along with their current policy' print('selected policy') elif len(avg_factors) > 3: prompt = self.suggest_prompt + "\nCustomer Policy Analysis : " + analysis + "\nAcko's Policy : " + self.acko_policy + '\nNote : Super Top up is an additional policy the customer can buy to enhance the benefits along with their current policy' print('selected policy') elif len(bad_factors) < 3 and any(['sum insured' in factor.lower() for factor in bad_factor_names]): prompt = self.suggest_prompt + "\nCustomer Policy Analysis : " + analysis + "\nAcko's Super Top-up Policy : " + self.acko_super_topup print('selected super topup') elif len(avg_factors) < 3 and any(['sum insured' in factor.lower() for factor in avg_factor_names]): prompt = self.suggest_prompt + "\nCustomer Policy Analysis : " + analysis + "\nAcko's Super Top-up Policy : " + self.acko_super_topup print('selected super topup') else: prompt = self.suggest_prompt + "\nCustomer Policy Analysis : " + analysis + "\nAcko's Policy : " + self.acko_policy + '\nNote : Super Top up is an additional policy the customer can buy to enhance the benefits along with their current policy' # if len(bad_factors) < 3 and any(['sum insured' in factor.lower() for factor in bad_factor_names]) \ # or len(avg_factors) < 3 and any(['sum insured' in factor.lower() for factor in avg_factor_names]): # prompt = self.suggest_prompt + "\nCustomer Policy Analysis : " + analysis + "\nAcko's Policy : " + self.acko_super_topup # else: # prompt = self.suggest_prompt + "\nCustomer Policy Analysis : " + analysis + "\nAcko's Super Top up Policy : " + self.acko_policy + '\nNote : Super Top up is an additional policy the customer can buy to enhance the benefits along with their current policy' response = call_openai(prompt) if len(response) > 0: return response return '' def __call__(self, file_bytes): return super().__call__(file_bytes) if __name__ == '__main__': import os import json import sys from tqdm import tqdm filepaths = sys.argv[1:] health = Health() for filepath in tqdm(filepaths): # if os.path.isfile(filepath.replace('.pdf', '.analysis.json')): # continue # if '.analysis' in filepath or '.e2e-analysis' in filepath: # continue print(filepath) if filepath.endswith('.pdf'): file_bytes = open(filepath, 'rb').read() elif filepath.endswith(('.txt', '.md')): file_bytes = open(filepath).read() end2end = True analysis = health(file_bytes) # print(analysis) basepath = os.path.splitext(filepath)[0] # if not end2end: # with open(os.path.splitext(filepath)[0] + '.analysis.json', 'w') as f: # json.dump(analysis, f, indent = 4) # else: # with open(os.path.splitext(filepath)[0] + '.o1-mini.e2e-analysis.json', 'w') as f: # json.dump(analysis, f, indent = 4) # with open(os.path.splitext(filepath)[0] + '.o1-mini.e2e-analysis.md', 'w') as f: # f.write(analysis[1]['response'])