Spaces:
Sleeping
Sleeping
File size: 8,753 Bytes
0106d5f bef8e94 0106d5f bef8e94 d960853 0106d5f d67de0b 0106d5f bef8e94 d960853 bef8e94 0106d5f bef8e94 0106d5f bef8e94 0106d5f bef8e94 d67de0b bef8e94 d67de0b bef8e94 0106d5f bef8e94 0106d5f bef8e94 d67de0b 5a53af9 7bfb2b2 5a53af9 7bfb2b2 5a53af9 7bfb2b2 5a53af9 7bfb2b2 555dd1d 5a53af9 bef8e94 0106d5f bef8e94 0106d5f bef8e94 0106d5f d67de0b 0106d5f bef8e94 0106d5f d67de0b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
"""
Run analysis
@author : Sakshi Tantak
"""
# Imports
import os
from time import time
from datetime import datetime
from policy_analyser import PROMPTS_DIR, DATA_DIR
from policy_analyser.ocr import PyMuPDF4LLMOCR, AzureDocumentIntelligenceOCR
from policy_analyser.llm import call_openai
from policy_analyser.utils import markdown_table_to_json
class LOB:
    """Base pipeline for one line of business: OCR -> analyse -> suggest.

    Subclasses provide the two LLM-backed steps by overriding ``_analyse``
    and ``_suggest``; this class owns OCR, timing and error isolation.
    """

    def __init__(self, ocr_engine='open-source/pymupdf4llm'):
        """Pick the OCR backend and load the shared analysis prompt.

        Args:
            ocr_engine: ``'open-source/pymupdf4llm'`` or ``'azure/layout'``.
                Any other value leaves the instance without an OCR engine;
                it can then only process input that is already text.
        """
        if ocr_engine == 'open-source/pymupdf4llm':
            self.engine = PyMuPDF4LLMOCR()
        elif ocr_engine == 'azure/layout':
            self.engine = AzureDocumentIntelligenceOCR()
        else:
            # Always define the attribute so a bad engine name yields a clear
            # error in __call__ instead of an AttributeError.
            self.engine = None
        self.file_type = 'pdf'
        with open(os.path.join(PROMPTS_DIR, 'analysis.txt'), 'r') as f:
            self.analysis_prompt = f.read()

    def __call__(self, file_bytes):
        """Run the three stages on one document.

        Args:
            file_bytes: Either already-extracted text (``str``) or the raw
                document content (``bytes`` / ``bytearray``) to be OCR'd.

        Returns:
            A list of three dicts, in order ``OCR``, ``ANALYSE``, ``SUGGEST``,
            each with keys ``stage``, ``response`` and ``time`` (elapsed
            seconds). A stage that failed, was skipped, or produced nothing
            keeps ``response == ''`` and ``time == 0``. Exceptions are printed,
            never raised; a failing stage also skips the stages after it.
        """
        response = [
            {'stage': stage, 'response': '', 'time': 0}
            for stage in ('OCR', 'ANALYSE', 'SUGGEST')
        ]
        try:
            print('OCR Started ...')
            ocr_start = time()
            if isinstance(file_bytes, str):
                # Text input needs no OCR.
                text = file_bytes
            elif isinstance(file_bytes, (bytearray, bytes)):
                engine = getattr(self, 'engine', None)
                if engine is None:
                    raise RuntimeError('no OCR engine configured for binary input')
                text, _ = engine(file_bytes)
            else:
                # Previously this fell through and failed on an unbound local.
                raise TypeError(
                    f'unsupported input type : {type(file_bytes).__name__}'
                )
            ocr_end = time()
            print(f'OCR done [{ocr_end - ocr_start}]')
            if text:
                response[0].update({'response': text, 'time': ocr_end - ocr_start})
                try:
                    print('Analysing ...')
                    analysis_start = time()
                    raw_response = self._analyse(text=text)
                    analysis_end = time()
                    print('Analysis : ', raw_response)
                    print(f'Analysed [{analysis_end - analysis_start}]')
                    if raw_response:
                        response[1].update({
                            'response': raw_response,
                            'time': analysis_end - analysis_start,
                        })
                        try:
                            print('Suggesting our policy ...')
                            suggestion_start = time()
                            suggestion = self._suggest(analysis=raw_response)
                            suggestion_end = time()
                            print(f'Suggested [{suggestion_end - suggestion_start}]')
                            if suggestion:
                                response[2].update({
                                    'response': suggestion,
                                    'time': suggestion_end - suggestion_start,
                                })
                        except Exception as sugg_e:
                            print(f'Exception while suggesting : {sugg_e}')
                except Exception as analysis_e:
                    print(f'Exception while analysing : {analysis_e}')
        except Exception as ocr_e:
            print(f'Exception while OCR : {ocr_e}')
        return response

    def _analyse(self, **kwargs):
        """Analyse the policy text passed as ``text``; subclasses must override."""
        # NotImplemented is a comparison sentinel, not an exception; raising
        # it produces a TypeError.
        raise NotImplementedError

    def _suggest(self, **kwargs):
        """Suggest a policy from ``analysis``; subclasses must override."""
        raise NotImplementedError
class Health(LOB):
    """Health-insurance pipeline: analyses a policy and suggests an Acko product."""

    # Appended to the prompt whenever the full health policy is offered.
    _SUPER_TOPUP_NOTE = '\nNote : Super Top up is an additional policy the customer can buy to enhance the benefits along with their current policy'

    def __init__(self, ocr_engine='open-source/pymupdf4llm'):
        """Load the health-specific prompts and the two Acko policy documents.

        Args:
            ocr_engine: Forwarded unchanged to ``LOB.__init__``.
        """
        super().__init__(ocr_engine)
        with open(os.path.join(PROMPTS_DIR, 'health', 'analysis_output_format.txt'), 'r') as f:
            self.analysis_output_format = f.read()
        with open(os.path.join(PROMPTS_DIR, 'health', 'rules.txt'), 'r') as f:
            self.rules = f.read()
        with open(os.path.join(PROMPTS_DIR, 'health', 'suggest.txt'), 'r') as f:
            self.suggest_prompt = f.read()
        with open(os.path.join(DATA_DIR, 'health_policy.md'), 'r') as f:
            self.acko_policy = f.read()
        with open(os.path.join(DATA_DIR, 'health_super_topup.md'), 'r') as f:
            self.acko_super_topup = f.read()

    def _analyse(self, **kwargs):
        """Ask the LLM to analyse the policy text.

        Args:
            text: Keyword argument holding the policy text.

        Returns:
            The LLM response, or ``''`` when the text or the response is
            empty or missing.
        """
        text = kwargs.get('text')
        if not text:
            return ''
        prompt = (
            self.analysis_prompt
            .replace('{{lob}}', 'health')
            .replace('{{rules}}', self.rules)
            .replace('{{output_format}}', self.analysis_output_format)
        )
        # Read the clock once so day/month/year cannot straddle midnight.
        today = datetime.today()
        prompt += 'Policy : ' + text + f"\n\nConsider today's date as {today.day}/{today.month}/{today.year} for your analysis on waiting periods and dates"
        response = call_openai(prompt)
        return response if response else ''

    @staticmethod
    def _parse_factors(analysis, tag, heading):
        """Return the factor rows found between ``<tag>`` and ``</tag>``.

        If the tags are absent the split leaves the text unchanged, so the
        whole analysis is handed to the table parser, as before.
        """
        section = analysis.split(f'<{tag}>')[-1].split(f'</{tag}>')[0]
        return markdown_table_to_json(section.replace(heading, ''))

    @staticmethod
    def _mentions_sum_insured(factors):
        """Tell whether any row's ``Factor`` name contains 'sum insured'."""
        return any('sum insured' in factor['Factor'].lower() for factor in factors)

    def _suggest(self, **kwargs):
        """Ask the LLM to pitch either the full policy or the super top-up.

        The super top-up is offered only when neither table has more than
        three rows and a table with fewer than three rows flags the sum
        insured; every other case gets the full policy.

        Args:
            analysis: Keyword argument holding the output of ``_analyse``.

        Returns:
            The LLM response, or ``''`` when the analysis or the response is
            empty or missing.
        """
        analysis = kwargs.get('analysis')
        if not analysis:
            return ''
        bad_factors = self._parse_factors(analysis, 'BAD', '## Bad Factors')
        avg_factors = self._parse_factors(analysis, 'AVERAGE', '## Average Factors')

        base_prompt = self.suggest_prompt + "\nCustomer Policy Analysis : " + analysis
        if len(bad_factors) > 3 or len(avg_factors) > 3:
            prompt = base_prompt + "\nAcko's Policy : " + self.acko_policy + self._SUPER_TOPUP_NOTE
            print('selected policy')
        elif (
            (len(bad_factors) < 3 and self._mentions_sum_insured(bad_factors))
            # Fixed: the average-factor names used to be read from bad_factors.
            or (len(avg_factors) < 3 and self._mentions_sum_insured(avg_factors))
        ):
            prompt = base_prompt + "\nAcko's Super Top-up Policy : " + self.acko_super_topup
            print('selected super topup')
        else:
            prompt = base_prompt + "\nAcko's Policy : " + self.acko_policy + self._SUPER_TOPUP_NOTE
        response = call_openai(prompt)
        return response if response else ''

    def __call__(self, file_bytes):
        """Run the OCR -> analyse -> suggest pipeline; see ``LOB.__call__``."""
        return super().__call__(file_bytes)
if __name__ == '__main__':
    # Command-line run: analyse every path given on the command line.
    # '.pdf' files are read as bytes (and OCR'd by the pipeline); '.txt' and
    # '.md' files are read as text.
    import sys

    from tqdm import tqdm

    filepaths = sys.argv[1:]
    health = Health()
    for filepath in tqdm(filepaths):
        print(filepath)
        if filepath.endswith('.pdf'):
            with open(filepath, 'rb') as f:
                file_bytes = f.read()
        elif filepath.endswith(('.txt', '.md')):
            with open(filepath) as f:
                file_bytes = f.read()
        else:
            # Previously this raised NameError on the first file, or silently
            # re-analysed the previous file's content.
            print(f'Skipping unsupported file type : {filepath}')
            continue
        analysis = health(file_bytes)
        # NOTE: results are not persisted. The dump below was already disabled
        # in the original script; re-enabling it needs `import json`.
        # basepath = os.path.splitext(filepath)[0]
        # with open(basepath + '.o1-mini.e2e-analysis.json', 'w') as f:
        #     json.dump(analysis, f, indent = 4)
        # with open(basepath + '.o1-mini.e2e-analysis.md', 'w') as f:
        #     f.write(analysis[1]['response'])
|