Spaces:

pikaduck
/

policy-analyser

Sleeping

policy-analyser / policy_analyser /analyse.py

Sakshi

default prompt for suggest added

555dd1d 4 months ago

8.75 kB

	"""
	Run policy analysis : OCR a policy document, analyse it with an LLM and suggest a policy
	@author : Sakshi Tantak
	"""

	# Imports
	import os
	from time import time
	from datetime import datetime

	from policy_analyser import PROMPTS_DIR, DATA_DIR
	from policy_analyser.ocr import PyMuPDF4LLMOCR, AzureDocumentIntelligenceOCR
	from policy_analyser.llm import call_openai
	from policy_analyser.utils import markdown_table_to_json

	class LOB:
	    """
	    Base three-stage pipeline : OCR -> ANALYSE -> SUGGEST.

	    Subclasses provide `_analyse` and `_suggest`; calling an instance runs
	    the stages in order and reports each stage's output and duration.
	    NOTE(review): "LOB" presumably stands for "line of business" - confirm.
	    """

	    def __init__(self, ocr_engine = 'open-source/pymupdf4llm'):
	        """
	        Select the OCR backend and load the shared analysis prompt.

	        Args:
	            ocr_engine: 'open-source/pymupdf4llm' or 'azure/layout'. Any
	                other value leaves the instance without an OCR engine, so
	                it can only process input that is already text.
	        """
	        if ocr_engine == 'open-source/pymupdf4llm':
	            self.engine = PyMuPDF4LLMOCR()
	        elif ocr_engine == 'azure/layout':
	            self.engine = AzureDocumentIntelligenceOCR()
	        else:
	            # Keep the attribute defined so a bad engine name surfaces as a
	            # clear message in __call__ instead of an AttributeError.
	            self.engine = None
	        self.file_type = 'pdf'
	        with open(os.path.join(PROMPTS_DIR, 'analysis.txt'), 'r') as f:
	            self.analysis_prompt = f.read()

	    def __call__(self, file_bytes):
	        """
	        Run OCR, analysis and suggestion on one document.

	        Args:
	            file_bytes: already-extracted text (str), or raw document
	                content (bytes / bytearray) to be passed to the OCR engine.

	        Returns:
	            A list of three dicts, one per stage ('OCR', 'ANALYSE',
	            'SUGGEST'), each with keys 'stage', 'response' and 'time'.
	            A stage that fails or yields nothing keeps its defaults
	            ('' and 0) and every later stage is skipped. Failures are
	            printed, never raised.
	        """
	        response = [
	            {'stage' : stage, 'response' : '', 'time' : 0}
	            for stage in ('OCR', 'ANALYSE', 'SUGGEST')
	        ]

	        # Stage 1 : OCR
	        try:
	            print('OCR Started ...')
	            ocr_start = time()
	            if isinstance(file_bytes, str):
	                text = file_bytes
	            elif isinstance(file_bytes, (bytearray, bytes)):
	                engine = getattr(self, 'engine', None)
	                if engine is None:
	                    raise ValueError('no OCR engine configured for binary input')
	                text, _ = engine(file_bytes)
	            else:
	                raise TypeError(
	                    f'unsupported input type {type(file_bytes).__name__}; '
	                    'expected str, bytes or bytearray'
	                )
	            ocr_end = time()
	            print(f'OCR done [{ocr_end - ocr_start}]')
	        except Exception as ocr_e:
	            print(f'Exception while OCR : {ocr_e}')
	            return response
	        if not text:
	            return response
	        response[0].update({'response' : text, 'time' : ocr_end - ocr_start})

	        # Stage 2 : analysis
	        try:
	            print('Analysing ...')
	            analysis_start = time()
	            raw_response = self._analyse(text = text)
	            analysis_end = time()
	            print('Analysis : ', raw_response)
	            print(f'Analysed [{analysis_end - analysis_start}]')
	        except Exception as analysis_e:
	            print(f'Exception while analysing : {analysis_e}')
	            return response
	        if not raw_response:
	            return response
	        response[1].update({'response' : raw_response, 'time' : analysis_end - analysis_start})

	        # Stage 3 : suggestion
	        try:
	            print('Suggesting our policy ...')
	            suggestion_start = time()
	            suggestion = self._suggest(analysis = raw_response)
	            suggestion_end = time()
	            print(f'Suggested [{suggestion_end - suggestion_start}]')
	        except Exception as sugg_e:
	            print(f'Exception while suggesting : {sugg_e}')
	            return response
	        if suggestion:
	            response[2].update({'response' : suggestion, 'time' : suggestion_end - suggestion_start})
	        return response

	    def _analyse(self, **kwargs):
	        """Analyse the OCR text (passed as `text`); subclasses must override."""
	        # NotImplemented is a comparison sentinel, not an exception class.
	        raise NotImplementedError

	    def _suggest(self, **kwargs):
	        """Suggest a policy from the analysis (passed as `analysis`); subclasses must override."""
	        raise NotImplementedError

	class Health(LOB):
	    """
	    Health-insurance pipeline : analyses a customer's policy with an LLM and
	    pitches either Acko's health policy or its super top-up.
	    """

	    # Appended to the prompt whenever the base policy (not the top-up) is pitched.
	    _SUPER_TOPUP_NOTE = '\nNote : Super Top up is an additional policy the customer can buy to enhance the benefits along with their current policy'

	    def __init__(self, ocr_engine = 'open-source/pymupdf4llm'):
	        """
	        Load the health prompts and Acko policy documents.

	        Args:
	            ocr_engine: forwarded unchanged to `LOB.__init__`.
	        """
	        super().__init__(ocr_engine)
	        with open(os.path.join(PROMPTS_DIR, 'health', 'analysis_output_format.txt'), 'r') as f:
	            self.analysis_output_format = f.read()
	        with open(os.path.join(PROMPTS_DIR, 'health', 'rules.txt'), 'r') as f:
	            self.rules = f.read()
	        with open(os.path.join(PROMPTS_DIR, 'health', 'suggest.txt'), 'r') as f:
	            self.suggest_prompt = f.read()
	        with open(os.path.join(DATA_DIR, 'health_policy.md'), 'r') as f:
	            self.acko_policy = f.read()
	        with open(os.path.join(DATA_DIR, 'health_super_topup.md'), 'r') as f:
	            self.acko_super_topup = f.read()

	    def _analyse(self, **kwargs):
	        """
	        Ask the LLM to analyse the policy text.

	        Args:
	            text (keyword): policy text to analyse.

	        Returns:
	            The LLM response, or '' when there is no text or no response.
	        """
	        text = kwargs.get('text')
	        if not text:
	            return ''
	        prompt = (
	            self.analysis_prompt
	            .replace('{{lob}}', 'health')
	            .replace('{{rules}}', self.rules)
	            .replace('{{output_format}}', self.analysis_output_format)
	        )
	        # Read the clock once so day, month and year always belong to the same date.
	        today = datetime.today()
	        prompt += 'Policy : ' + text + f"\n\nConsider today's date as {today.day}/{today.month}/{today.year} for your analysis on waiting periods and dates"
	        response = call_openai(prompt)
	        return response if response else ''

	    @staticmethod
	    def _factors_in(analysis, tag, heading):
	        """Parse the markdown table found between <tag> and </tag>, minus its heading line."""
	        section = analysis.split(f'<{tag}>')[-1].split(f'</{tag}>')[0]
	        return markdown_table_to_json(section.replace(heading, ''))

	    def _suggest(self, **kwargs):
	        """
	        Ask the LLM to pitch a policy based on the analysis.

	        The super top-up is pitched only when neither table has more than
	        three factors and a table with fewer than three factors mentions
	        'sum insured'; in every other case the base policy is pitched.

	        Args:
	            analysis (keyword): analysis text containing <BAD>...</BAD> and
	                <AVERAGE>...</AVERAGE> sections.

	        Returns:
	            The LLM response, or '' when there is no analysis or no response.
	        """
	        analysis = kwargs.get('analysis')
	        if not analysis:
	            return ''

	        # assumes markdown_table_to_json returns a list of row dicts with a
	        # 'Factor' column - TODO confirm against policy_analyser.utils
	        bad_factors = self._factors_in(analysis, 'BAD', '## Bad Factors')
	        bad_factor_names = [factor['Factor'] for factor in bad_factors]
	        avg_factors = self._factors_in(analysis, 'AVERAGE', '## Average Factors')
	        # Names must come from the average table; they were previously built from bad_factors.
	        avg_factor_names = [factor['Factor'] for factor in avg_factors]

	        bad_hits_sum_insured = any('sum insured' in name.lower() for name in bad_factor_names)
	        avg_hits_sum_insured = any('sum insured' in name.lower() for name in avg_factor_names)

	        if len(bad_factors) > 3 or len(avg_factors) > 3:
	            pitch_super_topup = False
	            print('selected policy')
	        elif (len(bad_factors) < 3 and bad_hits_sum_insured) \
	                or (len(avg_factors) < 3 and avg_hits_sum_insured):
	            pitch_super_topup = True
	            print('selected super topup')
	        else:
	            pitch_super_topup = False

	        prompt = self.suggest_prompt + "\nCustomer Policy Analysis : " + analysis
	        if pitch_super_topup:
	            prompt += "\nAcko's Super Top-up Policy : " + self.acko_super_topup
	        else:
	            prompt += "\nAcko's Policy : " + self.acko_policy + self._SUPER_TOPUP_NOTE

	        response = call_openai(prompt)
	        return response if response else ''

	if __name__ == '__main__':
	    # Command-line driver : run the health pipeline on every path given as an argument.
	    import sys
	    from tqdm import tqdm

	    filepaths = sys.argv[1:]
	    health = Health()

	    for filepath in tqdm(filepaths):
	        print(filepath)
	        if filepath.endswith('.pdf'):
	            with open(filepath, 'rb') as f:
	                file_bytes = f.read()
	        elif filepath.endswith(('.txt', '.md')):
	            with open(filepath) as f:
	                file_bytes = f.read()
	        else:
	            # Without this guard an unknown extension would reuse the
	            # previous file's content (or fail on the first path).
	            print(f'Skipping unsupported file type : {filepath}')
	            continue
	        # TODO: persist the returned stages (e.g. as JSON next to the input
	        # file); nothing is written, so the result is currently discarded.
	        health(file_bytes)