Spaces:

pikaduck
/

policy-analyser

Sleeping

File size: 8,753 Bytes

0106d5f
 
 
 
 
 
bef8e94
0106d5f
 
 
bef8e94
d960853
0106d5f
d67de0b
0106d5f
bef8e94
 
 
 
d960853
 
bef8e94
 
 
0106d5f
bef8e94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0106d5f
bef8e94
 
0106d5f
bef8e94
 
 
 
 
 
 
 
 
d67de0b
bef8e94
d67de0b
 
bef8e94
 
 
 
 
 
 
 
0106d5f
bef8e94
0106d5f
bef8e94
 
 
d67de0b
 
 
 
5a53af9
7bfb2b2
 
5a53af9
7bfb2b2
 
5a53af9
7bfb2b2
 
5a53af9
7bfb2b2
 
555dd1d
 
5a53af9
 
 
 
 
bef8e94
 
 
 
0106d5f
bef8e94
 
0106d5f
 
 
 
 
 
 
bef8e94
0106d5f
 
 
 
d67de0b
 
0106d5f
 
 
 
 
 
bef8e94
0106d5f
 
d67de0b

"""
    Run analysis
    @author : Sakshi Tantak
"""

# Imports
import os
from time import time
from datetime import datetime

from policy_analyser import PROMPTS_DIR, DATA_DIR
from policy_analyser.ocr import PyMuPDF4LLMOCR, AzureDocumentIntelligenceOCR
from policy_analyser.llm import call_openai
from policy_analyser.utils import markdown_table_to_json

class LOB:
    """
        Base pipeline for one line of business : OCR -> analyse -> suggest.

        Subclasses are expected to override `_analyse` and `_suggest`;
        the base implementations only raise `NotImplementedError`.
    """

    def __init__(self, ocr_engine = 'open-source/pymupdf4llm'):
        """
            Pick the OCR engine and load the shared analysis prompt.

            ocr_engine : 'open-source/pymupdf4llm' or 'azure/layout'.
                Any other value leaves `self.engine` as None, so the instance
                can still process plain-text input but not raw bytes.
        """
        if ocr_engine == 'open-source/pymupdf4llm':
            self.engine = PyMuPDF4LLMOCR()
        elif ocr_engine == 'azure/layout':
            self.engine = AzureDocumentIntelligenceOCR()
        else:
            # Previously the attribute was simply never set, which surfaced
            # later as an opaque AttributeError during OCR.
            self.engine = None
        self.file_type = 'pdf'
        with open(os.path.join(PROMPTS_DIR, 'analysis.txt'), 'r') as f:
            self.analysis_prompt = f.read()

    def __call__(self, file_bytes):
        """
            Run the three stages on one document.

            file_bytes : str (already-extracted text, OCR is skipped) or
                bytes / bytearray (passed to the OCR engine).

            Returns a list of three dicts, one per stage ('OCR', 'ANALYSE',
            'SUGGEST'), each with keys 'stage', 'response' and 'time'.
            A stage that fails or yields nothing keeps 'response' == '' and
            'time' == 0, and later stages are not attempted. Exceptions are
            printed, never propagated.
        """
        response = [
            {'stage' : stage, 'response' : '', 'time' : 0}
            for stage in ('OCR', 'ANALYSE', 'SUGGEST')
        ]

        # Stage 1 : OCR
        try:
            print('OCR Started ...')
            ocr_start = time()
            if isinstance(file_bytes, str):
                text = file_bytes
            elif isinstance(file_bytes, (bytearray, bytes)):
                engine = getattr(self, 'engine', None)
                if engine is None:
                    raise ValueError('no OCR engine configured, cannot read raw bytes')
                # The engine returns a pair; only the first item (text) is used.
                text, _ = engine(file_bytes)
            else:
                raise TypeError(f'unsupported input type {type(file_bytes).__name__}, expected str or bytes')
            ocr_end = time()
            print(f'OCR done [{ocr_end - ocr_start}]')
        except Exception as ocr_e:
            print(f'Exception while OCR : {ocr_e}')
            return response

        if not text:
            return response
        response[0].update({'response' : text, 'time' : ocr_end - ocr_start})

        # Stage 2 : analysis
        try:
            print('Analysing ...')
            analysis_start = time()
            raw_response = self._analyse(text = text)
            analysis_end = time()
            print('Analysis : ', raw_response)
            print(f'Analysed [{analysis_end - analysis_start}]')
        except Exception as analysis_e:
            print(f'Exception while analysing : {analysis_e}')
            return response

        if not raw_response:
            return response
        response[1].update({'response' : raw_response, 'time' : analysis_end - analysis_start})

        # Stage 3 : suggestion
        try:
            print('Suggesting our policy ...')
            suggestion_start = time()
            suggestion = self._suggest(analysis = raw_response)
            suggestion_end = time()
            print(f'Suggested [{suggestion_end - suggestion_start}]')
        except Exception as sugg_e:
            print(f'Exception while suggesting : {sugg_e}')
            return response

        if suggestion:
            response[2].update({'response' : suggestion, 'time' : suggestion_end - suggestion_start})
        return response

    def _analyse(self, **kwargs):
        """Analyse the policy text (keyword `text`). Must be overridden."""
        # `NotImplemented` is a comparison sentinel, not an exception;
        # raising it produces a TypeError instead of the intended error.
        raise NotImplementedError

    def _suggest(self, **kwargs):
        """Suggest a policy from an analysis (keyword `analysis`). Must be overridden."""
        raise NotImplementedError

class Health(LOB):
    """
        Health-insurance line of business.

        Adds the health-specific rules, output format and suggestion prompt
        (read from PROMPTS_DIR/health) and the two Acko reference documents
        (read from DATA_DIR) on top of the base pipeline.
    """

    # Appended to the prompt whenever the regular policy document is offered.
    _SUPER_TOPUP_NOTE = '\nNote : Super Top up is an additional policy the customer can buy to enhance the benefits along with their current policy'

    def __init__(self, ocr_engine = 'open-source/pymupdf4llm'):
        """Load prompts and reference policy documents; `ocr_engine` is forwarded to the base class."""
        super().__init__(ocr_engine)
        with open(os.path.join(PROMPTS_DIR, 'health', 'analysis_output_format.txt'), 'r') as f:
            self.analysis_output_format = f.read()
        with open(os.path.join(PROMPTS_DIR, 'health', 'rules.txt'), 'r') as f:
            self.rules = f.read()
        with open(os.path.join(PROMPTS_DIR, 'health', 'suggest.txt'), 'r') as f:
            self.suggest_prompt = f.read()
        with open(os.path.join(DATA_DIR, 'health_policy.md'), 'r') as f:
            self.acko_policy = f.read()
        with open(os.path.join(DATA_DIR, 'health_super_topup.md'), 'r') as f:
            self.acko_super_topup = f.read()

    def _analyse(self, **kwargs):
        """
            Build the analysis prompt for the policy text and send it to the LLM.

            kwargs : `text` - the policy text.
            Returns the LLM response, or '' when the text or the response is empty.
        """
        text = kwargs.get('text')
        if not text:
            return ''
        prompt = (
            self.analysis_prompt
            .replace('{{lob}}', 'health')
            .replace('{{rules}}', self.rules)
            .replace('{{output_format}}', self.analysis_output_format)
        )
        # Read the clock once so day/month/year cannot straddle midnight.
        today = datetime.today()
        prompt += 'Policy : ' + text + f"\n\nConsider today's date as {today.day}/{today.month}/{today.year} for your analysis on waiting periods and dates"
        response = call_openai(prompt)
        if response:
            return response
        return ''

    @staticmethod
    def _extract_factors(analysis, tag, heading):
        """Parse the markdown table found between <tag> and </tag>, with `heading` stripped."""
        section = analysis.split(f'<{tag}>')[-1].split(f'</{tag}>')[0]
        return markdown_table_to_json(section.replace(heading, ''))

    @staticmethod
    def _mentions_sum_insured(factor_names):
        """Return True when any factor name contains 'sum insured' (case-insensitive)."""
        return any('sum insured' in name.lower() for name in factor_names)

    def _suggest(self, **kwargs):
        """
            Choose which Acko document to pitch and ask the LLM for a suggestion.

            kwargs : `analysis` - output of `_analyse`, expected to carry
                <BAD>...</BAD> and <AVERAGE>...</AVERAGE> sections holding
                markdown tables with a 'Factor' column.

            Selection :
              - more than 3 bad or more than 3 average factors -> regular policy
              - otherwise, fewer than 3 bad (or average) factors with one of
                them about sum insured -> super top-up
              - anything else -> regular policy
            Returns the LLM response, or '' when analysis or response is empty.
        """
        analysis = kwargs.get('analysis')
        if not analysis:
            return ''

        bad_factors = self._extract_factors(analysis, 'BAD', '## Bad Factors')
        avg_factors = self._extract_factors(analysis, 'AVERAGE', '## Average Factors')
        bad_factor_names = [factor['Factor'] for factor in bad_factors]
        # Fixed : this list used to be built from bad_factors by mistake.
        avg_factor_names = [factor['Factor'] for factor in avg_factors]

        many_issues = len(bad_factors) > 3 or len(avg_factors) > 3
        topup_fits = (
            (len(bad_factors) < 3 and self._mentions_sum_insured(bad_factor_names))
            or (len(avg_factors) < 3 and self._mentions_sum_insured(avg_factor_names))
        )

        prompt = self.suggest_prompt + "\nCustomer Policy Analysis : " + analysis
        if many_issues:
            prompt += "\nAcko's Policy : " + self.acko_policy + self._SUPER_TOPUP_NOTE
            print('selected policy')
        elif topup_fits:
            prompt += "\nAcko's Super Top-up Policy : " + self.acko_super_topup
            print('selected super topup')
        else:
            prompt += "\nAcko's Policy : " + self.acko_policy + self._SUPER_TOPUP_NOTE

        response = call_openai(prompt)
        if response:
            return response
        return ''

    def __call__(self, file_bytes):
        """Run the base pipeline unchanged; see `LOB.__call__` for the return shape."""
        return super().__call__(file_bytes)

if __name__ == '__main__':
    # Batch driver : run the health pipeline on every path given on the
    # command line. PDFs are read as bytes (and OCR'd); .txt / .md files are
    # read as text and skip OCR.
    import sys
    from tqdm import tqdm

    filepaths = sys.argv[1:]
    health = Health()

    for filepath in tqdm(filepaths):
        print(filepath)
        if filepath.endswith('.pdf'):
            with open(filepath, 'rb') as f:
                file_bytes = f.read()
        elif filepath.endswith(('.txt', '.md')):
            with open(filepath) as f:
                file_bytes = f.read()
        else:
            # Without this guard an unknown extension either crashed with a
            # NameError or silently re-analysed the previous file's contents.
            print(f'Skipping unsupported file type : {filepath}')
            continue

        analysis = health(file_bytes)
        # print(analysis)

        # NOTE: persisting results is currently disabled. To re-enable,
        # `import json` above and uncomment :
        # basepath = os.path.splitext(filepath)[0]
        # with open(basepath + '.o1-mini.e2e-analysis.json', 'w') as f:
        #     json.dump(analysis, f, indent = 4)
        # with open(basepath + '.o1-mini.e2e-analysis.md', 'w') as f:
        #     f.write(analysis[1]['response'])