Spaces:

pikaduck
/

policy-analyser

Running

File size: 7,042 Bytes

0106d5f
 
 
 
 
 
bef8e94
0106d5f
 
 
bef8e94
a327219
0106d5f
d67de0b
0106d5f
bef8e94
 
 
 
 
 
 
0106d5f
bef8e94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0106d5f
bef8e94
 
0106d5f
bef8e94
 
 
 
 
 
 
 
 
d67de0b
bef8e94
d67de0b
 
bef8e94
 
 
 
 
 
 
 
0106d5f
bef8e94
0106d5f
bef8e94
 
 
d67de0b
 
 
 
 
 
 
 
 
bef8e94
 
 
 
0106d5f
bef8e94
 
0106d5f
 
 
 
 
 
 
bef8e94
0106d5f
 
 
 
d67de0b
 
0106d5f
 
 
 
 
 
bef8e94
0106d5f
 
d67de0b

"""
    Run analysis
    @author : Sakshi Tantak
"""

# Imports
import os
from time import time
from datetime import datetime

from policy_analyser import PROMPTS_DIR, DATA_DIR
from policy_analyser.ocr import PyMuPDF4LLMOCR
from policy_analyser.llm import call_openai
from policy_analyser.utils import markdown_table_to_json

class LOB:
    """
        Base pipeline for a line of business (LOB) : OCR -> ANALYSE -> SUGGEST.

        Subclasses implement `_analyse` and `_suggest`. Calling an instance runs
        the three stages in order and returns, for every stage, the output it
        produced and the time it took.
    """

    def __init__(self, ocr_engine = 'open-source/pymupdf4llm'):
        """
            Configure the OCR engine and load the shared analysis prompt.

            Args:
                ocr_engine : identifier of the OCR backend. Only
                    'open-source/pymupdf4llm' is recognised; for any other value
                    `self.engine` stays None and only already-extracted text
                    (str input) can be processed.
        """
        # Always define the attribute so that an unknown engine name leads to a
        # clear error at call time instead of an AttributeError.
        self.engine = None
        if ocr_engine == 'open-source/pymupdf4llm':
            self.engine = PyMuPDF4LLMOCR()
        self.file_type = 'pdf'
        with open(os.path.join(PROMPTS_DIR, 'analysis.txt'), 'r') as f:
            self.analysis_prompt = f.read()

    def _extract_text(self, file_bytes):
        """
            Return the policy text for the given input.

            A str is treated as already-extracted text and returned unchanged;
            bytes / bytearray are passed through the OCR engine.

            Raises:
                RuntimeError : binary input was given but no OCR engine is configured.
                TypeError : the input is neither str nor bytes / bytearray.
        """
        if isinstance(file_bytes, str):
            return file_bytes
        if isinstance(file_bytes, (bytearray, bytes)):
            if self.engine is None:
                raise RuntimeError('No OCR engine configured for binary input')
            # The engine returns a pair; only the first element (the text) is used here.
            text, _ = self.engine(file_bytes)
            return text
        raise TypeError(f'Unsupported input type : {type(file_bytes).__name__}')

    def __call__(self, file_bytes):
        """
            Run OCR, analysis and suggestion on a policy document.

            Args:
                file_bytes : document as bytes / bytearray, or its text as str.

            Returns:
                list of three dicts (stages 'OCR', 'ANALYSE', 'SUGGEST'), each
                with keys 'stage', 'response' and 'time'. A stage that failed,
                produced nothing or was never reached keeps '' and 0.
                Exceptions are logged and never propagated to the caller.
        """
        response = [
            {'stage' : stage, 'response' : '', 'time' : 0}
            for stage in ('OCR', 'ANALYSE', 'SUGGEST')
        ]

        # Stage 1 : OCR
        try:
            print('OCR Started ...')
            ocr_start = time()
            text = self._extract_text(file_bytes)
            ocr_end = time()
            print(f'OCR done [{ocr_end - ocr_start}]')
        except Exception as ocr_e:
            print(f'Exception while OCR : {ocr_e}')
            return response
        if not text:
            return response
        response[0].update({'response' : text, 'time' : ocr_end - ocr_start})

        # Stage 2 : analysis
        try:
            print('Analysing ...')
            analysis_start = time()
            raw_response = self._analyse(text = text)
            analysis_end = time()
            print('Analysis : ', raw_response)
            print(f'Analysed [{analysis_end - analysis_start}]')
        except Exception as analysis_e:
            print(f'Exception while analysing : {analysis_e}')
            return response
        if not raw_response:
            return response
        response[1].update({'response' : raw_response, 'time' : analysis_end - analysis_start})

        # Stage 3 : suggestion
        try:
            print('Suggesting our policy ...')
            suggestion_start = time()
            suggestion = self._suggest(analysis = raw_response)
            suggestion_end = time()
            print(f'Suggested [{suggestion_end - suggestion_start}]')
        except Exception as sugg_e:
            print(f'Exception while suggesting : {sugg_e}')
            return response
        if suggestion:
            response[2].update({'response' : suggestion, 'time' : suggestion_end - suggestion_start})
        return response

    def _analyse(self, **kwargs):
        """
            Analyse the policy text passed as keyword `text`; to be provided by subclasses.
        """
        # NotImplemented is a comparison sentinel, not an exception; raising it
        # produces a TypeError. NotImplementedError is the intended signal.
        raise NotImplementedError

    def _suggest(self, **kwargs):
        """
            Build a suggestion from the analysis passed as keyword `analysis`; to be provided by subclasses.
        """
        raise NotImplementedError

class Health(LOB):
    """
        Health insurance pipeline : analyses a customer's health policy with an
        LLM and suggests either Acko's health policy or its super top up.
    """

    def __init__(self, ocr_engine = 'open-source/pymupdf4llm'):
        """
            Load the health-specific prompts and Acko policy documents.

            Args:
                ocr_engine : OCR backend identifier, forwarded to LOB.
        """
        super().__init__(ocr_engine)
        self.analysis_output_format = self._read_file(PROMPTS_DIR, 'health', 'analysis_output_format.txt')
        self.rules = self._read_file(PROMPTS_DIR, 'health', 'rules.txt')
        self.suggest_prompt = self._read_file(PROMPTS_DIR, 'health', 'suggest.txt')
        self.acko_policy = self._read_file(DATA_DIR, 'health_policy.md')
        self.acko_super_topup = self._read_file(DATA_DIR, 'health_super_topup.md')

    @staticmethod
    def _read_file(*path_parts):
        """
            Return the full text of the file at os.path.join(*path_parts).
        """
        with open(os.path.join(*path_parts), 'r') as f:
            return f.read()

    @staticmethod
    def _extract_factors(analysis, tag, heading):
        """
            Parse the markdown table found between <tag> and </tag> in the analysis.

            The section heading is stripped before parsing. If the tags are
            absent, the whole analysis text is handed to the parser (same as
            the split-based extraction has always done).
            NOTE(review): assumes markdown_table_to_json returns a list of
            row dicts keyed by column name - confirm against policy_analyser.utils.
        """
        section = analysis.split(f'<{tag}>')[-1].split(f'</{tag}>')[0]
        return markdown_table_to_json(section.replace(heading, ''))

    @staticmethod
    def _few_factors_with_sum_insured(factors):
        """
            True when there are fewer than 3 factors and one of them concerns sum insured.
        """
        return len(factors) < 3 and any('sum insured' in factor['Factor'].lower() for factor in factors)

    def _analyse(self, **kwargs):
        """
            Ask the LLM to analyse the policy text.

            Args:
                text (keyword) : extracted policy text.

            Returns:
                The LLM response, or '' when there is no text or no response.
        """
        text = kwargs.get('text') or ''
        if not text:
            return ''
        prompt = self.analysis_prompt.replace('{{lob}}', 'health')
        prompt = prompt.replace('{{rules}}', self.rules)
        prompt = prompt.replace('{{output_format}}', self.analysis_output_format)
        # Read the clock once so day, month and year always belong to the same date.
        today = datetime.today()
        prompt += 'Policy : ' + text + f"\n\nConsider today's date as {today.day}/{today.month}/{today.year} for your analysis on waiting periods and dates"
        response = call_openai(prompt)
        return response if response else ''

    def _suggest(self, **kwargs):
        """
            Ask the LLM to suggest an Acko product based on the analysis.

            When the bad or the average factors are few (< 3) and include the
            sum insured, the super top up document is attached; otherwise the
            regular health policy document is attached.

            Args:
                analysis (keyword) : LLM analysis containing <BAD>...</BAD> and
                    <AVERAGE>...</AVERAGE> sections with markdown tables.

            Returns:
                The LLM response, or '' when there is no analysis or no response.
        """
        analysis = kwargs.get('analysis') or ''
        if not analysis:
            return ''
        bad_factors = self._extract_factors(analysis, 'BAD', '## Bad Factors')
        # The average check must look at the average factors themselves
        # (it previously re-used the bad factor names).
        avg_factors = self._extract_factors(analysis, 'AVERAGE', '## Average Factors')
        suggest_super_topup = (
            self._few_factors_with_sum_insured(bad_factors)
            or self._few_factors_with_sum_insured(avg_factors)
        )
        prompt = self.suggest_prompt + "\nCustomer Policy Analysis : " + analysis
        # Each document is labelled as what it actually is; the labels used to
        # be swapped relative to the attached documents.
        if suggest_super_topup:
            prompt += "\nAcko's Super Top up Policy : " + self.acko_super_topup + '\nNote : Super Top up is an additional policy the customer can buy to enhance the benefits along with their current policy'
        else:
            prompt += "\nAcko's Policy : " + self.acko_policy
        response = call_openai(prompt)
        return response if response else ''

    def __call__(self, file_bytes):
        """
            Run the OCR -> ANALYSE -> SUGGEST pipeline; see LOB.__call__.
        """
        return super().__call__(file_bytes)

if __name__ == '__main__':
    # Ad-hoc CLI : run the health pipeline on every path given on the command line.
    # PDFs are read as bytes (OCR'd by the pipeline); .txt / .md are read as text.
    import json
    import sys
    from tqdm import tqdm
    filepaths = sys.argv[1:]
    health = Health()

    for filepath in tqdm(filepaths):
        # if os.path.isfile(filepath.replace('.pdf', '.analysis.json')):
            # continue
        # if '.analysis' in filepath or '.e2e-analysis' in filepath:
        #     continue
        print(filepath)
        if filepath.endswith('.pdf'):
            with open(filepath, 'rb') as f:
                file_bytes = f.read()
        elif filepath.endswith(('.txt', '.md')):
            with open(filepath) as f:
                file_bytes = f.read()
        else:
            # Without this branch file_bytes would be undefined for the first
            # file, or silently hold the previous file's content afterwards.
            print(f'Skipping unsupported file type : {filepath}')
            continue
        analysis = health(file_bytes)
        # print(analysis)
        # NOTE : persisting the result is currently disabled; the pipeline's own
        # log output is the only visible result of a run.
        # if not end2end:
        #     with open(os.path.splitext(filepath)[0] + '.analysis.json', 'w') as f:
        #         json.dump(analysis, f, indent = 4)
        # else:
        #     with open(os.path.splitext(filepath)[0] + '.o1-mini.e2e-analysis.json', 'w') as f:
        #         json.dump(analysis, f, indent = 4)
        #     with open(os.path.splitext(filepath)[0] + '.o1-mini.e2e-analysis.md', 'w') as f:
        #         f.write(analysis[1]['response'])