Sakshi commited on
Commit
bef8e94
·
1 Parent(s): 7d4da57

arch lob agnostic

Browse files
app.py CHANGED
@@ -1,15 +1,55 @@
1
  import os
2
  import re
 
3
 
4
  import streamlit as st
 
5
 
6
  from utils import validate_pdf
7
  from styles import apply_custom_styles
8
- from policy_analyser.analyse import analyse
9
 
10
  if 'GPT_KEY' not in os.environ or os.environ.get('GPT_KEY') in [None, '']:
11
  os.environ['GPT_KEY'] = st.secrets['GPT_KEY']
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  def main():
14
  # Apply custom styles
15
  apply_custom_styles()
@@ -27,6 +67,11 @@ def main():
27
  st.markdown('<div class="upload-container">', unsafe_allow_html=True)
28
  uploaded_files = st.file_uploader("Choose policy PDF files", type="pdf", accept_multiple_files=True)
29
  print(uploaded_files)
 
 
 
 
 
30
  st.markdown('</div>', unsafe_allow_html=True)
31
 
32
  if uploaded_files and st.button('Analyse'):
@@ -51,7 +96,7 @@ def main():
51
  with st.spinner(f"Analyzing {uploaded_file.name}..."):
52
  try:
53
  # Make API call
54
- response = analyse(pdf_bytes, True)
55
  analysis = next(
56
  (item for item in response if item.get("stage") == "ANALYSE"), None
57
  )['response']
@@ -63,7 +108,7 @@ def main():
63
  # Store results
64
  all_analyses.append({
65
  'name': uploaded_file.name,
66
- 'analysis' : re.sub(r'\<\/?(GOOD|AVERAGE|BAD|FINAL_VERDICT)\>', '', analysis),
67
  'suggestion' : suggestion
68
  })
69
 
@@ -75,43 +120,45 @@ def main():
75
  for idx, analysis in enumerate(all_analyses):
76
  with st.expander(f"### Policy {idx + 1}: {analysis['name']}"):
77
  with st.container():
78
- st.markdown(analysis['analysis'])
79
  with st.container():
80
  st.markdown('# Why Acko? 🚀')
81
  st.markdown(analysis['suggestion'])
 
82
 
83
  # Detailed Comparison Tab
84
  with tab2:
85
- if len(all_analyses) > 1:
86
- # Create comparison matrix
87
- factors_to_compare = set()
88
- for analysis in all_analyses:
89
- factors_to_compare.update(
90
- [f.split(':')[0] for f in analysis['good_factors'] +
91
- analysis['average_factors'] + analysis['bad_factors']]
92
- )
93
-
94
- # Create comparison table
95
- st.markdown("### Policy Comparison Matrix")
96
-
97
- comparison_data = []
98
- for factor in sorted(factors_to_compare):
99
- row = {'Factor': factor}
100
- for idx, analysis in enumerate(all_analyses):
101
- policy_name = f"Policy {idx + 1}"
102
- verdict = 'Not Found'
103
- for category in ['good_factors', 'average_factors', 'bad_factors']:
104
- for item in analysis[category]:
105
- if item.split(':')[0] == factor:
106
- verdict = category.split('_')[0].title()
107
- break
108
- row[policy_name] = verdict
109
- comparison_data.append(row)
110
-
111
- # Display comparison table
112
- st.table(comparison_data)
113
- else:
114
- st.info("Upload multiple policies to see comparison")
 
115
 
116
  # Footer
117
  st.markdown("""
 
1
  import os
2
  import re
3
+ import json
4
 
5
  import streamlit as st
6
+ from streamviz import gauge
7
 
8
  from utils import validate_pdf
9
  from styles import apply_custom_styles
10
+ from policy_analyser.analyse import Health
11
 
12
  if 'GPT_KEY' not in os.environ or os.environ.get('GPT_KEY') in [None, '']:
13
  os.environ['GPT_KEY'] = st.secrets['GPT_KEY']
14
 
15
+ if 'health_analyser' not in st.session_state:
16
+ st.session_state.health_analyser = Health()
17
+
18
def markdown_table_to_json(markdown):
    """Parse the first pipe-delimited markdown table in ``markdown`` into row dicts.

    Lines that contain no ``|`` (headings, blank lines, surrounding prose) are
    ignored, so the table does not have to start on the first line. The first
    table line supplies the column headers; the alignment/separator line that
    directly follows it (``| --- | --- |``) is skipped when present.

    Args:
        markdown: Text containing a markdown table.

    Returns:
        A list with one dict per data row, mapping header text to cell text.
        Empty cells are kept as ``''`` so values stay aligned with their
        headers. Returns ``[]`` when no table line is found.
    """
    def split_cells(line):
        # Drop only the outer pipes; inner empty cells must survive so that
        # zip(headers, cells) does not shift values into the wrong column.
        stripped = line.strip()
        if stripped.startswith("|"):
            stripped = stripped[1:]
        if stripped.endswith("|"):
            stripped = stripped[:-1]
        return [cell.strip() for cell in stripped.split("|")]

    def is_separator(cells):
        # A separator row consists solely of dashes with optional alignment colons.
        return bool(cells) and all("-" in cell and set(cell) <= {"-", ":"} for cell in cells)

    table_lines = [line for line in markdown.splitlines() if "|" in line]
    if not table_lines:
        return []

    headers = split_cells(table_lines[0])
    body = table_lines[1:]
    if body and is_separator(split_cells(body[0])):
        body = body[1:]

    return [dict(zip(headers, split_cells(line))) for line in body]
32
+
33
def visualise_pie_chart(analysis):
    """Render a gauge scoring the Good/Average/Bad factor tables of an analysis.

    Every factor row inside the ``<GOOD>``, ``<AVERAGE>`` and ``<BAD>`` sections
    of ``analysis`` earns 5, 3 and 1 points respectively, against a maximum of
    5 points per row. The earned score is shown on the gauge, with the
    low/mid thresholds placed at one third and two thirds of the maximum.

    Args:
        analysis: Raw analysis text still containing the verdict tags.

    Returns:
        None. The gauge is drawn as a side effect.
    """
    weights = {'GOOD' : 5, 'AVERAGE' : 3, 'BAD' : 1}
    score = 0
    total = 0
    for verdict, weight in weights.items():
        # Only parse a section that is actually present; without this guard a
        # missing tag would cause the entire analysis to be parsed as a table.
        section = re.search(rf'<{verdict}>(.*?)</{verdict}>', analysis, flags = re.DOTALL)
        if section is None:
            continue
        # Keep just the table rows so the '## ... Factors' heading inside the
        # section is never mistaken for the table header.
        table_lines = [line for line in section.group(1).splitlines() if line.strip().startswith('|')]
        rows = markdown_table_to_json('\n'.join(table_lines))
        score += weight * len(rows)
        total += 5 * len(rows)
    # Show the earned score; previously the maximum (`total`) was passed, so the
    # gauge always read as full and `score` was never used.
    # NOTE(review): the gauge's axis range is not set here -- confirm against the
    # streamviz documentation that absolute point values render as intended.
    gauge(gVal = score, gTitle = '', gMode = 'gauge+number',
          grLow = total // 3,
          grMid = 2 * (total // 3))
52
+
53
  def main():
54
  # Apply custom styles
55
  apply_custom_styles()
 
67
  st.markdown('<div class="upload-container">', unsafe_allow_html=True)
68
  uploaded_files = st.file_uploader("Choose policy PDF files", type="pdf", accept_multiple_files=True)
69
  print(uploaded_files)
70
+ lob = st.selectbox(
71
+ 'Type of insurance',
72
+ options = ['Health', 'Life', 'Auto'],
73
+ index = 0
74
+ )
75
  st.markdown('</div>', unsafe_allow_html=True)
76
 
77
  if uploaded_files and st.button('Analyse'):
 
96
  with st.spinner(f"Analyzing {uploaded_file.name}..."):
97
  try:
98
  # Make API call
99
+ response = st.session_state.health_analyser(pdf_bytes)
100
  analysis = next(
101
  (item for item in response if item.get("stage") == "ANALYSE"), None
102
  )['response']
 
108
  # Store results
109
  all_analyses.append({
110
  'name': uploaded_file.name,
111
+ 'analysis' : analysis,
112
  'suggestion' : suggestion
113
  })
114
 
 
120
  for idx, analysis in enumerate(all_analyses):
121
  with st.expander(f"### Policy {idx + 1}: {analysis['name']}"):
122
  with st.container():
123
+ st.markdown(re.sub(r'\<\/?(GOOD|AVERAGE|BAD|FINAL_VERDICT)\>', '', analysis['analysis']))
124
  with st.container():
125
  st.markdown('# Why Acko? 🚀')
126
  st.markdown(analysis['suggestion'])
127
+ # visualise_pie_chart(analysis['analysis'])
128
 
129
  # Detailed Comparison Tab
130
  with tab2:
131
+ st.warning('Coming Soon')
132
+ # if len(all_analyses) > 1:
133
+ # # Create comparison matrix
134
+ # factors_to_compare = set()
135
+ # for analysis in all_analyses:
136
+ # factors_to_compare.update(
137
+ # [f.split(':')[0] for f in analysis['good_factors'] +
138
+ # analysis['average_factors'] + analysis['bad_factors']]
139
+ # )
140
+
141
+ # # Create comparison table
142
+ # st.markdown("### Policy Comparison Matrix")
143
+
144
+ # comparison_data = []
145
+ # for factor in sorted(factors_to_compare):
146
+ # row = {'Factor': factor}
147
+ # for idx, analysis in enumerate(all_analyses):
148
+ # policy_name = f"Policy {idx + 1}"
149
+ # verdict = 'Not Found'
150
+ # for category in ['good_factors', 'average_factors', 'bad_factors']:
151
+ # for item in analysis[category]:
152
+ # if item.split(':')[0] == factor:
153
+ # verdict = category.split('_')[0].title()
154
+ # break
155
+ # row[policy_name] = verdict
156
+ # comparison_data.append(row)
157
+
158
+ # # Display comparison table
159
+ # st.table(comparison_data)
160
+ # else:
161
+ # st.info("Upload multiple policies to see comparison")
162
 
163
  # Footer
164
  st.markdown("""
policy_analyser/__init__.py CHANGED
@@ -32,12 +32,9 @@ GPT_KEY = os.environ.get('GPT_KEY', '')
32
  GPT_VERSION = '2024-12-01-preview'
33
  GPT_API_BASE = 'https://ai-ackods910341544474.openai.azure.com/'
34
 
35
- EXTRACTION_PROMPT = open(os.path.join(PROMPTS_DIR, 'extraction.txt')).read()
36
- entities = json.load(open(os.path.join(DATA_DIR, 'policy_analyser_entities.json')))
37
- for entity in entities:
38
- del entity['entityId']
39
- entities_str = '\n---\n'.join(['\n'.join([f'{k} : {v}' for k, v in entity.items()]) for entity in entities])
40
- EXTRACTION_PROMPT += entities_str
41
- ANALYSIS_PROMPT = open(os.path.join(PROMPTS_DIR, 'analysis.txt')).read().strip()
42
- SUGGESTION_PROMPT = open(os.path.join(PROMPTS_DIR, 'suggest.txt')).read().strip()
43
- ACKO_POLICY = open(os.path.join(DATA_DIR, 'Policy_Wordings_Acko_Personal_Health_Policy_Applicable_for_the_policies_sold_post_1_10_2024_64ea02eb51_ab3c8eefa2.md')).read()
 
32
  GPT_VERSION = '2024-12-01-preview'
33
  GPT_API_BASE = 'https://ai-ackods910341544474.openai.azure.com/'
34
 
35
+ # EXTRACTION_PROMPT = open(os.path.join(PROMPTS_DIR, 'extraction.txt')).read()
36
+ # entities = json.load(open(os.path.join(DATA_DIR, 'policy_analyser_entities.json')))
37
+ # for entity in entities:
38
+ # del entity['entityId']
39
+ # entities_str = '\n---\n'.join(['\n'.join([f'{k} : {v}' for k, v in entity.items()]) for entity in entities])
40
+ # EXTRACTION_PROMPT += entities_str
 
 
 
policy_analyser/analyse.py CHANGED
@@ -4,203 +4,116 @@
4
  """
5
 
6
  # Imports
 
7
  from time import time
8
  from datetime import datetime
9
 
10
- from policy_analyser import ACKO_POLICY, ANALYSIS_PROMPT, SUGGESTION_PROMPT
11
  from policy_analyser.ocr import PyMuPDF4LLMOCR
12
- from policy_analyser.extraction import extract
13
- from policy_analyser.rules import prepare_payload, rules
14
  from policy_analyser.llm import call_openai
15
 
16
- # OCR = AzureLayoutOCR()
17
- OCR = PyMuPDF4LLMOCR()
 
 
 
 
 
18
 
19
- def analyse(file_bytes, end2end = False):
20
- print('OCR Started ...')
21
- ocr_start = time()
22
- if isinstance(file_bytes, str):
23
- text = file_bytes
24
- elif isinstance(file_bytes, (bytearray, bytes)):
25
- text, _ = OCR(file_bytes)
26
- ocr_end = time()
27
- print(f'OCR done [{ocr_end - ocr_start}]')
28
- if len(text) > 0:
29
- if not end2end:
30
- print('Extraction Started ...')
31
- ext_start = time()
32
- raw_response, entities = extract(text)
33
- ext_end = time()
34
- print(f'Extraction done [{ext_end - ext_start}]')
35
- if len(entities) > 0:
36
- print('Preparing payload for analysis ...')
37
- payload = prepare_payload(entities)
38
- print('Payload prepared for analysis')
39
- print('Analysing ...')
40
- analysis_start = time()
41
- analysis = rules(payload)
42
- analysis_end = time()
43
- print(f'Analysed [{analysis_end - analysis_start}]')
44
- print('Summarising ...')
45
- summary = {}
46
- summary_start = time()
47
- for verdict in ['Good', 'Average', 'Bad']:
48
- descriptions = '\n'.join([factor['reason'] for factor in analysis if factor['verdict'] == verdict])
49
- if len(descriptions) > 0:
50
- prompt = f"""Given the following analysis on the {verdict} factors of a customer's policy that they have bought, generate a crisp and catchy summary of the factors for a customer. Try to make it factor-wise with bullet points
51
- NOTE : THE POLICY WAS NOT SOLD BY US
52
- analysis : {descriptions}
53
- summary : """
54
- response = call_openai(prompt)
55
- print(response)
56
- else:
57
- response = ''
58
- summary[verdict] = response
59
- summary_end = time()
60
- # print(f'Summarised [{summary_end - summary_start}]')
61
- # factors_str = ''
62
- # for verdict in ['Good', 'Average', 'Bad']:
63
- # factors_str += verdict + ' Factors:'
64
- # factors_str += '\n' + '\n'.join([f"{factor['factor']}: {factor['reason']}" for factor in analysis if factor['verdict'] == verdict])
65
- # print('Suggesting ...')
66
- # suggestion_start = time()
67
- # suggestion = call_openai(f"""Given the following main factors and their values of a customer's health insurance policy, use these factors to compare with given Acko's health policy and suggest to the customer how the Average and Bad factors maybe covered better by Acko's policy.
68
- # Format response in less than 50 words and make it factor-wise. Try to format in points. Include emojis to make it catchy.
69
- # Customer Poliocy Factors:
70
- # {factors_str}
 
 
 
 
 
71
 
72
- # Acko Policy : {ACKO_POLICY}
 
73
 
74
- # Customer Suggestion : """)
75
- # suggestion_end = time()
76
- # print(f'Suggested [{suggestion_end - suggestion_start}]')
77
- response = [
78
- {
79
- 'stage' : 'OCR',
80
- 'response' : text,
81
- 'time' : ocr_end - ocr_start
82
- },
83
- {
84
- 'stage' : 'EXTRACTION',
85
- 'response' : {
86
- 'raw' : raw_response,
87
- 'processed' : entities
88
- },
89
- 'time' : ext_end - ext_start
90
- },
91
- {
92
- 'stage' : 'POST_PROCESS',
93
- 'response' : payload,
94
- 'time' : 0
95
- },
96
- {
97
- 'stage' : 'ANALYSE',
98
- 'response' : analysis,
99
- 'time' : analysis_end - analysis_start
100
- },
101
- {
102
- 'stage' : 'ANALYSIS_SUMMARY',
103
- 'response' : summary,
104
- 'time' : summary_end - summary_start
105
- },
106
- # {
107
- # 'stage' : 'SUGGEST',
108
- # 'response' : suggestion,
109
- # 'time' : suggestion_end - suggestion_start
110
- # }
111
- ]
112
  return response
 
113
 
114
- response = [
115
- {
116
- 'stage' : 'OCR',
117
- 'response' : text,
118
- 'time' : 0
119
- },
120
- {
121
- 'stage' : 'EXTRACTION',
122
- 'response' : {
123
- 'raw' : '',
124
- 'processed' : []
125
- },
126
- 'time' : 0
127
- },
128
- {
129
- 'stage' : 'POST_PROCESS',
130
- 'response' : {},
131
- 'time' : 0
132
- },
133
- {
134
- 'stage' : 'ANALYSE',
135
- 'response' : [],
136
- 'time' : 0
137
- },
138
- {
139
- 'stage' : 'ANALYSIS_SUMMARY',
140
- 'response' : {'Good' : '', 'Average' : '', 'Bad' : ''},
141
- 'time' : 0
142
- },
143
- # {
144
- # 'stage' : 'SUGGEST',
145
- # 'response' : '',
146
- # 'time' : 0
147
- # }
148
- ]
149
- return response
150
 
151
- else:
152
- response = [
153
- {
154
- 'stage' : 'OCR',
155
- 'response' : text,
156
- 'time' : ocr_end - ocr_start
157
- }
158
- ]
159
- try:
160
- print('Analysing ...')
161
- analysis_start = time()
162
- raw_response = call_openai(ANALYSIS_PROMPT + 'Policy : ' + text + f"\n\nConsider today's date as {datetime.today().day}/{datetime.today().month}/{datetime.today().year} for your analysis on waiting periods and dates")
163
- analysis_end = time()
164
- print('Analysis : ', raw_response)
165
- print(f'Analysed [{analysis_end - analysis_start}]')
166
- if raw_response is not None:
167
- response.append(
168
- {
169
- 'stage' : 'ANALYSE',
170
- 'response' : raw_response,
171
- 'time' : analysis_end - analysis_start
172
- }
173
- )
174
- print('Suggesting our policy ...')
175
- suggestion_start = time()
176
- suggestion = call_openai(SUGGESTION_PROMPT + "\nCustomer Policy Analysis : " + raw_response + "\nAcko's Policy : " + ACKO_POLICY)
177
- suggestion_end = time()
178
- print(f'Suggested [{suggestion_end - suggestion_start}]')
179
- if suggestion is not None:
180
- response.append({
181
- 'stage' : 'SUGGEST',
182
- 'response' : suggestion,
183
- 'time' : suggestion_end - suggestion_start
184
- }
185
- )
186
- return response
187
- except Exception as e:
188
- print(e)
189
- response.extend(
190
- [
191
- {
192
- 'stage' : 'ANALYSE',
193
- 'response' : '',
194
- 'time' : 0
195
- },
196
- {
197
- 'stage' : 'SUGGEST',
198
- 'response' : '',
199
- 'time' : 0
200
- }
201
- ]
202
- )
203
- return response
204
 
205
  if __name__ == '__main__':
206
  import os
@@ -208,6 +121,7 @@ if __name__ == '__main__':
208
  import sys
209
  from tqdm import tqdm
210
  filepaths = sys.argv[1:]
 
211
 
212
  for filepath in tqdm(filepaths):
213
  # if os.path.isfile(filepath.replace('.pdf', '.analysis.json')):
@@ -220,7 +134,7 @@ if __name__ == '__main__':
220
  elif filepath.endswith(('.txt', '.md')):
221
  file_bytes = open(filepath).read()
222
  end2end = True
223
- analysis = analyse(file_bytes, True)
224
  # print(analysis)
225
  basepath = os.path.splitext(filepath)[0]
226
  if not end2end:
 
4
  """
5
 
6
  # Imports
7
+ import os
8
  from time import time
9
  from datetime import datetime
10
 
11
+ from policy_analyser import PROMPTS_DIR, DATA_DIR
12
  from policy_analyser.ocr import PyMuPDF4LLMOCR
 
 
13
  from policy_analyser.llm import call_openai
14
 
15
class LOB:
    """Base pipeline for analysing a policy document of one line of business.

    Calling an instance runs three stages in order -- OCR, ANALYSE, SUGGEST --
    and returns one result dict per stage. Subclasses provide the ``_analyse``
    and ``_suggest`` steps.
    """

    def __init__(self, ocr_engine = 'open-source/pymupdf4llm'):
        """Configure the OCR engine and load the shared analysis prompt template.

        Args:
            ocr_engine: Identifier of the OCR backend. Only
                'open-source/pymupdf4llm' is recognised; for any other value
                no engine is configured and only ``str`` input can be processed.
        """
        # Always define the attribute so an unrecognised engine surfaces as a
        # clear error in __call__ instead of an AttributeError.
        self.engine = None
        if ocr_engine == 'open-source/pymupdf4llm':
            self.engine = PyMuPDF4LLMOCR()
        self.file_type = 'pdf'
        with open(os.path.join(PROMPTS_DIR, 'analysis.txt'), 'r') as f:
            self.analysis_prompt = f.read()

    def __call__(self, file_bytes):
        """Run OCR, analysis and suggestion on one policy document.

        Args:
            file_bytes: ``bytes``/``bytearray`` of the document, which are sent
                through the OCR engine, or a ``str`` of already-extracted text,
                which is used as-is.

        Returns:
            A list of three dicts for the 'OCR', 'ANALYSE' and 'SUGGEST' stages,
            in that order, each with the keys 'stage', 'response' and 'time'.
            A stage that fails, is skipped, or produces nothing keeps
            ``'response' : ''`` and ``'time' : 0``. Errors are printed, not raised.
        """
        response = [
            {'stage' : stage, 'response' : '', 'time' : 0}
            for stage in ('OCR', 'ANALYSE', 'SUGGEST')
        ]

        try:
            print('OCR Started ...')
            ocr_start = time()
            text = self._read_text(file_bytes)
            ocr_end = time()
            print(f'OCR done [{ocr_end - ocr_start}]')
        except Exception as ocr_e:
            print(f'Exception while OCR : {ocr_e}')
            return response
        if not text:
            return response
        response[0].update({'response' : text, 'time' : ocr_end - ocr_start})

        try:
            print('Analysing ...')
            analysis_start = time()
            raw_response = self._analyse(text = text)
            analysis_end = time()
            print('Analysis : ', raw_response)
            print(f'Analysed [{analysis_end - analysis_start}]')
        except Exception as analysis_e:
            print(f'Exception while analysing : {analysis_e}')
            return response
        if not raw_response:
            return response
        response[1].update({'response' : raw_response, 'time' : analysis_end - analysis_start})

        try:
            print('Suggesting our policy ...')
            suggestion_start = time()
            suggestion = self._suggest(analysis = raw_response)
            suggestion_end = time()
            print(f'Suggested [{suggestion_end - suggestion_start}]')
        except Exception as sugg_e:
            print(f'Exception while suggesting : {sugg_e}')
            return response
        if suggestion:
            response[2].update({'response' : suggestion, 'time' : suggestion_end - suggestion_start})
        return response

    def _read_text(self, file_bytes):
        """Return the document text: ``str`` input unchanged, binary input via OCR."""
        if isinstance(file_bytes, str):
            return file_bytes
        if isinstance(file_bytes, (bytearray, bytes)):
            # getattr keeps this safe even if __init__ was bypassed.
            engine = getattr(self, 'engine', None)
            if engine is None:
                raise RuntimeError('No OCR engine configured for binary input')
            text, _ = engine(file_bytes)
            return text
        # Previously this case left `text` unbound and failed with an obscure error.
        raise TypeError(f'Unsupported input type : {type(file_bytes).__name__}')

    def _analyse(self, **kwargs):
        """Analyse the policy text passed as ``text``; must be overridden."""
        # NotImplemented is a comparison sentinel, not an exception; raising it
        # produces a TypeError instead of the intended signal.
        raise NotImplementedError

    def _suggest(self, **kwargs):
        """Produce a suggestion from the analysis passed as ``analysis``; must be overridden."""
        raise NotImplementedError
83
 
84
class Health(LOB):
    """Health-insurance analyser: supplies the health prompts to the LOB pipeline."""

    def __init__(self, ocr_engine = 'open-source/pymupdf4llm'):
        """Load the health-specific prompt fragments and the Acko policy wording.

        Args:
            ocr_engine: Forwarded unchanged to ``LOB.__init__``.
        """
        super().__init__(ocr_engine)
        health_prompts_dir = os.path.join(PROMPTS_DIR, 'health')
        with open(os.path.join(health_prompts_dir, 'analysis_output_format.txt'), 'r') as f:
            self.analysis_output_format = f.read()
        with open(os.path.join(health_prompts_dir, 'rules.txt'), 'r') as f:
            self.rules = f.read()
        with open(os.path.join(health_prompts_dir, 'suggest.txt'), 'r') as f:
            self.suggest_prompt = f.read()
        with open(os.path.join(DATA_DIR, 'health.md'), 'r') as f:
            self.acko_policy = f.read()

    def _analyse(self, **kwargs):
        """Ask the LLM to analyse the policy text given as ``text``.

        Returns:
            The LLM response, or ``''`` when ``text`` is missing/empty or the
            LLM returns nothing.
        """
        text = kwargs.get('text')
        if not text:
            return ''
        prompt = (
            self.analysis_prompt
            .replace('{{lob}}', 'health')
            .replace('{{rules}}', self.rules)
            .replace('{{output_format}}', self.analysis_output_format)
        )
        # Read the clock once so day, month and year always belong to the same date.
        today = datetime.today()
        prompt += 'Policy : ' + text + f"\n\nConsider today's date as {today.day}/{today.month}/{today.year} for your analysis on waiting periods and dates"
        # `or ''` also covers a None response from the LLM call.
        return call_openai(prompt) or ''

    def _suggest(self, **kwargs):
        """Ask the LLM to compare the analysis given as ``analysis`` with Acko's policy.

        Returns:
            The LLM response, or ``''`` when ``analysis`` is missing/empty or
            the LLM returns nothing.
        """
        analysis = kwargs.get('analysis')
        if not analysis:
            return ''
        prompt = self.suggest_prompt + "\nCustomer Policy Analysis : " + analysis + "\nAcko's Policy : " + self.acko_policy
        return call_openai(prompt) or ''

    def __call__(self, file_bytes):
        """Run the inherited OCR -> ANALYSE -> SUGGEST pipeline on ``file_bytes``."""
        return super().__call__(file_bytes)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  if __name__ == '__main__':
119
  import os
 
121
  import sys
122
  from tqdm import tqdm
123
  filepaths = sys.argv[1:]
124
+ health = Health()
125
 
126
  for filepath in tqdm(filepaths):
127
  # if os.path.isfile(filepath.replace('.pdf', '.analysis.json')):
 
134
  elif filepath.endswith(('.txt', '.md')):
135
  file_bytes = open(filepath).read()
136
  end2end = True
137
+ analysis = health(file_bytes)
138
  # print(analysis)
139
  basepath = os.path.splitext(filepath)[0]
140
  if not end2end:
policy_analyser/analyse_.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Run analysis
3
+ @author : Sakshi Tantak
4
+ """
5
+
6
+ # Imports
7
+ from time import time
8
+ from datetime import datetime
9
+
10
+ from policy_analyser import ACKO_POLICY, ANALYSIS_PROMPT, SUGGESTION_PROMPT
11
+ from policy_analyser.ocr import PyMuPDF4LLMOCR
12
+ from policy_analyser.extraction import extract
13
+ from policy_analyser.rules import prepare_payload, rules
14
+ from policy_analyser.llm import call_openai
15
+
16
+ # OCR = AzureLayoutOCR()
17
+ OCR = PyMuPDF4LLMOCR()
18
+
19
+ def analyse(file_bytes, end2end = False):
20
+ print('OCR Started ...')
21
+ ocr_start = time()
22
+ if isinstance(file_bytes, str):
23
+ text = file_bytes
24
+ elif isinstance(file_bytes, (bytearray, bytes)):
25
+ text, _ = OCR(file_bytes)
26
+ ocr_end = time()
27
+ print(f'OCR done [{ocr_end - ocr_start}]')
28
+ if len(text) > 0:
29
+ if not end2end:
30
+ print('Extraction Started ...')
31
+ ext_start = time()
32
+ raw_response, entities = extract(text)
33
+ ext_end = time()
34
+ print(f'Extraction done [{ext_end - ext_start}]')
35
+ if len(entities) > 0:
36
+ print('Preparing payload for analysis ...')
37
+ payload = prepare_payload(entities)
38
+ print('Payload prepared for analysis')
39
+ print('Analysing ...')
40
+ analysis_start = time()
41
+ analysis = rules(payload)
42
+ analysis_end = time()
43
+ print(f'Analysed [{analysis_end - analysis_start}]')
44
+ print('Summarising ...')
45
+ summary = {}
46
+ summary_start = time()
47
+ for verdict in ['Good', 'Average', 'Bad']:
48
+ descriptions = '\n'.join([factor['reason'] for factor in analysis if factor['verdict'] == verdict])
49
+ if len(descriptions) > 0:
50
+ prompt = f"""Given the following analysis on the {verdict} factors of a customer's policy that they have bought, generate a crisp and catchy summary of the factors for a customer. Try to make it factor-wise with bullet points
51
+ NOTE : THE POLICY WAS NOT SOLD BY US
52
+ analysis : {descriptions}
53
+ summary : """
54
+ response = call_openai(prompt)
55
+ print(response)
56
+ else:
57
+ response = ''
58
+ summary[verdict] = response
59
+ summary_end = time()
60
+ # print(f'Summarised [{summary_end - summary_start}]')
61
+ # factors_str = ''
62
+ # for verdict in ['Good', 'Average', 'Bad']:
63
+ # factors_str += verdict + ' Factors:'
64
+ # factors_str += '\n' + '\n'.join([f"{factor['factor']}: {factor['reason']}" for factor in analysis if factor['verdict'] == verdict])
65
+ # print('Suggesting ...')
66
+ # suggestion_start = time()
67
+ # suggestion = call_openai(f"""Given the following main factors and their values of a customer's health insurance policy, use these factors to compare with given Acko's health policy and suggest to the customer how the Average and Bad factors maybe covered better by Acko's policy.
68
+ # Format response in less than 50 words and make it factor-wise. Try to format in points. Include emojis to make it catchy.
69
+ # Customer Poliocy Factors:
70
+ # {factors_str}
71
+
72
+ # Acko Policy : {ACKO_POLICY}
73
+
74
+ # Customer Suggestion : """)
75
+ # suggestion_end = time()
76
+ # print(f'Suggested [{suggestion_end - suggestion_start}]')
77
+ response = [
78
+ {
79
+ 'stage' : 'OCR',
80
+ 'response' : text,
81
+ 'time' : ocr_end - ocr_start
82
+ },
83
+ {
84
+ 'stage' : 'EXTRACTION',
85
+ 'response' : {
86
+ 'raw' : raw_response,
87
+ 'processed' : entities
88
+ },
89
+ 'time' : ext_end - ext_start
90
+ },
91
+ {
92
+ 'stage' : 'POST_PROCESS',
93
+ 'response' : payload,
94
+ 'time' : 0
95
+ },
96
+ {
97
+ 'stage' : 'ANALYSE',
98
+ 'response' : analysis,
99
+ 'time' : analysis_end - analysis_start
100
+ },
101
+ {
102
+ 'stage' : 'ANALYSIS_SUMMARY',
103
+ 'response' : summary,
104
+ 'time' : summary_end - summary_start
105
+ },
106
+ # {
107
+ # 'stage' : 'SUGGEST',
108
+ # 'response' : suggestion,
109
+ # 'time' : suggestion_end - suggestion_start
110
+ # }
111
+ ]
112
+ return response
113
+
114
+ response = [
115
+ {
116
+ 'stage' : 'OCR',
117
+ 'response' : text,
118
+ 'time' : 0
119
+ },
120
+ {
121
+ 'stage' : 'EXTRACTION',
122
+ 'response' : {
123
+ 'raw' : '',
124
+ 'processed' : []
125
+ },
126
+ 'time' : 0
127
+ },
128
+ {
129
+ 'stage' : 'POST_PROCESS',
130
+ 'response' : {},
131
+ 'time' : 0
132
+ },
133
+ {
134
+ 'stage' : 'ANALYSE',
135
+ 'response' : [],
136
+ 'time' : 0
137
+ },
138
+ {
139
+ 'stage' : 'ANALYSIS_SUMMARY',
140
+ 'response' : {'Good' : '', 'Average' : '', 'Bad' : ''},
141
+ 'time' : 0
142
+ },
143
+ # {
144
+ # 'stage' : 'SUGGEST',
145
+ # 'response' : '',
146
+ # 'time' : 0
147
+ # }
148
+ ]
149
+ return response
150
+
151
+ else:
152
+ response = [
153
+ {
154
+ 'stage' : 'OCR',
155
+ 'response' : text,
156
+ 'time' : ocr_end - ocr_start
157
+ }
158
+ ]
159
+ try:
160
+ print('Analysing ...')
161
+ analysis_start = time()
162
+ raw_response = call_openai(ANALYSIS_PROMPT + 'Policy : ' + text + f"\n\nConsider today's date as {datetime.today().day}/{datetime.today().month}/{datetime.today().year} for your analysis on waiting periods and dates")
163
+ analysis_end = time()
164
+ print('Analysis : ', raw_response)
165
+ print(f'Analysed [{analysis_end - analysis_start}]')
166
+ if raw_response is not None:
167
+ response.append(
168
+ {
169
+ 'stage' : 'ANALYSE',
170
+ 'response' : raw_response,
171
+ 'time' : analysis_end - analysis_start
172
+ }
173
+ )
174
+ print('Suggesting our policy ...')
175
+ suggestion_start = time()
176
+ suggestion = call_openai(SUGGESTION_PROMPT + "\nCustomer Policy Analysis : " + raw_response + "\nAcko's Policy : " + ACKO_POLICY)
177
+ suggestion_end = time()
178
+ print(f'Suggested [{suggestion_end - suggestion_start}]')
179
+ if suggestion is not None:
180
+ response.append({
181
+ 'stage' : 'SUGGEST',
182
+ 'response' : suggestion,
183
+ 'time' : suggestion_end - suggestion_start
184
+ }
185
+ )
186
+ return response
187
+ except Exception as e:
188
+ print(e)
189
+ response.extend(
190
+ [
191
+ {
192
+ 'stage' : 'ANALYSE',
193
+ 'response' : '',
194
+ 'time' : 0
195
+ },
196
+ {
197
+ 'stage' : 'SUGGEST',
198
+ 'response' : '',
199
+ 'time' : 0
200
+ }
201
+ ]
202
+ )
203
+ return response
204
+
205
if __name__ == '__main__':
    # Batch driver: run the end-to-end analysis on every path given on the
    # command line and write the results next to each input file.
    import os
    import json
    import sys
    from tqdm import tqdm
    filepaths = sys.argv[1:]

    for filepath in tqdm(filepaths):
        # if os.path.isfile(filepath.replace('.pdf', '.analysis.json')):
        #     continue
        # Never re-process files this script itself produced.
        if '.analysis' in filepath or '.e2e-analysis' in filepath:
            continue
        print(filepath)
        if filepath.endswith('.pdf'):
            with open(filepath, 'rb') as f:
                file_bytes = f.read()
        elif filepath.endswith(('.txt', '.md')):
            with open(filepath) as f:
                file_bytes = f.read()
        else:
            # Without this branch `file_bytes` would be unbound, or stale from
            # the previous iteration.
            print(f'Skipping unsupported file type : {filepath}')
            continue
        # The driver always runs the end-to-end (LLM-only) path.
        end2end = True
        analysis = analyse(file_bytes, end2end)
        # print(analysis)
        basepath = os.path.splitext(filepath)[0]
        if not end2end:
            with open(basepath + '.analysis.json', 'w') as f:
                json.dump(analysis, f, indent = 4)
        else:
            with open(basepath + '.e2e-analysis.json', 'w') as f:
                json.dump(analysis, f, indent = 4)
            # Look the stage up by name: the ANALYSE entry is absent when the
            # analysis produced nothing, so a fixed index is not safe.
            analysis_text = next(
                (item['response'] for item in (analysis or []) if item.get('stage') == 'ANALYSE'), ''
            )
            with open(basepath + '.e2e-analysis.md', 'w') as f:
                f.write(analysis_text)
policy_analyser/data/{Policy_Wordings_Acko_Personal_Health_Policy_Applicable_for_the_policies_sold_post_1_10_2024_64ea02eb51_ab3c8eefa2.md → health.md} RENAMED
File without changes
policy_analyser/llm.py CHANGED
@@ -25,5 +25,5 @@ def call_openai(system_prompt, seed = 42):
25
  # response_format = response_format,
26
  reasoning_effort = 'low'
27
  )
28
-
29
  return response.choices[0].message.content
 
25
  # response_format = response_format,
26
  reasoning_effort = 'low'
27
  )
28
+ print('LLM response : ', response)
29
  return response.choices[0].message.content
policy_analyser/prompts/analysis.txt CHANGED
@@ -1,154 +1,14 @@
1
- Given the markdown content of a customer's health insurance policy, analyse the insurance policy for the customer by applying given rules for specific factors of the policy.
2
 
3
  Apply the following rules enclosed in triple backticks on the policy to analyse it.
4
  Make sure you consider values for analysis factors on the basis of the customer's selected insurance plan when multiple plans are described in the policy terms.
5
  Make sure all factors appear in one of Good, Average or Bad only. No factor should be repeated in more than 1 verdict table.
6
  Note : Top cities = [Mumbai, Delhi, Bangalore, Chennai, Hyderabad, Gurgaon, Pune]
7
  ```
8
- IF Adults == 1:
9
- IF Is_Top_City:
10
- IF Sum_Insured >= 2500000:
11
- Verdict = "Good"
12
- ELSE IF Sum_Insured >= 1000000 AND Sum_Insured < 2500000:
13
- Verdict = "Average"
14
- ELSE:
15
- Verdict = "Bad"
16
- ELSE:
17
- IF Sum_Insured >= 1000000:
18
- Verdict = "Good"
19
- ELSE IF Sum_Insured >= 500000 AND Sum_Insured < 1000000:
20
- Verdict = "Average"
21
- ELSE:
22
- Verdict = "Bad"
23
-
24
- IF Adults >= 2:
25
- IF Children == 0:
26
- IF Is_Top_City:
27
- IF Sum_Insured >= 5000000:
28
- Verdict = "Good"
29
- ELSE IF Sum_Insured >= 2500000 AND Sum_Insured < 5000000:
30
- Verdict = "Average"
31
- ELSE:
32
- Verdict = "Bad"
33
- ELSE:
34
- IF Sum_Insured >= 2500000:
35
- Verdict = "Good"
36
- ELSE IF Sum_Insured >= 1000000 AND Sum_Insured < 2500000:
37
- Verdict = "Average"
38
- ELSE:
39
- Verdict = "Bad"
40
-
41
- IF Children >= 1:
42
- IF Children > 1 OR Is_Top_City:
43
- IF Sum_Insured >= 10000000:
44
- Verdict = "Good"
45
- ELSE IF Sum_Insured >= 5000000 AND Sum_Insured < 10000000:
46
- Verdict = "Average"
47
- ELSE:
48
- Verdict = "Bad"
49
- ELSE:
50
- IF Sum_Insured >= 5000000:
51
- Verdict = "Good"
52
- ELSE IF Sum_Insured >= 2500000 AND Sum_Insured < 5000000:
53
- Verdict = "Average"
54
- ELSE:
55
- Verdict = "Bad"
56
-
57
- # Room Rent Limit
58
- IF Room_Rent_Limit > 0:
59
- Verdict = "Bad"
60
- ELSE:
61
- Verdict = "Good"
62
-
63
- # Deductibles
64
- IF Deductible > 0:
65
- Verdict = "Bad"
66
- ELSE:
67
- Verdict = "Good"
68
-
69
- # Sublimits
70
- IF Sublimits == EMPTY:
71
- Verdict = "Good"
72
- ELSE:
73
- Verdict = "Bad"
74
-
75
- # Copayment
76
- IF Copay <= 5:
77
- Verdict = "Good"
78
- ELSE IF Copay > 5 AND Copay <= 10:
79
- Verdict = "Average"
80
- ELSE:
81
- Verdict = "Bad"
82
-
83
- # Pre-existing Diseases (PED) Waiting Period
84
- IF PED_Waiting_Period > 0:
85
- IF Policy_Age > PED_Waiting_Period:
86
- Verdict = "Good"
87
- ELSE:
88
- Verdict = "Bad"
89
- ELSE:
90
- Verdict = "Good"
91
-
92
- # 30-Day Waiting Period
93
- IF Thirty_Day_Waiting_Period:
94
- IF Policy_Age > 1:
95
- Verdict = "Good"
96
- ELSE:
97
- Verdict = "Bad"
98
- ELSE:
99
- Verdict = "Good"
100
-
101
- # Specific Illness Waiting Period
102
- IF Specific_Illness_Waiting_Period > 0:
103
- IF Policy_Age > Specific_Illness_Waiting_Period:
104
- Verdict = "Good"
105
- ELSE:
106
- Verdict = "Bad"
107
- ELSE:
108
- Verdict = "Good"
109
-
110
- # Maternity Benefits
111
- IF Maternity_Benefits:
112
- Verdict = "Good"
113
- IF Maternity_Waiting_Period > 0:
114
- IF Policy_Age > Maternity_Waiting_Period:
115
- Verdict = "Good"
116
- ELSE:
117
- Verdict = "Bad"
118
- ELSE:
119
- Verdict = "Good"
120
- ELSE:
121
- Verdict = "Bad"
122
  ```
123
 
124
  Format your response in the following way, to present analysis to customer. Don't keep a table if there are no factors in it.
125
  Use appropriate language and emojis to portray analysis and verdicts to the customer. Generate short and crisp verdicts and analysis. Be discreet about the rules: do not expose them to the customer, but use them to explain your reasoning and analysis:
126
 
127
- <CUSTOMER_RESPONSE>
128
- # Our Analysis of your policy [Name of policy] by [Name of insurance company]
129
-
130
- <GOOD>
131
- ## Good Factors
132
- | Factor | Your policy | Our Analysis |
133
- | --- | --- | --- |
134
- | Sum Insured | Value of sum insured in the policy | Analysis of why sum insured is good based on the given rules |
135
- </GOOD>
136
-
137
- <AVERAGE>
138
- ## Average Factors
139
- | Factor | Your policy | Our Analysis |
140
- | --- | --- | --- |
141
- | Copay | Value of copayment in the policy | Analysis of why copay is average based on the given rules |
142
- </AVERAGE>
143
-
144
- <BAD>
145
- ## Bad Factors
146
- | Factor | Your policy | Our Analysis |
147
- | --- | --- | --- |
148
- | Deductible | Value of deductible in the policy | Analysis of why deductible is bad based on the given rules |
149
- </BAD>
150
-
151
- <FINAL_VERDICT>
152
- Final and short point-wise verdict on the analysis
153
- </FINAL_VERDICT>
154
- </CUSTOMER_RESPONSE>
 
1
+ Given the markdown content of a customer's {{lob}} insurance policy, analyse the insurance policy for the customer by applying given rules for specific factors of the policy.
2
 
3
  Apply the following rules enclosed in triple backticks on the policy to analyse it.
4
  Make sure you consider values for analysis factors on the basis of the customer's selected insurance plan when multiple plans are described in the policy terms.
5
  Make sure all factors appear in one of Good, Average or Bad only. No factor should be repeated in more than 1 verdict table.
6
  Note : Top cities = [Mumbai, Delhi, Bangalore, Chennai, Hyderabad, Gurgaon, Pune]
7
  ```
8
+ {{rules}}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  ```
10
 
11
  Format your response in the following way, to present analysis to customer. Don't keep a table if there are no factors in it.
12
  Use appropriate language and emojis to portray analysis and verdicts to the customer. Generate short and crisp verdicts and analysis. Be discreet about the rules: do not expose them to the customer, but use them to explain your reasoning and analysis:
13
 
14
+ {{output_format}}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
policy_analyser/prompts/auto/__init__.py ADDED
File without changes
policy_analyser/prompts/health/__init__.py ADDED
File without changes
policy_analyser/prompts/health/analysis_output_format.txt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <CUSTOMER_RESPONSE>
2
+ # Our Analysis of your policy [Name of policy] by [Name of insurance company]
3
+
4
+ <GOOD>
5
+ ## Good Factors
6
+ | Factor | Your policy | Our Analysis |
7
+ | --- | --- | --- |
8
+ | Sum Insured | Value of sum insured in the policy | Analysis of why sum insured is good based on the given rules |
9
+ </GOOD>
10
+
11
+ <AVERAGE>
12
+ ## Average Factors
13
+ | Factor | Your policy | Our Analysis |
14
+ | --- | --- | --- |
15
+ | Copay | Value of copayment in the policy | Analysis of why copay is average based on the given rules |
16
+ </AVERAGE>
17
+
18
+ <BAD>
19
+ ## Bad Factors
20
+ | Factor | Your policy | Our Analysis |
21
+ | --- | --- | --- |
22
+ | Deductible | Value of deductible in the policy | Analysis of why deductible is bad based on the given rules |
23
+ </BAD>
24
+ </CUSTOMER_RESPONSE>
policy_analyser/prompts/{extraction.txt → health/extraction.txt} RENAMED
File without changes
policy_analyser/prompts/health/rules.txt ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ IF Adults == 1:
2
+ IF Is_Top_City:
3
+ IF Sum_Insured >= 2500000:
4
+ Verdict = "Good"
5
+ ELSE IF Sum_Insured >= 1000000 AND Sum_Insured < 2500000:
6
+ Verdict = "Average"
7
+ ELSE:
8
+ Verdict = "Bad"
9
+ ELSE:
10
+ IF Sum_Insured >= 1000000:
11
+ Verdict = "Good"
12
+ ELSE IF Sum_Insured >= 500000 AND Sum_Insured < 1000000:
13
+ Verdict = "Average"
14
+ ELSE:
15
+ Verdict = "Bad"
16
+
17
+ IF Adults >= 2:
18
+ IF Children == 0:
19
+ IF Is_Top_City:
20
+ IF Sum_Insured >= 5000000:
21
+ Verdict = "Good"
22
+ ELSE IF Sum_Insured >= 2500000 AND Sum_Insured < 5000000:
23
+ Verdict = "Average"
24
+ ELSE:
25
+ Verdict = "Bad"
26
+ ELSE:
27
+ IF Sum_Insured >= 2500000:
28
+ Verdict = "Good"
29
+ ELSE IF Sum_Insured >= 1000000 AND Sum_Insured < 2500000:
30
+ Verdict = "Average"
31
+ ELSE:
32
+ Verdict = "Bad"
33
+
34
+ IF Children >= 1:
35
+ IF Children > 1 OR Is_Top_City:
36
+ IF Sum_Insured >= 10000000:
37
+ Verdict = "Good"
38
+ ELSE IF Sum_Insured >= 5000000 AND Sum_Insured < 10000000:
39
+ Verdict = "Average"
40
+ ELSE:
41
+ Verdict = "Bad"
42
+ ELSE:
43
+ IF Sum_Insured >= 5000000:
44
+ Verdict = "Good"
45
+ ELSE IF Sum_Insured >= 2500000 AND Sum_Insured < 5000000:
46
+ Verdict = "Average"
47
+ ELSE:
48
+ Verdict = "Bad"
49
+
50
+ # Room Rent Limit
51
+ IF Room_Rent_Limit > 0:
52
+ Verdict = "Bad"
53
+ ELSE:
54
+ Verdict = "Good"
55
+
56
+ # Deductibles
57
+ IF Deductible > 0:
58
+ Verdict = "Bad"
59
+ ELSE:
60
+ Verdict = "Good"
61
+
62
+ # Sublimits
63
+ IF Sublimits == EMPTY:
64
+ Verdict = "Good"
65
+ ELSE:
66
+ Verdict = "Bad"
67
+
68
+ # Copayment
69
+ IF Copay <= 5:
70
+ Verdict = "Good"
71
+ ELSE IF Copay > 5 AND Copay <= 10:
72
+ Verdict = "Average"
73
+ ELSE:
74
+ Verdict = "Bad"
75
+
76
+ # Pre-existing Diseases (PED) Waiting Period
77
+ IF PED_Waiting_Period > 0:
78
+ IF Policy_Age > PED_Waiting_Period:
79
+ Verdict = "Good"
80
+ ELSE:
81
+ Verdict = "Bad"
82
+ ELSE:
83
+ Verdict = "Good"
84
+
85
+ # 30-Day Waiting Period
86
+ IF Thirty_Day_Waiting_Period:
87
+ IF Policy_Age > 1:
88
+ Verdict = "Good"
89
+ ELSE:
90
+ Verdict = "Bad"
91
+ ELSE:
92
+ Verdict = "Good"
93
+
94
+ # Specific Illness Waiting Period
95
+ IF Specific_Illness_Waiting_Period > 0:
96
+ IF Policy_Age > Specific_Illness_Waiting_Period:
97
+ Verdict = "Good"
98
+ ELSE:
99
+ Verdict = "Bad"
100
+ ELSE:
101
+ Verdict = "Good"
102
+
103
+ # Maternity Benefits
104
+ IF Maternity_Benefits:
105
+ Verdict = "Good"
106
+ IF Maternity_Waiting_Period > 0:
107
+ IF Policy_Age > Maternity_Waiting_Period:
108
+ Verdict = "Good"
109
+ ELSE:
110
+ Verdict = "Bad"
111
+ ELSE:
112
+ Verdict = "Good"
113
+ ELSE:
114
+ Verdict = "Bad"
policy_analyser/prompts/{suggest.txt → health/suggest.txt} RENAMED
File without changes
policy_analyser/prompts/life/__init__.py ADDED
File without changes