Dhruv-Ty committed on
Commit
90d2de3
·
verified ·
1 Parent(s): 4214066

Create utils.py

Browse files
Files changed (1) hide show
  1. src/utils.py +217 -0
src/utils.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This module contains utility functions for text processing and other helper functions.
3
+ """
4
+
5
+ import re
6
+ import os
7
+ import base64
8
+
9
+
10
def has_meaningful_content(text):
    """
    Check if explanation has meaningful content.

    Text is considered NOT meaningful when it is empty/None, consists only
    of whitespace, consists only of separator characters (``=``, ``-``,
    ``_``, ``*``) optionally mixed with whitespace, or is a short
    (< 20 characters) string containing the "## REASONING" header.

    Args:
        text (str): The text to check

    Returns:
        bool: True if the text has meaningful content, False otherwise
    """
    if not text:
        return False

    stripped_text = text.strip()
    # Whitespace-only input carries no content either.
    if not stripped_text:
        return False

    # Check if the text is just equal signs or other separators. All
    # whitespace (not only newlines) is dropped first so that separator
    # lines padded or joined by spaces are still recognised.
    compact_text = re.sub(r'\s+', '', stripped_text)
    if re.fullmatch(r'[=\-_*]+', compact_text):
        return False

    # Check if the text only contains "## REASONING" with no actual content
    if "## REASONING" in stripped_text and len(stripped_text) < 20:
        return False

    return True
33
+
34
+
35
def remove_reasoning_and_sources(text):
    """
    Remove reasoning and sources sections from the main response text.

    The cleaning happens in four steps:
      1. regex removal of reasoning sections,
      2. regex removal of sources/references sections,
      3. removal of inline citations such as ``[1]`` or ``[source_id]``,
      4. a line-by-line pass that drops anything still sitting under a
         reasoning/sources/references header (e.g. a trailing section
         that is not followed by a newline, which the regexes miss).
    Finally, markdown links ``[label](http...)`` are reduced to ``label``.

    Args:
        text (str): The text to clean

    Returns:
        str: Text without reasoning and sources sections. Empty or None
            input yields an empty string.
    """
    # Guard against None/empty input; re.sub would raise on None.
    if not text:
        return ""

    # First, remove any reasoning sections
    pattern_reasoning = r'(?i)(\n+\s*reasoning:|\n+\s*\*{0,2}reasoning\*{0,2}:?|\n+\s*#{1,3}\s*reasoning).*?(?=\n+\s*(?:#{1,3}|sources:|references:|\Z))'
    cleaned_text = re.sub(pattern_reasoning, '', text, flags=re.DOTALL)

    # Then, remove any sources/references sections
    pattern_sources = r'(?i)(\n+\s*sources:|\n+\s*references:|\n+\s*\*{0,2}sources\*{0,2}:?|\n+\s*\*{0,2}references\*{0,2}:?|\n+\s*#{1,3}\s*sources|\n+\s*#{1,3}\s*references).*?(?=\n+\s*(?:#{1,3}|\Z))'
    cleaned_text = re.sub(pattern_sources, '', cleaned_text, flags=re.DOTALL)

    # Also remove any source citations in the text (e.g., [1], [source_id]).
    # The negative lookahead leaves the label of a markdown link
    # "[label](http...)" alone so the link can be unwrapped to its label
    # at the end instead of degrading to a dangling "(http...)".
    cleaned_text = re.sub(r'\[([\w\d:_\-\.+]+)\](?!\(https?://)', '', cleaned_text)

    # Process line by line to handle sections more comprehensively
    section_header = re.compile(
        r'(?i)^(\s*reasoning:|\s*sources:|\s*references:|\s*\*{0,2}reasoning\*{0,2}:?|\s*\*{0,2}sources\*{0,2}:?|\s*\*{0,2}references\*{0,2}:?|\s*#{1,3}\s*reasoning|\s*#{1,3}\s*sources|\s*#{1,3}\s*references)'
    )
    # A new section is a markdown heading or a "Label:" line. A URL scheme
    # such as "https://" is excluded: a bare link inside a sources list is
    # not a new section and must not end the skip.
    new_section = re.compile(r'(?i)^(\s*#{1,3}|\s*[a-zA-Z]+:(?!//))')

    filtered_lines = []
    skip_section = False

    for line in cleaned_text.split('\n'):
        # Check if we should skip this line (part of reasoning or sources)
        if section_header.search(line):
            skip_section = True
            continue
        # Check if we're entering a new section
        if skip_section and new_section.search(line):
            skip_section = False

        # Only keep lines that aren't in sections we want to skip
        if not skip_section:
            filtered_lines.append(line)

    # Replace any remaining markdown links with just their label
    result = '\n'.join(filtered_lines).strip()
    result = re.sub(r'\[([^\]]+)\]\(https?://[^)]+\)', r'\1', result)

    return result
79
+
80
+
81
def clean_explanation(text):
    """
    Remove duplicate sources sections and data availability notes from explanation.

    Steps performed:
      * drop any "DATA AVAILABILITY NOTE" section (up to the next heading),
      * normalise numbered reasoning points that were glued to, or turned
        into, markdown headings,
      * remove every "## Sources" / "## SOURCES [USED]" section and append
        the content of the last one under a single "## Sources" heading,
        with numbered entries converted to bullet points.

    A sources section ends at the next markdown heading (or end of text),
    so unrelated sections that sit between two sources sections are kept.
    A sources header is only recognised when preceded by a newline.

    Args:
        text (str): The explanation text to clean

    Returns:
        str: Cleaned explanation text (falsy input is returned unchanged)
    """
    if not text:
        return text

    # Remove DATA AVAILABILITY NOTE section
    pattern_data_note = r'\n+\s*#{1,3}\s*DATA AVAILABILITY NOTE.*?(?=\n+\s*#{1,3}|\Z)'
    cleaned_text = re.sub(pattern_data_note, '', text, flags=re.DOTALL)

    # Fix formatting issues with reasoning points - ensure consistent formatting
    pattern_reasoning_headers = r'(#{1,3}\s*REASONING[^#]*?)#{1,3}\s*(\d+\.\s+)'
    cleaned_text = re.sub(pattern_reasoning_headers, r'\1\2', cleaned_text, flags=re.DOTALL)

    # Remove any "REASONING1." pattern which creates the heading effect
    cleaned_text = re.sub(r'(#{1,3}\s*REASONING)(\d+\.)', r'\1', cleaned_text)

    # Normalize all reasoning points to use the same format
    cleaned_text = re.sub(r'(\n+)(\d+\.)', r'\1 \2', cleaned_text)

    # Match each sources section: its header line plus the content up to
    # (not including) the next markdown heading or the end of the text.
    # Bounding the content at the next heading prevents unrelated sections
    # between two sources blocks from being deleted with them.
    sources_section = re.compile(
        r'\n+\s*#{1,3}\s+(?:SOURCES|Sources)(?:\s+USED)?[^\n]*'
        r'(.*?)(?=\n+\s*#{1,3}|\Z)',
        flags=re.DOTALL,
    )

    # Keep only the content of the last sources section
    section_contents = sources_section.findall(cleaned_text)
    source_content = section_contents[-1] if section_contents else ""

    # Remove all sources sections from the text
    cleaned_text = sources_section.sub('', cleaned_text)

    # Collapse runs of three or more newlines into a single blank line
    cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)

    # Add the sources section back with a consistent heading
    source_content = source_content.strip()
    if source_content:
        # Turn numbered entries ("1.", "2.", ...) into bullet points
        source_content = re.sub(r'^(\s*)(\d+\.)', r'\1•', source_content, flags=re.MULTILINE)

        # Add a clean, consistent "Sources" heading
        cleaned_text = cleaned_text.strip()
        if cleaned_text:
            cleaned_text += "\n\n"
        cleaned_text += "## Sources\n" + source_content

    return cleaned_text.strip()
154
+
155
+
156
def get_image_base64(image_path):
    """
    Encode image to base64.

    The file is opened directly rather than checked with
    ``os.path.exists`` first, so there is no window in which the file can
    disappear between the check and the read. Failures are reported on
    stdout and signalled by returning None; no exception is propagated
    for missing/unreadable paths or invalid path arguments.

    Args:
        image_path (str): Path to the image file

    Returns:
        str: Base64 encoded image or None if error
    """
    try:
        with open(image_path, "rb") as img_file:
            return base64.b64encode(img_file.read()).decode()
    except (FileNotFoundError, NotADirectoryError):
        # The path (or one of its parent components) does not exist.
        print(f"Image not found: {image_path}")
    except (OSError, TypeError, ValueError) as e:
        # OSError: unreadable file, directory, permissions, ...
        # TypeError/ValueError: invalid path argument (e.g. None, NUL byte).
        print(f"Error loading image: {e}")
    return None
176
+
177
+
178
def format_conversation_history(history, patient_info=None):
    """
    Format the conversation history into a string suitable for LLM processing.

    The output starts with a "# Medical Consultation" title, optionally
    followed by a "## Patient Information" block (name, age, gender), and
    then a "## Conversation Transcript" section. User messages are labelled
    "PATIENT:", assistant messages "ASSISTANT:"; an assistant message's
    non-empty "explanation" is appended as "REASONING:". Messages with
    empty content, or with any other role, are skipped.

    Args:
        history (list): List of message dictionaries (None is treated as
            an empty history)
        patient_info (dict, optional): Dictionary with patient information

    Returns:
        str: Formatted conversation text for report generation
    """

    def _text(value):
        # Missing keys and explicit None values both count as empty;
        # anything else is coerced to str so .strip() cannot fail.
        return "" if value is None else str(value).strip()

    # Collect pieces and join once instead of repeated string concatenation.
    parts = ["# Medical Consultation\n\n"]

    # Add patient info if provided
    if patient_info:
        parts.append("## Patient Information\n")
        parts.append(f"* Name: {patient_info.get('name', '')}\n")
        parts.append(f"* Age: {patient_info.get('age', '')}\n")
        parts.append(f"* Gender: {patient_info.get('gender', '')}\n\n")

    parts.append("## Conversation Transcript\n\n")

    for message in history or []:
        role = _text(message.get("role")).lower()
        content = _text(message.get("content"))

        if not content:
            continue  # Skip empty messages

        if role == "user":
            parts.append(f"PATIENT: {content}\n\n")
        elif role == "assistant":
            parts.append(f"ASSISTANT: {content}\n\n")
            # Include explanations which often contain diagnostic reasoning
            # NOTE(review): the explanation is emitted only for assistant
            # messages - confirm this matches the intended nesting.
            explanation = _text(message.get("explanation"))
            if explanation:
                parts.append(f"REASONING: {explanation}\n\n")

    return "".join(parts)