PenguinMan committed on
Commit
a4c7002
·
verified ·
1 Parent(s): cf6fecb

Upload api.py

Browse files
Files changed (1) hide show
  1. api.py +321 -0
api.py ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, UploadFile, File, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from pydantic import BaseModel
4
+ import sqlite3
5
+ import os
6
+ import pytesseract
7
+ from PIL import Image
8
+ from pdf2image import convert_from_path
9
+ from groq import Groq
10
+ import json
11
+ import logging
12
+
13
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Configuration ---
DATABASE = "medidoc.db"  # SQLite database file path
UPLOAD_FOLDER = "uploads"  # Directory that receives uploaded documents
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# --- Groq Client Initialization ---
# The API key is taken from the environment only. A real key must never be
# embedded in source as a fallback value: anything committed here is readable
# by everyone with access to the repository and its history.
# NOTE(review): the key that used to be hard-coded as the default on this line
# has been published with the commit and should be revoked.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    # Fail at start-up with a clear message instead of on the first request.
    raise RuntimeError(
        "GROQ_API_KEY environment variable is not set; "
        "it is required to create the Groq client"
    )
client = Groq(api_key=GROQ_API_KEY)
26
+
27
+ # --- Database Setup ---
28
def init_db():
    """Create the ``documents`` table in the SQLite database if it is missing.

    The table stores one row per processed upload: the file name, the fields
    extracted by the LLM (category, date, doctor, hospital, summary) and the
    raw extracted text in ``content``.

    Any failure is logged and swallowed, so this function never raises.
    """
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS documents (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            filename TEXT NOT NULL,
            category TEXT,
            document_date TEXT,
            doctor_name TEXT,
            hospital_name TEXT,
            summary TEXT,
            content TEXT
        )
    """
    try:
        conn = sqlite3.connect(DATABASE)
        try:
            conn.execute(create_table_sql)
            conn.commit()
        finally:
            # Release the connection even when the statement or commit fails.
            conn.close()
        logger.info("Database initialized successfully")
    except Exception as e:
        logger.error(f"Database initialization failed: {e}")
49
+
50
init_db()

# --- FastAPI App ---
app = FastAPI(title="MediDoc API", version="1.0.0")

# Add CORS middleware: every origin, method and header is accepted and
# credentials are allowed.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_credentials=True,
    allow_origins=["*"],  # In production, specify exact origins
)
63
+
64
+ # --- Helper Functions ---
65
def extract_text_from_file(filepath: str) -> str:
    """Run OCR over a PDF or image file and return the recognised text.

    Paths ending in ``.pdf`` (case-insensitive) are converted to page images
    and each page is OCR'd; the page texts are joined with newlines. Any other
    path is opened as a single image.

    Returns the stripped text, or ``""`` when the file does not exist or any
    step raises (the error is logged).
    """
    try:
        if not os.path.exists(filepath):
            logger.error(f"File not found: {filepath}")
            return ""

        is_pdf = filepath.lower().endswith(".pdf")
        if not is_pdf:
            # Handle image files
            with Image.open(filepath) as image:
                ocr_text = pytesseract.image_to_string(image)
            return ocr_text.strip()

        page_texts = [
            pytesseract.image_to_string(page_image)
            for page_image in convert_from_path(filepath)
        ]
        return "\n".join(page_texts).strip()

    except Exception as e:
        logger.error(f"Error extracting text from {filepath}: {e}")
        return ""
87
+
88
def _strip_code_fence(raw: str) -> str:
    """Return *raw* without a surrounding Markdown fence (``` or ```json)."""
    cleaned = raw.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        # Drop the optional language tag that follows the opening fence.
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def process_with_llm(text: str) -> dict:
    """Analyze medical text using Groq's Llama model.

    Only the first 2000 characters of *text* are sent to the model, which is
    asked to reply with a JSON object.

    Args:
        text: Text extracted from the uploaded document.

    Returns:
        A dict with the keys ``category``, ``document_date``, ``doctor_name``,
        ``hospital_name`` and ``summary``, each holding a string.
        Blank input yields an "Empty Document" record without calling the
        model. If the API call fails or the reply is not a JSON object, a
        generic fallback record (category "Other") is returned instead; this
        function does not raise for those failures.
    """
    if not text.strip():
        return {
            "category": "Empty Document",
            "document_date": "N/A",
            "doctor_name": "N/A",
            "hospital_name": "N/A",
            "summary": "Document appears to be empty or text could not be extracted.",
        }

    system_prompt = """
    You are an expert medical data extraction assistant. Analyze the provided text from a medical document and extract key information.
    Respond ONLY with a valid JSON object containing exactly these keys:
    - "category": Choose from "Prescription", "Lab Report", "Medical Bill", "Pharmacy Bill", "Discharge Summary", "Consultation Notes", "Other"
    - "document_date": Date in YYYY-MM-DD format. If not found, use "N/A"
    - "doctor_name": Full name of the doctor. If not found, use "N/A"
    - "hospital_name": Name of hospital/clinic. If not found, use "N/A"
    - "summary": A brief, clear summary in 1-2 sentences describing what this document is about

    Return only the JSON object, no other text.
    """

    fallback_response = {
        "category": "Other",
        "document_date": "N/A",
        "doctor_name": "N/A",
        "hospital_name": "N/A",
        "summary": "Medical document processed but specific information could not be extracted.",
    }

    # Bound before the try block so the JSON error handler can always log it.
    response_content = ""
    try:
        completion = client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Medical document text:\n\n{text[:2000]}"},  # Limit text length
            ],
            temperature=0.1,
            max_tokens=300,
            top_p=1,
            stream=False,
        )

        # The message content may be None; treat that as an empty reply so it
        # is reported as a parsing problem rather than an attribute error.
        response_content = _strip_code_fence(completion.choices[0].message.content or "")
        parsed_response = json.loads(response_content)

    except json.JSONDecodeError as e:
        logger.error(f"JSON Parsing Error: {e}\nRaw Response: {response_content}")
        return fallback_response
    except Exception as e:
        logger.error(f"Error with Groq API: {e}")
        return fallback_response

    # Valid JSON that is not an object (e.g. a list or a bare string) cannot
    # carry the required keys.
    if not isinstance(parsed_response, dict):
        logger.error(f"JSON Parsing Error: expected a JSON object\nRaw Response: {response_content}")
        return fallback_response

    # Validate required keys: fill in missing/null values and coerce anything
    # that is not a string, so every field can be bound as a SQLite TEXT value.
    required_keys = ["category", "document_date", "doctor_name", "hospital_name", "summary"]
    for key in required_keys:
        value = parsed_response.get(key)
        if value is None:
            parsed_response[key] = "N/A"
        elif not isinstance(value, str):
            parsed_response[key] = str(value)

    return parsed_response
157
+
158
+ # --- API Endpoints ---
159
@app.get("/")
async def root():
    # Fixed payload signalling that the service is up.
    status_payload = {"message": "MediDoc API is running"}
    return status_payload
162
+
163
@app.post("/upload/")
async def upload_document(file: UploadFile = File(...)):
    """Upload and process a medical document.

    The file is saved under ``UPLOAD_FOLDER``, its text is extracted by OCR,
    the text is analysed by the LLM, and the result is stored in the
    ``documents`` table.

    Returns:
        ``{"filename": ..., "info": <extracted fields>, "status": "success"}``.

    Raises:
        HTTPException: 400 for an unsupported content type, a missing or
            invalid filename, an empty file, or a file with no extractable
            text; 500 for any other failure.
    """
    try:
        # Validate file type
        allowed_types = {'application/pdf', 'image/jpeg', 'image/jpg', 'image/png'}
        if file.content_type not in allowed_types:
            raise HTTPException(status_code=400, detail="Only PDF and image files are allowed")

        # The filename is client-controlled. Keep only its final path component
        # so names such as "../../x" or "C:\\x" cannot escape UPLOAD_FOLDER.
        safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
        if not safe_name or safe_name in (".", ".."):
            raise HTTPException(status_code=400, detail="Uploaded file has no valid filename")

        # Read before opening the destination so an empty upload does not
        # leave a zero-byte file behind.
        content = await file.read()
        if not content:
            raise HTTPException(status_code=400, detail="Uploaded file is empty")

        # Save uploaded file
        filepath = os.path.join(UPLOAD_FOLDER, safe_name)
        with open(filepath, "wb") as buffer:
            buffer.write(content)

        logger.info(f"File saved: {filepath}")

        # NOTE(review): the OCR and LLM calls below are synchronous and run
        # inside this coroutine; consider moving them to a worker thread.

        # Extract text
        text = extract_text_from_file(filepath)
        if not text.strip():
            # Clean up the file
            os.remove(filepath)
            raise HTTPException(status_code=400, detail="Could not extract text from the uploaded file")

        # Process with LLM
        processed_data = process_with_llm(text)

        # Save to database
        conn = sqlite3.connect(DATABASE)
        try:
            conn.execute(
                """INSERT INTO documents
                (filename, category, document_date, doctor_name, hospital_name, summary, content)
                VALUES (?, ?, ?, ?, ?, ?, ?)""",
                (
                    safe_name,
                    processed_data.get("category", "N/A"),
                    processed_data.get("document_date", "N/A"),
                    processed_data.get("doctor_name", "N/A"),
                    processed_data.get("hospital_name", "N/A"),
                    processed_data.get("summary", "N/A"),
                    text,
                ),
            )
            conn.commit()
        finally:
            # Close the connection even when the insert fails.
            conn.close()

        logger.info(f"Document processed successfully: {safe_name}")
        return {"filename": safe_name, "info": processed_data, "status": "success"}

    except HTTPException:
        # Deliberate client-facing errors pass through unchanged.
        raise
    except Exception as e:
        logger.error(f"Unexpected error processing file: {e}")
        raise HTTPException(status_code=500, detail="Internal server error occurred while processing the file")
220
+
221
@app.get("/documents/")
def get_documents():
    """Retrieve all processed documents.

    Rows are ordered by ``document_date`` descending, with rows whose date is
    the literal ``'N/A'`` placed last. The raw ``content`` column is not
    included.

    Returns:
        ``{"documents": [<row dict>, ...], "count": <number of rows>}``.

    Raises:
        HTTPException: 500 when the database cannot be queried.
    """
    select_sql = """
        SELECT id, filename, category, document_date, doctor_name, hospital_name, summary
        FROM documents
        ORDER BY
            CASE WHEN document_date = 'N/A' THEN 1 ELSE 0 END,
            document_date DESC
    """
    try:
        conn = sqlite3.connect(DATABASE)
        try:
            # sqlite3.Row lets each row be converted to a dict keyed by column.
            conn.row_factory = sqlite3.Row
            documents = [dict(row) for row in conn.execute(select_sql)]
        finally:
            # Close the connection even when the query fails.
            conn.close()
        return {"documents": documents, "count": len(documents)}
    except Exception as e:
        logger.error(f"Error retrieving documents: {e}")
        raise HTTPException(status_code=500, detail="Could not retrieve documents")
241
+
242
class SearchResult(BaseModel):
    # Response model of the /search/ endpoint.

    # Text of the answer produced for the query.
    answer: str

    # One entry per document whose filename appears in the answer.
    sources: list
245
+
246
@app.get("/search/", response_model=SearchResult)
def search_medical_history(query: str):
    """Search through medical documents using natural language.

    Every stored document (filename, category, summary and the first 1500
    characters of its content) is placed in the system prompt, and the model
    is asked to answer *query* from that context only.

    Args:
        query: The user's question; must not be blank.

    Returns:
        ``{"answer": <model reply>, "sources": [...]}`` where ``sources``
        lists the documents whose filename occurs (case-insensitively) in the
        answer.

    Raises:
        HTTPException: 400 for a blank query; 500 when the database or the
            model call fails.
    """
    if not query.strip():
        raise HTTPException(status_code=400, detail="Search query cannot be empty")

    try:
        conn = sqlite3.connect(DATABASE)
        try:
            all_docs = conn.execute(
                "SELECT filename, content, summary, category FROM documents"
            ).fetchall()
        finally:
            # Close the connection even when the query fails.
            conn.close()

        if not all_docs:
            return {"answer": "No documents have been uploaded yet. Please upload some medical documents first.", "sources": []}

        # Prepare context for the AI
        # NOTE(review): the context grows with the number of stored documents
        # and is not capped; confirm it stays within the model's input limit.
        context_parts = []
        for i, (filename, content, summary, category) in enumerate(all_docs, start=1):
            # The content column is nullable; treat NULL as empty text.
            excerpt = (content or "")[:1500]
            context_parts.append(
                f"Document {i}: {filename}\nCategory: {category}\nSummary: {summary}\nContent: {excerpt}"
            )

        context = "\n\n---\n\n".join(context_parts)

        system_prompt = f"""
        You are a medical assistant helping a patient understand their medical history.
        Answer the user's question based ONLY on the provided medical documents.

        Guidelines:
        - Provide a clear, helpful answer
        - Mention specific document names when referencing information
        - If information is not available in the documents, say so clearly
        - Be concise but informative
        - Use medical terminology appropriately but explain complex terms

        Available Documents:
        {context}
        """

        completion = client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": query},
            ],
            temperature=0.2,
            max_tokens=800,
        )

        # The content may be None; the response model requires a string.
        answer = completion.choices[0].message.content or ""

        # Find relevant sources mentioned in the answer
        answer_lower = answer.lower()  # lowered once, not once per document
        sources = [
            {"filename": filename, "summary": summary, "category": category}
            for filename, _content, summary, category in all_docs
            if filename.lower() in answer_lower
        ]

        return {"answer": answer, "sources": sources}

    except Exception as e:
        logger.error(f"Error during search: {e}")
        raise HTTPException(status_code=500, detail="Search service is currently unavailable")
313
+
314
@app.get("/health")
def health_check():
    """Health check endpoint.

    Runs a trivial query against the ``documents`` table so that the reported
    database status reflects an actual round trip instead of a constant.

    Returns:
        ``{"status": "healthy", "database": "connected"}`` when the query
        succeeds.

    Raises:
        HTTPException: 503 when the database cannot be opened or queried.
    """
    try:
        conn = sqlite3.connect(DATABASE)
        try:
            conn.execute("SELECT 1 FROM documents LIMIT 1")
        finally:
            conn.close()
    except sqlite3.Error as e:
        logger.error(f"Health check failed: {e}")
        raise HTTPException(status_code=503, detail="Database is unavailable")
    return {"status": "healthy", "database": "connected"}
318
+
319
if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces, port 8000.
    import uvicorn

    uvicorn.run(app, port=8000, host="0.0.0.0")