Raykarr committed
Commit 3870ea1 · verified · 1 Parent(s): 79ddee4

Update app.py

Files changed (1)
  1. app.py +906 -882
app.py CHANGED
@@ -1,883 +1,907 @@
1
- # This file is a copy of backend-vercel/app.py
2
- # It's placed here so Vercel can serve both frontend and backend from the same repo
3
-
4
- import asyncio
5
- import hashlib
6
- import os
7
- import json
8
- from typing import List, Dict, Any, Optional
9
- from datetime import datetime
10
- from pathlib import Path
11
-
12
- import fitz
13
- from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
14
- from fastapi.responses import JSONResponse, FileResponse
15
- from fastapi.middleware.cors import CORSMiddleware
16
- from fastapi.staticfiles import StaticFiles
17
- from loguru import logger
18
- from pydantic import BaseModel
19
- from tiktoken import get_encoding
20
-
21
- # API-based services
22
- import requests
23
- from pinecone import Pinecone
24
- from supabase import create_client, Client
25
- from groq import Groq
26
-
27
- # Configure logger for production
28
- logger.remove()
29
- logger.add(lambda msg: print(msg, end=""), colorize=True,
30
- format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level}</level> | {message}",
31
- level="INFO")
32
-
33
- # Load environment variables
34
- try:
35
- from dotenv import load_dotenv
36
- from pathlib import Path
37
-
38
- # This ensures the .env file is loaded from the `backend` directory
39
- # regardless of where the script is run from.
40
- env_path = Path(__file__).parent / '.env'
41
- if env_path.is_file():
42
- load_dotenv(dotenv_path=env_path)
43
- logger.info(f"✅ Loaded environment variables from: {env_path}")
44
- else:
45
- logger.warning(f"⚠️ .env file not found at {env_path}. Relying on system environment variables.")
46
-
47
- except ImportError:
48
- logger.info("dotenv not installed, skipping .env file load.")
49
-
50
- # --- API Keys & Client Initialization ---
51
-
52
- GROQ_API_KEY = os.getenv("GROQ_API_KEY")
53
- PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
54
- SUPABASE_URL = os.getenv("SUPABASE_URL")
55
- SUPABASE_KEY = os.getenv("SUPABASE_KEY")
56
- HF_API_KEY = os.getenv("HF_API_KEY")
57
-
58
- # Pinecone
59
- pc: Optional[Pinecone] = None
60
- if PINECONE_API_KEY:
61
- try:
62
- pc = Pinecone(api_key=PINECONE_API_KEY)
63
- logger.info("✅ Pinecone client initialized.")
64
- except Exception as e:
65
- logger.error(f"❌ Failed to initialize Pinecone: {e}")
66
- else:
67
- logger.warning("⚠️ PINECONE_API_KEY not set. Vector search will be disabled.")
68
-
69
- # Supabase
70
- supabase_client: Optional[Client] = None
71
- if SUPABASE_URL and SUPABASE_KEY:
72
- try:
73
- supabase_client = create_client(SUPABASE_URL, SUPABASE_KEY)
74
- logger.info("✅ Supabase client initialized.")
75
- except Exception as e:
76
- logger.error(f"❌ Failed to initialize Supabase: {e}")
77
- else:
78
- logger.warning("⚠️ Supabase credentials not set. Database operations will be disabled.")
79
-
80
- # Local file storage for PDFs
81
- UPLOADS_DIR = Path(__file__).parent / "uploads"
82
- UPLOADS_DIR.mkdir(exist_ok=True)
83
-
84
-
85
- # --- Production-Ready Core Functions ---
86
-
87
- def get_llm_client() -> Optional[Groq]:
88
- """Initializes and returns a Groq client if the API key is available."""
89
- if not GROQ_API_KEY:
90
- logger.error("❌ GROQ_API_KEY not set. LLM analysis is disabled.")
91
- return None
92
- try:
93
- return Groq(api_key=GROQ_API_KEY)
94
- except Exception as e:
95
- logger.error(f"❌ Failed to create Groq client: {e}")
96
- return None
97
-
98
- async def get_embeddings_huggingface(texts: List[str]) -> List[List[float]]:
99
- """Get embeddings using Hugging Face Inference API with requests."""
100
- if not HF_API_KEY:
101
- logger.error("❌ HF_API_KEY not set. Cannot generate embeddings.")
102
- raise HTTPException(status_code=500, detail="Embedding service is not configured.")
103
-
104
- try:
105
- import requests
106
-
107
- headers = {
108
- "Authorization": f"Bearer {HF_API_KEY}",
109
- "Content-Type": "application/json"
110
- }
111
- model = "sentence-transformers/all-mpnet-base-v2"
112
-
113
- embeddings = []
114
- for text in texts:
115
- response = requests.post(
116
- f"https://api-inference.huggingface.co/models/{model}",
117
- headers=headers,
118
- json={"inputs": [text]},
119
- timeout=30
120
- )
121
- if response.status_code == 200:
122
- data = response.json()
123
- # Preferred response format: {"embedding": [...] }
124
- if isinstance(data, dict) and "embedding" in data:
125
- embeddings.append(data["embedding"])
126
- continue
127
- # Fallback: some models return list directly
128
- if isinstance(data, list):
129
- embeddings.append(data[0] if isinstance(data[0], list) else data)
130
- continue
131
- logger.warning(f"⚠️ Unexpected HF response format: {type(data)}")
132
- else:
133
- logger.debug(f"⚠️ HF API HTTP {response.status_code}: {response.text[:120]}")
134
- # Fallback embedding when HF call fails
135
- embeddings.append(_get_fallback_embedding(text))
136
-
137
- logger.info(f"✅ Generated {len(embeddings)} embeddings using HF API")
138
- return embeddings
139
-
140
- except Exception as e:
141
- logger.error(f"❌ Hugging Face API error during embedding generation: {e}")
142
- # Return fallback embeddings instead of raising exception
143
- return [_get_fallback_embedding(text) for text in texts]
144
-
145
- def _get_fallback_embedding(text: str) -> List[float]:
146
- """Generate fallback embedding using hash for 768 dimensions."""
147
- import hashlib
148
- hash_obj = hashlib.md5(text.encode())
149
- # all-mpnet-base-v2 has 768 dimensions
150
- return [float(x) / 255.0 for x in hash_obj.digest()] * 48 # 768 dimensions
151
-
152
- # --- PDF Processing and Chunking ---
153
-
154
- def _sync_extract_with_coordinates(pdf_bytes: bytes) -> List[Dict[str, Any]]:
155
- """Synchronous core logic for text and coordinate extraction."""
156
- text_blocks = []
157
- with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
158
- for page_num, page in enumerate(doc, 1):
159
- blocks = page.get_text("dict").get("blocks", [])
160
- for block in blocks:
161
- if "lines" in block:
162
- for line in block["lines"]:
163
- for span in line["spans"]:
164
- if span["text"].strip():
165
- text_blocks.append({
166
- "text": span["text"].strip(),
167
- "page_num": page_num,
168
- "coordinates": list(span["bbox"]),
169
- "block_id": f"p{page_num}b{len(text_blocks)}"
170
- })
171
- return text_blocks
172
-
173
- async def extract_text_with_coordinates(pdf_bytes: bytes) -> List[Dict[str, Any]]:
174
- """Extracts text blocks with page numbers and coordinates from a PDF."""
175
- loop = asyncio.get_event_loop()
176
- return await loop.run_in_executor(None, _sync_extract_with_coordinates, pdf_bytes)
177
-
178
- async def chunk_text_with_coordinates(text_blocks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
179
- """Creates semantic chunks from text blocks while preserving location info."""
180
- chunks = []
181
- current_chunk_text = ""
182
- current_chunk_blocks = []
183
-
184
- enc = get_encoding("cl100k_base")
185
- CHUNK_SIZE_TOKENS = 250
186
- MIN_CHUNK_SIZE_CHARS = 50
187
-
188
- for block in text_blocks:
189
- block_text = block["text"]
190
-
191
- if len(enc.encode(current_chunk_text + " " + block_text)) > CHUNK_SIZE_TOKENS:
192
- if len(current_chunk_text) >= MIN_CHUNK_SIZE_CHARS:
193
- first_block = current_chunk_blocks[0]
194
- chunks.append({
195
- "id": f"chunk_{len(chunks)}",
196
- "text": current_chunk_text.strip(),
197
- "page_num": first_block["page_num"],
198
- "coordinates": [b["coordinates"] for b in current_chunk_blocks],
199
- "token_count": len(enc.encode(current_chunk_text))
200
- })
201
- current_chunk_text = ""
202
- current_chunk_blocks = []
203
-
204
- current_chunk_text += " " + block_text
205
- current_chunk_blocks.append(block)
206
-
207
- if current_chunk_text and len(current_chunk_text) >= MIN_CHUNK_SIZE_CHARS:
208
- first_block = current_chunk_blocks[0]
209
- chunks.append({
210
- "id": f"chunk_{len(chunks)}",
211
- "text": current_chunk_text.strip(),
212
- "page_num": first_block["page_num"],
213
- "coordinates": [b["coordinates"] for b in current_chunk_blocks],
214
- "token_count": len(enc.encode(current_chunk_text))
215
- })
216
-
217
- logger.info(f"✅ Created {len(chunks)} chunks.")
218
- return chunks
219
-
220
-
221
- # --- Background Analysis Engine ---
222
-
223
- ANALYST_PROMPT = """
224
- You are an expert insurance policy analyst. Analyze the following text for potential policyholder concerns like exclusions, limitations, high costs, or complex duties.
225
-
226
- IMPORTANT: You must respond with ONLY a valid JSON object. Do not include any other text, explanations, or formatting. The JSON must have these exact fields:
227
-
228
- {
229
- "is_concern": true/false, // Must be a boolean
230
- "category": "EXCLUSION" | "LIMITATION" | "WAITING_PERIOD" | "DEDUCTIBLE" | "COPAYMENT" | "COINSURANCE" | "POLICYHOLDER_DUTY" | "RENEWAL_RESTRICTION" | "CLAIM_PROCESS" | "NETWORK_RESTRICTION",
231
- "severity": "HIGH" | "MEDIUM" | "LOW",
232
- "summary": "A one-sentence, easy-to-understand summary of the concern.",
233
- "recommendation": "A concise, actionable recommendation for the policyholder."
234
- }
235
-
236
- TEXT TO ANALYZE:
237
- {text_content}
238
- """
239
-
240
- async def analyze_chunk_for_concerns(llm: Groq, chunk: Dict[str, Any]) -> Optional[Dict[str, Any]]:
241
- """Analyzes a single text chunk for insurance concerns using the LLM."""
242
- if not llm: return None
243
-
244
- cache_key = f"analysis:{hashlib.sha1(chunk['text'].encode()).hexdigest()}"
245
- if supabase_client:
246
- try:
247
- response = supabase_client.table('cache').select('value').eq('key', cache_key).execute()
248
- if response.data:
249
- return json.loads(response.data[0]['value'])
250
- except Exception as e:
251
- logger.warning(f"⚠️ Cache lookup failed: {e}")
252
-
253
- try:
254
- # Provide a structured format for the model to follow
255
- prompt = f"""
256
- You are an expert insurance policy analyst. Analyze the following text for potential policyholder concerns.
257
- Please provide your analysis in the following format:
258
-
259
- Is Concern: [true/false]
260
- Category: [category]
261
- Severity: [severity]
262
- Summary: [one-sentence summary]
263
- Recommendation: [actionable recommendation]
264
-
265
- TEXT TO ANALYZE:
266
- {chunk['text']}
267
- """
268
-
269
- response = await asyncio.to_thread(
270
- llm.chat.completions.create,
271
- messages=[{"role": "user", "content": prompt}],
272
- model="llama-3.1-8b-instant",
273
- temperature=0.0,
274
- max_tokens=350,
275
- )
276
-
277
- result_text = response.choices[0].message.content
278
-
279
- # Parse the natural language response
280
- analysis_result = parse_llm_response(result_text)
281
-
282
- if analysis_result and analysis_result.get("is_concern"):
283
- if supabase_client:
284
- try:
285
- supabase_client.table('cache').upsert({
286
- 'key': cache_key,
287
- 'value': json.dumps(analysis_result)
288
- }).execute()
289
- except Exception as e:
290
- logger.warning(f"⚠️ Cache save failed: {e}")
291
- return analysis_result
292
-
293
- except Exception as e:
294
- logger.error(f"❌ LLM analysis error for chunk {chunk.get('id', '')}: {e}")
295
-
296
- return None
297
-
298
- def clean_llm_response(response: str) -> str:
299
- """More aggressively clean LLM response artifacts."""
300
- import re
301
-
302
- # Remove XML-style thinking tags and their entire content
303
- response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL | re.IGNORECASE)
304
-
305
- # Remove any other XML-like tags
306
- response = re.sub(r'<[^>]+>', '', response)
307
-
308
- # Remove lines that are just conversational filler or metadata
309
- lines = response.split('\n')
310
- cleaned_lines = []
311
- for line in lines:
312
- line_lower = line.strip().lower()
313
- if not any(phrase in line_lower for phrase in [
314
- "okay, so i need to analyze", "sure, i can help", "here is the analysis", "i have analyzed the text"
315
- ]):
316
- cleaned_lines.append(line)
317
-
318
- response = '\n'.join(cleaned_lines)
319
-
320
- # Standardize whitespace
321
- response = re.sub(r'\n\s*\n+', '\n', response.strip())
322
-
323
- return response
324
-
325
- def clean_chat_response(response: str) -> str:
326
- """Clean chat responses to remove reasoning and improve formatting."""
327
- import re
328
-
329
- # Remove thinking/reasoning sections
330
- response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL | re.IGNORECASE)
331
- response = re.sub(r'<reasoning>.*?</reasoning>', '', response, flags=re.DOTALL | re.IGNORECASE)
332
-
333
- # Remove lines that start with thinking indicators
334
- lines = response.split('\n')
335
- cleaned_lines = []
336
- for line in lines:
337
- line_lower = line.strip().lower()
338
- # Skip lines that are clearly reasoning/thinking
339
- if any(phrase in line_lower for phrase in [
340
- "let me think", "i need to", "first,", "next,", "i should", "i will",
341
- "okay,", "so,", "well,", "hmm,", "let me", "i'll", "i'm going to"
342
- ]):
343
- continue
344
- # Skip empty lines
345
- if not line.strip():
346
- continue
347
- cleaned_lines.append(line)
348
-
349
- # Join lines and clean up formatting
350
- response = '\n'.join(cleaned_lines)
351
-
352
- # Remove excessive whitespace
353
- response = re.sub(r'\n\s*\n+', '\n\n', response.strip())
354
-
355
- # If response is too short, return a simple message
356
- if len(response.strip()) < 10:
357
- return "I don't have enough information to answer that question based on the current finding."
358
-
359
- return response
360
-
361
- def parse_llm_response(response: str) -> Optional[Dict[str, Any]]:
362
- """Parse structured LLM response into a dictionary."""
363
- try:
364
- response = clean_llm_response(response)
365
-
366
- result = {
367
- "is_concern": False,
368
- "category": "UNCATEGORIZED",
369
- "severity": "UNKNOWN",
370
- "summary": "No concerns found",
371
- "recommendation": ""
372
- }
373
-
374
- # Regex to find key-value pairs, ignoring case and whitespace
375
- def get_value(key: str) -> Optional[str]:
376
- import re
377
- match = re.search(f"^{key}\\s*:\\s*(.*)", response, re.IGNORECASE | re.MULTILINE)
378
- if match:
379
- return match.group(1).strip().replace("[", "").replace("]", "")
380
- return None
381
-
382
- is_concern_str = get_value("Is Concern")
383
- if is_concern_str:
384
- result["is_concern"] = "true" in is_concern_str.lower()
385
-
386
- # If the model says it's not a concern, we can stop here.
387
- if not result["is_concern"]:
388
- return result
389
-
390
- category_str = get_value("Category")
391
- if category_str:
392
- categories = [
393
- "EXCLUSION", "LIMITATION", "WAITING_PERIOD", "DEDUCTIBLE",
394
- "COPAYMENT", "COINSURANCE", "POLICYHOLDER_DUTY",
395
- "RENEWAL_RESTRICTION", "CLAIM_PROCESS", "NETWORK_RESTRICTION"
396
- ]
397
- for cat in categories:
398
- if cat.replace("_", " ").lower() in category_str.lower():
399
- result["category"] = cat
400
- break
401
-
402
- severity_str = get_value("Severity")
403
- if severity_str:
404
- severity_lower = severity_str.lower()
405
- if "high" in severity_lower: result["severity"] = "HIGH"
406
- elif "medium" in severity_lower: result["severity"] = "MEDIUM"
407
- elif "low" in severity_lower: result["severity"] = "LOW"
408
-
409
- summary_str = get_value("Summary")
410
- if summary_str:
411
- result["summary"] = summary_str
412
-
413
- recommendation_str = get_value("Recommendation")
414
- if recommendation_str:
415
- result["recommendation"] = recommendation_str
416
-
417
- # A final check to ensure we have a meaningful summary if a concern was flagged.
418
- if result["is_concern"] and (not result["summary"] or result["summary"] == "No concerns found"):
419
- # Fallback to grabbing the first meaningful line of text that is not a key-value pair.
420
- lines = [line.strip() for line in response.split('\n') if line.strip() and ":" not in line]
421
- if lines:
422
- result["summary"] = lines[0]
423
-
424
- return result
425
-
426
- except Exception as e:
427
- logger.error(f"❌ Failed to parse LLM response: {e}")
428
- return None
429
-
430
- # --- Database Operations ---
431
- # REMINDER: Ensure your Supabase schema matches. The 'documents' table needs:
432
- # - id TEXT PRIMARY KEY
433
- # - filename TEXT
434
- # - total_pages INTEGER
435
- # - analysis_status TEXT
436
- # - analysis_completed_at TIMESTAMP WITH TIME ZONE
437
- # - upload_date TIMESTAMP WITH TIME ZONE DEFAULT NOW()
438
-
439
- async def save_document_metadata(doc_id: str, filename: str, page_count: int):
440
- if not supabase_client: return
441
- try:
442
- supabase_client.table('documents').insert({
443
- 'id': doc_id,
444
- 'filename': filename,
445
- 'total_pages': page_count,
446
- 'analysis_status': 'pending',
447
- }).execute()
448
- except Exception as e:
449
- logger.error(f"❌ DB Error saving document metadata for {doc_id}: {e}")
450
-
451
- async def save_finding(document_id: str, finding: Dict[str, Any], chunk: Dict[str, Any]):
452
- if not supabase_client: return
453
- try:
454
- # Calculate confidence score based on finding quality
455
- confidence_score = calculate_confidence_score(finding)
456
-
457
- supabase_client.table('findings').insert({
458
- 'document_id': document_id,
459
- 'page_num': chunk.get('page_num', 0),
460
- 'coordinates': json.dumps(chunk.get('coordinates', [])),
461
- 'text_content': chunk.get('text', ''),
462
- 'category': finding.get('category', 'UNCATEGORIZED'),
463
- 'severity': finding.get('severity', 'UNKNOWN'),
464
- 'summary': finding.get('summary', 'No summary provided.'),
465
- 'recommendation': finding.get('recommendation', ''),
466
- 'confidence_score': confidence_score,
467
- }).execute()
468
- except Exception as e:
469
- logger.error(f"❌ DB Error saving finding for doc {document_id}: {e}")
470
-
471
- def calculate_confidence_score(finding: Dict[str, Any]) -> float:
472
- """Calculate confidence score based on finding quality."""
473
- score = 0.5 # Base score
474
-
475
- # Adjust based on category
476
- if finding.get('category') != 'UNCATEGORIZED':
477
- score += 0.2
478
-
479
- # Adjust based on severity
480
- if finding.get('severity') in ['HIGH', 'MEDIUM', 'LOW']:
481
- score += 0.1
482
-
483
- # Adjust based on summary quality
484
- summary = finding.get('summary', '')
485
- if len(summary) > 20 and summary != 'No summary provided.':
486
- score += 0.1
487
-
488
- # Adjust based on recommendation quality
489
- recommendation = finding.get('recommendation', '')
490
- if len(recommendation) > 10:
491
- score += 0.1
492
-
493
- return min(1.0, max(0.0, score)) # Clamp between 0 and 1
494
-
495
- async def update_analysis_status(document_id: str, status: str):
496
- if not supabase_client: return
497
- try:
498
- update_data = {'analysis_status': status}
499
- if status == 'completed':
500
- update_data['analysis_completed_at'] = datetime.now().isoformat()
501
-
502
- supabase_client.table('documents').update(update_data).eq('id', document_id).execute()
503
- logger.info(f"✅ Analysis status for {document_id} updated to '{status}'.")
504
- except Exception as e:
505
- logger.error(f"❌ DB Error updating status for doc {document_id}: {e}")
506
-
507
- async def add_to_vectorstore(namespace: str, chunks: List[Dict[str, Any]]):
508
- if not pc: return
509
- try:
510
- texts = [chunk['text'] for chunk in chunks]
511
- embeddings = await get_embeddings_huggingface(texts)
512
-
513
- index = pc.Index("insurance-doc")
514
- # Ensure embedding dimension matches index (512)
515
- vectors = []
516
- for chunk, emb in zip(chunks, embeddings):
517
- if len(emb) != 512:
518
- emb = emb[:512] if len(emb) > 512 else (emb + [0.0]*(512-len(emb)))
519
- vectors.append({
520
- 'id': f"{namespace}_{chunk['id']}",
521
- 'values': emb,
522
- 'metadata': {'text': chunk['text'], 'namespace': namespace}
523
- })
524
-
525
- index.upsert(vectors=vectors)
526
- logger.info(f"✅ Added {len(vectors)} vectors to Pinecone.")
527
- except Exception as e:
528
- logger.error(f"❌ Failed to add to vector store: {e}")
529
-
530
- # --- Main Background Task ---
531
-
532
- async def analyze_document_background(document_id: str):
533
- """The main background task to process and analyze a document."""
534
- logger.info(f"🔄 Starting full analysis for document: {document_id}")
535
- await update_analysis_status(document_id, 'analyzing')
536
-
537
- if not supabase_client:
538
- await update_analysis_status(document_id, 'failed')
539
- return
540
-
541
- try:
542
- # Get cached data
543
- blocks_response = supabase_client.table('cache').select('value').eq('key', f"blocks:{document_id}").execute()
544
- if not blocks_response.data:
545
- logger.error(f"❌ Text blocks not found in cache for {document_id}.")
546
- await update_analysis_status(document_id, 'failed')
547
- return
548
-
549
- text_blocks = json.loads(blocks_response.data[0]['value'])
550
- chunks = await chunk_text_with_coordinates(text_blocks)
551
-
552
- # Add to vector store in parallel
553
- asyncio.create_task(add_to_vectorstore(document_id, chunks))
554
-
555
- llm = get_llm_client()
556
- if not llm:
557
- await update_analysis_status(document_id, 'failed')
558
- return
559
-
560
- # Analyze chunks
561
- analysis_tasks = [analyze_chunk_for_concerns(llm, chunk) for chunk in chunks]
562
- results = await asyncio.gather(*analysis_tasks)
563
-
564
- # Save valid findings
565
- findings_count = 0
566
- for i, finding in enumerate(results):
567
- if finding and finding.get('is_concern'):
568
- await save_finding(document_id, finding, chunks[i])
569
- findings_count += 1
570
-
571
- logger.info(f"✅ Analysis complete for {document_id}. Found {findings_count} concerns.")
572
- await update_analysis_status(document_id, 'completed')
573
-
574
- except Exception as e:
575
- logger.error(f"❌ Unhandled error in background analysis for {document_id}: {e}")
576
- await update_analysis_status(document_id, 'failed')
577
-
578
- # --- FastAPI App Setup ---
579
-
580
- app = FastAPI(title="Insurance Document Analysis API", version="3.4.0")
581
- app.add_middleware(
582
- CORSMiddleware,
583
- allow_origins=["*"], # Best to restrict in production
584
- allow_credentials=True,
585
- allow_methods=["*"],
586
- allow_headers=["*"],
587
- )
588
- # Static files mounting disabled for Vercel deployment
589
- # app.mount("/uploads", StaticFiles(directory="uploads"), name="uploads")
590
-
591
- # --- Pydantic Models ---
592
-
593
- class IngestResponse(BaseModel):
594
- document_id: str
595
- filename: str
596
- total_pages: int
597
- analysis_status: str
598
-
599
- class AnalysisStatus(BaseModel):
600
- document_id: str
601
- status: str
602
- findings_count: int
603
-
604
- class Finding(BaseModel):
605
- id: int
606
- category: str
607
- severity: str
608
- summary: str
609
- recommendation: Optional[str]
610
- page_num: int
611
- confidence_score: float
612
-
613
- # --- API Endpoints ---
614
-
615
- @app.get("/")
616
- async def root():
617
- return {"message": "Insurance Document Analysis API is running."}
618
-
619
- @app.post("/ingest", response_model=IngestResponse)
620
- async def ingest(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
621
- logger.info(f"📀 Ingest request received for file: {file.filename} ({file.size} bytes)")
622
- try:
623
- # Vercel serverless functions have 4.5MB request body limit
624
- MAX_FILE_SIZE = 4.4 * 1024 * 1024 # 4.4MB to be safe
625
-
626
- pdf_bytes = await file.read()
627
- if not pdf_bytes:
628
- raise HTTPException(400, "Empty file received.")
629
-
630
- # Check file size before processing
631
- if len(pdf_bytes) > MAX_FILE_SIZE:
632
- raise HTTPException(
633
- status_code=413,
634
- detail=f"File too large. Maximum size is {MAX_FILE_SIZE // (1024*1024)}MB. Your file is {len(pdf_bytes) // (1024*1024)}MB."
635
- )
636
-
637
- doc_id = hashlib.sha256(pdf_bytes).hexdigest()
638
-
639
- # CORRECTED: Allow re-analysis by deleting old data first.
640
- if supabase_client:
641
- existing = supabase_client.table('documents').select('id').eq('id', doc_id).execute()
642
- if existing.data:
643
- logger.warning(f"⚠️ Document {doc_id} already exists. Deleting old data to re-analyze.")
644
- # Delete old findings before starting new analysis
645
- supabase_client.table('findings').delete().eq('document_id', doc_id).execute()
646
- # We can keep the document entry and just update it
647
- supabase_client.table('documents').update({'analysis_status': 'pending'}).eq('id', doc_id).execute()
648
- else:
649
- # If it doesn't exist, save new metadata
650
- text_blocks_temp = await extract_text_with_coordinates(pdf_bytes)
651
- page_count_temp = max(b['page_num'] for b in text_blocks_temp) if text_blocks_temp else 0
652
- await save_document_metadata(doc_id, file.filename, page_count_temp)
653
-
654
-
655
- # Save PDF to local storage for serving
656
- pdf_path = UPLOADS_DIR / f"{doc_id}.pdf"
657
- with open(pdf_path, "wb") as f:
658
- f.write(pdf_bytes)
659
- logger.info(f"✅ PDF saved to: {pdf_path}")
660
-
661
- text_blocks = await extract_text_with_coordinates(pdf_bytes)
662
- page_count = max(b['page_num'] for b in text_blocks) if text_blocks else 0
663
-
664
- # Cache text blocks for the background worker
665
- if supabase_client:
666
- try:
667
- supabase_client.table('cache').upsert({
668
- 'key': f"blocks:{doc_id}",
669
- 'value': json.dumps(text_blocks)
670
- }).execute()
671
- except Exception as e:
672
- logger.warning(f"⚠️ Failed to cache text blocks for {doc_id}: {e}")
673
-
674
- background_tasks.add_task(analyze_document_background, doc_id)
675
-
676
- return IngestResponse(
677
- document_id=doc_id,
678
- filename=file.filename,
679
- total_pages=page_count,
680
- analysis_status="pending"
681
- )
682
- except Exception as e:
683
- logger.error(f"❌ Ingestion error: {e}")
684
- raise HTTPException(500, "An unexpected error occurred during file ingestion.")
685
-
686
- @app.get("/analysis/{document_id}", response_model=AnalysisStatus)
687
- async def get_analysis_status(document_id: str):
688
- if not supabase_client:
689
- raise HTTPException(503, "Database service is not available.")
690
- try:
691
- doc_response = supabase_client.table('documents').select('analysis_status').eq('id', document_id).execute()
692
- if not doc_response.data:
693
- raise HTTPException(404, "Document not found.")
694
-
695
- status = doc_response.data[0]['analysis_status']
696
-
697
- count_response = supabase_client.table('findings').select('id', count='exact').eq('document_id', document_id).execute()
698
- findings_count = count_response.count or 0
699
-
700
- return AnalysisStatus(
701
- document_id=document_id,
702
- status=status,
703
- findings_count=findings_count
704
- )
705
- except Exception as e:
706
- logger.error(f"❌ Failed to get analysis status for {document_id}: {e}")
707
- raise HTTPException(500, "Database error.")
708
-
709
- @app.get("/findings/{document_id}", response_model=List[Finding])
710
- async def get_findings(document_id: str):
711
- if not supabase_client:
712
- raise HTTPException(503, "Database service is not available.")
713
- try:
714
- response = supabase_client.table('findings').select('*').eq('document_id', document_id).order('severity').order('page_num').execute()
715
-
716
- # Deduplicate findings based on summary
717
- unique_findings = {}
718
- for row in response.data:
719
- summary = row['summary']
720
- if summary not in unique_findings:
721
- unique_findings[summary] = Finding(**row)
722
-
723
- return list(unique_findings.values())
724
- except Exception as e:
725
- logger.error(f"❌ Failed to get findings for {document_id}: {e}")
726
- return []
727
-
728
- @app.get("/documents/{document_id}/pdf")
729
- async def get_pdf(document_id: str):
730
- """Serve PDF file for document viewer."""
731
- logger.info(f"📄 PDF request for document: {document_id}")
732
-
733
- try:
734
- # Check if PDF file exists locally
735
- pdf_path = UPLOADS_DIR / f"{document_id}.pdf"
736
- if not pdf_path.exists():
737
- raise HTTPException(404, "PDF file not found.")
738
-
739
- # Get document metadata for filename
740
- filename = document_id
741
- if supabase_client:
742
- try:
743
- doc_response = supabase_client.table('documents').select('filename').eq('id', document_id).execute()
744
- if doc_response.data:
745
- filename = doc_response.data[0]['filename']
746
- except Exception as e:
747
- logger.warning(f"⚠️ Could not get filename from database: {e}")
748
-
749
- # Serve the PDF file for inline viewing
750
- return FileResponse(
751
- path=pdf_path,
752
- filename=filename,
753
- media_type="application/pdf",
754
- headers={"Content-Disposition": "inline"}
755
- )
756
-
757
- except HTTPException:
758
- raise
759
- except Exception as e:
760
- logger.error(f"❌ PDF serving error for {document_id}: {e}")
761
- raise HTTPException(500, "Failed to serve PDF.")
762
-
763
- @app.get("/progress/{document_id}")
764
- async def get_processing_progress(document_id: str):
765
- """Return simple progress information for the frontend polling UI."""
766
- if not supabase_client:
767
- return {"status": "error", "progress": 0, "message": "Database not configured"}
768
-
769
- try:
770
- resp = supabase_client.table('documents').select('analysis_status').eq('id', document_id).execute()
771
- if not resp.data:
772
- return {"status": "not_found", "progress": 0, "message": "Document not found"}
773
-
774
- status = resp.data[0]['analysis_status']
775
- percent = {
776
- 'pending': 10,
777
- 'analyzing': 60,
778
- 'completed': 100,
779
- 'failed': 0
780
- }.get(status, 0)
781
-
782
- message = {
783
- 'pending': 'Waiting for analysis to start',
784
- 'analyzing': 'AI is analyzing the document',
785
- 'completed': 'Analysis completed',
786
- 'failed': 'Analysis failed'
787
- }.get(status, 'Unknown status')
788
-
789
- return {
790
- 'status': status,
791
- 'progress': percent,
792
- 'message': message,
793
- 'timestamp': datetime.now().isoformat()
794
- }
795
- except Exception as e:
796
- logger.error(f"❌ Progress endpoint error: {e}")
797
- return {"status": "error", "progress": 0, "message": "Internal server error"}
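# Polling sketch for this endpoint (illustrative, not part of the commit;
# base_url and the 2-second interval are assumptions):
#
# import time, requests
# def wait_for_analysis(base_url: str, doc_id: str, interval: float = 2.0) -> dict:
#     while True:
#         info = requests.get(f"{base_url}/progress/{doc_id}").json()
#         if info["status"] in ("completed", "failed", "not_found", "error"):
#             return info
#         time.sleep(interval)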
798
-
799
- @app.get("/health")
800
- async def health_check():
801
- logger.info("🔍 Health check requested")
802
- return {
803
- "status": "healthy",
804
- "timestamp": datetime.now().isoformat(),
805
- "services": {
806
- "groq": GROQ_API_KEY is not None,
807
- "pinecone": pc is not None,
808
- "supabase": supabase_client is not None,
809
- "huggingface": HF_API_KEY is not None
810
- }
811
- }
812
-
813
- # --- Chat Endpoint ---
814
-
815
- @app.post("/findings/{finding_id}/chat")
816
- async def contextual_chat(finding_id: int, request: Dict[str, str]):
817
- """Contextual chat about specific finding"""
818
- llm = get_llm_client()
819
- if not llm:
820
- raise HTTPException(500, "Chat service not available")
821
-
822
- try:
823
- # Get finding details from database
824
- if not supabase_client:
825
- raise HTTPException(500, "Database not configured")
826
-
827
- resp = supabase_client.table('findings').select('*').eq('id', finding_id).execute()
828
- if not resp.data:
829
- raise HTTPException(404, "Finding not found")
830
-
831
- finding = resp.data[0]
832
-
833
- prompt = f"""
834
- You are an expert insurance policy analyst. Answer the user's question about this specific finding.
835
-
836
- IMPORTANT: Provide ONLY a direct, helpful answer.
837
- Do NOT include any reasoning, thinking process, or meta-commentary.
838
- Give a clear, concise response that directly addresses the user's question.
839
-
840
- Context:
841
- - Text Content: {finding['text_content']}
842
- - Finding: {finding['summary']}
843
- - Category: {finding['category']}
844
- - Severity: {finding['severity']}
845
- - Recommendation: {finding['recommendation']}
846
-
847
- Question: {request.get('q', '')}
848
-
849
- Answer the question directly and helpfully, using the context provided.
850
- """
851
-
852
- response = await asyncio.to_thread(
853
- llm.chat.completions.create,
854
- messages=[{"role": "user", "content": prompt}],
855
- model="llama-3.1-8b-instant",
856
- temperature=0.1,
857
- max_tokens=500,
858
- )
859
-
860
- # Clean the response to remove reasoning and improve formatting
861
- answer = response.choices[0].message.content
862
- answer = clean_chat_response(answer)
863
-
864
- return {
865
- "answer": answer,
866
- "finding_id": finding_id,
867
- "context": {
868
- "category": finding['category'],
869
- "summary": finding['summary'],
870
- "text_content": finding['text_content']
871
- }
872
- }
873
-
874
- except HTTPException:
875
- raise
876
- except Exception as e:
877
- logger.error(f"❌ Chat error for finding {finding_id}: {e}")
878
- raise HTTPException(500, f"Chat failed: {str(e)}")
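# Request sketch (illustrative; finding id 42 and base_url are hypothetical):
# the endpoint reads the user's question from the "q" key of the JSON body.
#
# import requests
# reply = requests.post(f"{base_url}/findings/42/chat",
#                       json={"q": "What does this exclusion mean for me?"})
# print(reply.json()["answer"])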
879
-
880
- # --- Hugging Face Spaces Entry Point ---
881
- if __name__ == "__main__":
882
- import uvicorn
883
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
+ # This file is a copy of backend-vercel/app.py
2
+ # It's placed here so Vercel can serve both frontend and backend from the same repo
3
+
4
+ import asyncio
5
+ import hashlib
6
+ import os
7
+ import json
8
+ from typing import List, Dict, Any, Optional
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+
12
+ import fitz
13
+ from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
14
+ from fastapi.responses import JSONResponse, FileResponse
15
+ from fastapi.middleware.cors import CORSMiddleware
16
+ from fastapi.staticfiles import StaticFiles
17
+ from loguru import logger
18
+ from pydantic import BaseModel
19
+ from tiktoken import get_encoding
20
+
21
+ # API-based services
22
+ import requests
23
+ from pinecone import Pinecone
24
+ from supabase import create_client, Client
25
+ from groq import Groq
26
+
27
+ # Configure logger for production
28
+ logger.remove()
29
+ logger.add(lambda msg: print(msg, end=""), colorize=True,
30
+ format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level}</level> | {message}",
31
+ level="INFO")
32
+
33
+ # Load environment variables
34
+ try:
35
+ from dotenv import load_dotenv
36
+ from pathlib import Path
37
+
38
+ # This ensures the .env file is loaded from the `backend` directory
39
+ # regardless of where the script is run from.
40
+ env_path = Path(__file__).parent / '.env'
41
+ if env_path.is_file():
42
+ load_dotenv(dotenv_path=env_path)
43
+ logger.info(f"✅ Loaded environment variables from: {env_path}")
44
+ else:
45
+ logger.warning(f"⚠️ .env file not found at {env_path}. Relying on system environment variables.")
46
+
47
+ except ImportError:
48
+ logger.info("dotenv not installed, skipping .env file load.")
49
+
50
+ # --- API Keys & Client Initialization ---
51
+
52
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
53
+ PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
54
+ SUPABASE_URL = os.getenv("SUPABASE_URL")
55
+ SUPABASE_KEY = os.getenv("SUPABASE_KEY")
56
+ HF_API_KEY = os.getenv("HF_API_KEY")
57
+
58
+ # Pinecone
59
+ pc: Optional[Pinecone] = None
60
+ if PINECONE_API_KEY:
61
+ try:
62
+ pc = Pinecone(api_key=PINECONE_API_KEY)
63
+ logger.info("✅ Pinecone client initialized.")
64
+ except Exception as e:
65
+ logger.error(f"❌ Failed to initialize Pinecone: {e}")
66
+ else:
67
+ logger.warning("⚠️ PINECONE_API_KEY not set. Vector search will be disabled.")
68
+
69
+ # Supabase
70
+ supabase_client: Optional[Client] = None
71
+ if SUPABASE_URL and SUPABASE_KEY:
72
+ try:
73
+ supabase_client = create_client(SUPABASE_URL, SUPABASE_KEY)
74
+ logger.info("✅ Supabase client initialized.")
75
+ except Exception as e:
76
+ logger.error(f"❌ Failed to initialize Supabase: {e}")
77
+ else:
78
+ logger.warning("⚠️ Supabase credentials not set. Database operations will be disabled.")
79
+
80
+ # Local file storage for PDFs (robust for restricted environments like HF Spaces)
81
+ # Prefer the env var if provided; else try a local folder; fall back to /tmp/uploads when that is not writable
82
+ def _resolve_uploads_dir() -> Path:
83
+ candidate = os.getenv("UPLOADS_DIR")
84
+ if candidate:
85
+ path = Path(candidate)
86
+ try:
87
+ path.mkdir(parents=True, exist_ok=True)
88
+ return path
89
+ except Exception as e:
90
+ logger.warning(f"⚠️ Could not create UPLOADS_DIR at {path}: {e}. Falling back to defaults.")
91
+
92
+ # Try relative to app directory
93
+ try:
94
+ local_path = Path(__file__).parent / "uploads"
95
+ local_path.mkdir(parents=True, exist_ok=True)
96
+ return local_path
97
+ except Exception as e:
98
+ logger.warning(f"⚠️ Cannot create local uploads dir at {local_path}: {e}. Using /tmp/uploads.")
99
+
100
+ # Final fallback: /tmp (generally writable on PaaS platforms)
101
+ tmp_path = Path("/tmp/uploads")
102
+ tmp_path.mkdir(parents=True, exist_ok=True)
103
+ return tmp_path
104
+
105
+ UPLOADS_DIR = _resolve_uploads_dir()
106
+ logger.info(f"📁 Using uploads directory: {UPLOADS_DIR}")
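# Usage sketch (illustrative): the resolver honors UPLOADS_DIR first, so a
# Space or container can redirect storage without code changes. /data/uploads
# below is an assumed writable mount, not something the app requires.
#
#   UPLOADS_DIR=/data/uploads uvicorn app:app --host 0.0.0.0 --port 7860
#
# With no env var set, files land in ./uploads next to app.py, or in
# /tmp/uploads when the app directory is read-only.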
107
+
108
+
109
+ # --- Production-Ready Core Functions ---
110
+
111
+ def get_llm_client() -> Optional[Groq]:
112
+ """Initializes and returns a Groq client if the API key is available."""
113
+ if not GROQ_API_KEY:
114
+ logger.error("❌ GROQ_API_KEY not set. LLM analysis is disabled.")
115
+ return None
116
+ try:
117
+ return Groq(api_key=GROQ_API_KEY)
118
+ except Exception as e:
119
+ logger.error(f"❌ Failed to create Groq client: {e}")
120
+ return None
121
+
122
+ async def get_embeddings_huggingface(texts: List[str]) -> List[List[float]]:
123
+ """Get embeddings using Hugging Face Inference API with requests."""
124
+ if not HF_API_KEY:
125
+ logger.error("❌ HF_API_KEY not set. Cannot generate embeddings.")
126
+ raise HTTPException(status_code=500, detail="Embedding service is not configured.")
127
+
128
+ try:
129
+ import requests
130
+
131
+ headers = {
132
+ "Authorization": f"Bearer {HF_API_KEY}",
133
+ "Content-Type": "application/json"
134
+ }
135
+ model = "sentence-transformers/all-mpnet-base-v2"
136
+
137
+ embeddings = []
138
+ for text in texts:
139
+ response = requests.post(
140
+ f"https://api-inference.huggingface.co/models/{model}",
141
+ headers=headers,
142
+ json={"inputs": [text]},
143
+ timeout=30
144
+ )
145
+ if response.status_code == 200:
146
+ data = response.json()
147
+ # Preferred response format: {"embedding": [...] }
148
+ if isinstance(data, dict) and "embedding" in data:
149
+ embeddings.append(data["embedding"])
150
+ continue
151
+ # Fallback: some models return list directly
152
+ if isinstance(data, list):
153
+ embeddings.append(data[0] if isinstance(data[0], list) else data)
154
+ continue
155
+ logger.warning(f"⚠️ Unexpected HF response format: {type(data)}")
156
+ else:
157
+ logger.debug(f"⚠️ HF API HTTP {response.status_code}: {response.text[:120]}")
158
+ # Fallback embedding when HF call fails
159
+ embeddings.append(_get_fallback_embedding(text))
160
+
161
+ logger.info(f"✅ Generated {len(embeddings)} embeddings using HF API")
162
+ return embeddings
163
+
164
+ except Exception as e:
165
+ logger.error(f"❌ Hugging Face API error during embedding generation: {e}")
166
+ # Return fallback embeddings instead of raising exception
167
+ return [_get_fallback_embedding(text) for text in texts]
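# Shape note and smoke test (a sketch, assuming HF_API_KEY is set and the
# Inference API is reachable): feature-extraction models usually return a
# plain list of floats per input, which the list branch above handles; the
# {"embedding": [...]} branch covers gateways that wrap the vector in an object.
#
# vecs = asyncio.run(get_embeddings_huggingface(["waiting period of 24 months"]))
# print(len(vecs), len(vecs[0]))  # expect 1 x 768 for all-mpnet-base-v2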
168
+
169
+ def _get_fallback_embedding(text: str) -> List[float]:
170
+ """Generate fallback embedding using hash for 768 dimensions."""
171
+ import hashlib
172
+ hash_obj = hashlib.md5(text.encode())
173
+ # all-mpnet-base-v2 has 768 dimensions
174
+ return [float(x) / 255.0 for x in hash_obj.digest()] * 48 # 768 dimensions
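# Sanity check for the fallback path (illustrative): an MD5 digest is 16
# bytes, and 16 * 48 = 768 values, matching all-mpnet-base-v2's dimension.
#
# assert len(_get_fallback_embedding("sample policy text")) == 768
#
# The values repeat every 16 entries, so this is a deterministic placeholder
# that keeps the pipeline alive, not a semantic embedding.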
175
+
176
+ # --- PDF Processing and Chunking ---
177
+
178
+ def _sync_extract_with_coordinates(pdf_bytes: bytes) -> List[Dict[str, Any]]:
179
+ """Synchronous core logic for text and coordinate extraction."""
180
+ text_blocks = []
181
+ with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
182
+ for page_num, page in enumerate(doc, 1):
183
+ blocks = page.get_text("dict").get("blocks", [])
184
+ for block in blocks:
185
+ if "lines" in block:
186
+ for line in block["lines"]:
187
+ for span in line["spans"]:
188
+ if span["text"].strip():
189
+ text_blocks.append({
190
+ "text": span["text"].strip(),
191
+ "page_num": page_num,
192
+ "coordinates": list(span["bbox"]),
193
+ "block_id": f"p{page_num}b{len(text_blocks)}"
194
+ })
195
+ return text_blocks
196
+
197
+ async def extract_text_with_coordinates(pdf_bytes: bytes) -> List[Dict[str, Any]]:
198
+ """Extracts text blocks with page numbers and coordinates from a PDF."""
199
+ loop = asyncio.get_event_loop()
200
+ return await loop.run_in_executor(None, _sync_extract_with_coordinates, pdf_bytes)
201
+
202
+ async def chunk_text_with_coordinates(text_blocks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
203
+ """Creates semantic chunks from text blocks while preserving location info."""
204
+ chunks = []
205
+ current_chunk_text = ""
206
+ current_chunk_blocks = []
207
+
208
+ enc = get_encoding("cl100k_base")
209
+ CHUNK_SIZE_TOKENS = 250
210
+ MIN_CHUNK_SIZE_CHARS = 50
211
+
212
+ for block in text_blocks:
213
+ block_text = block["text"]
214
+
215
+ if len(enc.encode(current_chunk_text + " " + block_text)) > CHUNK_SIZE_TOKENS:
216
+ if len(current_chunk_text) >= MIN_CHUNK_SIZE_CHARS:
217
+ first_block = current_chunk_blocks[0]
218
+ chunks.append({
219
+ "id": f"chunk_{len(chunks)}",
220
+ "text": current_chunk_text.strip(),
221
+ "page_num": first_block["page_num"],
222
+ "coordinates": [b["coordinates"] for b in current_chunk_blocks],
223
+ "token_count": len(enc.encode(current_chunk_text))
224
+ })
225
+ current_chunk_text = ""
226
+ current_chunk_blocks = []
227
+
228
+ current_chunk_text += " " + block_text
229
+ current_chunk_blocks.append(block)
230
+
231
+ if current_chunk_text and len(current_chunk_text) >= MIN_CHUNK_SIZE_CHARS:
232
+ first_block = current_chunk_blocks[0]
233
+ chunks.append({
234
+ "id": f"chunk_{len(chunks)}",
235
+ "text": current_chunk_text.strip(),
236
+ "page_num": first_block["page_num"],
237
+ "coordinates": [b["coordinates"] for b in current_chunk_blocks],
238
+ "token_count": len(enc.encode(current_chunk_text))
239
+ })
240
+
241
+ logger.info(f"✅ Created {len(chunks)} chunks.")
242
+ return chunks
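# Minimal usage sketch (the block below is hypothetical but matches the shape
# produced by _sync_extract_with_coordinates):
#
# blocks = [{"text": "Pre-existing conditions are excluded for 24 months.",
#            "page_num": 1, "coordinates": [72.0, 90.0, 520.0, 104.0],
#            "block_id": "p1b0"}]
# chunks = asyncio.run(chunk_text_with_coordinates(blocks))
# # -> one chunk with id "chunk_0", page_num 1, and a token_count from cl100k_base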
243
+
244
+
245
+ # --- Background Analysis Engine ---
246
+
247
+ ANALYST_PROMPT = """
248
+ You are an expert insurance policy analyst. Analyze the following text for potential policyholder concerns like exclusions, limitations, high costs, or complex duties.
249
+
250
+ IMPORTANT: You must respond with ONLY a valid JSON object. Do not include any other text, explanations, or formatting. The JSON must have these exact fields:
251
+
252
+ {
253
+ "is_concern": true/false, // Must be a boolean
254
+ "category": "EXCLUSION" | "LIMITATION" | "WAITING_PERIOD" | "DEDUCTIBLE" | "COPAYMENT" | "COINSURANCE" | "POLICYHOLDER_DUTY" | "RENEWAL_RESTRICTION" | "CLAIM_PROCESS" | "NETWORK_RESTRICTION",
255
+ "severity": "HIGH" | "MEDIUM" | "LOW",
256
+ "summary": "A one-sentence, easy-to-understand summary of the concern.",
257
+ "recommendation": "A concise, actionable recommendation for the policyholder."
258
+ }
259
+
260
+ TEXT TO ANALYZE:
261
+ {text_content}
262
+ """
263
+
264
+ async def analyze_chunk_for_concerns(llm: Groq, chunk: Dict[str, Any]) -> Optional[Dict[str, Any]]:
265
+ """Analyzes a single text chunk for insurance concerns using the LLM."""
266
+ if not llm: return None
267
+
268
+ cache_key = f"analysis:{hashlib.sha1(chunk['text'].encode()).hexdigest()}"
269
+ if supabase_client:
270
+ try:
271
+ response = supabase_client.table('cache').select('value').eq('key', cache_key).execute()
272
+ if response.data:
273
+ return json.loads(response.data[0]['value'])
274
+ except Exception as e:
275
+ logger.warning(f"⚠️ Cache lookup failed: {e}")
276
+
277
+ try:
278
+ # Provide a structured format for the model to follow
279
+ prompt = f"""
280
+ You are an expert insurance policy analyst. Analyze the following text for potential policyholder concerns.
281
+ Please provide your analysis in the following format:
282
+
283
+ Is Concern: [true/false]
284
+ Category: [category]
285
+ Severity: [severity]
286
+ Summary: [one-sentence summary]
287
+ Recommendation: [actionable recommendation]
288
+
289
+ TEXT TO ANALYZE:
290
+ {chunk['text']}
291
+ """
292
+
293
+ response = await asyncio.to_thread(
294
+ llm.chat.completions.create,
295
+ messages=[{"role": "user", "content": prompt}],
296
+ model="llama-3.1-8b-instant",
297
+ temperature=0.0,
298
+ max_tokens=350,
299
+ )
300
+
301
+ result_text = response.choices[0].message.content
302
+
303
+ # Parse the natural language response
304
+ analysis_result = parse_llm_response(result_text)
305
+
306
+ if analysis_result and analysis_result.get("is_concern"):
307
+ if supabase_client:
308
+ try:
309
+ supabase_client.table('cache').upsert({
310
+ 'key': cache_key,
311
+ 'value': json.dumps(analysis_result)
312
+ }).execute()
313
+ except Exception as e:
314
+ logger.warning(f"⚠️ Cache save failed: {e}")
315
+ return analysis_result
316
+
317
+ except Exception as e:
318
+ logger.error(f"❌ LLM analysis error for chunk {chunk.get('id', '')}: {e}")
319
+
320
+ return None
321
+
322
+ def clean_llm_response(response: str) -> str:
323
+ """More aggressively clean LLM response artifacts."""
324
+ import re
325
+
326
+ # Remove XML-style thinking tags and their entire content
327
+ response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL | re.IGNORECASE)
328
+
329
+ # Remove any other XML-like tags
330
+ response = re.sub(r'<[^>]+>', '', response)
331
+
332
+ # Remove lines that are just conversational filler or metadata
333
+ lines = response.split('\n')
334
+ cleaned_lines = []
335
+ for line in lines:
336
+ line_lower = line.strip().lower()
337
+ if not any(phrase in line_lower for phrase in [
338
+ "okay, so i need to analyze", "sure, i can help", "here is the analysis", "i have analyzed the text"
339
+ ]):
340
+ cleaned_lines.append(line)
341
+
342
+ response = '\n'.join(cleaned_lines)
343
+
344
+ # Standardize whitespace
345
+ response = re.sub(r'\n\s*\n+', '\n', response.strip())
346
+
347
+ return response
348
+
349
+ def clean_chat_response(response: str) -> str:
350
+ """Clean chat responses to remove reasoning and improve formatting."""
351
+ import re
352
+
353
+ # Remove thinking/reasoning sections
354
+ response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL | re.IGNORECASE)
355
+ response = re.sub(r'<reasoning>.*?</reasoning>', '', response, flags=re.DOTALL | re.IGNORECASE)
356
+
357
+ # Remove lines that start with thinking indicators
358
+ lines = response.split('\n')
359
+ cleaned_lines = []
360
+ for line in lines:
361
+ line_lower = line.strip().lower()
362
+ # Skip lines that are clearly reasoning/thinking
363
+ if any(phrase in line_lower for phrase in [
364
+ "let me think", "i need to", "first,", "next,", "i should", "i will",
365
+ "okay,", "so,", "well,", "hmm,", "let me", "i'll", "i'm going to"
366
+ ]):
367
+ continue
368
+ # Skip empty lines
369
+ if not line.strip():
370
+ continue
371
+ cleaned_lines.append(line)
372
+
373
+ # Join lines and clean up formatting
374
+ response = '\n'.join(cleaned_lines)
375
+
376
+ # Remove excessive whitespace
377
+ response = re.sub(r'\n\s*\n+', '\n\n', response.strip())
378
+
379
+ # If response is too short, return a simple message
380
+ if len(response.strip()) < 10:
381
+ return "I don't have enough information to answer that question based on the current finding."
382
+
383
+ return response
384
+
385
+ def parse_llm_response(response: str) -> Optional[Dict[str, Any]]:
386
+ """Parse structured LLM response into a dictionary."""
387
+ try:
388
+ response = clean_llm_response(response)
389
+
390
+ result = {
391
+ "is_concern": False,
392
+ "category": "UNCATEGORIZED",
393
+ "severity": "UNKNOWN",
394
+ "summary": "No concerns found",
395
+ "recommendation": ""
396
+ }
397
+
398
+ # Regex to find key-value pairs, ignoring case and whitespace
399
+ def get_value(key: str) -> Optional[str]:
400
+ import re
401
+ match = re.search(f"^{key}\\s*:\\s*(.*)", response, re.IGNORECASE | re.MULTILINE)
402
+ if match:
403
+ return match.group(1).strip().replace("[", "").replace("]", "")
404
+ return None
405
+
406
+ is_concern_str = get_value("Is Concern")
407
+ if is_concern_str:
408
+ result["is_concern"] = "true" in is_concern_str.lower()
409
+
410
+ # If the model says it's not a concern, we can stop here.
411
+ if not result["is_concern"]:
412
+ return result
413
+
414
+ category_str = get_value("Category")
415
+ if category_str:
416
+ categories = [
417
+ "EXCLUSION", "LIMITATION", "WAITING_PERIOD", "DEDUCTIBLE",
418
+ "COPAYMENT", "COINSURANCE", "POLICYHOLDER_DUTY",
419
+ "RENEWAL_RESTRICTION", "CLAIM_PROCESS", "NETWORK_RESTRICTION"
420
+ ]
421
+ for cat in categories:
422
+ if cat.replace("_", " ").lower() in category_str.lower():
423
+ result["category"] = cat
424
+ break
425
+
426
+ severity_str = get_value("Severity")
427
+ if severity_str:
428
+ severity_lower = severity_str.lower()
429
+ if "high" in severity_lower: result["severity"] = "HIGH"
430
+ elif "medium" in severity_lower: result["severity"] = "MEDIUM"
431
+ elif "low" in severity_lower: result["severity"] = "LOW"
432
+
433
+ summary_str = get_value("Summary")
434
+ if summary_str:
435
+ result["summary"] = summary_str
436
+
437
+ recommendation_str = get_value("Recommendation")
438
+ if recommendation_str:
439
+ result["recommendation"] = recommendation_str
440
+
441
+ # A final check to ensure we have a meaningful summary if a concern was flagged.
442
+ if result["is_concern"] and (not result["summary"] or result["summary"] == "No concerns found"):
443
+ # Fallback to grabbing the first meaningful line of text that is not a key-value pair.
444
+ lines = [line.strip() for line in response.split('\n') if line.strip() and ":" not in line]
445
+ if lines:
446
+ result["summary"] = lines[0]
447
+
448
+ return result
449
+
450
+ except Exception as e:
451
+ logger.error(f"❌ Failed to parse LLM response: {e}")
452
+ return None
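# Example of the expected input/output contract (sample text is invented):
#
# sample = """Is Concern: true
# Category: Waiting Period
# Severity: HIGH
# Summary: Maternity claims are only payable after a 4-year waiting period.
# Recommendation: Confirm the waiting period before relying on this cover."""
# parsed = parse_llm_response(sample)
# # parsed["category"] == "WAITING_PERIOD"; parsed["severity"] == "HIGH"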
453
+
454
+ # --- Database Operations ---
455
+ # REMINDER: Ensure your Supabase schema matches. The 'documents' table needs:
456
+ # - id TEXT PRIMARY KEY
457
+ # - filename TEXT
458
+ # - total_pages INTEGER
459
+ # - analysis_status TEXT
460
+ # - analysis_completed_at TIMESTAMP WITH TIME ZONE
461
+ # - upload_date TIMESTAMP WITH TIME ZONE DEFAULT NOW()
462
+
463
+ async def save_document_metadata(doc_id: str, filename: str, page_count: int):
464
+ if not supabase_client: return
465
+ try:
466
+ supabase_client.table('documents').insert({
467
+ 'id': doc_id,
468
+ 'filename': filename,
469
+ 'total_pages': page_count,
470
+ 'analysis_status': 'pending',
471
+ }).execute()
472
+ except Exception as e:
473
+ logger.error(f"❌ DB Error saving document metadata for {doc_id}: {e}")
474
+
475
+ async def save_finding(document_id: str, finding: Dict[str, Any], chunk: Dict[str, Any]):
476
+ if not supabase_client: return
477
+ try:
478
+ # Calculate confidence score based on finding quality
479
+ confidence_score = calculate_confidence_score(finding)
480
+
481
+ supabase_client.table('findings').insert({
482
+ 'document_id': document_id,
483
+ 'page_num': chunk.get('page_num', 0),
484
+ 'coordinates': json.dumps(chunk.get('coordinates', [])),
485
+ 'text_content': chunk.get('text', ''),
486
+ 'category': finding.get('category', 'UNCATEGORIZED'),
487
+ 'severity': finding.get('severity', 'UNKNOWN'),
488
+ 'summary': finding.get('summary', 'No summary provided.'),
489
+ 'recommendation': finding.get('recommendation', ''),
490
+ 'confidence_score': confidence_score,
491
+ }).execute()
492
+ except Exception as e:
493
+ logger.error(f"❌ DB Error saving finding for doc {document_id}: {e}")
494
+
495
+ def calculate_confidence_score(finding: Dict[str, Any]) -> float:
496
+ """Calculate confidence score based on finding quality."""
497
+ score = 0.5 # Base score
498
+
499
+ # Adjust based on category
500
+ if finding.get('category') != 'UNCATEGORIZED':
501
+ score += 0.2
502
+
503
+ # Adjust based on severity
504
+ if finding.get('severity') in ['HIGH', 'MEDIUM', 'LOW']:
505
+ score += 0.1
506
+
507
+ # Adjust based on summary quality
508
+ summary = finding.get('summary', '')
509
+ if len(summary) > 20 and summary != 'No summary provided.':
510
+ score += 0.1
511
+
512
+ # Adjust based on recommendation quality
513
+ recommendation = finding.get('recommendation', '')
514
+ if len(recommendation) > 10:
515
+ score += 0.1
516
+
517
+ return min(1.0, max(0.0, score)) # Clamp between 0 and 1
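# Worked example (illustrative): a categorized HIGH-severity finding with a
# substantive summary and recommendation scores 0.5 + 0.2 + 0.1 + 0.1 + 0.1,
# clamped to 1.0.
#
# calculate_confidence_score({
#     "category": "EXCLUSION",
#     "severity": "HIGH",
#     "summary": "Cosmetic procedures are excluded from all coverage.",
#     "recommendation": "Ask the insurer for a written list of exclusions.",
# })  # -> 1.0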
518
+
519
+ async def update_analysis_status(document_id: str, status: str):
520
+ if not supabase_client: return
521
+ try:
522
+ update_data = {'analysis_status': status}
523
+ if status == 'completed':
524
+ update_data['analysis_completed_at'] = datetime.now().isoformat()
525
+
526
+ supabase_client.table('documents').update(update_data).eq('id', document_id).execute()
527
+ logger.info(f"✅ Analysis status for {document_id} updated to '{status}'.")
528
+ except Exception as e:
529
+ logger.error(f"❌ DB Error updating status for doc {document_id}: {e}")
530
+
531
+ async def add_to_vectorstore(namespace: str, chunks: List[Dict[str, Any]]):
532
+ if not pc: return
533
+ try:
534
+ texts = [chunk['text'] for chunk in chunks]
535
+ embeddings = await get_embeddings_huggingface(texts)
536
+
537
+ index = pc.Index("insurance-doc")
538
+ # Ensure embedding dimension matches index (512)
539
+ vectors = []
540
+ for chunk, emb in zip(chunks, embeddings):
541
+ if len(emb) != 512:
542
+ emb = emb[:512] if len(emb) > 512 else (emb + [0.0]*(512-len(emb)))
543
+ vectors.append({
544
+ 'id': f"{namespace}_{chunk['id']}",
545
+ 'values': emb,
546
+ 'metadata': {'text': chunk['text'], 'namespace': namespace}
547
+ })
548
+
549
+ index.upsert(vectors=vectors)
550
+ logger.info(f"✅ Added {len(vectors)} vectors to Pinecone.")
551
+ except Exception as e:
552
+ logger.error(f"❌ Failed to add to vector store: {e}")
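# Query-side sketch (an assumption-laden illustration: same "insurance-doc"
# index, pc initialized, and query vectors given the same 512-dim
# pad/truncate treatment as the upserts above):
#
# def search_chunks(query: str, namespace: str, top_k: int = 5) -> list[str]:
#     emb = asyncio.run(get_embeddings_huggingface([query]))[0]
#     emb = emb[:512] if len(emb) > 512 else emb + [0.0] * (512 - len(emb))
#     res = pc.Index("insurance-doc").query(vector=emb, top_k=top_k,
#                                           include_metadata=True,
#                                           filter={"namespace": {"$eq": namespace}})
#     return [m["metadata"]["text"] for m in res["matches"]]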
553
+
554
+ # --- Main Background Task ---
555
+
556
+ async def analyze_document_background(document_id: str):
557
+ """The main background task to process and analyze a document."""
558
+ logger.info(f"🔄 Starting full analysis for document: {document_id}")
559
+ await update_analysis_status(document_id, 'analyzing')
560
+
561
+ if not supabase_client:
562
+ await update_analysis_status(document_id, 'failed')
563
+ return
564
+
565
+ try:
566
+ # Get cached data
567
+ blocks_response = supabase_client.table('cache').select('value').eq('key', f"blocks:{document_id}").execute()
568
+ if not blocks_response.data:
569
+ logger.error(f"❌ Text blocks not found in cache for {document_id}.")
570
+ await update_analysis_status(document_id, 'failed')
571
+ return
572
+
573
+ text_blocks = json.loads(blocks_response.data[0]['value'])
574
+ chunks = await chunk_text_with_coordinates(text_blocks)
575
+
576
+ # Add to vector store in parallel
577
+ asyncio.create_task(add_to_vectorstore(document_id, chunks))
578
+
579
+ llm = get_llm_client()
580
+ if not llm:
581
+ await update_analysis_status(document_id, 'failed')
582
+ return
583
+
584
+ # Analyze chunks
585
+ analysis_tasks = [analyze_chunk_for_concerns(llm, chunk) for chunk in chunks]
586
+ results = await asyncio.gather(*analysis_tasks)
587
+
588
+ # Save valid findings
589
+ findings_count = 0
590
+ for i, finding in enumerate(results):
591
+ if finding and finding.get('is_concern'):
592
+ await save_finding(document_id, finding, chunks[i])
593
+ findings_count += 1
594
+
595
+ logger.info(f"✅ Analysis complete for {document_id}. Found {findings_count} concerns.")
596
+ await update_analysis_status(document_id, 'completed')
597
+
598
+ except Exception as e:
599
+ logger.error(f"❌ Unhandled error in background analysis for {document_id}: {e}")
600
+ await update_analysis_status(document_id, 'failed')
601
+
602
+ # --- FastAPI App Setup ---
603
+
604
+ app = FastAPI(title="Insurance Document Analysis API", version="3.4.0")
605
+ app.add_middleware(
606
+ CORSMiddleware,
607
+ allow_origins=["*"], # Best to restrict in production
608
+ allow_credentials=True,
609
+ allow_methods=["*"],
610
+ allow_headers=["*"],
611
+ )
612
+ # Static files mounting disabled for Vercel deployment
613
+ # app.mount("/uploads", StaticFiles(directory="uploads"), name="uploads")
614
+
615
+ # --- Pydantic Models ---
+
+ class IngestResponse(BaseModel):
+     document_id: str
+     filename: str
+     total_pages: int
+     analysis_status: str
+
+ class AnalysisStatus(BaseModel):
+     document_id: str
+     status: str
+     findings_count: int
+
+ class Finding(BaseModel):
+     id: int
+     category: str
+     severity: str
+     summary: str
+     recommendation: Optional[str]
+     page_num: int
+     confidence_score: float
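+
+ # A serialized Finding as returned by /findings/{document_id} looks roughly
+ # like this (all values illustrative):
+ #
+ #     {"id": 1, "category": "Exclusion", "severity": "high",
+ #      "summary": "Pre-existing conditions excluded for 48 months",
+ #      "recommendation": "Verify the waiting period", "page_num": 3,
+ #      "confidence_score": 0.82}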
+
+ # --- API Endpoints ---
+
+ @app.get("/")
640
+ async def root():
641
+ return {"message": "Insurance Document Analysis API is running."}
642
+
643
+ @app.post("/ingest", response_model=IngestResponse)
644
+ async def ingest(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
645
+ logger.info(f"πŸ“€ Ingest request received for file: {file.filename} ({file.size} bytes)")
646
+ try:
647
+ # Vercel serverless functions have 4.5MB request body limit
648
+ MAX_FILE_SIZE = 4.4 * 1024 * 1024 # 4.4MB to be safe
649
+
650
+ pdf_bytes = await file.read()
651
+ if not pdf_bytes:
652
+ raise HTTPException(400, "Empty file received.")
653
+
654
+ # Check file size before processing
655
+ if len(pdf_bytes) > MAX_FILE_SIZE:
656
+ raise HTTPException(
657
+ status_code=413,
658
+ detail=f"File too large. Maximum size is {MAX_FILE_SIZE // (1024*1024)}MB. Your file is {len(pdf_bytes) // (1024*1024)}MB."
659
+ )
660
+
661
+ doc_id = hashlib.sha256(pdf_bytes).hexdigest()
662
+
663
+ # CORRECTED: Allow re-analysis by deleting old data first.
664
+ if supabase_client:
665
+ existing = supabase_client.table('documents').select('id').eq('id', doc_id).execute()
666
+ if existing.data:
667
+ logger.warning(f"⚠️ Document {doc_id} already exists. Deleting old data to re-analyze.")
668
+ # Delete old findings before starting new analysis
669
+ supabase_client.table('findings').delete().eq('document_id', doc_id).execute()
670
+ # We can keep the document entry and just update it
671
+ supabase_client.table('documents').update({'analysis_status': 'pending'}).eq('id', doc_id).execute()
672
+ else:
673
+ # If it doesn't exist, save new metadata
674
+ text_blocks_temp = await extract_text_with_coordinates(pdf_bytes)
675
+ page_count_temp = max(b['page_num'] for b in text_blocks_temp) if text_blocks_temp else 0
676
+ await save_document_metadata(doc_id, file.filename, page_count_temp)
677
+
678
+
679
+ # Save PDF to local storage for serving
680
+ pdf_path = UPLOADS_DIR / f"{doc_id}.pdf"
681
+ with open(pdf_path, "wb") as f:
682
+ f.write(pdf_bytes)
683
+ logger.info(f"βœ… PDF saved to: {pdf_path}")
684
+
685
+ text_blocks = await extract_text_with_coordinates(pdf_bytes)
686
+ page_count = max(b['page_num'] for b in text_blocks) if text_blocks else 0
687
+
688
+ # Cache text blocks for the background worker
689
+ if supabase_client:
690
+ try:
691
+ supabase_client.table('cache').upsert({
692
+ 'key': f"blocks:{doc_id}",
693
+ 'value': json.dumps(text_blocks)
694
+ }).execute()
695
+ except Exception as e:
696
+ logger.warning(f"⚠️ Failed to cache text blocks for {doc_id}: {e}")
697
+
698
+ background_tasks.add_task(analyze_document_background, doc_id)
699
+
700
+ return IngestResponse(
701
+ document_id=doc_id,
702
+ filename=file.filename,
703
+ total_pages=page_count,
704
+ analysis_status="pending"
705
+ )
706
+ except Exception as e:
707
+ logger.error(f"❌ Ingestion error: {e}")
708
+ raise HTTPException(500, "An unexpected error occurred during file ingestion.")
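+
+ # A typical upload from the command line (hypothetical file name; the port
+ # matches the uvicorn entry point at the bottom of this file):
+ #
+ #     curl -X POST -F "file=@policy.pdf" http://localhost:7860/ingest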
+
+ @app.get("/analysis/{document_id}", response_model=AnalysisStatus)
711
+ async def get_analysis_status(document_id: str):
712
+ if not supabase_client:
713
+ raise HTTPException(503, "Database service is not available.")
714
+ try:
715
+ doc_response = supabase_client.table('documents').select('analysis_status').eq('id', document_id).execute()
716
+ if not doc_response.data:
717
+ raise HTTPException(404, "Document not found.")
718
+
719
+ status = doc_response.data[0]['analysis_status']
720
+
721
+ count_response = supabase_client.table('findings').select('id', count='exact').eq('document_id', document_id).execute()
722
+ findings_count = count_response.count or 0
723
+
724
+ return AnalysisStatus(
725
+ document_id=document_id,
726
+ status=status,
727
+ findings_count=findings_count
728
+ )
729
+ except Exception as e:
730
+ logger.error(f"❌ Failed to get analysis status for {document_id}: {e}")
731
+ raise HTTPException(500, "Database error.")
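+
+ # Example response once analysis has finished (values illustrative):
+ #
+ #     {"document_id": "abc123", "status": "completed", "findings_count": 7}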
+
+ @app.get("/findings/{document_id}", response_model=List[Finding])
734
+ async def get_findings(document_id: str):
735
+ if not supabase_client:
736
+ raise HTTPException(503, "Database service is not available.")
737
+ try:
738
+ response = supabase_client.table('findings').select('*').eq('document_id', document_id).order('severity').order('page_num').execute()
739
+
740
+ # Deduplicate findings based on summary
741
+ unique_findings = {}
742
+ for row in response.data:
743
+ summary = row['summary']
744
+ if summary not in unique_findings:
745
+ unique_findings[summary] = Finding(**row)
746
+
747
+ return list(unique_findings.values())
748
+ except Exception as e:
749
+ logger.error(f"❌ Failed to get findings for {document_id}: {e}")
750
+ return []
+
+ @app.get("/documents/{document_id}/pdf")
753
+ async def get_pdf(document_id: str):
754
+ """Serve PDF file for document viewer."""
755
+ logger.info(f"πŸ“„ PDF request for document: {document_id}")
756
+
757
+ try:
758
+ # Check if PDF file exists locally
759
+ pdf_path = UPLOADS_DIR / f"{document_id}.pdf"
760
+ if not pdf_path.exists():
761
+ raise HTTPException(404, "PDF file not found.")
762
+
763
+ # Get document metadata for filename
764
+ filename = document_id
765
+ if supabase_client:
766
+ try:
767
+ doc_response = supabase_client.table('documents').select('filename').eq('id', document_id).execute()
768
+ if doc_response.data:
769
+ filename = doc_response.data[0]['filename']
770
+ except Exception as e:
771
+ logger.warning(f"⚠️ Could not get filename from database: {e}")
772
+
773
+ # Serve the PDF file for inline viewing
774
+ return FileResponse(
775
+ path=pdf_path,
776
+ filename=filename,
777
+ media_type="application/pdf",
778
+ headers={"Content-Disposition": "inline"}
779
+ )
780
+
781
+ except HTTPException:
782
+ raise
783
+ except Exception as e:
784
+ logger.error(f"❌ PDF serving error for {document_id}: {e}")
785
+ raise HTTPException(500, "Failed to serve PDF.")
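+
+ # Fetch sketch for the endpoint above (hypothetical document id):
+ #
+ #     import requests
+ #     pdf_bytes = requests.get("http://localhost:7860/documents/abc123/pdf").content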
+
+ @app.get("/progress/{document_id}")
788
+ async def get_processing_progress(document_id: str):
789
+ """Return simple progress information for the frontend polling UI."""
790
+ if not supabase_client:
791
+ return {"status": "error", "progress": 0, "message": "Database not configured"}
792
+
793
+ try:
794
+ resp = supabase_client.table('documents').select('analysis_status').eq('id', document_id).execute()
795
+ if not resp.data:
796
+ return {"status": "not_found", "progress": 0, "message": "Document not found"}
797
+
798
+ status = resp.data[0]['analysis_status']
799
+ percent = {
800
+ 'pending': 10,
801
+ 'analyzing': 60,
802
+ 'completed': 100,
803
+ 'failed': 0
804
+ }.get(status, 0)
805
+
806
+ message = {
807
+ 'pending': 'Waiting for analysis to start',
808
+ 'analyzing': 'AI is analyzing the document',
809
+ 'completed': 'Analysis completed',
810
+ 'failed': 'Analysis failed'
811
+ }.get(status, 'Unknown status')
812
+
813
+ return {
814
+ 'status': status,
815
+ 'progress': percent,
816
+ 'message': message,
817
+ 'timestamp': datetime.now().isoformat()
818
+ }
819
+ except Exception as e:
820
+ logger.error(f"❌ Progress endpoint error: {e}")
821
+ return {"status": "error", "progress": 0, "message": "Internal server error"}
+
+ @app.get("/health")
824
+ async def health_check():
825
+ logger.info("πŸ” Health check requested")
826
+ return {
827
+ "status": "healthy",
828
+ "timestamp": datetime.now().isoformat(),
829
+ "services": {
830
+ "groq": GROQ_API_KEY is not None,
831
+ "pinecone": pc is not None,
832
+ "supabase": supabase_client is not None,
833
+ "huggingface": HF_API_KEY is not None
834
+ }
835
+ }
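+
+ # Quick smoke test (assumes the server is running locally on port 7860):
+ #
+ #     curl http://localhost:7860/health
+ #     # -> {"status": "healthy", ..., "services": {"groq": true, ...}}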
+
+ # --- Chat Endpoint ---
+
+ @app.post("/findings/{finding_id}/chat")
840
+ async def contextual_chat(finding_id: int, request: Dict[str, str]):
841
+ """Contextual chat about specific finding"""
842
+ llm = get_llm_client()
843
+ if not llm:
844
+ raise HTTPException(500, "Chat service not available")
845
+
846
+ try:
847
+ # Get finding details from database
848
+ if not supabase_client:
849
+ raise HTTPException(500, "Database not configured")
850
+
851
+ resp = supabase_client.table('findings').select('*').eq('id', finding_id).execute()
852
+ if not resp.data:
853
+ raise HTTPException(404, "Finding not found")
854
+
855
+ finding = resp.data[0]
856
+
857
+ prompt = f"""
858
+ You are an expert insurance policy analyst. Answer the user's question about this specific finding.
859
+
860
+ IMPORTANT: Provide ONLY a direct, helpful answer.
861
+ Do NOT include any reasoning, thinking process, or meta-commentary.
862
+ Give a clear, concise response that directly addresses the user's question.
863
+
864
+ Context:
865
+ - Text Content: {finding['text_content']}
866
+ - Finding: {finding['summary']}
867
+ - Category: {finding['category']}
868
+ - Severity: {finding['severity']}
869
+ - Recommendation: {finding['recommendation']}
870
+
871
+ Question: {request.get('q', '')}
872
+
873
+ Answer the question directly and helpfully, using the context provided.
874
+ """
875
+
876
+ response = await asyncio.to_thread(
877
+ llm.chat.completions.create,
878
+ messages=[{"role": "user", "content": prompt}],
879
+ model="llama-3.1-8b-instant",
880
+ temperature=0.1,
881
+ max_tokens=500,
882
+ )
883
+
884
+ # Clean the response to remove reasoning and improve formatting
885
+ answer = response.choices[0].message.content
886
+ answer = clean_chat_response(answer)
887
+
888
+ return {
889
+ "answer": answer,
890
+ "finding_id": finding_id,
891
+ "context": {
892
+ "category": finding['category'],
893
+ "summary": finding['summary'],
894
+ "text_content": finding['text_content']
895
+ }
896
+ }
897
+
898
+ except HTTPException:
899
+ raise
900
+ except Exception as e:
901
+ logger.error(f"❌ Chat error for finding {finding_id}: {e}")
902
+ raise HTTPException(500, f"Chat failed: {str(e)}")
+
+ # --- Hugging Face Spaces Entry Point ---
+ if __name__ == "__main__":
+     import uvicorn
  uvicorn.run(app, host="0.0.0.0", port=7860)