broadfield-dev committed
Commit c6639ba · verified · 1 parent: 2ae0bde

Update app.py

Files changed (1)
  1. app.py +83 -576
app.py CHANGED
@@ -1,590 +1,97 @@
  import os
- import io
- import requests
- import logging
- import re
  import json
- import base64
- from flask import Flask, request, render_template, jsonify, send_file, Response
- from PyPDF2 import PdfReader, PdfWriter
- import pytesseract
- from pdf2image import convert_from_bytes
- from PIL import Image
- from datasets import Dataset, load_dataset
- from sentence_transformers import SentenceTransformer
- from datetime import datetime
- from numpy import dot
- from numpy.linalg import norm
- from huggingface_hub import HfApi, hf_hub_download
- from transformers import TrOCRProcessor, VisionEncoderDecoderModel
- import torch
- import chromadb
- from chromadb.utils import embedding_functions
- import shutil
-
- # Set up logging
- logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
- logger = logging.getLogger(__name__)
-
- # Set cache, uploads, and output directories
- os.environ["HF_HOME"] = "/app/cache"
- os.environ["TRANSFORMERS_CACHE"] = "/app/cache"
- os.environ["SENTENCE_TRANSFORMERS_HOME"] = "/app/cache"
- os.environ["XDG_CACHE_HOME"] = "/app/cache"
- UPLOADS_DIR = "/app/uploads"
- PAGES_DIR = os.path.join(UPLOADS_DIR, "pages")
- OUTPUT_DIR = "/app/output"
- COMBINED_PDF_PATH = os.path.join(OUTPUT_DIR, "combined_output.pdf")
- PROGRESS_JSON_PATH = os.path.join(OUTPUT_DIR, "progress_log.json")
- CHROMA_DB_PATH = os.path.join(OUTPUT_DIR, "chromadb")
- os.makedirs(PAGES_DIR, exist_ok=True)
- os.makedirs(OUTPUT_DIR, exist_ok=True)

  app = Flask(__name__)

- # Hugging Face Hub configuration
- HF_TOKEN = os.getenv("HF_TOKEN")
- HF_DATASET_REPO = "broadfield-dev/pdf-ocr-dataset"
- HF_API = HfApi()
-
- # Tracking file for resuming
- TRACKING_FILE = "/app/cache/processing_state.json"
-
- # Load sentence transformer
- try:
-     embedder = SentenceTransformer('all-MiniLM-L6-v2', cache_folder="/app/cache")
-     logger.info("SentenceTransformer loaded successfully")
- except Exception as e:
-     logger.error(f"Failed to load SentenceTransformer: {e}")

- # Initialize TrOCR (CPU-only)
  try:
-     trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
-     trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
-     trocr_model.to("cpu").eval()
-     logger.info("TrOCR initialized successfully on CPU")
- except Exception as e:
-     logger.error(f"Failed to initialize TrOCR: {e}")
-     trocr_model = None
-     trocr_processor = None
-
- # Initialize ChromaDB
- chroma_client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
- sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
- chroma_collection = chroma_client.get_or_create_collection(name="pdf_pages", embedding_function=sentence_transformer_ef)
-
- # Load or initialize progress log
- def load_progress_log(storage_mode):
-     if storage_mode == "hf":
-         try:
-             progress_file = hf_hub_download(repo_id=HF_DATASET_REPO, filename="progress_log.json", repo_type="dataset", token=HF_TOKEN)
-             with open(progress_file, "r") as f:
-                 return json.load(f)
-         except Exception as e:
-             logger.info(f"No HF progress log found or error loading: {e}, initializing new log")
-             return {"urls": {}}
-     else: # local
-         if os.path.exists(PROGRESS_JSON_PATH):
-             with open(PROGRESS_JSON_PATH, "r") as f:
-                 return json.load(f)
-         return {"urls": {}}
-
- def save_progress_log(progress_log, storage_mode):
-     if storage_mode == "hf":
-         with open("/app/cache/progress_log.json", "w") as f:
-             json.dump(progress_log, f)
-         HF_API.upload_file(
-             path_or_fileobj="/app/cache/progress_log.json",
-             path_in_repo="progress_log.json",
-             repo_id=HF_DATASET_REPO,
-             repo_type="dataset",
-             token=HF_TOKEN
-         )
-         logger.info("Progress log updated in Hugging Face dataset")
-     else: # local
-         with open(PROGRESS_JSON_PATH, "w") as f:
-             json.dump(progress_log, f)
-         logger.info("Progress log updated locally")
-
- # Tesseract OCR with bounding boxes
- def ocr_with_tesseract(pdf_bytes, page_num):
-     try:
-         images = convert_from_bytes(pdf_bytes, first_page=page_num+1, last_page=page_num+1)
-         if not images:
-             logger.info(f"Page {page_num + 1} is blank")
-             return {"page_num": page_num + 1, "text": "Blank page", "words": []}
-         image = images[0]
-         data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
-         text = pytesseract.image_to_string(image)
-         words = [
-             {"text": data["text"][i], "left": data["left"][i], "top": data["top"][i], "width": data["width"][i], "height": data["height"][i]}
-             for i in range(len(data["text"])) if data["text"][i].strip()
-         ]
-         logger.info(f"Tesseract processed page {page_num + 1} with {len(words)} words")
-         return {"page_num": page_num + 1, "text": text, "words": words}
-     except Exception as e:
-         logger.error(f"Tesseract error on page {page_num + 1}: {e}")
-         return {"page_num": page_num + 1, "text": f"Tesseract Error: {str(e)}", "words": []}
-
- # TrOCR OCR
- def ocr_with_trocr(pdf_bytes, page_num):
-     if not trocr_model or not trocr_processor:
-         logger.warning(f"TrOCR not available for page {page_num + 1}")
-         return {"page_num": page_num + 1, "text": "TrOCR not initialized", "words": []}
-     try:
-         images = convert_from_bytes(pdf_bytes, first_page=page_num+1, last_page=page_num+1)
-         if not images:
-             logger.info(f"Page {page_num + 1} is blank")
-             return {"page_num": page_num + 1, "text": "Blank page", "words": []}
-         image = images[0].convert("RGB")
-         pixel_values = trocr_processor(image, return_tensors="pt").pixel_values.to("cpu")
-         with torch.no_grad():
-             generated_ids = trocr_model.generate(pixel_values, max_length=50)
-         text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-         words = [{"text": word, "left": 0, "top": 0, "width": 0, "height": 0} for word in text.split()]
-         logger.info(f"TrOCR processed page {page_num + 1} with text: {text}")
-         return {"page_num": page_num + 1, "text": text, "words": words}
-     except Exception as e:
-         logger.error(f"TrOCR error on page {page_num + 1}: {e}")
-         return {"page_num": page_num + 1, "text": f"TrOCR Error: {str(e)}", "words": []}
-
- # Map Tesseract bounding boxes to OCR text
- def map_tesseract_to_ocr(tesseract_result, ocr_result):
-     if not tesseract_result["words"] or "Error" in ocr_result["text"]:
-         logger.info(f"Mapping skipped for page {tesseract_result['page_num']}: No Tesseract words or OCR error")
-         return {**ocr_result, "words": tesseract_result["words"]}
-
-     ocr_text = ocr_result["text"]
-     tesseract_words = tesseract_result["words"]
-
-     sentences = re.split(r'(?<=[.!?])\s+', ocr_text.strip())
-     sentence_embeddings = embedder.encode(sentences)
-
-     mapped_words = []
-     for word in tesseract_words:
-         word_embedding = embedder.encode(word["text"])
-         similarities = [
-             dot(word_embedding, sent_emb) / (norm(word_embedding) * norm(sent_emb)) if norm(sent_emb) != 0 else 0
-             for sent_emb in sentence_embeddings
-         ]
-         best_match_idx = similarities.index(max(similarities))
-         best_sentence = sentences[best_match_idx]
-         if word["text"].lower() in best_sentence.lower():
-             mapped_words.append(word)
-         else:
-             mapped_words.append(word)
-     logger.info(f"Mapped {len(mapped_words)} words for page {tesseract_result['page_num']}")
-     return {**ocr_result, "words": mapped_words}
-
- # Update combined PDF
- def update_combined_pdf(pdf_bytes, page_num):
-     pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
-     page = pdf_reader.pages[page_num]
-     writer = PdfWriter()
-     if os.path.exists(COMBINED_PDF_PATH):
-         existing_pdf = PdfReader(COMBINED_PDF_PATH)
-         for p in existing_pdf.pages:
-             writer.add_page(p)
-     writer.add_page(page)
-     with open(COMBINED_PDF_PATH, "wb") as f:
-         writer.write(f)
-     logger.info(f"Updated combined PDF with page {page_num + 1}")
-
- # Process page
- def process_page(pdf_bytes, page_num, ocr_backend, filename, tracking_state, storage_mode):
-     tesseract_result = ocr_with_tesseract(pdf_bytes, page_num)
-     ocr_result = ocr_with_trocr(pdf_bytes, page_num) if ocr_backend == "trocr" else ocr_with_tesseract(pdf_bytes, page_num)
-     combined_result = map_tesseract_to_ocr(tesseract_result, ocr_result)
-
-     local_page_path = os.path.join(PAGES_DIR, f"{filename}_page_{combined_result['page_num']}_{datetime.now().strftime('%Y%m%d%H%M%S')}.pdf")
-     writer = PdfWriter()
-     pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
-     writer.add_page(pdf_reader.pages[page_num])
-     with open(local_page_path, "wb") as f:
-         writer.write(f)
-
-     if storage_mode == "hf":
-         remote_page_path = f"pages/{os.path.basename(local_page_path)}"
-         HF_API.upload_file(
-             path_or_fileobj=local_page_path,
-             path_in_repo=remote_page_path,
-             repo_id=HF_DATASET_REPO,
-             repo_type="dataset",
-             token=HF_TOKEN
-         )
-         logger.info(f"Uploaded page to {HF_DATASET_REPO}/{remote_page_path}")
-         combined_result["page_file"] = remote_page_path
-     else: # local
-         update_combined_pdf(pdf_bytes, page_num)
-         combined_result["page_file"] = local_page_path
-
-     combined_result["pdf_page"] = tracking_state["last_offset"] + page_num
-
-     # Update ChromaDB
-     chroma_collection.add(
-         documents=[combined_result["text"]],
-         metadatas=[{"filename": filename, "page_num": combined_result["page_num"], "page_file": combined_result["page_file"], "words": json.dumps(combined_result["words"])}],
-         ids=[f"{filename}_page_{combined_result['page_num']}"]
-     )
-     logger.info(f"Added page {combined_result['page_num']} to ChromaDB")
-
-     return combined_result
-
- # Extract PDF URLs from text
- def extract_pdf_urls(text):
-     url_pattern = r'(https?://[^\s]+?\.pdf)'
-     return re.findall(url_pattern, text)
-
- # Load or initialize tracking state
- def load_tracking_state():
-     if os.path.exists(TRACKING_FILE):
-         with open(TRACKING_FILE, "r") as f:
-             return json.load(f)
-     return {"processed_urls": {}, "last_offset": 0}
-
- def save_tracking_state(state):
-     with open(TRACKING_FILE, "w") as f:
-         json.dump(state, f)
-
- # Push to Hugging Face Dataset
- def push_to_hf_dataset(new_data):
      try:
-         for item in new_data:
-             if "url" not in item or not isinstance(item["url"], str):
-                 logger.error(f"Invalid item in new_data: {item}")
-                 raise ValueError(f"Each item must have a valid 'url' key; found {item}")
-
-         try:
-             dataset = load_dataset(HF_DATASET_REPO, token=HF_TOKEN, cache_dir="/app/cache")
-             existing_data = dataset["train"].to_dict()
-             logger.info(f"Loaded existing dataset with keys: {list(existing_data.keys())}")
-         except Exception as e:
-             logger.info(f"No existing dataset found or error loading: {e}, initializing new dataset")
-             existing_data = {"filename": [], "pages": [], "url": [], "embedding": [], "processed_at": [], "pdf_page_offset": []}
-
-         required_keys = ["filename", "pages", "url", "embedding", "processed_at", "pdf_page_offset"]
-         for key in required_keys:
-             if key not in existing_data:
-                 existing_data[key] = []
-                 logger.warning(f"Initialized missing key '{key}' in existing_data")
-
-         existing_urls = set(existing_data["url"])
-         for item in new_data:
-             logger.debug(f"Processing item: {item}")
-             if item["url"] not in existing_urls:
-                 for key in required_keys:
-                     existing_data[key].append(item.get(key, None))
-                 existing_urls.add(item["url"])
-                 logger.info(f"Added new URL: {item['url']}")
-             else:
-                 idx = existing_data["url"].index(item["url"])
-                 existing_data["pages"][idx].extend(item["pages"])
-                 existing_data["embedding"][idx] = item["embedding"]
-                 existing_data["processed_at"][idx] = item["processed_at"]
-                 logger.info(f"Updated existing URL: {item['url']}")
-
-         updated_dataset = Dataset.from_dict(existing_data)
-         updated_dataset.push_to_hub(HF_DATASET_REPO, token=HF_TOKEN)
-         logger.info(f"Successfully appended/updated {len(new_data)} records to {HF_DATASET_REPO}")
      except Exception as e:
-         logger.error(f"Failed to push to HF Dataset: {str(e)}")
-         raise
-
- # Check if URL is fully processed
- def is_url_fully_processed(url, progress_log, total_pages):
-     return url in progress_log["urls"] and progress_log["urls"][url]["status"] == "completed" and progress_log["urls"][url]["processed_pages"] >= total_pages

- # Process PDF URL with SSE
- def process_pdf_url(url, ocr_backend, tracking_state, progress_log, storage_mode):
-     filename = url.split("/")[-1]
      try:
-         yield f"data: {json.dumps({'status': 'fetching', 'filename': filename})}\n\n"
-         logger.info(f"Fetching PDF from {url}")
-         response = requests.get(url, timeout=10)
-         response.raise_for_status()
-         pdf_bytes = response.content
-         pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
-         total_pages = len(pdf_reader.pages)
-
-         progress_log["urls"].setdefault(url, {"status": "pending", "processed_pages": 0})
-         start_page = progress_log["urls"][url]["processed_pages"]
-
-         if is_url_fully_processed(url, progress_log, total_pages):
-             yield f"data: {json.dumps({'status': 'skipped', 'filename': filename, 'message': 'URL already fully processed'})}\n\n"
-             return
-
-         pages = []
-         for page_num in range(start_page, total_pages):
-             yield f"data: {json.dumps({'status': 'processing', 'filename': filename, 'page_num': page_num + 1, 'total_pages': total_pages})}\n\n"
-             page = process_page(pdf_bytes, page_num, ocr_backend, filename, tracking_state, storage_mode)
-             pages.append(page)
-             yield f"data: {json.dumps({'filename': filename, 'page': page})}\n\n"
-             progress_log["urls"][url]["processed_pages"] = page_num + 1
-             save_progress_log(progress_log, storage_mode)
-
-         full_text = "\n\n".join(f"Page {page['page_num']}\n{page['text']}" for page in pages)
-         embedding = embedder.encode(full_text).tolist() if full_text.strip() else None
-         result = {
-             "filename": filename,
-             "pages": pages,
-             "url": url,
-             "embedding": embedding,
-             "processed_at": datetime.now().isoformat(),
-             "pdf_page_offset": tracking_state["last_offset"]
-         }
-         if storage_mode == "hf":
-             push_to_hf_dataset([result])
-         tracking_state["last_offset"] += total_pages - start_page
-         progress_log["urls"][url]["status"] = "completed"
-         save_tracking_state(tracking_state)
-         save_progress_log(progress_log, storage_mode)
-         yield f"data: {json.dumps({'status': 'completed', 'filename': filename, 'new_offset': tracking_state['last_offset']})}\n\n"
-         logger.info(f"Completed processing {filename} with new offset {tracking_state['last_offset']}")
-     except requests.RequestException as e:
-         logger.error(f"Failed to fetch PDF from {url}: {e}")
-         yield f"data: {json.dumps({'status': 'error', 'filename': filename, 'message': f'Error fetching PDF: {str(e)}'})}\n\n"
-     except Exception as e:
-         logger.error(f"Error processing {url}: {e}")
-         yield f"data: {json.dumps({'status': 'error', 'filename': filename, 'message': f'Error: {str(e)}'})}\n\n"
-
- # Process text content with SSE
- def process_text_content(text, filename, ocr_backend, tracking_state, progress_log, storage_mode):
-     try:
-         pdf_urls = extract_pdf_urls(text)
-         processed_urls = [url for url in pdf_urls if url in progress_log["urls"] and progress_log["urls"][url]["status"] == "completed"]
-         new_urls = [url for url in pdf_urls if url not in progress_log["urls"] or progress_log["urls"][url]["status"] != "completed"]
-
-         initial_text = (f"Found {len(pdf_urls)} PDF URLs:\n" +
-                         f"Already processed: {len(processed_urls)}\n" + "\n".join(processed_urls) + "\n" +
-                         f"To process: {len(new_urls)}\n" + "\n".join(new_urls) + "\n\nProcessing...")
-         yield f"data: {json.dumps({'status': 'info', 'filename': filename, 'message': initial_text})}\n\n"
-
-         for url in new_urls:
-             logger.info(f"Starting processing of {url} with offset {tracking_state['last_offset']}")
-             for event in process_pdf_url(url, ocr_backend, tracking_state, progress_log, storage_mode):
-                 yield event
-     except Exception as e:
-         logger.error(f"Error processing text content for {filename}: {e}")
-         yield f"data: {json.dumps({'status': 'error', 'filename': filename, 'message': f'Error: {str(e)}'})}\n\n"
-
- # Home route
- @app.route("/", methods=["GET"])
- def index():
-     return render_template("index.html")
-
- # Process URL endpoint with GET
- @app.route("/process_url", methods=["GET"])
- def process_url():
-     url = request.args.get("url")
-     ocr_backend = request.args.get("ocr_backend", "trocr")
-     storage_mode = request.args.get("storage_mode", "hf")
-
-     if not url:
-         return jsonify({"error": "No URL provided"}), 400
-
-     tracking_state = load_tracking_state()
-     progress_log = load_progress_log(storage_mode)
-
-     def generate():
-         logger.info(f"Processing URL: {url} with ocr_backend={ocr_backend}, storage_mode={storage_mode}, starting offset={tracking_state['last_offset']}")
-         if url.endswith(".pdf"):
-             for event in process_pdf_url(url, ocr_backend, tracking_state, progress_log, storage_mode):
-                 yield event
-         elif url.endswith(".txt"):
-             try:
-                 response = requests.get(url, timeout=10)
-                 response.raise_for_status()
-                 text = response.text
-                 filename = url.split("/")[-1]
-                 logger.info(f"Fetched text from {url}")
-                 for event in process_text_content(text, filename, ocr_backend, tracking_state, progress_log, storage_mode):
-                     yield event
-             except requests.RequestException as e:
-                 logger.error(f"Failed to fetch text from {url}: {e}")
-                 yield f"data: {json.dumps({'status': 'error', 'filename': url, 'message': f'Error fetching URL: {str(e)}'})}\n\n"
          else:
-             yield f"data: {json.dumps({'status': 'error', 'filename': url, 'message': 'Unsupported URL format. Must end in .pdf or .txt'})}\n\n"
-         logger.info(f"Finished processing URL: {url}")
-
-     return Response(generate(), mimetype="text/event-stream")
-
- # Search page
- @app.route("/search", methods=["GET"])
- def search_page():
-     storage_mode = request.args.get("storage_mode", "hf")
-     if storage_mode == "hf":
-         try:
-             dataset = load_dataset(HF_DATASET_REPO, token=HF_TOKEN, cache_dir="/app/cache")["train"]
-             files = [{"filename": f, "url": u, "pages": p} for f, u, p in zip(dataset["filename"], dataset["url"], dataset["pages"])]
-             return render_template("search.html", files=files, storage_mode=storage_mode)
-         except Exception as e:
-             logger.error(f"Error loading search page: {e}")
-             return render_template("search.html", files=[], error=str(e), storage_mode=storage_mode)
-     else: # local
-         files = []
-         results = chroma_collection.get()
-         for i, metadata in enumerate(results["metadatas"]):
-             files.append({
-                 "filename": metadata["filename"],
-                 "url": "",
-                 "pages": [{"page_num": metadata["page_num"], "text": results["documents"][i], "page_file": metadata["page_file"], "words": json.loads(metadata["words"])}]
-             })
-         return render_template("search.html", files=files, storage_mode=storage_mode)
-
- # Semantic search route
- @app.route("/search_documents", methods=["POST"])
- def search_documents():
-     query = request.form.get("query")
-     storage_mode = request.form.get("storage_mode", "hf")
-     if not query:
-         return jsonify({"error": "No query provided"}), 400
-
-     if storage_mode == "hf":
-         try:
-             dataset = load_dataset(HF_DATASET_REPO, token=HF_TOKEN, cache_dir="/app/cache")["train"]
-             query_embedding = embedder.encode(query).tolist()
-
-             embeddings = [e for e in dataset["embedding"] if e is not None]
-             documents = dataset["pages"]
-             filenames = dataset["filename"]
-             urls = dataset["url"]
-             processed_ats = dataset["processed_at"]
-             pdf_page_offsets = dataset["pdf_page_offset"]
-
-             similarities = [
-                 dot(query_embedding, emb) / (norm(query_embedding) * norm(emb)) if norm(emb) != 0 else 0
-                 for emb in embeddings
-             ]
-
-             sorted_indices = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)[:5]
-             results = []
-
-             for idx, i in enumerate(sorted_indices):
-                 pages = documents[i]
-                 highlighted_pages = []
-                 for page in pages:
-                     words = page["words"]
-                     text = page["text"]
-                     pdf_page_num = page["pdf_page"]
-                     page_file = page["page_file"]
-                     page_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{page_file}"
-                     response = requests.get(page_url)
-                     response.raise_for_status()
-                     pdf_bytes = response.content
-                     pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')
-
-                     sentences = re.split(r'(?<=[.!?])\s+', text)
-                     highlights = []
-                     for sent_idx, sentence in enumerate(sentences):
-                         sent_embedding = embedder.encode(sentence).tolist()
-                         similarity = dot(query_embedding, sent_embedding) / (norm(query_embedding) * norm(sent_embedding)) if norm(sent_embedding) != 0 else 0
-                         if similarity > 0.7:
-                             matching_words = []
-                             sent_words = sentence.split()
-                             word_idx = 0
-                             for word in words:
-                                 if word_idx < len(sent_words) and word["text"].lower() in sent_words[word_idx].lower():
-                                     matching_words.append(word)
-                                     word_idx += 1
-                             highlights.append({"sentence": sentence, "index": sent_idx, "words": matching_words})
-                     highlighted_pages.append({
-                         "page_num": page["page_num"],
-                         "text": text,
-                         "highlights": highlights,
-                         "pdf_page": pdf_page_num,
-                         "pdf_data": pdf_base64,
-                         "page_url": page_url
-                     })
-                 results.append({
-                     "filename": filenames[i],
-                     "pages": highlighted_pages,
-                     "url": urls[i],
-                     "processed_at": processed_ats[i],
-                     "similarity": similarities[i],
-                     "pdf_page_offset": pdf_page_offsets[i]
-                 })
-             return jsonify({"results": results})
-         except Exception as e:
-             logger.error(f"Search error: {e}")
-             return jsonify({"error": str(e)}), 500
-     else: # local with ChromaDB
-         try:
-             query_results = chroma_collection.query(query_texts=[query], n_results=5)
-             results = []
-             for i, doc in enumerate(query_results["documents"][0]):
-                 metadata = query_results["metadatas"][0][i]
-                 words = json.loads(metadata["words"])
-                 text = doc
-                 sentences = re.split(r'(?<=[.!?])\s+', text)
-                 highlights = []
-                 query_embedding = embedder.encode(query).tolist()
-                 for sent_idx, sentence in enumerate(sentences):
-                     sent_embedding = embedder.encode(sentence).tolist()
-                     similarity = dot(query_embedding, sent_embedding) / (norm(query_embedding) * norm(sent_embedding)) if norm(sent_embedding) != 0 else 0
-                     if similarity > 0.7:
-                         matching_words = []
-                         sent_words = sentence.split()
-                         word_idx = 0
-                         for word in words:
-                             if word_idx < len(sent_words) and word["text"].lower() in sent_words[word_idx].lower():
-                                 matching_words.append(word)
-                                 word_idx += 1
-                         highlights.append({"sentence": sentence, "index": sent_idx, "words": matching_words})
-                 with open(metadata["page_file"], "rb") as f:
-                     pdf_bytes = f.read()
-                 pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')
-                 results.append({
-                     "filename": metadata["filename"],
-                     "pages": [{
-                         "page_num": metadata["page_num"],
-                         "text": text,
-                         "highlights": highlights,
-                         "pdf_page": metadata["page_num"],
-                         "pdf_data": pdf_base64,
-                         "page_url": metadata["page_file"]
-                     }],
-                     "url": "",
-                     "processed_at": datetime.now().isoformat(),
-                     "similarity": query_results["distances"][0][i]
-                 })
-             return jsonify({"results": results})
-         except Exception as e:
-             logger.error(f"ChromaDB search error: {e}")
-             return jsonify({"error": str(e)}), 500
-
- # Download output folder
- @app.route("/download_output", methods=["GET"])
- def download_output():
-     try:
-         zip_path = "/app/output.zip"
-         shutil.make_archive("/app/output", "zip", OUTPUT_DIR)
-         return send_file(zip_path, download_name="output.zip", as_attachment=True, mimetype="application/zip")
-     except Exception as e:
-         logger.error(f"Error creating zip: {e}")
-         return jsonify({"error": str(e)}), 500
-
- # Preview output contents
- @app.route("/preview_output", methods=["GET"])
- def preview_output():
-     try:
-         combined_pdf_base64 = ""
-         if os.path.exists(COMBINED_PDF_PATH):
-             with open(COMBINED_PDF_PATH, "rb") as f:
-                 combined_pdf_base64 = base64.b64encode(f.read()).decode('utf-8')
-
-         progress_json = {}
-         if os.path.exists(PROGRESS_JSON_PATH):
-             with open(PROGRESS_JSON_PATH, "r") as f:
-                 progress_json = json.load(f)
-
-         return jsonify({
-             "combined_pdf": combined_pdf_base64,
-             "progress_json": progress_json
-         })
      except Exception as e:
-         logger.error(f"Error previewing output: {e}")
-         return jsonify({"error": str(e)}), 500

- if __name__ == "__main__":
-     port = int(os.environ.get("PORT", 7860))
-     app.run(host="0.0.0.0", port=port, debug=True)
 
+ # app.py
+ from flask import Flask, send_from_directory, jsonify, request
  import os
  import json
+ import errno

  app = Flask(__name__)

+ # Directory to store settings
+ SETTINGS_DIR = "settings"

+ # Ensure the settings directory exists and has the correct permissions
  try:
+     if not os.path.exists(SETTINGS_DIR):
+         os.makedirs(SETTINGS_DIR, mode=0o775)
+     os.chmod(SETTINGS_DIR, 0o775)
+ except OSError as e:
+     print(f"Error setting up settings directory: {e}")
+
+ # Real-world simulation parameters (unscaled)
+ simulation_params = {
+     "sun": {
+         "mass": 1.989e30,  # Real mass in kg
+         "position": [0, 0, 0],
+         "orbital_velocity": 0,  # Sun is stationary
+     },
+     "earth": {
+         "mass": 5.972e24,  # Real mass in kg
+         "position": [149.6e6, 0, 0],  # 1 AU in km
+         "orbital_velocity": 29.8,  # Real orbital velocity in km/s
+     },
+     "mars": {
+         "mass": 6.417e23,  # Real mass in kg
+         "position": [227.9e6, 0, 0],  # 1.52 AU in km
+         "orbital_velocity": 24.1,  # Real orbital velocity in km/s
+     },
+     "fluid_speed": 0.1,
+     "fluid_friction": 0.9,
+     "fluid_deflection": 0.1,
+ }
+
+ # Serve the frontend
+ @app.route('/')
+ def serve_index():
+     return send_from_directory('static', 'index.html')
+
+ # Serve static files (CSS, JS)
+ @app.route('/static/<path:path>')
+ def serve_static(path):
+     return send_from_directory('static', path)
+
+ # API to get simulation parameters
+ @app.route('/api/params', methods=['GET'])
+ def get_params():
+     return jsonify(simulation_params)
+
+ # API to update simulation parameters
+ @app.route('/api/params', methods=['POST'])
+ def update_params():
+     global simulation_params
+     data = request.get_json()
+     simulation_params.update(data)
+     return jsonify({"status": "success", "params": simulation_params})
+
+ # API to save settings to a JSON file
+ @app.route('/api/save', methods=['POST'])
+ def save_settings():
      try:
+         filename = os.path.join(SETTINGS_DIR, "settings.json")
+         with open(filename, 'w') as f:
+             json.dump(simulation_params, f, indent=4)
+         return jsonify({"status": "success", "message": "Settings saved successfully"})
+     except PermissionError as e:
+         return jsonify({"status": "error", "message": "Permission denied: Unable to save settings. Please check directory permissions."}), 500
      except Exception as e:
+         return jsonify({"status": "error", "message": f"Error saving settings: {str(e)}"}), 500

+ # API to load settings from a JSON file
+ @app.route('/api/load', methods=['GET'])
+ def load_settings():
+     global simulation_params
      try:
+         filename = os.path.join(SETTINGS_DIR, "settings.json")
+         if os.path.exists(filename):
+             with open(filename, 'r') as f:
+                 simulation_params = json.load(f)
+             return jsonify({"status": "success", "params": simulation_params})
          else:
+             return jsonify({"status": "error", "message": "No saved settings found"}), 404
+     except PermissionError as e:
+         return jsonify({"status": "error", "message": "Permission denied: Unable to load settings. Please check directory permissions."}), 500
      except Exception as e:
+         return jsonify({"status": "error", "message": f"Error loading settings: {str(e)}"}), 500

+ if __name__ == '__main__':
+     port = int(os.environ.get('PORT', 7860))  # Default port for Hugging Face Spaces
+     app.run(host='0.0.0.0', port=port)
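
For reference, a minimal client-side sketch (not part of this commit) of how the endpoints defined above could be exercised. It assumes the app is running locally on the default Spaces port 7860 and that the third-party requests package is installed; the endpoint paths and response shapes are taken from the routes in the new app.py.

# demo_client.py (hypothetical) - smoke-test the settings API above.
# Assumes the Flask app is reachable at http://localhost:7860.
import requests

BASE = "http://localhost:7860"

# Read the current simulation parameters.
params = requests.get(f"{BASE}/api/params").json()
print(params["earth"]["orbital_velocity"])  # 29.8 (km/s), per the defaults

# Update a top-level parameter; note update_params() does a shallow
# dict.update(), so nested objects like "earth" are replaced wholesale.
resp = requests.post(f"{BASE}/api/params", json={"fluid_speed": 0.2})
print(resp.json()["status"])  # "success"

# Persist the current parameters to settings/settings.json on the server...
print(requests.post(f"{BASE}/api/save").json())

# ...and load them back (the route returns 404 if nothing was saved yet).
print(requests.get(f"{BASE}/api/load").json())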