Update app.py
app.py
CHANGED
@@ -1,590 +1,97 @@
import os
-import io
-import requests
-import logging
-import re
import json
-import
-from flask import Flask, request, render_template, jsonify, send_file, Response
-from PyPDF2 import PdfReader, PdfWriter
-import pytesseract
-from pdf2image import convert_from_bytes
-from PIL import Image
-from datasets import Dataset, load_dataset
-from sentence_transformers import SentenceTransformer
-from datetime import datetime
-from numpy import dot
-from numpy.linalg import norm
-from huggingface_hub import HfApi, hf_hub_download
-from transformers import TrOCRProcessor, VisionEncoderDecoderModel
-import torch
-import chromadb
-from chromadb.utils import embedding_functions
-import shutil
-
-# Set up logging
-logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-# Set cache, uploads, and output directories
-os.environ["HF_HOME"] = "/app/cache"
-os.environ["TRANSFORMERS_CACHE"] = "/app/cache"
-os.environ["SENTENCE_TRANSFORMERS_HOME"] = "/app/cache"
-os.environ["XDG_CACHE_HOME"] = "/app/cache"
-UPLOADS_DIR = "/app/uploads"
-PAGES_DIR = os.path.join(UPLOADS_DIR, "pages")
-OUTPUT_DIR = "/app/output"
-COMBINED_PDF_PATH = os.path.join(OUTPUT_DIR, "combined_output.pdf")
-PROGRESS_JSON_PATH = os.path.join(OUTPUT_DIR, "progress_log.json")
-CHROMA_DB_PATH = os.path.join(OUTPUT_DIR, "chromadb")
-os.makedirs(PAGES_DIR, exist_ok=True)
-os.makedirs(OUTPUT_DIR, exist_ok=True)

app = Flask(__name__)

-#
-
-HF_DATASET_REPO = "broadfield-dev/pdf-ocr-dataset"
-HF_API = HfApi()
-
-# Tracking file for resuming
-TRACKING_FILE = "/app/cache/processing_state.json"
-
-# Load sentence transformer
-try:
-    embedder = SentenceTransformer('all-MiniLM-L6-v2', cache_folder="/app/cache")
-    logger.info("SentenceTransformer loaded successfully")
-except Exception as e:
-    logger.error(f"Failed to load SentenceTransformer: {e}")

-#
try:
-#
-#
-def
-
image = images[0]
|
117 |
-
data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
|
118 |
-
text = pytesseract.image_to_string(image)
|
119 |
-
words = [
|
120 |
-
{"text": data["text"][i], "left": data["left"][i], "top": data["top"][i], "width": data["width"][i], "height": data["height"][i]}
|
121 |
-
for i in range(len(data["text"])) if data["text"][i].strip()
|
122 |
-
]
|
123 |
-
logger.info(f"Tesseract processed page {page_num + 1} with {len(words)} words")
|
124 |
-
return {"page_num": page_num + 1, "text": text, "words": words}
|
125 |
-
except Exception as e:
|
126 |
-
logger.error(f"Tesseract error on page {page_num + 1}: {e}")
|
127 |
-
return {"page_num": page_num + 1, "text": f"Tesseract Error: {str(e)}", "words": []}
|
128 |
-
|
129 |
-
# TrOCR OCR
|
130 |
-
def ocr_with_trocr(pdf_bytes, page_num):
|
131 |
-
if not trocr_model or not trocr_processor:
|
132 |
-
logger.warning(f"TrOCR not available for page {page_num + 1}")
|
133 |
-
return {"page_num": page_num + 1, "text": "TrOCR not initialized", "words": []}
|
134 |
-
try:
|
135 |
-
images = convert_from_bytes(pdf_bytes, first_page=page_num+1, last_page=page_num+1)
|
136 |
-
if not images:
|
137 |
-
logger.info(f"Page {page_num + 1} is blank")
|
138 |
-
return {"page_num": page_num + 1, "text": "Blank page", "words": []}
|
139 |
-
image = images[0].convert("RGB")
|
140 |
-
pixel_values = trocr_processor(image, return_tensors="pt").pixel_values.to("cpu")
|
141 |
-
with torch.no_grad():
|
142 |
-
generated_ids = trocr_model.generate(pixel_values, max_length=50)
|
143 |
-
text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
144 |
-
words = [{"text": word, "left": 0, "top": 0, "width": 0, "height": 0} for word in text.split()]
|
145 |
-
logger.info(f"TrOCR processed page {page_num + 1} with text: {text}")
|
146 |
-
return {"page_num": page_num + 1, "text": text, "words": words}
|
147 |
-
except Exception as e:
|
148 |
-
logger.error(f"TrOCR error on page {page_num + 1}: {e}")
|
149 |
-
return {"page_num": page_num + 1, "text": f"TrOCR Error: {str(e)}", "words": []}
|
150 |
-
|
151 |
-
# Map Tesseract bounding boxes to OCR text
|
152 |
-
def map_tesseract_to_ocr(tesseract_result, ocr_result):
|
153 |
-
if not tesseract_result["words"] or "Error" in ocr_result["text"]:
|
154 |
-
logger.info(f"Mapping skipped for page {tesseract_result['page_num']}: No Tesseract words or OCR error")
|
155 |
-
return {**ocr_result, "words": tesseract_result["words"]}
|
156 |
-
|
157 |
-
ocr_text = ocr_result["text"]
|
158 |
-
tesseract_words = tesseract_result["words"]
|
159 |
-
|
160 |
-
sentences = re.split(r'(?<=[.!?])\s+', ocr_text.strip())
|
161 |
-
sentence_embeddings = embedder.encode(sentences)
|
162 |
-
|
163 |
-
mapped_words = []
|
164 |
-
for word in tesseract_words:
|
165 |
-
word_embedding = embedder.encode(word["text"])
|
166 |
-
similarities = [
|
167 |
-
dot(word_embedding, sent_emb) / (norm(word_embedding) * norm(sent_emb)) if norm(sent_emb) != 0 else 0
|
168 |
-
for sent_emb in sentence_embeddings
|
169 |
-
]
|
170 |
-
best_match_idx = similarities.index(max(similarities))
|
171 |
-
best_sentence = sentences[best_match_idx]
|
172 |
-
if word["text"].lower() in best_sentence.lower():
|
173 |
-
mapped_words.append(word)
|
174 |
-
else:
|
175 |
-
mapped_words.append(word)
|
176 |
-
logger.info(f"Mapped {len(mapped_words)} words for page {tesseract_result['page_num']}")
|
177 |
-
return {**ocr_result, "words": mapped_words}
|
178 |
-
|
179 |
-
# Update combined PDF
|
180 |
-
def update_combined_pdf(pdf_bytes, page_num):
|
181 |
-
pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
|
182 |
-
page = pdf_reader.pages[page_num]
|
183 |
-
writer = PdfWriter()
|
184 |
-
if os.path.exists(COMBINED_PDF_PATH):
|
185 |
-
existing_pdf = PdfReader(COMBINED_PDF_PATH)
|
186 |
-
for p in existing_pdf.pages:
|
187 |
-
writer.add_page(p)
|
188 |
-
writer.add_page(page)
|
189 |
-
with open(COMBINED_PDF_PATH, "wb") as f:
|
190 |
-
writer.write(f)
|
191 |
-
logger.info(f"Updated combined PDF with page {page_num + 1}")
|
192 |
-
|
193 |
-
# Process page
|
194 |
-
def process_page(pdf_bytes, page_num, ocr_backend, filename, tracking_state, storage_mode):
|
195 |
-
tesseract_result = ocr_with_tesseract(pdf_bytes, page_num)
|
196 |
-
ocr_result = ocr_with_trocr(pdf_bytes, page_num) if ocr_backend == "trocr" else ocr_with_tesseract(pdf_bytes, page_num)
|
197 |
-
combined_result = map_tesseract_to_ocr(tesseract_result, ocr_result)
|
198 |
-
|
199 |
-
local_page_path = os.path.join(PAGES_DIR, f"{filename}_page_{combined_result['page_num']}_{datetime.now().strftime('%Y%m%d%H%M%S')}.pdf")
|
200 |
-
writer = PdfWriter()
|
201 |
-
pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
|
202 |
-
writer.add_page(pdf_reader.pages[page_num])
|
203 |
-
with open(local_page_path, "wb") as f:
|
204 |
-
writer.write(f)
|
205 |
-
|
206 |
-
if storage_mode == "hf":
|
207 |
-
remote_page_path = f"pages/{os.path.basename(local_page_path)}"
|
208 |
-
HF_API.upload_file(
|
209 |
-
path_or_fileobj=local_page_path,
|
210 |
-
path_in_repo=remote_page_path,
|
211 |
-
repo_id=HF_DATASET_REPO,
|
212 |
-
repo_type="dataset",
|
213 |
-
token=HF_TOKEN
|
214 |
-
)
|
215 |
-
logger.info(f"Uploaded page to {HF_DATASET_REPO}/{remote_page_path}")
|
216 |
-
combined_result["page_file"] = remote_page_path
|
217 |
-
else: # local
|
218 |
-
update_combined_pdf(pdf_bytes, page_num)
|
219 |
-
combined_result["page_file"] = local_page_path
|
220 |
-
|
221 |
-
combined_result["pdf_page"] = tracking_state["last_offset"] + page_num
|
222 |
-
|
223 |
-
# Update ChromaDB
|
224 |
-
chroma_collection.add(
|
225 |
-
documents=[combined_result["text"]],
|
226 |
-
metadatas=[{"filename": filename, "page_num": combined_result["page_num"], "page_file": combined_result["page_file"], "words": json.dumps(combined_result["words"])}],
|
227 |
-
ids=[f"{filename}_page_{combined_result['page_num']}"]
|
228 |
-
)
|
229 |
-
logger.info(f"Added page {combined_result['page_num']} to ChromaDB")
|
230 |
-
|
231 |
-
return combined_result
|
232 |
-
|
233 |
-
# Extract PDF URLs from text
|
234 |
-
def extract_pdf_urls(text):
|
235 |
-
url_pattern = r'(https?://[^\s]+?\.pdf)'
|
236 |
-
return re.findall(url_pattern, text)
|
237 |
-
|
238 |
-
# Load or initialize tracking state
|
239 |
-
def load_tracking_state():
|
240 |
-
if os.path.exists(TRACKING_FILE):
|
241 |
-
with open(TRACKING_FILE, "r") as f:
|
242 |
-
return json.load(f)
|
243 |
-
return {"processed_urls": {}, "last_offset": 0}
|
244 |
-
|
245 |
-
def save_tracking_state(state):
|
246 |
-
with open(TRACKING_FILE, "w") as f:
|
247 |
-
json.dump(state, f)
|
248 |
-
|
249 |
-
# Push to Hugging Face Dataset
|
250 |
-
def push_to_hf_dataset(new_data):
|
251 |
try:
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
dataset = load_dataset(HF_DATASET_REPO, token=HF_TOKEN, cache_dir="/app/cache")
|
259 |
-
existing_data = dataset["train"].to_dict()
|
260 |
-
logger.info(f"Loaded existing dataset with keys: {list(existing_data.keys())}")
|
261 |
-
except Exception as e:
|
262 |
-
logger.info(f"No existing dataset found or error loading: {e}, initializing new dataset")
|
263 |
-
existing_data = {"filename": [], "pages": [], "url": [], "embedding": [], "processed_at": [], "pdf_page_offset": []}
|
264 |
-
|
265 |
-
required_keys = ["filename", "pages", "url", "embedding", "processed_at", "pdf_page_offset"]
|
266 |
-
for key in required_keys:
|
267 |
-
if key not in existing_data:
|
268 |
-
existing_data[key] = []
|
269 |
-
logger.warning(f"Initialized missing key '{key}' in existing_data")
|
270 |
-
|
271 |
-
existing_urls = set(existing_data["url"])
|
272 |
-
for item in new_data:
|
273 |
-
logger.debug(f"Processing item: {item}")
|
274 |
-
if item["url"] not in existing_urls:
|
275 |
-
for key in required_keys:
|
276 |
-
existing_data[key].append(item.get(key, None))
|
277 |
-
existing_urls.add(item["url"])
|
278 |
-
logger.info(f"Added new URL: {item['url']}")
|
279 |
-
else:
|
280 |
-
idx = existing_data["url"].index(item["url"])
|
281 |
-
existing_data["pages"][idx].extend(item["pages"])
|
282 |
-
existing_data["embedding"][idx] = item["embedding"]
|
283 |
-
existing_data["processed_at"][idx] = item["processed_at"]
|
284 |
-
logger.info(f"Updated existing URL: {item['url']}")
|
285 |
-
|
286 |
-
updated_dataset = Dataset.from_dict(existing_data)
|
287 |
-
updated_dataset.push_to_hub(HF_DATASET_REPO, token=HF_TOKEN)
|
288 |
-
logger.info(f"Successfully appended/updated {len(new_data)} records to {HF_DATASET_REPO}")
|
289 |
except Exception as e:
|
290 |
-
|
291 |
-
raise
|
292 |
-
|
293 |
-
# Check if URL is fully processed
|
294 |
-
def is_url_fully_processed(url, progress_log, total_pages):
|
295 |
-
return url in progress_log["urls"] and progress_log["urls"][url]["status"] == "completed" and progress_log["urls"][url]["processed_pages"] >= total_pages
|
296 |
|
297 |
-
#
|
298 |
-
|
299 |
-
|
|
|
300 |
try:
|
-        pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
-        total_pages = len(pdf_reader.pages)
-
-        progress_log["urls"].setdefault(url, {"status": "pending", "processed_pages": 0})
-        start_page = progress_log["urls"][url]["processed_pages"]
-
-        if is_url_fully_processed(url, progress_log, total_pages):
-            yield f"data: {json.dumps({'status': 'skipped', 'filename': filename, 'message': 'URL already fully processed'})}\n\n"
-            return
-
-        pages = []
-        for page_num in range(start_page, total_pages):
-            yield f"data: {json.dumps({'status': 'processing', 'filename': filename, 'page_num': page_num + 1, 'total_pages': total_pages})}\n\n"
-            page = process_page(pdf_bytes, page_num, ocr_backend, filename, tracking_state, storage_mode)
-            pages.append(page)
-            yield f"data: {json.dumps({'filename': filename, 'page': page})}\n\n"
-            progress_log["urls"][url]["processed_pages"] = page_num + 1
-            save_progress_log(progress_log, storage_mode)
-
-        full_text = "\n\n".join(f"Page {page['page_num']}\n{page['text']}" for page in pages)
-        embedding = embedder.encode(full_text).tolist() if full_text.strip() else None
-        result = {
-            "filename": filename,
-            "pages": pages,
-            "url": url,
-            "embedding": embedding,
-            "processed_at": datetime.now().isoformat(),
-            "pdf_page_offset": tracking_state["last_offset"]
-        }
-        if storage_mode == "hf":
-            push_to_hf_dataset([result])
-        tracking_state["last_offset"] += total_pages - start_page
-        progress_log["urls"][url]["status"] = "completed"
-        save_tracking_state(tracking_state)
-        save_progress_log(progress_log, storage_mode)
-        yield f"data: {json.dumps({'status': 'completed', 'filename': filename, 'new_offset': tracking_state['last_offset']})}\n\n"
-        logger.info(f"Completed processing {filename} with new offset {tracking_state['last_offset']}")
-    except requests.RequestException as e:
-        logger.error(f"Failed to fetch PDF from {url}: {e}")
-        yield f"data: {json.dumps({'status': 'error', 'filename': filename, 'message': f'Error fetching PDF: {str(e)}'})}\n\n"
-    except Exception as e:
-        logger.error(f"Error processing {url}: {e}")
-        yield f"data: {json.dumps({'status': 'error', 'filename': filename, 'message': f'Error: {str(e)}'})}\n\n"
-
-# Process text content with SSE
-def process_text_content(text, filename, ocr_backend, tracking_state, progress_log, storage_mode):
-    try:
-        pdf_urls = extract_pdf_urls(text)
-        processed_urls = [url for url in pdf_urls if url in progress_log["urls"] and progress_log["urls"][url]["status"] == "completed"]
-        new_urls = [url for url in pdf_urls if url not in progress_log["urls"] or progress_log["urls"][url]["status"] != "completed"]
-
-        initial_text = (f"Found {len(pdf_urls)} PDF URLs:\n" +
-                        f"Already processed: {len(processed_urls)}\n" + "\n".join(processed_urls) + "\n" +
-                        f"To process: {len(new_urls)}\n" + "\n".join(new_urls) + "\n\nProcessing...")
-        yield f"data: {json.dumps({'status': 'info', 'filename': filename, 'message': initial_text})}\n\n"
-
-        for url in new_urls:
-            logger.info(f"Starting processing of {url} with offset {tracking_state['last_offset']}")
-            for event in process_pdf_url(url, ocr_backend, tracking_state, progress_log, storage_mode):
-                yield event
-    except Exception as e:
-        logger.error(f"Error processing text content for {filename}: {e}")
-        yield f"data: {json.dumps({'status': 'error', 'filename': filename, 'message': f'Error: {str(e)}'})}\n\n"
-
-# Home route
-@app.route("/", methods=["GET"])
-def index():
-    return render_template("index.html")
-
-# Process URL endpoint with GET
-@app.route("/process_url", methods=["GET"])
-def process_url():
-    url = request.args.get("url")
-    ocr_backend = request.args.get("ocr_backend", "trocr")
-    storage_mode = request.args.get("storage_mode", "hf")
-
-    if not url:
-        return jsonify({"error": "No URL provided"}), 400
-
-    tracking_state = load_tracking_state()
-    progress_log = load_progress_log(storage_mode)
-
-    def generate():
-        logger.info(f"Processing URL: {url} with ocr_backend={ocr_backend}, storage_mode={storage_mode}, starting offset={tracking_state['last_offset']}")
-        if url.endswith(".pdf"):
-            for event in process_pdf_url(url, ocr_backend, tracking_state, progress_log, storage_mode):
-                yield event
-        elif url.endswith(".txt"):
-            try:
-                response = requests.get(url, timeout=10)
-                response.raise_for_status()
-                text = response.text
-                filename = url.split("/")[-1]
-                logger.info(f"Fetched text from {url}")
-                for event in process_text_content(text, filename, ocr_backend, tracking_state, progress_log, storage_mode):
-                    yield event
-            except requests.RequestException as e:
-                logger.error(f"Failed to fetch text from {url}: {e}")
-                yield f"data: {json.dumps({'status': 'error', 'filename': url, 'message': f'Error fetching URL: {str(e)}'})}\n\n"
        else:
-    return Response(generate(), mimetype="text/event-stream")
-
-# Search page
-@app.route("/search", methods=["GET"])
-def search_page():
-    storage_mode = request.args.get("storage_mode", "hf")
-    if storage_mode == "hf":
-        try:
-            dataset = load_dataset(HF_DATASET_REPO, token=HF_TOKEN, cache_dir="/app/cache")["train"]
-            files = [{"filename": f, "url": u, "pages": p} for f, u, p in zip(dataset["filename"], dataset["url"], dataset["pages"])]
-            return render_template("search.html", files=files, storage_mode=storage_mode)
-        except Exception as e:
-            logger.error(f"Error loading search page: {e}")
-            return render_template("search.html", files=[], error=str(e), storage_mode=storage_mode)
-    else: # local
-        files = []
-        results = chroma_collection.get()
-        for i, metadata in enumerate(results["metadatas"]):
-            files.append({
-                "filename": metadata["filename"],
-                "url": "",
-                "pages": [{"page_num": metadata["page_num"], "text": results["documents"][i], "page_file": metadata["page_file"], "words": json.loads(metadata["words"])}]
-            })
-        return render_template("search.html", files=files, storage_mode=storage_mode)
-
-# Semantic search route
-@app.route("/search_documents", methods=["POST"])
-def search_documents():
-    query = request.form.get("query")
-    storage_mode = request.form.get("storage_mode", "hf")
-    if not query:
-        return jsonify({"error": "No query provided"}), 400
-
-    if storage_mode == "hf":
-        try:
-            dataset = load_dataset(HF_DATASET_REPO, token=HF_TOKEN, cache_dir="/app/cache")["train"]
-            query_embedding = embedder.encode(query).tolist()
-
-            embeddings = [e for e in dataset["embedding"] if e is not None]
-            documents = dataset["pages"]
-            filenames = dataset["filename"]
-            urls = dataset["url"]
-            processed_ats = dataset["processed_at"]
-            pdf_page_offsets = dataset["pdf_page_offset"]
-
-            similarities = [
-                dot(query_embedding, emb) / (norm(query_embedding) * norm(emb)) if norm(emb) != 0 else 0
-                for emb in embeddings
-            ]
-
-            sorted_indices = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)[:5]
-            results = []
-
-            for idx, i in enumerate(sorted_indices):
-                pages = documents[i]
-                highlighted_pages = []
-                for page in pages:
-                    words = page["words"]
-                    text = page["text"]
-                    pdf_page_num = page["pdf_page"]
-                    page_file = page["page_file"]
-                    page_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{page_file}"
-                    response = requests.get(page_url)
-                    response.raise_for_status()
-                    pdf_bytes = response.content
-                    pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')
-
-                    sentences = re.split(r'(?<=[.!?])\s+', text)
-                    highlights = []
-                    for sent_idx, sentence in enumerate(sentences):
-                        sent_embedding = embedder.encode(sentence).tolist()
-                        similarity = dot(query_embedding, sent_embedding) / (norm(query_embedding) * norm(sent_embedding)) if norm(sent_embedding) != 0 else 0
-                        if similarity > 0.7:
-                            matching_words = []
-                            sent_words = sentence.split()
-                            word_idx = 0
-                            for word in words:
-                                if word_idx < len(sent_words) and word["text"].lower() in sent_words[word_idx].lower():
-                                    matching_words.append(word)
-                                    word_idx += 1
-                            highlights.append({"sentence": sentence, "index": sent_idx, "words": matching_words})
-                    highlighted_pages.append({
-                        "page_num": page["page_num"],
-                        "text": text,
-                        "highlights": highlights,
-                        "pdf_page": pdf_page_num,
-                        "pdf_data": pdf_base64,
-                        "page_url": page_url
-                    })
-                results.append({
-                    "filename": filenames[i],
-                    "pages": highlighted_pages,
-                    "url": urls[i],
-                    "processed_at": processed_ats[i],
-                    "similarity": similarities[i],
-                    "pdf_page_offset": pdf_page_offsets[i]
-                })
-            return jsonify({"results": results})
-        except Exception as e:
-            logger.error(f"Search error: {e}")
-            return jsonify({"error": str(e)}), 500
-    else: # local with ChromaDB
-        try:
-            query_results = chroma_collection.query(query_texts=[query], n_results=5)
-            results = []
-            for i, doc in enumerate(query_results["documents"][0]):
-                metadata = query_results["metadatas"][0][i]
-                words = json.loads(metadata["words"])
-                text = doc
-                sentences = re.split(r'(?<=[.!?])\s+', text)
-                highlights = []
-                query_embedding = embedder.encode(query).tolist()
-                for sent_idx, sentence in enumerate(sentences):
-                    sent_embedding = embedder.encode(sentence).tolist()
-                    similarity = dot(query_embedding, sent_embedding) / (norm(query_embedding) * norm(sent_embedding)) if norm(sent_embedding) != 0 else 0
-                    if similarity > 0.7:
-                        matching_words = []
-                        sent_words = sentence.split()
-                        word_idx = 0
-                        for word in words:
-                            if word_idx < len(sent_words) and word["text"].lower() in sent_words[word_idx].lower():
-                                matching_words.append(word)
-                                word_idx += 1
-                        highlights.append({"sentence": sentence, "index": sent_idx, "words": matching_words})
-                with open(metadata["page_file"], "rb") as f:
-                    pdf_bytes = f.read()
-                pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')
-                results.append({
-                    "filename": metadata["filename"],
-                    "pages": [{
-                        "page_num": metadata["page_num"],
-                        "text": text,
-                        "highlights": highlights,
-                        "pdf_page": metadata["page_num"],
-                        "pdf_data": pdf_base64,
-                        "page_url": metadata["page_file"]
-                    }],
-                    "url": "",
-                    "processed_at": datetime.now().isoformat(),
-                    "similarity": query_results["distances"][0][i]
-                })
-            return jsonify({"results": results})
-        except Exception as e:
-            logger.error(f"ChromaDB search error: {e}")
-            return jsonify({"error": str(e)}), 500
-
-# Download output folder
-@app.route("/download_output", methods=["GET"])
-def download_output():
-    try:
-        zip_path = "/app/output.zip"
-        shutil.make_archive("/app/output", "zip", OUTPUT_DIR)
-        return send_file(zip_path, download_name="output.zip", as_attachment=True, mimetype="application/zip")
-    except Exception as e:
-        logger.error(f"Error creating zip: {e}")
-        return jsonify({"error": str(e)}), 500
-
-# Preview output contents
-@app.route("/preview_output", methods=["GET"])
-def preview_output():
-    try:
-        combined_pdf_base64 = ""
-        if os.path.exists(COMBINED_PDF_PATH):
-            with open(COMBINED_PDF_PATH, "rb") as f:
-                combined_pdf_base64 = base64.b64encode(f.read()).decode('utf-8')
-
-        progress_json = {}
-        if os.path.exists(PROGRESS_JSON_PATH):
-            with open(PROGRESS_JSON_PATH, "r") as f:
-                progress_json = json.load(f)
-
-        return jsonify({
-            "combined_pdf": combined_pdf_base64,
-            "progress_json": progress_json
-        })
    except Exception as e:
-        return jsonify({"error": str(e)}), 500

-if __name__ ==
-    port = int(os.environ.get(
-    app.run(host=
+# app.py
+from flask import Flask, send_from_directory, jsonify, request
import os
import json
+import errno

app = Flask(__name__)

+# Directory to store settings
+SETTINGS_DIR = "settings"

+# Ensure the settings directory exists and has the correct permissions
try:
+    if not os.path.exists(SETTINGS_DIR):
+        os.makedirs(SETTINGS_DIR, mode=0o775)
+    os.chmod(SETTINGS_DIR, 0o775)
+except OSError as e:
+    print(f"Error setting up settings directory: {e}")
+
+# Real-world simulation parameters (unscaled)
+simulation_params = {
+    "sun": {
+        "mass": 1.989e30,  # Real mass in kg
+        "position": [0, 0, 0],
+        "orbital_velocity": 0,  # Sun is stationary
+    },
+    "earth": {
+        "mass": 5.972e24,  # Real mass in kg
+        "position": [149.6e6, 0, 0],  # 1 AU in km
+        "orbital_velocity": 29.8,  # Real orbital velocity in km/s
+    },
+    "mars": {
+        "mass": 6.417e23,  # Real mass in kg
+        "position": [227.9e6, 0, 0],  # 1.52 AU in km
+        "orbital_velocity": 24.1,  # Real orbital velocity in km/s
+    },
+    "fluid_speed": 0.1,
+    "fluid_friction": 0.9,
+    "fluid_deflection": 0.1,
+}
+
+# Serve the frontend
+@app.route('/')
+def serve_index():
+    return send_from_directory('static', 'index.html')
+
+# Serve static files (CSS, JS)
+@app.route('/static/<path:path>')
+def serve_static(path):
+    return send_from_directory('static', path)
+
+# API to get simulation parameters
+@app.route('/api/params', methods=['GET'])
+def get_params():
+    return jsonify(simulation_params)
+
+# API to update simulation parameters
+@app.route('/api/params', methods=['POST'])
+def update_params():
+    global simulation_params
+    data = request.get_json()
+    simulation_params.update(data)
+    return jsonify({"status": "success", "params": simulation_params})
+
+# API to save settings to a JSON file
+@app.route('/api/save', methods=['POST'])
+def save_settings():
    try:
+        filename = os.path.join(SETTINGS_DIR, "settings.json")
+        with open(filename, 'w') as f:
+            json.dump(simulation_params, f, indent=4)
+        return jsonify({"status": "success", "message": "Settings saved successfully"})
+    except PermissionError as e:
+        return jsonify({"status": "error", "message": "Permission denied: Unable to save settings. Please check directory permissions."}), 500
    except Exception as e:
+        return jsonify({"status": "error", "message": f"Error saving settings: {str(e)}"}), 500

+# API to load settings from a JSON file
+@app.route('/api/load', methods=['GET'])
+def load_settings():
+    global simulation_params
    try:
+        filename = os.path.join(SETTINGS_DIR, "settings.json")
+        if os.path.exists(filename):
+            with open(filename, 'r') as f:
+                simulation_params = json.load(f)
+            return jsonify({"status": "success", "params": simulation_params})
        else:
+            return jsonify({"status": "error", "message": "No saved settings found"}), 404
+    except PermissionError as e:
+        return jsonify({"status": "error", "message": "Permission denied: Unable to load settings. Please check directory permissions."}), 500
    except Exception as e:
+        return jsonify({"status": "error", "message": f"Error loading settings: {str(e)}"}), 500

+if __name__ == '__main__':
+    port = int(os.environ.get('PORT', 7860))  # Default port for Hugging Face Spaces
+    app.run(host='0.0.0.0', port=port)
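For quick manual testing of the endpoints added in this commit, a minimal client sketch follows. It is illustrative only: it assumes the app is running locally on the default port 7860, that the third-party requests package is installed, and the file name client_sketch.py and the example payload value are made up.

# client_sketch.py -- hypothetical helper, not part of the Space itself.
# Assumes the Flask app above is running locally on its default port 7860.
import requests

BASE = "http://localhost:7860"

# Fetch the current simulation parameters (GET /api/params).
params = requests.get(f"{BASE}/api/params").json()
print("Earth mass (kg):", params["earth"]["mass"])

# Update a top-level parameter (POST /api/params). The handler merges the
# posted JSON into simulation_params with dict.update(), so nested objects
# such as "earth" would be replaced wholesale if included here.
updated = requests.post(f"{BASE}/api/params", json={"fluid_speed": 0.2}).json()
print("fluid_speed is now:", updated["params"]["fluid_speed"])

# Persist the current parameters to settings/settings.json (POST /api/save).
print(requests.post(f"{BASE}/api/save").json())

# Read them back (GET /api/load); returns 404 if nothing has been saved yet.
resp = requests.get(f"{BASE}/api/load")
print(resp.status_code, resp.json())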