Update app.py
app.py
CHANGED
@@ -1,97 +1,590 @@
- # app.py
- from flask import Flask, send_from_directory, jsonify, request
- import
- def
- global simulation_params
- if __name__ ==
- port = int(os.environ.get(
- app.run(host=

import os
import io
import requests
import logging
import re
import json
import base64
from flask import Flask, request, render_template, jsonify, send_file, Response
from PyPDF2 import PdfReader, PdfWriter
import pytesseract
from pdf2image import convert_from_bytes
from PIL import Image
from datasets import Dataset, load_dataset
from sentence_transformers import SentenceTransformer
from datetime import datetime
from numpy import dot
from numpy.linalg import norm
from huggingface_hub import HfApi, hf_hub_download
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import torch
import chromadb
from chromadb.utils import embedding_functions
import shutil

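# The imports above imply roughly the following third-party dependencies (a
# requirements sketch inferred from this file, not an official list): flask,
# requests, PyPDF2, pytesseract, pdf2image, Pillow, datasets, sentence-transformers,
# huggingface_hub, transformers, torch, chromadb, numpy. pytesseract additionally
# needs the Tesseract binary installed on the system, and pdf2image needs Poppler.
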
# Set up logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Set cache, uploads, and output directories
os.environ["HF_HOME"] = "/app/cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/cache"
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "/app/cache"
os.environ["XDG_CACHE_HOME"] = "/app/cache"
UPLOADS_DIR = "/app/uploads"
PAGES_DIR = os.path.join(UPLOADS_DIR, "pages")
OUTPUT_DIR = "/app/output"
COMBINED_PDF_PATH = os.path.join(OUTPUT_DIR, "combined_output.pdf")
PROGRESS_JSON_PATH = os.path.join(OUTPUT_DIR, "progress_log.json")
CHROMA_DB_PATH = os.path.join(OUTPUT_DIR, "chromadb")
os.makedirs(PAGES_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

app = Flask(__name__)

# Hugging Face Hub configuration
HF_TOKEN = os.getenv("HF_TOKEN")
HF_DATASET_REPO = "broadfield-dev/pdf-ocr-dataset"
HF_API = HfApi()

# Tracking file for resuming
TRACKING_FILE = "/app/cache/processing_state.json"

# Load sentence transformer
try:
    embedder = SentenceTransformer('all-MiniLM-L6-v2', cache_folder="/app/cache")
    logger.info("SentenceTransformer loaded successfully")
except Exception as e:
    logger.error(f"Failed to load SentenceTransformer: {e}")

# Initialize TrOCR (CPU-only)
try:
    trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
    trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
    trocr_model.to("cpu").eval()
    logger.info("TrOCR initialized successfully on CPU")
except Exception as e:
    logger.error(f"Failed to initialize TrOCR: {e}")
    trocr_model = None
    trocr_processor = None

# Initialize ChromaDB
chroma_client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
chroma_collection = chroma_client.get_or_create_collection(name="pdf_pages", embedding_function=sentence_transformer_ef)

# Load or initialize progress log
def load_progress_log(storage_mode):
    if storage_mode == "hf":
        try:
            progress_file = hf_hub_download(repo_id=HF_DATASET_REPO, filename="progress_log.json", repo_type="dataset", token=HF_TOKEN)
            with open(progress_file, "r") as f:
                return json.load(f)
        except Exception as e:
            logger.info(f"No HF progress log found or error loading: {e}, initializing new log")
            return {"urls": {}}
    else:  # local
        if os.path.exists(PROGRESS_JSON_PATH):
            with open(PROGRESS_JSON_PATH, "r") as f:
                return json.load(f)
        return {"urls": {}}

def save_progress_log(progress_log, storage_mode):
    if storage_mode == "hf":
        with open("/app/cache/progress_log.json", "w") as f:
            json.dump(progress_log, f)
        HF_API.upload_file(
            path_or_fileobj="/app/cache/progress_log.json",
            path_in_repo="progress_log.json",
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN
        )
        logger.info("Progress log updated in Hugging Face dataset")
    else:  # local
        with open(PROGRESS_JSON_PATH, "w") as f:
            json.dump(progress_log, f)
        logger.info("Progress log updated locally")

# Tesseract OCR with bounding boxes
def ocr_with_tesseract(pdf_bytes, page_num):
    try:
        images = convert_from_bytes(pdf_bytes, first_page=page_num+1, last_page=page_num+1)
        if not images:
            logger.info(f"Page {page_num + 1} is blank")
            return {"page_num": page_num + 1, "text": "Blank page", "words": []}
        image = images[0]
        data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
        text = pytesseract.image_to_string(image)
        words = [
            {"text": data["text"][i], "left": data["left"][i], "top": data["top"][i], "width": data["width"][i], "height": data["height"][i]}
            for i in range(len(data["text"])) if data["text"][i].strip()
        ]
        logger.info(f"Tesseract processed page {page_num + 1} with {len(words)} words")
        return {"page_num": page_num + 1, "text": text, "words": words}
    except Exception as e:
        logger.error(f"Tesseract error on page {page_num + 1}: {e}")
        return {"page_num": page_num + 1, "text": f"Tesseract Error: {str(e)}", "words": []}

# TrOCR OCR
def ocr_with_trocr(pdf_bytes, page_num):
    if not trocr_model or not trocr_processor:
        logger.warning(f"TrOCR not available for page {page_num + 1}")
        return {"page_num": page_num + 1, "text": "TrOCR not initialized", "words": []}
    try:
        images = convert_from_bytes(pdf_bytes, first_page=page_num+1, last_page=page_num+1)
        if not images:
            logger.info(f"Page {page_num + 1} is blank")
            return {"page_num": page_num + 1, "text": "Blank page", "words": []}
        image = images[0].convert("RGB")
        pixel_values = trocr_processor(image, return_tensors="pt").pixel_values.to("cpu")
        with torch.no_grad():
            generated_ids = trocr_model.generate(pixel_values, max_length=50)
        text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        words = [{"text": word, "left": 0, "top": 0, "width": 0, "height": 0} for word in text.split()]
        logger.info(f"TrOCR processed page {page_num + 1} with text: {text}")
        return {"page_num": page_num + 1, "text": text, "words": words}
    except Exception as e:
        logger.error(f"TrOCR error on page {page_num + 1}: {e}")
        return {"page_num": page_num + 1, "text": f"TrOCR Error: {str(e)}", "words": []}

# Map Tesseract bounding boxes to OCR text
def map_tesseract_to_ocr(tesseract_result, ocr_result):
    if not tesseract_result["words"] or "Error" in ocr_result["text"]:
        logger.info(f"Mapping skipped for page {tesseract_result['page_num']}: No Tesseract words or OCR error")
        return {**ocr_result, "words": tesseract_result["words"]}

    ocr_text = ocr_result["text"]
    tesseract_words = tesseract_result["words"]

    sentences = re.split(r'(?<=[.!?])\s+', ocr_text.strip())
    sentence_embeddings = embedder.encode(sentences)

    mapped_words = []
    for word in tesseract_words:
        word_embedding = embedder.encode(word["text"])
        similarities = [
            dot(word_embedding, sent_emb) / (norm(word_embedding) * norm(sent_emb)) if norm(sent_emb) != 0 else 0
            for sent_emb in sentence_embeddings
        ]
        best_match_idx = similarities.index(max(similarities))
        best_sentence = sentences[best_match_idx]
        if word["text"].lower() in best_sentence.lower():
            mapped_words.append(word)
        else:
            mapped_words.append(word)
    logger.info(f"Mapped {len(mapped_words)} words for page {tesseract_result['page_num']}")
    return {**ocr_result, "words": mapped_words}

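# A minimal sketch of exercising the OCR helpers above directly, outside the web
# routes (assumes a local file named "sample.pdf" exists; the filename is only
# illustrative):
#
#   with open("sample.pdf", "rb") as fh:
#       pdf_bytes = fh.read()
#   tess = ocr_with_tesseract(pdf_bytes, 0)      # words + bounding boxes
#   trocr = ocr_with_trocr(pdf_bytes, 0)         # handwriting-oriented text
#   merged = map_tesseract_to_ocr(tess, trocr)   # TrOCR text, Tesseract boxes
#   print(merged["text"], len(merged["words"]))
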
# Update combined PDF
def update_combined_pdf(pdf_bytes, page_num):
    pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
    page = pdf_reader.pages[page_num]
    writer = PdfWriter()
    if os.path.exists(COMBINED_PDF_PATH):
        existing_pdf = PdfReader(COMBINED_PDF_PATH)
        for p in existing_pdf.pages:
            writer.add_page(p)
    writer.add_page(page)
    with open(COMBINED_PDF_PATH, "wb") as f:
        writer.write(f)
    logger.info(f"Updated combined PDF with page {page_num + 1}")

# Process page
def process_page(pdf_bytes, page_num, ocr_backend, filename, tracking_state, storage_mode):
    tesseract_result = ocr_with_tesseract(pdf_bytes, page_num)
    ocr_result = ocr_with_trocr(pdf_bytes, page_num) if ocr_backend == "trocr" else ocr_with_tesseract(pdf_bytes, page_num)
    combined_result = map_tesseract_to_ocr(tesseract_result, ocr_result)

    local_page_path = os.path.join(PAGES_DIR, f"{filename}_page_{combined_result['page_num']}_{datetime.now().strftime('%Y%m%d%H%M%S')}.pdf")
    writer = PdfWriter()
    pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
    writer.add_page(pdf_reader.pages[page_num])
    with open(local_page_path, "wb") as f:
        writer.write(f)

    if storage_mode == "hf":
        remote_page_path = f"pages/{os.path.basename(local_page_path)}"
        HF_API.upload_file(
            path_or_fileobj=local_page_path,
            path_in_repo=remote_page_path,
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN
        )
        logger.info(f"Uploaded page to {HF_DATASET_REPO}/{remote_page_path}")
        combined_result["page_file"] = remote_page_path
    else:  # local
        update_combined_pdf(pdf_bytes, page_num)
        combined_result["page_file"] = local_page_path

    combined_result["pdf_page"] = tracking_state["last_offset"] + page_num

    # Update ChromaDB
    chroma_collection.add(
        documents=[combined_result["text"]],
        metadatas=[{"filename": filename, "page_num": combined_result["page_num"], "page_file": combined_result["page_file"], "words": json.dumps(combined_result["words"])}],
        ids=[f"{filename}_page_{combined_result['page_num']}"]
    )
    logger.info(f"Added page {combined_result['page_num']} to ChromaDB")

    return combined_result

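# Shape of the per-page record built by process_page above (values illustrative):
#
#   {"page_num": 3,
#    "text": "...recognized page text...",
#    "words": [{"text": "word", "left": 0, "top": 0, "width": 0, "height": 0}, ...],
#    "page_file": "pages/<filename>_page_3_<timestamp>.pdf",  # or a local path
#    "pdf_page": tracking_state["last_offset"] + page index}
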
# Extract PDF URLs from text
def extract_pdf_urls(text):
    url_pattern = r'(https?://[^\s]+?\.pdf)'
    return re.findall(url_pattern, text)

# Load or initialize tracking state
def load_tracking_state():
    if os.path.exists(TRACKING_FILE):
        with open(TRACKING_FILE, "r") as f:
            return json.load(f)
    return {"processed_urls": {}, "last_offset": 0}

def save_tracking_state(state):
    with open(TRACKING_FILE, "w") as f:
        json.dump(state, f)

# Push to Hugging Face Dataset
def push_to_hf_dataset(new_data):
    try:
        for item in new_data:
            if "url" not in item or not isinstance(item["url"], str):
                logger.error(f"Invalid item in new_data: {item}")
                raise ValueError(f"Each item must have a valid 'url' key; found {item}")

        try:
            dataset = load_dataset(HF_DATASET_REPO, token=HF_TOKEN, cache_dir="/app/cache")
            existing_data = dataset["train"].to_dict()
            logger.info(f"Loaded existing dataset with keys: {list(existing_data.keys())}")
        except Exception as e:
            logger.info(f"No existing dataset found or error loading: {e}, initializing new dataset")
            existing_data = {"filename": [], "pages": [], "url": [], "embedding": [], "processed_at": [], "pdf_page_offset": []}

        required_keys = ["filename", "pages", "url", "embedding", "processed_at", "pdf_page_offset"]
        for key in required_keys:
            if key not in existing_data:
                existing_data[key] = []
                logger.warning(f"Initialized missing key '{key}' in existing_data")

        existing_urls = set(existing_data["url"])
        for item in new_data:
            logger.debug(f"Processing item: {item}")
            if item["url"] not in existing_urls:
                for key in required_keys:
                    existing_data[key].append(item.get(key, None))
                existing_urls.add(item["url"])
                logger.info(f"Added new URL: {item['url']}")
            else:
                idx = existing_data["url"].index(item["url"])
                existing_data["pages"][idx].extend(item["pages"])
                existing_data["embedding"][idx] = item["embedding"]
                existing_data["processed_at"][idx] = item["processed_at"]
                logger.info(f"Updated existing URL: {item['url']}")

        updated_dataset = Dataset.from_dict(existing_data)
        updated_dataset.push_to_hub(HF_DATASET_REPO, token=HF_TOKEN)
        logger.info(f"Successfully appended/updated {len(new_data)} records to {HF_DATASET_REPO}")
    except Exception as e:
        logger.error(f"Failed to push to HF Dataset: {str(e)}")
        raise

# Check if URL is fully processed
def is_url_fully_processed(url, progress_log, total_pages):
    return url in progress_log["urls"] and progress_log["urls"][url]["status"] == "completed" and progress_log["urls"][url]["processed_pages"] >= total_pages

# Process PDF URL with SSE
def process_pdf_url(url, ocr_backend, tracking_state, progress_log, storage_mode):
    filename = url.split("/")[-1]
    try:
        yield f"data: {json.dumps({'status': 'fetching', 'filename': filename})}\n\n"
        logger.info(f"Fetching PDF from {url}")
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        pdf_bytes = response.content
        pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
        total_pages = len(pdf_reader.pages)

        progress_log["urls"].setdefault(url, {"status": "pending", "processed_pages": 0})
        start_page = progress_log["urls"][url]["processed_pages"]

        if is_url_fully_processed(url, progress_log, total_pages):
            yield f"data: {json.dumps({'status': 'skipped', 'filename': filename, 'message': 'URL already fully processed'})}\n\n"
            return

        pages = []
        for page_num in range(start_page, total_pages):
            yield f"data: {json.dumps({'status': 'processing', 'filename': filename, 'page_num': page_num + 1, 'total_pages': total_pages})}\n\n"
            page = process_page(pdf_bytes, page_num, ocr_backend, filename, tracking_state, storage_mode)
            pages.append(page)
            yield f"data: {json.dumps({'filename': filename, 'page': page})}\n\n"
            progress_log["urls"][url]["processed_pages"] = page_num + 1
            save_progress_log(progress_log, storage_mode)

        full_text = "\n\n".join(f"Page {page['page_num']}\n{page['text']}" for page in pages)
        embedding = embedder.encode(full_text).tolist() if full_text.strip() else None
        result = {
            "filename": filename,
            "pages": pages,
            "url": url,
            "embedding": embedding,
            "processed_at": datetime.now().isoformat(),
            "pdf_page_offset": tracking_state["last_offset"]
        }
        if storage_mode == "hf":
            push_to_hf_dataset([result])
        tracking_state["last_offset"] += total_pages - start_page
        progress_log["urls"][url]["status"] = "completed"
        save_tracking_state(tracking_state)
        save_progress_log(progress_log, storage_mode)
        yield f"data: {json.dumps({'status': 'completed', 'filename': filename, 'new_offset': tracking_state['last_offset']})}\n\n"
        logger.info(f"Completed processing {filename} with new offset {tracking_state['last_offset']}")
    except requests.RequestException as e:
        logger.error(f"Failed to fetch PDF from {url}: {e}")
        yield f"data: {json.dumps({'status': 'error', 'filename': filename, 'message': f'Error fetching PDF: {str(e)}'})}\n\n"
    except Exception as e:
        logger.error(f"Error processing {url}: {e}")
        yield f"data: {json.dumps({'status': 'error', 'filename': filename, 'message': f'Error: {str(e)}'})}\n\n"

# Process text content with SSE
def process_text_content(text, filename, ocr_backend, tracking_state, progress_log, storage_mode):
    try:
        pdf_urls = extract_pdf_urls(text)
        processed_urls = [url for url in pdf_urls if url in progress_log["urls"] and progress_log["urls"][url]["status"] == "completed"]
        new_urls = [url for url in pdf_urls if url not in progress_log["urls"] or progress_log["urls"][url]["status"] != "completed"]

        initial_text = (f"Found {len(pdf_urls)} PDF URLs:\n" +
                        f"Already processed: {len(processed_urls)}\n" + "\n".join(processed_urls) + "\n" +
                        f"To process: {len(new_urls)}\n" + "\n".join(new_urls) + "\n\nProcessing...")
        yield f"data: {json.dumps({'status': 'info', 'filename': filename, 'message': initial_text})}\n\n"

        for url in new_urls:
            logger.info(f"Starting processing of {url} with offset {tracking_state['last_offset']}")
            for event in process_pdf_url(url, ocr_backend, tracking_state, progress_log, storage_mode):
                yield event
    except Exception as e:
        logger.error(f"Error processing text content for {filename}: {e}")
        yield f"data: {json.dumps({'status': 'error', 'filename': filename, 'message': f'Error: {str(e)}'})}\n\n"

# Home route
@app.route("/", methods=["GET"])
def index():
    return render_template("index.html")

# Process URL endpoint with GET
@app.route("/process_url", methods=["GET"])
def process_url():
    url = request.args.get("url")
    ocr_backend = request.args.get("ocr_backend", "trocr")
    storage_mode = request.args.get("storage_mode", "hf")

    if not url:
        return jsonify({"error": "No URL provided"}), 400

    tracking_state = load_tracking_state()
    progress_log = load_progress_log(storage_mode)

    def generate():
        logger.info(f"Processing URL: {url} with ocr_backend={ocr_backend}, storage_mode={storage_mode}, starting offset={tracking_state['last_offset']}")
        if url.endswith(".pdf"):
            for event in process_pdf_url(url, ocr_backend, tracking_state, progress_log, storage_mode):
                yield event
        elif url.endswith(".txt"):
            try:
                response = requests.get(url, timeout=10)
                response.raise_for_status()
                text = response.text
                filename = url.split("/")[-1]
                logger.info(f"Fetched text from {url}")
                for event in process_text_content(text, filename, ocr_backend, tracking_state, progress_log, storage_mode):
                    yield event
            except requests.RequestException as e:
                logger.error(f"Failed to fetch text from {url}: {e}")
                yield f"data: {json.dumps({'status': 'error', 'filename': url, 'message': f'Error fetching URL: {str(e)}'})}\n\n"
        else:
            yield f"data: {json.dumps({'status': 'error', 'filename': url, 'message': 'Unsupported URL format. Must end in .pdf or .txt'})}\n\n"
        logger.info(f"Finished processing URL: {url}")

    return Response(generate(), mimetype="text/event-stream")

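# A sketch of consuming the /process_url event stream from a client script
# (assumes the app is reachable at http://localhost:7860 and the target URL ends
# in .pdf or .txt, as required above; the sample URL is illustrative):
#
#   import json, requests
#   params = {"url": "https://example.com/sample.pdf",
#             "ocr_backend": "tesseract", "storage_mode": "local"}
#   with requests.get("http://localhost:7860/process_url", params=params,
#                     stream=True, timeout=None) as resp:
#       for line in resp.iter_lines():
#           if line.startswith(b"data: "):
#               print(json.loads(line[len(b"data: "):]))
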
# Search page
@app.route("/search", methods=["GET"])
def search_page():
    storage_mode = request.args.get("storage_mode", "hf")
    if storage_mode == "hf":
        try:
            dataset = load_dataset(HF_DATASET_REPO, token=HF_TOKEN, cache_dir="/app/cache")["train"]
            files = [{"filename": f, "url": u, "pages": p} for f, u, p in zip(dataset["filename"], dataset["url"], dataset["pages"])]
            return render_template("search.html", files=files, storage_mode=storage_mode)
        except Exception as e:
            logger.error(f"Error loading search page: {e}")
            return render_template("search.html", files=[], error=str(e), storage_mode=storage_mode)
    else:  # local
        files = []
        results = chroma_collection.get()
        for i, metadata in enumerate(results["metadatas"]):
            files.append({
                "filename": metadata["filename"],
                "url": "",
                "pages": [{"page_num": metadata["page_num"], "text": results["documents"][i], "page_file": metadata["page_file"], "words": json.loads(metadata["words"])}]
            })
        return render_template("search.html", files=files, storage_mode=storage_mode)

# Semantic search route
@app.route("/search_documents", methods=["POST"])
def search_documents():
    query = request.form.get("query")
    storage_mode = request.form.get("storage_mode", "hf")
    if not query:
        return jsonify({"error": "No query provided"}), 400

    if storage_mode == "hf":
        try:
            dataset = load_dataset(HF_DATASET_REPO, token=HF_TOKEN, cache_dir="/app/cache")["train"]
            query_embedding = embedder.encode(query).tolist()

            embeddings = [e for e in dataset["embedding"] if e is not None]
            documents = dataset["pages"]
            filenames = dataset["filename"]
            urls = dataset["url"]
            processed_ats = dataset["processed_at"]
            pdf_page_offsets = dataset["pdf_page_offset"]

            similarities = [
                dot(query_embedding, emb) / (norm(query_embedding) * norm(emb)) if norm(emb) != 0 else 0
                for emb in embeddings
            ]

            sorted_indices = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)[:5]
            results = []

            for idx, i in enumerate(sorted_indices):
                pages = documents[i]
                highlighted_pages = []
                for page in pages:
                    words = page["words"]
                    text = page["text"]
                    pdf_page_num = page["pdf_page"]
                    page_file = page["page_file"]
                    page_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{page_file}"
                    response = requests.get(page_url)
                    response.raise_for_status()
                    pdf_bytes = response.content
                    pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')

                    sentences = re.split(r'(?<=[.!?])\s+', text)
                    highlights = []
                    for sent_idx, sentence in enumerate(sentences):
                        sent_embedding = embedder.encode(sentence).tolist()
                        similarity = dot(query_embedding, sent_embedding) / (norm(query_embedding) * norm(sent_embedding)) if norm(sent_embedding) != 0 else 0
                        if similarity > 0.7:
                            matching_words = []
                            sent_words = sentence.split()
                            word_idx = 0
                            for word in words:
                                if word_idx < len(sent_words) and word["text"].lower() in sent_words[word_idx].lower():
                                    matching_words.append(word)
                                    word_idx += 1
                            highlights.append({"sentence": sentence, "index": sent_idx, "words": matching_words})
                    highlighted_pages.append({
                        "page_num": page["page_num"],
                        "text": text,
                        "highlights": highlights,
                        "pdf_page": pdf_page_num,
                        "pdf_data": pdf_base64,
                        "page_url": page_url
                    })
                results.append({
                    "filename": filenames[i],
                    "pages": highlighted_pages,
                    "url": urls[i],
                    "processed_at": processed_ats[i],
                    "similarity": similarities[i],
                    "pdf_page_offset": pdf_page_offsets[i]
                })
            return jsonify({"results": results})
        except Exception as e:
            logger.error(f"Search error: {e}")
            return jsonify({"error": str(e)}), 500
    else:  # local with ChromaDB
        try:
            query_results = chroma_collection.query(query_texts=[query], n_results=5)
            results = []
            for i, doc in enumerate(query_results["documents"][0]):
                metadata = query_results["metadatas"][0][i]
                words = json.loads(metadata["words"])
                text = doc
                sentences = re.split(r'(?<=[.!?])\s+', text)
                highlights = []
                query_embedding = embedder.encode(query).tolist()
                for sent_idx, sentence in enumerate(sentences):
                    sent_embedding = embedder.encode(sentence).tolist()
                    similarity = dot(query_embedding, sent_embedding) / (norm(query_embedding) * norm(sent_embedding)) if norm(sent_embedding) != 0 else 0
                    if similarity > 0.7:
                        matching_words = []
                        sent_words = sentence.split()
                        word_idx = 0
                        for word in words:
                            if word_idx < len(sent_words) and word["text"].lower() in sent_words[word_idx].lower():
                                matching_words.append(word)
                                word_idx += 1
                        highlights.append({"sentence": sentence, "index": sent_idx, "words": matching_words})
                with open(metadata["page_file"], "rb") as f:
                    pdf_bytes = f.read()
                pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')
                results.append({
                    "filename": metadata["filename"],
                    "pages": [{
                        "page_num": metadata["page_num"],
                        "text": text,
                        "highlights": highlights,
                        "pdf_page": metadata["page_num"],
                        "pdf_data": pdf_base64,
                        "page_url": metadata["page_file"]
                    }],
                    "url": "",
                    "processed_at": datetime.now().isoformat(),
                    "similarity": query_results["distances"][0][i]
                })
            return jsonify({"results": results})
        except Exception as e:
            logger.error(f"ChromaDB search error: {e}")
            return jsonify({"error": str(e)}), 500

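# A sketch of querying the semantic search endpoint (assumes the app is running
# locally on port 7860 and that some pages have already been indexed; the query
# string is illustrative):
#
#   import requests
#   resp = requests.post("http://localhost:7860/search_documents",
#                        data={"query": "solar panel efficiency",
#                              "storage_mode": "local"})
#   for hit in resp.json().get("results", []):
#       print(hit["filename"], hit["similarity"])
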
# Download output folder
@app.route("/download_output", methods=["GET"])
def download_output():
    try:
        zip_path = "/app/output.zip"
        shutil.make_archive("/app/output", "zip", OUTPUT_DIR)
        return send_file(zip_path, download_name="output.zip", as_attachment=True, mimetype="application/zip")
    except Exception as e:
        logger.error(f"Error creating zip: {e}")
        return jsonify({"error": str(e)}), 500

# Preview output contents
@app.route("/preview_output", methods=["GET"])
def preview_output():
    try:
        combined_pdf_base64 = ""
        if os.path.exists(COMBINED_PDF_PATH):
            with open(COMBINED_PDF_PATH, "rb") as f:
                combined_pdf_base64 = base64.b64encode(f.read()).decode('utf-8')

        progress_json = {}
        if os.path.exists(PROGRESS_JSON_PATH):
            with open(PROGRESS_JSON_PATH, "r") as f:
                progress_json = json.load(f)

        return jsonify({
            "combined_pdf": combined_pdf_base64,
            "progress_json": progress_json
        })
    except Exception as e:
        logger.error(f"Error previewing output: {e}")
        return jsonify({"error": str(e)}), 500

if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    app.run(host="0.0.0.0", port=port, debug=True)
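# Launching the app directly (one possible invocation, not the only one; assumes
# the Tesseract and Poppler system packages are installed for pytesseract and
# pdf2image, and that HF_TOKEN is set when the "hf" storage mode is used):
#
#   PORT=7860 HF_TOKEN=<token> python app.py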