Update app.py
Browse files
app.py
CHANGED
@@ -30,16 +30,20 @@ def extract_pdf_text_by_page(file) -> List[str]:
|
|
30 |
|
31 |
def extract_pdf_text(file) -> str:
|
32 |
"""Extracts full text from a PDF file using PyMuPDF."""
|
|
|
33 |
try:
|
34 |
# Open the PDF file
|
|
|
35 |
doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
|
36 |
full_text = ""
|
|
|
37 |
for page_num, page in enumerate(doc, start=1):
|
38 |
text = page.get_text("text")
|
39 |
full_text += text + "\n"
|
40 |
print(f"Extracted text from page {page_num}: {len(text)} characters.")
|
41 |
doc.close()
|
42 |
print(f"Total extracted text length: {len(full_text)} characters.")
|
|
|
43 |
return full_text
|
44 |
except Exception as e:
|
45 |
print(f"Error extracting text from PDF: {e}")
|
@@ -346,12 +350,10 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
|
|
346 |
# Main Analysis Function
|
347 |
# ------------------------------
|
348 |
|
349 |
-
def analyze_pdf(
|
350 |
"""Analyzes the PDF for language issues and returns results and annotated PDF."""
|
351 |
try:
|
352 |
-
|
353 |
-
file.seek(0)
|
354 |
-
full_text = extract_pdf_text(file)
|
355 |
if not full_text:
|
356 |
return {"error": "Failed to extract text from PDF."}, None
|
357 |
|
@@ -360,9 +362,7 @@ def analyze_pdf(file) -> Tuple[Dict[str, Any], bytes]:
|
|
360 |
return language_issues, None
|
361 |
|
362 |
issues = language_issues.get("issues", [])
|
363 |
-
|
364 |
-
file.seek(0)
|
365 |
-
annotated_pdf = highlight_issues_in_pdf(file, issues) if issues else None
|
366 |
return language_issues, annotated_pdf
|
367 |
except Exception as e:
|
368 |
return {"error": str(e)}, None
|
@@ -375,36 +375,45 @@ def process_upload(file):
|
|
375 |
"""
|
376 |
Process the uploaded PDF file and return analysis results and annotated PDF.
|
377 |
"""
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
401 |
|
402 |
-
except Exception as e:
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
|
409 |
|
410 |
def create_interface():
|
@@ -416,7 +425,7 @@ def create_interface():
|
|
416 |
file_input = gr.File(
|
417 |
label="Upload PDF",
|
418 |
file_types=[".pdf"],
|
419 |
-
type="binary"
|
420 |
)
|
421 |
|
422 |
with gr.Row():
|
|
|
30 |
|
31 |
def extract_pdf_text(file) -> str:
|
32 |
"""Extracts full text from a PDF file using PyMuPDF."""
|
33 |
+
print("me llamo samyak")
|
34 |
try:
|
35 |
# Open the PDF file
|
36 |
+
print("me llamo samyak")
|
37 |
doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
|
38 |
full_text = ""
|
39 |
+
print(doc)
|
40 |
for page_num, page in enumerate(doc, start=1):
|
41 |
text = page.get_text("text")
|
42 |
full_text += text + "\n"
|
43 |
print(f"Extracted text from page {page_num}: {len(text)} characters.")
|
44 |
doc.close()
|
45 |
print(f"Total extracted text length: {len(full_text)} characters.")
|
46 |
+
print(full_text)
|
47 |
return full_text
|
48 |
except Exception as e:
|
49 |
print(f"Error extracting text from PDF: {e}")
|
|
|
350 |
# Main Analysis Function
|
351 |
# ------------------------------
|
352 |
|
353 |
+
def analyze_pdf(filepath: str) -> Tuple[Dict[str, Any], bytes]:
|
354 |
"""Analyzes the PDF for language issues and returns results and annotated PDF."""
|
355 |
try:
|
356 |
+
full_text = extract_pdf_text(filepath)
|
|
|
|
|
357 |
if not full_text:
|
358 |
return {"error": "Failed to extract text from PDF."}, None
|
359 |
|
|
|
362 |
return language_issues, None
|
363 |
|
364 |
issues = language_issues.get("issues", [])
|
365 |
+
annotated_pdf = highlight_issues_in_pdf(filepath, issues) if issues else None
|
|
|
|
|
366 |
return language_issues, annotated_pdf
|
367 |
except Exception as e:
|
368 |
return {"error": str(e)}, None
|
|
|
375 |
"""
|
376 |
Process the uploaded PDF file and return analysis results and annotated PDF.
|
377 |
"""
|
378 |
+
# print(file.name)
|
379 |
+
if file is None:
|
380 |
+
return json.dumps({"error": "No file uploaded"}, indent=2), None
|
381 |
+
|
382 |
+
# # Create a temporary file to work with
|
383 |
+
|
384 |
+
# with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_input:
|
385 |
+
# temp_input.write(file)
|
386 |
+
# temp_input_path = temp_input.name
|
387 |
+
# print(temp_input_path)
|
388 |
+
|
389 |
+
temp_input = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
|
390 |
+
temp_input.write(file)
|
391 |
+
temp_input_path = temp_input.name
|
392 |
+
print(temp_input_path)
|
393 |
+
# Analyze the PDF
|
394 |
+
|
395 |
+
results, annotated_pdf = analyze_pdf(temp_input_path)
|
396 |
+
|
397 |
+
print(results)
|
398 |
+
results_json = json.dumps(results, indent=2)
|
399 |
+
|
400 |
+
# Clean up the temporary input file
|
401 |
+
os.unlink(temp_input_path)
|
402 |
+
|
403 |
+
# If we have an annotated PDF, save it temporarily
|
404 |
+
if annotated_pdf:
|
405 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
|
406 |
+
tmp_file.write(annotated_pdf)
|
407 |
+
return results_json, tmp_file.name
|
408 |
+
|
409 |
+
return results_json, None
|
410 |
|
411 |
+
# except Exception as e:
|
412 |
+
# error_message = json.dumps({
|
413 |
+
# "error": str(e),
|
414 |
+
# "traceback": traceback.format_exc()
|
415 |
+
# }, indent=2)
|
416 |
+
# return error_message, None
|
417 |
|
418 |
|
419 |
def create_interface():
|
|
|
425 |
file_input = gr.File(
|
426 |
label="Upload PDF",
|
427 |
file_types=[".pdf"],
|
428 |
+
type="binary"
|
429 |
)
|
430 |
|
431 |
with gr.Row():
|