samyak152002 committed on
Commit
4dd18db
·
verified ·
1 Parent(s): e444d56

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -37
app.py CHANGED
@@ -30,16 +30,20 @@ def extract_pdf_text_by_page(file) -> List[str]:
30
 
31
  def extract_pdf_text(file) -> str:
32
  """Extracts full text from a PDF file using PyMuPDF."""
 
33
  try:
34
  # Open the PDF file
 
35
  doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
36
  full_text = ""
 
37
  for page_num, page in enumerate(doc, start=1):
38
  text = page.get_text("text")
39
  full_text += text + "\n"
40
  print(f"Extracted text from page {page_num}: {len(text)} characters.")
41
  doc.close()
42
  print(f"Total extracted text length: {len(full_text)} characters.")
 
43
  return full_text
44
  except Exception as e:
45
  print(f"Error extracting text from PDF: {e}")
@@ -346,12 +350,10 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
346
  # Main Analysis Function
347
  # ------------------------------
348
 
349
- def analyze_pdf(file) -> Tuple[Dict[str, Any], bytes]:
350
  """Analyzes the PDF for language issues and returns results and annotated PDF."""
351
  try:
352
- # Reset file pointer before reading
353
- file.seek(0)
354
- full_text = extract_pdf_text(file)
355
  if not full_text:
356
  return {"error": "Failed to extract text from PDF."}, None
357
 
@@ -360,9 +362,7 @@ def analyze_pdf(file) -> Tuple[Dict[str, Any], bytes]:
360
  return language_issues, None
361
 
362
  issues = language_issues.get("issues", [])
363
- # Reset file pointer before highlighting
364
- file.seek(0)
365
- annotated_pdf = highlight_issues_in_pdf(file, issues) if issues else None
366
  return language_issues, annotated_pdf
367
  except Exception as e:
368
  return {"error": str(e)}, None
@@ -375,36 +375,45 @@ def process_upload(file):
375
  """
376
  Process the uploaded PDF file and return analysis results and annotated PDF.
377
  """
378
- try:
379
- if file is None:
380
- return json.dumps({"error": "No file uploaded"}, indent=2), None
381
-
382
- # Create a temporary file to work with
383
- with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_input:
384
- temp_input.write(file)
385
- temp_input_path = temp_input.name
386
-
387
- # Analyze the PDF
388
- results, annotated_pdf = analyze_pdf(temp_input_path)
389
- results_json = json.dumps(results, indent=2)
390
-
391
- # Clean up the temporary input file
392
- os.unlink(temp_input_path)
393
-
394
- # If we have an annotated PDF, save it temporarily
395
- if annotated_pdf:
396
- with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
397
- tmp_file.write(annotated_pdf)
398
- return results_json, tmp_file.name
399
-
400
- return results_json, None
 
 
 
 
 
 
 
 
 
401
 
402
- except Exception as e:
403
- error_message = json.dumps({
404
- "error": str(e),
405
- "traceback": traceback.format_exc()
406
- }, indent=2)
407
- return error_message, None
408
 
409
 
410
  def create_interface():
@@ -416,7 +425,7 @@ def create_interface():
416
  file_input = gr.File(
417
  label="Upload PDF",
418
  file_types=[".pdf"],
419
- type="binary" # Changed from "file" to "binary"
420
  )
421
 
422
  with gr.Row():
 
30
 
31
  def extract_pdf_text(file) -> str:
32
  """Extracts full text from a PDF file using PyMuPDF."""
33
+ print("me llamo samyak")
34
  try:
35
  # Open the PDF file
36
+ print("me llamo samyak")
37
  doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
38
  full_text = ""
39
+ print(doc)
40
  for page_num, page in enumerate(doc, start=1):
41
  text = page.get_text("text")
42
  full_text += text + "\n"
43
  print(f"Extracted text from page {page_num}: {len(text)} characters.")
44
  doc.close()
45
  print(f"Total extracted text length: {len(full_text)} characters.")
46
+ print(full_text)
47
  return full_text
48
  except Exception as e:
49
  print(f"Error extracting text from PDF: {e}")
 
350
  # Main Analysis Function
351
  # ------------------------------
352
 
353
+ def analyze_pdf(filepath: str) -> Tuple[Dict[str, Any], bytes]:
354
  """Analyzes the PDF for language issues and returns results and annotated PDF."""
355
  try:
356
+ full_text = extract_pdf_text(filepath)
 
 
357
  if not full_text:
358
  return {"error": "Failed to extract text from PDF."}, None
359
 
 
362
  return language_issues, None
363
 
364
  issues = language_issues.get("issues", [])
365
+ annotated_pdf = highlight_issues_in_pdf(filepath, issues) if issues else None
 
 
366
  return language_issues, annotated_pdf
367
  except Exception as e:
368
  return {"error": str(e)}, None
 
375
  """
376
  Process the uploaded PDF file and return analysis results and annotated PDF.
377
  """
378
+ # print(file.name)
379
+ if file is None:
380
+ return json.dumps({"error": "No file uploaded"}, indent=2), None
381
+
382
+ # # Create a temporary file to work with
383
+
384
+ # with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_input:
385
+ # temp_input.write(file)
386
+ # temp_input_path = temp_input.name
387
+ # print(temp_input_path)
388
+
389
+ temp_input = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
390
+ temp_input.write(file)
391
+ temp_input_path = temp_input.name
392
+ print(temp_input_path)
393
+ # Analyze the PDF
394
+
395
+ results, annotated_pdf = analyze_pdf(temp_input_path)
396
+
397
+ print(results)
398
+ results_json = json.dumps(results, indent=2)
399
+
400
+ # Clean up the temporary input file
401
+ os.unlink(temp_input_path)
402
+
403
+ # If we have an annotated PDF, save it temporarily
404
+ if annotated_pdf:
405
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
406
+ tmp_file.write(annotated_pdf)
407
+ return results_json, tmp_file.name
408
+
409
+ return results_json, None
410
 
411
+ # except Exception as e:
412
+ # error_message = json.dumps({
413
+ # "error": str(e),
414
+ # "traceback": traceback.format_exc()
415
+ # }, indent=2)
416
+ # return error_message, None
417
 
418
 
419
  def create_interface():
 
425
  file_input = gr.File(
426
  label="Upload PDF",
427
  file_types=[".pdf"],
428
+ type="binary"
429
  )
430
 
431
  with gr.Row():