samyak152002 committed on
Commit
184c6f9
·
verified ·
1 Parent(s): 2859d26

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -12
app.py CHANGED
@@ -29,34 +29,55 @@ os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
29
  # return [page.get_text("text") for page in doc]
30
 
31
  def extract_pdf_text(file) -> str:
32
- """Extracts full text from a PDF file using PyMuPDF."""
33
  try:
34
  print(f"Opening PDF file: {file}")
 
 
 
35
  if isinstance(file, str):
36
  print(f"Opening file by path: {file}")
37
- doc = fitz.open(file)
38
  else:
39
  print(f"Opening file from stream")
40
- doc = fitz.open(stream=file.read(), filetype="pdf")
41
-
42
- print(f"PDF opened successfully with {len(doc)} pages")
43
- full_text = ""
 
 
 
44
 
45
- for page_number in range(len(doc)):
46
- page = doc[page_number]
47
- words = page.get_text("markdown") # Change to "text" instead of "word"
48
- full_text += words
49
- print(f"Extracted {len(words)} characters from page {page_number+1}")
50
-
51
  doc.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  print(f"Total extracted text length: {len(full_text)} characters.")
 
53
  return full_text
54
 
55
  except Exception as e:
56
  print(f"Error extracting text from PDF: {str(e)}")
 
57
  print(traceback.format_exc())
58
  return ""
59
 
 
60
  def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
61
  """Checks for the presence of required terms in the text."""
62
  return {term: term.lower() in full_text.lower() for term in search_terms}
 
29
  # return [page.get_text("text") for page in doc]
30
 
31
  def extract_pdf_text(file) -> str:
32
+ """Extracts full text from a PDF file using PyMuPDF4LLM."""
33
  try:
34
  print(f"Opening PDF file: {file}")
35
+
36
+ # Handle file path vs stream
37
+ temp_file_path = None
38
  if isinstance(file, str):
39
  print(f"Opening file by path: {file}")
40
+ file_path = file
41
  else:
42
  print(f"Opening file from stream")
43
+ import tempfile
44
+ import os
45
+ temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
46
+ temp_file_path = temp_file.name
47
+ temp_file.write(file.read())
48
+ temp_file.close()
49
+ file_path = temp_file_path
50
 
51
+ # Get page count with PyMuPDF for logging purposes
52
+ doc = fitz.open(file_path)
53
+ page_count = len(doc)
 
 
 
54
  doc.close()
55
+ print(f"PDF opened successfully with {page_count} pages")
56
+
57
+ # Process with pymupdf4llm
58
+ import pymupdf4llm
59
+ full_text = pymupdf4llm.to_markdown(file_path)
60
+
61
+ # Log extraction info for each page (approximating per-page counts)
62
+ avg_chars_per_page = len(full_text) // page_count if page_count > 0 else 0
63
+ for page_number in range(page_count):
64
+ print(f"Extracted {avg_chars_per_page} characters from page {page_number+1}")
65
+
66
+ # Clean up temporary file if created
67
+ if temp_file_path:
68
+ os.remove(temp_file_path)
69
+
70
  print(f"Total extracted text length: {len(full_text)} characters.")
71
+ print(full_text)
72
  return full_text
73
 
74
  except Exception as e:
75
  print(f"Error extracting text from PDF: {str(e)}")
76
+ import traceback
77
  print(traceback.format_exc())
78
  return ""
79
 
80
+
81
  def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
82
  """Checks for the presence of required terms in the text."""
83
  return {term: term.lower() in full_text.lower() for term in search_terms}