samyak152002 committed on
Commit
d3509b9
·
verified ·
1 Parent(s): 09c8002

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -5
app.py CHANGED
@@ -31,21 +31,30 @@ os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
31
  def extract_pdf_text(file) -> str:
32
  """Extracts full text from a PDF file using PyMuPDF."""
33
  try:
34
- doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
 
 
 
 
 
 
 
 
35
  full_text = ""
36
 
37
  for page_number in range(len(doc)):
38
  page = doc[page_number]
39
- words = page.get_text("word")
40
  full_text += words
 
41
 
42
- # print(full_text)
43
  doc.close()
44
  print(f"Total extracted text length: {len(full_text)} characters.")
45
  return full_text
46
 
47
  except Exception as e:
48
- print(f"Error extracting text from PDF: {e}")
 
49
  return ""
50
 
51
  def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
@@ -472,4 +481,3 @@ if __name__ == "__main__":
472
  # server_name="0.0.0.0",
473
  server_port=None
474
  )
475
-
 
31
  def extract_pdf_text(file) -> str:
32
  """Extracts full text from a PDF file using PyMuPDF."""
33
  try:
34
+ print(f"Opening PDF file: {file}")
35
+ if isinstance(file, str):
36
+ print(f"Opening file by path: {file}")
37
+ doc = fitz.open(file)
38
+ else:
39
+ print(f"Opening file from stream")
40
+ doc = fitz.open(stream=file.read(), filetype="pdf")
41
+
42
+ print(f"PDF opened successfully with {len(doc)} pages")
43
  full_text = ""
44
 
45
  for page_number in range(len(doc)):
46
  page = doc[page_number]
47
+ words = page.get_text("word") # Change to "text" instead of "word"
48
  full_text += words
49
+ print(f"Extracted {len(words)} characters from page {page_number+1}")
50
 
 
51
  doc.close()
52
  print(f"Total extracted text length: {len(full_text)} characters.")
53
  return full_text
54
 
55
  except Exception as e:
56
+ print(f"Error extracting text from PDF: {str(e)}")
57
+ print(traceback.format_exc())
58
  return ""
59
 
60
  def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
 
481
  # server_name="0.0.0.0",
482
  server_port=None
483
  )