Update app.py
Browse files
app.py
CHANGED
@@ -31,21 +31,30 @@ os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
|
|
31 |
def extract_pdf_text(file) -> str:
|
32 |
"""Extracts full text from a PDF file using PyMuPDF."""
|
33 |
try:
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
full_text = ""
|
36 |
|
37 |
for page_number in range(len(doc)):
|
38 |
page = doc[page_number]
|
39 |
-
words = page.get_text("word")
|
40 |
full_text += words
|
|
|
41 |
|
42 |
-
# print(full_text)
|
43 |
doc.close()
|
44 |
print(f"Total extracted text length: {len(full_text)} characters.")
|
45 |
return full_text
|
46 |
|
47 |
except Exception as e:
|
48 |
-
print(f"Error extracting text from PDF: {e}")
|
|
|
49 |
return ""
|
50 |
|
51 |
def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
|
@@ -472,4 +481,3 @@ if __name__ == "__main__":
|
|
472 |
# server_name="0.0.0.0",
|
473 |
server_port=None
|
474 |
)
|
475 |
-
|
|
|
31 |
def extract_pdf_text(file) -> str:
    """Extract the full plain text of a PDF using PyMuPDF.

    Args:
        file: Either a filesystem path (``str``) to a PDF, or an object
            with a ``read()`` method whose result is passed to PyMuPDF as
            the raw PDF stream.

    Returns:
        The text of all pages concatenated in page order, or an empty
        string if anything goes wrong while opening or reading the PDF
        (the error and its traceback are printed in that case).
    """
    # Imported locally so the error path below works regardless of which
    # module-level imports the rest of the file provides.
    import traceback

    try:
        print(f"Opening PDF file: {file}")
        if isinstance(file, str):
            print(f"Opening file by path: {file}")
            doc = fitz.open(file)
        else:
            print("Opening file from stream")
            doc = fitz.open(stream=file.read(), filetype="pdf")

        # Once the document is open, always release it, even if a page
        # fails to extract part-way through.
        try:
            print(f"PDF opened successfully with {len(doc)} pages")
            page_texts = []
            for page_number, page in enumerate(doc, start=1):
                # "text" is PyMuPDF's plain-text mode. The previous "word"
                # literal is not a documented option; the similarly named
                # "words" mode returns a list of tuples, which could not be
                # concatenated into a string.
                page_text = page.get_text("text")
                page_texts.append(page_text)
                print(f"Extracted {len(page_text)} characters from page {page_number}")
        finally:
            doc.close()

        full_text = "".join(page_texts)
        print(f"Total extracted text length: {len(full_text)} characters.")
        return full_text

    except Exception as e:
        # Best-effort contract: callers get "" on failure, with the details
        # printed for debugging.
        print(f"Error extracting text from PDF: {str(e)}")
        print(traceback.format_exc())
        return ""
|
59 |
|
60 |
def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
|
|
|
481 |
# server_name="0.0.0.0",
|
482 |
server_port=None
|
483 |
)
|
|