Update app.py
Browse files
app.py
CHANGED
@@ -30,21 +30,31 @@ def extract_pdf_text_by_page(file) -> List[str]:
|
|
30 |
|
31 |
def extract_pdf_text(file) -> str:
|
32 |
"""Extracts full text from a PDF file using PyMuPDF."""
|
33 |
-
print("me llamo samyak")
|
34 |
try:
|
35 |
-
# Open the PDF file
|
36 |
-
# print("me llamo samyak")
|
37 |
doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
|
38 |
full_text = ""
|
39 |
-
|
40 |
for page_num, page in enumerate(doc, start=1):
|
41 |
-
text
|
42 |
-
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
doc.close()
|
45 |
print(f"Total extracted text length: {len(full_text)} characters.")
|
46 |
-
# print(full_text)
|
47 |
return full_text
|
|
|
48 |
except Exception as e:
|
49 |
print(f"Error extracting text from PDF: {e}")
|
50 |
return ""
|
|
|
30 |
|
31 |
def extract_pdf_text(file) -> str:
    """Extract the full text of a PDF file using PyMuPDF.

    Each page is read block by block via ``page.get_text("blocks")``. Inside
    every block, a word that was split across a line break with a trailing
    hyphen (``"exam-\\nple"``) is re-joined (``"example"``). Hyphens that are
    not followed by a line break (e.g. "state-of-the-art") are left untouched.

    Args:
        file: Either a ``str`` path to a PDF, or any object exposing a
            ``read()`` method that returns the PDF content.

    Returns:
        The text of all pages concatenated, with a newline appended after
        every block. Returns ``""`` if anything goes wrong; the error is
        printed rather than raised.
    """
    try:
        # Compiled once instead of once per block. Kept inside the ``try`` so
        # that any failure here is still reported through the same path.
        line_break_hyphen = re.compile(r'(\w+)-\s*\n\s*(\w+)')

        if isinstance(file, str):
            doc = fitz.open(file)
        else:
            doc = fitz.open(stream=file.read(), filetype="pdf")

        page_texts = []
        try:
            for page_num, page in enumerate(doc, start=1):
                # Index 4 of each block tuple holds the block's text content.
                processed_text = "".join(
                    line_break_hyphen.sub(r'\1\2', block[4]) + "\n"
                    for block in page.get_text("blocks")
                )
                page_texts.append(processed_text)
                print(f"Extracted text from page {page_num}: {len(processed_text)} characters.")
        finally:
            # Always release the document, even when a page fails to parse.
            doc.close()

        # Join once at the end rather than growing a string with ``+=``.
        full_text = "".join(page_texts)
        print(f"Total extracted text length: {len(full_text)} characters.")
        return full_text
    except Exception as e:
        # Deliberate best-effort contract: report the problem and hand back
        # an empty string so callers never have to handle an exception.
        print(f"Error extracting text from PDF: {e}")
        return ""
|