samyak152002 committed on
Commit
f652e83
·
verified ·
1 Parent(s): 37896f7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -8
app.py CHANGED
@@ -30,21 +30,31 @@ def extract_pdf_text_by_page(file) -> List[str]:
30
 
31
  def extract_pdf_text(file) -> str:
32
  """Extracts full text from a PDF file using PyMuPDF."""
33
- print("me llamo samyak")
34
  try:
35
- # Open the PDF file
36
- # print("me llamo samyak")
37
  doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
38
  full_text = ""
39
- # print(doc)
40
  for page_num, page in enumerate(doc, start=1):
41
- text = page.get_text("text")
42
- full_text += text + "\n"
43
- print(f"Extracted text from page {page_num}: {len(text)} characters.")
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  doc.close()
45
  print(f"Total extracted text length: {len(full_text)} characters.")
46
- # print(full_text)
47
  return full_text
 
48
  except Exception as e:
49
  print(f"Error extracting text from PDF: {e}")
50
  return ""
 
30
 
31
  def extract_pdf_text(file) -> str:
32
  """Extracts full text from a PDF file using PyMuPDF."""
 
33
  try:
 
 
34
  doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
35
  full_text = ""
36
+
37
  for page_num, page in enumerate(doc, start=1):
38
+ # Get text blocks with their coordinates
39
+ blocks = page.get_text("blocks")
40
+ processed_text = ""
41
+
42
+ for block in blocks:
43
+ text = block[4] # The text content is at index 4
44
+
45
+ # Handle line-break hyphens
46
+ text = re.sub(r'(\w+)-\s*\n\s*(\w+)', lambda m: m.group(1) + m.group(2), text)
47
+
48
+ # Preserve regular hyphens within words (e.g., "state-of-the-art")
49
+ processed_text += text + "\n"
50
+
51
+ full_text += processed_text
52
+ print(f"Extracted text from page {page_num}: {len(processed_text)} characters.")
53
+
54
  doc.close()
55
  print(f"Total extracted text length: {len(full_text)} characters.")
 
56
  return full_text
57
+
58
  except Exception as e:
59
  print(f"Error extracting text from PDF: {e}")
60
  return ""