Spaces:
Running
Running
Update lib/read_pdf.py
Browse files- lib/read_pdf.py +48 -0
lib/read_pdf.py
CHANGED
@@ -93,6 +93,54 @@ def extract_and_format_paragraphs(pdf_path):
|
|
93 |
previous_page_text = ""
|
94 |
|
95 |
with pdfplumber.open(pdf_path) as pdf:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
for page in pdf.pages:
|
97 |
page_text = page.extract_text()
|
98 |
if page_text:
|
|
|
93 |
previous_page_text = ""
|
94 |
|
95 |
with pdfplumber.open(pdf_path) as pdf:
|
96 |
+
if "minutes" in os.path.basename(pdf_path).lower():
|
97 |
+
with pdfplumber.open(pdf_path) as pdf:
|
98 |
+
for page_num, page in enumerate(pdf.pages):
|
99 |
+
# Get the page dimensions
|
100 |
+
width = page.width
|
101 |
+
height = page.height
|
102 |
+
|
103 |
+
# Define bounding boxes for left and right columns
|
104 |
+
left_bbox = (0, 0, width / 2, height) # Left column
|
105 |
+
right_bbox = (width / 2, 0, width, height) # Right column
|
106 |
+
|
107 |
+
# Extract text from the left column
|
108 |
+
left_column_text = page.within_bbox(left_bbox).extract_text() or ""
|
109 |
+
# Clean the left column text
|
110 |
+
cleaned_left_text = clean_text(left_column_text)
|
111 |
+
|
112 |
+
# Extract text from the right column
|
113 |
+
right_column_text = page.within_bbox(right_bbox).extract_text() or ""
|
114 |
+
# Clean the right column text
|
115 |
+
cleaned_right_text = clean_text(right_column_text)
|
116 |
+
|
117 |
+
# Handle text from previous page
|
118 |
+
if previous_page_text:
|
119 |
+
# Check if the previous page text ends with punctuation
|
120 |
+
if not is_end_of_sentence(previous_page_text):
|
121 |
+
# Append the current page's left column text to previous page text
|
122 |
+
previous_page_text += " " + cleaned_left_text
|
123 |
+
else:
|
124 |
+
# Add previous page text to full text
|
125 |
+
full_text += previous_page_text + "\n"
|
126 |
+
# Reset previous page text to current left column text
|
127 |
+
previous_page_text = cleaned_left_text
|
128 |
+
else:
|
129 |
+
previous_page_text = cleaned_left_text
|
130 |
+
|
131 |
+
# Process the right column text
|
132 |
+
if previous_page_text:
|
133 |
+
# Check if the text accumulated so far (which now ends with this page's left column) ends with punctuation
|
134 |
+
if not is_end_of_sentence(previous_page_text):
|
135 |
+
# Append the right column text to previous page text
|
136 |
+
previous_page_text += " " + cleaned_right_text
|
137 |
+
else:
|
138 |
+
# Add previous page text to full text
|
139 |
+
full_text += previous_page_text + "\n"
|
140 |
+
# Reset previous page text to current right column text
|
141 |
+
previous_page_text = cleaned_right_text
|
142 |
+
else:
|
143 |
+
previous_page_text = cleaned_right_text
|
144 |
for page in pdf.pages:
|
145 |
page_text = page.extract_text()
|
146 |
if page_text:
|