Cachoups committed on
Commit
fa09874
·
verified ·
1 Parent(s): 1c10bf3

Update lib/read_pdf.py

Browse files
Files changed (1) hide show
  1. lib/read_pdf.py +48 -0
lib/read_pdf.py CHANGED
@@ -93,6 +93,54 @@ def extract_and_format_paragraphs(pdf_path):
93
  previous_page_text = ""
94
 
95
  with pdfplumber.open(pdf_path) as pdf:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  for page in pdf.pages:
97
  page_text = page.extract_text()
98
  if page_text:
 
93
  previous_page_text = ""
94
 
95
  with pdfplumber.open(pdf_path) as pdf:
96
+ if "minutes" in os.path.basename(pdf_path).lower():
97
+ with pdfplumber.open(pdf_path) as pdf:
98
+ for page_num, page in enumerate(pdf.pages):
99
+ # Get the page dimensions
100
+ width = page.width
101
+ height = page.height
102
+
103
+ # Define bounding boxes for left and right columns
104
+ left_bbox = (0, 0, width / 2, height) # Left column
105
+ right_bbox = (width / 2, 0, width, height) # Right column
106
+
107
+ # Extract text from the left column
108
+ left_column_text = page.within_bbox(left_bbox).extract_text() or ""
109
+ # Clean the left column text
110
+ cleaned_left_text = clean_text(left_column_text)
111
+
112
+ # Extract text from the right column
113
+ right_column_text = page.within_bbox(right_bbox).extract_text() or ""
114
+ # Clean the right column text
115
+ cleaned_right_text = clean_text(right_column_text)
116
+
117
+ # Handle text from previous page
118
+ if previous_page_text:
119
+ # Check if the previous page text ends with punctuation
120
+ if not is_end_of_sentence(previous_page_text):
121
+ # Append the current page's left column text to previous page text
122
+ previous_page_text += " " + cleaned_left_text
123
+ else:
124
+ # Add previous page text to full text
125
+ full_text += previous_page_text + "\n"
126
+ # Reset previous page text to current left column text
127
+ previous_page_text = cleaned_left_text
128
+ else:
129
+ previous_page_text = cleaned_left_text
130
+
131
+ # Process the right column text
132
+ if previous_page_text:
133
+ # Check if the previous page text ends with punctuation
134
+ if not is_end_of_sentence(previous_page_text):
135
+ # Append the right column text to previous page text
136
+ previous_page_text += " " + cleaned_right_text
137
+ else:
138
+ # Add previous page text to full text
139
+ full_text += previous_page_text + "\n"
140
+ # Reset previous page text to current right column text
141
+ previous_page_text = cleaned_right_text
142
+ else:
143
+ previous_page_text = cleaned_right_text
144
  for page in pdf.pages:
145
  page_text = page.extract_text()
146
  if page_text: