Spaces:

Cachoups
/

FinanceReport

Sleeping

App Files Community

Cachoups committed on Sep 25, 2024

Commit

fa09874

verified ·

1 Parent(s): 1c10bf3

Update lib/read_pdf.py

Browse files

Files changed (1) hide show

lib/read_pdf.py +48 -0

lib/read_pdf.py CHANGED Viewed

@@ -93,6 +93,54 @@ def extract_and_format_paragraphs(pdf_path):
     previous_page_text = ""
     with pdfplumber.open(pdf_path) as pdf:
         for page in pdf.pages:
             page_text = page.extract_text()
             if page_text:

     previous_page_text = ""
     with pdfplumber.open(pdf_path) as pdf:
+        if "minutes" in os.path.basename(pdf_path).lower():
+            with pdfplumber.open(pdf_path) as pdf:
+                for page_num, page in enumerate(pdf.pages):
+                    # Get the page dimensions
+                    width = page.width
+                    height = page.height
+                    # Define bounding boxes for left and right columns
+                    left_bbox = (0, 0, width / 2, height)  # Left column
+                    right_bbox = (width / 2, 0, width, height)  # Right column
+                    # Extract text from the left column
+                    left_column_text = page.within_bbox(left_bbox).extract_text() or ""
+                    # Clean the left column text
+                    cleaned_left_text = clean_text(left_column_text)
+                    # Extract text from the right column
+                    right_column_text = page.within_bbox(right_bbox).extract_text() or ""
+                    # Clean the right column text
+                    cleaned_right_text = clean_text(right_column_text)
+                    # Handle text from previous page
+                    if previous_page_text:
+                        # Check if the previous page text ends with punctuation
+                        if not is_end_of_sentence(previous_page_text):
+                            # Append the current page's left column text to previous page text
+                            previous_page_text += " " + cleaned_left_text
+                        else:
+                            # Add previous page text to full text
+                            full_text += previous_page_text + "\n"
+                            # Reset previous page text to current left column text
+                            previous_page_text = cleaned_left_text
+                    else:
+                        previous_page_text = cleaned_left_text
+                    # Process the right column text
+                    if previous_page_text:
+                        # Check if the previous page text ends with punctuation
+                        if not is_end_of_sentence(previous_page_text):
+                            # Append the right column text to previous page text
+                            previous_page_text += " " + cleaned_right_text
+                        else:
+                            # Add previous page text to full text
+                            full_text += previous_page_text + "\n"
+                            # Reset previous page text to current right column text
+                            previous_page_text = cleaned_right_text
+                    else:
+                        previous_page_text = cleaned_right_text
         for page in pdf.pages:
             page_text = page.extract_text()
             if page_text: