Spaces:

Cachoups
/

FinanceReport

Sleeping

App Files Files Community

Cachoups committed on Sep 12, 2024

Commit

3e45198

verified ·

1 Parent(s): 16546eb

Upload read_pdf.py

Browse files

Files changed (1) hide show

lib/read_pdf.py +191 -0

lib/read_pdf.py ADDED Viewed

	@@ -0,0 +1,191 @@

+import pdfplumber
+import re
# Extract body text as newline-delimited paragraphs, dropping chart sections,
# headers, footers and footnotes.

# Lines starting with one of these markers are treated as page headers.
_HEADER_RE = re.compile(r"^(ECB-PUBLIC|Title|Document|Header)", re.IGNORECASE)
# Lines starting with one of these markers are treated as page footers.
_FOOTER_RE = re.compile(
    r"^(Page \d+ of \d+|Footer|Document|Note:|Source:"
    r"|the 75th and 25th percentiles|\|\d+)",
    re.IGNORECASE,
)
# A footnote line starts with "<digits> <single digit> ".
_FOOTNOTE_RE = re.compile(r"^\d+ \d{1} ", re.IGNORECASE)
# A line starting with "Chart" opens a section that is dropped ...
_CHART_START_RE = re.compile(r"^Chart", re.IGNORECASE)
# ... until a line starting with "Source:" or "Note:" closes it.
_CHART_END_RE = re.compile(r"^(Source:|Note:)", re.IGNORECASE)
# An abbreviation followed by a period; the period must not count as the end
# of a sentence.
_ABBREVIATION_PERIOD_RE = re.compile(
    r"(\b(?:e\.g|i\.e|a\.m|p\.m|U\.S|J\.R\.R|Dr|Mr|Ms|Mrs|Jr|Sr)\b)\."
)
# Sentence-ending punctuation at the very end of the text.
_SENTENCE_END_RE = re.compile(r"[.!?]\s*$")


def _strip_abbreviation_periods(text):
    """Return *text* with the period after each known abbreviation removed."""
    return _ABBREVIATION_PERIOD_RE.sub(r"\1", text)


def _ends_sentence(text):
    """Return True if *text* ends with '.', '!' or '?' that is not an abbreviation period."""
    return bool(_SENTENCE_END_RE.search(_strip_abbreviation_periods(text.strip())))


def _clean_page_text(text):
    """Filter one page of extracted text and re-flow its lines into paragraphs.

    Chart sections, footnotes, header lines, footer lines and blank lines are
    dropped. A remaining line is glued (with a single space) onto the previous
    paragraph when that paragraph does not yet end a sentence; otherwise it
    starts a new paragraph. Paragraphs are joined with newlines.
    """
    paragraphs = []
    in_chart_section = False
    in_footnote = False

    for line in text.split("\n"):
        # Patterns are matched against the raw line, so leading whitespace
        # prevents a match (same as the anchored patterns always did).
        if _CHART_START_RE.match(line):
            in_chart_section = True
        if in_chart_section and _CHART_END_RE.match(line):
            in_chart_section = False
            continue

        if _FOOTNOTE_RE.match(line):
            # Keep skipping only if the footnote continues on following lines.
            # A one-line footnote that already ends its sentence must not
            # swallow the body text that comes after it.
            in_footnote = not _ends_sentence(line)
            continue
        if in_footnote:
            if _ends_sentence(line):
                in_footnote = False
            continue

        if in_chart_section or _HEADER_RE.match(line) or _FOOTER_RE.match(line):
            continue

        stripped = line.strip()
        if not stripped:
            continue

        if paragraphs and not _ends_sentence(paragraphs[-1]):
            # The previous paragraph is still mid-sentence: this is a continuation.
            paragraphs[-1] += " " + stripped
        else:
            paragraphs.append(stripped)

    return "\n".join(paragraphs)


def extract_and_format_paragraphs(pdf_path):
    """Extract the text of a PDF as newline-separated paragraphs.

    Each page is read with pdfplumber's ``extract_text`` and cleaned of chart
    sections, footnotes, headers and footers. Text that is cut mid-sentence by
    a page break is joined with the text of the following page.

    Args:
        pdf_path: Value handed unchanged to ``pdfplumber.open``.

    Returns:
        The cleaned text, with leading and trailing whitespace removed.
    """
    finished_chunks = []
    pending = ""  # cleaned text carried over until we know it ends a sentence

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if not page_text:
                continue

            cleaned = _clean_page_text(page_text)
            if not cleaned:
                # Everything on this page was filtered out; skipping it avoids
                # inserting a stray space into the carried-over text.
                continue

            if not pending:
                pending = cleaned
            elif _ends_sentence(pending):
                finished_chunks.append(pending)
                pending = cleaned
            else:
                # The sentence continues across the page break.
                pending += " " + cleaned

    if pending:
        finished_chunks.append(pending)

    return "\n".join(finished_chunks).strip()
# Cleaning: cut unnecessary material (such as the introduction and annexes)
# by locating the keywords that delimit the relevant part of the text.
def find_text_range(text, start_keyword, end_keywords):
    """Locate the span between a start keyword and the nearest end keyword.

    Matching is case-insensitive. End keywords are only searched for *after*
    the start keyword, so an earlier mention of an end keyword (for example in
    a table of contents) cannot produce an end index that precedes the start.

    Args:
        text: The text to search.
        start_keyword: Keyword whose first occurrence marks the start.
        end_keywords: Iterable of keywords; the earliest occurrence of any of
            them after the start keyword marks the end. Empty keywords are
            ignored.

    Returns:
        A ``(start_index, end_index)`` tuple. ``end_index`` is ``len(text)``
        when no end keyword is found after the start keyword.

    Raises:
        ValueError: If ``start_keyword`` does not occur in ``text``.
    """
    lowered = text.lower()  # lower-case once instead of once per keyword

    start_index = lowered.find(start_keyword.lower())
    if start_index == -1:
        raise ValueError(f"Start keyword '{start_keyword}' not found in the text.")

    search_from = start_index + len(start_keyword)
    end_index = len(text)  # default: run to the end of the text
    for end_keyword in end_keywords:
        if not end_keyword:
            # An empty keyword matches everywhere and would collapse the range.
            continue
        keyword_index = lowered.find(end_keyword.lower(), search_from)
        if keyword_index != -1 and keyword_index < end_index:
            end_index = keyword_index

    return start_index, end_index
def extract_relevant_text(text, start_index, end_index):
    """Return the slice ``text[start_index:end_index]`` with surrounding whitespace removed."""
    selected = text[slice(start_index, end_index)]
    return selected.strip()
# Split the extracted text into a list of paragraphs.
def split_text_into_paragraphs(extracted_text, min_length):
    """Split text into paragraphs and merge single-sentence paragraphs forward.

    The text is split on runs of newlines. Paragraphs whose stripped length is
    not greater than ``min_length`` are discarded. Each remaining paragraph
    that consists of a single sentence is then joined (with one space) to the
    paragraph that follows it, and that following paragraph is consumed.

    Args:
        extracted_text: The text to split.
        min_length: Paragraphs must be strictly longer than this many
            characters (after stripping) to be kept.

    Returns:
        A list of paragraph strings.
    """
    # Strip once and filter in the same pass.
    candidates = [
        stripped
        for stripped in (p.strip() for p in re.split(r"\n+", extracted_text.strip()))
        if len(stripped) > min_length
    ]

    merged_paragraphs = []
    i = 0
    while i < len(candidates):
        para = candidates[i]
        if not para:
            # Only reachable with a negative min_length; skip empty entries.
            i += 1
            continue

        # A sentence boundary is whitespace preceded by '.', '!' or '?'.
        is_single_sentence = len(re.split(r"(?<=[.!?])\s+", para)) == 1
        next_para = candidates[i + 1] if i + 1 < len(candidates) else ""

        if is_single_sentence and next_para:
            merged_paragraphs.append(para + " " + next_para)
            i += 2  # the following paragraph has been consumed by the merge
        else:
            merged_paragraphs.append(para)
            i += 1

    return merged_paragraphs