Spaces:

Cachoups
/

FinanceReport

Sleeping

App Files Files Community

Cachoups committed on Sep 25, 2024

Commit

dbf97ba

verified ·

1 Parent(s): 24621a7

Update lib/read_pdf.py

Browse files

Files changed (1) hide show

lib/read_pdf.py +192 -190

lib/read_pdf.py CHANGED Viewed

@@ -1,191 +1,193 @@
-import pdfplumber
-import re
-# Extract text as paragraph delimiter without tables and graphs
-def extract_and_format_paragraphs(pdf_path):
-    """Extract and format paragraphs from a PDF text, applying filters to remove headers, footnotes, and specific sections."""
-    # Define patterns for headers, footnotes, and specific lines
-    header_pattern = re.compile(r"^(ECB-PUBLIC|Title|Document|Header)", re.IGNORECASE)
-    footer_pattern = re.compile(r"^(Page \d+ of \d+|Footer|Document|Note:|Source:|the 75th and 25th percentiles|\|\d+)", re.IGNORECASE)
-    footnote_pattern = re.compile(r"^\d+ \d{1} ", re.IGNORECASE)  # Footnotes start with a number followed by a space
-    start_marker_pattern = re.compile(r"^Chart", re.IGNORECASE)
-    end_marker_pattern = re.compile(r"^(Source:|Note:)", re.IGNORECASE)
-    # Define common abbreviations and patterns that should not be considered as end-of-sentence
-    #exceptions_pattern = re.compile(r'\b(e\.g|i\.e\.|etc\.|a\.k\.a\.)\b', re.IGNORECASE)
-    def remove_abbreviation_periods(text):
-        # Define regex patterns for common abbreviations where periods should be ignored
-        abbreviations = [
-            r'\b(?:e\.g|i\.e|a\.m|p\.m|U\.S|J\.R\.R|Dr|Mr|Ms|Mrs|Jr|Sr)\b'
-        ]
-        for abbr in abbreviations:
-            # Remove periods in abbreviations at the end of the text
-            text = re.sub(f'({abbr})\.', r'\1', text)
-        return text
-    def is_end_of_sentence(text):
-        # Strip leading and trailing whitespace
-        text = text.strip()
-        # Remove periods in common abbreviations from the end of the text
-        text = remove_abbreviation_periods(text)
-        # Define regex patterns for sentence-ending punctuation
-        sentence_end_re = re.compile(r'[\.\!\?]\s*$')
-        # Check if the text ends with sentence-ending punctuation
-        return bool(sentence_end_re.search(text))
-    def clean_text(text):
-        """Remove unnecessary line breaks, extra spaces, and filter out headers, footnotes, and specific sections."""
-        lines = text.split('\n')
-        filtered_lines = []
-        in_removal_section = False
-        paragraph_lines = []
-        def is_footnote_line(line):
-            """Check if a line matches the footnote pattern."""
-            return footnote_pattern.match(line)
-        def append_line_to_paragraph(line):
-            """Append the line to the paragraph, handling line breaks and footnotes."""
-            if paragraph_lines and not is_end_of_sentence(paragraph_lines[-1]):
-                # This line is a continuation of the previous one
-                paragraph_lines[-1] += ' ' + line.strip()
-            else:
-                # Start a new line in the paragraph
-                paragraph_lines.append(line.strip())
-        skip_line = False
-        for line in lines:
-            # Check for start and end markers
-            if start_marker_pattern.match(line):
-                in_removal_section = True
-            if in_removal_section and end_marker_pattern.match(line):
-                in_removal_section = False
-                continue
-            # Handle footnotes
-            if is_footnote_line(line):
-                skip_line = True
-                continue
-            if skip_line:
-                if is_end_of_sentence(line):
-                    skip_line = False
-                continue
-            # Filter out headers and footers
-            if not header_pattern.match(line) and \
-               not footer_pattern.match(line) and \
-               not in_removal_section:
-                # Remove unnecessary line breaks and append line to paragraph_lines
-                if line.strip():
-                    append_line_to_paragraph(line)
-        # Join all paragraph lines into a single paragraph text, removing unnecessary newlines
-        cleaned_paragraphs = "\n".join(paragraph_lines)
-        return cleaned_paragraphs
-    full_text = ""
-    previous_page_text = ""
-    with pdfplumber.open(pdf_path) as pdf:
-        for page in pdf.pages:
-            page_text = page.extract_text()
-            if page_text:
-                # Clean and format the page text
-                cleaned_text = clean_text(page_text)
-                # Handle text from previous page
-                if previous_page_text:
-                    # Check if the previous page text ends with punctuation
-                    if not is_end_of_sentence(previous_page_text):
-                        # Append the current page text to previous page text
-                        previous_page_text += " " + cleaned_text
-                    else:
-                        # Add previous page text to full text
-                        full_text += previous_page_text + "\n"
-                        # Reset previous page text
-                        previous_page_text = cleaned_text
-                else:
-                    previous_page_text = cleaned_text
-        # Add remaining text from the last page
-        if previous_page_text:
-            full_text += previous_page_text
-    return full_text.strip()
-# Cleaning: cut unecessary information such as annex and intro
-def find_text_range(text, start_keyword, end_keywords):
-    """Find the text range between start and multiple end keywords."""
-    start_index = text.lower().find(start_keyword.lower())
-    if start_index == -1:
-        raise ValueError(f"Start keyword '{start_keyword}' not found in the text.")
-    # Find the earliest occurrence of any end keyword
-    end_index = len(text)  # Default to end of text
-    for end_keyword in end_keywords:
-        keyword_index = text.lower().find(end_keyword.lower())
-        if keyword_index != -1 and keyword_index < end_index:
-            end_index = keyword_index
-    return start_index, end_index
-def extract_relevant_text(text, start_index, end_index):
-    """Extract text from the start index to the end index."""
-    return text[start_index:end_index].strip()
-# Split paragraphs into list of paragraphs
-def split_text_into_paragraphs(extracted_text, min_length):
-    """
-    Split the extracted text into paragraphs based on newlines, and merge single-sentence paragraphs.
-    """
-    # Split the text into paragraphs based on newlines
-    paragraphs = re.split(r'\n+', extracted_text.strip())
-    def is_end_of_sentence(text):
-        """Check if the text ends with punctuation indicating the end of a sentence."""
-        return bool(re.search(r'[.!?]$', text.strip()))
-    def count_sentences(text):
-        """Count the number of sentences in a text."""
-        return len(re.split(r'(?<=[.!?])\s+', text.strip()))
-    def merge_single_sentence_paragraphs(paragraphs):
-        """Merge single-sentence paragraphs with the next paragraph if necessary."""
-        merged_paragraphs = []
-        i = 0
-        while i < len(paragraphs):
-            para = paragraphs[i].strip()
-            if not para:
-                i += 1
-                continue
-            if count_sentences(para) == 1 and i + 1 < len(paragraphs):
-                # Check if the next paragraph should be merged with the current one
-                next_para = paragraphs[i + 1].strip()
-                if next_para:
-                    # Merge single-sentence paragraph with the next paragraph
-                    merged_paragraphs.append(para + ' ' + next_para)
-                    i += 2  # Skip the next paragraph since it has been merged
-                else:
-                    # If the next paragraph is empty, just add the current paragraph
-                    merged_paragraphs.append(para)
-                    i += 1
-            else:
-                # Add the current paragraph if it has more than one sentence or is the last one
-                merged_paragraphs.append(para)
-                i += 1
-        return merged_paragraphs
-    # Filter out paragraphs that are too short
-    filtered_paragraphs = [p for p in paragraphs if len(p.strip()) > min_length]
-    # Merge single-sentence paragraphs
-    final_paragraphs = merge_single_sentence_paragraphs(filtered_paragraphs)
     return final_paragraphs

+import pdfplumber
+import re
+# Extract the text as newline-delimited paragraphs, leaving out tables and charts
+def extract_and_format_paragraphs(pdf_path):
+    """Extract the body text of a PDF as newline-separated paragraphs.
+
+    Every page is read with ``pdfplumber``, filtered line by line (headers,
+    footers, footnotes and chart blocks are dropped) and re-flowed so that a
+    line which does not end a sentence is joined to the line that follows it.
+    Text that is still mid-sentence at the end of a page is continued with the
+    text of the next page.
+
+    Args:
+        pdf_path: Passed unchanged to ``pdfplumber.open``.
+
+    Returns:
+        The cleaned text with surrounding whitespace stripped; an empty string
+        when no page yields any text.
+    """
+    # Line prefixes that identify page furniture rather than body text.
+    header_pattern = re.compile(r"^(ECB-PUBLIC|Title|Document|Header)", re.IGNORECASE)
+    footer_pattern = re.compile(r"^(Page \d+ of \d+|Footer|Document|Note:|Source:|the 75th and 25th percentiles|\|\d+)", re.IGNORECASE)
+    # A footnote line starts with a number, a space, a single digit and a space.
+    footnote_pattern = re.compile(r"^\d+ \d{1} ", re.IGNORECASE)
+    # A chart block runs from a line starting with "Chart" up to the next
+    # line starting with "Source:" or "Note:".
+    start_marker_pattern = re.compile(r"^Chart", re.IGNORECASE)
+    end_marker_pattern = re.compile(r"^(Source:|Note:)", re.IGNORECASE)
+    # An abbreviation followed by a period; the period must not count as the
+    # end of a sentence. Raw string: "\." is not a valid escape otherwise.
+    abbreviation_period_re = re.compile(
+        r'(\b(?:e\.g|i\.e|a\.m|p\.m|U\.S|J\.R\.R|Dr|Mr|Ms|Mrs|Jr|Sr)\b)\.'
+    )
+    # Sentence-ending punctuation, optionally followed by whitespace, at the end.
+    sentence_end_re = re.compile(r'[\.\!\?]\s*$')
+    def remove_abbreviation_periods(text):
+        """Drop the period that directly follows a known abbreviation."""
+        return abbreviation_period_re.sub(r'\1', text)
+    def is_end_of_sentence(text):
+        """Return True when ``text`` ends with '.', '!' or '?' that is not an abbreviation period."""
+        return bool(sentence_end_re.search(remove_abbreviation_periods(text.strip())))
+    def clean_text(text):
+        """Filter one page of raw text and re-flow it into sentence-complete lines."""
+        in_removal_section = False  # inside a chart block
+        skip_line = False  # inside a footnote that has not reached its last sentence
+        paragraph_lines = []
+        for line in text.split('\n'):
+            # Track chart blocks; the closing "Source:"/"Note:" line is dropped too.
+            if start_marker_pattern.match(line):
+                in_removal_section = True
+            if in_removal_section and end_marker_pattern.match(line):
+                in_removal_section = False
+                continue
+            # A footnote is skipped from its first line ...
+            if footnote_pattern.match(line):
+                skip_line = True
+                continue
+            # ... through the line that ends a sentence (that line is dropped as well).
+            if skip_line:
+                if is_end_of_sentence(line):
+                    skip_line = False
+                continue
+            if in_removal_section or header_pattern.match(line) or footer_pattern.match(line):
+                continue
+            stripped = line.strip()
+            if not stripped:
+                continue
+            if paragraph_lines and not is_end_of_sentence(paragraph_lines[-1]):
+                # The previous line is mid-sentence: this line continues it.
+                paragraph_lines[-1] += ' ' + stripped
+            else:
+                paragraph_lines.append(stripped)
+        return "\n".join(paragraph_lines)
+    flushed_chunks = []  # text that is known to end on a complete sentence
+    pending_text = ""  # text that may still be continued by the next page
+    with pdfplumber.open(pdf_path) as pdf:
+        for page in pdf.pages:
+            page_text = page.extract_text()
+            if not page_text:
+                continue
+            cleaned_text = clean_text(page_text)
+            if pending_text and not is_end_of_sentence(pending_text):
+                # The sentence runs across the page break: keep accumulating.
+                pending_text += " " + cleaned_text
+                continue
+            if pending_text:
+                flushed_chunks.append(pending_text)
+            pending_text = cleaned_text
+    # Whatever is left from the last page(s) closes the text.
+    if pending_text:
+        flushed_chunks.append(pending_text)
+    return "\n".join(flushed_chunks).strip()
+# Cleaning: cut unnecessary information such as the annex and the introduction
+def find_text_range(text, start_keywords, end_keywords):
+    """Locate the slice of ``text`` bounded by start and end keywords.
+
+    Matching is case-insensitive and only the first occurrence of each keyword
+    is considered.
+
+    Args:
+        text: The text to search.
+        start_keywords: Keyword(s) marking the start. A single string is
+            treated as one keyword instead of being iterated character by
+            character.
+        end_keywords: Keyword(s) marking the end; a single string is handled
+            the same way.
+
+    Returns:
+        A ``(start_index, end_index)`` tuple. ``start_index`` is the largest
+        first-occurrence index among the start keywords that were found, or 0
+        when none is found. ``end_index`` is the smallest first-occurrence
+        index among the end keywords that were found, or ``len(text)`` when
+        none is found. The end is searched over the whole text, so
+        ``end_index`` can be smaller than ``start_index``.
+    """
+    if isinstance(start_keywords, str):
+        start_keywords = [start_keywords]
+    if isinstance(end_keywords, str):
+        end_keywords = [end_keywords]
+    # Lower-case the haystack once instead of once per keyword.
+    lowered = text.lower()
+    start_hits = (lowered.find(keyword.lower()) for keyword in start_keywords)
+    start_index = max((hit for hit in start_hits if hit != -1), default=0)
+    end_hits = (lowered.find(keyword.lower()) for keyword in end_keywords)
+    end_index = min((hit for hit in end_hits if hit != -1), default=len(text))
+    return start_index, end_index
+def extract_relevant_text(text, start_index, end_index):
+    """Return ``text[start_index:end_index]`` without leading or trailing whitespace."""
+    selected = text[start_index:end_index]
+    return selected.strip()
+# Split paragraphs into list of paragraphs
+def split_text_into_paragraphs(extracted_text, min_length):
+    """Split text into paragraphs and fold single-sentence paragraphs forward.
+
+    The text is split on runs of newlines, paragraphs whose stripped length is
+    not greater than ``min_length`` are discarded, and every remaining
+    paragraph that consists of one sentence is joined (with a space) to the
+    paragraph that follows it.
+
+    Args:
+        extracted_text: Newline-separated text.
+        min_length: A paragraph is kept only if its stripped length exceeds
+            this value.
+
+    Returns:
+        The list of resulting paragraphs, each stripped of surrounding
+        whitespace.
+    """
+    # Whitespace that directly follows '.', '!' or '?' separates two sentences.
+    sentence_boundary = re.compile(r'(?<=[.!?])\s+')
+    def count_sentences(text):
+        """Count the sentences in ``text``."""
+        return len(sentence_boundary.split(text.strip()))
+    def merge_single_sentence_paragraphs(paragraphs):
+        """Join each single-sentence paragraph with the paragraph after it."""
+        merged = []
+        total = len(paragraphs)
+        i = 0
+        while i < total:
+            para = paragraphs[i].strip()
+            if not para:
+                i += 1
+                continue
+            # Empty when there is no following paragraph or it is blank.
+            next_para = paragraphs[i + 1].strip() if i + 1 < total else ''
+            if next_para and count_sentences(para) == 1:
+                merged.append(para + ' ' + next_para)
+                i += 2  # the following paragraph has been consumed
+            else:
+                merged.append(para)
+                i += 1
+        return merged
+    paragraphs = re.split(r'\n+', extracted_text.strip())
+    # Drop paragraphs that are too short before merging.
+    long_enough = [p for p in paragraphs if len(p.strip()) > min_length]
+    final_paragraphs = merge_single_sentence_paragraphs(long_enough)
     return final_paragraphs