Spaces:
Running
Running
Upload read_pdf.py
Browse files- lib/read_pdf.py +191 -0
lib/read_pdf.py
ADDED
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pdfplumber
|
2 |
+
import re
|
3 |
+
|
4 |
+
# Extract the text as newline-delimited paragraphs, without tables and graphs
|
5 |
+
def extract_and_format_paragraphs(pdf_path):
    """Extract the text of a PDF as newline-separated, sentence-complete lines.

    Every page is read with ``pdfplumber`` and cleaned line by line:

    * lines matching the header or footer patterns are dropped;
    * everything from a line starting with "Chart" up to and including the
      next line starting with "Source:" or "Note:" is dropped (this state is
      reset at every page, because pages are cleaned independently);
    * lines matching the footnote pattern are dropped, together with the
      lines that follow them up to and including the next line that ends a
      sentence.

    Surviving lines are glued together with spaces until the accumulated text
    ends a sentence, so each output line ends on ``.``, ``!`` or ``?`` where
    possible.  A page whose cleaned text does not end a sentence is joined to
    the next page with a space; otherwise pages are separated by a newline.

    Args:
        pdf_path: Passed unchanged to ``pdfplumber.open``.

    Returns:
        The cleaned text of the whole document, stripped of leading and
        trailing whitespace.
    """
    # Line-level filters.  All of them are anchored at the start of the line.
    header_pattern = re.compile(r"^(ECB-PUBLIC|Title|Document|Header)", re.IGNORECASE)
    footer_pattern = re.compile(
        r"^(Page \d+ of \d+|Footer|Document|Note:|Source:|the 75th and 25th percentiles|\|\d+)",
        re.IGNORECASE,
    )
    # Digits, a space, one single digit and a space.
    footnote_pattern = re.compile(r"^\d+ \d{1} ", re.IGNORECASE)
    start_marker_pattern = re.compile(r"^Chart", re.IGNORECASE)
    end_marker_pattern = re.compile(r"^(Source:|Note:)", re.IGNORECASE)

    # Abbreviations whose trailing period must not count as a sentence end.
    # Written as a raw string: the previous non-raw f-string contained the
    # invalid escape sequence "\.".
    abbreviation_period_pattern = re.compile(
        r"(\b(?:e\.g|i\.e|a\.m|p\.m|U\.S|J\.R\.R|Dr|Mr|Ms|Mrs|Jr|Sr)\b)\."
    )
    sentence_end_pattern = re.compile(r"[\.\!\?]\s*$")

    def remove_abbreviation_periods(text):
        """Drop the period that directly follows a known abbreviation."""
        return abbreviation_period_pattern.sub(r"\1", text)

    def is_end_of_sentence(text):
        """Return True if *text* ends with '.', '!' or '?' (abbreviations excluded)."""
        text = remove_abbreviation_periods(text.strip())
        return bool(sentence_end_pattern.search(text))

    def clean_text(text):
        """Filter one page of raw text and re-flow it into sentence-complete lines."""
        paragraph_lines = []
        in_removal_section = False  # inside a "Chart ... Source:/Note:" section
        skip_line = False  # inside the continuation of a footnote

        for line in text.split("\n"):
            # Chart sections: the start marker opens the section, the end
            # marker closes it and is itself discarded.
            if start_marker_pattern.match(line):
                in_removal_section = True
            if in_removal_section and end_marker_pattern.match(line):
                in_removal_section = False
                continue

            # Footnotes: drop the footnote line and keep dropping lines until
            # one of them ends a sentence.
            # NOTE(review): a footnote whose first line already ends a sentence
            # still causes the following lines to be skipped until one ends a
            # sentence - confirm this is intended.
            if footnote_pattern.match(line):
                skip_line = True
                continue
            if skip_line:
                if is_end_of_sentence(line):
                    skip_line = False
                continue

            if in_removal_section or header_pattern.match(line) or footer_pattern.match(line):
                continue

            stripped = line.strip()
            if not stripped:
                continue

            # Glue the line onto the previous one unless that one is complete.
            if paragraph_lines and not is_end_of_sentence(paragraph_lines[-1]):
                paragraph_lines[-1] += " " + stripped
            else:
                paragraph_lines.append(stripped)

        return "\n".join(paragraph_lines)

    finished_chunks = []  # page texts that are complete and ready for output
    pending_text = ""  # text still waiting to see whether the next page continues it

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if not page_text:
                continue

            cleaned_text = clean_text(page_text)
            if not pending_text:
                pending_text = cleaned_text
            elif is_end_of_sentence(pending_text):
                finished_chunks.append(pending_text)
                pending_text = cleaned_text
            else:
                # The sentence runs over the page break: continue on the same line.
                pending_text += " " + cleaned_text

    if pending_text:
        finished_chunks.append(pending_text)

    return "\n".join(finished_chunks).strip()
|
120 |
+
|
121 |
+
# Cleaning: cut unnecessary sections such as the annex and the introduction
|
122 |
+
def find_text_range(text, start_keyword, end_keywords):
    """Locate the span between a start keyword and the nearest following end keyword.

    Matching is case-insensitive (text and keywords are lower-cased before
    comparison).  End keywords are only searched for *after* the start keyword,
    so an earlier mention of an end keyword (for example in a table of
    contents) cannot produce an end index that lies before the start index.

    Args:
        text: The text to search.
        start_keyword: Keyword whose first occurrence marks the start.
        end_keywords: Iterable of keywords; the earliest occurrence of any of
            them after the start keyword marks the end.  Empty keywords are
            ignored.

    Returns:
        A ``(start_index, end_index)`` tuple.  ``end_index`` is ``len(text)``
        when no end keyword is found after the start keyword.

    Raises:
        ValueError: If ``start_keyword`` does not occur in ``text``.
    """
    # Lower-case once instead of once per keyword.
    # NOTE(review): indices are computed on the lower-cased text; this assumes
    # lower-casing does not change the length of the text - confirm for
    # non-ASCII input.
    lowered_text = text.lower()
    lowered_start = start_keyword.lower()

    start_index = lowered_text.find(lowered_start)
    if start_index == -1:
        raise ValueError(f"Start keyword '{start_keyword}' not found in the text.")

    # Only look for end keywords behind the start keyword.
    search_from = start_index + len(lowered_start)

    end_index = len(text)  # Default to end of text
    for end_keyword in end_keywords:
        if not end_keyword:
            # An empty string "matches" everywhere and would yield an empty range.
            continue
        keyword_index = lowered_text.find(end_keyword.lower(), search_from)
        if keyword_index != -1 and keyword_index < end_index:
            end_index = keyword_index

    return start_index, end_index
|
137 |
+
|
138 |
+
def extract_relevant_text(text, start_index, end_index):
    """Return ``text[start_index:end_index]`` with surrounding whitespace removed."""
    selected = text[start_index:end_index]
    return selected.strip()
|
141 |
+
# Split the extracted text into a list of paragraphs
|
142 |
+
def split_text_into_paragraphs(extracted_text, min_length):
    """Split text into paragraphs and merge single-sentence paragraphs forward.

    The text is split on runs of newlines.  Paragraphs whose stripped length
    is not greater than ``min_length`` are discarded.  Each remaining
    paragraph that consists of a single sentence is then joined (with a
    space) to the paragraph that follows it; the paragraph it was joined with
    is consumed and not examined again.

    Args:
        extracted_text: The text to split.
        min_length: Paragraphs must be strictly longer than this many
            characters (after stripping) to be kept.

    Returns:
        A list of stripped paragraph strings.
    """
    # Compiled once per call instead of once per paragraph.
    paragraph_break = re.compile(r'\n+')
    sentence_boundary = re.compile(r'(?<=[.!?])\s+')

    def count_sentences(paragraph):
        """Count the pieces separated by whitespace that follows '.', '!' or '?'."""
        return len(sentence_boundary.split(paragraph.strip()))

    # Filter out paragraphs that are too short.
    candidates = [
        stripped
        for stripped in (
            raw.strip() for raw in paragraph_break.split(extracted_text.strip())
        )
        if len(stripped) > min_length
    ]

    merged_paragraphs = []
    index = 0
    total = len(candidates)
    while index < total:
        current = candidates[index]
        if not current:
            # Empty entries can only survive the filter when min_length < 0.
            index += 1
            continue

        has_next = index + 1 < total
        if has_next and count_sentences(current) == 1 and candidates[index + 1]:
            # Attach the single-sentence paragraph to its successor and skip
            # the successor, since it has been consumed.
            merged_paragraphs.append(current + ' ' + candidates[index + 1])
            index += 2
        else:
            merged_paragraphs.append(current)
            index += 1

    return merged_paragraphs
|