Cachoups committed on
Commit
3e45198
·
verified ·
1 Parent(s): 16546eb

Upload read_pdf.py

Browse files
Files changed (1) hide show
  1. lib/read_pdf.py +191 -0
lib/read_pdf.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+ import re
3
+
4
+ # Extract the text as delimited paragraphs, without tables and graphs
5
def extract_and_format_paragraphs(pdf_path):
    """Extract the body text of a PDF as newline-separated paragraphs.

    Every page is read with ``pdfplumber`` and cleaned line by line: header,
    footer, footnote and chart lines are dropped, and wrapped lines are
    re-joined until a sentence ends. The pages are then stitched together so
    that a sentence interrupted by a page break continues on the same line.

    Args:
        pdf_path: Location of the PDF, passed unchanged to ``pdfplumber.open``.

    Returns:
        str: The cleaned text with surrounding whitespace stripped. A page
        whose cleaned text ends a sentence is separated from the following
        page by a newline; otherwise the two are joined with a space.
    """
    # NOTE(review): this block was reconstructed from a flattened diff view
    # without indentation; the nesting below (in particular the placement of
    # the `continue` statements) is the reading that makes the filters
    # effective -- confirm against the original file.

    # Lines recognised as page furniture rather than body text.
    header_pattern = re.compile(r"^(ECB-PUBLIC|Title|Document|Header)", re.IGNORECASE)
    footer_pattern = re.compile(
        r"^(Page \d+ of \d+|Footer|Document|Note:|Source:|the 75th and 25th percentiles|\|\d+)",
        re.IGNORECASE,
    )
    # Matches lines beginning with: digits, a space, exactly one digit, a space.
    footnote_pattern = re.compile(r"^\d+ \d{1} ", re.IGNORECASE)
    # A line starting with "Chart" opens a section that is dropped until a
    # line starting with "Source:" or "Note:" closes it.
    start_marker_pattern = re.compile(r"^Chart", re.IGNORECASE)
    end_marker_pattern = re.compile(r"^(Source:|Note:)", re.IGNORECASE)

    # Compiled once per call instead of once per examined line. Written as a
    # raw string: the original built this pattern with a non-raw f-string
    # containing "\.", which is an invalid escape sequence.
    abbreviation_period_re = re.compile(
        r"(\b(?:e\.g|i\.e|a\.m|p\.m|U\.S|J\.R\.R|Dr|Mr|Ms|Mrs|Jr|Sr)\b)\."
    )
    sentence_end_re = re.compile(r"[\.\!\?]\s*$")

    def remove_abbreviation_periods(text):
        """Drop the period that follows a known abbreviation (e.g. "Dr." -> "Dr")."""
        return abbreviation_period_re.sub(r"\1", text)

    def is_end_of_sentence(text):
        """Return True when *text* ends with '.', '!' or '?' that is not an abbreviation period."""
        candidate = remove_abbreviation_periods(text.strip())
        return bool(sentence_end_re.search(candidate))

    def clean_text(text):
        """Filter one page of text and re-join wrapped lines into sentences.

        Returns the kept lines joined with newlines; a new output line is
        started only after the previous one ended a sentence.
        """
        paragraph_lines = []
        in_removal_section = False  # inside a "Chart ... Source:/Note:" section
        skip_line = False  # inside a footnote that has not ended yet

        def append_line_to_paragraph(line):
            """Extend the last kept line, or start a new one after a finished sentence."""
            if paragraph_lines and not is_end_of_sentence(paragraph_lines[-1]):
                # The previous line did not finish its sentence: continue it.
                paragraph_lines[-1] += ' ' + line.strip()
            else:
                paragraph_lines.append(line.strip())

        for line in text.split('\n'):
            # Track the chart section; the closing marker line itself is dropped.
            if start_marker_pattern.match(line):
                in_removal_section = True
            if in_removal_section and end_marker_pattern.match(line):
                in_removal_section = False
                continue

            # A footnote line is dropped and switches on skipping.
            if footnote_pattern.match(line):
                skip_line = True
                continue

            # While skipping, every line is dropped; skipping stops after the
            # first dropped line that ends a sentence.
            if skip_line:
                if is_end_of_sentence(line):
                    skip_line = False
                continue

            if in_removal_section:
                continue
            if header_pattern.match(line) or footer_pattern.match(line):
                continue
            if line.strip():
                append_line_to_paragraph(line)

        return "\n".join(paragraph_lines)

    finished_chunks = []  # page texts that are complete and ready for output
    pending_text = ""  # text still waiting to see whether the next page continues it

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if not page_text:
                continue

            cleaned_text = clean_text(page_text)
            if pending_text and not is_end_of_sentence(pending_text):
                # The sentence runs over the page break: keep it on one line.
                pending_text += " " + cleaned_text
            else:
                if pending_text:
                    finished_chunks.append(pending_text)
                pending_text = cleaned_text

    # Flush whatever is left from the last page.
    if pending_text:
        finished_chunks.append(pending_text)

    return "\n".join(finished_chunks).strip()
120
+
121
+ # Cleaning: cut unnecessary information such as the annex and intro
122
def find_text_range(text, start_keyword, end_keywords):
    """Locate the span from a start keyword up to the nearest following end keyword.

    Matching is case-insensitive (both the text and the keywords are
    lower-cased before searching).

    Args:
        text: The text to search.
        start_keyword: Keyword whose first occurrence marks the start of the range.
        end_keywords: Iterable of keywords; the earliest occurrence of any of
            them *after* the start keyword marks the end of the range. Empty
            keywords are ignored.

    Returns:
        tuple[int, int]: ``(start_index, end_index)``. ``end_index`` is
        ``len(text)`` when no end keyword occurs after the start keyword.

    Raises:
        ValueError: If ``start_keyword`` does not occur in ``text``.
    """
    # Lower-case once instead of on every lookup.
    lowered_text = text.lower()
    lowered_start = start_keyword.lower()

    start_index = lowered_text.find(lowered_start)
    if start_index == -1:
        raise ValueError(f"Start keyword '{start_keyword}' not found in the text.")

    # Only look for end keywords behind the start keyword; an earlier
    # occurrence (e.g. in a table of contents) would otherwise produce an
    # end index smaller than the start index and an empty range.
    search_from = start_index + len(lowered_start)

    end_index = len(text)  # Default to end of text
    for end_keyword in end_keywords:
        if not end_keyword:
            # An empty keyword matches everywhere and would collapse the range.
            continue
        keyword_index = lowered_text.find(end_keyword.lower(), search_from)
        if keyword_index != -1 and keyword_index < end_index:
            end_index = keyword_index

    return start_index, end_index
137
+
138
def extract_relevant_text(text, start_index, end_index):
    """Return the part of *text* between the two indices, without surrounding whitespace."""
    window = slice(start_index, end_index)
    selected = text[window]
    return selected.strip()
141
+ # Split paragraphs into list of paragraphs
142
def split_text_into_paragraphs(extracted_text, min_length):
    """Split text into paragraphs and merge single-sentence paragraphs forward.

    The text is split on runs of newlines. Paragraphs whose stripped length
    is not greater than ``min_length`` are discarded. Each remaining
    paragraph that consists of a single sentence is then merged (joined with
    one space) with the paragraph that follows it; the follower is consumed
    by the merge and is not considered again.

    Args:
        extracted_text: The text to split.
        min_length: Paragraphs must be strictly longer than this many
            characters (after stripping) to be kept.

    Returns:
        list[str]: The stripped, filtered and merged paragraphs.
    """
    # A sentence boundary is whitespace that follows '.', '!' or '?'.
    sentence_boundary = re.compile(r'(?<=[.!?])\s+')

    def count_sentences(text):
        """Count the sentences in *text* by splitting at sentence boundaries."""
        return len(sentence_boundary.split(text.strip()))

    # Split on newlines and keep only paragraphs that are long enough.
    kept = [
        chunk.strip()
        for chunk in re.split(r'\n+', extracted_text.strip())
        if len(chunk.strip()) > min_length
    ]

    merged_paragraphs = []
    consumed = False  # True when the current paragraph was merged into its predecessor
    for position, para in enumerate(kept):
        if consumed:
            consumed = False
            continue
        if not para:
            # Only reachable with a negative min_length; nothing to emit.
            continue

        follower = kept[position + 1] if position + 1 < len(kept) else ""
        if follower and count_sentences(para) == 1:
            # A lone sentence is attached to the paragraph that follows it.
            merged_paragraphs.append(para + ' ' + follower)
            consumed = True
        else:
            merged_paragraphs.append(para)

    return merged_paragraphs