Cachoups committed on
Commit
dbf97ba
·
verified ·
1 Parent(s): 24621a7

Update lib/read_pdf.py

Browse files
Files changed (1) hide show
  1. lib/read_pdf.py +192 -190
lib/read_pdf.py CHANGED
@@ -1,191 +1,193 @@
1
- import pdfplumber
2
- import re
3
-
4
- # Extract text as paragraph delimiter without tables and graphs
5
# Lines starting with one of these tokens are dropped as running headers.
_HEADER_RE = re.compile(r"^(ECB-PUBLIC|Title|Document|Header)", re.IGNORECASE)
# Lines starting with one of these tokens are dropped as footers / chart captions.
_FOOTER_RE = re.compile(
    r"^(Page \d+ of \d+|Footer|Document|Note:|Source:|the 75th and 25th percentiles|\|\d+)",
    re.IGNORECASE,
)
# First line of a footnote: digits, a space, exactly one digit, a space.
# NOTE(review): stricter than "a number followed by a space" -- confirm on real pages.
_FOOTNOTE_RE = re.compile(r"^\d+ \d{1} ", re.IGNORECASE)
# A line starting with "Chart" opens a section that is removed ...
_CHART_START_RE = re.compile(r"^Chart", re.IGNORECASE)
# ... up to and including the next line starting with "Source:" or "Note:".
_CHART_END_RE = re.compile(r"^(Source:|Note:)", re.IGNORECASE)
# An abbreviation immediately followed by a period; the period is not a sentence end.
_ABBREVIATION_PERIOD_RE = re.compile(
    r"(\b(?:e\.g|i\.e|a\.m|p\.m|U\.S|J\.R\.R|Dr|Mr|Ms|Mrs|Jr|Sr)\b)\."
)
# Sentence-ending punctuation, optionally followed by whitespace, at the end of the text.
_SENTENCE_END_RE = re.compile(r"[\.\!\?]\s*$")


def _remove_abbreviation_periods(text):
    """Return *text* with the period after each known abbreviation removed."""
    return _ABBREVIATION_PERIOD_RE.sub(r"\1", text)


def _is_end_of_sentence(text):
    """Return True if *text* ends with '.', '!' or '?' that is not an abbreviation period."""
    return bool(_SENTENCE_END_RE.search(_remove_abbreviation_periods(text.strip())))


def _clean_page_text(text):
    """Filter one page of extracted text and re-flow its lines.

    Header/footer lines, chart sections and footnotes are dropped. A kept
    line is glued (with a single space) onto the previous kept line when
    that line does not end a sentence; otherwise it starts a new entry.

    Returns the entries joined with newlines.
    """
    paragraph_lines = []
    in_removal_section = False
    skipping_footnote = False

    for line in text.split('\n'):
        if _CHART_START_RE.match(line):
            in_removal_section = True
        if in_removal_section and _CHART_END_RE.match(line):
            # The closing "Source:"/"Note:" line is itself discarded.
            in_removal_section = False
            continue

        if _FOOTNOTE_RE.match(line):
            skipping_footnote = True
            continue

        if skipping_footnote:
            # Footnote text is skipped up to and including the line that
            # ends its sentence.
            # NOTE(review): indentation was lost in the source; this assumes
            # the `continue` applies to every skipped footnote line.
            if _is_end_of_sentence(line):
                skipping_footnote = False
            continue

        if in_removal_section or _HEADER_RE.match(line) or _FOOTER_RE.match(line):
            continue

        stripped = line.strip()
        if not stripped:
            continue

        if paragraph_lines and not _is_end_of_sentence(paragraph_lines[-1]):
            # Previous entry is an unfinished sentence: continue it.
            paragraph_lines[-1] += ' ' + stripped
        else:
            paragraph_lines.append(stripped)

    return "\n".join(paragraph_lines)


def extract_and_format_paragraphs(pdf_path):
    """Extract the text of a PDF as newline-separated paragraphs.

    Each page is read with ``pdfplumber``, filtered with the header, footer,
    footnote and chart patterns above, and re-flowed. Text carried over from
    earlier pages is joined to the next page with a space when it does not
    end a sentence; otherwise the two are separated by a newline.

    Args:
        pdf_path: Value passed straight to ``pdfplumber.open``.

    Returns:
        The cleaned text of the whole document, stripped of leading and
        trailing whitespace.
    """
    chunks = []
    pending = ""

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if not page_text:
                continue

            cleaned_text = _clean_page_text(page_text)
            if not cleaned_text:
                # A page that is entirely filtered out must not inject
                # stray spaces into the carried-over text.
                continue

            if not pending:
                pending = cleaned_text
            elif _is_end_of_sentence(pending):
                chunks.append(pending)
                pending = cleaned_text
            else:
                # The sentence runs across the page break.
                pending += " " + cleaned_text

    if pending:
        chunks.append(pending)

    return "\n".join(chunks).strip()
120
-
121
- # Cleaning: cut unnecessary information such as annex and intro
122
def find_text_range(text, start_keyword, end_keywords):
    """Locate the slice of *text* between a start keyword and the nearest end keyword.

    Matching is case-insensitive. The end keywords are searched from the
    start position onwards, so an earlier mention of an end keyword (for
    example in a table of contents) cannot produce an end before the start.

    Args:
        text: The text to search.
        start_keyword: Keyword whose first occurrence marks the start.
        end_keywords: Iterable of keywords (a single string is also
            accepted); the earliest occurrence at or after the start marks
            the end.

    Returns:
        A ``(start_index, end_index)`` tuple. ``end_index`` is ``len(text)``
        when no end keyword is found.

    Raises:
        ValueError: If *start_keyword* does not occur in *text*.
    """
    if isinstance(end_keywords, str):
        # A bare string would otherwise be iterated character by character.
        end_keywords = [end_keywords]

    # Lower-case once instead of on every lookup.
    lowered = text.lower()

    start_index = lowered.find(start_keyword.lower())
    if start_index == -1:
        raise ValueError(f"Start keyword '{start_keyword}' not found in the text.")

    end_index = len(text)  # Default to end of text
    for end_keyword in end_keywords:
        keyword_index = lowered.find(end_keyword.lower(), start_index)
        if keyword_index != -1 and keyword_index < end_index:
            end_index = keyword_index

    return start_index, end_index
137
-
138
def extract_relevant_text(text, start_index, end_index):
    """Return ``text[start_index:end_index]`` with surrounding whitespace stripped.

    Args:
        text: The full text.
        start_index: Index of the first character to keep.
        end_index: Index one past the last character to keep.

    Returns:
        The stripped slice; empty when ``end_index <= start_index``.
    """
    return text[start_index:end_index].strip()
141
- # Split paragraphs into list of paragraphs
142
# Whitespace that follows sentence-ending punctuation.
_SENTENCE_BOUNDARY_RE = re.compile(r'(?<=[.!?])\s+')


def _count_sentences(text):
    """Return the number of pieces *text* splits into at sentence boundaries."""
    return len(_SENTENCE_BOUNDARY_RE.split(text.strip()))


def split_text_into_paragraphs(extracted_text, min_length):
    """Split text into paragraphs and merge single-sentence paragraphs forward.

    The text is split on runs of newlines. Paragraphs whose stripped length
    is not greater than *min_length* are discarded. Each remaining paragraph
    that consists of a single sentence is then joined (with a space) to the
    paragraph that follows it; the joined result is not merged again.

    Args:
        extracted_text: Newline-separated text.
        min_length: Paragraphs must be strictly longer than this to be kept.

    Returns:
        The list of resulting paragraphs, each stripped.
    """
    candidates = [
        paragraph
        for paragraph in re.split(r'\n+', extracted_text.strip())
        if len(paragraph.strip()) > min_length
    ]

    merged_paragraphs = []
    i = 0
    while i < len(candidates):
        para = candidates[i].strip()
        if not para:
            # Only reachable when min_length is negative.
            i += 1
            continue

        next_para = candidates[i + 1].strip() if i + 1 < len(candidates) else ""
        if next_para and _count_sentences(para) == 1:
            merged_paragraphs.append(para + ' ' + next_para)
            i += 2  # The following paragraph has been consumed.
        else:
            merged_paragraphs.append(para)
            i += 1

    return merged_paragraphs
 
1
+ import pdfplumber
2
+ import re
3
+
4
+ # Extract text as paragraph delimiter without tables and graphs
5
# Lines starting with one of these tokens are dropped as running headers.
_HEADER_RE = re.compile(r"^(ECB-PUBLIC|Title|Document|Header)", re.IGNORECASE)
# Lines starting with one of these tokens are dropped as footers / chart captions.
_FOOTER_RE = re.compile(
    r"^(Page \d+ of \d+|Footer|Document|Note:|Source:|the 75th and 25th percentiles|\|\d+)",
    re.IGNORECASE,
)
# First line of a footnote: digits, a space, exactly one digit, a space.
# NOTE(review): stricter than "a number followed by a space" -- confirm on real pages.
_FOOTNOTE_RE = re.compile(r"^\d+ \d{1} ", re.IGNORECASE)
# A line starting with "Chart" opens a section that is removed ...
_CHART_START_RE = re.compile(r"^Chart", re.IGNORECASE)
# ... up to and including the next line starting with "Source:" or "Note:".
_CHART_END_RE = re.compile(r"^(Source:|Note:)", re.IGNORECASE)
# An abbreviation immediately followed by a period; the period is not a sentence end.
_ABBREVIATION_PERIOD_RE = re.compile(
    r"(\b(?:e\.g|i\.e|a\.m|p\.m|U\.S|J\.R\.R|Dr|Mr|Ms|Mrs|Jr|Sr)\b)\."
)
# Sentence-ending punctuation, optionally followed by whitespace, at the end of the text.
_SENTENCE_END_RE = re.compile(r"[\.\!\?]\s*$")


def _remove_abbreviation_periods(text):
    """Return *text* with the period after each known abbreviation removed."""
    return _ABBREVIATION_PERIOD_RE.sub(r"\1", text)


def _is_end_of_sentence(text):
    """Return True if *text* ends with '.', '!' or '?' that is not an abbreviation period."""
    return bool(_SENTENCE_END_RE.search(_remove_abbreviation_periods(text.strip())))


def _clean_page_text(text):
    """Filter one page of extracted text and re-flow its lines.

    Header/footer lines, chart sections and footnotes are dropped. A kept
    line is glued (with a single space) onto the previous kept line when
    that line does not end a sentence; otherwise it starts a new entry.

    Returns the entries joined with newlines.
    """
    paragraph_lines = []
    in_removal_section = False
    skipping_footnote = False

    for line in text.split('\n'):
        if _CHART_START_RE.match(line):
            in_removal_section = True
        if in_removal_section and _CHART_END_RE.match(line):
            # The closing "Source:"/"Note:" line is itself discarded.
            in_removal_section = False
            continue

        if _FOOTNOTE_RE.match(line):
            skipping_footnote = True
            continue

        if skipping_footnote:
            # Footnote text is skipped up to and including the line that
            # ends its sentence.
            # NOTE(review): indentation was lost in the source; this assumes
            # the `continue` applies to every skipped footnote line.
            if _is_end_of_sentence(line):
                skipping_footnote = False
            continue

        if in_removal_section or _HEADER_RE.match(line) or _FOOTER_RE.match(line):
            continue

        stripped = line.strip()
        if not stripped:
            continue

        if paragraph_lines and not _is_end_of_sentence(paragraph_lines[-1]):
            # Previous entry is an unfinished sentence: continue it.
            paragraph_lines[-1] += ' ' + stripped
        else:
            paragraph_lines.append(stripped)

    return "\n".join(paragraph_lines)


def extract_and_format_paragraphs(pdf_path):
    """Extract the text of a PDF as newline-separated paragraphs.

    Each page is read with ``pdfplumber``, filtered with the header, footer,
    footnote and chart patterns above, and re-flowed. Text carried over from
    earlier pages is joined to the next page with a space when it does not
    end a sentence; otherwise the two are separated by a newline.

    Args:
        pdf_path: Value passed straight to ``pdfplumber.open``.

    Returns:
        The cleaned text of the whole document, stripped of leading and
        trailing whitespace.
    """
    chunks = []
    pending = ""

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if not page_text:
                continue

            cleaned_text = _clean_page_text(page_text)
            if not cleaned_text:
                # A page that is entirely filtered out must not inject
                # stray spaces into the carried-over text.
                continue

            if not pending:
                pending = cleaned_text
            elif _is_end_of_sentence(pending):
                chunks.append(pending)
                pending = cleaned_text
            else:
                # The sentence runs across the page break.
                pending += " " + cleaned_text

    if pending:
        chunks.append(pending)

    return "\n".join(chunks).strip()
120
+
121
+ # Cleaning: cut unnecessary information such as annex and intro
122
def find_text_range(text, start_keywords, end_keywords):
    """Locate the slice of *text* between the start keywords and the nearest end keyword.

    Matching is case-insensitive. The start is the largest of the
    first-occurrence positions of the start keywords, or 0 when none of them
    occurs. The end keywords are searched from the start position onwards,
    so an earlier mention of an end keyword (for example in a table of
    contents) cannot produce an end before the start.

    Args:
        text: The text to search.
        start_keywords: Iterable of keywords; a single string is also
            accepted for callers written against the one-keyword signature.
        end_keywords: Iterable of keywords (a single string is also
            accepted); the earliest occurrence at or after the start marks
            the end.

    Returns:
        A ``(start_index, end_index)`` tuple. ``end_index`` is ``len(text)``
        when no end keyword is found.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(start_keywords, str):
        start_keywords = [start_keywords]
    if isinstance(end_keywords, str):
        end_keywords = [end_keywords]

    # Lower-case once instead of on every lookup.
    lowered = text.lower()

    start_index = 0
    for start_keyword in start_keywords:
        keyword_index = lowered.find(start_keyword.lower())
        # A miss (-1) can never exceed start_index, so no separate check is needed.
        if keyword_index > start_index:
            start_index = keyword_index

    end_index = len(text)  # Default to end of text
    for end_keyword in end_keywords:
        keyword_index = lowered.find(end_keyword.lower(), start_index)
        if keyword_index != -1 and keyword_index < end_index:
            end_index = keyword_index

    return start_index, end_index
139
+
140
def extract_relevant_text(text, start_index, end_index):
    """Return ``text[start_index:end_index]`` with surrounding whitespace stripped.

    Args:
        text: The full text.
        start_index: Index of the first character to keep.
        end_index: Index one past the last character to keep.

    Returns:
        The stripped slice; empty when ``end_index <= start_index``.
    """
    return text[start_index:end_index].strip()
143
+ # Split paragraphs into list of paragraphs
144
# Whitespace that follows sentence-ending punctuation.
_SENTENCE_BOUNDARY_RE = re.compile(r'(?<=[.!?])\s+')


def _count_sentences(text):
    """Return the number of pieces *text* splits into at sentence boundaries."""
    return len(_SENTENCE_BOUNDARY_RE.split(text.strip()))


def split_text_into_paragraphs(extracted_text, min_length):
    """Split text into paragraphs and merge single-sentence paragraphs forward.

    The text is split on runs of newlines. Paragraphs whose stripped length
    is not greater than *min_length* are discarded. Each remaining paragraph
    that consists of a single sentence is then joined (with a space) to the
    paragraph that follows it; the joined result is not merged again.

    Args:
        extracted_text: Newline-separated text.
        min_length: Paragraphs must be strictly longer than this to be kept.

    Returns:
        The list of resulting paragraphs, each stripped.
    """
    candidates = [
        paragraph
        for paragraph in re.split(r'\n+', extracted_text.strip())
        if len(paragraph.strip()) > min_length
    ]

    merged_paragraphs = []
    i = 0
    while i < len(candidates):
        para = candidates[i].strip()
        if not para:
            # Only reachable when min_length is negative.
            i += 1
            continue

        next_para = candidates[i + 1].strip() if i + 1 < len(candidates) else ""
        if next_para and _count_sentences(para) == 1:
            merged_paragraphs.append(para + ' ' + next_para)
            i += 2  # The following paragraph has been consumed.
        else:
            merged_paragraphs.append(para)
            i += 1

    return merged_paragraphs