Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -41,7 +41,7 @@ def fin_ext_bis(text):
|
|
41 |
results = fin_model_bis(split_in_sentences(text))
|
42 |
return make_spans(text, results)
|
43 |
|
44 |
-
def
|
45 |
if not pdf1 or not pdf2:
|
46 |
return [], []
|
47 |
|
@@ -57,13 +57,12 @@ def extract_and_summarize(pdf1, pdf2):
|
|
57 |
|
58 |
start_index1, end_index1 = lib.read_pdf.find_text_range(paragraphs_1, start_keyword, end_keywords)
|
59 |
start_index2, end_index2 = lib.read_pdf.find_text_range(paragraphs_2, start_keyword, end_keywords)
|
60 |
-
|
61 |
paragraphs_1 = lib.read_pdf.extract_relevant_text(paragraphs_1, start_index1, end_index1)
|
62 |
paragraphs_2 = lib.read_pdf.extract_relevant_text(paragraphs_2, start_index2, end_index2)
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
return paragraphs_1, paragraphs_2
|
68 |
|
69 |
# Gradio interface setup
|
@@ -152,7 +151,58 @@ def process_and_compare(file1, sheet1, file2, sheet2):
|
|
152 |
|
153 |
return file_path
|
154 |
|
155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
stored_paragraphs_1 = []
|
157 |
stored_paragraphs_2 = []
|
158 |
|
@@ -173,7 +223,7 @@ with gr.Blocks() as demo:
|
|
173 |
|
174 |
def update_paragraphs(pdf1, pdf2):
|
175 |
global stored_paragraphs_1, stored_paragraphs_2
|
176 |
-
stored_paragraphs_1, stored_paragraphs_2 =
|
177 |
updated_dropdown_1 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_1)]
|
178 |
updated_dropdown_2 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_2)]
|
179 |
return gr.update(choices=updated_dropdown_1), gr.update(choices=updated_dropdown_2)
|
@@ -236,5 +286,10 @@ with gr.Blocks() as demo:
|
|
236 |
|
237 |
b1 = gr.Button("Compare Data")
|
238 |
b1.click(fn=process_and_compare, inputs=[file1, sheet, file2, sheet], outputs=result)
|
|
|
|
|
|
|
|
|
|
|
239 |
|
240 |
demo.launch()
|
|
|
41 |
results = fin_model_bis(split_in_sentences(text))
|
42 |
return make_spans(text, results)
|
43 |
|
44 |
+
def extract_and_paragraph(pdf1, pdf2, paragraph):
|
45 |
if not pdf1 or not pdf2:
|
46 |
return [], []
|
47 |
|
|
|
57 |
|
58 |
start_index1, end_index1 = lib.read_pdf.find_text_range(paragraphs_1, start_keyword, end_keywords)
|
59 |
start_index2, end_index2 = lib.read_pdf.find_text_range(paragraphs_2, start_keyword, end_keywords)
|
|
|
60 |
paragraphs_1 = lib.read_pdf.extract_relevant_text(paragraphs_1, start_index1, end_index1)
|
61 |
paragraphs_2 = lib.read_pdf.extract_relevant_text(paragraphs_2, start_index2, end_index2)
|
62 |
+
if paragraph:
|
63 |
+
paragraphs_1 = lib.read_pdf.split_text_into_paragraphs(paragraphs_1, 0)
|
64 |
+
paragraphs_2 = lib.read_pdf.split_text_into_paragraphs(paragraphs_2, 0)
|
65 |
+
|
66 |
return paragraphs_1, paragraphs_2
|
67 |
|
68 |
# Gradio interface setup
|
|
|
151 |
|
152 |
return file_path
|
153 |
|
154 |
+
def find_sentences_with_keywords(text, keywords):
    """Return the sentences of *text* that mention at least one keyword.

    The text is split into sentences after '.', '!' or '?' followed by
    whitespace. A sentence is kept when any keyword occurs in it as a whole
    word, ignoring case.

    Args:
        text: The text to search. Any falsy value (empty string, ``None``, or
            an empty list) yields an empty result.
        keywords: An iterable of keyword strings, or a single keyword given
            as a plain string.

    Returns:
        The matching sentences in the order they appear in *text*. Each
        sentence is listed once, even when it contains several keywords.
    """
    if not text or not keywords:
        return []

    # A bare string is one keyword; iterating it would search for each
    # individual character instead.
    if isinstance(keywords, str):
        keywords = [keywords]

    # Skip empty keywords: an empty alternative would match at every word
    # boundary and therefore select every sentence.
    escaped_keywords = [re.escape(keyword) for keyword in keywords if keyword]
    if not escaped_keywords:
        return []

    # One combined pattern tests all keywords in a single pass, so a sentence
    # that contains several keywords is reported only once.
    alternatives = "|".join(escaped_keywords)
    keyword_pattern = re.compile(rf'\b(?:{alternatives})\b', re.IGNORECASE)

    # Split after sentence-ending punctuation while keeping it attached to
    # the sentence it terminates.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    return [sentence for sentence in sentences if keyword_pattern.search(sentence)]
|
169 |
+
|
170 |
+
|
171 |
+
# Main function to process both PDFs based on the Excel file names and the sheet name
|
172 |
+
def process_pdfs(file1, file2, sheet):
    """Collect the sentences in two PDFs that relate to the selected sheet.

    The PDF paths are derived from the Excel paths by replacing ".xlsx" with
    ".pdf". The text of both PDFs is obtained from ``extract_and_paragraph``
    and searched with ``find_sentences_with_keywords``.

    Args:
        file1: Path of the first Excel file; may be empty/None when nothing
            has been uploaded.
        file2: Path of the second Excel file; may be empty/None.
        sheet: Name of the selected sheet. Known sheets map to a list of
            keywords; any other non-empty name is used as its own keyword.

    Returns:
        A dict with the keys "PDF 1" and "PDF 2". Each value holds the PDF
        path ("File"), the keyword list that was searched ("Keyword") and the
        matching sentences ("Sentences").
    """
    # Keywords to search for, per sheet. Every value is a list so that the
    # search never iterates over the characters of a bare string.
    sheet_keywords = {
        'GDP': ['GDP'],
        'HICP': ['HICP'],
        'RRE prices': ['RRE', 'residential'],
        'Unemployment': ['Unemployment'],
        'CRE prices': ['CRE', 'commercial'],
    }
    keywords = sheet_keywords.get(sheet)
    if keywords is None:
        # Unknown sheet: fall back to the sheet name itself instead of
        # raising KeyError; no sheet selected means nothing to search for.
        keywords = [sheet] if sheet else []

    # Derive PDF file names from the Excel file paths. A missing upload is
    # passed through unchanged so extract_and_paragraph can reject it.
    pdf_file1 = file1.replace(".xlsx", ".pdf") if file1 else file1
    pdf_file2 = file2.replace(".xlsx", ".pdf") if file2 else file2

    # Extract text from both PDFs; False asks for the text without the
    # paragraph-splitting step.
    pdf_text1, pdf_text2 = extract_and_paragraph(pdf_file1, pdf_file2, False)

    # extract_and_paragraph returns empty lists when a PDF is missing; there
    # is nothing to search in that case.
    matched_sentences1 = find_sentences_with_keywords(pdf_text1, keywords) if pdf_text1 else []
    matched_sentences2 = find_sentences_with_keywords(pdf_text2, keywords) if pdf_text2 else []

    # Format the results for output.
    return {
        "PDF 1": {
            "File": pdf_file1,
            "Keyword": keywords,
            "Sentences": matched_sentences1,
        },
        "PDF 2": {
            "File": pdf_file2,
            "Keyword": keywords,
            "Sentences": matched_sentences2,
        },
    }
|
205 |
+
|
206 |
stored_paragraphs_1 = []
|
207 |
stored_paragraphs_2 = []
|
208 |
|
|
|
223 |
|
224 |
def update_paragraphs(pdf1, pdf2):
    """Re-extract the paragraphs of both PDFs and refresh the two dropdowns.

    The extracted paragraphs are stored in the module-level
    ``stored_paragraphs_1`` / ``stored_paragraphs_2``. Returns one
    ``gr.update`` per dropdown carrying the new list of choices.
    """
    global stored_paragraphs_1, stored_paragraphs_2

    def _dropdown_labels(paragraphs):
        # Label each paragraph with its 1-based position and its first 100
        # characters.
        return [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(paragraphs)]

    extracted = extract_and_paragraph(pdf1, pdf2, True)
    stored_paragraphs_1, stored_paragraphs_2 = extracted

    choices_1 = _dropdown_labels(stored_paragraphs_1)
    choices_2 = _dropdown_labels(stored_paragraphs_2)
    return gr.update(choices=choices_1), gr.update(choices=choices_2)
|
|
|
286 |
|
287 |
b1 = gr.Button("Compare Data")
|
288 |
b1.click(fn=process_and_compare, inputs=[file1, sheet, file2, sheet], outputs=result)
|
289 |
+
with gr.Row():
|
290 |
+
with gr.Column():
|
291 |
+
result = gr.JSON(label="Comparison Result")
|
292 |
+
b2 = gr.Button("Extract text information")
|
293 |
+
b2.click(fn=process_pdfs, inputs=[file1, file2, sheet], outputs=result)
|
294 |
|
295 |
demo.launch()
|