Cachoups committed on
Commit
9c2be66
·
verified ·
1 Parent(s): c5caa40

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -8
app.py CHANGED
@@ -41,7 +41,7 @@ def fin_ext_bis(text):
41
  results = fin_model_bis(split_in_sentences(text))
42
  return make_spans(text, results)
43
 
44
- def extract_and_summarize(pdf1, pdf2):
45
  if not pdf1 or not pdf2:
46
  return [], []
47
 
@@ -57,13 +57,12 @@ def extract_and_summarize(pdf1, pdf2):
57
 
58
  start_index1, end_index1 = lib.read_pdf.find_text_range(paragraphs_1, start_keyword, end_keywords)
59
  start_index2, end_index2 = lib.read_pdf.find_text_range(paragraphs_2, start_keyword, end_keywords)
60
-
61
  paragraphs_1 = lib.read_pdf.extract_relevant_text(paragraphs_1, start_index1, end_index1)
62
  paragraphs_2 = lib.read_pdf.extract_relevant_text(paragraphs_2, start_index2, end_index2)
63
-
64
- paragraphs_1 = lib.read_pdf.split_text_into_paragraphs(paragraphs_1, 0)
65
- paragraphs_2 = lib.read_pdf.split_text_into_paragraphs(paragraphs_2, 0)
66
-
67
  return paragraphs_1, paragraphs_2
68
 
69
  # Gradio interface setup
@@ -152,7 +151,58 @@ def process_and_compare(file1, sheet1, file2, sheet2):
152
 
153
  return file_path
154
 
155
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  stored_paragraphs_1 = []
157
  stored_paragraphs_2 = []
158
 
@@ -173,7 +223,7 @@ with gr.Blocks() as demo:
173
 
174
  def update_paragraphs(pdf1, pdf2):
175
  global stored_paragraphs_1, stored_paragraphs_2
176
- stored_paragraphs_1, stored_paragraphs_2 = extract_and_summarize(pdf1, pdf2)
177
  updated_dropdown_1 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_1)]
178
  updated_dropdown_2 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_2)]
179
  return gr.update(choices=updated_dropdown_1), gr.update(choices=updated_dropdown_2)
@@ -236,5 +286,10 @@ with gr.Blocks() as demo:
236
 
237
  b1 = gr.Button("Compare Data")
238
  b1.click(fn=process_and_compare, inputs=[file1, sheet, file2, sheet], outputs=result)
 
 
 
 
 
239
 
240
  demo.launch()
 
41
  results = fin_model_bis(split_in_sentences(text))
42
  return make_spans(text, results)
43
 
44
+ def extract_and_paragraph(pdf1, pdf2, paragraph):
45
  if not pdf1 or not pdf2:
46
  return [], []
47
 
 
57
 
58
  start_index1, end_index1 = lib.read_pdf.find_text_range(paragraphs_1, start_keyword, end_keywords)
59
  start_index2, end_index2 = lib.read_pdf.find_text_range(paragraphs_2, start_keyword, end_keywords)
 
60
  paragraphs_1 = lib.read_pdf.extract_relevant_text(paragraphs_1, start_index1, end_index1)
61
  paragraphs_2 = lib.read_pdf.extract_relevant_text(paragraphs_2, start_index2, end_index2)
62
+ if paragraph:
63
+ paragraphs_1 = lib.read_pdf.split_text_into_paragraphs(paragraphs_1, 0)
64
+ paragraphs_2 = lib.read_pdf.split_text_into_paragraphs(paragraphs_2, 0)
65
+
66
  return paragraphs_1, paragraphs_2
67
 
68
  # Gradio interface setup
 
151
 
152
  return file_path
153
 
154
def find_sentences_with_keywords(text, keywords):
    """Return the sentences of ``text`` that mention any of ``keywords``.

    The text is split into sentences after ``.``, ``!`` or ``?`` followed by
    whitespace. A sentence is kept when at least one keyword occurs in it as
    a whole word, ignoring case.

    Args:
        text: The text to search. Empty or falsy input yields an empty list.
        keywords: An iterable of keyword strings, or a single keyword string
            (treated as a one-element list rather than iterated per
            character). Empty keywords are ignored.

    Returns:
        The matching sentences in document order. Each sentence appears at
        most once per occurrence in the text, even when it contains several
        of the keywords.
    """
    if not text or not keywords:
        return []

    # A bare string would otherwise be iterated letter by letter and match
    # any sentence containing a stand-alone "e", "n", ... as a word.
    if isinstance(keywords, str):
        keywords = [keywords]

    escaped_terms = [re.escape(keyword) for keyword in keywords if keyword]
    if not escaped_terms:
        return []

    # One alternation compiled once: a sentence is tested a single time, so
    # it cannot be reported twice when it contains two of the keywords.
    alternation = "|".join(escaped_terms)
    keyword_pattern = re.compile(rf'\b(?:{alternation})\b', re.IGNORECASE)

    # Split text into sentences on whitespace that follows sentence-ending
    # punctuation.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    return [sentence for sentence in sentences if keyword_pattern.search(sentence)]
169
+
170
+
171
+ # Main function to process both PDFs based on the Excel file names and the sheet name
172
def process_pdfs(file1, file2, sheet):
    """Collect the sentences of two PDF reports that mention a sheet's topic.

    The PDF paths are derived from the Excel paths by replacing ``.xlsx``
    with ``.pdf``. The text of both PDFs is obtained through
    ``extract_and_paragraph`` with paragraph splitting disabled, then searched
    for the keywords associated with ``sheet``.

    Args:
        file1: Path of the first Excel file.
        file2: Path of the second Excel file.
        sheet: Sheet name; must be one of ``'GDP'``, ``'HICP'``,
            ``'RRE prices'``, ``'Unemployment'`` or ``'CRE prices'``.

    Returns:
        A dict with the keys ``"PDF 1"`` and ``"PDF 2"``. Each maps to a dict
        holding the derived PDF path (``"File"``), the list of keywords used
        (``"Keyword"``) and the matching sentences (``"Sentences"``).

    Raises:
        KeyError: If ``sheet`` is not one of the known sheet names.
    """
    # Derive PDF file names from the Excel file paths
    pdf_file1 = file1.replace(".xlsx", ".pdf")
    pdf_file2 = file2.replace(".xlsx", ".pdf")

    # Every value is a list of keywords. A bare string here would be iterated
    # character by character by the sentence search.
    sheet_keywords = {
        'GDP': ['GDP'],
        'HICP': ['HICP'],
        'RRE prices': ['RRE', 'residential'],
        'Unemployment': ['Unemployment'],
        'CRE prices': ['CRE', 'commercial'],
    }
    keywords = sheet_keywords[sheet]

    # Extract text from both PDFs (False: skip the paragraph-splitting step)
    pdf_text1, pdf_text2 = extract_and_paragraph(pdf_file1, pdf_file2, False)

    # Find sentences that mention the keywords associated with the sheet
    matched_sentences1 = find_sentences_with_keywords(pdf_text1, keywords)
    matched_sentences2 = find_sentences_with_keywords(pdf_text2, keywords)

    # Format the results for output
    result = {
        "PDF 1": {
            "File": pdf_file1,
            "Keyword": keywords,
            "Sentences": matched_sentences1,
        },
        "PDF 2": {
            "File": pdf_file2,
            "Keyword": keywords,
            "Sentences": matched_sentences2,
        },
    }

    return result
205
+
206
  stored_paragraphs_1 = []
207
  stored_paragraphs_2 = []
208
 
 
223
 
224
  def update_paragraphs(pdf1, pdf2):
225
  global stored_paragraphs_1, stored_paragraphs_2
226
+ stored_paragraphs_1, stored_paragraphs_2 = extract_and_paragraph(pdf1, pdf2, True)
227
  updated_dropdown_1 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_1)]
228
  updated_dropdown_2 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_2)]
229
  return gr.update(choices=updated_dropdown_1), gr.update(choices=updated_dropdown_2)
 
286
 
287
  b1 = gr.Button("Compare Data")
288
  b1.click(fn=process_and_compare, inputs=[file1, sheet, file2, sheet], outputs=result)
289
+ with gr.Row():
290
+ with gr.Column():
291
+ result = gr.JSON(label="Comparison Result")
292
+ b2 = gr.Button("Extract text information")
293
+ b2.click(fn=process_pdfs, inputs=[file1, file2, sheet], outputs=result)
294
 
295
  demo.launch()