Spaces:

Cachoups
/

FinanceReport

Sleeping

App Files Files Community

Cachoups committed on Sep 13, 2024

Commit

9c2be66

verified ·

1 Parent(s): c5caa40

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -8

app.py CHANGED Viewed

@@ -41,7 +41,7 @@ def fin_ext_bis(text):
     results = fin_model_bis(split_in_sentences(text))
     return make_spans(text, results)
-def extract_and_summarize(pdf1, pdf2):
     if not pdf1 or not pdf2:
         return [], []
@@ -57,13 +57,12 @@ def extract_and_summarize(pdf1, pdf2):
     start_index1, end_index1 = lib.read_pdf.find_text_range(paragraphs_1, start_keyword, end_keywords)
     start_index2, end_index2 = lib.read_pdf.find_text_range(paragraphs_2, start_keyword, end_keywords)
     paragraphs_1 = lib.read_pdf.extract_relevant_text(paragraphs_1, start_index1, end_index1)
     paragraphs_2 = lib.read_pdf.extract_relevant_text(paragraphs_2, start_index2, end_index2)
-    paragraphs_1 = lib.read_pdf.split_text_into_paragraphs(paragraphs_1, 0)
-    paragraphs_2 = lib.read_pdf.split_text_into_paragraphs(paragraphs_2, 0)
     return paragraphs_1, paragraphs_2
 # Gradio interface setup
@@ -152,7 +151,58 @@ def process_and_compare(file1, sheet1, file2, sheet2):
     return file_path
 stored_paragraphs_1 = []
 stored_paragraphs_2 = []
@@ -173,7 +223,7 @@ with gr.Blocks() as demo:
                 def update_paragraphs(pdf1, pdf2):
                     global stored_paragraphs_1, stored_paragraphs_2
-                    stored_paragraphs_1, stored_paragraphs_2 = extract_and_summarize(pdf1, pdf2)
                     updated_dropdown_1 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_1)]
                     updated_dropdown_2 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_2)]
                     return gr.update(choices=updated_dropdown_1), gr.update(choices=updated_dropdown_2)
@@ -236,5 +286,10 @@ with gr.Blocks() as demo:
         b1 = gr.Button("Compare Data")
         b1.click(fn=process_and_compare, inputs=[file1, sheet, file2, sheet], outputs=result)
 demo.launch()

     results = fin_model_bis(split_in_sentences(text))
     return make_spans(text, results)
+def extract_and_paragraph(pdf1, pdf2, paragraph):
     if not pdf1 or not pdf2:
         return [], []
     start_index1, end_index1 = lib.read_pdf.find_text_range(paragraphs_1, start_keyword, end_keywords)
     start_index2, end_index2 = lib.read_pdf.find_text_range(paragraphs_2, start_keyword, end_keywords)
     paragraphs_1 = lib.read_pdf.extract_relevant_text(paragraphs_1, start_index1, end_index1)
     paragraphs_2 = lib.read_pdf.extract_relevant_text(paragraphs_2, start_index2, end_index2)
+    if paragraph:
+        paragraphs_1 = lib.read_pdf.split_text_into_paragraphs(paragraphs_1, 0)
+        paragraphs_2 = lib.read_pdf.split_text_into_paragraphs(paragraphs_2, 0)
     return paragraphs_1, paragraphs_2
 # Gradio interface setup
     return file_path
def find_sentences_with_keywords(text, keywords):
    """Return the sentences of *text* that mention any of *keywords*.

    The text is split into sentences after '.', '!' or '?' followed by
    whitespace. A sentence is kept when it contains at least one keyword as
    a whole word (case-insensitive).

    Args:
        text: The text to search. Falsy input (``""``, ``None``, ``[]``)
            yields an empty result.
        keywords: An iterable of keyword strings, or a single keyword string.
            A bare string is treated as one keyword rather than being
            iterated character by character. Empty keywords are ignored.

    Returns:
        A list of matching sentences in document order. Each sentence
        occurrence appears once, even when it matches several keywords.
    """
    if not text:
        return []

    # A plain string is iterable, so without this guard 'Unemployment' would
    # be searched for as the single letters 'U', 'n', 'e', ...
    if isinstance(keywords, str):
        keywords = [keywords]

    escaped_terms = [re.escape(keyword) for keyword in keywords if keyword]
    if not escaped_terms:
        return []

    # One alternation pattern instead of one pass per keyword: a sentence
    # that matches several keywords is reported only once.
    alternation = "|".join(escaped_terms)
    keyword_pattern = re.compile(rf'\b(?:{alternation})\b', re.IGNORECASE)

    # Split after sentence-ending punctuation, keeping the punctuation.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return [sentence for sentence in sentences if keyword_pattern.search(sentence)]
+# Main function to process both PDFs based on the Excel file names and the sheet name
def process_pdfs(file1, file2, sheet):
    """Collect the sentences of two PDFs that mention the selected sheet's topic.

    The PDF paths are derived from the Excel paths by replacing ".xlsx" with
    ".pdf". Text is obtained through ``extract_and_paragraph`` (without
    paragraph splitting) and filtered with ``find_sentences_with_keywords``.

    Args:
        file1: Path of the first Excel file.
        file2: Path of the second Excel file.
        sheet: Sheet name used to select the search keywords. A sheet that
            has no entry in the keyword table is searched for by its own name.

    Returns:
        A dict with the keys "PDF 1" and "PDF 2". Each maps to a dict holding
        the PDF path ("File"), the keyword list ("Keyword") and the matching
        sentences ("Sentences").
    """
    # Keywords searched for each known sheet. Every value is a list so the
    # lookup result can be passed on as-is.
    sheet_keywords = {
        'GDP': ['GDP'],
        'HICP': ['HICP'],
        'RRE prices': ['RRE', 'residential'],
        'Unemployment': ['Unemployment'],
        'CRE prices': ['CRE', 'commercial'],
    }

    def _as_text(extracted):
        # extract_and_paragraph returns [] when a PDF path is falsy, and a
        # list cannot be fed to a regex split.
        # NOTE(review): joining list items assumes they are text fragments;
        # confirm against lib.read_pdf.extract_relevant_text.
        if isinstance(extracted, str):
            return extracted
        return " ".join(str(part) for part in (extracted or []))

    # Derive PDF file names from the Excel file paths
    pdf_file1 = file1.replace(".xlsx", ".pdf")
    pdf_file2 = file2.replace(".xlsx", ".pdf")

    # Unknown sheets fall back to their own name instead of raising KeyError.
    keywords = sheet_keywords.get(sheet) or ([sheet] if sheet else [])

    # Extract text from both PDFs
    pdf_text1, pdf_text2 = extract_and_paragraph(pdf_file1, pdf_file2, False)

    # Find sentences that match the keywords of the selected sheet
    matched_sentences1 = find_sentences_with_keywords(_as_text(pdf_text1), keywords)
    matched_sentences2 = find_sentences_with_keywords(_as_text(pdf_text2), keywords)

    # Format the results for output
    return {
        "PDF 1": {
            "File": pdf_file1,
            "Keyword": keywords,
            "Sentences": matched_sentences1,
        },
        "PDF 2": {
            "File": pdf_file2,
            "Keyword": keywords,
            "Sentences": matched_sentences2,
        },
    }
 stored_paragraphs_1 = []
 stored_paragraphs_2 = []
                 def update_paragraphs(pdf1, pdf2):
                     global stored_paragraphs_1, stored_paragraphs_2
+                    stored_paragraphs_1, stored_paragraphs_2 = extract_and_paragraph(pdf1, pdf2, True)
                     updated_dropdown_1 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_1)]
                     updated_dropdown_2 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_2)]
                     return gr.update(choices=updated_dropdown_1), gr.update(choices=updated_dropdown_2)
         b1 = gr.Button("Compare Data")
         b1.click(fn=process_and_compare, inputs=[file1, sheet, file2, sheet], outputs=result)
+        with gr.Row():
+            with gr.Column():
+                result = gr.JSON(label="Comparison Result")
+        b2 = gr.Button("Extract text information")
+        b2.click(fn=process_pdfs, inputs=[file1, file2, sheet], outputs=result)
 demo.launch()