File size: 6,670 Bytes
08b59ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import os
import gradio as gr
from transformers import pipeline
import spacy
import lib.read_pdf
# Initialize spaCy model
# Loaded once at import time; split_in_sentences reuses this pipeline for every call.
nlp = spacy.load('en_core_web_sm')
# NOTE(review): presumably the loaded model already assigns sentence boundaries
# on its own - confirm that the additional 'sentencizer' component is required.
nlp.add_pipe('sentencizer')

def split_in_sentences(text):
    """Split *text* into sentences with the module-level spaCy pipeline.

    Returns:
        A list holding the string form of every sentence span found in
        *text*, with leading and trailing whitespace removed.
    """
    sentences = []
    for span in nlp(text).sents:
        sentences.append(str(span).strip())
    return sentences

def make_spans(text, results):
    """Pair each sentence of *text* with the label of the matching result.

    Args:
        text: The text to segment with split_in_sentences.
        results: Iterable of mappings that each carry a 'label' key, expected
            to be in the same order as the sentences of *text*.

    Returns:
        A list of (sentence, label) tuples. Pairing stops at the shorter of
        the two sequences.
    """
    # Collect the labels first so a malformed result fails before any
    # segmentation work is done.
    labels = []
    for entry in results:
        labels.append(entry['label'])
    sentences = split_in_sentences(text)
    return list(zip(sentences, labels))

# Initialize pipelines
# Both pipelines are constructed once at import time and shared by the
# helper functions below.
summarizer = pipeline(
    "summarization",
    model="human-centered-summarization/financial-summarization-pegasus",
)
fin_model = pipeline(
    "sentiment-analysis",
    model='yiyanghkust/finbert-tone',
    tokenizer='yiyanghkust/finbert-tone',
)

def summarize_text(text):
    """Return the summary produced by the module-level summarizer for *text*.

    Args:
        text: The text to summarize. May be empty or None.

    Returns:
        The 'summary_text' of the first summarizer result, or an empty string
        when *text* is empty, None, or only whitespace (the model is not
        invoked in that case).
    """
    if not text or not text.strip():
        # Nothing to summarize; avoid running the model on empty input.
        return ""
    # truncation=True clips over-long input to the model's maximum length
    # instead of letting the model fail on it further down the line.
    resp = summarizer(text, truncation=True)
    return resp[0]['summary_text']

def text_to_sentiment(text):
    """Classify the financial tone of *text* as a whole.

    Args:
        text: The text to classify with the module-level fin_model pipeline.

    Returns:
        The 'label' of the first classification result.
    """
    # Whole paragraphs are passed in here, so the input can exceed the
    # classifier's input limit; truncate rather than fail on long text.
    # NOTE(review): max_length=512 assumes the FinBERT checkpoint's 512-token
    # position limit - confirm against the model config.
    prediction = fin_model(text, truncation=True, max_length=512)
    return prediction[0]["label"]

def fin_ext(text):
    """Classify the financial tone of every sentence in *text*.

    Args:
        text: The text to analyze. May be empty or None.

    Returns:
        A list of (sentence, label) tuples in sentence order, or an empty
        list when *text* is empty, None, or only whitespace.
    """
    if not text or not text.strip():
        # No sentences to classify; skip both spaCy and the classifier.
        return []
    # Segment once and reuse the result for both classification and pairing,
    # instead of running the spaCy pipeline a second time just to rebuild the
    # same sentence list.
    sentences = split_in_sentences(text)
    if not sentences:
        return []
    # NOTE(review): max_length=512 assumes the FinBERT checkpoint's 512-token
    # position limit - confirm against the model config.
    results = fin_model(sentences, truncation=True, max_length=512)
    return [(sentence, result['label']) for sentence, result in zip(sentences, results)]

def extract_and_summarize(pdf1, pdf2):
    """Extract the paragraphs of the "Main risks to" section from two PDFs.

    Despite the name, no summarization happens here: the function only
    locates and splits text using the lib.read_pdf helpers.

    Args:
        pdf1: File name of the first PDF, resolved relative to PDF_FOLDER.
        pdf2: File name of the second PDF, resolved relative to PDF_FOLDER.

    Returns:
        A 2-tuple (paragraphs_of_pdf1, paragraphs_of_pdf2). Both are empty
        lists when either file name is missing.
    """
    if not (pdf1 and pdf2):
        return [], []

    sources = [os.path.join(PDF_FOLDER, name) for name in (pdf1, pdf2)]

    # Extract and format paragraphs
    texts = [lib.read_pdf.extract_and_format_paragraphs(path) for path in sources]

    section_start = "Main risks to"
    section_ends = [
        "4. Appendix",
        "Annex:",
        "4. Annex",
        "Detailed tables",
        "ACKNOWLEDGEMENTS",
        "STATISTICAL ANNEX",
        "PROSPECTS BY MEMBER STATES",
    ]

    # Locate the section boundaries in each document.
    bounds = []
    for text in texts:
        first, last = lib.read_pdf.find_text_range(text, section_start, section_ends)
        bounds.append((first, last))

    # Keep only the text between the located boundaries.
    sections = []
    for text, (first, last) in zip(texts, bounds):
        sections.append(lib.read_pdf.extract_relevant_text(text, first, last))

    # Split each remaining section into paragraphs.
    split_sections = [
        lib.read_pdf.split_text_into_paragraphs(section, 0) for section in sections
    ]

    return split_sections[0], split_sections[1]

# Gradio interface setup
# Relative folder that is listed to populate the PDF dropdowns and that
# extract_and_summarize joins with the selected file names.
PDF_FOLDER = "data"

def get_pdf_files(folder):
    """Return the names of the PDF files directly inside *folder*.

    Args:
        folder: Path of the directory to scan (not recursive).

    Returns:
        A sorted list of file names whose extension is '.pdf' in any letter
        case. Directories are skipped even if their name ends in '.pdf'.

    Raises:
        OSError: If *folder* cannot be listed (for example, it does not exist).
    """
    # os.listdir returns entries in arbitrary order; sort so the dropdown
    # choices are stable between runs and platforms.
    return sorted(
        entry
        for entry in os.listdir(folder)
        if entry.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(folder, entry))
    )

# Paragraphs most recently extracted from PDF 1 and PDF 2. update_paragraphs
# rebinds these names and the process_paragraph_* handlers index into them.
# NOTE(review): this is module-level state, so it is presumably shared by every
# session served by this process - confirm whether per-session state is needed.
stored_paragraphs_1 = []
stored_paragraphs_2 = []

def _paragraph_labels(paragraphs):
    """Build the dropdown labels ("Paragraph <n>: <first 100 chars>...") for *paragraphs*."""
    return [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(paragraphs)]


def _paragraph_from_choice(choice, paragraphs):
    """Return the paragraph a dropdown label refers to, or None if it cannot be resolved.

    The label format is the one produced by _paragraph_labels, where the
    number is the 1-based position of the paragraph in *paragraphs*.
    """
    if not isinstance(choice, str):
        # Nothing is selected yet; calling .split on None used to raise an
        # AttributeError that the handlers did not catch.
        return None
    try:
        index = int(choice.split(':')[0].replace('Paragraph ', '')) - 1
    except ValueError:
        return None
    # Check the range explicitly: a negative index would otherwise silently
    # wrap around and return a paragraph from the end of the list.
    if 0 <= index < len(paragraphs):
        return paragraphs[index]
    return None


def _analyze_choice(choice, paragraphs, analyze, fallback):
    """Run *analyze* on the selected paragraph, or return *fallback* for an invalid selection.

    Only the requested analysis is executed, so one button click triggers one
    model instead of all three. Errors raised by *analyze* itself propagate to
    Gradio rather than being reported as an invalid selection.
    """
    paragraph = _paragraph_from_choice(choice, paragraphs)
    if paragraph is None:
        return fallback
    return analyze(paragraph)


def _process_paragraph(choice, paragraphs):
    """Return (paragraph, summary, sentiment, tone spans) for the selected paragraph.

    For an invalid selection the tuple is
    ("Invalid selection", "Error", "Error", []).
    """
    paragraph = _paragraph_from_choice(choice, paragraphs)
    if paragraph is None:
        return "Invalid selection", "Error", "Error", []
    return (
        paragraph,
        summarize_text(paragraph),
        text_to_sentiment(paragraph),
        fin_ext(paragraph),
    )


with gr.Blocks() as demo:
    gr.Markdown("## Financial Report Paragraph Selection and Analysis")

    with gr.Row():
        # Upload PDFs
        with gr.Column():
            # List the folder once and reuse the result for both dropdowns.
            _pdf_choices = get_pdf_files(PDF_FOLDER)
            pdf1 = gr.Dropdown(choices=_pdf_choices, label="Select PDF 1")
            pdf2 = gr.Dropdown(choices=_pdf_choices, label="Select PDF 2")

        with gr.Column():
            b1 = gr.Button("Extract and Display Paragraphs")
            paragraph_1_dropdown = gr.Dropdown(label="Select Paragraph from PDF 1")
            paragraph_2_dropdown = gr.Dropdown(label="Select Paragraph from PDF 2")

            def update_paragraphs(pdf1, pdf2):
                """Extract paragraphs from both PDFs and refresh the two paragraph dropdowns."""
                global stored_paragraphs_1, stored_paragraphs_2
                stored_paragraphs_1, stored_paragraphs_2 = extract_and_summarize(pdf1, pdf2)
                # gr.update is the component-agnostic update helper. value=None
                # clears any previous selection so a stale label cannot be
                # resolved against the newly loaded paragraphs.
                updated_dropdown_1 = gr.update(
                    choices=_paragraph_labels(stored_paragraphs_1),
                    value=None,
                    label="Select Paragraph from PDF 1",
                )
                updated_dropdown_2 = gr.update(
                    choices=_paragraph_labels(stored_paragraphs_2),
                    value=None,
                    label="Select Paragraph from PDF 2",
                )
                return updated_dropdown_1, updated_dropdown_2

            b1.click(fn=update_paragraphs, inputs=[pdf1, pdf2], outputs=[paragraph_1_dropdown, paragraph_2_dropdown])

    with gr.Row():
        # Process the selected paragraph from PDF 1
        with gr.Column():
            selected_paragraph_1 = gr.Textbox(label="Selected Paragraph 1 Content")
            summarize_btn1 = gr.Button("Summarize Text from PDF 1")
            sentiment_btn1 = gr.Button("Classify Financial Tone from PDF 1")
            fin_spans_1 = gr.HighlightedText(label="Financial Tone Analysis for PDF 1")

            def process_paragraph_1(paragraph):
                """Run every analysis on the selected paragraph of PDF 1 (kept for callers that want all results)."""
                return _process_paragraph(paragraph, stored_paragraphs_1)

            # The lambdas read stored_paragraphs_1 at click time, so they see
            # the list most recently stored by update_paragraphs.
            # NOTE(review): summary and tone label are both written into the
            # "Selected Paragraph 1 Content" textbox, as in the original wiring.
            summarize_btn1.click(
                fn=lambda p: _analyze_choice(p, stored_paragraphs_1, summarize_text, "Error"),
                inputs=paragraph_1_dropdown,
                outputs=selected_paragraph_1,
            )
            sentiment_btn1.click(
                fn=lambda p: _analyze_choice(p, stored_paragraphs_1, text_to_sentiment, "Error"),
                inputs=paragraph_1_dropdown,
                outputs=selected_paragraph_1,
            )
            b5 = gr.Button("Analyze Financial Tone and FLS")
            b5.click(
                fn=lambda p: _analyze_choice(p, stored_paragraphs_1, fin_ext, []),
                inputs=paragraph_1_dropdown,
                outputs=fin_spans_1,
            )

    with gr.Row():
        # Process the selected paragraph from PDF 2
        with gr.Column():
            selected_paragraph_2 = gr.Textbox(label="Selected Paragraph 2 Content")
            summarize_btn2 = gr.Button("Summarize Text from PDF 2")
            sentiment_btn2 = gr.Button("Classify Financial Tone from PDF 2")
            fin_spans_2 = gr.HighlightedText(label="Financial Tone Analysis for PDF 2")

            def process_paragraph_2(paragraph):
                """Run every analysis on the selected paragraph of PDF 2 (kept for callers that want all results)."""
                return _process_paragraph(paragraph, stored_paragraphs_2)

            summarize_btn2.click(
                fn=lambda p: _analyze_choice(p, stored_paragraphs_2, summarize_text, "Error"),
                inputs=paragraph_2_dropdown,
                outputs=selected_paragraph_2,
            )
            sentiment_btn2.click(
                fn=lambda p: _analyze_choice(p, stored_paragraphs_2, text_to_sentiment, "Error"),
                inputs=paragraph_2_dropdown,
                outputs=selected_paragraph_2,
            )
            b6 = gr.Button("Analyze Financial Tone and FLS")
            b6.click(
                fn=lambda p: _analyze_choice(p, stored_paragraphs_2, fin_ext, []),
                inputs=paragraph_2_dropdown,
                outputs=fin_spans_2,
            )

demo.launch()