Spaces:

Cachoups
/

FinanceReport

Sleeping

App Files Files Community

FinanceReport / app.py

Cachoups

Update app.py

c6b10c9 verified 10 months ago

raw

history blame

16.9 kB

	import os
	import gradio as gr
	from transformers import pipeline
	import spacy
	import lib.read_pdf
	import pandas as pd
	import re
	import matplotlib.pyplot as plt
	import matplotlib.patches as patches
	import io
	# Initialize spaCy model
	# Loaded once at import time; `split_in_sentences` reads this module-level `nlp`.
	nlp = spacy.load('en_core_web_sm')
	# Add the 'sentencizer' component to the pipeline; sentence boundaries are
	# consumed through `doc.sents` in `split_in_sentences`.
	# NOTE(review): presumably overlaps with the loaded model's own sentence
	# segmentation — confirm whether this extra component is needed.
	nlp.add_pipe('sentencizer')

	def split_in_sentences(text):
	    """Split ``text`` into sentences using the module-level spaCy pipeline.

	    Returns:
	        A list with one string per sentence found by ``nlp``, each stripped
	        of leading and trailing whitespace.
	    """
	    sentences = []
	    for span in nlp(text).sents:
	        sentences.append(str(span).strip())
	    return sentences

	def make_spans(text, results):
	    """Pair every sentence of ``text`` with the label of the matching result.

	    Args:
	        text: Text that is re-split with ``split_in_sentences``.
	        results: Iterable of mappings that each carry a ``'label'`` key, in
	            the same order as the sentences.

	    Returns:
	        A list of ``(sentence, label)`` tuples. ``zip`` stops at the shorter
	        of the two sequences.
	    """
	    labels = []
	    for entry in results:
	        labels.append(entry['label'])
	    return list(zip(split_in_sentences(text), labels))

	# Initialize pipelines
	# All four Hugging Face pipelines are built once at import time and shared by
	# the helper functions below.
	# Summarization model used by `summarize_text`.
	summarizer = pipeline("summarization", model="human-centered-summarization/financial-summarization-pegasus")
	# Tone classifier used by `text_to_sentiment` and `fin_ext`.
	fin_model = pipeline("sentiment-analysis", model='yiyanghkust/finbert-tone', tokenizer='yiyanghkust/finbert-tone')
	# Second tone classifier used by `fin_ext_bis`.
	fin_model_bis = pipeline("sentiment-analysis", model='ProsusAI/finbert', tokenizer='ProsusAI/finbert')
	# Text-to-text model used by `generate_text` to describe a table row.
	table_to_text = pipeline('text2text-generation', model='google/flan-t5-large')

	def summarize_text(text):
	    """Return the ``'summary_text'`` of the first result produced by ``summarizer``."""
	    return summarizer(text)[0]['summary_text']

	def text_to_sentiment(text):
	    """Return the ``"label"`` of the first prediction ``fin_model`` gives for ``text``."""
	    first_prediction = fin_model(text)[0]
	    return first_prediction["label"]

	def fin_ext(text):
	    """Classify each sentence of ``text`` with ``fin_model``.

	    Returns:
	        The ``(sentence, label)`` pairs built by ``make_spans``.
	    """
	    sentences = split_in_sentences(text)
	    return make_spans(text, fin_model(sentences))
	def fin_ext_bis(text):
	    """Classify each sentence of ``text`` with ``fin_model_bis``.

	    Returns:
	        The ``(sentence, label)`` pairs built by ``make_spans``.
	    """
	    sentences = split_in_sentences(text)
	    return make_spans(text, fin_model_bis(sentences))

	def extract_and_paragraph(pdf1, pdf2, paragraph):
	    """Extract the text following "Main risks to" from two PDFs in ``PDF_FOLDER``.

	    Args:
	        pdf1: File name of the first PDF, relative to ``PDF_FOLDER``.
	        pdf2: File name of the second PDF, relative to ``PDF_FOLDER``.
	        paragraph: When truthy, each extracted text is additionally passed
	            through ``lib.read_pdf.split_text_into_paragraphs``.

	    Returns:
	        A 2-tuple with the result for ``pdf1`` and ``pdf2``. ``([], [])`` is
	        returned when either file name is missing.
	    """
	    if not (pdf1 and pdf2):
	        return [], []

	    # Section delimiters handed to `lib.read_pdf.find_text_range`.
	    start_keyword = "Main risks to"
	    end_keywords = ["4. Appendix", "Annex:", "4. Annex", "Detailed tables", "ACKNOWLEDGEMENTS", "STATISTICAL ANNEX", "PROSPECTS BY MEMBER STATES"]

	    reader = lib.read_pdf
	    pdf_paths = [os.path.join(PDF_FOLDER, name) for name in (pdf1, pdf2)]

	    # Each step handles the first PDF, then the second, before moving on.
	    documents = [reader.extract_and_format_paragraphs(path) for path in pdf_paths]
	    ranges = [reader.find_text_range(document, start_keyword, end_keywords) for document in documents]
	    extracted = [
	        reader.extract_relevant_text(document, start_index, end_index)
	        for document, (start_index, end_index) in zip(documents, ranges)
	    ]
	    if paragraph:
	        extracted = [reader.split_text_into_paragraphs(text, 0) for text in extracted]

	    return extracted[0], extracted[1]

	# Gradio interface setup
	# Folder that the helpers below scan for both the PDF reports and the Excel
	# workbooks offered in the UI dropdowns; file names are joined onto it.
	PDF_FOLDER = "data"

	def get_pdf_files(folder):
	    """List the ``.pdf`` file names found directly inside ``folder``.

	    Args:
	        folder: Directory to scan (not recursive).

	    Returns:
	        The matching file names in sorted order. ``os.listdir`` makes no
	        ordering promise, so sorting keeps the dropdown choices stable
	        between runs and platforms.
	    """
	    return sorted(name for name in os.listdir(folder) if name.endswith('.pdf'))

	def show(name):
	    """Return ``name`` formatted as a string."""
	    return "{}".format(name)

	def get_excel_files(folder):
	    """List the ``.xlsx`` file names found directly inside ``folder``.

	    Args:
	        folder: Directory to scan (not recursive).

	    Returns:
	        The matching file names in sorted order. ``os.listdir`` makes no
	        ordering promise, so sorting keeps the dropdown choices stable
	        between runs and platforms.
	    """
	    return sorted(name for name in os.listdir(folder) if name.endswith('.xlsx'))

	def get_sheet_names(file):
	    """Build a Gradio update listing the sheet names of an Excel workbook.

	    Args:
	        file: Workbook file name, relative to ``PDF_FOLDER``. May be empty or
	            ``None`` when nothing is selected.

	    Returns:
	        ``gr.update(choices=...)`` with the workbook's sheet names, or with an
	        empty list when no file is selected.
	    """
	    if not file:
	        # Nothing selected: clear the choices instead of failing in os.path.join.
	        return gr.update(choices=[])
	    # The context manager closes the workbook handle once the names are read.
	    with pd.ExcelFile(os.path.join(PDF_FOLDER, file)) as workbook:
	        sheet_names = list(workbook.sheet_names)
	    return gr.update(choices=sheet_names)


	def _year_from_file_name(file_name):
	    """Return the first four-digit number in ``file_name`` as an ``int``.

	    Raises:
	        ValueError: If ``file_name`` is empty or holds no four-digit number.
	    """
	    match = re.search(r'(\d{4})', file_name or '')
	    if match is None:
	        raise ValueError(f"Could not find a 4-digit year in file name {file_name!r}.")
	    return int(match.group(1))


	def _load_scenario_sheet(file_name, sheet_name):
	    """Load one sheet of a workbook in ``PDF_FOLDER`` and name its columns.

	    Returns:
	        ``(year, df, adverse_df)`` where ``year`` comes from the file name,
	        ``df`` is the full renamed table and ``adverse_df`` keeps only the
	        country, the three adverse-year columns and the adverse cumulative.

	    Raises:
	        ValueError: If the file name has no year or the sheet does not have
	            the expected number of columns.
	    """
	    year = _year_from_file_name(file_name)

	    # The first spreadsheet column is read as the index and then discarded by
	    # reset_index(drop=True); the first four rows are dropped as well
	    # (presumably title/header rows — confirm against the workbooks).
	    df = pd.read_excel(os.path.join(PDF_FOLDER, file_name), sheet_name=sheet_name, index_col=0)
	    df = df.iloc[4:].reset_index(drop=True)

	    # Column names are derived from the year found in the file name.
	    baseline_cols = [f'Baseline {year + offset}' for offset in range(3)]
	    adverse_cols = [f'Adverse {year + offset}' for offset in range(3)]
	    new_columns = (
	        ['Country', 'Code', f'Historical {year - 1}']
	        + baseline_cols
	        + adverse_cols
	        + ['Adverse Cumulative', 'Adverse Minimum', f'Level Deviation {year + 2}']
	    )

	    if len(df.columns) != len(new_columns):
	        raise ValueError(f"Expected {len(new_columns)} columns, but found {len(df.columns)} columns in the data.")
	    df.columns = new_columns

	    return year, df, df[['Country'] + adverse_cols + ['Adverse Cumulative']]


	def process_and_compare(file1, sheet1, file2, sheet2):
	    """Compare the adverse cumulative growth of two workbooks and plot it.

	    Side effects: rebinds the module-level ``stored_df1`` / ``stored_df2`` to
	    the adverse-scenario slices of the two sheets and writes the chart to
	    ``output/plot.png`` (the folder is created when missing).

	    Args:
	        file1: First workbook file name (must contain a four-digit year).
	        sheet1: Sheet to read from ``file1``; also used in the chart title.
	        file2: Second workbook file name (must contain a four-digit year).
	        sheet2: Sheet to read from ``file2``.

	    Returns:
	        ``(plot_path, update_1, update_2)`` where the updates carry the
	        country choices of the first and the second workbook.

	    Raises:
	        ValueError: If a file name has no year or a sheet has an unexpected
	            number of columns.
	    """
	    global stored_df1, stored_df2
	    year1, df1, stored_df1 = _load_scenario_sheet(file1, sheet1)
	    year2, df2, stored_df2 = _load_scenario_sheet(file2, sheet2)

	    # Only the cumulative column is compared. Fixed suffixes tied to the frame
	    # they really come from keep file2/file1 from being swapped and avoid a
	    # name collision when both files carry the same year.
	    merged_df = pd.merge(
	        df2[['Country', 'Adverse Cumulative']],
	        df1[['Country', 'Adverse Cumulative']],
	        on='Country',
	        suffixes=('_file2', '_file1'),
	    )
	    countries = merged_df['Country'].astype(str)
	    # Coerce before subtracting so non-numeric cells become NaN instead of
	    # raising in the subtraction.
	    cumulative_file2 = pd.to_numeric(merged_df['Adverse Cumulative_file2'], errors='coerce')
	    cumulative_file1 = pd.to_numeric(merged_df['Adverse Cumulative_file1'], errors='coerce')
	    # Matches the chart title: value of {year2} minus value of {year1}.
	    difference = cumulative_file2 - cumulative_file1

	    # One colour per country; the 20-colour palette is cycled explicitly so
	    # bars and legend entries stay aligned beyond 20 countries.
	    palette = plt.get_cmap('tab20').colors
	    bar_colors = [palette[position % len(palette)] for position in range(len(countries))]

	    fig, ax = plt.subplots(figsize=(12, 8))
	    file_path = 'output/plot.png'
	    try:
	        ax.bar(countries, difference, color=bar_colors)

	        handles = [patches.Patch(color=color, label=country) for color, country in zip(bar_colors, countries)]
	        ax.legend(handles=handles, title='Countries', bbox_to_anchor=(1.05, 1), loc='upper left')

	        ax.set_title(f'Histogram of Difference between Adverse cumulative growth of {year2} and {year1} for {sheet1}')
	        ax.set_xlabel('Country')
	        ax.set_ylabel('Difference')
	        ax.tick_params(axis='x', labelrotation=90)

	        os.makedirs(os.path.dirname(file_path), exist_ok=True)
	        fig.savefig(file_path, format='png', bbox_inches='tight')
	    finally:
	        # Always release the figure, even when drawing or saving fails.
	        plt.close(fig)

	    return file_path, gr.update(choices=stored_df1.Country.values.tolist()), gr.update(choices=stored_df2.Country.values.tolist())

	def find_sentences_with_keywords(text, keywords):
	    """Return the sentences of ``text`` that contain any keyword as a whole word.

	    Sentences are split on whitespace that follows ``.``, ``!`` or ``?``.
	    Matching is case-insensitive and uses word boundaries, so ``"GDP"`` does
	    not match ``"GDPs"``.

	    Args:
	        text: Text to search. Anything that is not a non-empty string yields
	            an empty result.
	        keywords: Iterable of keyword strings.

	    Returns:
	        The unique matching sentences in their order of first appearance, so
	        the result is deterministic from run to run.
	    """
	    keywords = list(keywords)
	    if not keywords or not isinstance(text, str) or not text:
	        return []

	    # One alternation matches exactly when any single-keyword pattern would.
	    alternatives = '|'.join(re.escape(keyword) for keyword in keywords)
	    keyword_pattern = re.compile(rf'\b(?:{alternatives})\b', re.IGNORECASE)

	    sentences = re.split(r'(?<=[.!?])\s+', text)
	    # dict.fromkeys removes duplicates while keeping document order.
	    return list(dict.fromkeys(sentence for sentence in sentences if keyword_pattern.search(sentence)))


	def process_pdfs_and_analyze_sentiment(file1, file2, sheet):
	    """Run sentence-level tone analysis on the PDFs that match two Excel files.

	    The PDF names are obtained by replacing ".xlsx" with ".pdf" in the Excel
	    file names. Only sentences mentioning a keyword associated with ``sheet``
	    are analysed.

	    Args:
	        file1: Name of the first Excel file.
	        file2: Name of the second Excel file.
	        sheet: Sheet name used to pick the keyword list; unknown names give
	            an empty keyword list.

	    Returns:
	        A 2-tuple with the ``fin_ext_bis`` result for the first and the
	        second PDF.
	    """
	    keywords_by_sheet = {
	        'GDP': ['GDP'],
	        'HICP': ['HICP'],
	        'RRE prices': ['RRE', 'residential'],
	        'CRE prices': ['CRE', 'commercial'],
	        'Unemployment': ['unemployment']
	    }

	    # Raw (unsplit) text of the relevant section of each PDF.
	    first_pdf, second_pdf = (name.replace(".xlsx", ".pdf") for name in (file1, file2))
	    text1, text2 = extract_and_paragraph(first_pdf, second_pdf, False)

	    wanted = keywords_by_sheet.get(sheet, [])

	    # Keep only the sentences that mention the theme, then classify them.
	    matched = [find_sentences_with_keywords(text, wanted) for text in (text1, text2)]
	    joined = ["\n".join(sentences) for sentences in matched]
	    result_pdf1, result_pdf2 = [fin_ext_bis(block) for block in joined]

	    return result_pdf1, result_pdf2
	#def change_choices(df):
	# return gr.update(choices=df.Country.values.tolist())

	def generate_text(df, country, theme):
	    """Generate a short description of one country's adverse-scenario row.

	    Args:
	        df: DataFrame with a ``'Country'`` column; the first row whose country
	            equals ``country`` is rendered into the prompt.
	        country: Country to look up in ``df``.
	        theme: Indicator name inserted into the prompt text.

	    Returns:
	        The ``'generated_text'`` of the first ``table_to_text`` result.

	    Raises:
	        ValueError: If ``df`` is not a non-empty DataFrame (for example the
	            initial empty placeholder) or ``country`` is not present in it.
	    """
	    if not isinstance(df, pd.DataFrame) or df.empty:
	        raise ValueError("No table data is loaded yet; run the data comparison first.")

	    # Filter the dataframe based on the country
	    matching_rows = df[df['Country'] == country]
	    if matching_rows.empty:
	        raise ValueError(f"Country {country!r} was not found in the loaded table.")

	    # Convert the row to a string format for prompt
	    row_str = matching_rows.iloc[0].to_string(index=True)

	    # One worked example followed by the actual row to describe.
	    prompt = f"""
	Here is an example of how to summarize adverse growth data for a given country with GDP as the theme:

	Example for France (GDP):
	Country: France
	Adverse 2020: -0.427975
	Adverse 2021: -1.987167
	Adverse 2022: -1.195906
	Adverse Cumulative: -3.573762

	The theme is GDP.
	Summary:
	In the adverse scenario, the growth for GDP in France decreased by -0.427975% in 2020, worsened further by -1.987167% in 2021, and slightly improved by -1.195906% in 2022. The cumulative adverse growth is -3.573762%.

	Now, summarize the data for {theme} in {country}:
	{row_str}

	Make sure to highlight changes compared to previous years, include the cumulative result if applicable and use 'increase' or 'decrease' to describe changes.
	"""

	    # Generate the descriptive text using the model
	    # NOTE(review): temperature/top_p presumably have no effect unless
	    # do_sample=True is also passed — confirm the intended decoding mode.
	    result = table_to_text(prompt, max_length=200, temperature=0.7, top_p=0.9)[0]['generated_text']

	    return result
	# Global variable
	# Module-level state shared between the Gradio callbacks.
	# `update_paragraphs` rebinds these two to the paragraphs extracted from the
	# selected PDFs.
	stored_paragraphs_1 = []
	stored_paragraphs_2 = []

	# `process_and_compare` rebinds these two to the adverse-scenario DataFrame
	# slices of the selected sheets; the country-summary buttons read them.
	# They start as empty lists until a comparison has been run.
	# NOTE(review): being module globals, they are presumably shared by every
	# user of the app — confirm this is acceptable.
	stored_df1 = []
	stored_df2 = []

	def _selected_paragraph(choice, paragraphs):
	    """Map a dropdown label back to the paragraph it was built from.

	    ``update_paragraphs`` builds labels of the form
	    ``"Paragraph <n>: <first 100 characters>..."`` where ``<n>`` is 1-based.

	    Returns:
	        The matching element of ``paragraphs``, or ``None`` when nothing is
	        selected, the label is not recognised or the number is out of range.
	    """
	    match = re.match(r'Paragraph (\d+):', choice or '')
	    if match is None:
	        return None
	    position = int(match.group(1)) - 1
	    if 0 <= position < len(paragraphs):
	        return paragraphs[position]
	    return None


	def _analyze_selected_paragraph(choice, paragraphs, analyzer):
	    """Apply ``analyzer`` to the paragraph selected in a dropdown.

	    Raises:
	        gr.Error: If no valid paragraph is selected, so the UI shows a message
	            instead of an unexplained failure.
	    """
	    paragraph = _selected_paragraph(choice, paragraphs)
	    if paragraph is None:
	        raise gr.Error("Please extract the paragraphs and select one first.")
	    return analyzer(paragraph)


	with gr.Blocks() as demo:
	    with gr.Tab("Financial Report Text Analysis"):
	        gr.Markdown("## Financial Report Paragraph Selection and Analysis on adverse macro-economy scenario")

	        with gr.Row():
	            # Upload PDFs
	            with gr.Column():
	                pdf1 = gr.Dropdown(choices=get_pdf_files(PDF_FOLDER), label="Select PDF 1")
	                pdf2 = gr.Dropdown(choices=get_pdf_files(PDF_FOLDER), label="Select PDF 2")

	            with gr.Column():
	                b1 = gr.Button("Extract and Display Paragraphs")
	                paragraph_1_dropdown = gr.Dropdown(label="Select Paragraph from PDF 1")
	                paragraph_2_dropdown = gr.Dropdown(label="Select Paragraph from PDF 2")

	        def update_paragraphs(pdf1, pdf2):
	            """Extract the paragraphs of both PDFs and refresh the two dropdowns."""
	            global stored_paragraphs_1, stored_paragraphs_2
	            stored_paragraphs_1, stored_paragraphs_2 = extract_and_paragraph(pdf1, pdf2, True)
	            updated_dropdown_1 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_1)]
	            updated_dropdown_2 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_2)]
	            # Reset the value so a selection made for previous PDFs cannot linger.
	            return gr.update(choices=updated_dropdown_1, value=None), gr.update(choices=updated_dropdown_2, value=None)

	        b1.click(fn=update_paragraphs, inputs=[pdf1, pdf2], outputs=[paragraph_1_dropdown, paragraph_2_dropdown])

	        # The handlers below look the stored paragraphs up at call time, so
	        # they always see the lists most recently set by `update_paragraphs`.
	        with gr.Row():
	            # Process the selected paragraph from PDF 1
	            with gr.Column():
	                gr.Markdown("### PDF 1 Analysis")
	                selected_paragraph_1 = gr.Textbox(label="Selected Paragraph 1 Content", lines=4)
	                # Show the full paragraph whenever the dropdown selection changes.
	                paragraph_1_dropdown.change(fn=lambda choice: _selected_paragraph(choice, stored_paragraphs_1) or "", inputs=paragraph_1_dropdown, outputs=selected_paragraph_1)
	                summarize_btn1 = gr.Button("Summarize Text from PDF 1")
	                summary_textbox_1 = gr.Textbox(label="Summary for PDF 1", lines=2)
	                summarize_btn1.click(fn=lambda choice: _analyze_selected_paragraph(choice, stored_paragraphs_1, summarize_text), inputs=paragraph_1_dropdown, outputs=summary_textbox_1)
	                sentiment_btn1 = gr.Button("Classify Financial Tone from PDF 1")
	                sentiment_textbox_1 = gr.Textbox(label="Classification for PDF 1", lines=1)
	                sentiment_btn1.click(fn=lambda choice: _analyze_selected_paragraph(choice, stored_paragraphs_1, text_to_sentiment), inputs=paragraph_1_dropdown, outputs=sentiment_textbox_1)
	                analyze_btn1 = gr.Button("Analyze Financial Tone on each sentence with yiyanghkust/finbert-tone")
	                fin_spans_1 = gr.HighlightedText(label="Financial Tone Analysis for PDF 1")
	                analyze_btn1.click(fn=lambda choice: _analyze_selected_paragraph(choice, stored_paragraphs_1, fin_ext), inputs=paragraph_1_dropdown, outputs=fin_spans_1)
	                analyze_btn1_ = gr.Button("Analyze Financial Tone on each sentence with ProsusAI/finbert")
	                fin_spans_1_ = gr.HighlightedText(label="Financial Tone Analysis for PDF 1 bis")
	                analyze_btn1_.click(fn=lambda choice: _analyze_selected_paragraph(choice, stored_paragraphs_1, fin_ext_bis), inputs=paragraph_1_dropdown, outputs=fin_spans_1_)

	            # Process the selected paragraph from PDF 2
	            with gr.Column():
	                gr.Markdown("### PDF 2 Analysis")
	                selected_paragraph_2 = gr.Textbox(label="Selected Paragraph 2 Content", lines=4)
	                # Show the full paragraph whenever the dropdown selection changes.
	                paragraph_2_dropdown.change(fn=lambda choice: _selected_paragraph(choice, stored_paragraphs_2) or "", inputs=paragraph_2_dropdown, outputs=selected_paragraph_2)
	                summarize_btn2 = gr.Button("Summarize Text from PDF 2")
	                summary_textbox_2 = gr.Textbox(label="Summary for PDF 2", lines=2)
	                summarize_btn2.click(fn=lambda choice: _analyze_selected_paragraph(choice, stored_paragraphs_2, summarize_text), inputs=paragraph_2_dropdown, outputs=summary_textbox_2)
	                sentiment_btn2 = gr.Button("Classify Financial Tone from PDF 2")
	                sentiment_textbox_2 = gr.Textbox(label="Classification for PDF 2", lines=1)
	                sentiment_btn2.click(fn=lambda choice: _analyze_selected_paragraph(choice, stored_paragraphs_2, text_to_sentiment), inputs=paragraph_2_dropdown, outputs=sentiment_textbox_2)
	                analyze_btn2 = gr.Button("Analyze Financial Tone on each sentence with yiyanghkust/finbert-tone")
	                fin_spans_2 = gr.HighlightedText(label="Financial Tone Analysis for PDF 2")
	                analyze_btn2.click(fn=lambda choice: _analyze_selected_paragraph(choice, stored_paragraphs_2, fin_ext), inputs=paragraph_2_dropdown, outputs=fin_spans_2)
	                analyze_btn2_ = gr.Button("Analyze Financial Tone on each sentence with ProsusAI/finbert")
	                fin_spans_2_ = gr.HighlightedText(label="Financial Tone Analysis for PDF 2 bis")
	                analyze_btn2_.click(fn=lambda choice: _analyze_selected_paragraph(choice, stored_paragraphs_2, fin_ext_bis), inputs=paragraph_2_dropdown, outputs=fin_spans_2_)

	    with gr.Tab("Financial Report Table Analysis"):
	        # New tab content goes here
	        gr.Markdown("## Excel Data Comparison")

	        with gr.Row():
	            with gr.Column():
	                file1 = gr.Dropdown(choices=get_excel_files(PDF_FOLDER), label="Select Excel File 1")
	                file2 = gr.Dropdown(choices=get_excel_files(PDF_FOLDER), label="Select Excel File 2")
	                sheet = gr.Dropdown(choices=["GDP", "HICP", "RRE prices", "Unemployment", "CRE prices"], label="Select Sheet for File 1 and 2")

	            with gr.Column():
	                result = gr.Image(label="Comparison pLot")

	        # Not wired to any event here; the sheet dropdown uses a fixed list.
	        def update_sheets(file):
	            """Return the sheet-name update for ``file`` (see ``get_sheet_names``)."""
	            return get_sheet_names(file)

	        # `b1` is rebound here; the listener on the first tab's button was
	        # already registered above and is unaffected.
	        b1 = gr.Button("Compare Data")
	        b2 = gr.Button("Extract text information")

	        with gr.Row():
	            with gr.Column():
	                sentiment_results_pdf1 = gr.HighlightedText(label="Sentiment Analysis - PDF 1")
	                country_1_dropdown = gr.Dropdown(label="Select Country from Excel File 1")
	                summarize_btn1_country = gr.Button("Summary for the selected country")
	                text_result_df1 = gr.Textbox(label="Sentence for excel file 1", lines=2)
	                summarize_btn1_country.click(fn=lambda country, theme: generate_text(stored_df1, country, theme),
	                                             inputs=[country_1_dropdown, sheet],
	                                             outputs=text_result_df1)
	            with gr.Column():
	                sentiment_results_pdf2 = gr.HighlightedText(label="Sentiment Analysis - PDF 2")
	                country_2_dropdown = gr.Dropdown(label="Select Country from Excel File 2")
	                summarize_btn2_country = gr.Button("Summary for the selected country")
	                text_result_df2 = gr.Textbox(label="Sentence for excel file 2", lines=2)
	                summarize_btn2_country.click(fn=lambda country, theme: generate_text(stored_df2, country, theme),
	                                             inputs=[country_2_dropdown, sheet],
	                                             outputs=text_result_df2)

	        # Button to extract text from PDFs and perform sentiment analysis
	        b1.click(fn=process_and_compare, inputs=[file1, sheet, file2, sheet], outputs=[result,country_1_dropdown, country_2_dropdown])
	        b2.click(fn=process_pdfs_and_analyze_sentiment, inputs=[file1, file2, sheet], outputs=[sentiment_results_pdf1, sentiment_results_pdf2])


	demo.launch()