Spaces:

norygano
/

causev

Running

App Files Files Community

causev / plot.py

norygano

Presentation

6b22889 5 months ago

raw

history blame

10.4 kB

	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	import os
	import umap

	def indicator_chart(chart_type='overall', min_value=5):
	    """Build a bar chart of indicator usage per subfolder.

	    Reads the tab-separated file ``data/indicator_overview.tsv`` and uses its
	    ``Subfolder``, ``Indicator``, ``Count`` and ``Total Sentences`` columns.

	    Args:
	        chart_type: ``'overall'`` plots, per subfolder, the share of sentences
	            counted under the ``'Total with Indicators'`` row;
	            ``'individual'`` plots grouped occurrence counts per indicator.
	        min_value: Only used for ``'individual'``. An indicator is shown when
	            its ``Count`` reaches this value in at least one row.

	    Returns:
	        The Plotly figure produced by ``px.bar``.

	    Raises:
	        ValueError: If ``chart_type`` is neither ``'overall'`` nor
	            ``'individual'``.
	    """
	    # Fail early with a clear message; otherwise ``fig`` would be unbound
	    # further down and surface as an UnboundLocalError.
	    if chart_type not in ('overall', 'individual'):
	        raise ValueError(
	            f"Unsupported chart_type {chart_type!r}; "
	            "expected 'overall' or 'individual'."
	        )

	    data_file = os.path.join('data', 'indicator_overview.tsv')
	    df = pd.read_csv(data_file, sep='\t')

	    if chart_type == 'overall':
	        df_filtered = df[df['Indicator'] == 'Total with Indicators'].copy()

	        # Take the first 'Total Sentences' value seen for each subfolder and
	        # use it as the denominator of the share.
	        total_sentences_per_subfolder = (
	            df.groupby('Subfolder')['Total Sentences'].first().to_dict()
	        )
	        df_filtered['Total Sentences'] = df_filtered['Subfolder'].map(
	            total_sentences_per_subfolder
	        )
	        df_filtered['Indicator_Share'] = (
	            df_filtered['Count'] / df_filtered['Total Sentences']
	        )
	        df_filtered['Indicator_Share_Text'] = (
	            (df_filtered['Indicator_Share'] * 100).round(2).astype(str) + '%'
	        )

	        fig = px.bar(
	            df_filtered,
	            x='Subfolder',
	            y='Indicator_Share',
	            labels={'Indicator_Share': 'Share of Sentences with Indicators', 'Subfolder': ''},
	            color='Subfolder',
	            text='Indicator_Share_Text',
	            color_discrete_sequence=px.colors.qualitative.D3,
	            custom_data=['Total Sentences', 'Count'],
	        )

	        fig.update_traces(
	            hovertemplate=(
	                '<b>%{x}</b><br>' +
	                'Share with Indicators: %{y:.1%}<br>' +
	                'Total Sentences: %{customdata[0]}<br>' +
	                'Sentences with Indicators: %{customdata[1]}<extra></extra>'
	            ),
	            textposition='inside',
	            texttemplate='%{text}',
	            textfont=dict(color='rgb(255, 255, 255)'),
	            insidetextanchor='middle',
	        )
	    else:
	        exclude_indicators = ['!besprechen']
	        summary_rows = ['Total with Indicators', 'None'] + exclude_indicators

	        # Depending on the pandas version, read_csv's default NA handling may
	        # parse the literal 'None' label as NaN, so missing labels are
	        # dropped explicitly in addition to the string comparison.
	        keep = df['Indicator'].notna() & ~df['Indicator'].isin(summary_rows)
	        df_filtered = df[keep].copy()

	        # Keep every row of an indicator that reaches the threshold somewhere.
	        indicators_meeting_threshold = df_filtered.loc[
	            df_filtered['Count'] >= min_value, 'Indicator'
	        ].unique()
	        df_filtered = df_filtered[
	            df_filtered['Indicator'].isin(indicators_meeting_threshold)
	        ].copy()
	        df_filtered['Indicator'] = df_filtered['Indicator'].str.capitalize()

	        fig = px.bar(
	            df_filtered,
	            x='Subfolder',
	            y='Count',
	            color='Indicator',
	            barmode='group',
	            labels={'Count': 'Occurrences', 'Subfolder': '', 'Indicator': ' <b>INDICATOR</b>'},
	            color_discrete_sequence=px.colors.qualitative.D3,
	        )

	        fig.update_traces(
	            texttemplate='%{y}',
	            textposition='inside',
	            textfont=dict(color='rgb(255, 255, 255)'),
	            insidetextanchor='middle',
	        )

	    fig.update_layout(
	        xaxis=dict(showline=True),
	        # Shares are fractions, so format the axis as percentages only for
	        # the 'overall' chart.
	        yaxis=dict(
	            showticklabels=True,
	            title='',
	            tickformat=".0%" if chart_type == 'overall' else None,
	        ),
	        bargap=0.05,
	        # The 'overall' chart already labels each bar on the x axis.
	        showlegend=(chart_type == 'individual'),
	    )

	    return fig

	def causes_chart(min_value=30):
	    """Build a grouped bar chart of cause occurrences per subfolder.

	    Reads the tab-separated file ``data/indicator_cause_sentence_metadata.tsv``
	    and uses its ``cause`` and ``subfolder`` columns.

	    Args:
	        min_value: Minimum number of rows a cause must have (across all
	            subfolders) to be included in the chart.

	    Returns:
	        The Plotly figure produced by ``px.bar``.
	    """
	    data_file = os.path.join('data', 'indicator_cause_sentence_metadata.tsv')
	    df = pd.read_csv(data_file, sep='\t')

	    # read_csv parses 'N/A' as NaN by default, so comparing against the
	    # string alone never removes anything. Drop missing causes explicitly and
	    # keep the literal check in case the file is ever loaded with NA parsing
	    # disabled.
	    has_cause = df['cause'].notna() & (df['cause'] != 'N/A')
	    df_filtered = df[has_cause].copy()

	    # Threshold: keep causes with at least ``min_value`` rows overall.
	    cause_totals = df_filtered['cause'].value_counts()
	    causes_meeting_threshold = cause_totals[cause_totals >= min_value].index
	    df_filtered = df_filtered[
	        df_filtered['cause'].isin(causes_meeting_threshold)
	    ].copy()
	    df_filtered['cause'] = df_filtered['cause'].str.capitalize()

	    counts = (
	        df_filtered.groupby(['subfolder', 'cause'])
	        .size()
	        .reset_index(name='Count')
	    )

	    fig = px.bar(
	        counts,
	        x='subfolder',
	        y='Count',
	        color='cause',
	        barmode='group',
	        labels={'Count': 'Occurrences', 'subfolder': '', 'cause': '<b>CAUSE</b>'},
	        color_discrete_sequence=px.colors.qualitative.D3,
	    )

	    fig.update_layout(
	        xaxis=dict(showline=True),
	        yaxis=dict(showticklabels=True, title=''),
	    )

	    fig.update_traces(
	        texttemplate='%{y}',
	        textposition='inside',
	        textfont=dict(color='rgb(255, 255, 255)'),
	        insidetextanchor='middle',
	    )

	    return fig

	def scatter(include_modality=False):
	    """Build a 2-D UMAP scatter plot of the sentence feature matrix.

	    Reads the tab-separated file ``data/feature_matrix.tsv``. Columns whose
	    names start with ``indicator_``, ``cause_`` and ``modality_`` are treated
	    as feature groups; ``subfolder`` is used for colouring only.

	    Args:
	        include_modality: When true, the ``modality_`` columns are kept as
	            UMAP input features and the active modalities are shown in the
	            hover box. When false, they are dropped before the reduction.

	    Returns:
	        The Plotly figure produced by ``px.scatter``.
	    """
	    data_file = os.path.join('data', 'feature_matrix.tsv')
	    df = pd.read_csv(data_file, sep='\t')

	    indicator_columns = [col for col in df.columns if col.startswith('indicator_')]
	    cause_columns = [col for col in df.columns if col.startswith('cause_')]
	    modality_columns = [col for col in df.columns if col.startswith('modality_')]

	    # Keep sentences that have at least one indicator or at least one cause.
	    has_indicator = df[indicator_columns].sum(axis=1) > 0
	    has_cause = df[cause_columns].sum(axis=1) > 0
	    df_filtered = df[has_indicator | has_cause]

	    # Ignore the '!besprechen' indicator when selecting sentences.
	    # NOTE(review): its column is still part of the UMAP input below, as are
	    # indicators under the threshold; confirm whether that is intended.
	    indicator_columns = [
	        col for col in indicator_columns if 'indicator_!besprechen' not in col
	    ]

	    # Limit indicators to those that occur at least 10 times.
	    indicator_counts = df_filtered[indicator_columns].sum()
	    indicators_to_keep = indicator_counts[indicator_counts >= 10].index.tolist()

	    # Drop sentences left without any of the retained indicators.
	    df_filtered = df_filtered[df_filtered[indicators_to_keep].sum(axis=1) > 0]

	    # Exclude non-feature columns from the dimensionality reduction.
	    columns_to_drop = ['subfolder']
	    if not include_modality:
	        columns_to_drop = columns_to_drop + modality_columns
	    features_clean = df_filtered.drop(columns=columns_to_drop).fillna(0)

	    # Metadata shown next to each point (legend, symbols, hover box).
	    metadata = df_filtered[['subfolder']].copy()
	    metadata['indicator'] = _join_active_columns(
	        df_filtered, indicators_to_keep, 'indicator_'
	    )
	    metadata['cause'] = _join_active_columns(df_filtered, cause_columns, 'cause_')

	    hover_data = {'cause': True, 'Component 1': False, 'Component 2': False}
	    if include_modality:
	        # px.scatter rejects hover_data keys that are not columns of the
	        # plotted frame, so the 'Modality' column has to exist before it is
	        # requested.
	        metadata['Modality'] = _join_active_columns(
	            df_filtered, modality_columns, 'modality_'
	        )
	        hover_data['Modality'] = True

	    # UMAP dimensionality reduction (fixed seed, single job).
	    reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=50, n_jobs=1, metric='cosine')
	    reduced_features = reducer.fit_transform(features_clean)
	    df_reduced = pd.DataFrame(reduced_features, columns=['Component 1', 'Component 2'])
	    # Reset the index so the metadata rows line up positionally with the
	    # freshly created component frame.
	    df_reduced = pd.concat([df_reduced, metadata.reset_index(drop=True)], axis=1)

	    custom_labels = {
	        'subfolder': 'Effect',  # Display 'subfolder' as 'Effect'
	    }

	    fig = px.scatter(
	        df_reduced,
	        x='Component 1',
	        y='Component 2',
	        color='subfolder',   # One colour per subfolder
	        symbol='indicator',  # One marker symbol per indicator combination
	        labels=custom_labels,
	        hover_data=hover_data,
	        color_discrete_sequence=px.colors.qualitative.D3
	    )

	    fig.update_layout(
	        xaxis=dict(showgrid=True),
	        yaxis=dict(showgrid=True),
	        showlegend=True,
	        legend=dict(
	            title="Effect, Indicator",  # Legend entries combine colour and symbol
	            yanchor="top",
	            xanchor="left",
	            borderwidth=1,
	        ),
	    )

	    return fig


	def _join_active_columns(frame, columns, prefix):
	    """Return, per row of ``frame``, the comma-joined names of ``columns`` whose value is > 0.

	    ``prefix`` is removed from each column name before joining. Rows with no
	    positive value (or no columns at all) yield an empty string.
	    """
	    names = [column.replace(prefix, '') for column in columns]
	    return [
	        ', '.join(name for name, value in zip(names, row) if value > 0)
	        for row in frame[columns].to_numpy()
	    ]

	def sankey(cause_threshold=10, indicator_threshold=5):
	    """Build a cause -> indicator -> subfolder Sankey diagram.

	    Reads the tab-separated file ``data/indicator_cause_sentence_metadata.tsv``
	    and uses its ``cause``, ``indicator`` and ``subfolder`` columns.

	    Args:
	        cause_threshold: Minimum number of rows a cause needs to be included.
	        indicator_threshold: Minimum number of rows an indicator needs to be
	            included.

	    Returns:
	        A ``go.Figure`` holding a single ``go.Sankey`` trace, laid out at a
	        fixed 500x500 size.
	    """
	    data_file = os.path.join('data', 'indicator_cause_sentence_metadata.tsv')
	    df = pd.read_csv(data_file, sep='\t')

	    # Remove rows with missing 'cause', 'indicator', or 'subfolder'. The copy
	    # makes the column assignment below operate on an independent frame.
	    df = df.dropna(subset=['cause', 'indicator', 'subfolder']).copy()

	    # Strip '_nk' from 'subfolder' values. regex=False keeps this a literal
	    # replacement regardless of the pandas version's default.
	    df['subfolder'] = df['subfolder'].str.replace('_nk', '', regex=False)

	    # Overall counts decide which causes and indicators are kept.
	    cause_counts = df['cause'].value_counts()
	    indicator_counts = df['indicator'].value_counts()
	    valid_causes = cause_counts[cause_counts >= cause_threshold].index
	    valid_indicators = indicator_counts[indicator_counts >= indicator_threshold].index

	    df_filtered = df[
	        df['cause'].isin(valid_causes) & df['indicator'].isin(valid_indicators)
	    ]

	    # Node labels: all causes first, then indicators, then subfolders.
	    causes = list(df_filtered['cause'].unique())
	    indicators = list(df_filtered['indicator'].unique())
	    subfolders = list(df_filtered['subfolder'].unique())
	    all_labels = causes + indicators + subfolders

	    # Key node indices by (kind, label) rather than by label alone, so a
	    # string that occurs as e.g. both a cause and an indicator gets two
	    # distinct nodes instead of collapsing onto one index.
	    node_index = {}
	    for kind, names in (
	        ('cause', causes),
	        ('indicator', indicators),
	        ('subfolder', subfolders),
	    ):
	        for name in names:
	            node_index[(kind, name)] = len(node_index)

	    sources = []
	    targets = []
	    values = []

	    # cause -> indicator links, weighted by the number of rows per pair.
	    cause_indicator_counts = df_filtered.groupby(['cause', 'indicator']).size()
	    for (cause, indicator), count in cause_indicator_counts.items():
	        sources.append(node_index[('cause', cause)])
	        targets.append(node_index[('indicator', indicator)])
	        values.append(int(count))

	    # indicator -> subfolder links, weighted the same way.
	    indicator_subfolder_counts = df_filtered.groupby(['indicator', 'subfolder']).size()
	    for (indicator, subfolder), count in indicator_subfolder_counts.items():
	        sources.append(node_index[('indicator', indicator)])
	        targets.append(node_index[('subfolder', subfolder)])
	        values.append(int(count))

	    fig = go.Figure(data=[go.Sankey(
	        node=dict(
	            pad=15,
	            thickness=20,
	            line=dict(color="black", width=0.5),
	            label=all_labels,
	        ),
	        link=dict(
	            source=sources,
	            target=targets,
	            value=values
	        )
	    )])

	    fig.update_layout(
	        autosize=False,  # Disable automatic resizing
	        width=500,       # Fixed width
	        height=500,      # Fixed height
	    )

	    return fig