Spaces:

norygano
/

causev

Running

App Files Files Community

norygano committed on Oct 27, 2024

Commit

45d0933

1 Parent(s): 60e75a3

Charts

Browse files

Files changed (6) hide show

.gitignore +1 -0
app.py +73 -68
data/feature_matrix.tsv +0 -0
data/indicator_cause_sentence_metadata.tsv +0 -0
data/indicator_overview.tsv +105 -0
plot.py +178 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__/plot.cpython-311.pyc

app.py CHANGED Viewed

@@ -2,97 +2,102 @@ import streamlit as st
 import torch
 from transformers import AutoTokenizer, AutoModelForTokenClassification
 from annotated_text import annotated_text
 # Load the trained model and tokenizer
 model_directory = "norygano/causalBERT"
 tokenizer = AutoTokenizer.from_pretrained(model_directory, add_prefix_space=True)
 model = AutoModelForTokenClassification.from_pretrained(model_directory)
-# Set model to evaluation mode
 model.eval()
 # Define the label map
 label_map = {0: "O", 1: "B-INDICATOR", 2: "I-INDICATOR", 3: "B-CAUSE", 4: "I-CAUSE", 5: "B-EFFECT", 6: "I-EFFECT"}
-# Streamlit App
-st.markdown(
     """
     <div style="display: flex; align-items: center; justify-content: left; font-size: 60px; font-weight: bold;">
         <span>CAUSEN</span>
         <span style="transform: rotate(270deg); display: inline-block; margin-left: 5px;">V</span>
     </div>
     """,
-    unsafe_allow_html=True
 )
-st.markdown("[Model](https://huggingface.co/norygano/causalBERT)")
-# Add a description with a link to the model
-st.write("Tags indicators and causes of explicit attributions of causality. GER only (atm)")
-# Text input for sentences with italic placeholder text
-sentences_input = st.text_area("*Sentences (one per line)*", "\n".join([
-    "Autos stehen im Verdacht, Waldsterben zu verursachen.",
-    "Fußball führt zu Waldschäden.",
-    "Haustüren tragen zum Betonsterben bei.",
-])
-  , placeholder="Your Sentences here.")
-# Split the input text into individual sentences
-sentences = [sentence.strip() for sentence in sentences_input.splitlines() if sentence.strip()]
-# Button to run the model
-if st.button("Analyze"):
-    for sentence in sentences:
-        # Tokenize the sentence
-        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
-        # Run inference
-        with torch.no_grad():
-            outputs = model(**inputs)
-        # Get the logits and predicted label IDs
-        logits = outputs.logits
-        predicted_label_ids = torch.argmax(logits, dim=2)
-        # Convert token IDs back to tokens
-        tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
-        # Map label IDs to human-readable labels
-        predicted_labels = [label_map[label_id.item()] for label_id in predicted_label_ids[0]]
-        # Reconstruct words from subwords and prepare for annotated_text
-        annotations = []
-        current_word = ""
-        current_label = "O"
-        for token, label in zip(tokens, predicted_labels):
-            if token in ['[CLS]', '[SEP]']:  # Exclude special tokens
-                continue
-            if token.startswith("##"):
-                # Append subword without "##" prefix to the current word
-                current_word += token[2:]
-            else:
-                # If we have accumulated a word, add it to annotations with a space
-                if current_word:
-                    if current_label != "O":
-                        annotations.append((current_word, current_label))
-                    else:
-                        annotations.append(current_word)
-                    annotations.append(" ")  # Add a space between words
-                # Start a new word
-                current_word = token
-                current_label = label
-        # Add the last accumulated word
-        if current_word:
-            if current_label != "O":
-                annotations.append((current_word, current_label))
-            else:
-                annotations.append(current_word)
-        # Display annotated text
-        st.write(f"**Sentence:** {sentence}")
-        annotated_text(*annotations)
-        st.write("---")

 import torch
 from transformers import AutoTokenizer, AutoModelForTokenClassification
 from annotated_text import annotated_text
+import pandas as pd
+import plotly.express as px
+from plot import indicator_chart, causes_chart, scatter_plot
+import os
 # Load the trained model and tokenizer
 model_directory = "norygano/causalBERT"
 tokenizer = AutoTokenizer.from_pretrained(model_directory, add_prefix_space=True)
 model = AutoModelForTokenClassification.from_pretrained(model_directory)
 model.eval()
 # Define the label map
 label_map = {0: "O", 1: "B-INDICATOR", 2: "I-INDICATOR", 3: "B-CAUSE", 4: "I-CAUSE", 5: "B-EFFECT", 6: "I-EFFECT"}
+# Main application
+st.markdown(
     """
     <div style="display: flex; align-items: center; justify-content: left; font-size: 60px; font-weight: bold;">
         <span>CAUSEN</span>
         <span style="transform: rotate(270deg); display: inline-block; margin-left: 5px;">V</span>
     </div>
     """,
+    unsafe_allow_html=True
 )
+st.markdown("[Model](https://huggingface.co/norygano/causalBERT) | [Data](https://huggingface.co/datasets/norygano/causenv) | [Project](https://www.uni-trier.de/universitaet/fachbereiche-faecher/fachbereich-ii/faecher/germanistik/professurenfachteile/germanistische-linguistik/professoren/prof-dr-martin-wengeler/kontroverse-diskurse/individium-gesellschaft)")
+st.write("Tags indicators and causes in explicit attributions of causality. GER only (atm)")
+# Create tabs
+tab1, tab2, tab3, tab4 = st.tabs(["Prompt", "Indicators", "Causes", "Scatter"])
+# Prompt Tab
+with tab1:
+    sentences_input = st.text_area("*Sentences (one per line)*", "\n".join([
+        "Autos stehen im Verdacht, Waldsterben zu verursachen.",
+        "Fußball führt zu Waldschäden.",
+        "Haustüren tragen zum Betonsterben bei.",
+    ]), placeholder="Your Sentences here.")
+    sentences = [sentence.strip() for sentence in sentences_input.splitlines() if sentence.strip()]
+    if st.button("Analyze"):
+        for sentence in sentences:
+            inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
+            with torch.no_grad():
+                outputs = model(**inputs)
+            logits = outputs.logits
+            predicted_label_ids = torch.argmax(logits, dim=2)
+            tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
+            predicted_labels = [label_map[label_id.item()] for label_id in predicted_label_ids[0]]
+            annotations = []
+            current_word = ""
+            current_label = "O"
+            for token, label in zip(tokens, predicted_labels):
+                if token in ['[CLS]', '[SEP]']:  # Exclude special tokens
+                    continue
+                if token.startswith("##"):
+                    current_word += token[2:]
+                else:
+                    if current_word:
+                        if current_label != "O":
+                            annotations.append((current_word, current_label))
+                        else:
+                            annotations.append(current_word)
+                        annotations.append(" ")  # Add a space between words
+                    current_word = token
+                    current_label = label
+            if current_word:
+                if current_label != "O":
+                    annotations.append((current_word, current_label))
+                else:
+                    annotations.append(current_word)
+            st.write(f"**Sentence:** {sentence}")
+            annotated_text(*annotations)
+            st.write("---")
+# Research Insights Tab
+with tab2:
+    st.write("## Indicators")
+    # Overall
+    st.subheader("Overall")
+    fig_overall = indicator_chart(chart_type='overall')
+    st.plotly_chart(fig_overall, use_container_width=True)
+    # Individual Indicators Chart
+    st.subheader("Individual")
+    fig_individual = indicator_chart(chart_type='individual')
+    st.plotly_chart(fig_individual, use_container_width=True)
+with tab3:
+    st.write("## Causes")
+    fig_causes = causes_chart()
+    st.plotly_chart(fig_causes, use_container_width=True)
+with tab4:
+    st.write("## Scatter")
+    fig_scatter = scatter_plot()
+    st.plotly_chart(fig_scatter, use_container_width=True)

data/feature_matrix.tsv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/indicator_cause_sentence_metadata.tsv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/indicator_overview.tsv ADDED Viewed

	@@ -0,0 +1,105 @@

+Subfolder	Total Sentences	Indicator	Count	Share
+Waldsterben_nk	828	Total with Indicators	139	16.79%
+Waldsterben_nk	828	None	131	15.82%
+Waldsterben_nk	828	verantwortung	31	3.74%
+Waldsterben_nk	828	ursache	31	3.74%
+Waldsterben_nk	828	schuld	15	1.81%
+Waldsterben_nk	828	beitragen	15	1.81%
+Waldsterben_nk	828	führen	6	0.72%
+Waldsterben_nk	828	!besprechen	6	0.72%
+Waldsterben_nk	828	wirkung	4	0.48%
+Waldsterben_nk	828	folgen	4	0.48%
+Waldsterben_nk	828	beschleunigen	3	0.36%
+Waldsterben_nk	828	zusammenhang	3	0.36%
+Waldsterben_nk	828	faktor	3	0.36%
+Waldsterben_nk	828	durch	2	0.24%
+Waldsterben_nk	828	grund	2	0.24%
+Waldsterben_nk	828	kosten	2	0.24%
+Waldsterben_nk	828	stecken	1	0.12%
+Waldsterben_nk	828	steuern	1	0.12%
+Waldsterben_nk	828	zuständig	1	0.12%
+Waldsterben_nk	828	auslöser	1	0.12%
+Waldsterben_nk	828	sorgen	1	0.12%
+Waldsterben_nk	828	wenn-dann	1	0.12%
+Waldsterben_nk	828	schaden	1	0.12%
+Waldsterben_nk	828	angesichts	1	0.12%
+Waldsterben_nk	828	einfluss	1	0.12%
+Waldsterben_nk	828	stellung	1	0.12%
+Waldsterben_nk	828	zuTun	1	0.12%
+Waldsterben_nk	828	teil	1	0.12%
+Waldsterben_nk	828	rolle	1	0.12%
+Waldsterben_nk	828	bedeuten	1	0.12%
+Waldsterben_nk	828	sünder	1	0.12%
+Bienensterben_nk	281	Total with Indicators	126	44.84%
+Bienensterben_nk	281	None	123	43.77%
+Bienensterben_nk	281	verantwortung	40	14.23%
+Bienensterben_nk	281	ursache	19	6.76%
+Bienensterben_nk	281	grund	14	4.98%
+Bienensterben_nk	281	beitragen	10	3.56%
+Bienensterben_nk	281	schuld	7	2.49%
+Bienensterben_nk	281	teil	6	2.14%
+Bienensterben_nk	281	führen	5	1.78%
+Bienensterben_nk	281	verbindung	4	1.42%
+Bienensterben_nk	281	kommen	3	1.07%
+Bienensterben_nk	281	faktor	3	1.07%
+Bienensterben_nk	281	zuTun	3	1.07%
+Bienensterben_nk	281	folgen	2	0.71%
+Bienensterben_nk	281	auslöser	2	0.71%
+Bienensterben_nk	281	erklärung	2	0.71%
+Bienensterben_nk	281	wirkung	2	0.71%
+Bienensterben_nk	281	einhergehen	1	0.36%
+Bienensterben_nk	281	wegen	1	0.36%
+Bienensterben_nk	281	durch	1	0.36%
+Bienensterben_nk	281	zusammenhang	1	0.36%
+Bienensterben_nk	281	wenn-dann	1	0.36%
+Bienensterben_nk	281	wundern	1	0.36%
+Bienensterben_nk	281	handeln	1	0.36%
+Bienensterben_nk	281	einfluss	1	0.36%
+Bienensterben_nk	281	!besprechen	1	0.36%
+Artensterben_nk	539	Total with Indicators	141	26.16%
+Artensterben_nk	539	None	141	26.16%
+Artensterben_nk	539	ursache	27	5.01%
+Artensterben_nk	539	beitragen	21	3.90%
+Artensterben_nk	539	verantwortung	19	3.53%
+Artensterben_nk	539	grund	15	2.78%
+Artensterben_nk	539	schuld	11	2.04%
+Artensterben_nk	539	führen	11	2.04%
+Artensterben_nk	539	kommen	4	0.74%
+Artensterben_nk	539	zusammenhang	4	0.74%
+Artensterben_nk	539	einfluss	4	0.74%
+Artensterben_nk	539	teil	4	0.74%
+Artensterben_nk	539	faktor	3	0.56%
+Artensterben_nk	539	wirkung	2	0.37%
+Artensterben_nk	539	erklärung	2	0.37%
+Artensterben_nk	539	folgen	2	0.37%
+Artensterben_nk	539	rolle	2	0.37%
+Artensterben_nk	539	auslöser	1	0.19%
+Artensterben_nk	539	erzeugen	1	0.19%
+Artensterben_nk	539	stecken	1	0.19%
+Artensterben_nk	539	sünder	1	0.19%
+Artensterben_nk	539	durch	1	0.19%
+Artensterben_nk	539	bedingen	1	0.19%
+Artensterben_nk	539	zuTun	1	0.19%
+Artensterben_nk	539	fördern	1	0.19%
+Artensterben_nk	539	treiben	1	0.19%
+Insektensterben_nk	253	Total with Indicators	66	26.09%
+Insektensterben_nk	253	None	60	23.72%
+Insektensterben_nk	253	ursache	12	4.74%
+Insektensterben_nk	253	verantwortung	8	3.16%
+Insektensterben_nk	253	grund	7	2.77%
+Insektensterben_nk	253	beitragen	6	2.37%
+Insektensterben_nk	253	!besprechen	5	1.98%
+Insektensterben_nk	253	rolle	4	1.58%
+Insektensterben_nk	253	schuld	3	1.19%
+Insektensterben_nk	253	faktor	3	1.19%
+Insektensterben_nk	253	zusammenhang	2	0.79%
+Insektensterben_nk	253	zuTun	2	0.79%
+Insektensterben_nk	253	teil	2	0.79%
+Insektensterben_nk	253	folgen	1	0.40%
+Insektensterben_nk	253	kosten	1	0.40%
+Insektensterben_nk	253	durch	1	0.40%
+Insektensterben_nk	253	treiben	1	0.40%
+Insektensterben_nk	253	bedeuten	1	0.40%
+Insektensterben_nk	253	relevant	1	0.40%
+Insektensterben_nk	253	einfluss	1	0.40%
+Insektensterben_nk	253	stecken	1	0.40%

plot.py ADDED Viewed

	@@ -0,0 +1,178 @@

+import pandas as pd
+import plotly.express as px
+import os
+import umap
+from sklearn.preprocessing import StandardScaler
def indicator_chart(chart_type='overall'):
    """Build a Plotly bar chart from ``data/indicator_overview.tsv``.

    Args:
        chart_type: ``'overall'`` draws one bar per subfolder showing the
            share of sentences that carry an indicator. ``'individual'``
            draws grouped bars with the per-indicator counts of each
            subfolder.

    Returns:
        The figure created by ``px.bar`` with the shared layout applied.

    Raises:
        ValueError: If ``chart_type`` is neither ``'overall'`` nor
            ``'individual'``.
    """
    # Validate before touching the file: an unknown chart type used to fall
    # through both branches and crash on the unbound ``fig`` further down.
    valid_chart_types = ('overall', 'individual')
    if chart_type not in valid_chart_types:
        raise ValueError(
            f"chart_type must be one of {valid_chart_types}, got {chart_type!r}"
        )

    data_file = os.path.join('data', 'indicator_overview.tsv')
    df = pd.read_csv(data_file, sep='\t')

    if chart_type == 'overall':
        # Only the summary row of each subfolder is plotted here.
        df_filtered = df[df['Indicator'] == 'Total with Indicators'].copy()
        total_sentences_per_subfolder = df.groupby('Subfolder')['Total Sentences'].first().to_dict()
        df_filtered['Total Sentences'] = df_filtered['Subfolder'].map(total_sentences_per_subfolder)
        # The share is recomputed from the raw counts rather than parsed from
        # the preformatted 'Share' column of the file.
        df_filtered['Indicator_Share'] = df_filtered['Count'] / df_filtered['Total Sentences']
        df_filtered['Indicator_Share_Text'] = (df_filtered['Indicator_Share'] * 100).round(2).astype(str) + '%'
        fig = px.bar(
            df_filtered,
            x='Subfolder',
            y='Indicator_Share',
            labels={'Indicator_Share': 'Share of Sentences with Indicators', 'Subfolder': ''},
            color='Subfolder',
            text='Indicator_Share_Text',
            color_discrete_sequence=px.colors.qualitative.D3,
            custom_data=['Total Sentences', 'Count']
        )
        fig.update_traces(
            hovertemplate=(
                '<b>%{x}</b><br>' +
                'Share with Indicators: %{y:.1%}<br>' +
                'Total Sentences: %{customdata[0]}<br>' +
                'Sentences with Indicators: %{customdata[1]}<extra></extra>'
            ),
            textposition='inside',
            texttemplate='%{text}',
            textfont=dict(color='rgb(255, 255, 255)'),
            insidetextanchor='middle',
        )
    else:  # chart_type == 'individual'
        min_value = 5
        exclude_indicators = ['!besprechen']
        # Drop the two summary rows and any explicitly excluded indicator.
        df_filtered = df[~df['Indicator'].isin(['Total with Indicators', 'None'] + exclude_indicators)].copy()
        # An indicator is kept for every subfolder as soon as it reaches the
        # threshold in at least one of them.
        indicators_meeting_threshold = df_filtered[df_filtered['Count'] >= min_value]['Indicator'].unique()
        df_filtered = df_filtered[df_filtered['Indicator'].isin(indicators_meeting_threshold)]
        df_filtered['Indicator'] = df_filtered['Indicator'].str.capitalize()
        fig = px.bar(
            df_filtered,
            x='Subfolder',
            y='Count',
            color='Indicator',
            barmode='group',
            labels={'Count': 'Occurrences', 'Subfolder': '', 'Indicator': '  <b>INDICATOR</b>'},
            color_discrete_sequence=px.colors.qualitative.D3
        )
        fig.update_traces(
            texttemplate='%{y}',
            textposition='inside',
            textfont=dict(color='rgb(255, 255, 255)'),
        )

    # Layout shared by both chart types; the legend is only shown where the
    # colors distinguish indicators rather than repeat the x axis.
    fig.update_layout(
        xaxis=dict(showline=True),
        yaxis=dict(showticklabels=True, title=''),
        bargap=0.05,
        showlegend=(chart_type == 'individual')
    )
    return fig
def causes_chart():
    """Build a grouped bar chart of cause counts per subfolder.

    Reads ``data/indicator_cause_sentence_metadata.tsv``, keeps only causes
    that occur at least ``min_value`` times across the whole file, and plots
    how often each remaining cause occurs in every subfolder.

    Returns:
        The figure created by ``px.bar`` with layout and trace styling applied.
    """
    min_value = 30  # minimum number of rows a cause needs across all subfolders

    records = pd.read_csv(
        os.path.join('data', 'indicator_cause_sentence_metadata.tsv'),
        sep='\t',
    )

    # NOTE(review): with pandas' default NA parsing the literal 'N/A' is
    # presumably read as NaN, so this comparison may never exclude a row;
    # missing causes are dropped by the counting step below either way.
    with_cause = records[records['cause'] != 'N/A'].copy()

    occurrences = with_cause['cause'].value_counts()
    frequent_causes = occurrences[occurrences >= min_value].index
    with_cause = with_cause[with_cause['cause'].isin(frequent_causes)]
    with_cause['cause'] = with_cause['cause'].str.capitalize()

    per_subfolder = (
        with_cause
        .groupby(['subfolder', 'cause'])
        .size()
        .reset_index(name='Count')
    )

    fig = px.bar(
        per_subfolder,
        x='subfolder',
        y='Count',
        color='cause',
        barmode='group',
        labels={'Count': 'Occurrences', 'subfolder': '', 'cause': '<b>CAUSE</b>'},
        color_discrete_sequence=px.colors.qualitative.G10,
    )
    # Print each bar's value centered inside the bar in white.
    fig.update_traces(
        texttemplate='%{y}',
        textposition='inside',
        textfont=dict(color='rgb(255, 255, 255)'),
        insidetextanchor='middle',
    )
    fig.update_layout(
        xaxis=dict(showline=True),
        yaxis=dict(showticklabels=True, title=''),
    )
    return fig
def scatter_plot(include_modality=False):
    """Build a 2-D UMAP scatter plot of the sentence feature matrix.

    Reads ``data/feature_matrix.tsv``, keeps the rows that carry at least one
    sufficiently frequent indicator, projects the remaining feature columns to
    two dimensions with UMAP and colors the points by subfolder.

    Args:
        include_modality: When true, the ``modality_*`` columns stay in the
            feature matrix and the active modalities of each sentence are
            shown in the hover tooltip. When false, they are dropped before
            the projection.

    Returns:
        The figure created by ``px.scatter`` with layout applied.
    """
    data_file = os.path.join('data', 'feature_matrix.tsv')
    df = pd.read_csv(data_file, sep='\t')

    # Column groups are identified purely by their name prefix.
    indicator_columns = [col for col in df.columns if col.startswith('indicator_')]
    cause_columns = [col for col in df.columns if col.startswith('cause_')]
    modality_columns = [col for col in df.columns if col.startswith('modality_')]

    # Keep rows that have at least one indicator or at least one cause.
    df_filtered = df[(df[indicator_columns].sum(axis=1) > 0) |
                     (df[cause_columns].sum(axis=1) > 0)]

    # Exclude indicator '!besprechen'
    indicator_columns = [col for col in indicator_columns if 'indicator_!besprechen' not in col]

    # Limit indicators to those that occur at least 10 times
    indicator_counts = df_filtered[indicator_columns].sum()
    indicators_to_keep = indicator_counts[indicator_counts >= 10].index.tolist()

    # Further filter to exclude entries without any valid indicators
    df_filtered = df_filtered[df_filtered[indicators_to_keep].sum(axis=1) > 0]

    # Only 'subfolder' (and optionally the modality columns) is removed;
    # everything else goes into the projection.
    # NOTE(review): assumes all remaining columns are numeric - confirm.
    columns_to_drop = ['subfolder']
    if not include_modality:
        columns_to_drop += modality_columns
    features = df_filtered.drop(columns=columns_to_drop)

    # Fill NaN values with 0 for the feature matrix
    features_clean = features.fillna(0)

    # Metadata is collected from the same filtered frame so that its row order
    # matches the rows handed to UMAP.
    metadata = df_filtered[['subfolder']].copy()
    # Comma-separated names (prefix stripped) of the kept indicators per row.
    metadata['indicator'] = df_filtered[indicators_to_keep].apply(lambda row: ', '.join([indicator.replace('indicator_', '') for indicator in indicators_to_keep if row[indicator] > 0]), axis=1)
    # Collect all non-zero causes as a string (multiple causes per sentence)
    metadata['cause'] = df_filtered[cause_columns].apply(lambda row: ', '.join([cause.replace('cause_', '') for cause in cause_columns if row[cause] > 0]), axis=1)

    # hover_data is a list: the previous set literal made the modality branch
    # fail with a TypeError on item assignment.
    hover_data = ['cause']
    if include_modality:
        # Built like 'cause' so the tooltip has a real column to show.
        # NOTE(review): assumes modality columns are numeric flags like the
        # cause columns - confirm against the data file.
        metadata['Modality'] = [
            ', '.join(col.replace('modality_', '') for col in modality_columns if row[col] > 0)
            for _, row in df_filtered[modality_columns].iterrows()
        ]
        hover_data.append('Modality')

    # Perform UMAP dimensionality reduction
    reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=50, metric='cosine')
    reduced_features = reducer.fit_transform(features_clean)
    df_reduced = pd.DataFrame(reduced_features, columns=['Component 1', 'Component 2'])
    # Reset the metadata index so it lines up with the fresh 0..n-1 index of
    # the projection result.
    df_reduced = pd.concat([df_reduced, metadata.reset_index(drop=True)], axis=1)

    fig = px.scatter(
        df_reduced,
        x='Component 1',
        y='Component 2',
        color='subfolder',
        hover_data=hover_data,
        labels={'Component 1': 'UMAP Dim 1', 'Component 2': 'UMAP Dim 2'},
        color_discrete_sequence=px.colors.qualitative.Plotly
    )
    fig.update_layout(
        xaxis=dict(showgrid=False),
        yaxis=dict(showgrid=False),
        showlegend=True
    )
    return fig