vova631 committed
Commit 69d8d00 · verified · 1 parent: b74fb01

Update app.py

Files changed (1):
  1. app.py +26 -184
app.py CHANGED
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# -- coding: utf-8 --
 """emotion-matcher.ipynb
 
 Automatically generated by Colab.
@@ -22,52 +22,33 @@ splits = {
 df = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/" + splits["train"])
 
 # Preview the first few rows of the dataset
-df.head()
+print(df.head())
 
-"""This code loads the training split of the GoEmotions dataset (simplified version) directly from the HuggingFace Hub using the hf:// path.
-We use the pandas library and the read_parquet() method to read the data into a table (DataFrame).
-Then, we display the first few rows using df.head() to make sure the data was loaded correctly.
-This is a perfect starting point for the next step – Exploratory Data Analysis (EDA).
-"""
-
-#Import necessary libraries
-import pandas as pd
-
-#View dataset shape
+# View dataset shape
 print("Dataset shape:", df.shape)
 
-#View basic column information
+# View basic column information
 print("\nColumn names:", df.columns.tolist())
 
-#View detailed info
+# View detailed info
 df.info()
 
-"""In this step, we check how many rows and columns the dataset has, and examine the names and data types of all columns.
-This gives us an overview of what kind of data we’re dealing with (text, numbers, labels, etc.).
-It helps us understand what preprocessing may be needed next.
-"""
-
-#Check for missing values
-
+# Check for missing values
 print("Missing values per column:")
 print(df.isnull().sum())
 
-#Check for duplicated rows (convert unhashable columns to string)
-
+# Check for duplicated rows (convert unhashable columns to string)
 print("\nNumber of duplicated rows:")
 print(df.astype(str).duplicated().sum())
 
-#Check how many unique combinations of emotion labels exist
-
+# Check how many unique combinations of emotion labels exist
 print("\nNumber of unique label combinations:")
 print(df["labels"].apply(lambda x: tuple(x)).nunique())
 
-#Compute text lengths in number of words
-
+# Compute text lengths in number of words
 df["text_length"] = df["text"].apply(lambda x: len(x.split()))
 
-#Plot histogram of text lengths
-
+# Plot histogram of text lengths
 import matplotlib.pyplot as plt
 
 plt.figure(figsize=(10,6))
@@ -78,16 +59,10 @@ plt.ylabel("Number of samples")
 plt.grid(True)
 plt.show()
 
-"""Most texts in the dataset are short—under 30 words—which helps us choose the proper maximum length for tokenization later.
-
-"""
-
-#Count how many emotion labels each text has
-
+# Count how many emotion labels each text has
 df["num_labels"] = df["labels"].apply(len)
 
-#Plot distribution
-
+# Plot distribution
 plt.figure(figsize=(8,5))
 df["num_labels"].value_counts().sort_index().plot(kind="bar")
 plt.xlabel("Number of emotion labels")
@@ -95,8 +70,6 @@ plt.ylabel("Number of samples")
 plt.title("Distribution of Emotion Labels per Sample")
 plt.show()
 
-"""Most samples are annotated with a single emotion label, and very few have multiple labels. This indicates that the dataset is mostly suitable for single-label classification tasks, although a multi-label approach could still capture additional nuance for rare cases."""
-
 # Count frequency of each individual emotion label
 from collections import Counter
 
@@ -105,7 +78,6 @@ all_labels = [label for labels in df["labels"] for label in labels]
 label_counts = Counter(all_labels)
 
 # Convert to DataFrame for plotting
-import pandas as pd
 emotion_freq = pd.DataFrame.from_dict(label_counts, orient='index', columns=['count'])
 emotion_freq = emotion_freq.sort_values(by='count', ascending=False)
 
@@ -116,23 +88,12 @@ plt.xlabel("Emotion Label ID")
 plt.ylabel("Number of Occurrences")
 plt.show()
 
-"""This bar chart illustrates how often each emotion label appears across the dataset.
-We observe a strong imbalance: some labels like 27 (likely “neutral”) dominate with over 14,000 occurrences,
-while others like 16, 21, or 23 are very rare.
-This highlights the need to consider class imbalance when training models.
-"""
-
-# Import necessary libraries
+# Create a binary matrix for emotions
 import numpy as np
-import matplotlib.pyplot as plt
 import seaborn as sns
 
-# Create a binary matrix for emotions
-# Get the maximum label ID from all label lists
 num_labels = max([max(l.tolist()) if len(l) > 0 else 0 for l in df["labels"]]) + 1
-
 emotion_matrix = np.zeros((len(df), num_labels), dtype=int)
-
 for i, labels in enumerate(df["labels"]):
     for label in labels:
         emotion_matrix[i, label] = 1
@@ -145,25 +106,13 @@ plt.figure(figsize=(12, 10))
 sns.heatmap(co_occurrence, cmap="Blues", linewidths=0.5)
 plt.title("Emotion Co-occurrence Heatmap")
 plt.xlabel("Emotion Label ID")
-plt.ylabel("Emotion Label ID")
+plt.ylabel("Emotion Label ID")
 plt.show()
 
-"""This heatmap visualizes how frequently pairs of emotion labels co-occur within the same text. Darker shades indicate more frequent co-occurrences, helping identify emotions that often appear together."""
-
-# View random samples of texts and their corresponding emotion labels
-
 # Display 5 random rows
 print("Sample text examples with emotion labels:")
 print(df.sample(5)[["text", "labels"]])
 
-"""This step is meant to get a qualitative sense of the dataset by inspecting real examples. It helps verify whether:
-
-The texts are understandable and relevant.
-The assigned emotion labels make sense.
-There are any noisy, overly short, or unclear samples.
-
-"""
-
 # Define emotion label ID to name mapping manually (based on GoEmotions documentation)
 id2label = [
     'admiration', 'amusement', 'anger', 'annoyance', 'approval',
@@ -174,7 +123,6 @@ id2label = [
     'neutral'
 ]
 
-# Define a function to convert list of label IDs into label names
 def decode_labels(label_ids):
     return [id2label[i] for i in label_ids]
 
@@ -182,77 +130,44 @@ def decode_labels(label_ids):
 print("Sample text examples with emotion label names:")
 sample_df = df.sample(5)
 sample_df["label_names"] = sample_df["labels"].apply(decode_labels)
-display(sample_df[["text", "label_names"]])
-
-"""Sample Texts with Emotion Labels
+print(sample_df[["text", "label_names"]])
 
-The table displays five random text samples from the dataset along with their decoded emotion labels. Most of the examples are labeled as “neutral,” highlighting its dominance in the dataset.
-"""
-
-# Import library for word cloud
+# Word cloud
 from wordcloud import WordCloud
-import matplotlib.pyplot as plt
 
-# Combine all text data into one string
 all_text = " ".join(df["text"])
-
-# Generate word cloud
 wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)
 
-# Plot word cloud
 plt.figure(figsize=(12, 6))
 plt.imshow(wordcloud, interpolation="bilinear")
 plt.axis("off")
 plt.title("Most Frequent Words in All Text Samples")
 plt.show()
 
-"""This word cloud displays the most commonly used words across all text samples in the dataset. Larger words appear more frequently, offering insights into prevalent vocabulary and themes used by users expressing various emotions."""
-
-# Step: Text Preprocessing - clean the text data
+# Clean the text data
 import re
 import string
 
-# Define a function to clean each text entry
 def clean_text(text):
-    # Lowercase
     text = text.lower()
-    # Remove [NAME], [URL], and other placeholders
     text = re.sub(r"\[.*?\]", "", text)
-    # Remove punctuation
     text = text.translate(str.maketrans('', '', string.punctuation))
-    # Remove numbers
    text = re.sub(r"\d+", "", text)
-    # Remove extra whitespaces
    text = re.sub(r"\s+", " ", text).strip()
     return text
 
-# Apply cleaning to the text column
 df["clean_text"] = df["text"].apply(clean_text)
 
-# Preview cleaned text
 print("Sample cleaned texts:")
-display(df[["text", "clean_text"]].sample(5))
-
-"""
-This preprocessing step standardizes text inputs by converting to lowercase, removing brackets like [NAME], punctuation, digits, and extra spaces — which helps downstream models focus on meaningful content.
-"""
+print(df[["text", "clean_text"]].sample(5))
 
 # Plot label distribution
-
-# Flatten all label lists into a single list
-all_labels = [label for sublist in df["labels"] for label in sublist]
-
-# Count frequency of each label
-from collections import Counter
-label_counts = Counter(all_labels)
-
-# Convert to DataFrame for plotting
+label_counts = Counter([label for sublist in df["labels"] for label in sublist])
 label_df = pd.DataFrame.from_dict(label_counts, orient="index", columns=["count"])
 label_df.index.name = "label_id"
 label_df = label_df.sort_index()
 label_df["label_name"] = label_df.index.map(lambda i: id2label[i])
 
-# Plot bar chart
 plt.figure(figsize=(14, 6))
 sns.barplot(x="label_name", y="count", data=label_df)
 plt.xticks(rotation=45, ha="right")
@@ -262,73 +177,27 @@ plt.ylabel("Frequency")
 plt.tight_layout()
 plt.show()
 
-"""This bar chart shows how often each emotion label appears across all samples. Labels with higher frequency indicate more common emotions in the dataset.
-
-## 2. Embeddings
-"""
-
-# Import required libraries
+# Embeddings
 from sentence_transformers import SentenceTransformer
 import torch
 
-# Choose a small and fast model for generating sentence embeddings
 model = SentenceTransformer('all-MiniLM-L6-v2')
-
-# Optional: move model to GPU if available
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 model = model.to(device)
 
-# Subset the dataset to 2000 samples for efficiency
-sample_df = df.sample(n=2000, random_state=42).reset_index(drop=True)
-
-# Generate embeddings for the 'clean_text' column
-# This might take 1-2 minutes
-embeddings = model.encode(
-    sample_df['clean_text'].tolist(),
-    convert_to_tensor=True,
-    show_progress_bar=True,
-    device=device
-)
-
-# Store embeddings as a list inside the dataframe
-sample_df['embedding'] = embeddings.cpu().numpy().tolist()
-
-# Preview the result
-sample_df[['clean_text', 'embedding']].head()
-
-"""
-
-We use the all-MiniLM-L6-v2 model from SentenceTransformers to convert each cleaned text into a dense vector representation, capturing semantic meaning for further clustering and visualization.
-"""
-
-from tqdm.notebook import tqdm
-
-
 sample_df = df.sample(n=3000, random_state=42).reset_index(drop=True)
-
-
-embeddings = model.encode(sample_df["clean_text"].tolist(), show_progress_bar=True)
-
+embeddings = model.encode(sample_df["clean_text"].tolist(), show_progress_bar=True, device=device)
 sample_df["embedding"] = embeddings.tolist()
 
-"""This step uses the all-MiniLM-L6-v2 model from the sentence-transformers library to convert each text sample into a dense vector (embedding). To improve efficiency, a random sample of 3,000 examples is selected, encoded in batches, and saved into a new "embedding" column."""
-
+# t-SNE visualization
 from sklearn.manifold import TSNE
-import matplotlib.pyplot as plt
-import numpy as np
 
-# Convert list of embeddings to a NumPy array
 X = np.array(sample_df["embedding"].tolist())
-
-# Reduce the embedding dimensions to 2D using t-SNE
 tsne = TSNE(n_components=2, random_state=42, perplexity=30)
 X_embedded = tsne.fit_transform(X)
-
-# Add 2D coordinates to the dataframe
 sample_df["x"] = X_embedded[:, 0]
 sample_df["y"] = X_embedded[:, 1]
 
-# Visualize the 2D embeddings using a scatter plot
 plt.figure(figsize=(10, 6))
 plt.scatter(sample_df["x"], sample_df["y"], alpha=0.5)
 plt.title("t-SNE Projection of Text Embeddings")
@@ -336,18 +205,13 @@ plt.xlabel("Component 1")
 plt.ylabel("Component 2")
 plt.show()
 
-"""t-SNE projection of sentence embeddings onto 2D space. Each point represents a high-dimensional text embedding reduced to two components for visualization. This helps reveal potential clustering structures and the distribution of semantic similarities."""
-
+# KMeans Clustering
 from sklearn.cluster import KMeans
 
-# Define the number of clusters (you can try different values like 5, 10, etc.)
 num_clusters = 8
-
-# Apply K-Means clustering to the embeddings
 kmeans = KMeans(n_clusters=num_clusters, random_state=42)
 sample_df["cluster"] = kmeans.fit_predict(X)
 
-# Visualize the clusters on the t-SNE projection
 plt.figure(figsize=(10, 6))
 scatter = plt.scatter(sample_df["x"], sample_df["y"], c=sample_df["cluster"], cmap='tab10', alpha=0.6)
 plt.title(f"K-Means Clustering (k={num_clusters}) on t-SNE Projection")
@@ -356,46 +220,26 @@ plt.ylabel("Component 2")
 plt.colorbar(scatter, label="Cluster")
 plt.show()
 
-"""K-Means clustering (k=8) applied to sentence embeddings, visualized using t-SNE. Each color represents a distinct cluster, indicating groups of semantically similar text samples based on their embedding vectors.
-
-## 3. Inputs & Outputs
-"""
-
+# Recommendation Function
 from sentence_transformers import util
-import torch
 
-# Ensure sample_df contains the 'embedding' column
 EMBEDDINGS = torch.tensor(sample_df['embedding'].tolist(), device=device)
 
-# Define the recommendation function
 def recommend_similar_emotions(user_input):
     if not user_input.strip():
         return "Please enter some text."
-
-    # Encode the user input into an embedding
     user_embedding = model.encode(user_input, convert_to_tensor=True, device=device)
-
-    # Compute cosine similarity between user input and all stored embeddings
     similarities = util.cos_sim(user_embedding, EMBEDDINGS)[0]
     top_indices = similarities.argsort(descending=True)[:5]
-
-    # Format the top 5 most similar results
     results = []
     for idx in top_indices:
         row = sample_df.iloc[idx.item()]
         results.append(f"{row['text']}\nEmotions: {row['labels']}")
-
     return "\n\n".join(results)
 
-recommend_similar_emotions("I'm feeling nervous before my exam")
-
-"""Core recommendation logic for matching user input text to most similar texts in the dataset using sentence embeddings and cosine similarity.
-Returns top 5 results with their associated emotion labels.
-"""
-
+# Gradio App
 import gradio as gr
 
-# Create Gradio interface
 demo = gr.Interface(
     fn=recommend_similar_emotions,
     inputs=gr.Textbox(lines=2, placeholder="Type your situation or feeling..."),
@@ -404,6 +248,4 @@ demo = gr.Interface(
     description="Describe how you feel, and get similar examples with emotion labels."
 )
 
-demo.launch()
-
-"""Set up the Gradio web app for entering text and viewing recommendations"""
+demo.launch()
 
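The retrieval at the heart of recommend_similar_emotions is a cosine-similarity top-k lookup over precomputed sentence embeddings. A self-contained sketch of the same pattern, using the model name from app.py but an illustrative three-sentence corpus in place of sample_df["clean_text"]:

# Standalone sketch of the cosine-similarity lookup used in app.py.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

# Illustrative corpus standing in for the 3000 sampled dataset texts.
corpus = ["i am so happy today", "this exam has me terrified", "what a boring lecture"]
corpus_emb = model.encode(corpus, convert_to_tensor=True)

query_emb = model.encode("I'm feeling nervous before my exam", convert_to_tensor=True)
scores = util.cos_sim(query_emb, corpus_emb)[0]   # one similarity score per corpus entry
for idx in scores.argsort(descending=True)[:2]:   # top-2 matches
    print(f"{corpus[idx.item()]} (score={scores[idx].item():.2f})")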