ctizzzy0 committed on
Commit 13af33f · verified · 1 Parent(s): 2a0fd08

Update app.py

Files changed (1)
  1. app.py +344 -87
app.py CHANGED
@@ -1,106 +1,363 @@
- import gradio as gr
- from transformers import pipeline, Wav2Vec2Processor, Wav2Vec2ForCTC
- import torch
- from PIL import Image
- import cv2
  import numpy as np
- import matplotlib.pyplot as plt
  import pandas as pd
  from fpdf import FPDF
- import os
-
- # ---------------- MODELS ----------------
- TEXT_EMO_MODEL = "j-hartmann/emotion-english-distilroberta-base"
- VOICE_EMO_MODEL = "superb/wav2vec2-base-superb-er"
- FACE_EMO_MODEL = "trpakov/vit-face-expression"  # public model
-
- # Pipelines
- text_emo = pipeline("text-classification", model=TEXT_EMO_MODEL, top_k=None)
- voice_emo = pipeline("audio-classification", model=VOICE_EMO_MODEL, top_k=None)
- face_emo = pipeline("image-classification", model=FACE_EMO_MODEL, top_k=None)
-
- # ---------------- HELPERS ----------------
- def detect_face_and_crop(image_path):
-     img = cv2.imread(image_path)
-     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-     face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
-     faces = face_cascade.detectMultiScale(gray, 1.3, 5)
-     if len(faces) > 0:
-         (x, y, w, h) = faces[0]
-         img = img[y:y+h, x:x+w]
-     img_pil = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
-     return img_pil
-
- def analyze(text, audio_path, image_path):
-     results = {}
-
-     if text:
-         text_res = text_emo(text)[0]
-         results["Text"] = text_res
-     else:
-         text_res = []
-
-     if audio_path:
-         voice_res = voice_emo(audio_path)[0]
-         results["Voice"] = voice_res
-     else:
-         voice_res = []
-
-     if image_path:
-         cropped_face = detect_face_and_crop(image_path)
-         face_res = face_emo(cropped_face)[0]
-         results["Face"] = face_res
-     else:
-         face_res = []
-
-     # Create plot
-     plt.figure(figsize=(8,5))
-     for modality, data in results.items():
-         labels = [d['label'] for d in data]
-         scores = [d['score']*100 for d in data]
-         plt.bar(labels, scores, alpha=0.6, label=modality)
-     plt.xticks(rotation=45, ha="right")
-     plt.ylabel("Probability (%)")
-     plt.legend()
      plt.tight_layout()
-     plot_path = "emotion_plot.png"
-     plt.savefig(plot_path)
-     plt.close()

-     # Create PDF
      pdf = FPDF()
      pdf.add_page()
      pdf.set_font("Arial", size=16)
-     pdf.cell(200, 10, "Multi-Modal Emotion Analysis", ln=True, align='C')
-     pdf.image(plot_path, x=10, y=30, w=180)
-     pdf.ln(120)
      pdf.set_font("Arial", size=12)
-     for modality, data in results.items():
-         pdf.cell(200, 10, f"{modality}:", ln=True)
-         for d in data:
-             pdf.cell(200, 8, f"{d['label']}: {d['score']*100:.2f}%", ln=True)
-     pdf_output = "emotion_report.pdf"
-     pdf.output(pdf_output)

-     return plot_path, pdf_output

- # ---------------- UI ----------------
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
-     gr.Markdown("## 🧠 Multi-Modal Emotion AI (Text + Voice + Face)")
-     gr.Markdown("Analyze emotions from your **words**, **voice**, and **face**, then download a PDF report.")

-     with gr.Row():
-         text_in = gr.Textbox(label="Enter Text", placeholder="Type something meaningful...")
-         audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Upload or record voice (optional)")
-         img_in = gr.Image(type="filepath", label="Upload a face image (optional)")

-     run_btn = gr.Button("Analyze Emotions", variant="primary")

-     with gr.Row():
-         plot_out = gr.Image(label="Emotion Plot")
-         pdf_out = gr.File(label="Download Report")

-     run_btn.click(analyze, inputs=[text_in, audio_in, img_in], outputs=[plot_out, pdf_out])

  app = demo
+ # app.py  Multi-Modal Emotion AI (Text + Voice + Face)
+ # Features: per-modality analysis, fusion (weighted), safety screen, CBT distortions,
+ # PDF report with charts, trends logging, face auto-crop. CPU-friendly for HF Spaces.
+
+ import os, io, json, datetime
+ from typing import Dict, List, Optional, Tuple
+
  import numpy as np
  import pandas as pd
+ import matplotlib.pyplot as plt
+ from PIL import Image
+ import cv2
+
+ import gradio as gr
  from fpdf import FPDF
+ from transformers import pipeline
+
+ # -----------------------------
+ # Public, lightweight models
+ # -----------------------------
+ TEXT_MODEL = "SamLowe/roberta-base-go_emotions"  # 27 emotions
+ VOICE_MODEL = "superb/wav2vec2-base-superb-er"   # speech emotion recognition
+ FACE_MODEL = "trpakov/vit-face-expression"       # facial expression (ViT)
+
+ text_pipe = pipeline("text-classification", model=TEXT_MODEL, top_k=None)
+ voice_pipe = pipeline("audio-classification", model=VOICE_MODEL, top_k=None)
+ face_pipe = pipeline("image-classification", model=FACE_MODEL, top_k=None)
+
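+ # Example (illustrative): with top_k=None a single call such as text_pipe("I feel great")
+ # returns roughly [[{"label": "joy", "score": 0.9}, {"label": "optimism", "score": 0.05}, ...]]
+ # (one inner list per input); to_probs() below flattens that nesting and renormalizes the scores.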
+ # -----------------------------
+ # Files / persistence
+ # -----------------------------
+ RUN_LOG = "runs.csv"
+ if not os.path.exists(RUN_LOG):
+     pd.DataFrame(columns=["timestamp","text","text_top","voice_top","face_top","fused_top","pos_index"]).to_csv(RUN_LOG, index=False)
+
+ os.makedirs("charts", exist_ok=True)
+
+ # -----------------------------
+ # Safety & CBT
+ # -----------------------------
+ RISK_TERMS = {
+     "self_harm": ["kill myself","end it","suicide","self harm","cutting","overdose"],
+     "violence": ["hurt them","attack","kill them","shoot","stab","revenge"]
+ }
+
+ DISTORTIONS = {
+     "catastrophizing": ["ruined","disaster","worst ever","nothing will work","everything is over"],
+     "all_or_nothing": ["always","never","completely","totally","entirely"],
+     "mind_reading": ["they think","everyone thinks","people will think"],
+     "fortune_telling": ["will fail","will go wrong","i'm doomed"],
+     "labeling": ["i'm a failure","i'm useless","i'm stupid"],
+     "should_statements": ["should","must","have to"],
+     "discount_positive": ["doesn't count","just luck","not a big deal"]
+ }
+ # NOTE: keep these strings latin-1-safe; they are written into the PDF via FPDF's core Arial font.
+ REFRAMES = {
+     "catastrophizing": "Zoom out: list 3 realistic outcomes besides worst-case.",
+     "all_or_nothing": "Find the gray: what % went right vs wrong?",
+     "mind_reading": "Check evidence: what did they actually say/do?",
+     "fortune_telling": "Run a small test that could disconfirm your prediction.",
+     "labeling": "Describe the behavior, not your identity.",
+     "should_statements": "Swap 'should' -> 'I prefer / I will try'.",
+     "discount_positive": "Write 3 things you handled well and why they matter."
+ }
+
+ def safety_screen(text: str) -> Tuple[str, Dict[str, List[str]]]:
+     t = (text or "").lower()
+     hits = {k:[w for w in v if w in t] for k,v in RISK_TERMS.items()}
+     hits = {k:v for k,v in hits.items() if v}
+     return ("high" if hits else "low"), hits
+
+ def detect_distortions(text: str) -> List[str]:
+     t = (text or "").lower()
+     found = []
+     for name, cues in DISTORTIONS.items():
+         if any(cue in t for cue in cues):
+             found.append(name)
+     return sorted(set(found))
+
+ def reframe_tips(names: List[str]) -> List[str]:
+     return [REFRAMES[n] for n in names if n in REFRAMES]
+
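+ # Example (illustrative): detect_distortions("I always fail and I'm a failure")
+ # -> ["all_or_nothing", "labeling"]; reframe_tips() then maps each name to its REFRAMES entry.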
+ # -----------------------------
+ # Emotion utilities
+ # -----------------------------
+ POSITIVE = set(["admiration","amusement","approval","gratitude","joy","love","optimism","relief","pride","excitement"])
+ NEGATIVE = set(["anger","annoyance","disappointment","disapproval","disgust","embarrassment","fear","grief","nervousness","remorse","sadness"])
+
+ def to_probs(outputs) -> Dict[str,float]:
+     # pipelines return list[list[{"label","score"}]] when top_k=None
+     if isinstance(outputs, list) and outputs and isinstance(outputs[0], list):
+         outputs = outputs[0]
+     d = {o["label"]: float(o["score"]) for o in outputs}
+     s = sum(d.values()) or 1.0
+     return {k: v/s for k,v in d.items()}
+
+ def top_item(prob: Optional[Dict[str,float]]) -> str:
+     if not prob: return ""
+     k = max(prob, key=prob.get)
+     return f"{k} ({prob[k]*100:.1f}%)"
+
+ def positivity_index(prob: Optional[Dict[str,float]]) -> float:
+     if not prob: return 0.5
+     pos = sum(prob.get(k,0.0) for k in POSITIVE)
+     neg = sum(prob.get(k,0.0) for k in NEGATIVE)
+     return round((pos - neg + 1)/2, 4)  # [-1,1] -> [0,1]
+
+ def union_merge(dicts: List[Optional[Dict[str,float]]], weights: List[float]) -> Dict[str,float]:
+     labels = set()
+     for d in dicts:
+         if d: labels |= set(d.keys())
+     merged = {l:0.0 for l in labels}
+     for d, w in zip(dicts, weights):
+         if not d: continue
+         for l in labels:
+             merged[l] += w * d.get(l, 0.0)
+     s = sum(merged.values()) or 1.0
+     return {k:v/s for k,v in merged.items()}
+
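+ # Worked example (illustrative): positivity_index({"joy": 0.8, "sadness": 0.2}) = (0.8 - 0.2 + 1) / 2 = 0.80,
+ # and union_merge([{"joy": 0.7, "sadness": 0.3}, {"neutral": 1.0}], [0.6, 0.4])
+ # -> {"joy": 0.42, "sadness": 0.18, "neutral": 0.40}: a weighted sum over the label union, renormalized to 1.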
+ def bar_fig(prob: Dict[str,float], title: str):
+     labels = list(prob.keys())
+     vals = [prob[k]*100 for k in labels]
+     fig, ax = plt.subplots(figsize=(7.0, 3.6))
+     ax.bar(labels, vals)
+     ax.set_ylim(0, 100)
+     ax.set_ylabel("Probability (%)")
+     ax.set_title(title)
+     for i, v in enumerate(vals):
+         ax.text(i, v + 1, f"{v:.1f}%", ha="center", fontsize=8)
+     plt.xticks(rotation=28, ha="right")
      plt.tight_layout()
+     return fig
+
+ def save_chart(prob: Dict[str,float], title: str, path: str):
+     fig = bar_fig(prob, title)
+     fig.savefig(path, dpi=160, bbox_inches="tight")
+     plt.close(fig)
+
+ # -----------------------------
+ # Computer vision: face crop
+ # -----------------------------
+ HAAR = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
+ def crop_face(image_path: str) -> Image.Image:
+     try:
+         img = cv2.imread(image_path)
+         if img is None:  # fallback
+             return Image.open(image_path).convert("RGB")
+         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+         faces = HAAR.detectMultiScale(gray, scaleFactor=1.2, minNeighbors=5, minSize=(80,80))
+         if len(faces) > 0:
+             x,y,w,h = sorted(faces, key=lambda b:b[2]*b[3], reverse=True)[0]
+             img = img[y:y+h, x:x+w]
+         return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
+     except Exception:
+         return Image.open(image_path).convert("RGB")
+
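+ # crop_face keeps the largest detected face (largest bounding-box area) and falls back to the
+ # full image when OpenCV cannot read the file or no face is detected.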
+ # -----------------------------
+ # Per-modality inference
+ # -----------------------------
+ def analyze_text(text: str):
+     if not text or not text.strip():
+         raise gr.Error("Please enter text.")  # gr.Error must be raised, not returned
+     probs = to_probs(text_pipe(text))
+     msg = f"**Top Text Emotion:** {top_item(probs)} | **Positivity Index:** {positivity_index(probs):.2f}"
+     fig = bar_fig(probs, "Text Emotions")
+     return msg, fig, json.dumps(probs)
+
+ def analyze_voice(audio_path: Optional[str]):
+     if not audio_path:
+         return "No audio provided.", None, None
+     probs = to_probs(voice_pipe(audio_path))
+     msg = f"**Top Voice Emotion:** {top_item(probs)}"
+     fig = bar_fig(probs, "Voice Emotions")
+     return msg, fig, json.dumps(probs)
+
+ def analyze_face(image_path: Optional[str]):
+     if not image_path:
+         return "No image provided.", None, None
+     face_img = crop_face(image_path)
+     probs = to_probs(face_pipe(face_img))
+     msg = f"**Top Face Emotion:** {top_item(probs)}"
+     fig = bar_fig(probs, "Face Emotions")
+     return msg, fig, json.dumps(probs)
+
+ # -----------------------------
+ # PDF Report
+ # -----------------------------
+ def build_pdf(text_in: str,
+               text_prob: Optional[Dict[str,float]],
+               voice_prob: Optional[Dict[str,float]],
+               face_prob: Optional[Dict[str,float]],
+               fused_prob: Optional[Dict[str,float]],
+               safety_level: str, safety_hits: Dict[str,List[str]],
+               distortions: List[str], tips: List[str]) -> str:
+
+     # save charts
+     paths = []
+     if text_prob: save_chart(text_prob, "Text Emotions", "charts/text.png"); paths.append("charts/text.png")
+     if voice_prob: save_chart(voice_prob, "Voice Emotions", "charts/voice.png"); paths.append("charts/voice.png")
+     if face_prob: save_chart(face_prob, "Face Emotions", "charts/face.png"); paths.append("charts/face.png")
+     if fused_prob: save_chart(fused_prob, "Fused Profile", "charts/fused.png"); paths.append("charts/fused.png")

      pdf = FPDF()
      pdf.add_page()
      pdf.set_font("Arial", size=16)
+     pdf.cell(0, 10, "Multi-Modal Emotion Report", ln=True, align="C")
+
      pdf.set_font("Arial", size=12)
+     pdf.cell(0, 8, f"Timestamp: {datetime.datetime.now().isoformat(sep=' ', timespec='seconds')}", ln=True)
+     pdf.multi_cell(0, 8, f"Input Text: {text_in or '(none)'}")
+     pdf.ln(2)
+
+     if safety_level == "high":
+         pdf.set_text_color(220,0,0)
+         # ASCII-only strings here: FPDF's core Arial font is latin-1 and cannot encode warning/bullet symbols
+         pdf.multi_cell(0, 8, "WARNING: High-risk language detected. If you're in immediate danger, contact local emergency services.")
+         pdf.multi_cell(0, 8, "US: 988 (Suicide & Crisis Lifeline)")
+         if safety_hits:
+             pdf.multi_cell(0, 8, f"Matched terms: {json.dumps(safety_hits)}")
+         pdf.set_text_color(0,0,0)
+         pdf.ln(2)
+
+     if distortions:
+         pdf.cell(0, 8, f"Cognitive distortions: {', '.join(distortions)}", ln=True)
+     if tips:
+         pdf.cell(0, 8, "Reframe suggestions:", ln=True)
+         for t in tips:
+             pdf.multi_cell(0, 7, f" - {t}")
+     pdf.ln(2)

+     for p in paths:
+         if os.path.exists(p):
+             pdf.image(p, w=180)
+             pdf.ln(4)

+     out = "emotion_report.pdf"
+     pdf.output(out)
+     return out
+
+ # -----------------------------
+ # Trends
+ # -----------------------------
+ def log_run(row: dict):
+     df = pd.read_csv(RUN_LOG)
+     df.loc[len(df)] = row
+     df.to_csv(RUN_LOG, index=False)
+
+ def plot_trends():
+     if not os.path.exists(RUN_LOG) or os.path.getsize(RUN_LOG) == 0:
+         return None
+     df = pd.read_csv(RUN_LOG)
+     if df.empty: return None
+     df["date"] = pd.to_datetime(df["timestamp"]).dt.date
+     daily = df.groupby("date")["pos_index"].mean().reset_index()
+     fig, ax = plt.subplots(figsize=(7,3.2))
+     ax.plot(daily["date"], daily["pos_index"], marker="o")
+     ax.set_ylim(0,1)
+     ax.set_ylabel("Positivity Index (0-1)")
+     ax.set_title("Positivity Trend")
+     plt.xticks(rotation=25, ha="right"); plt.tight_layout()
+     return fig
+
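+ # plot_trends aggregates runs.csv by calendar date and plots the mean pos_index per day,
+ # so several runs on the same day collapse into a single point on the trend line.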
+ # -----------------------------
+ # Fusion handler
+ # -----------------------------
+ def fuse_and_report(text_json, voice_json, face_json, text_raw, w_text, w_voice, w_face):
+     te = json.loads(text_json) if text_json else None
+     ve = json.loads(voice_json) if voice_json else None
+     fe = json.loads(face_json) if face_json else None
+     weights = [w_text, w_voice, w_face]
+     s = sum(weights) or 1.0
+     weights = [w/s for w in weights]
+     fused = union_merge([te, ve, fe], weights) if (te or ve or fe) else None
+
+     # safety + CBT
+     safety_level, safety_hits = safety_screen(text_raw or "")
+     distos = detect_distortions(text_raw or "")
+     tips = reframe_tips(distos)
+
+     # pdf
+     pdf_path = build_pdf(text_raw, te, ve, fe, fused, safety_level, safety_hits, distos, tips)
+
+     # log
+     pi_val = positivity_index(te)
+     log_run({
+         "timestamp": datetime.datetime.now().isoformat(sep=" ", timespec="seconds"),
+         "text": text_raw or "",
+         "text_top": top_item(te),
+         "voice_top": top_item(ve),
+         "face_top": top_item(fe),
+         "fused_top": top_item(fused),
+         "pos_index": pi_val
+     })
+
+     msg = f"**Fused Top:** {top_item(fused) or '(insufficient inputs)'} | Weights → Text:{weights[0]:.2f}, Voice:{weights[1]:.2f}, Face:{weights[2]:.2f}"
+     plot = bar_fig(fused, "Fused Emotional Profile") if fused else None
+     return msg, plot, pdf_path
+
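+ # Note: the slider weights are renormalized to sum to 1, and union_merge renormalizes again after
+ # dropping missing modalities, so the fused profile is a convex combination of whichever
+ # per-modality distributions were actually analyzed.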
+ # -----------------------------
+ # Gradio UI
+ # -----------------------------
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# 🧠 Multi-Modal Emotion AI (Text + Voice + Face)")
+     gr.Markdown("Analyze emotions across **text, voice, and face**, detect **safety risks** and **cognitive distortions**, "
+                 "tune **fusion weights**, and download a **PDF report**. Audio/image are optional.")
+
+     # state holders
+     st_text_json = gr.State()
+     st_voice_json = gr.State()
+     st_face_json = gr.State()
+     st_text_raw = gr.State()
+
+     with gr.Tab("📝 Text"):
+         t_in = gr.Textbox(label="Your text", lines=3, placeholder="How are you feeling today?")
+         t_btn = gr.Button("Analyze Text", variant="primary")
+         t_msg = gr.Markdown()
+         t_plot = gr.Plot()
+         def _t_chain(txt):
+             msg, fig, j = analyze_text(txt)
+             return msg, fig, j, txt
+         t_btn.click(_t_chain, inputs=t_in, outputs=[t_msg, t_plot, st_text_json, st_text_raw])
+
+     with gr.Tab("🎤 Voice"):
+         a_in = gr.Audio(sources=["microphone","upload"], type="filepath", label="Record or upload audio (optional)")
+         a_btn = gr.Button("Analyze Voice", variant="primary")
+         a_msg = gr.Markdown()
+         a_plot = gr.Plot()
+         a_btn.click(analyze_voice, inputs=a_in, outputs=[a_msg, a_plot, st_voice_json])

+     with gr.Tab("📷 Face"):
+         f_in = gr.Image(type="filepath", label="Upload a face image (optional)")
+         f_btn = gr.Button("Analyze Face", variant="primary")
+         f_msg = gr.Markdown()
+         f_plot = gr.Plot()
+         f_btn.click(analyze_face, inputs=f_in, outputs=[f_msg, f_plot, st_face_json])

+     with gr.Tab("🧩 Fusion + Report"):
+         with gr.Row():
+             w_text = gr.Slider(0, 1, value=0.5, step=0.05, label="Text weight")
+             w_voice = gr.Slider(0, 1, value=0.3, step=0.05, label="Voice weight")
+             w_face = gr.Slider(0, 1, value=0.2, step=0.05, label="Face weight")
+         fuse_btn = gr.Button("Fuse & Generate PDF", variant="primary")
+         fuse_msg = gr.Markdown()
+         fuse_plot = gr.Plot()
+         fuse_pdf = gr.File(label="Download Report")
+         fuse_btn.click(
+             fuse_and_report,
+             inputs=[st_text_json, st_voice_json, st_face_json, st_text_raw, w_text, w_voice, w_face],
+             outputs=[fuse_msg, fuse_plot, fuse_pdf]
+         )

+     with gr.Tab("📈 Trends"):
+         tr_btn = gr.Button("Refresh Positivity Trend")
+         tr_plot = gr.Plot()
+         tr_btn.click(plot_trends, inputs=None, outputs=tr_plot)

+     with gr.Tab("ℹ️ About"):
+         gr.Markdown(
+             "Models: **GoEmotions (text)**, **Wav2Vec2-ER (audio)**, **ViT-Face-Expression (image)**. "
+             "Privacy: inputs are processed in-session; reports are generated on this Space when you request them. "
+             "This is an educational demo, not medical advice."
+         )

  app = demo