aeresd committed
Commit e21b576 · verified · 1 Parent(s): 40382a6

Update app.py

Files changed (1)
  1. app.py +88 -110
app.py CHANGED
@@ -5,8 +5,10 @@ from PIL import Image
  import pytesseract
  import pandas as pd
  import plotly.express as px

- # ✅ Step 1: Emoji translation model
  emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"
  emoji_tokenizer = AutoTokenizer.from_pretrained(emoji_model_id, trust_remote_code=True)
  emoji_model = AutoModelForCausalLM.from_pretrained(
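
The block above loads the fine-tuned Qwen1.5 checkpoint that rewrites emoji-laden text into plain text before classification. A minimal standalone sketch of that translation step, reusing the `输入:`/`输出:` prompt convention that `classify_text_elements()` applies later in this diff (the `translate_emoji` helper is illustrative, not part of app.py):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "JenniferHJF/qwen1.5-emoji-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
model = model.to("cuda" if torch.cuda.is_available() else "cpu").eval()

def translate_emoji(text: str) -> str:
    """Paraphrase an emoji-laden sentence into plain text (illustrative helper)."""
    prompt = f"输入:{text}\n输出:"  # same prompt format as in app.py
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=64, do_sample=False)
    decoded = tokenizer.decode(out[0], skip_special_tokens=True)
    # Keep only the generated continuation after "输出:"
    return decoded.split("输出:")[-1].strip()

print(translate_emoji("你是🐷"))
```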
@@ -16,7 +18,7 @@ emoji_model = AutoModelForCausalLM.from_pretrained(
  ).to("cuda" if torch.cuda.is_available() else "cpu")
  emoji_model.eval()

- # ✅ Step 2: Offensive-text classification models
  model_options = {
      "Toxic-BERT": "unitary/toxic-bert",
      "Roberta Offensive": "cardiffnlp/twitter-roberta-base-offensive",
@@ -26,106 +28,89 @@ model_options = {
  # ✅ Page configuration
  st.set_page_config(page_title="Emoji Offensive Text Detector", page_icon="🚨", layout="wide")

- # ✅ Sidebar configuration
  with st.sidebar:
      st.header("🧠 Configuration")
      selected_model = st.selectbox("Choose classification model", list(model_options.keys()))
      selected_model_id = model_options[selected_model]
-     classifier = pipeline(
-         "text-classification",
-         model=selected_model_id,
-         device=0 if torch.cuda.is_available() else -1,
-         return_all_scores=True
-     )

  # Initialize the history
  if "history" not in st.session_state:
      st.session_state.history = []

- # Classification function (optimized version)
- def classify_emoji_text(text: str):
-     prompt = f"输入:{text}\n输出:"
-     input_ids = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
-     with torch.no_grad():
-         output_ids = emoji_model.generate(**input_ids, max_new_tokens=64, do_sample=False)
-     decoded = emoji_tokenizer.decode(output_ids[0], skip_special_tokens=True)
-     translated_text = decoded.split("输出:")[-1].strip() if "输出:" in decoded else decoded.strip()
-
-     # Get all classification results
-     all_results = classifier(translated_text)
-
-     # Radar-chart category mapping rules
-     radar_categories = ["Insult", "Abuse", "Discrimination", "Hate Speech", "Vulgarity"]
-     radar_scores = {category: 0.0 for category in radar_categories}
-
-     # Model-specific mapping rules
-     model_mappings = {
-         "Toxic-BERT": {
-             "toxic": "Vulgarity",
-             "severe_toxic": "Abuse",
-             "obscene": "Vulgarity",
-             "threat": "Hate Speech",
-             "insult": "Insult",
-             "identity_hate": "Discrimination"
-         },
-         "Roberta Offensive": {
-             "offensive": ["Insult", "Abuse", "Vulgarity"]
-         }
-     }

-     # Dynamically build the radar scores
-     for result in all_results:
-         label = result['label']
-         score = result['score']
-
-         if selected_model == "Toxic-BERT":
-             mapped_category = model_mappings["Toxic-BERT"].get(label)
-             if mapped_category and score > radar_scores[mapped_category]:
-                 radar_scores[mapped_category] = score
-         elif selected_model == "Roberta Offensive" and label == "offensive":
-             for category in model_mappings["Roberta Offensive"]["offensive"]:
-                 if score > radar_scores[category]:
-                     radar_scores[category] = score
-
-     # Get the primary classification result
-     primary_result = max(all_results, key=lambda x: x['score'])
-     label = primary_result["label"]
-     score = primary_result["score"]
-     reasoning = f"The sentence was flagged as '{label}' due to potentially offensive phrases. Consider replacing emotionally charged, ambiguous, or abusive terms."
-
-     # Store in the history
-     st.session_state.history.append({
-         "text": text,
-         "translated": translated_text,
-         "label": label,
-         "score": score,
-         "reason": reasoning,
-         "radar_scores": radar_scores
-     })
-     return translated_text, label, score, reasoning, radar_scores
-
- # Main UI
  st.title("🚨 Emoji Offensive Text Detector & Analysis Dashboard")

- # Text input & analysis
  st.subheader("1. 输入与分类")
- default_text = "你是🐷"
  text = st.text_area("Enter sentence with emojis:", value=default_text, height=150)

  if st.button("🚦 Analyze Text"):
      with st.spinner("🔍 Processing..."):
          try:
-             translated, label, score, reason, radar = classify_emoji_text(text)
-             st.markdown("**Translated sentence:**")
-             st.code(translated, language="text")
-             st.markdown(f"**Prediction:** {label}")
-             st.markdown(f"**Confidence Score:** {score:.2%}")
-             st.markdown("**Model Explanation:**")
-             st.info(reason)
          except Exception as e:
              st.error(f"❌ An error occurred:\n{e}")

- # Image analysis
  st.markdown("---")
  st.subheader("2. 图片 OCR & 分类")
  uploaded_file = st.file_uploader("Upload an image (JPG/PNG)", type=["jpg","jpeg","png"])
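
The removed `pipeline(...)` call above requested `return_all_scores=True`, and the old `classify_emoji_text()` aggregated every label's score into the radar categories. The replacement pipeline later in this diff drops that flag, so `classifier(text)[0]` yields only the single top label. If all per-label scores are still wanted for the radar aggregation, a hedged sketch using `top_k=None` (the non-deprecated spelling of `return_all_scores=True`) could look like this; the exact output nesting can vary by transformers version:

```python
import torch
from transformers import pipeline

# Hypothetical variant of the sidebar classifier: top_k=None returns every
# label with its score instead of only the top prediction.
classifier = pipeline(
    "text-classification",
    model="unitary/toxic-bert",
    device=0 if torch.cuda.is_available() else -1,
    top_k=None,
)

scores = classifier("you are an idiot")
# Depending on the transformers version the result may be nested one level deep.
if scores and isinstance(scores[0], list):
    scores = scores[0]
for item in scores:
    print(item["label"], round(item["score"], 3))
```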
@@ -137,13 +122,12 @@ if uploaded_file:
      if ocr_text:
          st.markdown("**Extracted Text:**")
          st.code(ocr_text)
-         translated, label, score, reason, radar = classify_emoji_text(ocr_text)
-         st.markdown("**Translated sentence:**")
-         st.code(translated, language="text")
-         st.markdown(f"**Prediction:** {label}")
-         st.markdown(f"**Confidence Score:** {score:.2%}")
-         st.markdown("**Model Explanation:**")
-         st.info(reason)
      else:
          st.info("⚠️ No text detected in the image.")

@@ -151,30 +135,24 @@ if uploaded_file:
  st.markdown("---")
  st.subheader("3. Violation Analysis Dashboard")
  if st.session_state.history:
-     # Show the history
      df = pd.DataFrame(st.session_state.history)
-     st.markdown("### 🧾 Offensive Terms & Suggestions")
      for item in st.session_state.history:
-         st.markdown(f"- 🔹 **Input:** {item['text']}")
-         st.markdown(f"  - **Translated:** {item['translated']}")
-         st.markdown(f"  - **Label:** {item['label']} with **{item['score']:.2%}** confidence")
-         st.markdown(f"  - 🔧 **Suggestion:** {item['reason']}")
-
-     # Dynamically build the radar chart
-     latest_radar = st.session_state.history[-1]["radar_scores"]
-     radar_df = pd.DataFrame({
-         "Category": latest_radar.keys(),
-         "Score": latest_radar.values()
-     })
-     radar_fig = px.line_polar(
-         radar_df,
-         r='Score',
-         theta='Category',
-         line_close=True,
-         title="⚠️ Risk Radar by Category",
-         range_r=[0,1]
-     )
-     radar_fig.update_traces(fill='toself', line_color='red')
      st.plotly_chart(radar_fig)
  else:
      st.info("⚠️ No classification data available yet.")
 
  import pytesseract
  import pandas as pd
  import plotly.express as px
+ import re
+ from collections import defaultdict

+ # ✅ Step 1: Emoji translation model (the model you fine-tuned yourself)
  emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"
  emoji_tokenizer = AutoTokenizer.from_pretrained(emoji_model_id, trust_remote_code=True)
  emoji_model = AutoModelForCausalLM.from_pretrained(
 
  ).to("cuda" if torch.cuda.is_available() else "cpu")
  emoji_model.eval()

+ # ✅ Step 2: Selectable offensive-text classification models
  model_options = {
      "Toxic-BERT": "unitary/toxic-bert",
      "Roberta Offensive": "cardiffnlp/twitter-roberta-base-offensive",
 
  # ✅ Page configuration
  st.set_page_config(page_title="Emoji Offensive Text Detector", page_icon="🚨", layout="wide")

+ # ✅ Sidebar: model selection
  with st.sidebar:
      st.header("🧠 Configuration")
      selected_model = st.selectbox("Choose classification model", list(model_options.keys()))
      selected_model_id = model_options[selected_model]
+     classifier = pipeline("text-classification", model=selected_model_id, device=0 if torch.cuda.is_available() else -1)

  # Initialize the history
  if "history" not in st.session_state:
      st.session_state.history = []

+ # Map labels to radar-chart categories (can be tuned per model)
+ label_to_category = {
+     "toxic": "Abuse",
+     "offensive": "Insult",
+     "insult": "Insult",
+     "threat": "Hate Speech",
+     "obscene": "Vulgarity",
+     "hate": "Hate Speech",
+     "discrimination": "Discrimination"
+ }
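
The lookup above keys on lowercased labels such as `toxic` and `offensive`. Toxic-BERT does emit those names, but depending on its config the Cardiff offensive model may report generic ids like `LABEL_0`/`LABEL_1`, in which case every element falls into the `Others` bucket. A hedged normalization sketch, with the alias mapping explicitly an assumption to verify against the model's `id2label`:

```python
# Hedged sketch: normalize raw pipeline labels before the category lookup.
# Whether the offensive model really emits "LABEL_0"/"LABEL_1" depends on its
# config.id2label and should be verified; the alias mapping is an assumption.
RAW_LABEL_ALIASES = {
    "label_0": "non-offensive",  # assumed id2label for the Cardiff model
    "label_1": "offensive",
}

def normalize_label(raw_label: str) -> str:
    label = raw_label.lower()
    return RAW_LABEL_ALIASES.get(label, label)

# Usage inside classify_text_elements (illustrative):
# label = normalize_label(classification["label"])
# category = label_to_category.get(label, "Others")
```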

+ # ✅ Offensiveness analysis function (element by element)
+ def classify_text_elements(text: str):
+     elements = re.split(r"[,。,、!!??\s\n]", text)
+     elements = [e for e in elements if e.strip()]
+
+     results = []
+     radar_scores = defaultdict(float)
+
+     for element in elements:
+         prompt = f"输入:{element}\n输出:"
+         input_ids = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
+         with torch.no_grad():
+             output_ids = emoji_model.generate(**input_ids, max_new_tokens=64, do_sample=False)
+         decoded = emoji_tokenizer.decode(output_ids[0], skip_special_tokens=True)
+         translated = decoded.split("输出:")[-1].strip() if "输出:" in decoded else decoded.strip()
+
+         classification = classifier(translated)[0]
+         label = classification["label"].lower()
+         score = classification["score"]
+
+         category = label_to_category.get(label, "Others")
+         radar_scores[category] += score
+
+         reasoning = f"'{element}' was flagged as '{label}' → '{category}' due to potential offensiveness."
+         results.append({
+             "text": element,
+             "translated": translated,
+             "label": label,
+             "category": category,
+             "score": score,
+             "reason": reasoning
+         })
+
+     st.session_state.history.extend(results)
+     return results, radar_scores
+
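
One detail of the splitting rule above: the character class includes `\s`, so English input is split on every space and classified word by word, while Chinese clauses stay intact. A hedged alternative that splits only on clause punctuation (the `split_into_elements` helper is illustrative):

```python
import re

# Hedged alternative to the splitting rule in classify_text_elements():
# dropping \s keeps whole clauses together instead of single English words.
CLAUSE_DELIMITERS = r"[,。,、!!??\n]"

def split_into_elements(text: str) -> list[str]:
    return [e.strip() for e in re.split(CLAUSE_DELIMITERS, text) if e.strip()]

print(split_into_elements("你是🐷,太垃圾了,滚开!"))   # ['你是🐷', '太垃圾了', '滚开']
print(split_into_elements("You are a pig, get lost!"))  # ['You are a pig', 'get lost']
```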
+ # Main page
  st.title("🚨 Emoji Offensive Text Detector & Analysis Dashboard")

+ # Text input
  st.subheader("1. 输入与分类")
+ default_text = "你是🐷,太垃圾了,滚开!"
  text = st.text_area("Enter sentence with emojis:", value=default_text, height=150)

  if st.button("🚦 Analyze Text"):
      with st.spinner("🔍 Processing..."):
          try:
+             analysis_results, radar_scores = classify_text_elements(text)
+
+             st.markdown("### ✨ Element-wise Classification")
+             for item in analysis_results:
+                 st.markdown(f"- 🔹 **Input:** {item['text']}")
+                 st.markdown(f"  - ✨ **Translated:** {item['translated']}")
+                 st.markdown(f"  - ❗ **Label:** {item['label']} → **{item['category']}** ({item['score']:.2%})")
+                 st.markdown(f"  - 🔧 **Reasoning:** {item['reason']}")
+
+             st.success("✅ Analysis complete!")
          except Exception as e:
              st.error(f"❌ An error occurred:\n{e}")
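
`radar_scores` returned here lives only for the script run triggered by the button click; Streamlit re-executes app.py on every interaction, so on a later rerun the dashboard at the bottom would reference an undefined `radar_scores`. A hedged sketch that persists the accumulated scores in `st.session_state` (the `remember_radar_scores` helper is illustrative):

```python
from collections import defaultdict
import streamlit as st

# Hedged sketch: keep per-category scores across Streamlit reruns so the
# dashboard can read them even when the analysis ran on an earlier run.
if "radar_scores" not in st.session_state:
    st.session_state.radar_scores = defaultdict(float)

def remember_radar_scores(radar_scores: dict) -> None:
    for category, score in radar_scores.items():
        st.session_state.radar_scores[category] += score

# In the button handler (illustrative):
# analysis_results, radar_scores = classify_text_elements(text)
# remember_radar_scores(radar_scores)
```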

+ # Image upload & OCR
  st.markdown("---")
  st.subheader("2. 图片 OCR & 分类")
  uploaded_file = st.file_uploader("Upload an image (JPG/PNG)", type=["jpg","jpeg","png"])

      if ocr_text:
          st.markdown("**Extracted Text:**")
          st.code(ocr_text)
+         analysis_results, radar_scores = classify_text_elements(ocr_text)
+         for item in analysis_results:
+             st.markdown(f"- 🔹 **Input:** {item['text']}")
+             st.markdown(f"  - ✨ **Translated:** {item['translated']}")
+             st.markdown(f"  - ❗ **Label:** {item['label']} → **{item['category']}** ({item['score']:.2%})")
+             st.markdown(f"  - 🔧 **Reasoning:** {item['reason']}")
      else:
          st.info("⚠️ No text detected in the image.")

  st.markdown("---")
  st.subheader("3. Violation Analysis Dashboard")
  if st.session_state.history:
      df = pd.DataFrame(st.session_state.history)
+
+     st.markdown("### 🧾 Offense History Summary")
      for item in st.session_state.history:
+         st.markdown(f"- **Input:** {item['text']}")
+         st.markdown(f"  - 🔠 Translated: {item['translated']}")
+         st.markdown(f"  - 🏷️ Label: {item['label']} {item['category']}, Score: {item['score']:.2%}")
+
+     # Accumulate radar scores
+     category_list = ["Insult", "Abuse", "Discrimination", "Hate Speech", "Vulgarity"]
+     radar_data = {
+         "Category": category_list,
+         "Score": [min(st.session_state.history.count(c)/len(st.session_state.history), 1.0)
+                   if c in radar_scores else 0.0 for c in category_list]
+     }
+     radar_df = pd.DataFrame(radar_data)
+     radar_fig = px.line_polar(radar_df, r='Score', theta='Category', line_close=True, title="⚠️ Risk Radar by Category")
+     radar_fig.update_traces(line_color='black')
      st.plotly_chart(radar_fig)
  else:
      st.info("⚠️ No classification data available yet.")