aeresd committed on
Commit
857cce7
·
verified ·
1 Parent(s): e21b576

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -81
app.py CHANGED
@@ -5,8 +5,6 @@ from PIL import Image
5
  import pytesseract
6
  import pandas as pd
7
  import plotly.express as px
8
- import re
9
- from collections import defaultdict
10
 
11
  # ✅ Step 1: Emoji 翻译模型(你自己训练的模型)
12
  emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"
@@ -39,74 +37,50 @@ with st.sidebar:
39
  if "history" not in st.session_state:
40
  st.session_state.history = []
41
 
42
- # 映射 label 到雷達圖分類(可依模型微調)
43
- label_to_category = {
44
- "toxic": "Abuse",
45
- "offensive": "Insult",
46
- "insult": "Insult",
47
- "threat": "Hate Speech",
48
- "obscene": "Vulgarity",
49
- "hate": "Hate Speech",
50
- "discrimination": "Discrimination"
51
- }
52
-
53
- # 冒犯性分析函數(逐元素)
54
- def classify_text_elements(text: str):
55
- elements = re.split(r"[,。,、!!??\s\n]", text)
56
- elements = [e for e in elements if e.strip()]
57
-
58
- results = []
59
- radar_scores = defaultdict(float)
60
-
61
- for element in elements:
62
- prompt = f"输入:{element}\n输出:"
63
- input_ids = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
64
- with torch.no_grad():
65
- output_ids = emoji_model.generate(**input_ids, max_new_tokens=64, do_sample=False)
66
- decoded = emoji_tokenizer.decode(output_ids[0], skip_special_tokens=True)
67
- translated = decoded.split("输出:")[-1].strip() if "输出:" in decoded else decoded.strip()
68
-
69
- classification = classifier(translated)[0]
70
- label = classification["label"].lower()
71
- score = classification["score"]
72
-
73
- category = label_to_category.get(label, "Others")
74
- radar_scores[category] += score
75
-
76
- reasoning = f"'{element}' was flagged as '{label}' → '{category}' due to potential offensiveness."
77
- results.append({
78
- "text": element,
79
- "translated": translated,
80
- "label": label,
81
- "category": category,
82
- "score": score,
83
- "reason": reasoning
84
- })
85
-
86
- st.session_state.history.extend(results)
87
- return results, radar_scores
88
-
89
- # 主页面
90
  st.title("🚨 Emoji Offensive Text Detector & Analysis Dashboard")
91
 
92
  # 文本输入
93
  st.subheader("1. 输入与分类")
94
- default_text = "你是🐷,太垃圾了,滚开!"
95
  text = st.text_area("Enter sentence with emojis:", value=default_text, height=150)
96
 
97
  if st.button("🚦 Analyze Text"):
98
  with st.spinner("🔍 Processing..."):
99
  try:
100
- analysis_results, radar_scores = classify_text_elements(text)
101
-
102
- st.markdown("### ✨ Element-wise Classification")
103
- for item in analysis_results:
104
- st.markdown(f"- 🔹 **Input:** {item['text']}")
105
- st.markdown(f" - ✨ **Translated:** {item['translated']}")
106
- st.markdown(f" - ❗ **Label:** {item['label']} → **{item['category']}** ({item['score']:.2%})")
107
- st.markdown(f" - 🔧 **Reasoning:** {item['reason']}")
108
-
109
- st.success("✅ Analysis complete!")
110
  except Exception as e:
111
  st.error(f"❌ An error occurred:\n{e}")
112
 
@@ -122,12 +96,13 @@ if uploaded_file:
122
  if ocr_text:
123
  st.markdown("**Extracted Text:**")
124
  st.code(ocr_text)
125
- analysis_results, radar_scores = classify_text_elements(ocr_text)
126
- for item in analysis_results:
127
- st.markdown(f"- 🔹 **Input:** {item['text']}")
128
- st.markdown(f" - ✨ **Translated:** {item['translated']}")
129
- st.markdown(f" - ❗ **Label:** {item['label']} → **{item['category']}** ({item['score']:.2%})")
130
- st.markdown(f" - 🔧 **Reasoning:** {item['reason']}")
 
131
  else:
132
  st.info("⚠️ No text detected in the image.")
133
 
@@ -135,24 +110,59 @@ if uploaded_file:
135
  st.markdown("---")
136
  st.subheader("3. Violation Analysis Dashboard")
137
  if st.session_state.history:
 
138
  df = pd.DataFrame(st.session_state.history)
139
-
140
- st.markdown("### 🧾 Offense History Summary")
141
  for item in st.session_state.history:
142
- st.markdown(f"- **Input:** {item['text']}")
143
- st.markdown(f" - 🔠 Translated: {item['translated']}")
144
- st.markdown(f" - 🏷️ Label: {item['label']} {item['category']}, Score: {item['score']:.2%}")
145
-
146
- # 累积雷达分数
147
- category_list = ["Insult", "Abuse", "Discrimination", "Hate Speech", "Vulgarity"]
148
- radar_data = {
149
- "Category": category_list,
150
- "Score": [min(st.session_state.history.count(c)/len(st.session_state.history), 1.0)
151
- if c in radar_scores else 0.0 for c in category_list]
152
- }
153
- radar_df = pd.DataFrame(radar_data)
154
  radar_fig = px.line_polar(radar_df, r='Score', theta='Category', line_close=True, title="⚠️ Risk Radar by Category")
155
  radar_fig.update_traces(line_color='black')
156
  st.plotly_chart(radar_fig)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  else:
158
  st.info("⚠️ No classification data available yet.")
 
5
  import pytesseract
6
  import pandas as pd
7
  import plotly.express as px
 
 
8
 
9
  # ✅ Step 1: Emoji 翻译模型(你自己训练的模型)
10
  emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"
 
37
  if "history" not in st.session_state:
38
  st.session_state.history = []
39
 
40
+ # 分类函数
41
+ def classify_emoji_text(text: str):
42
+ prompt = f"输入:{text}\n输出:"
43
+ input_ids = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
44
+ with torch.no_grad():
45
+ output_ids = emoji_model.generate(**input_ids, max_new_tokens=64, do_sample=False)
46
+ decoded = emoji_tokenizer.decode(output_ids[0], skip_special_tokens=True)
47
+ translated_text = decoded.split("输出:")[-1].strip() if "输出:" in decoded else decoded.strip()
48
+
49
+ result = classifier(translated_text)[0]
50
+ label = result["label"]
51
+ score = result["score"]
52
+ reasoning = (
53
+ f"The sentence was flagged as '{label}' due to potentially offensive phrases. "
54
+ "Consider replacing emotionally charged, ambiguous, or abusive terms."
55
+ )
56
+
57
+ st.session_state.history.append({
58
+ "text": text,
59
+ "translated": translated_text,
60
+ "label": label,
61
+ "score": score,
62
+ "reason": reasoning
63
+ })
64
+ return translated_text, label, score, reasoning
65
+
66
+ # 主页面:输入与分析共存
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  st.title("🚨 Emoji Offensive Text Detector & Analysis Dashboard")
68
 
69
  # 文本输入
70
  st.subheader("1. 输入与分类")
71
+ default_text = "你是🐷"
72
  text = st.text_area("Enter sentence with emojis:", value=default_text, height=150)
73
 
74
  if st.button("🚦 Analyze Text"):
75
  with st.spinner("🔍 Processing..."):
76
  try:
77
+ translated, label, score, reason = classify_emoji_text(text)
78
+ st.markdown("**Translated sentence:**")
79
+ st.code(translated, language="text")
80
+ st.markdown(f"**Prediction:** {label}")
81
+ st.markdown(f"**Confidence Score:** {score:.2%}")
82
+ st.markdown("**Model Explanation:**")
83
+ st.info(reason)
 
 
 
84
  except Exception as e:
85
  st.error(f"❌ An error occurred:\n{e}")
86
 
 
96
  if ocr_text:
97
  st.markdown("**Extracted Text:**")
98
  st.code(ocr_text)
99
+ translated, label, score, reason = classify_emoji_text(ocr_text)
100
+ st.markdown("**Translated sentence:**")
101
+ st.code(translated, language="text")
102
+ st.markdown(f"**Prediction:** {label}")
103
+ st.markdown(f"**Confidence Score:** {score:.2%}")
104
+ st.markdown("**Model Explanation:**")
105
+ st.info(reason)
106
  else:
107
  st.info("⚠️ No text detected in the image.")
108
 
 
110
  st.markdown("---")
111
  st.subheader("3. Violation Analysis Dashboard")
112
  if st.session_state.history:
113
+ # 展示历史记录
114
  df = pd.DataFrame(st.session_state.history)
115
+ st.markdown("### 🧾 Offensive Terms & Suggestions")
 
116
  for item in st.session_state.history:
117
+ st.markdown(f"- 🔹 **Input:** {item['text']}")
118
+ st.markdown(f" - **Translated:** {item['translated']}")
119
+ st.markdown(f" - **Label:** {item['label']} with **{item['score']:.2%}** confidence")
120
+ st.markdown(f" - 🔧 **Suggestion:** {item['reason']}")
121
+
122
+ # 雷达图
123
+ radar_df = pd.DataFrame({
124
+ "Category": ["Insult","Abuse","Discrimination","Hate Speech","Vulgarity"],
125
+ "Score": [0.7,0.4,0.3,0.5,0.6]
126
+ })
 
 
127
  radar_fig = px.line_polar(radar_df, r='Score', theta='Category', line_close=True, title="⚠️ Risk Radar by Category")
128
  radar_fig.update_traces(line_color='black')
129
  st.plotly_chart(radar_fig)
130
+
131
+ # —— 新增:单词级冒犯性相关性分析 —— #
132
+ st.markdown("### 🧬 Word-level Offensive Correlation")
133
+
134
+ # 取最近一次翻译文本,按空格拆分单词
135
+ last_translated_text = st.session_state.history[-1]["translated"]
136
+ words = last_translated_text.split()
137
+
138
+ # 对每个单词进行分类并收集分数
139
+ word_scores = []
140
+ for word in words:
141
+ try:
142
+ res = classifier(word)[0]
143
+ word_scores.append({
144
+ "Word": word,
145
+ "Label": res["label"],
146
+ "Score": res["score"]
147
+ })
148
+ except Exception:
149
+ continue
150
+
151
+ if word_scores:
152
+ word_df = pd.DataFrame(word_scores)
153
+ word_df = word_df.sort_values(by="Score", ascending=False).reset_index(drop=True)
154
+
155
+ max_display = 5
156
+ # Streamlit 1.22+ 支持 st.toggle,若版本不支持可改用 checkbox
157
+ show_more = st.toggle("Show more words", value=False)
158
+
159
+ display_df = word_df if show_more else word_df.head(max_display)
160
+ # 隐藏边框并渲染 HTML 表格
161
+ st.markdown(
162
+ display_df.to_html(index=False, border=0),
163
+ unsafe_allow_html=True
164
+ )
165
+ else:
166
+ st.info("❕ No word-level analysis available.")
167
  else:
168
  st.info("⚠️ No classification data available yet.")