# app.py - Multi-model ensemble for AI-generated text detection
import gradio as gr
from transformers import pipeline
import numpy as np
import re
# Detection models to ensemble; each entry holds its pipeline and its weight
# in the final score
models = {
    "model1": {
        "name": "Xenova/distilbert-base-ai-generated-text-detection",
        "detector": None,
        "weight": 0.4
    },
    "model2": {
        "name": "Hello-SimpleAI/chatgpt-detector-roberta",
        "detector": None,
        "weight": 0.3
    },
    "model3": {
        "name": "roberta-base-openai-detector",
        "detector": None,
        "weight": 0.3
    }
}
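# The weights are relative importances for the ensemble; the final score is
# normalized over the models that actually return a prediction, so the app
# still works when only a subset of the models loads successfully.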

# Initialize the models; a model that fails to load is left as None and
# skipped at prediction time
for key in models:
    try:
        models[key]["detector"] = pipeline("text-classification", model=models[key]["name"])
        print(f"Loaded model: {models[key]['name']}")
    except Exception as e:
        print(f"Failed to load model {models[key]['name']}: {str(e)}")
        models[key]["detector"] = None

def analyze_text_features(text):
    # Surface-level text statistics reported alongside the model scores
    features = {}
    features["length"] = len(text)
    words = text.split()
    features["word_count"] = len(words)
    features["avg_word_length"] = sum(len(word) for word in words) / max(1, len(words))
    features["unique_words_ratio"] = len(set(text.lower().split())) / max(1, len(words))

    # Sentence-level statistics (empty fragments left by trailing punctuation are dropped)
    sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
    features["sentence_count"] = len(sentences)
    features["avg_sentence_length"] = sum(len(s.split()) for s in sentences) / max(1, len(sentences))

    # Lexical diversity
    if len(words) > 0:
        features["lexical_diversity"] = len(set(words)) / len(words)

    # Punctuation ratio
    punctuation_count = sum(1 for char in text if char in ",.!?;:\"'()[]{}")
    features["punctuation_ratio"] = punctuation_count / max(1, len(text))
    return features
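# Note: these surface statistics are returned for transparency only; they are
# not folded into the weighted AI probability computed below.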

def detect_ai_text(text):
    if not text or len(text.strip()) < 50:
        return {"error": "Text is too short for reliable detection"}

    results = {}
    valid_models = 0
    weighted_ai_probability = 0
    total_weight = 0

    # Run every successfully loaded model on the text
    for key, model_info in models.items():
        if model_info["detector"] is not None:
            try:
                result = model_info["detector"](text)
                # Extract the top prediction
                label = result[0]["label"]
                score = result[0]["score"]

                # Map the label to an AI-generated probability; label
                # conventions differ between models ("ChatGPT", "Fake",
                # "AI-generated", ...)
                if any(k in label.lower() for k in ("ai", "chatgpt", "generated", "fake")):
                    ai_probability = score
                else:
                    ai_probability = 1 - score

                # Record the per-model result
                results[key] = {
                    "model_name": model_info["name"],
                    "ai_probability": ai_probability,
                    "label": label,
                    "score": score
                }

                # Accumulate the weighted probability
                weighted_ai_probability += ai_probability * model_info["weight"]
                total_weight += model_info["weight"]
                valid_models += 1
            except Exception as e:
                results[key] = {
                    "model_name": model_info["name"],
                    "error": str(e)
                }

    if valid_models == 0:
        return {"error": "No detection model is available", "individual_model_results": results}

    # Normalize by the total weight of the models that actually produced a score
    final_ai_probability = weighted_ai_probability / total_weight

    # Text feature analysis
    text_features = analyze_text_features(text)

    # Verdict based on the ensemble probability
    if final_ai_probability > 0.7:
        confidence_level = "Likely AI-generated"
    elif final_ai_probability < 0.3:
        confidence_level = "Likely human-written"
    else:
        confidence_level = "Inconclusive"

    # Assemble the final result
    final_result = {
        "ai_probability": final_ai_probability,
        "confidence_level": confidence_level,
        "individual_model_results": results,
        "features": text_features
    }
    return final_result

# Build the Gradio interface
iface = gr.Interface(
    fn=detect_ai_text,
    inputs=gr.Textbox(lines=10, placeholder="Paste the text to analyze..."),
    outputs=gr.JSON(),
    title="Enhanced AI Text Detection API",
    description="Multi-model ensemble that estimates whether a text was AI-generated",
    examples=[
        ["This is a sample passage for trying out the AI text detection feature. Enter at least 50 characters of text to get a meaningful result."]
    ],
    allow_flagging="never"
)

iface.launch()
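
# On a Hugging Face Space this file is executed automatically at startup; to
# try it locally, run `python app.py` and open the URL Gradio prints
# (http://127.0.0.1:7860 by default).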