# (Hugging Face Spaces page header removed — status was "Spaces: Running")
# app.py - multi-model ensemble for AI-generated-text detection
import gradio as gr
from transformers import pipeline
import numpy as np
import re
# Load the individual detection models
# Ensemble registry: each entry keeps the Hub model id, a slot for the
# loaded pipeline (filled in at startup, None until then) and its vote
# weight in the weighted average.
_MODEL_SPECS = {
    "model1": ("Xenova/distilbert-base-ai-generated-text-detection", 0.4),
    "model2": ("Hello-SimpleAI/chatgpt-detector-roberta", 0.3),
    "model3": ("roberta-base-openai-detector", 0.3),
}
models = {
    key: {"name": hub_id, "detector": None, "weight": weight}
    for key, (hub_id, weight) in _MODEL_SPECS.items()
}
# Instantiate a text-classification pipeline for every registered model.
# Loading is best-effort: a model that fails to download/initialize keeps
# detector=None and is simply skipped by the ensemble later on.
for key, info in models.items():
    try:
        info["detector"] = pipeline("text-classification", model=info["name"])
        print(f"成功加载模型: {info['name']}")
    except Exception as e:
        print(f"加载模型 {info['name']} 失败: {str(e)}")
        info["detector"] = None
def analyze_text_features(text):
    """Compute simple stylometric features of *text*.

    Returns a dict with the keys: length, word_count, avg_word_length,
    unique_words_ratio, sentence_count, avg_sentence_length,
    lexical_diversity and punctuation_ratio. All ratios fall back to 0
    for empty input instead of dividing by zero.
    """
    features = {}
    features["length"] = len(text)
    words = text.split()
    features["word_count"] = len(words)
    features["avg_word_length"] = sum(len(word) for word in words) / max(1, len(words))
    features["unique_words_ratio"] = len(set(text.lower().split())) / max(1, len(words))
    # Sentence analysis. Bug fix: re.split leaves empty fragments (e.g.
    # "Hi. Bye." -> ["Hi", " Bye", ""]), which inflated sentence_count and
    # deflated avg_sentence_length; drop blank fragments before counting.
    sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
    features["sentence_count"] = len(sentences)
    features["avg_sentence_length"] = sum(len(s.split()) for s in sentences) / max(1, len(sentences))
    # Lexical diversity. Bug fix: the key was previously missing entirely
    # for empty/whitespace-only text; emit 0.0 so the schema is stable.
    features["lexical_diversity"] = len(set(words)) / len(words) if words else 0.0
    # Punctuation ratio over all characters.
    punctuation_count = sum(1 for char in text if char in ",.!?;:\"'()[]{}")
    features["punctuation_ratio"] = punctuation_count / max(1, len(text))
    return features
def detect_ai_text(text):
    """Ensemble AI-generated-text detection.

    Runs every successfully loaded detector on *text*, maps each classifier
    output onto an "AI-generated" probability, and combines them with the
    configured weights (renormalized over the models that actually answered).

    Returns a dict with "ai_probability", "confidence_level",
    "individual_model_results" and "features" — or a dict with an "error"
    key when the text is too short or no detector is available.
    """
    if not text or len(text.strip()) < 50:
        return {"error": "文本太短,无法可靠检测"}
    results = {}
    valid_models = 0
    used_weight = 0.0
    weighted_ai_probability = 0.0
    # Query every loaded model; failures are recorded per-model, not fatal.
    for key, model_info in models.items():
        if model_info["detector"] is None:
            continue
        try:
            prediction = model_info["detector"](text)
            label = prediction[0]["label"]
            score = prediction[0]["score"]
            # Heuristic label mapping: labels naming the AI class keep the
            # score; anything else is assumed to be the human class, so the
            # score is inverted.
            if any(marker in label.lower() for marker in ("ai", "chatgpt", "generated")):
                ai_probability = score
            else:
                ai_probability = 1 - score
            results[key] = {
                "model_name": model_info["name"],
                "ai_probability": ai_probability,
                "label": label,
                "score": score,
            }
            weighted_ai_probability += ai_probability * model_info["weight"]
            used_weight += model_info["weight"]
            valid_models += 1
        except Exception as e:
            results[key] = {
                "model_name": model_info["name"],
                "error": str(e),
            }
    # Bug fix: with zero usable models the old code fell through to a 0.0
    # probability and confidently reported "高概率人类创作"; report an
    # explicit error instead.
    if valid_models == 0:
        return {"error": "没有可用的检测模型", "individual_model_results": results}
    # Bug fix: renormalize over the weights of the models that actually
    # produced a score — the old denominator counted every loaded model,
    # so a detector that raised at inference silently diluted the result.
    final_ai_probability = weighted_ai_probability / used_weight
    # Stylometric side information for the caller.
    text_features = analyze_text_features(text)
    # Map the probability onto a coarse three-way verdict.
    if final_ai_probability > 0.7:
        confidence_level = "高概率AI生成"
    elif final_ai_probability < 0.3:
        confidence_level = "高概率人类创作"
    else:
        confidence_level = "无法确定"
    return {
        "ai_probability": final_ai_probability,
        "confidence_level": confidence_level,
        "individual_model_results": results,
        "features": text_features,
    }
# Gradio front end: one multiline textbox in, the raw result dict out as JSON.
text_input = gr.Textbox(lines=10, placeholder="粘贴要检测的文本...")
json_output = gr.JSON()
iface = gr.Interface(
    fn=detect_ai_text,
    inputs=text_input,
    outputs=json_output,
    title="增强型AI文本检测API",
    description="多模型集成检测文本是否由AI生成",
    examples=[
        ["这是一段示例文本,用于测试AI文本检测功能。请输入至少50个字符的文本以获得准确的检测结果。"],
    ],
    allow_flagging="never",
)
iface.launch()