# app.py - Multi-model ensemble system for AI-generated text detection
import gradio as gr
from transformers import pipeline
import numpy as np
import re

# Load multiple detection models
models = {
    "model1": {
        "name": "Xenova/distilbert-base-ai-generated-text-detection",
        "detector": None,
        "weight": 0.4
    },
    "model2": {
        "name": "Hello-SimpleAI/chatgpt-detector-roberta",
        "detector": None,
        "weight": 0.3
    },
    "model3": {
        "name": "roberta-base-openai-detector",
        "detector": None,
        "weight": 0.3
    }
}
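# Note on the weights above: the 0.4 / 0.3 / 0.3 split is the author's choice and is
# treated as relative. detect_ai_text() below re-normalizes over whichever models
# actually loaded and returned a prediction, so a missing model does not skew the score.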

# Initialize the models
for key in models:
    try:
        models[key]["detector"] = pipeline("text-classification", model=models[key]["name"])
        print(f"Loaded model: {models[key]['name']}")
    except Exception as e:
        print(f"Failed to load model {models[key]['name']}: {str(e)}")
        models[key]["detector"] = None


def analyze_text_features(text):
    # Basic statistics on length, vocabulary, sentences, and punctuation
    features = {}
    features["length"] = len(text)
    words = text.split()
    features["word_count"] = len(words)
    features["avg_word_length"] = sum(len(word) for word in words) / max(1, len(words))
    features["unique_words_ratio"] = len(set(text.lower().split())) / max(1, len(words))
    # Sentence analysis (drop empty fragments produced by the split)
    sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
    features["sentence_count"] = len(sentences)
    features["avg_sentence_length"] = sum(len(s.split()) for s in sentences) / max(1, len(sentences))
    # Lexical diversity
    features["lexical_diversity"] = len(set(words)) / len(words) if words else 0
    # Punctuation ratio
    punctuation_count = sum(1 for char in text if char in ",.!?;:\"'()[]{}")
    features["punctuation_ratio"] = punctuation_count / max(1, len(text))
    return features


def detect_ai_text(text):
    if not text or len(text.strip()) < 50:
        return {"error": "Text is too short for reliable detection"}
    results = {}
    valid_models = 0
    weighted_ai_probability = 0
    total_weight = 0
    # Run every successfully loaded model on the input
    for key, model_info in models.items():
        if model_info["detector"] is not None:
            try:
                result = model_info["detector"](text)
                # Extract label and score
                label = result[0]["label"]
                score = result[0]["score"]
                # Map the label to an AI-generated probability
                if "ai" in label.lower() or "chatgpt" in label.lower() or "generated" in label.lower():
                    ai_probability = score
                else:
                    ai_probability = 1 - score
                # Record the per-model result
                results[key] = {
                    "model_name": model_info["name"],
                    "ai_probability": ai_probability,
                    "label": label,
                    "score": score
                }
                # Accumulate the weighted probability
                weighted_ai_probability += ai_probability * model_info["weight"]
                total_weight += model_info["weight"]
                valid_models += 1
            except Exception as e:
                results[key] = {
                    "model_name": model_info["name"],
                    "error": str(e)
                }
    if valid_models == 0:
        return {"error": "No detection model is available", "individual_model_results": results}
    # Final probability, normalized by the weights of the models that actually produced a prediction
    final_ai_probability = weighted_ai_probability / total_weight
    # Analyze text features
    text_features = analyze_text_features(text)
    # Determine the confidence level
    if final_ai_probability > 0.7:
        confidence_level = "Likely AI-generated"
    elif final_ai_probability < 0.3:
        confidence_level = "Likely human-written"
    else:
        confidence_level = "Uncertain"
    # Build the final result
    final_result = {
        "ai_probability": final_ai_probability,
        "confidence_level": confidence_level,
        "individual_model_results": results,
        "features": text_features
    }
    return final_result
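
# Worked example of the weighting (illustrative numbers only): if model1 (weight 0.4)
# reports an AI probability of 0.9, model2 (weight 0.3) reports 0.6, and model3 failed
# to load, then total_weight = 0.7 and the final score is
# (0.9 * 0.4 + 0.6 * 0.3) / 0.7 = 0.54 / 0.7 ≈ 0.77, which maps to "Likely AI-generated".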

# Build the Gradio interface
iface = gr.Interface(
    fn=detect_ai_text,
    inputs=gr.Textbox(lines=10, placeholder="Paste the text to analyze..."),
    outputs=gr.JSON(),
    title="Enhanced AI Text Detection API",
    description="A multi-model ensemble that estimates whether a text was AI-generated",
    examples=[
        ["This is a sample passage used to test the AI text detection feature. Please enter at least 50 characters of text to get a reliable result."]
    ],
    allow_flagging="never"
)
iface.launch()
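
# A minimal client-side sketch of how this endpoint could be queried once the Space is
# running, using the gradio_client package. The Space ID "your-username/ai-text-detector"
# is a placeholder, not the real deployment.
#
#   from gradio_client import Client
#
#   client = Client("your-username/ai-text-detector")  # hypothetical Space ID
#   result = client.predict(
#       "Some text of at least fifty characters to analyze for AI authorship...",
#       api_name="/predict",  # default endpoint name for a gr.Interface
#   )
#   print(result)  # dict with ai_probability, confidence_level, per-model results, features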