# app.py - 文本检测多模型集成系统
"""Multi-model ensemble for detecting AI-generated text.

Loads several Hugging Face text-classification detectors, combines their
predictions with a weighted vote, and serves the result as JSON through a
Gradio interface.
"""
import gradio as gr
from transformers import pipeline
import numpy as np
import re

# 加载多个检测模型
# Detector registry: each entry carries the HF model id, the pipeline slot
# (filled at startup, None if loading failed) and its weight in the vote.
# Weights sum to 1.0 so the combined score stays in [0, 1].
models = {
    "model1": {
        # NOTE(review): this repo is an ONNX export — confirm it loads via the
        # vanilla transformers pipeline in the deployment environment.
        "name": "Xenova/distilbert-base-ai-generated-text-detection",
        "detector": None,
        "weight": 0.4,
    },
    "model2": {
        "name": "Hello-SimpleAI/chatgpt-detector-roberta",
        "detector": None,
        "weight": 0.3,
    },
    "model3": {
        "name": "roberta-base-openai-detector",
        "detector": None,
        "weight": 0.3,
    },
}

# 初始化模型 — best-effort: a model that fails to download/load stays None and
# is simply excluded from the ensemble instead of crashing startup.
for key in models:
    try:
        models[key]["detector"] = pipeline(
            "text-classification", model=models[key]["name"]
        )
        print(f"成功加载模型: {models[key]['name']}")
    except Exception as e:
        print(f"加载模型 {models[key]['name']} 失败: {str(e)}")
        models[key]["detector"] = None


def analyze_text_features(text):
    """Compute simple stylometric statistics for *text*.

    Args:
        text: The input string to profile.

    Returns:
        dict with length, word, sentence, lexical-diversity and punctuation
        metrics. Every ratio guards against division by zero.
    """
    # 文本特征分析
    features = {}
    features["length"] = len(text)
    words = text.split()
    features["word_count"] = len(words)
    features["avg_word_length"] = sum(len(word) for word in words) / max(1, len(words))
    features["unique_words_ratio"] = len(set(text.lower().split())) / max(1, len(words))

    # 句子分析 — drop the empty fragments re.split() produces around trailing
    # punctuation; they previously inflated sentence_count and deflated
    # avg_sentence_length.
    sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
    features["sentence_count"] = len(sentences)
    features["avg_sentence_length"] = sum(
        len(s.split()) for s in sentences
    ) / max(1, len(sentences))

    # 词汇多样性 (type/token ratio; only meaningful when there are words)
    if len(words) > 0:
        features["lexical_diversity"] = len(set(words)) / len(words)

    # 标点符号比例
    punctuation_count = sum(1 for char in text if char in ",.!?;:\"'()[]{}")
    features["punctuation_ratio"] = punctuation_count / max(1, len(text))

    return features


def detect_ai_text(text):
    """Run every available detector on *text* and combine the verdicts.

    Args:
        text: Candidate text; must be at least 50 characters after stripping.

    Returns:
        dict with the weighted "ai_probability", a human-readable
        "confidence_level", per-model results and the stylometric
        "features" — or a dict with an "error" key when detection is
        impossible (text too short, or no model produced a result).
    """
    if not text or len(text.strip()) < 50:
        return {"error": "文本太短,无法可靠检测"}

    results = {}
    valid_models = 0
    weighted_ai_probability = 0
    # Sum of weights of the models that actually produced a prediction;
    # used for normalization below.
    contributing_weight = 0.0

    # 使用每个模型进行预测
    for key, model_info in models.items():
        if model_info["detector"] is None:
            continue
        try:
            result = model_info["detector"](text)
            # 提取结果
            label = result[0]["label"]
            score = result[0]["score"]

            # 确定AI生成概率: map the model-specific label onto "probability
            # the text is AI-generated". "fake" covers the Real/Fake label
            # scheme used by roberta-base-openai-detector, which the previous
            # keyword list missed — its AI verdicts were being inverted.
            lowered = label.lower()
            if any(kw in lowered for kw in ("ai", "chatgpt", "generated", "fake")):
                ai_probability = score
            else:
                ai_probability = 1 - score

            # 添加到结果
            results[key] = {
                "model_name": model_info["name"],
                "ai_probability": ai_probability,
                "label": label,
                "score": score,
            }

            # 累加加权概率
            weighted_ai_probability += ai_probability * model_info["weight"]
            contributing_weight += model_info["weight"]
            valid_models += 1
        except Exception as e:
            results[key] = {
                "model_name": model_info["name"],
                "error": str(e),
            }

    # 计算最终加权概率 — normalize by the weight that actually voted. The old
    # code divided by max(total_weight, 1): because the weights sum to 1.0,
    # any load failure silently deflated the final probability, and models
    # that failed at inference time were still counted in the denominator.
    if valid_models == 0:
        return {
            "error": "没有可用的检测模型",
            "individual_model_results": results,
        }
    final_ai_probability = weighted_ai_probability / contributing_weight

    # 分析文本特征
    text_features = analyze_text_features(text)

    # 确定置信度级别
    if final_ai_probability > 0.7:
        confidence_level = "高概率AI生成"
    elif final_ai_probability < 0.3:
        confidence_level = "高概率人类创作"
    else:
        confidence_level = "无法确定"

    # 构建最终结果
    final_result = {
        "ai_probability": final_ai_probability,
        "confidence_level": confidence_level,
        "individual_model_results": results,
        "features": text_features,
    }

    return final_result


# 创建Gradio界面
iface = gr.Interface(
    fn=detect_ai_text,
    inputs=gr.Textbox(lines=10, placeholder="粘贴要检测的文本..."),
    outputs=gr.JSON(),
    title="增强型AI文本检测API",
    description="多模型集成检测文本是否由AI生成",
    examples=[
        ["这是一段示例文本,用于测试AI文本检测功能。请输入至少50个字符的文本以获得准确的检测结果。"]
    ],
    allow_flagging="never",
)

# Launch only when run as a script, so the module stays importable.
if __name__ == "__main__":
    iface.launch()