Spaces:

AITextDetect
/

MGTbenchmark

Sleeping

File size: 4,251 Bytes

9a80e8e
fcb6ffd
9a80e8e
fcb6ffd
 
 
9a80e8e
fcb6ffd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
732150f
 
 
 
 
 
 
 
 
fcb6ffd
296f63c
 
 
 
 
 
 
 
 
 
fcb6ffd
 
 
 
732150f
fcb6ffd
 
 
 
 
 
 
 
 
 
 
296f63c
732150f
296f63c
732150f
 
 
296f63c
 
 
 
 
732150f
 
296f63c
 
 
 
 
 
 
 
 
 
 
9a80e8e
fcb6ffd
 
 
 
296f63c

import streamlit as st
import os
import json
import re
import datasets
import tiktoken
import zipfile
from pathlib import Path

# Module-level tiktoken encoder ("cl100k_base"); MGTHuman.truncate_text uses it
# to encode text into tokens and decode the truncated token list back to text.
encoding = tiktoken.get_encoding("cl100k_base")

# MGTHuman class
class MGTHuman(datasets.GeneratorBasedBuilder):
    """Builder subclass whose helper methods read text records from JSON files.

    The class declares one builder config per data source (human plus five
    models) and three helpers: ``truncate_text``, ``get_text_by_index`` and
    ``count_entries``.  It defines no ``_info`` / ``_split_generators`` /
    ``_generate_examples`` of its own.

    The JSON files are expected to hold a list of objects, each with a
    string ``"text"`` field; records whose text is blank are ignored.
    """

    VERSION = datasets.Version("1.0.0")
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="human", version=VERSION, description="This part of human data"),
        datasets.BuilderConfig(name="Moonshot", version=VERSION, description="Data from the Moonshot model"),
        datasets.BuilderConfig(name="gpt35", version=VERSION, description="Data from the gpt-3.5-turbo model"),
        datasets.BuilderConfig(name="Llama3", version=VERSION, description="Data from the Llama3 model"),
        datasets.BuilderConfig(name="Mixtral", version=VERSION, description="Data from the Mixtral model"),
        datasets.BuilderConfig(name="Qwen", version=VERSION, description="Data from the Qwen model"),
    ]
    DEFAULT_CONFIG_NAME = "human"

    def truncate_text(self, text: str, max_tokens: int = 2048) -> str:
        """Return *text* limited to *max_tokens* tokens.

        Text that already fits is returned unchanged.  Otherwise the token
        list is cut, decoded, and trimmed back to the last sentence
        terminator ('。' or '.', whichever occurs later) so the result does
        not end mid-sentence.  If no terminator is present, the decoded
        prefix is returned as is.
        """
        tokens = encoding.encode(text, allowed_special={'<|endoftext|>'})
        if len(tokens) <= max_tokens:
            return text

        # Cutting the token list can split a multi-byte character; the decoder
        # then emits U+FFFD replacement characters at the end, so drop them.
        truncated_text = encoding.decode(tokens[:max_tokens]).rstrip('\ufffd')

        # Take the later of the two terminators: preferring '。'
        # unconditionally would discard everything after it even when a
        # later '.' closes a complete sentence.
        last_period_idx = max(truncated_text.rfind('。'), truncated_text.rfind('.'))
        if last_period_idx != -1:
            truncated_text = truncated_text[:last_period_idx + 1]
        return truncated_text

    def _iter_texts(self, filepath):
        """Yield the non-blank ``"text"`` value of each record in *filepath*.

        Raises:
            KeyError: if a record has no ``"text"`` key.
            json.JSONDecodeError: if the file is not valid JSON.
        """
        # Explicit UTF-8: the data contains Chinese text and must not depend
        # on the platform's default locale encoding.
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for row in data:
            text = row["text"]
            if text.strip():
                yield text

    def get_text_by_index(self, filepath, index, max_tokens: int = 2048) -> str:
        """Return the *index*-th non-blank text in *filepath*, truncated.

        Args:
            filepath: path of the JSON file to read.
            index: zero-based position among the non-blank records.
            max_tokens: token cap passed to ``truncate_text``.

        Returns:
            The truncated text, or an out-of-range message when *index* does
            not address a record.
        """
        for position, text in enumerate(self._iter_texts(filepath)):
            if position == index:
                return self.truncate_text(text, max_tokens=max_tokens)
        return "Index 超出范围，请输入有效的数字。"

    def count_entries(self, filepath) -> int:
        """Return the number of non-blank records in *filepath*.

        Used to derive the valid index range for ``get_text_by_index``.
        """
        return sum(1 for _ in self._iter_texts(filepath))

# Streamlit UI
st.title("MGTHuman Dataset Viewer")

# Working directory that holds the uploaded archive and its extracted files.
folder_path = Path("temp")

# Upload a ZIP archive containing JSON files.
uploaded_folder = st.file_uploader("上传包含 JSON 文件的 ZIP 文件夹", type=["zip"])
if uploaded_folder:
    folder_path.mkdir(exist_ok=True)
    # Use only the final path component of the client-supplied name so the
    # archive is always written inside the working directory.
    zip_path = folder_path / Path(uploaded_folder.name).name
    with open(zip_path, "wb") as f:
        f.write(uploaded_folder.getbuffer())

    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(folder_path)
    except zipfile.BadZipFile:
        # A ".zip" extension does not guarantee a valid archive; report it
        # instead of crashing the page.
        archive_ok = False
        st.error("上传的文件不是有效的 ZIP 压缩包。")
    else:
        archive_ok = True

    if archive_ok:
        # Recursively collect every JSON file and group the paths by domain
        # (the part of the file stem before "_task3").  Sorting makes the
        # "first file" of each domain deterministic across runs.
        category = {}
        for json_file in sorted(folder_path.rglob("*.json")):
            domain = json_file.stem.split('_task3')[0]
            category.setdefault(domain, []).append(str(json_file))

        if category:
            # Domain selector.
            selected_domain = st.selectbox("选择数据种类", options=list(category))

            # Only the first file of the selected domain is displayed.
            file_to_display = category[selected_domain][0]
            # NOTE(review): `name` is a file-derived domain, not necessarily one
            # of MGTHuman.BUILDER_CONFIGS; confirm the builder accepts it.
            mgt_human = MGTHuman(name=selected_domain)

            try:
                total_entries = mgt_human.count_entries(file_to_display)
            except (ValueError, KeyError, TypeError, AttributeError) as exc:
                # ValueError covers invalid JSON and undecodable bytes; the
                # others cover records that are not {"text": <str>} objects.
                total_entries = None
                st.error(f"无法解析所选 JSON 文件：{exc}")

            if total_entries == 0:
                # Without this guard number_input would get max_value=-1,
                # which is below min_value=0 and makes Streamlit raise.
                st.warning("该文件中没有可显示的文本。")
            elif total_entries is not None:
                st.write(f"可用的索引范围: 0 到 {total_entries - 1}")

                # Ask for the index of the text to show.
                index_to_view = st.number_input(
                    "输入要查看的文本序号",
                    min_value=0,
                    max_value=total_entries - 1,
                    step=1,
                )

                if st.button("显示文本"):
                    text = mgt_human.get_text_by_index(file_to_display, index=int(index_to_view))
                    st.write("对应的文本内容为：", text)
        else:
            st.write("未找到任何 JSON 文件，请检查 ZIP 文件结构。")

# Remove the temporary directory used for uploads.
if st.button("清除文件"):
    import shutil
    # The directory is absent when nothing was uploaded yet or it was already
    # cleared; removing it unconditionally would raise FileNotFoundError.
    if folder_path.exists():
        shutil.rmtree(folder_path)
    st.write("临时文件已清除。")