File size: 10,608 Bytes
8bf2130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c573e53
8bf2130
c573e53
8bf2130
 
 
 
 
 
c573e53
 
 
 
8bf2130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c573e53
8bf2130
 
 
 
 
 
98a2138
 
8bf2130
98a2138
8bf2130
 
 
 
 
 
 
 
 
 
 
 
 
 
98a2138
 
 
 
 
 
 
8bf2130
 
 
92acaa5
 
 
8bf2130
 
92acaa5
8bf2130
 
92acaa5
8bf2130
 
 
 
 
 
98a2138
 
 
 
 
 
 
8bf2130
 
 
92acaa5
 
 
8bf2130
 
c573e53
92acaa5
 
c573e53
8bf2130
 
 
 
 
 
 
92acaa5
62916e8
92acaa5
8bf2130
 
92acaa5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
# Copyright (c) Hello-SimpleAI Org. 2023.
# Licensed under the Apache License, Version 2.0.

import os
import pickle
import re
from typing import Callable, List, Tuple

import gradio as gr
from nltk.data import load as nltk_load
import numpy as np
from sklearn.linear_model import LogisticRegression
import torch
from transformers.utils import cached_file
from transformers import GPT2LMHeadModel, GPT2Tokenizer


AUTH_TOKEN = os.environ.get("access_token")
DET_LING_ID = 'Hello-SimpleAI/chatgpt-detector-ling'


def download_file(filename):
    return cached_file(DET_LING_ID, filename, use_auth_token=AUTH_TOKEN)


NLTK = nltk_load(download_file('english.pickle'))
sent_cut_en = NLTK.tokenize
LR_GLTR_EN, LR_PPL_EN, LR_GLTR_ZH, LR_PPL_ZH = [
    pickle.load(open(download_file(f'{lang}-gpt2-{name}.pkl'), 'rb'))
    for lang, name in [('en', 'gltr'), ('en', 'ppl'), ('zh', 'gltr'), ('zh', 'ppl')]
]

NAME_EN = 'gpt2'
TOKENIZER_EN = GPT2Tokenizer.from_pretrained(NAME_EN)
MODEL_EN = GPT2LMHeadModel.from_pretrained(NAME_EN)

NAME_ZH = 'IDEA-CCNL/Wenzhong-GPT2-110M'
TOKENIZER_ZH = GPT2Tokenizer.from_pretrained(NAME_ZH)
MODEL_ZH = GPT2LMHeadModel.from_pretrained(NAME_ZH)


# code borrowed from https://github.com/blmoistawinde/HarvestText
def sent_cut_zh(para: str) -> List[str]:
    para = re.sub('([。!?\?!])([^”’)\])】])', r"\1\n\2", para)  # 单字符断句符
    para = re.sub('(\.{3,})([^”’)\])】….])', r"\1\n\2", para)  # 英文省略号
    para = re.sub('(\…+)([^”’)\])】….])', r"\1\n\2", para)  # 中文省略号
    para = re.sub('([。!?\?!]|\.{3,}|\…+)([”’)\])】])([^,。!?\?….])', r'\1\2\n\3', para)
    # 如果双引号前有终止符,那么双引号才是句子的终点,把分句符\n放到双引号后,注意前面的几句都小心保留了双引号
    para = para.rstrip()  # 段尾如果有多余的\n就去掉它
    # 很多规则中会考虑分号;,但是这里我把它忽略不计,破折号、英文双引号等同样忽略,需要的再做些简单调整即可。
    sentences = para.split("\n")
    sentences = [sent.strip() for sent in sentences]
    sentences = [sent for sent in sentences if len(sent.strip()) > 0]
    return sentences


CROSS_ENTROPY = torch.nn.CrossEntropyLoss(reduction='none')


def gpt2_features(
    text: str, tokenizer: GPT2Tokenizer, model: GPT2LMHeadModel, sent_cut: Callable
) -> Tuple[List[int], List[float]]:
    # Tokenize
    input_max_length = tokenizer.model_max_length - 2
    token_ids, offsets = list(), list()
    sentences = sent_cut(text)
    for s in sentences:
        tokens = tokenizer.tokenize(s)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        difference = len(token_ids) + len(ids) - input_max_length
        if difference > 0:
            ids = ids[:-difference]
        offsets.append((len(token_ids), len(token_ids) + len(ids)))  # 左开右闭
        token_ids.extend(ids)
        if difference >= 0:
            break

    input_ids = torch.tensor([tokenizer.bos_token_id] + token_ids)
    logits = model(input_ids).logits
    # Shift so that n-1 predict n
    shift_logits = logits[:-1].contiguous()
    shift_target = input_ids[1:].contiguous()
    loss = CROSS_ENTROPY(shift_logits, shift_target)

    all_probs = torch.softmax(shift_logits, dim=-1)
    sorted_ids = torch.argsort(all_probs, dim=-1, descending=True)  # stable=True
    expanded_tokens = shift_target.unsqueeze(-1).expand_as(sorted_ids)
    indices = torch.where(sorted_ids == expanded_tokens)
    rank = indices[-1]
    counter = [
        rank < 10,
        (rank >= 10) & (rank < 100),
        (rank >= 100) & (rank < 1000),
        rank >= 1000
    ]
    counter = [c.long().sum(-1).item() for c in counter]


    # compute different-level ppl
    text_ppl = loss.mean().exp().item()
    sent_ppl = list()
    for start, end in offsets:
        nll = loss[start: end].sum() / (end - start)
        sent_ppl.append(nll.exp().item())
    max_sent_ppl = max(sent_ppl)
    sent_ppl_avg = sum(sent_ppl) / len(sent_ppl)
    if len(sent_ppl) > 1:
        sent_ppl_std = torch.std(torch.tensor(sent_ppl)).item()
    else:
        sent_ppl_std = 0

    mask = torch.tensor([1] * loss.size(0))
    step_ppl = loss.cumsum(dim=-1).div(mask.cumsum(dim=-1)).exp()
    max_step_ppl = step_ppl.max(dim=-1)[0].item()
    step_ppl_avg = step_ppl.sum(dim=-1).div(loss.size(0)).item()
    if step_ppl.size(0) > 1:
        step_ppl_std = step_ppl.std().item()
    else:
        step_ppl_std = 0
    ppls = [
        text_ppl, max_sent_ppl, sent_ppl_avg, sent_ppl_std,
        max_step_ppl, step_ppl_avg, step_ppl_std
    ]
    return counter, ppls  # type: ignore


def lr_predict(
    f_gltr: List[int], f_ppl: List[float], lr_gltr: LogisticRegression, lr_ppl: LogisticRegression,
    id_to_label: List[str]
) -> List:
    x_gltr = np.asarray([f_gltr])
    gltr_label = lr_gltr.predict(x_gltr)[0]
    gltr_prob = lr_gltr.predict_proba(x_gltr)[0, gltr_label]
    x_ppl = np.asarray([f_ppl])
    ppl_label = lr_ppl.predict(x_ppl)[0]
    ppl_prob = lr_ppl.predict_proba(x_ppl)[0, ppl_label]
    return [id_to_label[gltr_label], gltr_prob, id_to_label[ppl_label], ppl_prob]


def predict_en(text: str) -> List:
    with torch.no_grad():
        feat = gpt2_features(text, TOKENIZER_EN, MODEL_EN, sent_cut_en)
    out = lr_predict(*feat, LR_GLTR_EN, LR_PPL_EN, ['Human', 'ChatGPT'])
    return out


def predict_zh(text: str) -> List:
    with torch.no_grad():
        feat = gpt2_features(text, TOKENIZER_ZH, MODEL_ZH, sent_cut_zh)
    out = lr_predict(*feat, LR_GLTR_ZH, LR_PPL_ZH, ['人类', 'ChatGPT'])
    return out


with gr.Blocks() as demo:
    gr.Markdown(
        """
        ## ChatGPT Detector 🔬 (Linguistic version / 语言学版)

        Visit our project on Github: [chatgpt-comparison-detection project](https://github.com/Hello-SimpleAI/chatgpt-comparison-detection)<br>
        欢迎在 Github 上关注我们的 [ChatGPT 对比与检测项目](https://github.com/Hello-SimpleAI/chatgpt-comparison-detection)<br>
        We provide three kinds of detectors, all in Bilingual / 我们提供了三个版本的检测器,且都支持中英文:
        - [QA version / 问答版](https://huggingface.co/spaces/Hello-SimpleAI/chatgpt-detector-qa)<br>
            detect whether an **answer** is generated by ChatGPT for certain **question**, using PLM-based classifiers / 判断某个**问题的回答**是否由ChatGPT生成,使用基于PTM的分类器来开发;
        - [Sinlge-text version / 独立文本版](https://huggingface.co/spaces/Hello-SimpleAI/chatgpt-detector-single)<br>
            detect whether a piece of text is ChatGPT generated, using PLM-based classifiers / 判断**单条文本**是否由ChatGPT生成,使用基于PTM的分类器来开发;
        - [**Linguistic version / 语言学版** (👈 Current / 当前使用)](https://huggingface.co/spaces/Hello-SimpleAI/chatgpt-detector-ling)<br>
            detect whether a piece of text is ChatGPT generated, using linguistic features / 判断**单条文本**是否由ChatGPT生成,使用基于语言学特征的模型来开发;

        """
    )

    with gr.Tab("English"):
        gr.Markdown(
            """
            ## Introduction:
            Two Logistic regression models trained with two kinds of features:
            1. [GLTR](https://aclanthology.org/P19-3019) Test-2, Language model predict token rank top-k buckets, top 10, 10-100, 100-1000, 1000+.
            2. PPL-based, text ppl, sentence ppl, etc.

            English LM is [GPT2-small](https://huggingface.co/gpt2).

            Note: Providing more text to the `Text` box can make the prediction more accurate!
            """
        )
        a1 = gr.Textbox(
            lines=5, label='Text',
            value="There are a few things that can help protect your credit card information from being misused when you give it to a restaurant or any other business:\n\nEncryption: Many businesses use encryption to protect your credit card information when it is being transmitted or stored. This means that the information is transformed into a code that is difficult for anyone to read without the right key."
        )
        button1 = gr.Button("🤖 Predict!")
        gr.Markdown("GLTR")
        label1_gltr = gr.Textbox(lines=1, label='GLTR Predicted Label 🎃')
        score1_gltr = gr.Textbox(lines=1, label='GLTR Probability')
        gr.Markdown("PPL")
        label1_ppl = gr.Textbox(lines=1, label='PPL Predicted Label 🎃')
        score1_ppl = gr.Textbox(lines=1, label='PPL Probability')

    with gr.Tab("中文版"):
        gr.Markdown(
            """
            ## 介绍:
            两个逻辑回归模型, 分别使用以下两种特征:
            1. [GLTR](https://aclanthology.org/P19-3019) Test-2, 每个词的语言模型预测排名分桶, top 10, 10-100, 100-1000, 1000+.
            2. 基于语言模型困惑度 (PPL), 整个文本的PPL、单个句子的PPL等特征.

            中文语言模型使用 闻仲 [Wenzhong-GPT2-110M](https://huggingface.co/IDEA-CCNL/Wenzhong-GPT2-110M).

            注意: 在`文本`栏中输入更多的文本,可以让预测更准确哦!
            """
        )
        a2 = gr.Textbox(
            lines=5, label='文本',
            value="对于OpenAI大力出奇迹的工作,自然每个人都有自己的看点。我自己最欣赏的地方是ChatGPT如何解决 “AI校正(Alignment)“这个问题。这个问题也是我们课题组这两年在探索的学术问题之一。"
        )
        button2 = gr.Button("🤖 预测!")
        gr.Markdown("GLTR (中文测试集准确率 86.39%)")
        label2_gltr = gr.Textbox(lines=1, label='预测结果 🎃')
        score2_gltr = gr.Textbox(lines=1, label='模型概率')
        gr.Markdown("PPL (中文测试集准确率 59.04%, 持续优化中...)")
        label2_ppl = gr.Textbox(lines=1, label='PPL 预测结果 🎃')
        score2_ppl = gr.Textbox(lines=1, label='PPL 模型概率')

    button1.click(predict_en, inputs=[a1], outputs=[label1_gltr, score1_gltr, label1_ppl, score1_ppl])
    button2.click(predict_zh, inputs=[a2], outputs=[label2_gltr, score2_gltr, label2_ppl, score2_ppl])

    # Page Count
    gr.Markdown("""
                <center><a href="https://clustrmaps.com/site/1bsdd" title="Visit tracker"><img src="//clustrmaps.com/map_v2.png?cl=080808&w=a&t=tt&d=NvxUHBTxY0ECXEuebgz8Ym8ynpVtduq59ENXoQpFh74&co=ffffff&ct=808080"/></a></center>
                """)

demo.launch()
# To create a public link, set `share=True` in `launch()`.