Spaces: Running on Zero
DivEye - PR (fixes x3) (#11)
This PR inlines the contents of software.py into app.py: the Software wrapper class is replaced by module-level Diversity and BiScope instances, an XGBoost classifier loaded once at import time, and a global `loaded` flag that gates inference when no GPU is available.

Commits:
- contingous? (08abcbbe0bb86f2fb43dca2a0204b700da395312)
- global vars (f776ced53467ad04cffe7180d8c7aae44026c4ce)

Files changed:
- app.py (+119, -4)
- software.py (+0, -125)
app.py
CHANGED
```diff
@@ -5,11 +5,118 @@ import pandas as pd
 from software import Software
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
+import xgboost as xgb
+import pandas as pd
+import numpy as np
+import torch
+import zlib
+from scipy.stats import skew, kurtosis, entropy
+from tqdm import tqdm
+from torch.nn import CrossEntropyLoss
+from pathlib import Path
+import spaces
+import os
 
 theme = gr.Theme.from_hub("gstaff/xkcd")
 
+class Diversity:
+    def __init__(self, model, tokenizer, device):
+        self.tokenizer = tokenizer
+        self.model = model
+        self.device = device
+
+    def compute_log_likelihoods(self, text):
+        tokens = self.tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=1024).to(self.device)
+        with torch.no_grad():
+            outputs = self.model(tokens, labels=tokens)
+            logits = outputs.logits
+        shift_logits = logits[:, :-1, :].squeeze(0)
+        shift_labels = tokens[:, 1:].squeeze(0)
+        log_probs = torch.log_softmax(shift_logits.float(), dim=-1)
+        token_log_likelihoods = log_probs[range(shift_labels.shape[0]), shift_labels].cpu().numpy()
+        return token_log_likelihoods
+
+    def compute_surprisal(self, text):
+        log_likelihoods = self.compute_log_likelihoods(text)
+        return -log_likelihoods
+
+    def compute_features(self, text):
+        surprisals = self.compute_surprisal(text)
+        log_likelihoods = self.compute_log_likelihoods(text)
+        if len(surprisals) < 10 or len(log_likelihoods) < 3:
+            return None
+
+        s = np.array(surprisals)
+        mean_s, std_s, var_s, skew_s, kurt_s = np.mean(s), np.std(s), np.var(s), skew(s), kurtosis(s)
+        diff_s = np.diff(s)
+        mean_diff, std_diff = np.mean(diff_s), np.std(diff_s)
+        first_order_diff = np.diff(log_likelihoods)
+        second_order_diff = np.diff(first_order_diff)
+        var_2nd = np.var(second_order_diff)
+        entropy_2nd = entropy(np.histogram(second_order_diff, bins=20, density=True)[0])
+        autocorr_2nd = np.corrcoef(second_order_diff[:-1], second_order_diff[1:])[0, 1] if len(second_order_diff) > 1 else 0
+        comp_ratio = len(zlib.compress(text.encode('utf-8'))) / len(text.encode('utf-8'))
+
+        return [mean_s, std_s, var_s, skew_s, kurt_s, mean_diff, std_diff, var_2nd, entropy_2nd, autocorr_2nd, comp_ratio]
+
+class BiScope:
+    def __init__(self, model, tokenizer, device):
+        self.COMPLETION_PROMPT_ONLY = "Complete the following text: "
+        self.tokenizer = tokenizer
+        self.model = model
+        self.device = device
+
+    def compute_fce_loss(self, logits, targets, text_slice):
+        return CrossEntropyLoss(reduction='none')(
+            logits[0, text_slice.start-1:text_slice.stop-1, :],
+            targets
+        ).detach().cpu().numpy()
+
+    def compute_bce_loss(self, logits, targets, text_slice):
+        return CrossEntropyLoss(reduction='none')(
+            logits[0, text_slice, :],
+            targets
+        ).detach().cpu().numpy()
+
+    def detect_single_sample(self, sample):
+        prompt_ids = self.tokenizer(self.COMPLETION_PROMPT_ONLY, return_tensors='pt').input_ids.to(self.device)
+        text_ids = self.tokenizer(sample, return_tensors='pt', max_length=2000, truncation=True).input_ids.to(self.device)
+        combined_ids = torch.cat([prompt_ids, text_ids], dim=1)
+        text_slice = slice(prompt_ids.shape[1], combined_ids.shape[1])
+
+        outputs = self.model(input_ids=combined_ids)
+        logits = outputs.logits
+        targets = combined_ids[0][text_slice]
+
+        fce_loss = self.compute_fce_loss(logits, targets, text_slice)
+        bce_loss = self.compute_bce_loss(logits, targets, text_slice)
+
+        features = []
+        for p in range(1, 10):
+            split = len(fce_loss) * p // 10
+            fce_clipped = np.nan_to_num(np.clip(fce_loss[split:], -1e6, 1e6), nan=0.0, posinf=1e6, neginf=-1e6)
+            bce_clipped = np.nan_to_num(np.clip(bce_loss[split:], -1e6, 1e6), nan=0.0, posinf=1e6, neginf=-1e6)
+            features.extend([
+                np.mean(fce_clipped), np.max(fce_clipped), np.min(fce_clipped), np.std(fce_clipped),
+                np.mean(bce_clipped), np.max(bce_clipped), np.min(bce_clipped), np.std(bce_clipped)
+            ])
+        return features
+
+# ===========================================================
+@spaces.GPU
+def evaluate(diveye, biscope, text):
+    global model
+    diveye_features = diveye.compute_features(text)
+    biscope_features = biscope.detect_single_sample(text)
+
+    for f in biscope_features:
+        diveye_features.append(f)
+
+    return model.predict_proba([diveye_features])[:, 1][0].item()
+
 def detect_ai_text(text):
-
+    global loaded, diveye, biscope, model
+    if not loaded:
         return "❗ Model not loaded. We require a GPU to run DivEye.", 0.0, pd.DataFrame({
             "Source": ["AI Generated", "Human Written"],
             "Probability (%)": [0, 0]
```
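A note on what this hunk adds: `Diversity.compute_features` summarizes token-level surprisal with five moments, two first-difference statistics, three second-difference statistics of the log-likelihood series, and a zlib compression ratio, 11 features in total. A minimal sketch of the same arithmetic on synthetic log-likelihoods (no model required, values are random stand-ins for Falcon-7B output):

```python
import zlib

import numpy as np
from scipy.stats import skew, kurtosis, entropy

# Synthetic per-token log-likelihoods standing in for real model output.
rng = np.random.default_rng(0)
log_likelihoods = rng.normal(loc=-3.0, scale=1.5, size=200)
s = -log_likelihoods                         # surprisal = negative log-likelihood
diff_s = np.diff(s)                          # first differences of surprisal
second = np.diff(np.diff(log_likelihoods))   # second differences of log-likelihood
text = "some input text " * 20               # stand-in for the scored document

features = [
    np.mean(s), np.std(s), np.var(s), skew(s), kurtosis(s),               # 5 moments
    np.mean(diff_s), np.std(diff_s),                                      # 2 first-diff stats
    np.var(second),                                                       # variance of 2nd diff
    entropy(np.histogram(second, bins=20, density=True)[0]),              # histogram entropy
    np.corrcoef(second[:-1], second[1:])[0, 1],                           # lag-1 autocorrelation
    len(zlib.compress(text.encode("utf-8"))) / len(text.encode("utf-8")), # compression ratio
]
print(len(features))  # -> 11
```

Together with BiScope's 9 split points times 8 loss statistics (72 values), the classifier sees an 83-dimensional feature vector per text.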
```diff
@@ -27,7 +134,7 @@ def detect_ai_text(text):
     )
 
     # Call software
-    ai_prob =
+    ai_prob = evaluate(diveye, biscope, text)
     human_prob = 1 - ai_prob
 
     if ai_prob > 0.7:
```
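In this hunk, `ai_prob` now comes from the module-level `evaluate` rather than the old `Software` wrapper. For orientation, a sketch of how the surrounding unchanged lines consume the probability (the verdict strings here are placeholders, not the app's exact wording):

```python
import pandas as pd

def summarize(ai_prob: float):
    # Mirrors the unchanged code around this hunk.
    human_prob = 1 - ai_prob
    message = "Likely AI-generated" if ai_prob > 0.7 else "Likely human-written"
    bar_data = pd.DataFrame({
        "Source": ["AI Generated", "Human Written"],
        "Probability (%)": [ai_prob * 100, human_prob * 100],
    })
    return message, round(ai_prob, 3), bar_data

print(summarize(0.82)[0])  # -> Likely AI-generated
```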
```diff
@@ -44,15 +151,18 @@ def detect_ai_text(text):
 
     return message, round(ai_prob, 3), bar_data
 
+# ==========================================================
 # Token from environment variable
 token = os.getenv("HF_TOKEN")
+loaded = False
 
 if not torch.cuda.is_available():
+    loaded = False
     print("[DivEye] CUDA not available. Running on CPU.")
-    DESCRIPTION = "This demo requires a GPU to run efficiently. Please use a machine with CUDA support."
 
 # Import necessary models and tokenizers
 if torch.cuda.is_available():
+    loaded = True
     model_name_div = "tiiuae/falcon-7b"
     model_name_bi = "google/gemma-1.1-2b-it"
 
```
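The new `loaded` flag replaces the removed `DESCRIPTION` fallback string: it defaults to False and only flips to True on the CUDA path, which makes the explicit `loaded = False` in the CPU branch redundant (though harmless). A condensed, runnable mirror of the gating, with simplified return values:

```python
import torch

loaded = False                 # default: nothing usable
if not torch.cuda.is_available():
    loaded = False             # redundant with the default, kept as in the PR
    print("[DivEye] CUDA not available. Running on CPU.")
else:
    loaded = True              # models get loaded further down

def detect_ai_text(text: str):
    # Same early exit the PR adds to the real detect_ai_text.
    if not loaded:
        return "Model not loaded.", 0.0
    return "ok", 1.0           # placeholder for the real inference path

print(detect_ai_text("hello"))
```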
```diff
@@ -64,8 +174,13 @@ if torch.cuda.is_available():
 
     div_model.eval()
     bi_model.eval()
+
+    model_path = Path(__file__).parent / "model.json"
+    model = xgb.XGBClassifier()
+    model.load_model(model_path)
 
-
+    diveye = Diversity(div_model, div_tokenizer, div_model.device)
+    biscope = BiScope(bi_model, bi_tokenizer, bi_model.device)
 
 # Gradio app setup
 with gr.Blocks(title="DivEye") as demo:
```
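This hunk moves the XGBoost head out of `Software.__init__` and into the CUDA branch at import time, alongside one-time construction of the Diversity and BiScope objects. A self-contained sketch of just the load-and-predict path, assuming model.json sits next to the script and was produced by `XGBClassifier.save_model`, with random numbers standing in for the real 83-dimensional feature vector:

```python
from pathlib import Path

import numpy as np
import xgboost as xgb

# Same layout the PR assumes: model.json next to the script.
model_path = Path(__file__).parent / "model.json"
model = xgb.XGBClassifier()
model.load_model(model_path)

# 11 Diversity features + 9 splits x 8 BiScope stats = 83 inputs (by count).
features = np.random.rand(1, 83)
ai_prob = model.predict_proba(features)[:, 1][0].item()
print(f"P(AI-generated) = {ai_prob:.3f}")
```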
software.py
DELETED
```diff
@@ -1,125 +0,0 @@
-import json
-import xgboost as xgb
-import pandas as pd
-import numpy as np
-import torch
-import zlib
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from scipy.stats import skew, kurtosis, entropy
-from tqdm import tqdm
-from torch.nn import CrossEntropyLoss
-from pathlib import Path
-import spaces
-import os
-
-class Diversity:
-    def __init__(self, model, tokenizer, device):
-        self.tokenizer = tokenizer
-        self.model = model
-        self.device = device
-
-    def compute_log_likelihoods(self, text):
-        tokens = self.tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=1024).to(self.device)
-        with torch.no_grad():
-            outputs = self.model(tokens, labels=tokens)
-            logits = outputs.logits
-        shift_logits = logits[:, :-1, :].squeeze(0)
-        shift_labels = tokens[:, 1:].squeeze(0)
-        log_probs = torch.log_softmax(shift_logits.float(), dim=-1)
-        token_log_likelihoods = log_probs[range(shift_labels.shape[0]), shift_labels].cpu().numpy()
-        return token_log_likelihoods
-
-    def compute_surprisal(self, text):
-        log_likelihoods = self.compute_log_likelihoods(text)
-        return -log_likelihoods
-
-    def compute_features(self, text):
-        surprisals = self.compute_surprisal(text)
-        log_likelihoods = self.compute_log_likelihoods(text)
-        if len(surprisals) < 10 or len(log_likelihoods) < 3:
-            return None
-
-        s = np.array(surprisals)
-        mean_s, std_s, var_s, skew_s, kurt_s = np.mean(s), np.std(s), np.var(s), skew(s), kurtosis(s)
-        diff_s = np.diff(s)
-        mean_diff, std_diff = np.mean(diff_s), np.std(diff_s)
-        first_order_diff = np.diff(log_likelihoods)
-        second_order_diff = np.diff(first_order_diff)
-        var_2nd = np.var(second_order_diff)
-        entropy_2nd = entropy(np.histogram(second_order_diff, bins=20, density=True)[0])
-        autocorr_2nd = np.corrcoef(second_order_diff[:-1], second_order_diff[1:])[0, 1] if len(second_order_diff) > 1 else 0
-        comp_ratio = len(zlib.compress(text.encode('utf-8'))) / len(text.encode('utf-8'))
-
-        return [mean_s, std_s, var_s, skew_s, kurt_s, mean_diff, std_diff, var_2nd, entropy_2nd, autocorr_2nd, comp_ratio]
-
-class BiScope:
-    def __init__(self, model, tokenizer, device):
-        self.COMPLETION_PROMPT_ONLY = "Complete the following text: "
-        self.tokenizer = tokenizer
-        self.model = model
-        self.device = device
-
-    def compute_fce_loss(self, logits, targets, text_slice):
-        return CrossEntropyLoss(reduction='none')(
-            logits[0, text_slice.start-1:text_slice.stop-1, :],
-            targets
-        ).detach().cpu().numpy()
-
-    def compute_bce_loss(self, logits, targets, text_slice):
-        return CrossEntropyLoss(reduction='none')(
-            logits[0, text_slice, :],
-            targets
-        ).detach().cpu().numpy()
-
-    def detect_single_sample(self, sample):
-        prompt_ids = self.tokenizer(self.COMPLETION_PROMPT_ONLY, return_tensors='pt').input_ids.to(self.device)
-        text_ids = self.tokenizer(sample, return_tensors='pt', max_length=2000, truncation=True).input_ids.to(self.device)
-        combined_ids = torch.cat([prompt_ids, text_ids], dim=1)
-        text_slice = slice(prompt_ids.shape[1], combined_ids.shape[1])
-
-        outputs = self.model(input_ids=combined_ids)
-        logits = outputs.logits
-        targets = combined_ids[0][text_slice]
-
-        fce_loss = self.compute_fce_loss(logits, targets, text_slice)
-        bce_loss = self.compute_bce_loss(logits, targets, text_slice)
-
-        features = []
-        for p in range(1, 10):
-            split = len(fce_loss) * p // 10
-            fce_clipped = np.nan_to_num(np.clip(fce_loss[split:], -1e6, 1e6), nan=0.0, posinf=1e6, neginf=-1e6)
-            bce_clipped = np.nan_to_num(np.clip(bce_loss[split:], -1e6, 1e6), nan=0.0, posinf=1e6, neginf=-1e6)
-            features.extend([
-                np.mean(fce_clipped), np.max(fce_clipped), np.min(fce_clipped), np.std(fce_clipped),
-                np.mean(bce_clipped), np.max(bce_clipped), np.min(bce_clipped), np.std(bce_clipped)
-            ])
-        return features
-
-
-class Software:
-    def __init__(self, div_model, div_tokenizer, bi_model, bi_tokenizer, device_div="cuda", device_bi="cuda"):
-        self.div_model = div_model
-        self.div_tokenizer = div_tokenizer
-        self.bi_model = bi_model
-        self.bi_tokenizer = bi_tokenizer
-
-        self.device_div = device_div
-        self.device_bi = device_bi
-
-        self.model_path = Path(__file__).parent / "model.json"
-
-        self.model = xgb.XGBClassifier()
-        self.model.load_model(self.model_path)
-
-    @spaces.GPU
-    def evaluate(self, text):
-        diveye = Diversity(self.div_model, self.div_tokenizer, self.device_div)
-        biscope = BiScope(self.bi_model, self.bi_tokenizer, self.device_bi)
-
-        diveye_features = diveye.compute_features(text)
-        biscope_features = biscope.detect_single_sample(text)
-
-        for f in biscope_features:
-            diveye_features.append(f)
-
-        return self.model.predict_proba([diveye_features])[:, 1][0].item()
```
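For reference while reading the deleted BiScope code (identical to the copy now in app.py): what the code calls the FCE loss aligns each target token with the logits one position before it, i.e. ordinary next-token prediction, while the BCE loss pairs targets with the logits at their own positions. A self-contained toy run of both alignments on random tensors:

```python
import torch
from torch.nn import CrossEntropyLoss

vocab, prompt_len, text_len = 100, 5, 12
logits = torch.randn(1, prompt_len + text_len, vocab)  # fake model output
ids = torch.randint(vocab, (1, prompt_len + text_len)) # fake token ids

text_slice = slice(prompt_len, prompt_len + text_len)
targets = ids[0][text_slice]

loss_fn = CrossEntropyLoss(reduction="none")
# FCE: logits shifted one step left of the targets (next-token alignment).
fce = loss_fn(logits[0, text_slice.start - 1:text_slice.stop - 1, :], targets)
# BCE: logits taken at the target positions themselves.
bce = loss_fn(logits[0, text_slice, :], targets)
print(fce.shape, bce.shape)  # torch.Size([12]) torch.Size([12])
```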