Spaces:
Runtime error
Runtime error
File size: 7,716 Bytes
394a4ce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
import requests
import httpx
import torch
import re
from bs4 import BeautifulSoup
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import asyncio
from evaluate import load
from datetime import date
import nltk
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import plotly.graph_objects as go
import torch.nn.functional as F
import nltk
from unidecode import unidecode
import time
from scipy.special import softmax
import yaml
import os
from utils import *
from dotenv import load_dotenv
# ---------------------------------------------------------------------------
# Module-level setup: everything below runs once, at import time.
# ---------------------------------------------------------------------------

# Static configuration (label sets) lives in config.yaml in the working directory.
with open('config.yaml', 'r') as file:
    params = yaml.safe_load(file)

# Download the NLTK 'punkt' and 'stopwords' resources.
nltk.download('punkt')
nltk.download('stopwords')

# Load variables from a .env file into the process environment.
load_dotenv()

device = "cuda" if torch.cuda.is_available() else "cpu"

# Credentials and model locations come from the environment.
hf_token = os.getenv("HF_TOKEN")
text_bc_model_path = os.getenv("TEXT_BC_MODEL_PATH")
text_mc_model_path = os.getenv("TEXT_MC_MODEL_PATH")
text_quillbot_model_path = os.getenv("TEXT_QUILLBOT_MODEL_PATH")

# Label sets come from the YAML configuration.
quillbot_labels = params["QUILLBOT_LABELS"]
mc_label_map = params["MC_OUTPUT_LABELS"]

# Token budgets per detector. os.getenv returns None for an unset variable,
# in which case int() raises TypeError here.
mc_token_size = int(os.getenv("MC_TOKEN_SIZE"))
bc_token_size = int(os.getenv("BC_TOKEN_SIZE"))


def _load_classifier(model_path):
    """Return ``(tokenizer, model)`` for *model_path*, with the model moved to ``device``."""
    # NOTE(review): newer transformers releases prefer ``token=`` over
    # ``use_auth_token=`` — confirm against the pinned version before changing.
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_auth_token=hf_token)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path, use_auth_token=hf_token
    ).to(device)
    return tokenizer, model


text_bc_tokenizer, text_bc_model = _load_classifier(text_bc_model_path)
text_mc_tokenizer, text_mc_model = _load_classifier(text_mc_model_path)
quillbot_tokenizer, quillbot_model = _load_classifier(text_quillbot_model_path)
def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=30, min_last_segment_length=100, type_det='bc'):
    """Split *text* into sentence-aligned segments sized for one of the detectors.

    Sentences (from ``nltk.sent_tokenize``) are packed greedily into segments
    whose token count stays within ``max_length + tolerance``. A short final
    segment is folded into the previous one when the combination still fits.
    Each segment is then encoded (with truncation) and decoded back to text.

    Args:
        text: The raw text to split.
        max_length: Kept for interface compatibility. It is always replaced by
            ``bc_token_size`` or ``mc_token_size`` depending on *type_det*.
        tolerance: Extra tokens allowed on top of the detector's token size.
        min_last_segment_length: If the last segment encodes to fewer tokens
            than this, try to merge it into the preceding segment.
        type_det: ``'bc'`` to use the binary-classifier tokenizer and budget,
            ``'mc'`` for the multi-class ones.

    Returns:
        A list of strings, one per segment, produced by ``tokenizer.decode``
        on the encoded segment. Empty when *text* contains no sentences.

    Raises:
        ValueError: If *type_det* is neither ``'bc'`` nor ``'mc'``.
    """
    if type_det == 'bc':
        tokenizer = text_bc_tokenizer
        max_length = bc_token_size
    elif type_det == 'mc':
        tokenizer = text_mc_tokenizer
        max_length = mc_token_size
    else:
        # Previously this fell through and failed later with UnboundLocalError.
        raise ValueError(f"type_det must be 'bc' or 'mc', got {type_det!r}")

    token_limit = max_length + tolerance

    def encode_truncated(sentences):
        """Encode the joined sentences, truncated to the segment token limit."""
        return tokenizer.encode(
            ' '.join(sentences),
            add_special_tokens=True,
            max_length=token_limit,
            truncation=True,
        )

    # Stage 1: greedily pack whole sentences into segments.
    segments = []  # each entry is a list of sentences
    current_segment = []
    current_length = 0
    for sentence in nltk.sent_tokenize(text):
        sentence_length = len(tokenizer.tokenize(sentence))
        # The "- 2" keeps two positions free, presumably for the special
        # tokens added when the segment is encoded — TODO confirm.
        if current_length + sentence_length <= token_limit - 2:
            current_segment.append(sentence)
            current_length += sentence_length
        else:
            if current_segment:
                segments.append(current_segment)
            # A single over-long sentence still starts its own segment; it is
            # truncated when encoded below.
            current_segment = [sentence]
            current_length = sentence_length
    if current_segment:
        segments.append(current_segment)

    # Stage 2: fold a short trailing segment into its predecessor if it fits.
    if len(segments) > 1:
        tail = segments[-1]
        if len(encode_truncated(tail)) < min_last_segment_length:
            combined = segments[-2] + tail
            # Measure the untruncated length. Encoding with truncation would
            # cap the result at token_limit, making this check always pass
            # and silently dropping the overflow.
            combined_length = len(
                tokenizer.encode(' '.join(combined), add_special_tokens=True)
            )
            if combined_length <= token_limit:
                segments[-2:] = [combined]

    # Stage 3: round-trip each segment through the tokenizer.
    return [tokenizer.decode(encode_truncated(segment)) for segment in segments]
def predict_quillbot(text):
    """Score *text* with the QuillBot detector.

    The text is tokenized with ``quillbot_tokenizer`` (padded and truncated to
    256 tokens), run through ``quillbot_model`` without gradient tracking, and
    the logits are turned into probabilities with a softmax.

    Returns:
        A dict with keys ``"QuillBot"`` (probability at index 1) and
        ``"Original"`` (probability at index 0) for the first row of the output.
    """
    quillbot_model.eval()
    encoded = quillbot_tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        logits = quillbot_model(**encoded).logits

    probabilities = softmax(logits.detach().cpu().numpy(), 1)[0]
    return {
        "QuillBot": probabilities[1].item(),
        "Original": probabilities[0].item(),
    }
def predict_bc(model, tokenizer, text):
    """Run the binary classifier on *text* and return its class probabilities.

    Args:
        model: Sequence-classification model; it is switched to eval mode.
        tokenizer: Tokenizer used to encode *text*. Input is padded and
            truncated to ``bc_token_size`` tokens.
        text: The text to classify.

    Returns:
        The softmax of the logits for the first row of the model output, as a
        NumPy array. Callers in this file read index 0 as HUMAN and index 1
        as AI.
    """
    with torch.no_grad():
        model.eval()
        # Use the tokenizer that was passed in. The previous code ignored the
        # argument and always used the module-level text_bc_tokenizer.
        tokens = tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=bc_token_size,
            return_tensors="pt",
        ).to(device)
        output = model(**tokens)
        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
    return output_norm
def predict_mc(model, tokenizer, text):
    """Run the multi-class classifier on *text* and return its class probabilities.

    Args:
        model: Sequence-classification model; it is switched to eval mode.
        tokenizer: Tokenizer used to encode *text*. Input is padded and
            truncated to ``mc_token_size`` tokens.
        text: The text to classify.

    Returns:
        The softmax of the logits for the first row of the model output, as a
        NumPy array. Callers in this file pair the entries positionally with
        ``mc_label_map``.
    """
    with torch.no_grad():
        model.eval()
        # Use the tokenizer that was passed in. The previous code ignored the
        # argument and always used the module-level text_mc_tokenizer.
        tokens = tokenizer(
            text,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
            max_length=mc_token_size,
        ).to(device)
        output = model(**tokens)
        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
    return output_norm
def predict_mc_scores(input):
    """Return per-source scores for *input*, scaled by the binary AI probability.

    The text is segmented twice — once with the binary-classifier budget and
    once with the multi-class budget. Each segment is cleaned with
    ``remove_special_characters`` and scored, and the per-segment
    probabilities are averaged for each detector.

    The averaged multi-class probabilities are keyed by the upper-cased labels
    from ``mc_label_map`` and multiplied by ``1 - HUMAN`` from the binary
    detector. If that factor is below 0.01 the multi-class entries are dropped
    entirely. The binary ``HUMAN`` probability is always included.

    Args:
        input: The text to score.

    Returns:
        A dict mapping label names to scores, always containing ``'HUMAN'``.

    Raises:
        ValueError: If *input* yields no segments (e.g. it is empty).
    """
    # --- Binary detector -------------------------------------------------
    # Segment once and iterate the result; the text was previously split
    # twice, once only to take its length.
    segments_bc = split_text_allow_complete_sentences_nltk(input, type_det='bc')
    if not segments_bc:
        # Without this guard the mean over an empty list is NaN and the
        # indexing below fails with an unrelated-looking TypeError.
        raise ValueError("input text produced no segments to score")

    bc_scores = [
        predict_bc(text_bc_model, text_bc_tokenizer, remove_special_characters(segment))
        for segment in segments_bc
    ]
    bc_score_list = np.mean(np.array(bc_scores), axis=0).tolist()
    bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}

    # --- Multi-class detector --------------------------------------------
    segments_mc = split_text_allow_complete_sentences_nltk(input, type_det='mc')
    mc_scores = [
        predict_mc(text_mc_model, text_mc_tokenizer, remove_special_characters(segment))
        for segment in segments_mc
    ]
    mc_score_list = np.mean(np.array(mc_scores), axis=0).tolist()

    # --- Combine ----------------------------------------------------------
    sum_prob = 1 - bc_score['HUMAN']
    if sum_prob < 0.01:
        # Below this threshold no source attribution is reported.
        mc_score = {}
    else:
        mc_score = {
            label.upper(): score * sum_prob
            for score, label in zip(mc_score_list, mc_label_map)
        }
    mc_score['HUMAN'] = bc_score['HUMAN']
    return mc_score
def predict_bc_scores(input):
    """Return the averaged binary (AI vs. human) probabilities for *input*.

    The text is segmented with the binary-classifier budget. Each segment is
    cleaned with ``remove_special_characters`` and scored with ``predict_bc``,
    and the per-segment probabilities are averaged.

    Args:
        input: The text to score.

    Returns:
        A dict ``{"AI": p_ai, "HUMAN": p_human}``, where ``p_ai`` is the
        averaged probability at index 1 and ``p_human`` the one at index 0.

    Raises:
        ValueError: If *input* yields no segments (e.g. it is empty).
    """
    # Segment once and iterate the result; the text was previously split
    # twice, once only to take its length.
    segments_bc = split_text_allow_complete_sentences_nltk(input, type_det='bc')
    if not segments_bc:
        # Without this guard the mean over an empty list is NaN and the
        # indexing below fails with an unrelated-looking TypeError.
        raise ValueError("input text produced no segments to score")

    bc_scores = [
        predict_bc(text_bc_model, text_bc_tokenizer, remove_special_characters(segment))
        for segment in segments_bc
    ]
    bc_score_list = np.mean(np.array(bc_scores), axis=0).tolist()
    return {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}