import os
import re
import time
import asyncio
from datetime import date

import requests
import httpx
import torch
import torch.nn.functional as F
import numpy as np
import nltk
import yaml
import plotly.graph_objects as go
from bs4 import BeautifulSoup
from evaluate import load
from scipy.special import softmax
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    GPT2LMHeadModel,
    GPT2TokenizerFast,
)
from unidecode import unidecode
from utils import *
from dotenv import load_dotenv
# Load label maps and other runtime parameters from the local YAML config.
with open('config.yaml', 'r') as file:
    params = yaml.safe_load(file)

nltk.download('punkt')
nltk.download('stopwords')
load_dotenv()

device = "cuda" if torch.cuda.is_available() else "cpu"

# Secrets, model locations, and token budgets come from the environment.
hf_token = os.getenv("HF_TOKEN")
text_bc_model_path = os.getenv("TEXT_BC_MODEL_PATH")
text_mc_model_path = os.getenv("TEXT_MC_MODEL_PATH")
text_quillbot_model_path = os.getenv("TEXT_QUILLBOT_MODEL_PATH")
quillbot_labels = params["QUILLBOT_LABELS"]
mc_label_map = params["MC_OUTPUT_LABELS"]
mc_token_size = int(os.getenv("MC_TOKEN_SIZE"))
bc_token_size = int(os.getenv("BC_TOKEN_SIZE"))

# Binary classifier: human vs. AI.
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path, use_auth_token=hf_token)
text_bc_model = AutoModelForSequenceClassification.from_pretrained(
    text_bc_model_path, use_auth_token=hf_token
).to(device)

# Multi-class classifier over the labels in MC_OUTPUT_LABELS.
text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path, use_auth_token=hf_token)
text_mc_model = AutoModelForSequenceClassification.from_pretrained(
    text_mc_model_path, use_auth_token=hf_token
).to(device)

# QuillBot-vs-original classifier.
quillbot_tokenizer = AutoTokenizer.from_pretrained(text_quillbot_model_path, use_auth_token=hf_token)
quillbot_model = AutoModelForSequenceClassification.from_pretrained(
    text_quillbot_model_path, use_auth_token=hf_token
).to(device)
def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=30, min_last_segment_length=100, type_det='bc'):
    """Split `text` into tokenizer-sized segments without breaking sentences.

    The effective max_length is taken from the detector's configured token
    size (bc_token_size or mc_token_size), overriding the default argument.
    """
    sentences = nltk.sent_tokenize(text)
    segments = []
    current_segment = []
    current_length = 0

    if type_det == 'bc':
        tokenizer = text_bc_tokenizer
        max_length = bc_token_size
    elif type_det == 'mc':
        tokenizer = text_mc_tokenizer
        max_length = mc_token_size
    else:
        raise ValueError(f"Unknown type_det: {type_det}")

    # Greedily pack whole sentences until the token budget is exceeded.
    for sentence in sentences:
        sentence_length = len(tokenizer.tokenize(sentence))
        if current_length + sentence_length <= max_length + tolerance - 2:
            current_segment.append(sentence)
            current_length += sentence_length
        else:
            if current_segment:
                encoded_segment = tokenizer.encode(
                    ' '.join(current_segment), add_special_tokens=True,
                    max_length=max_length + tolerance, truncation=True,
                )
                segments.append((current_segment, len(encoded_segment)))
            current_segment = [sentence]
            current_length = sentence_length
    if current_segment:
        encoded_segment = tokenizer.encode(
            ' '.join(current_segment), add_special_tokens=True,
            max_length=max_length + tolerance, truncation=True,
        )
        segments.append((current_segment, len(encoded_segment)))

    # Merge a too-short final segment into the previous one when it still fits.
    final_segments = []
    for i, (seg, length) in enumerate(segments):
        if i == len(segments) - 1 and length < min_last_segment_length and final_segments:
            prev_seg, _ = final_segments[-1]
            combined_encoded = tokenizer.encode(
                ' '.join(prev_seg + seg), add_special_tokens=True,
                max_length=max_length + tolerance, truncation=True,
            )
            if len(combined_encoded) <= max_length + tolerance:
                final_segments[-1] = (prev_seg + seg, len(combined_encoded))
            else:
                final_segments.append((seg, length))
        else:
            final_segments.append((seg, length))

    # Return the segments as plain text, truncated to the token budget.
    decoded_segments = []
    for seg, _ in final_segments:
        encoded_segment = tokenizer.encode(
            ' '.join(seg), add_special_tokens=True,
            max_length=max_length + tolerance, truncation=True,
        )
        # skip_special_tokens keeps [CLS]/[SEP]-style markers out of the text
        # that gets re-tokenized downstream.
        decoded_segments.append(tokenizer.decode(encoded_segment, skip_special_tokens=True))
    return decoded_segments
def predict_quillbot(text):
    """Score `text` as QuillBot-paraphrased vs. original writing."""
    with torch.no_grad():
        quillbot_model.eval()
        tokenized_text = quillbot_tokenizer(
            text, padding="max_length", truncation=True, max_length=256, return_tensors="pt"
        ).to(device)
        output = quillbot_model(**tokenized_text)
        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
        q_score = {"QuillBot": output_norm[1].item(), "Original": output_norm[0].item()}
        return q_score
def predict_bc(model, tokenizer, text):
    """Return softmax probabilities [human, AI] for a single text segment."""
    with torch.no_grad():
        model.eval()
        # Use the tokenizer passed in, not the module-level one.
        tokens = tokenizer(
            text, padding='max_length', truncation=True, max_length=bc_token_size, return_tensors="pt"
        ).to(device)
        output = model(**tokens)
        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
        return output_norm


def predict_mc(model, tokenizer, text):
    """Return softmax probabilities over the multi-class label map for a segment."""
    with torch.no_grad():
        model.eval()
        tokens = tokenizer(
            text, padding='max_length', truncation=True, return_tensors="pt", max_length=mc_token_size
        ).to(device)
        output = model(**tokens)
        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
        return output_norm
def predict_mc_scores(text):
    # Binary pass: average human/AI probabilities over sentence-aware segments.
    bc_scores = []
    segments_bc = split_text_allow_complete_sentences_nltk(text, type_det='bc')
    for segment in segments_bc:
        cleaned_text_bc = remove_special_characters(segment)
        bc_scores.append(predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc))
    average_bc_scores = np.mean(np.array(bc_scores), axis=0)
    bc_score_list = average_bc_scores.tolist()
    bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}

    # Multi-class pass: average per-label probabilities over segments.
    mc_scores = []
    segments_mc = split_text_allow_complete_sentences_nltk(text, type_det='mc')
    for segment in segments_mc:
        cleaned_text_mc = remove_special_characters(segment)
        mc_scores.append(predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc))
    average_mc_scores = np.mean(np.array(mc_scores), axis=0)
    mc_score_list = average_mc_scores.tolist()
    mc_score = {}
    for score, label in zip(mc_score_list, mc_label_map):
        mc_score[label.upper()] = score

    # Scale the multi-class probabilities by the overall AI probability;
    # if the text is almost certainly human, report only the human score.
    sum_prob = 1 - bc_score['HUMAN']
    for key, value in mc_score.items():
        mc_score[key] = value * sum_prob
    if sum_prob < 0.01:
        mc_score = {}
        mc_score['HUMAN'] = bc_score['HUMAN']
    return mc_score
def predict_bc_scores(text):
    # Average the binary human/AI probabilities over sentence-aware segments.
    bc_scores = []
    segments_bc = split_text_allow_complete_sentences_nltk(text, type_det='bc')
    for segment in segments_bc:
        cleaned_text_bc = remove_special_characters(segment)
        bc_scores.append(predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc))
    average_bc_scores = np.mean(np.array(bc_scores), axis=0)
    bc_score_list = average_bc_scores.tolist()
    bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
    return bc_score