# Import dependencies
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration
import torch
import nltk
from nltk.corpus import wordnet  # WordNet corpus for synonym lookup (downloaded below)
import random
import string
import spacy
import subprocess # Import subprocess for downloading spaCy models
# Download NLTK data (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet') # Download WordNet for enhanced synonym lookup
# Download spaCy model if not already installed
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")
# Check for GPU and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load AI Detector model and tokenizer from Hugging Face (DistilBERT)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english").to(device)
# Load SRDdev Paraphrase model and tokenizer for humanizing text
paraphrase_tokenizer = T5Tokenizer.from_pretrained("SRDdev/Paraphrase")
paraphrase_model = T5ForConditionalGeneration.from_pretrained("SRDdev/Paraphrase").to(device)
# AI detection function using DistilBERT with batch processing
def detect_ai_generated(texts):
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, max_length=512, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.softmax(outputs.logits, dim=1)[:, 1].cpu().tolist()  # One AI-generated probability per input text
    return probabilities
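# Illustrative usage only (not executed at import time), assuming the models above loaded successfully:
#   probs = detect_ai_generated(["First sentence.", "Second sentence."])
# probs is a plain Python list of floats in [0, 1], one score per input sentence;
# values above 0.5 are counted as "AI-generated" in main_function below.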
# Synonym replacement using spaCy for POS tagging and NLTK WordNet for synonym lookup
def replace_with_synonyms(text, probability=0.3):
    doc = nlp(text)
    # Map spaCy coarse POS tags to the corresponding WordNet POS constants
    pos_map = {"NOUN": wordnet.NOUN, "VERB": wordnet.VERB, "ADJ": wordnet.ADJ, "ADV": wordnet.ADV}
    new_text = []
    for token in doc:
        if random.random() < probability and token.pos_ in pos_map:
            # Gather distinct lemmas from all WordNet synsets of this word, excluding the word itself
            synonyms = {
                lemma.name().replace("_", " ")
                for synset in wordnet.synsets(token.text, pos=pos_map[token.pos_])
                for lemma in synset.lemmas()
                if lemma.name().lower() != token.text.lower()
            }
            if synonyms:
                new_text.append(random.choice(sorted(synonyms)))
            else:
                new_text.append(token.text)
        else:
            new_text.append(token.text)
    return " ".join(new_text)
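# Sketch of the underlying lookup (illustrative only, assuming the 'wordnet' corpus
# downloaded above is available):
#   candidates = {l.name() for s in wordnet.synsets("quick", pos=wordnet.ADJ) for l in s.lemmas()}
# yields synonym candidates such as "speedy", any of which may be substituted for "quick".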
# Random text transformations to simulate human-like errors
def random_capitalize(word):
    if word.isalpha() and random.random() < 0.1:
        return word.capitalize()
    return word
def random_remove_punctuation(text):
    if random.random() < 0.2:
        text = list(text)
        indices = [i for i, c in enumerate(text) if c in string.punctuation]
        if indices:
            remove_indices = random.sample(indices, min(3, len(indices)))
            for idx in sorted(remove_indices, reverse=True):
                text.pop(idx)
        return ''.join(text)
    return text
def random_double_period(text):
    if random.random() < 0.2:
        text = text.replace('.', '..', 3)
    return text
def random_double_space(text):
    if random.random() < 0.2:
        words = text.split()
        for _ in range(min(3, len(words) - 1)):
            idx = random.randint(0, len(words) - 2)
            words[idx] += ' '
        return ' '.join(words)
    return text
def random_replace_comma_space(text, period_replace_percentage=0.33):
    comma_occurrences = text.count(", ")
    period_occurrences = text.count(". ")
    replace_count_comma = max(1, comma_occurrences // 3)
    replace_count_period = max(1, period_occurrences // 3)
    comma_indices = [i for i in range(len(text)) if text.startswith(", ", i)]
    period_indices = [i for i in range(len(text)) if text.startswith(". ", i)]
    replace_indices_comma = random.sample(comma_indices, min(replace_count_comma, len(comma_indices)))
    replace_indices_period = random.sample(period_indices, min(replace_count_period, len(period_indices)))
    for idx in sorted(replace_indices_comma + replace_indices_period, reverse=True):
        if text.startswith(", ", idx):
            text = text[:idx] + " ," + text[idx + 2:]
        if text.startswith(". ", idx):
            text = text[:idx] + " ." + text[idx + 2:]
    return text
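# Illustrative effect on a hypothetical input: random_replace_comma_space("Hello, world. Bye.")
# returns "Hello ,world .Bye." here, since roughly a third (and at least one) of the
# ", " and ". " occurrences have the space moved in front of the punctuation mark.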
def transform_paragraph(paragraph):
    words = paragraph.split()
    if len(words) > 12:
        words = [random_capitalize(word) for word in words]
        transformed_paragraph = ' '.join(words)
        transformed_paragraph = random_remove_punctuation(transformed_paragraph)
        transformed_paragraph = random_double_period(transformed_paragraph)
        transformed_paragraph = random_double_space(transformed_paragraph)
        transformed_paragraph = random_replace_comma_space(transformed_paragraph)
        transformed_paragraph = replace_with_synonyms(transformed_paragraph)  # spaCy POS tags + WordNet synonyms
    else:
        transformed_paragraph = paragraph
    return transformed_paragraph
def transform_text(text):
    paragraphs = text.split('\n')
    transformed_paragraphs = [transform_paragraph(paragraph) for paragraph in paragraphs]
    return '\n'.join(transformed_paragraphs)
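# Illustrative example (not executed): paragraph boundaries are preserved, so
#   transform_text("A first paragraph that is comfortably longer than twelve words gets perturbed here.\nShort one.")
# only alters the first paragraph; the second stays untouched because it is under the
# 12-word threshold in transform_paragraph.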
# Humanize the AI-detected text using the SRDdev Paraphrase model with optimized parameters
def humanize_text(AI_text):
    paragraphs = AI_text.split("\n")
    paraphrased_paragraphs = []
    for paragraph in paragraphs:
        if paragraph.strip():
            inputs = paraphrase_tokenizer(paragraph, return_tensors="pt", max_length=512, truncation=True).to(device)
            paraphrased_ids = paraphrase_model.generate(
                inputs['input_ids'],
                max_length=inputs['input_ids'].shape[-1] + 20,
                num_beams=2,             # Reduced beam size for speed
                early_stopping=True,
                length_penalty=0.8,      # Lower penalty to generate faster
                no_repeat_ngram_size=2,  # Reduced for performance
                do_sample=True,          # Enable sampling to add randomness
                top_k=50,                # Top-k sampling
                top_p=0.95,              # Top-p (nucleus) sampling
            )
            paraphrased_text = paraphrase_tokenizer.decode(paraphrased_ids[0], skip_special_tokens=True)
            paraphrased_paragraphs.append(paraphrased_text)
    return "\n\n".join(paraphrased_paragraphs)
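# Illustrative usage (not executed at import time):
#   rewritten = humanize_text("Paragraph one about a topic.\nParagraph two about another topic.")
# Each non-empty input paragraph is paraphrased independently; note the output joins
# paragraphs with blank lines, so it can contain more newlines than the input.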
# Main function to handle the overall process with batch processing
def main_function(AI_text):
    sentences = nltk.sent_tokenize(AI_text)
    ai_probabilities = detect_ai_generated(sentences)
    # Guard against empty input so the percentage calculation cannot divide by zero
    if ai_probabilities:
        ai_generated_percentage = sum(1 for prob in ai_probabilities if prob > 0.5) / len(ai_probabilities) * 100
    else:
        ai_generated_percentage = 0.0
    # Transform the AI text to make it read more like human writing
    humanized_text = humanize_text(AI_text)
    humanized_text = transform_text(humanized_text)  # Add randomness to simulate human errors
    return f"AI-Generated Content: {ai_generated_percentage:.2f}%\n\nHumanized Text:\n{humanized_text}"
# Gradio interface definition
interface = gr.Interface(
    fn=main_function,
    inputs="textbox",
    outputs="textbox",
    title="AI Text Humanizer",
    description="Enter AI-generated text and get a more human-sounding version. This Space loads its models directly from the Hugging Face Hub."
)
# Launch the Gradio app
interface.launch(debug=True)