# Import dependencies
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration
import torch
import nltk
from nltk.corpus import wordnet  # WordNet corpus for synonym lookup (downloaded below)
import random
import string
import spacy
import subprocess # Import subprocess for downloading spaCy models
# Download NLTK data (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet') # Download WordNet for enhanced synonym lookup
# Download spaCy model if not already installed
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")
# Check for GPU and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load AI Detector model and tokenizer from Hugging Face (DistilBERT)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english").to(device)
# Load SRDdev Paraphrase model and tokenizer for humanizing text
paraphrase_tokenizer = T5Tokenizer.from_pretrained("SRDdev/Paraphrase")
paraphrase_model = T5ForConditionalGeneration.from_pretrained("SRDdev/Paraphrase").to(device)
# AI detection function using DistilBERT with batch processing
def detect_ai_generated(texts):
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, max_length=512, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.softmax(outputs.logits, dim=1)[:, 1].cpu().tolist()  # One AI-generated probability per input text
    return probabilities
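# Illustrative usage only (not executed at import time), assuming the models above loaded successfully:
#   probs = detect_ai_generated(["First sentence.", "Second sentence."])
# probs is a plain Python list of floats in [0, 1], one score per input sentence;
# values above 0.5 are counted as "AI-generated" in main_function below.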
# Synonym replacement using spaCy for POS tagging and NLTK WordNet for synonym lookup
def replace_with_synonyms(text, probability=0.3):
    doc = nlp(text)
    # Map spaCy coarse POS tags to the corresponding WordNet POS constants
    pos_map = {"NOUN": wordnet.NOUN, "VERB": wordnet.VERB, "ADJ": wordnet.ADJ, "ADV": wordnet.ADV}
    new_text = []
    for token in doc:
        if random.random() < probability and token.pos_ in pos_map:
            # Gather distinct lemmas from all WordNet synsets of this word, excluding the word itself
            synonyms = {
                lemma.name().replace("_", " ")
                for synset in wordnet.synsets(token.text, pos=pos_map[token.pos_])
                for lemma in synset.lemmas()
                if lemma.name().lower() != token.text.lower()
            }
            if synonyms:
                new_text.append(random.choice(sorted(synonyms)))
            else:
                new_text.append(token.text)
        else:
            new_text.append(token.text)
    return " ".join(new_text)
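# Sketch of the underlying lookup (illustrative only, assuming the 'wordnet' corpus
# downloaded above is available):
#   candidates = {l.name() for s in wordnet.synsets("quick", pos=wordnet.ADJ) for l in s.lemmas()}
# yields synonym candidates such as "speedy", any of which may be substituted for "quick".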
# Random text transformations to simulate human-like errors
def random_capitalize(word):
    if word.isalpha() and random.random() < 0.1:
        return word.capitalize()
    return word
def random_remove_punctuation(text):
    if random.random() < 0.2:
        text = list(text)
        indices = [i for i, c in enumerate(text) if c in string.punctuation]
        if indices:
            remove_indices = random.sample(indices, min(3, len(indices)))
            for idx in sorted(remove_indices, reverse=True):
                text.pop(idx)
        return ''.join(text)
    return text
def random_double_period(text):
    if random.random() < 0.2:
        text = text.replace('.', '..', 3)
    return text
def random_double_space(text):
    if random.random() < 0.2:
        words = text.split()
        for _ in range(min(3, len(words) - 1)):
            idx = random.randint(0, len(words) - 2)
            words[idx] += ' '
        return ' '.join(words)
    return text
def random_replace_comma_space(text, period_replace_percentage=0.33):
    comma_occurrences = text.count(", ")
    period_occurrences = text.count(". ")
    replace_count_comma = max(1, comma_occurrences // 3)
    replace_count_period = max(1, period_occurrences // 3)
    comma_indices = [i for i in range(len(text)) if text.startswith(", ", i)]
    period_indices = [i for i in range(len(text)) if text.startswith(". ", i)]
    replace_indices_comma = random.sample(comma_indices, min(replace_count_comma, len(comma_indices)))
    replace_indices_period = random.sample(period_indices, min(replace_count_period, len(period_indices)))
    for idx in sorted(replace_indices_comma + replace_indices_period, reverse=True):
        if text.startswith(", ", idx):
            text = text[:idx] + " ," + text[idx + 2:]
        if text.startswith(". ", idx):
            text = text[:idx] + " ." + text[idx + 2:]
    return text
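# Illustrative effect on a hypothetical input: random_replace_comma_space("Hello, world. Bye.")
# returns "Hello ,world .Bye." here, since roughly a third (and at least one) of the
# ", " and ". " occurrences have the space moved in front of the punctuation mark.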
def transform_paragraph(paragraph):
    words = paragraph.split()
    if len(words) > 12:
        words = [random_capitalize(word) for word in words]
        transformed_paragraph = ' '.join(words)
        transformed_paragraph = random_remove_punctuation(transformed_paragraph)
        transformed_paragraph = random_double_period(transformed_paragraph)
        transformed_paragraph = random_double_space(transformed_paragraph)
        transformed_paragraph = random_replace_comma_space(transformed_paragraph)
        transformed_paragraph = replace_with_synonyms(transformed_paragraph)  # spaCy POS tags + WordNet synonyms
    else:
        transformed_paragraph = paragraph
    return transformed_paragraph
def transform_text(text):
    paragraphs = text.split('\n')
    transformed_paragraphs = [transform_paragraph(paragraph) for paragraph in paragraphs]
    return '\n'.join(transformed_paragraphs)
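# Illustrative example (not executed): paragraph boundaries are preserved, so
#   transform_text("A first paragraph that is comfortably longer than twelve words gets perturbed here.\nShort one.")
# only alters the first paragraph; the second stays untouched because it is under the
# 12-word threshold in transform_paragraph.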
# Humanize the AI-detected text using the SRDdev Paraphrase model with optimized parameters
def humanize_text(AI_text):
    paragraphs = AI_text.split("\n")
    paraphrased_paragraphs = []
    for paragraph in paragraphs:
        if paragraph.strip():
            inputs = paraphrase_tokenizer(paragraph, return_tensors="pt", max_length=512, truncation=True).to(device)
            paraphrased_ids = paraphrase_model.generate(
                inputs['input_ids'],
                max_length=inputs['input_ids'].shape[-1] + 20,
                num_beams=2,             # Reduced beam size for speed
                early_stopping=True,
                length_penalty=0.8,      # Lower penalty to generate faster
                no_repeat_ngram_size=2,  # Reduced for performance
                do_sample=True,          # Enable sampling to add randomness
                top_k=50,                # Top-k sampling
                top_p=0.95,              # Top-p (nucleus) sampling
            )
            paraphrased_text = paraphrase_tokenizer.decode(paraphrased_ids[0], skip_special_tokens=True)
            paraphrased_paragraphs.append(paraphrased_text)
    return "\n\n".join(paraphrased_paragraphs)
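# Illustrative usage (not executed at import time):
#   rewritten = humanize_text("Paragraph one about a topic.\nParagraph two about another topic.")
# Each non-empty input paragraph is paraphrased independently; note the output joins
# paragraphs with blank lines, so it can contain more newlines than the input.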
# Main function to handle the overall process with batch processing
def main_function(AI_text):
    sentences = nltk.sent_tokenize(AI_text)
    ai_probabilities = detect_ai_generated(sentences)
    # Guard against empty input so the percentage calculation cannot divide by zero
    if ai_probabilities:
        ai_generated_percentage = sum(1 for prob in ai_probabilities if prob > 0.5) / len(ai_probabilities) * 100
    else:
        ai_generated_percentage = 0.0
    # Transform the AI text to make it read more like human writing
    humanized_text = humanize_text(AI_text)
    humanized_text = transform_text(humanized_text)  # Add randomness to simulate human errors
    return f"AI-Generated Content: {ai_generated_percentage:.2f}%\n\nHumanized Text:\n{humanized_text}"
# Gradio interface definition
interface = gr.Interface(
    fn=main_function,
    inputs="textbox",
    outputs="textbox",
    title="AI Text Humanizer",
    description="Enter AI-generated text and get a more human-sounding version. This Space loads its models directly from the Hugging Face Hub."
)
# Launch the Gradio app
interface.launch(debug=True)