Spaces:

sashdev
/

humnifierai

Build error

App Files Files Community

Shujaat Ali committed on Aug 31, 2024

Commit

ea28e08

verified ·

1 Parent(s): 52b2b32

Update app.py

Browse files

Files changed (1) hide show

app.py +106 -45

app.py CHANGED Viewed

@@ -3,14 +3,15 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration
 import torch
 import nltk
 import spacy
-from nltk.corpus import wordnet
-import subprocess
 # Download NLTK data (if not already downloaded)
 nltk.download('punkt')
 nltk.download('stopwords')
-nltk.download('wordnet')  # Download WordNet
 # Download spaCy model if not already installed
 try:
@@ -30,39 +31,97 @@ model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-unca
 paraphrase_tokenizer = T5Tokenizer.from_pretrained("SRDdev/Paraphrase")
 paraphrase_model = T5ForConditionalGeneration.from_pretrained("SRDdev/Paraphrase").to(device)
-# Function to find synonyms using WordNet via NLTK
-def get_synonyms(word):
-    synonyms = set()
-    for syn in wordnet.synsets(word):
-        for lemma in syn.lemmas():
-            synonyms.add(lemma.name())
-    return list(synonyms)
-# Replace words with synonyms using spaCy and WordNet
-def replace_with_synonyms(text):
     doc = nlp(text)
-    processed_text = []
     for token in doc:
-        synonyms = get_synonyms(token.text.lower())
-        if synonyms and token.pos_ in {"NOUN", "VERB", "ADJ", "ADV"}:  # Only replace certain types of words
-            replacement = synonyms[0]  # Replace with the first synonym
-            if token.is_title:
-                replacement = replacement.capitalize()
-            processed_text.append(replacement)
         else:
-            processed_text.append(token.text)
-    return " ".join(processed_text)
-# AI detection function using DistilBERT
-def detect_ai_generated(text):
-    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
-    with torch.no_grad():
-        outputs = model(**inputs)
-    probabilities = torch.softmax(outputs.logits, dim=1)
-    ai_probability = probabilities[0][1].item()  # Probability of being AI-generated
-    return ai_probability
-# Humanize the AI-detected text using the SRDdev Paraphrase model
 def humanize_text(AI_text):
     paragraphs = AI_text.split("\n")
     paraphrased_paragraphs = []
@@ -71,36 +130,38 @@ def humanize_text(AI_text):
             inputs = paraphrase_tokenizer(paragraph, return_tensors="pt", max_length=512, truncation=True).to(device)
             paraphrased_ids = paraphrase_model.generate(
                 inputs['input_ids'],
-                max_length=inputs['input_ids'].shape[-1] + 20,  # Slightly more than the original input length
-                num_beams=4,
                 early_stopping=True,
-                length_penalty=1.0,
-                no_repeat_ngram_size=3,
             )
             paraphrased_text = paraphrase_tokenizer.decode(paraphrased_ids[0], skip_special_tokens=True)
             paraphrased_paragraphs.append(paraphrased_text)
     return "\n\n".join(paraphrased_paragraphs)
-# Main function to handle the overall process
 def main_function(AI_text):
-    # Replace words with synonyms
-    text_with_synonyms = replace_with_synonyms(AI_text)
-    # Detect AI-generated content
-    ai_probability = detect_ai_generated(text_with_synonyms)
-    # Humanize AI text
-    humanized_text = humanize_text(text_with_synonyms)
-    return f"AI-Generated Content: {ai_probability:.2f}%\n\nHumanized Text:\n{humanized_text}"
 # Gradio interface definition
 interface = gr.Interface(
     fn=main_function,
     inputs="textbox",
     outputs="textbox",
-    title="AI Text Humanizer with Synonym Replacement",
-    description="Enter AI-generated text and get a human-written version, with synonyms replaced for more natural output. This space uses models from Hugging Face directly."
 )
 # Launch the Gradio app

 from transformers import AutoTokenizer, AutoModelForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration
 import torch
 import nltk
+import random
+import string
 import spacy
+import subprocess  # Import subprocess for downloading spaCy models
 # Download NLTK data (if not already downloaded)
 nltk.download('punkt')
 nltk.download('stopwords')
+nltk.download('wordnet')  # Download WordNet for enhanced synonym lookup
 # Download spaCy model if not already installed
 try:
 paraphrase_tokenizer = T5Tokenizer.from_pretrained("SRDdev/Paraphrase")
 paraphrase_model = T5ForConditionalGeneration.from_pretrained("SRDdev/Paraphrase").to(device)
+# AI detection function using DistilBERT with batch processing
def detect_ai_generated(texts):
    """Score a batch of texts with the sequence-classification model.

    Args:
        texts: A string or a list/tuple of strings to classify.

    Returns:
        A list of floats, one per input text, holding the softmax
        probability of class index 1 (which this app treats as
        "AI-generated"). An empty batch yields an empty list.
    """
    # An empty batch has nothing to tokenize or pad; skip the model call.
    if isinstance(texts, (list, tuple)) and not texts:
        return []

    inputs = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True,  # pad to the longest text so the batch is rectangular
    ).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Column 1 of the softmax is the per-text probability reported to callers.
    return torch.softmax(logits, dim=1)[:, 1].cpu().tolist()
# Synonym replacement: spaCy for part-of-speech tags, WordNet for the lookup
def replace_with_synonyms(text, probability=0.3):
    """Randomly swap content words for WordNet synonyms.

    Each noun, verb, adjective or adverb (as tagged by spaCy) is replaced
    with probability ``probability`` by a randomly chosen WordNet synonym of
    the same part of speech. Tokens without a synonym are kept, and the
    original spacing between tokens is preserved.

    Args:
        text: The text to rewrite.
        probability: Chance, per eligible token, of attempting a replacement.

    Returns:
        The rewritten text. Empty text, or a non-positive ``probability``,
        returns ``text`` unchanged.
    """
    if not text or probability <= 0:
        return text

    # Local import: the file only imports the top-level ``nltk`` package.
    from nltk.corpus import wordnet

    # spaCy coarse POS tag -> WordNet POS constant.
    wordnet_pos = {
        "NOUN": wordnet.NOUN,
        "VERB": wordnet.VERB,
        "ADJ": wordnet.ADJ,
        "ADV": wordnet.ADV,
    }

    pieces = []
    for token in nlp(text):
        replacement = token.text
        pos = wordnet_pos.get(token.pos_)
        if pos is not None and random.random() < probability:
            original = token.text.lower()
            # Sorted so the candidate order does not depend on set hashing.
            candidates = sorted(
                {
                    lemma.name().replace("_", " ")  # multi-word lemmas use "_"
                    for synset in wordnet.synsets(original, pos=pos)
                    for lemma in synset.lemmas()
                    if lemma.name().lower() != original
                }
            )
            if candidates:
                replacement = random.choice(candidates)
                if token.is_title:
                    replacement = replacement.capitalize()
        # Re-attach the token's own trailing whitespace instead of joining
        # with " ", which would put spaces in front of punctuation.
        pieces.append(replacement + token.whitespace_)
    return "".join(pieces)
+# Random text transformations to simulate human-like errors
def random_capitalize(word, probability=0.1):
    """Occasionally upper-case the first letter of a purely alphabetic word.

    Args:
        word: A single word; anything containing non-letters is left alone.
        probability: Chance of capitalizing an eligible word.

    Returns:
        ``word`` with its first character upper-cased, or ``word`` unchanged.
    """
    if word.isalpha() and random.random() < probability:
        # Only touch the first character: str.capitalize() would also
        # lower-case the remainder and mangle words such as "NASA".
        return word[0].upper() + word[1:]
    return word
def random_remove_punctuation(text, probability=0.2, max_removed=3):
    """Occasionally drop a few randomly chosen punctuation characters.

    Args:
        text: The text to process.
        probability: Chance that any punctuation is removed at all.
        max_removed: Upper bound on the number of characters removed.

    Returns:
        ``text`` with up to ``max_removed`` ASCII punctuation characters
        (``string.punctuation``) deleted, or ``text`` unchanged.
    """
    if random.random() >= probability:
        return text

    positions = [i for i, ch in enumerate(text) if ch in string.punctuation]
    quota = max(0, min(max_removed, len(positions)))
    doomed = set(random.sample(positions, quota))
    # Rebuild in one pass rather than popping from a list repeatedly.
    return "".join(ch for i, ch in enumerate(text) if i not in doomed)
def random_double_period(text, probability=0.2, max_count=3):
    """Occasionally turn the first few sentence periods into "..".

    Periods that belong to an ellipsis ("...") or sit in front of a digit
    (as in "3.14") are skipped so they are not corrupted.

    Args:
        text: The text to process.
        probability: Chance that any period is doubled at all.
        max_count: Maximum number of periods to double, counted from the
            start of the text.

    Returns:
        The modified text, or ``text`` unchanged.
    """
    import re  # local import: only this helper needs it

    if random.random() >= probability or max_count <= 0:
        return text
    # A lone "." : not preceded by ".", not followed by "." or a digit.
    return re.sub(r"(?<!\.)\.(?![.\d])", "..", text, count=max_count)
def random_double_space(text, probability=0.2, max_count=3):
    """Occasionally widen a few gaps between words to a double space.

    Args:
        text: The text to process.
        probability: Chance that any gap is widened at all.
        max_count: Maximum number of distinct gaps to widen.

    Returns:
        The words of ``text`` re-joined with exactly two spaces in up to
        ``max_count`` randomly chosen gaps and one space elsewhere, or
        ``text`` unchanged when nothing is widened.
    """
    if random.random() >= probability:
        return text

    words = text.split()
    gaps = len(words) - 1
    if gaps < 1 or max_count <= 0:
        return text

    # Distinct gaps, so no gap is widened more than once.
    widened = set(random.sample(range(gaps), min(max_count, gaps)))
    parts = []
    for i, word in enumerate(words[:-1]):
        parts.append(word)
        parts.append("  " if i in widened else " ")
    parts.append(words[-1])
    return "".join(parts)
def random_replace_comma_space(text, period_replace_percentage=0.33):
    """Move the space in front of some commas and periods.

    A randomly chosen subset of ", " occurrences becomes " ," and a randomly
    chosen subset of ". " occurrences becomes " .". About one third of the
    commas are changed (at least one when any exist); the share of periods
    is set by ``period_replace_percentage``.

    Args:
        text: The text to process.
        period_replace_percentage: Fraction (0-1) of ". " occurrences to
            rewrite. Any positive value rewrites at least one occurrence;
            zero or less leaves periods alone.

    Returns:
        The text with the selected punctuation/space pairs swapped.
    """
    comma_positions = [i for i in range(len(text) - 1) if text.startswith(", ", i)]
    period_positions = [i for i in range(len(text) - 1) if text.startswith(". ", i)]

    comma_quota = min(len(comma_positions), max(1, len(comma_positions) // 3))
    if period_replace_percentage <= 0:
        period_quota = 0
    else:
        period_quota = min(
            len(period_positions),
            max(1, round(len(period_positions) * period_replace_percentage)),
        )

    chosen = random.sample(comma_positions, comma_quota)
    chosen += random.sample(period_positions, period_quota)

    # Each match is two characters (mark + space) and matches never overlap,
    # so swapping the pair in place rewrites "X " to " X".
    chars = list(text)
    for i in chosen:
        chars[i], chars[i + 1] = chars[i + 1], chars[i]
    return "".join(chars)
def transform_paragraph(paragraph):
    """Apply the random "human error" transformations to one paragraph.

    Paragraphs of twelve words or fewer are returned untouched. Longer ones
    have their words passed through ``random_capitalize`` and are then run,
    in order, through the punctuation, period, spacing, comma/period-space
    and synonym steps.
    """
    tokens = paragraph.split()
    # Guard clause: short paragraphs are left exactly as they came in.
    if len(tokens) <= 12:
        return paragraph

    result = ' '.join([random_capitalize(token) for token in tokens])
    pipeline = (
        random_remove_punctuation,
        random_double_period,
        random_double_space,
        random_replace_comma_space,
        replace_with_synonyms,  # synonym replacement runs last
    )
    for step in pipeline:
        result = step(result)
    return result
def transform_text(text):
    """Run ``transform_paragraph`` over every newline-separated paragraph.

    The text is split on single newlines, each piece is transformed on its
    own, and the pieces are joined back with single newlines.
    """
    return '\n'.join(map(transform_paragraph, text.split('\n')))
+# Humanize the AI-detected text using the SRDdev Paraphrase model with optimized parameters
 def humanize_text(AI_text):
     paragraphs = AI_text.split("\n")
     paraphrased_paragraphs = []
             inputs = paraphrase_tokenizer(paragraph, return_tensors="pt", max_length=512, truncation=True).to(device)
             paraphrased_ids = paraphrase_model.generate(
                 inputs['input_ids'],
+                max_length=inputs['input_ids'].shape[-1] + 20,
+                num_beams=2,  # Reduced beam size for speed
                 early_stopping=True,
+                length_penalty=0.8,  # Lower penalty to generate faster
+                no_repeat_ngram_size=2,  # Reduced for performance
+                do_sample=True,  # Enable sampling to add randomness
+                top_k=50,  # Top-k sampling
+                top_p=0.95,  # Top-p (nucleus) sampling
             )
             paraphrased_text = paraphrase_tokenizer.decode(paraphrased_ids[0], skip_special_tokens=True)
             paraphrased_paragraphs.append(paraphrased_text)
     return "\n\n".join(paraphrased_paragraphs)
+# Main function to handle the overall process with batch processing
 def main_function(AI_text):
+    sentences = nltk.sent_tokenize(AI_text)
+    ai_probabilities = detect_ai_generated(sentences)
+    ai_generated_percentage = sum([1 for prob in ai_probabilities if prob > 0.5]) / len(ai_probabilities) * 100
+    # Transform AI text to make it more human-like
+    humanized_text = humanize_text(AI_text)
+    humanized_text = transform_text(humanized_text)  # Add randomness to simulate human errors
+    return f"AI-Generated Content: {ai_generated_percentage:.2f}%\n\nHumanized Text:\n{humanized_text}"
 # Gradio interface definition
 interface = gr.Interface(
     fn=main_function,
     inputs="textbox",
     outputs="textbox",
+    title="AI Text Humanizer",
+    description="Enter AI-generated text and get a human-written version. This space uses models from Hugging Face directly."
 )
 # Launch the Gradio app