ii5 committed
Commit ea54f0b · verified · 1 Parent(s): 04bcb58

Upload 4 files

Files changed (4)
  1. app.py +147 -0
  2. requirements.txt +0 -0
  3. transformer/__init__.py +18 -0
  4. transformer/app.py +1100 -0
app.py ADDED
@@ -0,0 +1,147 @@
+"""
+Simple AI Text Humanizer using Gradio
+
+A clean text-to-text interface for humanizing AI-generated content.
+"""
+
+import gradio as gr
+
+from transformer.app import AdvancedAcademicTextHumanizer, download_nltk_resources
+
+# Global humanizer instance
+humanizer_instance = None
+
+def initialize_humanizer():
+    """Initialize the humanizer model."""
+    global humanizer_instance
+    if humanizer_instance is None:
+        try:
+            print("🔄 Downloading NLTK resources...")
+            # Download NLTK resources
+            download_nltk_resources()
+
+            print("🔄 Initializing lightweight models...")
+            # Initialize the humanizer with lightweight, fast settings
+            humanizer_instance = AdvancedAcademicTextHumanizer(
+                sentence_model="fast",    # Uses all-MiniLM-L6-v2 (lightweight)
+                paraphrase_model="fast",  # Uses t5-small (fast)
+                enable_advanced_models=True,
+                ai_avoidance_mode=True
+            )
+            print("✅ All models loaded successfully and ready!")
+            return "✅ Models loaded successfully"
+        except Exception as e:
+            error_msg = f"❌ Error loading models: {str(e)}"
+            print(error_msg)
+            return error_msg
+    return "✅ Models already loaded"
+
+def humanize_text(input_text: str, use_passive: bool, use_synonyms: bool, use_paraphrasing: bool) -> str:
+    """Transform AI text to human-like text."""
+    if not input_text.strip():
+        return "Please enter some text to transform."
+
+    global humanizer_instance
+    if humanizer_instance is None:
+        init_result = initialize_humanizer()
+        if "Error" in init_result:
+            return init_result
+
+    try:
+        # Transform the text
+        transformed = humanizer_instance.humanize_text(
+            input_text,
+            use_passive=use_passive,
+            use_synonyms=use_synonyms,
+            use_paraphrasing=use_paraphrasing
+        )
+        return transformed
+    except Exception as e:
+        return f"❌ Error during transformation: {str(e)}"
+
+def create_interface():
+    """Create the Gradio interface."""
+
+    with gr.Blocks(title="AI Text Humanizer", theme=gr.themes.Soft()) as interface:
+        gr.Markdown("# 🤖➡️🧔🏻‍♂️ AI Text Humanizer")
+        gr.Markdown("Transform AI-generated text into human-like content using advanced ML models.")
+
+        with gr.Row():
+            with gr.Column():
+                input_text = gr.Textbox(
+                    label="Input Text",
+                    placeholder="Paste your AI-generated text here...",
+                    lines=10,
+                    max_lines=20
+                )
+
+                with gr.Row():
+                    use_passive = gr.Checkbox(
+                        label="Passive Voice Transformation",
+                        value=False,
+                        info="Convert active voice to passive"
+                    )
+                    use_synonyms = gr.Checkbox(
+                        label="Synonym Replacement",
+                        value=True,
+                        info="AI-powered contextual synonyms"
+                    )
+                    use_paraphrasing = gr.Checkbox(
+                        label="Neural Paraphrasing",
+                        value=True,
+                        info="T5 sentence-level rewriting"
+                    )
+
+                transform_btn = gr.Button("🚀 Transform Text", variant="primary")
+
+            with gr.Column():
+                output_text = gr.Textbox(
+                    label="Transformed Text",
+                    lines=10,
+                    max_lines=20,
+                    interactive=False
+                )
+
+        # Model status, updated by the startup initialization below
+        gr.Markdown("### Model Status")
+        status_text = gr.Textbox(
+            label="Initialization Status",
+            value="Loading models...",
+            interactive=False
+        )
+
+        # Connect the transformation function
+        transform_btn.click(
+            fn=humanize_text,
+            inputs=[input_text, use_passive, use_synonyms, use_paraphrasing],
+            outputs=output_text
+        )
+
+        # Initialize models when the interface loads
+        interface.load(
+            fn=initialize_humanizer,
+            outputs=status_text
+        )
+
+        gr.Markdown("---")
+        gr.Markdown("**Note:** First-time model loading may take a few moments.")
+
+    return interface
+
+def main():
+    """Launch the Gradio interface."""
+    interface = create_interface()
+
+    # Launch bound to the local loopback interface on a fixed port
+    interface.launch(
+        server_name="127.0.0.1",
+        server_port=7860,
+        share=False,
+        debug=False,
+        show_error=True
+    )
+
+if __name__ == "__main__":
+    main()
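For reference, the humanizer in this commit can also be driven without the Gradio UI. A minimal sketch, assuming the Space's dependencies and the spaCy `en_core_web_sm` model are installed (the names below all come from the diff; the snippet itself is not part of the commit):

```python
# Programmatic use of the humanizer, bypassing the Gradio interface.
from transformer.app import AdvancedAcademicTextHumanizer, download_nltk_resources

download_nltk_resources()  # fetch punkt, wordnet, etc. on first run

humanizer = AdvancedAcademicTextHumanizer(
    sentence_model="fast",    # all-MiniLM-L6-v2, per LATEST_MODELS
    paraphrase_model="fast",  # t5-small, per LATEST_MODELS
)

result = humanizer.humanize_text(
    "The methodology demonstrates comprehensive results.",
    use_synonyms=True,
    use_paraphrasing=True,
)
print(result)
```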
requirements.txt ADDED
Binary file (1.93 kB).
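The file is uploaded as a binary blob, so its contents are not rendered here. Judging from the imports in app.py and transformer/app.py, it would need to cover at least the following packages (an inferred sketch, not the actual file; versions unknown):

```
gradio
nltk
spacy
torch
numpy
sentence-transformers
transformers
```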
 
transformer/__init__.py ADDED
@@ -0,0 +1,18 @@
+"""
+AI Text Humanizer Package
+
+A sophisticated text transformation system that converts AI-generated text
+into more human-like, academic writing while preserving formatting.
+"""
+
+__version__ = "2.0.0"
+__author__ = "AI Text Humanizer Team"
+__description__ = "Advanced text humanization with markdown preservation"
+
+from .app import AdvancedAcademicTextHumanizer, NLP_GLOBAL, download_nltk_resources
+
+__all__ = [
+    "AdvancedAcademicTextHumanizer",
+    "NLP_GLOBAL",
+    "download_nltk_resources"
+]
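Because the package root re-exports its public API, downstream code can import directly from `transformer`; a quick sketch of the intended import path (assumed usage, not part of the commit):

```python
# The package root re-exports the public API defined in transformer/app.py.
from transformer import AdvancedAcademicTextHumanizer, download_nltk_resources
```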
transformer/app.py ADDED
@@ -0,0 +1,1100 @@
+"""
+Advanced Academic Text Humanizer with State-of-the-Art ML Models
+
+This module provides cutting-edge text transformation capabilities using modern
+ML models for superior AI text humanization, including T5 paraphrasing, advanced
+sentence transformers, and AI detection avoidance techniques.
+"""
+
+import ssl
+import random
+import warnings
+import re
+import logging
+from typing import List, Dict, Optional, Union
+from dataclasses import dataclass
+
+import nltk
+import spacy
+import torch
+import numpy as np
+from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.corpus import wordnet
+from sentence_transformers import SentenceTransformer, util
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Suppress warnings
+warnings.filterwarnings("ignore", category=FutureWarning)
+warnings.filterwarnings("ignore", category=UserWarning)
+
+# Global models
+NLP_GLOBAL = None
+DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
+
+# Model configuration: quality tiers map to concrete Hugging Face model names
+LATEST_MODELS = {
+    'sentence_transformers': {
+        'premium': 'sentence-transformers/all-MiniLM-L12-v2',  # Lighter premium option
+        'balanced': 'sentence-transformers/all-MiniLM-L6-v2',  # Fast and reliable
+        'fast': 'sentence-transformers/all-MiniLM-L6-v2'       # Same as balanced for consistency
+    },
+    'paraphrasing': {
+        'premium': 'google-t5/t5-base',    # Much lighter than UL2
+        'balanced': 'google-t5/t5-small',  # Good balance
+        'fast': 'google-t5/t5-small'       # Fast and efficient
+    },
+    'text_generation': {
+        'premium': 'google-t5/t5-base',    # Much lighter than 70B models
+        'balanced': 'google-t5/t5-small',  # Small and fast
+        'fast': 'google-t5/t5-small'       # Consistent with balanced
+    }
+}
+
+def initialize_nlp():
+    """Initialize the global NLP model with enhanced capabilities."""
+    global NLP_GLOBAL
+    if NLP_GLOBAL is None:
+        try:
+            NLP_GLOBAL = spacy.load("en_core_web_sm")
+            logger.info("Successfully loaded spaCy model")
+        except Exception as e:
+            logger.error(f"Failed to load spaCy model: {e}")
+            raise
+
+# Initialize on import
+try:
+    initialize_nlp()
+except Exception as e:
+    logger.warning(f"Could not initialize NLP model: {e}")
+
+@dataclass
+class TextSegment:
+    """Enhanced text segment with additional metadata."""
+    content: str
+    segment_type: str  # 'text', 'empty', or a markdown pattern name (e.g. 'header', 'code_block', 'list_item')
+    line_number: int
+    preserve_formatting: bool = False
+    perplexity_score: float = 0.0
+    ai_probability: float = 0.0
+
+class AdvancedMarkdownPreserver:
+    """Enhanced markdown preservation with better pattern recognition."""
+
+    def __init__(self):
+        self.patterns = {
+            'code_block': re.compile(r'```[\s\S]*?```', re.MULTILINE),
+            'inline_code': re.compile(r'`[^`]+`'),
+            'header': re.compile(r'^#{1,6}\s+.*$', re.MULTILINE),
+            'list_item': re.compile(r'^\s*[-*+]\s+.*$', re.MULTILINE),
+            'numbered_list': re.compile(r'^\s*\d+\.\s+.*$', re.MULTILINE),
+            'link': re.compile(r'\[([^\]]+)\]\(([^)]+)\)'),
+            'bold': re.compile(r'\*\*([^*]+)\*\*'),
+            'italic': re.compile(r'\*([^*]+)\*'),
+            'blockquote': re.compile(r'^>\s+.*$', re.MULTILINE),
+            'horizontal_rule': re.compile(r'^---+$', re.MULTILINE),
+            'table_row': re.compile(r'^\s*\|.*\|\s*$', re.MULTILINE),
+            'latex_math': re.compile(r'\$\$.*?\$\$|\$.*?\$', re.DOTALL),
+            'footnote': re.compile(r'\[\^[^\]]+\]'),
+        }
+
+    def segment_text(self, text: str) -> List[TextSegment]:
+        """Segment text with enhanced analysis."""
+        segments = []
+        lines = text.split('\n')
+
+        for i, line in enumerate(lines):
+            segment_type = self._identify_line_type(line)
+            preserve = segment_type != 'text'
+
+            # Calculate perplexity and AI probability for text segments
+            perplexity = self._calculate_perplexity(line) if segment_type == 'text' else 0.0
+            ai_prob = self._calculate_ai_probability(line) if segment_type == 'text' else 0.0
+
+            segments.append(TextSegment(
+                content=line,
+                segment_type=segment_type,
+                line_number=i,
+                preserve_formatting=preserve,
+                perplexity_score=perplexity,
+                ai_probability=ai_prob
+            ))
+
+        return segments
+
+    def _identify_line_type(self, line: str) -> str:
+        """Enhanced line type identification."""
+        if not line.strip():
+            return 'empty'
+
+        for pattern_name, pattern in self.patterns.items():
+            if pattern.match(line):
+                return pattern_name
+
+        return 'text'
+
+    def _calculate_perplexity(self, text: str) -> float:
+        """Approximate text perplexity as an AI detection metric."""
+        if not text.strip():
+            return 0.0
+
+        words = word_tokenize(text.lower())
+        if len(words) < 3:
+            return 0.0
+
+        # Simple perplexity approximation based on word length patterns
+        word_lengths = [len(word) for word in words if word.isalpha()]
+        if not word_lengths:
+            return 0.0
+
+        avg_length = np.mean(word_lengths)
+        length_variance = np.var(word_lengths)
+
+        # AI text tends to have more consistent word lengths (lower variance)
+        perplexity = length_variance / (avg_length + 1e-6)
+        return min(perplexity, 10.0)  # Cap at 10
+
+    def _calculate_ai_probability(self, text: str) -> float:
+        """Estimate the probability that text is AI-generated from six heuristics."""
+        if not text.strip():
+            return 0.0
+
+        # Check for AI-typical patterns
+        ai_indicators = 0
+        total_checks = 6
+
+        # 1. Consistent sentence structure
+        sentences = sent_tokenize(text)
+        if len(sentences) > 1:
+            lengths = [len(sent.split()) for sent in sentences]
+            if np.std(lengths) < 3:  # Very consistent lengths
+                ai_indicators += 1
+
+        # 2. Overuse of transitional phrases
+        transitions = ['however', 'moreover', 'furthermore', 'additionally', 'consequently']
+        transition_count = sum(1 for trans in transitions if trans in text.lower())
+        if transition_count > len(sentences) * 0.3:
+            ai_indicators += 1
+
+        # 3. Lack of contractions
+        contractions = ["n't", "'ll", "'re", "'ve", "'d", "'m"]
+        if not any(cont in text for cont in contractions) and len(text.split()) > 10:
+            ai_indicators += 1
+
+        # 4. Overly formal language in casual contexts
+        formal_words = ['utilize', 'facilitate', 'demonstrate', 'implement', 'comprehensive']
+        formal_count = sum(1 for word in formal_words if word in text.lower())
+        if formal_count > len(text.split()) * 0.1:
+            ai_indicators += 1
+
+        # 5. No repeated punctuation (unusually clean text is rarely natural)
+        if len(text) > 50 and not re.search(r'[.]{2,}|[!]{2,}|[?]{2,}', text):
+            ai_indicators += 1
+
+        # 6. Repetitive phrasing patterns
+        words = text.lower().split()
+        if len(words) > 10:
+            unique_words = len(set(words))
+            if unique_words / len(words) < 0.6:  # Low lexical diversity
+                ai_indicators += 1
+
+        return ai_indicators / total_checks
+
+    def reconstruct_text(self, segments: List[TextSegment]) -> str:
+        """Reconstruct text from processed segments."""
+        return '\n'.join(segment.content for segment in segments)
+
+def download_nltk_resources():
+    """Download required NLTK resources with comprehensive coverage."""
+    # Work around SSL certificate issues when downloading NLTK data
+    try:
+        _create_unverified_https_context = ssl._create_unverified_context
+    except AttributeError:
+        pass
+    else:
+        ssl._create_default_https_context = _create_unverified_https_context
+
+    resources = [
+        'punkt', 'averaged_perceptron_tagger', 'punkt_tab',
+        'wordnet', 'averaged_perceptron_tagger_eng', 'stopwords',
+        'vader_lexicon', 'omw-1.4'
+    ]
+
+    for resource in resources:
+        try:
+            nltk.download(resource, quiet=True)
+            logger.info(f"Successfully downloaded {resource}")
+        except Exception as e:
+            logger.warning(f"Could not download {resource}: {str(e)}")
+
+class StateOfTheArtHumanizer:
+    """State-of-the-art humanizer built on lightweight, modern models."""
+
+    def __init__(
+        self,
+        sentence_model: str = 'fast',         # 🚀 FAST: MiniLM-L6-v2
+        paraphrase_model: str = 'fast',       # 🎯 FAST: T5-Small
+        text_generation_model: str = 'fast',  # 🔥 FAST: T5-Small
+        device: Optional[str] = None,
+        enable_advanced_models: bool = True,  # Always enabled for quality
+        model_quality: str = 'fast'           # 'premium', 'balanced', 'fast'
+    ):
+        """Initialize with the configured models."""
+        self.device = device or str(DEVICE)
+        self.enable_advanced_models = enable_advanced_models
+        self.model_quality = model_quality
+
+        # Map model quality to specific models
+        self.sentence_model_name = self._get_model_name('sentence_transformers', sentence_model)
+        self.paraphrase_model_name = self._get_model_name('paraphrasing', paraphrase_model)
+        self.text_gen_model_name = self._get_model_name('text_generation', text_generation_model)
+
+        # Initialize models
+        self.sentence_model = None
+        self.paraphrase_models = {}
+        self.text_gen_model = None
+
+        logger.info("🚀 Initializing SOTA Humanizer with:")
+        logger.info(f"   📊 Sentence Model: {self.sentence_model_name}")
+        logger.info(f"   🧠 Paraphrase Model: {self.paraphrase_model_name}")
+        logger.info(f"   🔥 Text Gen Model: {self.text_gen_model_name}")
+
+        self._initialize_models()
+
+    def _get_model_name(self, category: str, quality: str) -> str:
+        """Get the actual model name from the quality setting."""
+        if quality in LATEST_MODELS[category]:
+            return LATEST_MODELS[category][quality]
+        else:
+            # If a specific model name was provided, use it directly
+            return quality
+
+    def _initialize_models(self):
+        """Initialize all models with error handling."""
+        try:
+            # Initialize the sentence transformer
+            logger.info(f"🔄 Loading sentence model: {self.sentence_model_name}")
+            self.sentence_model = SentenceTransformer(self.sentence_model_name, device=self.device)
+            logger.info("✅ Sentence model loaded successfully")
+
+            # Initialize paraphrasing models
+            self._initialize_paraphrase_models(self.paraphrase_model_name)
+
+            # Initialize the text generation model (premium tier only)
+            if self.model_quality == 'premium' and self.enable_advanced_models:
+                self._initialize_text_generation_model()
+
+        except Exception as e:
+            logger.error(f"❌ Model initialization failed: {e}")
+            # Fall back to basic models
+            self._initialize_fallback_models()
+
+    def _initialize_fallback_models(self):
+        """Initialize fallback models if the configured ones fail."""
+        try:
+            logger.info("🔄 Falling back to reliable models...")
+            self.sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=self.device)
+            self._initialize_paraphrase_models('google-t5/t5-small')
+            logger.info("✅ Fallback models loaded successfully")
+        except Exception as e:
+            logger.error(f"❌ Even fallback models failed: {e}")
+
+    def _initialize_text_generation_model(self):
+        """Initialize the text generation model (with special handling for DeepSeek and Qwen checkpoints)."""
+        try:
+            if 'deepseek' in self.text_gen_model_name.lower():
+                logger.info(f"🚀 Loading DeepSeek model: {self.text_gen_model_name}")
+                # DeepSeek checkpoints require trust_remote_code
+                self.text_gen_tokenizer = AutoTokenizer.from_pretrained(self.text_gen_model_name)
+                self.text_gen_model = AutoModelForCausalLM.from_pretrained(
+                    self.text_gen_model_name,
+                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32,
+                    device_map='auto' if self.device != 'cpu' else None,
+                    trust_remote_code=True
+                )
+                logger.info("✅ DeepSeek model loaded successfully")
+
+            elif 'qwen' in self.text_gen_model_name.lower():
+                logger.info(f"🔥 Loading Qwen model: {self.text_gen_model_name}")
+                self.text_gen_tokenizer = AutoTokenizer.from_pretrained(self.text_gen_model_name)
+                self.text_gen_model = AutoModelForCausalLM.from_pretrained(
+                    self.text_gen_model_name,
+                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32,
+                    device_map='auto' if self.device != 'cpu' else None
+                )
+                logger.info("✅ Qwen model loaded successfully")
+
+            else:
+                # Use a pipeline for other models (e.g. the default T5 checkpoints)
+                self.text_gen_pipeline = pipeline(
+                    "text2text-generation",
+                    model=self.text_gen_model_name,
+                    device=self.device,
+                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32
+                )
+                logger.info("✅ Text generation pipeline loaded successfully")
+
+        except Exception as e:
+            logger.warning(f"⚠️ Advanced text generation model failed to load: {e}")
+            self.text_gen_model = None
+
+    def _initialize_paraphrase_models(self, model_name: str):
+        """Initialize paraphrasing models with enhanced capabilities."""
+        try:
+            if 'ul2' in model_name.lower():
+                # Special handling for UL2 models
+                logger.info(f"🏆 Loading UL2 model: {model_name}")
+                self.paraphrase_models['ul2'] = pipeline(
+                    "text2text-generation",
+                    model=model_name,
+                    device=self.device,
+                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32
+                )
+                logger.info("✅ UL2 model loaded successfully")
+
+            elif 'flan-t5' in model_name.lower():
+                # FLAN-T5 models
+                logger.info(f"🎯 Loading FLAN-T5 model: {model_name}")
+                self.paraphrase_models['flan_t5'] = pipeline(
+                    "text2text-generation",
+                    model=model_name,
+                    device=self.device,
+                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32
+                )
+                logger.info("✅ FLAN-T5 model loaded successfully")
+
+            else:
+                # Standard T5 models
+                self.paraphrase_models['t5'] = pipeline(
+                    "text2text-generation",
+                    model=model_name,
+                    device=self.device,
+                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32
+                )
+                logger.info("✅ T5 model loaded successfully")
+
+        except Exception as e:
+            logger.error(f"❌ Paraphrase model initialization failed: {e}")
+            raise
+
+    def paraphrase_sentence(self, sentence: str, model_type: str = 'auto') -> str:
+        """Advanced paraphrasing with the loaded models."""
+        if not sentence.strip() or len(sentence.split()) < 5:  # Skip very short sentences
+            return sentence
+
+        try:
+            # Choose the best available model
+            if model_type == 'auto':
+                if 'ul2' in self.paraphrase_models:
+                    model_type = 'ul2'
+                elif 'flan_t5' in self.paraphrase_models:
+                    model_type = 'flan_t5'
+                else:
+                    model_type = 't5'
+
+            model = self.paraphrase_models.get(model_type)
+            if not model:
+                return sentence
+
+            # Prepare the input based on model type - use simple, clean prompts
+            if model_type == 'ul2':
+                input_text = f"Rewrite: {sentence}"
+            elif model_type == 'flan_t5':
+                input_text = f"Rewrite this text: {sentence}"
+            else:
+                # Standard T5 - use the basic paraphrase prompt
+                input_text = f"paraphrase: {sentence}"
+
+            # Generate a paraphrase with conservative settings
+            result = model(
+                input_text,
+                max_length=min(len(sentence.split()) * 2 + 10, 100),  # Conservative length
+                min_length=max(3, len(sentence.split()) - 3),
+                do_sample=True,
+                temperature=0.6,  # Lower temperature for more conservative outputs
+                top_p=0.8,        # Lower top_p
+                num_return_sequences=1,
+                no_repeat_ngram_size=2,
+                repetition_penalty=1.1
+            )
+
+            paraphrased = result[0]['generated_text'].strip()
+
+            # Enhanced quality checks
+            if self._is_quality_paraphrase_enhanced(sentence, paraphrased):
+                return paraphrased
+            else:
+                return sentence
+
+        except Exception as e:
+            logger.warning(f"⚠️ Paraphrasing failed: {e}")
+            return sentence
+
+    def _is_quality_paraphrase_enhanced(self, original: str, paraphrase: str) -> bool:
+        """Enhanced quality check for paraphrases with stricter criteria."""
+        if not paraphrase or paraphrase.strip() == original.strip():
+            return False
+
+        # Check for editorial markers or foreign-language leakage
+        bad_markers = ['False:', 'Paraphrase:', 'True:', 'Note:', 'Edit:', '[', ']', 'Cette', 'loi', 'aux']
+        if any(marker in paraphrase for marker in bad_markers):
+            return False
+
+        # Check the length ratio (shouldn't be too different)
+        length_ratio = len(paraphrase) / len(original)
+        if length_ratio < 0.5 or length_ratio > 2.0:
+            return False
+
+        # Check for broken words or missing spaces
+        if any(len(word) > 20 for word in paraphrase.split()):  # Very long words indicate concatenation
+            return False
+
+        # Check semantic similarity if available
+        try:
+            if self.sentence_model:
+                embeddings = self.sentence_model.encode([original, paraphrase])
+                similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
+
+                # Stricter similarity thresholds
+                if 'minilm' in self.sentence_model_name.lower():
+                    return 0.7 <= similarity <= 0.95  # Good range for MiniLM
+                else:
+                    return 0.65 <= similarity <= 0.95
+
+            return True  # Fallback if no sentence model
+
+        except Exception as e:
+            logger.warning(f"⚠️ Quality check failed: {e}")
+            return False
+
+    def generate_with_latest_model(self, prompt: str, max_length: int = 150) -> str:
+        """Generate text using the loaded text generation model or pipeline."""
+        if self.text_gen_model is None and not hasattr(self, 'text_gen_pipeline'):
+            return prompt
+
+        try:
+            if hasattr(self, 'text_gen_tokenizer'):
+                # Direct model inference for DeepSeek/Qwen
+                inputs = self.text_gen_tokenizer.encode(prompt, return_tensors='pt')
+                inputs = inputs.to(self.text_gen_model.device)
+
+                with torch.no_grad():
+                    outputs = self.text_gen_model.generate(
+                        inputs,
+                        max_length=max_length,
+                        do_sample=True,
+                        temperature=0.7,
+                        top_p=0.9,
+                        pad_token_id=self.text_gen_tokenizer.eos_token_id
+                    )
+
+                generated = self.text_gen_tokenizer.decode(outputs[0], skip_special_tokens=True)
+                # Extract only the newly generated part
+                new_text = generated[len(prompt):].strip()
+                return prompt + " " + new_text if new_text else prompt
+
+            elif hasattr(self, 'text_gen_pipeline'):
+                # Pipeline inference
+                result = self.text_gen_pipeline(
+                    prompt,
+                    max_length=max_length,
+                    do_sample=True,
+                    temperature=0.7,
+                    top_p=0.9
+                )
+                return result[0]['generated_text']
+
+        except Exception as e:
+            logger.warning(f"⚠️ Text generation failed: {e}")
+            return prompt
+
+        return prompt
+
+    def _is_quality_paraphrase(self, original: str, paraphrase: str) -> bool:
+        """Quality check for paraphrases based on semantic similarity thresholds."""
+        if not paraphrase or paraphrase.strip() == original.strip():
+            return False
+
+        try:
+            # Check semantic similarity using the sentence model
+            if self.sentence_model:
+                embeddings = self.sentence_model.encode([original, paraphrase])
+                similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
+
+                # Different model families warrant different thresholds
+                if 'bge-m3' in self.sentence_model_name.lower():
+                    min_similarity = 0.7   # Higher threshold for BGE-M3
+                elif 'mpnet' in self.sentence_model_name.lower():
+                    min_similarity = 0.65  # Medium threshold for MPNet
+                else:
+                    min_similarity = 0.6   # Standard threshold
+
+                return similarity >= min_similarity
+
+            return True  # Fallback if no sentence model
+
+        except Exception as e:
+            logger.warning(f"⚠️ Quality check failed: {e}")
+            return True  # Conservative fallback
+
+    def enhance_with_advanced_synonyms(self, text: str) -> str:
+        """Enhanced synonym replacement using the sentence model for context."""
+        if not text.strip() or NLP_GLOBAL is None:
+            return text
+
+        try:
+            doc = NLP_GLOBAL(text)
+            enhanced_tokens = []
+
+            for token in doc:
+                # Be conservative with synonym replacement
+                if (token.is_alpha and not token.is_stop and
+                        len(token.text) > 4 and token.pos_ in ['NOUN', 'VERB', 'ADJ'] and
+                        not token.is_punct and token.lemma_.lower() not in ['say', 'get', 'make', 'take', 'come', 'go']):  # Avoid common verbs
+
+                    # Use contextual synonym selection with low probability
+                    if random.random() < 0.3:  # Only a 30% chance of replacement
+                        synonym = self._get_contextual_synonym_advanced(
+                            token.text, token.pos_, text, token.i
+                        )
+                        if synonym and len(synonym) <= len(token.text) + 3:  # Prevent very long replacements
+                            enhanced_tokens.append(synonym + token.whitespace_)
+                        else:
+                            enhanced_tokens.append(token.text_with_ws)
+                    else:
+                        enhanced_tokens.append(token.text_with_ws)
+                else:
+                    enhanced_tokens.append(token.text_with_ws)
+
+            result = ''.join(enhanced_tokens)
+
+            # Quality check: ensure the result is reasonable
+            if len(result) > len(text) * 1.5:  # Prevent text expansion beyond 150%
+                return text
+
+            return result
+
+        except Exception as e:
+            logger.warning(f"⚠️ Advanced synonym enhancement failed: {e}")
+            return text
+
+    def _get_contextual_synonym_advanced(self, word: str, pos: str, context: str, position: int) -> Optional[str]:
+        """Advanced contextual synonym selection using the sentence model."""
+        try:
+            # Get traditional synonyms first
+            synonyms = self._get_wordnet_synonyms(word, pos)
+
+            if not synonyms or not self.sentence_model:
+                return None
+
+            # Use the sentence model for context-aware selection
+            original_sentence = context
+            best_synonym = None
+            best_score = -1
+
+            for synonym in synonyms[:5]:  # Limit to the top 5 for efficiency
+                # Create a candidate sentence with the synonym
+                # (position is a spaCy token index, used here as an approximation
+                # of the whitespace-split word index)
+                words = context.split()
+                if position < len(words):
+                    words[position] = synonym
+                    candidate_sentence = ' '.join(words)
+
+                    # Calculate semantic similarity
+                    embeddings = self.sentence_model.encode([original_sentence, candidate_sentence])
+                    similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
+
+                    # We want high similarity but some variation
+                    if 'bge-m3' in self.sentence_model_name.lower():
+                        # BGE-M3 is more nuanced
+                        if 0.85 <= similarity <= 0.98 and similarity > best_score:
+                            best_score = similarity
+                            best_synonym = synonym
+                    else:
+                        # Standard models
+                        if 0.8 <= similarity <= 0.95 and similarity > best_score:
+                            best_score = similarity
+                            best_synonym = synonym
+
+            return best_synonym
+
+        except Exception as e:
+            logger.warning(f"⚠️ Advanced contextual synonym selection failed: {e}")
+            return None
+
+    def _get_wordnet_synonyms(self, word: str, pos: str) -> List[str]:
+        """Enhanced WordNet synonym extraction."""
+        try:
+            # Map spaCy POS to WordNet POS
+            pos_map = {
+                'NOUN': wordnet.NOUN,
+                'VERB': wordnet.VERB,
+                'ADJ': wordnet.ADJ,
+                'ADV': wordnet.ADV
+            }
+
+            wn_pos = pos_map.get(pos)
+            if not wn_pos:
+                return []
+
+            synonyms = set()
+            synsets = wordnet.synsets(word.lower(), pos=wn_pos)
+
+            for synset in synsets[:3]:  # Top 3 synsets
+                for lemma in synset.lemmas()[:4]:  # Top 4 lemmas per synset
+                    synonym = lemma.name().replace('_', ' ')
+                    if synonym.lower() != word.lower() and len(synonym) > 2:
+                        synonyms.add(synonym)
+
+            return list(synonyms)
+
+        except Exception as e:
+            logger.warning(f"⚠️ WordNet synonym extraction failed: {e}")
+            return []
+
+class AdvancedAcademicTextHumanizer:
+    """
+    Next-generation text humanizer with state-of-the-art ML models and
+    advanced AI detection avoidance techniques.
+    """
+
+    def __init__(
+        self,
+        sentence_model: str = 'fast',         # OPTIMIZED: Use fast models by default
+        paraphrase_model: str = 'fast',       # OPTIMIZED: Use fast models by default
+        p_passive: float = 0.05,              # REDUCED: Very conservative passive conversion
+        p_synonym_replacement: float = 0.15,  # REDUCED: Conservative synonym replacement
+        p_academic_transition: float = 0.10,  # REDUCED: Conservative transitions
+        p_paraphrase: float = 0.10,           # REDUCED: Conservative paraphrasing
+        seed: Optional[int] = None,
+        preserve_formatting: bool = True,
+        enable_advanced_models: bool = True,  # OPTIMIZED: Always enabled for quality
+        ai_avoidance_mode: bool = True        # OPTIMIZED: Always enabled for best results
+    ):
+        """
+        Initialize the advanced text humanizer with cutting-edge capabilities.
+        """
+        if seed is not None:
+            random.seed(seed)
+            np.random.seed(seed)
+            torch.manual_seed(seed)
+
+        self.nlp = NLP_GLOBAL
+        if self.nlp is None:
+            raise RuntimeError("spaCy model not initialized. Call initialize_nlp() first.")
+
+        # Initialize advanced models
+        self.advanced_humanizer = StateOfTheArtHumanizer(
+            sentence_model=sentence_model,
+            paraphrase_model=paraphrase_model,
+            enable_advanced_models=enable_advanced_models
+        )
+
+        # Transformation probabilities, clamped to [0, 1]
+        self.p_passive = max(0.0, min(1.0, p_passive))
+        self.p_synonym_replacement = max(0.0, min(1.0, p_synonym_replacement))
+        self.p_academic_transition = max(0.0, min(1.0, p_academic_transition))
+        self.p_paraphrase = max(0.0, min(1.0, p_paraphrase))
+
+        self.preserve_formatting = preserve_formatting
+        self.ai_avoidance_mode = ai_avoidance_mode
+        self.markdown_preserver = AdvancedMarkdownPreserver()
+
+        # Enhanced academic transitions with variety
+        self.academic_transitions = {
+            'addition': [
+                "Moreover,", "Additionally,", "Furthermore,", "In addition,",
+                "What's more,", "Beyond that,", "On top of that,", "Also worth noting,"
+            ],
+            'contrast': [
+                "However,", "Nevertheless,", "Nonetheless,", "Conversely,",
+                "On the contrary,", "In contrast,", "That said,", "Yet,"
+            ],
+            'causation': [
+                "Therefore,", "Consequently,", "Thus,", "Hence,",
+                "As a result,", "This leads to,", "It follows that,", "Accordingly,"
+            ],
+            'emphasis': [
+                "Notably,", "Significantly,", "Importantly,", "Remarkably,",
+                "It's worth emphasizing,", "Particularly noteworthy,", "Crucially,", "Indeed,"
+            ],
+            'sequence': [
+                "Subsequently,", "Following this,", "Thereafter,", "Next,",
+                "In the next phase,", "Moving forward,", "Then,", "Later on,"
+            ]
+        }
+
+        # Comprehensive contraction mapping (note: the "'s" rule is naive and
+        # will also expand possessives)
+        self.contraction_map = {
+            "n't": " not", "'re": " are", "'s": " is", "'ll": " will",
+            "'ve": " have", "'d": " would", "'m": " am", "'t": " not",
+            "won't": "will not", "can't": "cannot", "shouldn't": "should not",
+            "wouldn't": "would not", "couldn't": "could not", "mustn't": "must not",
+            "isn't": "is not", "aren't": "are not", "wasn't": "was not",
+            "weren't": "were not", "haven't": "have not", "hasn't": "has not",
+            "hadn't": "had not", "doesn't": "does not", "didn't": "did not",
+            "don't": "do not", "let's": "let us", "that's": "that is",
+            "there's": "there is", "here's": "here is", "what's": "what is",
+            "where's": "where is", "who's": "who is", "it's": "it is"
+        }
+
+    def humanize_text(
+        self,
+        text: str,
+        use_passive: bool = False,
+        use_synonyms: bool = False,
+        use_paraphrasing: bool = False,
+        preserve_paragraphs: bool = True
+    ) -> str:
+        """
+        Advanced text humanization with state-of-the-art techniques.
+        """
+        if not text or not text.strip():
+            return text
+
+        try:
+            if self.preserve_formatting:
+                return self._humanize_with_advanced_preservation(
+                    text, use_passive, use_synonyms, use_paraphrasing, preserve_paragraphs
+                )
+            else:
+                return self._humanize_advanced_simple(text, use_passive, use_synonyms, use_paraphrasing)
+        except Exception as e:
+            logger.error(f"Error during advanced text humanization: {e}")
+            return text
+
+    def _humanize_with_advanced_preservation(
+        self,
+        text: str,
+        use_passive: bool,
+        use_synonyms: bool,
+        use_paraphrasing: bool,
+        preserve_paragraphs: bool
+    ) -> str:
+        """Advanced humanization with comprehensive formatting preservation."""
+        segments = self.markdown_preserver.segment_text(text)
+
+        for segment in segments:
+            if segment.segment_type == 'text' and segment.content.strip():
+                # Apply AI detection avoidance if the segment looks AI-generated
+                if self.ai_avoidance_mode and segment.ai_probability > 0.6:
+                    segment.content = self._apply_ai_avoidance_techniques(
+                        segment.content, use_passive, use_synonyms, use_paraphrasing
+                    )
+                else:
+                    segment.content = self._transform_text_segment_advanced(
+                        segment.content, use_passive, use_synonyms, use_paraphrasing
+                    )
+
+        return self.markdown_preserver.reconstruct_text(segments)
+
+    def _apply_ai_avoidance_techniques(
+        self,
+        text: str,
+        use_passive: bool,
+        use_synonyms: bool,
+        use_paraphrasing: bool
+    ) -> str:
+        """Apply specialized techniques to avoid AI detection."""
+        try:
+            # 1. Add natural imperfections
+            text = self._add_natural_variations(text)
+
+            # 2. Increase sentence variety
+            text = self._vary_sentence_structure(text)
+
+            # 3. Reduce formal language density
+            text = self._reduce_formality(text)
+
+            # 4. Apply the standard transformations
+            text = self._transform_text_segment_advanced(
+                text, use_passive, use_synonyms, use_paraphrasing
+            )
+
+            return text
+        except Exception as e:
+            logger.warning(f"Error in AI avoidance: {e}")
+            return text
+
+    def _add_natural_variations(self, text: str) -> str:
+        """Add natural human-like variations."""
+        # Add occasional contractions to balance formality
+        if random.random() < 0.3:
+            formal_replacements = {
+                "do not": "don't", "will not": "won't", "cannot": "can't",
+                "should not": "shouldn't", "would not": "wouldn't"
+            }
+            for formal, contraction in formal_replacements.items():
+                if formal in text and random.random() < 0.4:
+                    text = text.replace(formal, contraction, 1)
+
+        return text
+
+    def _vary_sentence_structure(self, text: str) -> str:
+        """Increase sentence structure variety."""
+        sentences = sent_tokenize(text)
+        if len(sentences) < 2:
+            return text
+
+        varied_sentences = []
+        for i, sentence in enumerate(sentences):
+            if i > 0 and random.random() < 0.3:
+                # Occasionally start with different structures
+                starters = ["Well,", "Actually,", "Interestingly,", "To be clear,"]
+                if not any(sentence.startswith(starter) for starter in starters):
+                    starter = random.choice(starters)
+                    # Lowercase only the first character to keep proper nouns intact
+                    sentence = f"{starter} {sentence[0].lower()}{sentence[1:]}"
+
+            varied_sentences.append(sentence)
+
+        return ' '.join(varied_sentences)
+
+    def _reduce_formality(self, text: str) -> str:
+        """Reduce excessive formality to appear more human."""
+        # Replace overly formal words with more natural alternatives
+        formal_to_natural = {
+            'utilize': 'use', 'facilitate': 'help', 'demonstrate': 'show',
+            'implement': 'put in place', 'comprehensive': 'complete',
+            'methodology': 'method', 'substantial': 'large',
+            'numerous': 'many', 'acquire': 'get'
+        }
+
+        for formal, natural in formal_to_natural.items():
+            if formal in text.lower() and random.random() < 0.6:
+                text = re.sub(r'\b' + formal + r'\b', natural, text, flags=re.IGNORECASE)
+
+        return text
+
+    def _transform_text_segment_advanced(
+        self,
+        text: str,
+        use_passive: bool,
+        use_synonyms: bool,
+        use_paraphrasing: bool
+    ) -> str:
+        """Advanced text segment transformation with ML models."""
+        try:
+            doc = self.nlp(text)
+            transformed_sentences = []
+
+            for sent in doc.sents:
+                sentence_str = sent.text.strip()
+                if not sentence_str:
+                    continue
+
+                # 1. Expand contractions
+                sentence_str = self.expand_contractions_advanced(sentence_str)
+
+                # 2. Advanced paraphrasing
+                if use_paraphrasing and random.random() < self.p_paraphrase:
+                    paraphrased = self.advanced_humanizer.paraphrase_sentence(sentence_str)
+                    if paraphrased != sentence_str:
+                        sentence_str = paraphrased
+
+                # 3. Context-aware academic transitions
+                if random.random() < self.p_academic_transition:
+                    sentence_str = self.add_contextual_transitions(sentence_str)
+
+                # 4. Advanced passive voice conversion
+                if use_passive and random.random() < self.p_passive:
+                    sentence_str = self.convert_to_passive_advanced(sentence_str)
+
+                # 5. Enhanced contextual synonym replacement (delegated to the SOTA humanizer)
+                if use_synonyms and random.random() < self.p_synonym_replacement:
+                    sentence_str = self.advanced_humanizer.enhance_with_advanced_synonyms(sentence_str)
+
+                transformed_sentences.append(sentence_str)
+
+            result = ' '.join(transformed_sentences)
+            return result if result.strip() else text
+
+        except Exception as e:
+            logger.warning(f"Error in advanced transformation: {e}")
+            return text
+
+    def expand_contractions_advanced(self, sentence: str) -> str:
+        """Enhanced contraction expansion with better context handling."""
+        # Handle whole-word contractions with regex for better accuracy
+        for contraction, expansion in self.contraction_map.items():
+            if len(contraction) > 3:  # Full word contractions
+                pattern = r'\b' + re.escape(contraction) + r'\b'
+                sentence = re.sub(pattern, expansion, sentence, flags=re.IGNORECASE)
+
+        # Handle suffix contractions
+        tokens = word_tokenize(sentence)
+        expanded_tokens = []
+
+        for token in tokens:
+            original_case = token
+            lower_token = token.lower()
+            replaced = False
+
+            for contraction, expansion in self.contraction_map.items():
+                if (len(contraction) <= 3 and
+                        lower_token.endswith(contraction) and
+                        len(lower_token) > len(contraction)):
+
+                    base = lower_token[:-len(contraction)]
+                    new_token = base + expansion
+
+                    # Preserve the capitalization pattern
+                    if original_case[0].isupper():
+                        new_token = new_token[0].upper() + new_token[1:]
+
+                    expanded_tokens.append(new_token)
+                    replaced = True
+                    break
+
+            if not replaced:
+                expanded_tokens.append(token)
+
+        return ' '.join(expanded_tokens)
+
+    def add_contextual_transitions(self, sentence: str) -> str:
+        """Add contextually intelligent academic transitions."""
+        sentence_lower = sentence.lower()
+
+        # Enhanced context detection
+        context_patterns = {
+            'contrast': ['but', 'however', 'although', 'while', 'despite', 'whereas'],
+            'causation': ['because', 'since', 'therefore', 'so', 'due to', 'as a result'],
+            'addition': ['also', 'and', 'plus', 'including', 'along with'],
+            'emphasis': ['important', 'significant', 'notable', 'crucial', 'key'],
+            'sequence': ['first', 'second', 'then', 'next', 'finally', 'last']
+        }
+
+        # Determine the best transition type
+        best_type = 'addition'  # default
+        max_matches = 0
+
+        for transition_type, patterns in context_patterns.items():
+            matches = sum(1 for pattern in patterns if pattern in sentence_lower)
+            if matches > max_matches:
+                max_matches = matches
+                best_type = transition_type
+
+        # Select an appropriate transition
+        transition = random.choice(self.academic_transitions[best_type])
+
+        return f"{transition} {sentence}"
+
+    def convert_to_passive_advanced(self, sentence: str) -> str:
+        """Advanced passive voice conversion with better grammatical accuracy."""
+        try:
+            doc = self.nlp(sentence)
+
+            # Find suitable active voice patterns
+            for token in doc:
+                if (token.pos_ == 'VERB' and
+                        token.dep_ == 'ROOT' and
+                        token.tag_ in ['VBD', 'VBZ', 'VBP']):
+
+                    # Find the subject and object
+                    subj = None
+                    obj = None
+
+                    for child in token.children:
+                        if child.dep_ == 'nsubj':
+                            subj = child
+                        elif child.dep_ in ['dobj', 'pobj']:
+                            obj = child
+
+                    if subj and obj:
+                        # Create the passive transformation
+                        verb_base = token.lemma_
+
+                        # Choose the auxiliary verb
+                        aux = 'was' if subj.tag_ in ['NN', 'NNP'] else 'were'
+                        if token.tag_ in ['VBZ', 'VBP']:  # Present tense
+                            aux = 'is' if subj.tag_ in ['NN', 'NNP'] else 'are'
+
+                        # Create the past participle (simple heuristic)
+                        if verb_base.endswith('e'):
+                            past_participle = verb_base + 'd'
+                        elif verb_base in ['go', 'do', 'be', 'have']:
+                            # Irregular verbs
+                            irregular_map = {'go': 'gone', 'do': 'done', 'be': 'been', 'have': 'had'}
+                            past_participle = irregular_map.get(verb_base, verb_base + 'ed')
+                        else:
+                            past_participle = verb_base + 'ed'
+
+                        # Construct the passive phrase
+                        passive_phrase = f"{obj.text} {aux} {past_participle} by {subj.text}"
+
+                        # Replace in the original sentence
+                        original_phrase = f"{subj.text} {token.text} {obj.text}"
+                        if original_phrase in sentence:
+                            return sentence.replace(original_phrase, passive_phrase)
+
+            return sentence
+
+        except Exception as e:
+            logger.warning(f"Error in advanced passive conversion: {e}")
+            return sentence
+
+    def get_advanced_transformation_stats(self, original_text: str, transformed_text: str) -> Dict[str, Union[int, float]]:
+        """Get comprehensive transformation statistics with ML analysis."""
+        orig_tokens = word_tokenize(original_text)
+        trans_tokens = word_tokenize(transformed_text)
+        orig_sents = sent_tokenize(original_text)
+        trans_sents = sent_tokenize(transformed_text)
+
+        # Calculate basic metrics
+        stats = {
+            'original_word_count': len(orig_tokens),
+            'transformed_word_count': len(trans_tokens),
+            'original_sentence_count': len(orig_sents),
+            'transformed_sentence_count': len(trans_sents),
+            'word_change_ratio': len(trans_tokens) / len(orig_tokens) if orig_tokens else 0,
+            'sentence_change_ratio': len(trans_sents) / len(orig_sents) if orig_sents else 0,
+            'character_count_original': len(original_text),
+            'character_count_transformed': len(transformed_text),
+        }
+
+        # Add ML-based analysis
+        try:
+            # Semantic similarity
+            if hasattr(self, 'advanced_humanizer') and self.advanced_humanizer.sentence_model:
+                embeddings = self.advanced_humanizer.sentence_model.encode([original_text, transformed_text])
+                stats['semantic_similarity'] = float(util.cos_sim(embeddings[0], embeddings[1]).item())
+
+            # AI detection metrics
+            original_segments = self.markdown_preserver.segment_text(original_text)
+            transformed_segments = self.markdown_preserver.segment_text(transformed_text)
+
+            orig_ai_scores = [seg.ai_probability for seg in original_segments if seg.segment_type == 'text']
+            trans_ai_scores = [seg.ai_probability for seg in transformed_segments if seg.segment_type == 'text']
+
+            if orig_ai_scores and trans_ai_scores:
+                stats['original_ai_probability'] = np.mean(orig_ai_scores)
+                stats['transformed_ai_probability'] = np.mean(trans_ai_scores)
+                stats['ai_detection_improvement'] = stats['original_ai_probability'] - stats['transformed_ai_probability']
+
+        except Exception as e:
+            logger.warning(f"Error calculating advanced stats: {e}")
+
+        return stats
+
+    def _humanize_advanced_simple(self, text: str, use_passive: bool, use_synonyms: bool, use_paraphrasing: bool) -> str:
+        """Simple advanced transformation without formatting preservation."""
+        paragraphs = text.split('\n\n')
+        transformed_paragraphs = []
+
+        for paragraph in paragraphs:
+            if paragraph.strip():
+                transformed = self._transform_text_segment_advanced(
+                    paragraph, use_passive, use_synonyms, use_paraphrasing
+                )
+                transformed_paragraphs.append(transformed)
+            else:
+                transformed_paragraphs.append(paragraph)
+
+        return '\n\n'.join(transformed_paragraphs)
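The six checks in `_calculate_ai_probability` each contribute 1/6 to the score, and `get_advanced_transformation_stats` surfaces the before/after averages. A minimal sketch of reading those numbers (assuming the models load; output values will vary with sampling):

```python
# Sketch: inspect the heuristic AI-probability scores before and after humanizing.
from transformer.app import AdvancedAcademicTextHumanizer

humanizer = AdvancedAcademicTextHumanizer(seed=42)  # seed makes the random choices repeatable
original = ("Moreover, the methodology demonstrates comprehensive results. "
            "Furthermore, it does not utilize redundant components.")
transformed = humanizer.humanize_text(original, use_synonyms=True, use_paraphrasing=True)

stats = humanizer.get_advanced_transformation_stats(original, transformed)
print(stats.get("original_ai_probability"), stats.get("transformed_ai_probability"))
print(stats.get("semantic_similarity"))  # present when the sentence model loaded
```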