ii5 committed
Commit ea54f0b · verified · 1 Parent(s): 04bcb58

Upload 4 files

Files changed (4)
  1. app.py +147 -0
  2. requirements.txt +0 -0
  3. transformer/__init__.py +18 -0
  4. transformer/app.py +1100 -0
app.py ADDED
@@ -0,0 +1,147 @@
+"""
+Simple AI Text Humanizer using Gradio
+
+A clean text-to-text interface for humanizing AI-generated content.
+"""
+
+import gradio as gr
+
+from transformer.app import AdvancedAcademicTextHumanizer, download_nltk_resources
+
+# Global humanizer instance
+humanizer_instance = None
+
+def initialize_humanizer():
+    """Initialize the humanizer model."""
+    global humanizer_instance
+    if humanizer_instance is None:
+        try:
+            print("🔄 Downloading NLTK resources...")
+            # Download NLTK resources
+            download_nltk_resources()
+
+            print("🔄 Initializing lightweight models...")
+            # Initialize the humanizer with lightweight, fast settings
+            humanizer_instance = AdvancedAcademicTextHumanizer(
+                sentence_model="fast",    # Uses all-MiniLM-L6-v2 (lightweight)
+                paraphrase_model="fast",  # Uses t5-small (fast)
+                enable_advanced_models=True,
+                ai_avoidance_mode=True
+            )
+            print("✅ All models loaded successfully and ready!")
+            return "✅ Models loaded successfully"
+        except Exception as e:
+            error_msg = f"❌ Error loading models: {str(e)}"
+            print(error_msg)
+            return error_msg
+    return "✅ Models already loaded"
+
+def humanize_text(input_text: str, use_passive: bool, use_synonyms: bool, use_paraphrasing: bool) -> str:
+    """Transform AI text to human-like text."""
+    if not input_text.strip():
+        return "Please enter some text to transform."
+
+    global humanizer_instance
+    if humanizer_instance is None:
+        init_result = initialize_humanizer()
+        if "Error" in init_result:
+            return init_result
+
+    try:
+        # Transform the text
+        transformed = humanizer_instance.humanize_text(
+            input_text,
+            use_passive=use_passive,
+            use_synonyms=use_synonyms,
+            use_paraphrasing=use_paraphrasing
+        )
+        return transformed
+    except Exception as e:
+        return f"❌ Error during transformation: {str(e)}"
+
+def create_interface():
+    """Create the Gradio interface."""
+
+    with gr.Blocks(title="AI Text Humanizer", theme=gr.themes.Soft()) as interface:
+        gr.Markdown("# 🤖➡️🧔🏻‍♂️ AI Text Humanizer")
+        gr.Markdown("Transform AI-generated text into human-like content using advanced ML models.")
+
+        with gr.Row():
+            with gr.Column():
+                input_text = gr.Textbox(
+                    label="Input Text",
+                    placeholder="Paste your AI-generated text here...",
+                    lines=10,
+                    max_lines=20
+                )
+
+                with gr.Row():
+                    use_passive = gr.Checkbox(
+                        label="Passive Voice Transformation",
+                        value=False,
+                        info="Convert active voice to passive"
+                    )
+                    use_synonyms = gr.Checkbox(
+                        label="Synonym Replacement",
+                        value=True,
+                        info="AI-powered contextual synonyms"
+                    )
+                    use_paraphrasing = gr.Checkbox(
+                        label="Neural Paraphrasing",
+                        value=True,
+                        info="T5 sentence-level rewriting"
+                    )
+
+                transform_btn = gr.Button("🚀 Transform Text", variant="primary")
+
+            with gr.Column():
+                output_text = gr.Textbox(
+                    label="Transformed Text",
+                    lines=10,
+                    max_lines=20,
+                    interactive=False
+                )
+
+        # Model status, updated by the startup initialization below
+        gr.Markdown("### Model Status")
+        status_text = gr.Textbox(
+            label="Initialization Status",
+            value="Loading models...",
+            interactive=False
+        )
+
+        # Connect the transformation function
+        transform_btn.click(
+            fn=humanize_text,
+            inputs=[input_text, use_passive, use_synonyms, use_paraphrasing],
+            outputs=output_text
+        )
+
+        # Initialize models when the interface loads
+        interface.load(
+            fn=initialize_humanizer,
+            outputs=status_text
+        )
+
+        gr.Markdown("---")
+        gr.Markdown("**Note:** First-time model loading may take a few moments.")
+
+    return interface
+
+def main():
+    """Launch the Gradio interface."""
+    interface = create_interface()
+
+    # Launch bound to the local loopback interface on a fixed port
+    interface.launch(
+        server_name="127.0.0.1",
+        server_port=7860,
+        share=False,
+        debug=False,
+        show_error=True
+    )
+
+if __name__ == "__main__":
+    main()
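For reference, the humanizer in this commit can also be driven without the Gradio UI. A minimal sketch, assuming the Space's dependencies and the spaCy `en_core_web_sm` model are installed (the names below all come from the diff; the snippet itself is not part of the commit):

```python
# Programmatic use of the humanizer, bypassing the Gradio interface.
from transformer.app import AdvancedAcademicTextHumanizer, download_nltk_resources

download_nltk_resources()  # fetch punkt, wordnet, etc. on first run

humanizer = AdvancedAcademicTextHumanizer(
    sentence_model="fast",    # all-MiniLM-L6-v2, per LATEST_MODELS
    paraphrase_model="fast",  # t5-small, per LATEST_MODELS
)

result = humanizer.humanize_text(
    "The methodology demonstrates comprehensive results.",
    use_synonyms=True,
    use_paraphrasing=True,
)
print(result)
```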
requirements.txt ADDED
Binary file (1.93 kB).
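The file is uploaded as a binary blob, so its contents are not rendered here. Judging from the imports in app.py and transformer/app.py, it would need to cover at least the following packages (an inferred sketch, not the actual file; versions unknown):

```
gradio
nltk
spacy
torch
numpy
sentence-transformers
transformers
```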
 
transformer/__init__.py ADDED
@@ -0,0 +1,18 @@
+"""
+AI Text Humanizer Package
+
+A sophisticated text transformation system that converts AI-generated text
+into more human-like, academic writing while preserving formatting.
+"""
+
+__version__ = "2.0.0"
+__author__ = "AI Text Humanizer Team"
+__description__ = "Advanced text humanization with markdown preservation"
+
+from .app import AdvancedAcademicTextHumanizer, NLP_GLOBAL, download_nltk_resources
+
+__all__ = [
+    "AdvancedAcademicTextHumanizer",
+    "NLP_GLOBAL",
+    "download_nltk_resources"
+]
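Because the package root re-exports its public API, downstream code can import directly from `transformer`; a quick sketch of the intended import path (assumed usage, not part of the commit):

```python
# The package root re-exports the public API defined in transformer/app.py.
from transformer import AdvancedAcademicTextHumanizer, download_nltk_resources
```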
transformer/app.py ADDED
@@ -0,0 +1,1100 @@
+"""
+Advanced Academic Text Humanizer with State-of-the-Art ML Models
+
+This module provides cutting-edge text transformation capabilities using modern
+ML models for superior AI text humanization, including T5 paraphrasing, advanced
+sentence transformers, and AI detection avoidance techniques.
+"""
+
+import ssl
+import random
+import warnings
+import re
+import logging
+from typing import List, Dict, Optional, Union
+from dataclasses import dataclass
+
+import nltk
+import spacy
+import torch
+import numpy as np
+from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.corpus import wordnet
+from sentence_transformers import SentenceTransformer, util
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Suppress warnings
+warnings.filterwarnings("ignore", category=FutureWarning)
+warnings.filterwarnings("ignore", category=UserWarning)
+
+# Global models
+NLP_GLOBAL = None
+DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
+
+# Model configuration: quality tiers map to concrete Hugging Face model names
+LATEST_MODELS = {
+    'sentence_transformers': {
+        'premium': 'sentence-transformers/all-MiniLM-L12-v2',  # Lighter premium option
+        'balanced': 'sentence-transformers/all-MiniLM-L6-v2',  # Fast and reliable
+        'fast': 'sentence-transformers/all-MiniLM-L6-v2'       # Same as balanced for consistency
+    },
+    'paraphrasing': {
+        'premium': 'google-t5/t5-base',    # Much lighter than UL2
+        'balanced': 'google-t5/t5-small',  # Good balance
+        'fast': 'google-t5/t5-small'       # Fast and efficient
+    },
+    'text_generation': {
+        'premium': 'google-t5/t5-base',    # Much lighter than 70B models
+        'balanced': 'google-t5/t5-small',  # Small and fast
+        'fast': 'google-t5/t5-small'       # Consistent with balanced
+    }
+}
+
+def initialize_nlp():
+    """Initialize the global NLP model with enhanced capabilities."""
+    global NLP_GLOBAL
+    if NLP_GLOBAL is None:
+        try:
+            NLP_GLOBAL = spacy.load("en_core_web_sm")
+            logger.info("Successfully loaded spaCy model")
+        except Exception as e:
+            logger.error(f"Failed to load spaCy model: {e}")
+            raise
+
+# Initialize on import
+try:
+    initialize_nlp()
+except Exception as e:
+    logger.warning(f"Could not initialize NLP model: {e}")
+
+@dataclass
+class TextSegment:
+    """Enhanced text segment with additional metadata."""
+    content: str
+    segment_type: str  # 'text', 'empty', or a markdown pattern name (e.g. 'header', 'code_block', 'list_item')
+    line_number: int
+    preserve_formatting: bool = False
+    perplexity_score: float = 0.0
+    ai_probability: float = 0.0
+
+class AdvancedMarkdownPreserver:
+    """Enhanced markdown preservation with better pattern recognition."""
+
+    def __init__(self):
+        self.patterns = {
+            'code_block': re.compile(r'```[\s\S]*?```', re.MULTILINE),
+            'inline_code': re.compile(r'`[^`]+`'),
+            'header': re.compile(r'^#{1,6}\s+.*$', re.MULTILINE),
+            'list_item': re.compile(r'^\s*[-*+]\s+.*$', re.MULTILINE),
+            'numbered_list': re.compile(r'^\s*\d+\.\s+.*$', re.MULTILINE),
+            'link': re.compile(r'\[([^\]]+)\]\(([^)]+)\)'),
+            'bold': re.compile(r'\*\*([^*]+)\*\*'),
+            'italic': re.compile(r'\*([^*]+)\*'),
+            'blockquote': re.compile(r'^>\s+.*$', re.MULTILINE),
+            'horizontal_rule': re.compile(r'^---+$', re.MULTILINE),
+            'table_row': re.compile(r'^\s*\|.*\|\s*$', re.MULTILINE),
+            'latex_math': re.compile(r'\$\$.*?\$\$|\$.*?\$', re.DOTALL),
+            'footnote': re.compile(r'\[\^[^\]]+\]'),
+        }
+
+    def segment_text(self, text: str) -> List[TextSegment]:
+        """Segment text with enhanced analysis."""
+        segments = []
+        lines = text.split('\n')
+
+        for i, line in enumerate(lines):
+            segment_type = self._identify_line_type(line)
+            preserve = segment_type != 'text'
+
+            # Calculate perplexity and AI probability for text segments
+            perplexity = self._calculate_perplexity(line) if segment_type == 'text' else 0.0
+            ai_prob = self._calculate_ai_probability(line) if segment_type == 'text' else 0.0
+
+            segments.append(TextSegment(
+                content=line,
+                segment_type=segment_type,
+                line_number=i,
+                preserve_formatting=preserve,
+                perplexity_score=perplexity,
+                ai_probability=ai_prob
+            ))
+
+        return segments
+
+    def _identify_line_type(self, line: str) -> str:
+        """Enhanced line type identification."""
+        if not line.strip():
+            return 'empty'
+
+        for pattern_name, pattern in self.patterns.items():
+            if pattern.match(line):
+                return pattern_name
+
+        return 'text'
+
+    def _calculate_perplexity(self, text: str) -> float:
+        """Approximate text perplexity as an AI detection metric."""
+        if not text.strip():
+            return 0.0
+
+        words = word_tokenize(text.lower())
+        if len(words) < 3:
+            return 0.0
+
+        # Simple perplexity approximation based on word length patterns
+        word_lengths = [len(word) for word in words if word.isalpha()]
+        if not word_lengths:
+            return 0.0
+
+        avg_length = np.mean(word_lengths)
+        length_variance = np.var(word_lengths)
+
+        # AI text tends to have more consistent word lengths (lower variance)
+        perplexity = length_variance / (avg_length + 1e-6)
+        return min(perplexity, 10.0)  # Cap at 10
+
+    def _calculate_ai_probability(self, text: str) -> float:
+        """Estimate the probability that text is AI-generated from six heuristics."""
+        if not text.strip():
+            return 0.0
+
+        # Check for AI-typical patterns
+        ai_indicators = 0
+        total_checks = 6
+
+        # 1. Consistent sentence structure
+        sentences = sent_tokenize(text)
+        if len(sentences) > 1:
+            lengths = [len(sent.split()) for sent in sentences]
+            if np.std(lengths) < 3:  # Very consistent lengths
+                ai_indicators += 1
+
+        # 2. Overuse of transitional phrases
+        transitions = ['however', 'moreover', 'furthermore', 'additionally', 'consequently']
+        transition_count = sum(1 for trans in transitions if trans in text.lower())
+        if transition_count > len(sentences) * 0.3:
+            ai_indicators += 1
+
+        # 3. Lack of contractions
+        contractions = ["n't", "'ll", "'re", "'ve", "'d", "'m"]
+        if not any(cont in text for cont in contractions) and len(text.split()) > 10:
+            ai_indicators += 1
+
+        # 4. Overly formal language in casual contexts
+        formal_words = ['utilize', 'facilitate', 'demonstrate', 'implement', 'comprehensive']
+        formal_count = sum(1 for word in formal_words if word in text.lower())
+        if formal_count > len(text.split()) * 0.1:
+            ai_indicators += 1
+
+        # 5. No repeated punctuation (unusually clean text is rarely natural)
+        if len(text) > 50 and not re.search(r'[.]{2,}|[!]{2,}|[?]{2,}', text):
+            ai_indicators += 1
+
+        # 6. Repetitive phrasing patterns
+        words = text.lower().split()
+        if len(words) > 10:
+            unique_words = len(set(words))
+            if unique_words / len(words) < 0.6:  # Low lexical diversity
+                ai_indicators += 1
+
+        return ai_indicators / total_checks
+
+    def reconstruct_text(self, segments: List[TextSegment]) -> str:
+        """Reconstruct text from processed segments."""
+        return '\n'.join(segment.content for segment in segments)
+
+def download_nltk_resources():
+    """Download required NLTK resources with comprehensive coverage."""
+    # Work around SSL certificate issues when downloading NLTK data
+    try:
+        _create_unverified_https_context = ssl._create_unverified_context
+    except AttributeError:
+        pass
+    else:
+        ssl._create_default_https_context = _create_unverified_https_context
+
+    resources = [
+        'punkt', 'averaged_perceptron_tagger', 'punkt_tab',
+        'wordnet', 'averaged_perceptron_tagger_eng', 'stopwords',
+        'vader_lexicon', 'omw-1.4'
+    ]
+
+    for resource in resources:
+        try:
+            nltk.download(resource, quiet=True)
+            logger.info(f"Successfully downloaded {resource}")
+        except Exception as e:
+            logger.warning(f"Could not download {resource}: {str(e)}")
+
+class StateOfTheArtHumanizer:
+    """State-of-the-art humanizer built on lightweight, modern models."""
+
+    def __init__(
+        self,
+        sentence_model: str = 'fast',         # 🚀 FAST: MiniLM-L6-v2
+        paraphrase_model: str = 'fast',       # 🎯 FAST: T5-Small
+        text_generation_model: str = 'fast',  # 🔥 FAST: T5-Small
+        device: Optional[str] = None,
+        enable_advanced_models: bool = True,  # Always enabled for quality
+        model_quality: str = 'fast'           # 'premium', 'balanced', 'fast'
+    ):
+        """Initialize with the configured models."""
+        self.device = device or str(DEVICE)
+        self.enable_advanced_models = enable_advanced_models
+        self.model_quality = model_quality
+
+        # Map model quality to specific models
+        self.sentence_model_name = self._get_model_name('sentence_transformers', sentence_model)
+        self.paraphrase_model_name = self._get_model_name('paraphrasing', paraphrase_model)
+        self.text_gen_model_name = self._get_model_name('text_generation', text_generation_model)
+
+        # Initialize models
+        self.sentence_model = None
+        self.paraphrase_models = {}
+        self.text_gen_model = None
+
+        logger.info("🚀 Initializing SOTA Humanizer with:")
+        logger.info(f"   📊 Sentence Model: {self.sentence_model_name}")
+        logger.info(f"   🧠 Paraphrase Model: {self.paraphrase_model_name}")
+        logger.info(f"   🔥 Text Gen Model: {self.text_gen_model_name}")
+
+        self._initialize_models()
+
+    def _get_model_name(self, category: str, quality: str) -> str:
+        """Get the actual model name from the quality setting."""
+        if quality in LATEST_MODELS[category]:
+            return LATEST_MODELS[category][quality]
+        else:
+            # If a specific model name was provided, use it directly
+            return quality
+
+    def _initialize_models(self):
+        """Initialize all models with error handling."""
+        try:
+            # Initialize the sentence transformer
+            logger.info(f"🔄 Loading sentence model: {self.sentence_model_name}")
+            self.sentence_model = SentenceTransformer(self.sentence_model_name, device=self.device)
+            logger.info("✅ Sentence model loaded successfully")
+
+            # Initialize paraphrasing models
+            self._initialize_paraphrase_models(self.paraphrase_model_name)
+
+            # Initialize the text generation model (premium tier only)
+            if self.model_quality == 'premium' and self.enable_advanced_models:
+                self._initialize_text_generation_model()
+
+        except Exception as e:
+            logger.error(f"❌ Model initialization failed: {e}")
+            # Fall back to basic models
+            self._initialize_fallback_models()
+
+    def _initialize_fallback_models(self):
+        """Initialize fallback models if the configured ones fail."""
+        try:
+            logger.info("🔄 Falling back to reliable models...")
+            self.sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=self.device)
+            self._initialize_paraphrase_models('google-t5/t5-small')
+            logger.info("✅ Fallback models loaded successfully")
+        except Exception as e:
+            logger.error(f"❌ Even fallback models failed: {e}")
+
+    def _initialize_text_generation_model(self):
+        """Initialize the text generation model (with special handling for DeepSeek and Qwen checkpoints)."""
+        try:
+            if 'deepseek' in self.text_gen_model_name.lower():
+                logger.info(f"🚀 Loading DeepSeek model: {self.text_gen_model_name}")
+                # DeepSeek checkpoints require trust_remote_code
+                self.text_gen_tokenizer = AutoTokenizer.from_pretrained(self.text_gen_model_name)
+                self.text_gen_model = AutoModelForCausalLM.from_pretrained(
+                    self.text_gen_model_name,
+                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32,
+                    device_map='auto' if self.device != 'cpu' else None,
+                    trust_remote_code=True
+                )
+                logger.info("✅ DeepSeek model loaded successfully")
+
+            elif 'qwen' in self.text_gen_model_name.lower():
+                logger.info(f"🔥 Loading Qwen model: {self.text_gen_model_name}")
+                self.text_gen_tokenizer = AutoTokenizer.from_pretrained(self.text_gen_model_name)
+                self.text_gen_model = AutoModelForCausalLM.from_pretrained(
+                    self.text_gen_model_name,
+                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32,
+                    device_map='auto' if self.device != 'cpu' else None
+                )
+                logger.info("✅ Qwen model loaded successfully")
+
+            else:
+                # Use a pipeline for other models (e.g. the default T5 checkpoints)
+                self.text_gen_pipeline = pipeline(
+                    "text2text-generation",
+                    model=self.text_gen_model_name,
+                    device=self.device,
+                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32
+                )
+                logger.info("✅ Text generation pipeline loaded successfully")
+
+        except Exception as e:
+            logger.warning(f"⚠️ Advanced text generation model failed to load: {e}")
+            self.text_gen_model = None
+
+    def _initialize_paraphrase_models(self, model_name: str):
+        """Initialize paraphrasing models with enhanced capabilities."""
+        try:
+            if 'ul2' in model_name.lower():
+                # Special handling for UL2 models
+                logger.info(f"🏆 Loading UL2 model: {model_name}")
+                self.paraphrase_models['ul2'] = pipeline(
+                    "text2text-generation",
+                    model=model_name,
+                    device=self.device,
+                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32
+                )
+                logger.info("✅ UL2 model loaded successfully")
+
+            elif 'flan-t5' in model_name.lower():
+                # FLAN-T5 models
+                logger.info(f"🎯 Loading FLAN-T5 model: {model_name}")
+                self.paraphrase_models['flan_t5'] = pipeline(
+                    "text2text-generation",
+                    model=model_name,
+                    device=self.device,
+                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32
+                )
+                logger.info("✅ FLAN-T5 model loaded successfully")
+
+            else:
+                # Standard T5 models
+                self.paraphrase_models['t5'] = pipeline(
+                    "text2text-generation",
+                    model=model_name,
+                    device=self.device,
+                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32
+                )
+                logger.info("✅ T5 model loaded successfully")
+
+        except Exception as e:
+            logger.error(f"❌ Paraphrase model initialization failed: {e}")
+            raise
+
+    def paraphrase_sentence(self, sentence: str, model_type: str = 'auto') -> str:
+        """Advanced paraphrasing with the loaded models."""
+        if not sentence.strip() or len(sentence.split()) < 5:  # Skip very short sentences
+            return sentence
+
+        try:
+            # Choose the best available model
+            if model_type == 'auto':
+                if 'ul2' in self.paraphrase_models:
+                    model_type = 'ul2'
+                elif 'flan_t5' in self.paraphrase_models:
+                    model_type = 'flan_t5'
+                else:
+                    model_type = 't5'
+
+            model = self.paraphrase_models.get(model_type)
+            if not model:
+                return sentence
+
+            # Prepare the input based on model type - use simple, clean prompts
+            if model_type == 'ul2':
+                input_text = f"Rewrite: {sentence}"
+            elif model_type == 'flan_t5':
+                input_text = f"Rewrite this text: {sentence}"
+            else:
+                # Standard T5 - use the basic paraphrase prompt
+                input_text = f"paraphrase: {sentence}"
+
+            # Generate a paraphrase with conservative settings
+            result = model(
+                input_text,
+                max_length=min(len(sentence.split()) * 2 + 10, 100),  # Conservative length
+                min_length=max(3, len(sentence.split()) - 3),
+                do_sample=True,
+                temperature=0.6,  # Lower temperature for more conservative outputs
+                top_p=0.8,        # Lower top_p
+                num_return_sequences=1,
+                no_repeat_ngram_size=2,
+                repetition_penalty=1.1
+            )
+
+            paraphrased = result[0]['generated_text'].strip()
+
+            # Enhanced quality checks
+            if self._is_quality_paraphrase_enhanced(sentence, paraphrased):
+                return paraphrased
+            else:
+                return sentence
+
+        except Exception as e:
+            logger.warning(f"⚠️ Paraphrasing failed: {e}")
+            return sentence
+
+    def _is_quality_paraphrase_enhanced(self, original: str, paraphrase: str) -> bool:
+        """Enhanced quality check for paraphrases with stricter criteria."""
+        if not paraphrase or paraphrase.strip() == original.strip():
+            return False
+
+        # Check for editorial markers or foreign-language leakage
+        bad_markers = ['False:', 'Paraphrase:', 'True:', 'Note:', 'Edit:', '[', ']', 'Cette', 'loi', 'aux']
+        if any(marker in paraphrase for marker in bad_markers):
+            return False
+
+        # Check the length ratio (shouldn't be too different)
+        length_ratio = len(paraphrase) / len(original)
+        if length_ratio < 0.5 or length_ratio > 2.0:
+            return False
+
+        # Check for broken words or missing spaces
+        if any(len(word) > 20 for word in paraphrase.split()):  # Very long words indicate concatenation
+            return False
+
+        # Check semantic similarity if available
+        try:
+            if self.sentence_model:
+                embeddings = self.sentence_model.encode([original, paraphrase])
+                similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
+
+                # Stricter similarity thresholds
+                if 'minilm' in self.sentence_model_name.lower():
+                    return 0.7 <= similarity <= 0.95  # Good range for MiniLM
+                else:
+                    return 0.65 <= similarity <= 0.95
+
+            return True  # Fallback if no sentence model
+
+        except Exception as e:
+            logger.warning(f"⚠️ Quality check failed: {e}")
+            return False
+
+    def generate_with_latest_model(self, prompt: str, max_length: int = 150) -> str:
+        """Generate text using the loaded text generation model or pipeline."""
+        if self.text_gen_model is None and not hasattr(self, 'text_gen_pipeline'):
+            return prompt
+
+        try:
+            if hasattr(self, 'text_gen_tokenizer'):
+                # Direct model inference for DeepSeek/Qwen
+                inputs = self.text_gen_tokenizer.encode(prompt, return_tensors='pt')
+                inputs = inputs.to(self.text_gen_model.device)
+
+                with torch.no_grad():
+                    outputs = self.text_gen_model.generate(
+                        inputs,
+                        max_length=max_length,
+                        do_sample=True,
+                        temperature=0.7,
+                        top_p=0.9,
+                        pad_token_id=self.text_gen_tokenizer.eos_token_id
+                    )
+
+                generated = self.text_gen_tokenizer.decode(outputs[0], skip_special_tokens=True)
+                # Extract only the newly generated part
+                new_text = generated[len(prompt):].strip()
+                return prompt + " " + new_text if new_text else prompt
+
+            elif hasattr(self, 'text_gen_pipeline'):
+                # Pipeline inference
+                result = self.text_gen_pipeline(
+                    prompt,
+                    max_length=max_length,
+                    do_sample=True,
+                    temperature=0.7,
+                    top_p=0.9
+                )
+                return result[0]['generated_text']
+
+        except Exception as e:
+            logger.warning(f"⚠️ Text generation failed: {e}")
+            return prompt
+
+        return prompt
+
+    def _is_quality_paraphrase(self, original: str, paraphrase: str) -> bool:
+        """Quality check for paraphrases based on semantic similarity thresholds."""
+        if not paraphrase or paraphrase.strip() == original.strip():
+            return False
+
+        try:
+            # Check semantic similarity using the sentence model
+            if self.sentence_model:
+                embeddings = self.sentence_model.encode([original, paraphrase])
+                similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
+
+                # Different model families warrant different thresholds
+                if 'bge-m3' in self.sentence_model_name.lower():
+                    min_similarity = 0.7   # Higher threshold for BGE-M3
+                elif 'mpnet' in self.sentence_model_name.lower():
+                    min_similarity = 0.65  # Medium threshold for MPNet
+                else:
+                    min_similarity = 0.6   # Standard threshold
+
+                return similarity >= min_similarity
+
+            return True  # Fallback if no sentence model
+
+        except Exception as e:
+            logger.warning(f"⚠️ Quality check failed: {e}")
+            return True  # Conservative fallback
+
+    def enhance_with_advanced_synonyms(self, text: str) -> str:
+        """Enhanced synonym replacement using the sentence model for context."""
+        if not text.strip() or NLP_GLOBAL is None:
+            return text
+
+        try:
+            doc = NLP_GLOBAL(text)
+            enhanced_tokens = []
+
+            for token in doc:
+                # Be conservative with synonym replacement
+                if (token.is_alpha and not token.is_stop and
+                        len(token.text) > 4 and token.pos_ in ['NOUN', 'VERB', 'ADJ'] and
+                        not token.is_punct and token.lemma_.lower() not in ['say', 'get', 'make', 'take', 'come', 'go']):  # Avoid common verbs
+
+                    # Use contextual synonym selection with low probability
+                    if random.random() < 0.3:  # Only a 30% chance of replacement
+                        synonym = self._get_contextual_synonym_advanced(
+                            token.text, token.pos_, text, token.i
+                        )
+                        if synonym and len(synonym) <= len(token.text) + 3:  # Prevent very long replacements
+                            enhanced_tokens.append(synonym + token.whitespace_)
+                        else:
+                            enhanced_tokens.append(token.text_with_ws)
+                    else:
+                        enhanced_tokens.append(token.text_with_ws)
+                else:
+                    enhanced_tokens.append(token.text_with_ws)
+
+            result = ''.join(enhanced_tokens)
+
+            # Quality check: ensure the result is reasonable
+            if len(result) > len(text) * 1.5:  # Prevent text expansion beyond 150%
+                return text
+
+            return result
+
+        except Exception as e:
+            logger.warning(f"⚠️ Advanced synonym enhancement failed: {e}")
+            return text
+
+    def _get_contextual_synonym_advanced(self, word: str, pos: str, context: str, position: int) -> Optional[str]:
+        """Advanced contextual synonym selection using the sentence model."""
+        try:
+            # Get traditional synonyms first
+            synonyms = self._get_wordnet_synonyms(word, pos)
+
+            if not synonyms or not self.sentence_model:
+                return None
+
+            # Use the sentence model for context-aware selection
+            original_sentence = context
+            best_synonym = None
+            best_score = -1
+
+            for synonym in synonyms[:5]:  # Limit to the top 5 for efficiency
+                # Create a candidate sentence with the synonym
+                # (position is a spaCy token index, used here as an approximation
+                # of the whitespace-split word index)
+                words = context.split()
+                if position < len(words):
+                    words[position] = synonym
+                    candidate_sentence = ' '.join(words)
+
+                    # Calculate semantic similarity
+                    embeddings = self.sentence_model.encode([original_sentence, candidate_sentence])
+                    similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
+
+                    # We want high similarity but some variation
+                    if 'bge-m3' in self.sentence_model_name.lower():
+                        # BGE-M3 is more nuanced
+                        if 0.85 <= similarity <= 0.98 and similarity > best_score:
+                            best_score = similarity
+                            best_synonym = synonym
+                    else:
+                        # Standard models
+                        if 0.8 <= similarity <= 0.95 and similarity > best_score:
+                            best_score = similarity
+                            best_synonym = synonym
+
+            return best_synonym
+
+        except Exception as e:
+            logger.warning(f"⚠️ Advanced contextual synonym selection failed: {e}")
+            return None
+
+    def _get_wordnet_synonyms(self, word: str, pos: str) -> List[str]:
+        """Enhanced WordNet synonym extraction."""
+        try:
+            # Map spaCy POS to WordNet POS
+            pos_map = {
+                'NOUN': wordnet.NOUN,
+                'VERB': wordnet.VERB,
+                'ADJ': wordnet.ADJ,
+                'ADV': wordnet.ADV
+            }
+
+            wn_pos = pos_map.get(pos)
+            if not wn_pos:
+                return []
+
+            synonyms = set()
+            synsets = wordnet.synsets(word.lower(), pos=wn_pos)
+
+            for synset in synsets[:3]:  # Top 3 synsets
+                for lemma in synset.lemmas()[:4]:  # Top 4 lemmas per synset
+                    synonym = lemma.name().replace('_', ' ')
+                    if synonym.lower() != word.lower() and len(synonym) > 2:
+                        synonyms.add(synonym)
+
+            return list(synonyms)
+
+        except Exception as e:
+            logger.warning(f"⚠️ WordNet synonym extraction failed: {e}")
+            return []
+
+class AdvancedAcademicTextHumanizer:
+    """
+    Next-generation text humanizer with state-of-the-art ML models and
+    advanced AI detection avoidance techniques.
+    """
+
+    def __init__(
+        self,
+        sentence_model: str = 'fast',         # OPTIMIZED: Use fast models by default
+        paraphrase_model: str = 'fast',       # OPTIMIZED: Use fast models by default
+        p_passive: float = 0.05,              # REDUCED: Very conservative passive conversion
+        p_synonym_replacement: float = 0.15,  # REDUCED: Conservative synonym replacement
+        p_academic_transition: float = 0.10,  # REDUCED: Conservative transitions
+        p_paraphrase: float = 0.10,           # REDUCED: Conservative paraphrasing
+        seed: Optional[int] = None,
+        preserve_formatting: bool = True,
+        enable_advanced_models: bool = True,  # OPTIMIZED: Always enabled for quality
+        ai_avoidance_mode: bool = True        # OPTIMIZED: Always enabled for best results
+    ):
+        """
+        Initialize the advanced text humanizer with cutting-edge capabilities.
+        """
+        if seed is not None:
+            random.seed(seed)
+            np.random.seed(seed)
+            torch.manual_seed(seed)
+
+        self.nlp = NLP_GLOBAL
+        if self.nlp is None:
+            raise RuntimeError("spaCy model not initialized. Call initialize_nlp() first.")
+
+        # Initialize advanced models
+        self.advanced_humanizer = StateOfTheArtHumanizer(
+            sentence_model=sentence_model,
+            paraphrase_model=paraphrase_model,
+            enable_advanced_models=enable_advanced_models
+        )
+
+        # Transformation probabilities, clamped to [0, 1]
+        self.p_passive = max(0.0, min(1.0, p_passive))
+        self.p_synonym_replacement = max(0.0, min(1.0, p_synonym_replacement))
+        self.p_academic_transition = max(0.0, min(1.0, p_academic_transition))
+        self.p_paraphrase = max(0.0, min(1.0, p_paraphrase))
+
+        self.preserve_formatting = preserve_formatting
+        self.ai_avoidance_mode = ai_avoidance_mode
+        self.markdown_preserver = AdvancedMarkdownPreserver()
+
+        # Enhanced academic transitions with variety
+        self.academic_transitions = {
+            'addition': [
+                "Moreover,", "Additionally,", "Furthermore,", "In addition,",
+                "What's more,", "Beyond that,", "On top of that,", "Also worth noting,"
+            ],
+            'contrast': [
+                "However,", "Nevertheless,", "Nonetheless,", "Conversely,",
+                "On the contrary,", "In contrast,", "That said,", "Yet,"
+            ],
+            'causation': [
+                "Therefore,", "Consequently,", "Thus,", "Hence,",
+                "As a result,", "This leads to,", "It follows that,", "Accordingly,"
+            ],
+            'emphasis': [
+                "Notably,", "Significantly,", "Importantly,", "Remarkably,",
+                "It's worth emphasizing,", "Particularly noteworthy,", "Crucially,", "Indeed,"
+            ],
+            'sequence': [
+                "Subsequently,", "Following this,", "Thereafter,", "Next,",
+                "In the next phase,", "Moving forward,", "Then,", "Later on,"
+            ]
+        }
+
+        # Comprehensive contraction mapping (note: the "'s" rule is naive and
+        # will also expand possessives)
+        self.contraction_map = {
+            "n't": " not", "'re": " are", "'s": " is", "'ll": " will",
+            "'ve": " have", "'d": " would", "'m": " am", "'t": " not",
+            "won't": "will not", "can't": "cannot", "shouldn't": "should not",
+            "wouldn't": "would not", "couldn't": "could not", "mustn't": "must not",
+            "isn't": "is not", "aren't": "are not", "wasn't": "was not",
+            "weren't": "were not", "haven't": "have not", "hasn't": "has not",
+            "hadn't": "had not", "doesn't": "does not", "didn't": "did not",
+            "don't": "do not", "let's": "let us", "that's": "that is",
+            "there's": "there is", "here's": "here is", "what's": "what is",
+            "where's": "where is", "who's": "who is", "it's": "it is"
+        }
+
+    def humanize_text(
+        self,
+        text: str,
+        use_passive: bool = False,
+        use_synonyms: bool = False,
+        use_paraphrasing: bool = False,
+        preserve_paragraphs: bool = True
+    ) -> str:
+        """
+        Advanced text humanization with state-of-the-art techniques.
+        """
+        if not text or not text.strip():
+            return text
+
+        try:
+            if self.preserve_formatting:
+                return self._humanize_with_advanced_preservation(
+                    text, use_passive, use_synonyms, use_paraphrasing, preserve_paragraphs
+                )
+            else:
+                return self._humanize_advanced_simple(text, use_passive, use_synonyms, use_paraphrasing)
+        except Exception as e:
+            logger.error(f"Error during advanced text humanization: {e}")
+            return text
+
+    def _humanize_with_advanced_preservation(
+        self,
+        text: str,
+        use_passive: bool,
+        use_synonyms: bool,
+        use_paraphrasing: bool,
+        preserve_paragraphs: bool
+    ) -> str:
+        """Advanced humanization with comprehensive formatting preservation."""
+        segments = self.markdown_preserver.segment_text(text)
+
+        for segment in segments:
+            if segment.segment_type == 'text' and segment.content.strip():
+                # Apply AI detection avoidance if the segment looks AI-generated
+                if self.ai_avoidance_mode and segment.ai_probability > 0.6:
+                    segment.content = self._apply_ai_avoidance_techniques(
+                        segment.content, use_passive, use_synonyms, use_paraphrasing
+                    )
+                else:
+                    segment.content = self._transform_text_segment_advanced(
+                        segment.content, use_passive, use_synonyms, use_paraphrasing
+                    )
+
+        return self.markdown_preserver.reconstruct_text(segments)
+
+    def _apply_ai_avoidance_techniques(
+        self,
+        text: str,
+        use_passive: bool,
+        use_synonyms: bool,
+        use_paraphrasing: bool
+    ) -> str:
+        """Apply specialized techniques to avoid AI detection."""
+        try:
+            # 1. Add natural imperfections
+            text = self._add_natural_variations(text)
+
+            # 2. Increase sentence variety
+            text = self._vary_sentence_structure(text)
+
+            # 3. Reduce formal language density
+            text = self._reduce_formality(text)
+
+            # 4. Apply the standard transformations
+            text = self._transform_text_segment_advanced(
+                text, use_passive, use_synonyms, use_paraphrasing
+            )
+
+            return text
+        except Exception as e:
+            logger.warning(f"Error in AI avoidance: {e}")
+            return text
+
+    def _add_natural_variations(self, text: str) -> str:
+        """Add natural human-like variations."""
+        # Add occasional contractions to balance formality
+        if random.random() < 0.3:
+            formal_replacements = {
+                "do not": "don't", "will not": "won't", "cannot": "can't",
+                "should not": "shouldn't", "would not": "wouldn't"
+            }
+            for formal, contraction in formal_replacements.items():
+                if formal in text and random.random() < 0.4:
+                    text = text.replace(formal, contraction, 1)
+
+        return text
+
+    def _vary_sentence_structure(self, text: str) -> str:
+        """Increase sentence structure variety."""
+        sentences = sent_tokenize(text)
+        if len(sentences) < 2:
+            return text
+
+        varied_sentences = []
+        for i, sentence in enumerate(sentences):
+            if i > 0 and random.random() < 0.3:
+                # Occasionally start with different structures
+                starters = ["Well,", "Actually,", "Interestingly,", "To be clear,"]
+                if not any(sentence.startswith(starter) for starter in starters):
+                    starter = random.choice(starters)
+                    # Lowercase only the first character to keep proper nouns intact
+                    sentence = f"{starter} {sentence[0].lower()}{sentence[1:]}"
+
+            varied_sentences.append(sentence)
+
+        return ' '.join(varied_sentences)
+
+    def _reduce_formality(self, text: str) -> str:
+        """Reduce excessive formality to appear more human."""
+        # Replace overly formal words with more natural alternatives
+        formal_to_natural = {
+            'utilize': 'use', 'facilitate': 'help', 'demonstrate': 'show',
+            'implement': 'put in place', 'comprehensive': 'complete',
+            'methodology': 'method', 'substantial': 'large',
+            'numerous': 'many', 'acquire': 'get'
+        }
+
+        for formal, natural in formal_to_natural.items():
+            if formal in text.lower() and random.random() < 0.6:
+                text = re.sub(r'\b' + formal + r'\b', natural, text, flags=re.IGNORECASE)
+
+        return text
+
+    def _transform_text_segment_advanced(
+        self,
+        text: str,
+        use_passive: bool,
+        use_synonyms: bool,
+        use_paraphrasing: bool
+    ) -> str:
+        """Advanced text segment transformation with ML models."""
+        try:
+            doc = self.nlp(text)
+            transformed_sentences = []
+
+            for sent in doc.sents:
+                sentence_str = sent.text.strip()
+                if not sentence_str:
+                    continue
+
+                # 1. Expand contractions
+                sentence_str = self.expand_contractions_advanced(sentence_str)
+
+                # 2. Advanced paraphrasing
+                if use_paraphrasing and random.random() < self.p_paraphrase:
+                    paraphrased = self.advanced_humanizer.paraphrase_sentence(sentence_str)
+                    if paraphrased != sentence_str:
+                        sentence_str = paraphrased
+
+                # 3. Context-aware academic transitions
+                if random.random() < self.p_academic_transition:
+                    sentence_str = self.add_contextual_transitions(sentence_str)
+
+                # 4. Advanced passive voice conversion
+                if use_passive and random.random() < self.p_passive:
+                    sentence_str = self.convert_to_passive_advanced(sentence_str)
+
+                # 5. Enhanced contextual synonym replacement (delegated to the SOTA humanizer)
+                if use_synonyms and random.random() < self.p_synonym_replacement:
+                    sentence_str = self.advanced_humanizer.enhance_with_advanced_synonyms(sentence_str)
+
+                transformed_sentences.append(sentence_str)
+
+            result = ' '.join(transformed_sentences)
+            return result if result.strip() else text
+
+        except Exception as e:
+            logger.warning(f"Error in advanced transformation: {e}")
+            return text
+
+    def expand_contractions_advanced(self, sentence: str) -> str:
+        """Enhanced contraction expansion with better context handling."""
+        # Handle whole-word contractions with regex for better accuracy
+        for contraction, expansion in self.contraction_map.items():
+            if len(contraction) > 3:  # Full word contractions
+                pattern = r'\b' + re.escape(contraction) + r'\b'
+                sentence = re.sub(pattern, expansion, sentence, flags=re.IGNORECASE)
+
+        # Handle suffix contractions
+        tokens = word_tokenize(sentence)
+        expanded_tokens = []
+
+        for token in tokens:
+            original_case = token
+            lower_token = token.lower()
+            replaced = False
+
+            for contraction, expansion in self.contraction_map.items():
+                if (len(contraction) <= 3 and
+                        lower_token.endswith(contraction) and
+                        len(lower_token) > len(contraction)):
+
+                    base = lower_token[:-len(contraction)]
+                    new_token = base + expansion
+
+                    # Preserve the capitalization pattern
+                    if original_case[0].isupper():
+                        new_token = new_token[0].upper() + new_token[1:]
+
+                    expanded_tokens.append(new_token)
+                    replaced = True
+                    break
+
+            if not replaced:
+                expanded_tokens.append(token)
+
+        return ' '.join(expanded_tokens)
+
+    def add_contextual_transitions(self, sentence: str) -> str:
+        """Add contextually intelligent academic transitions."""
+        sentence_lower = sentence.lower()
+
+        # Enhanced context detection
+        context_patterns = {
+            'contrast': ['but', 'however', 'although', 'while', 'despite', 'whereas'],
+            'causation': ['because', 'since', 'therefore', 'so', 'due to', 'as a result'],
+            'addition': ['also', 'and', 'plus', 'including', 'along with'],
+            'emphasis': ['important', 'significant', 'notable', 'crucial', 'key'],
+            'sequence': ['first', 'second', 'then', 'next', 'finally', 'last']
+        }
+
+        # Determine the best transition type
+        best_type = 'addition'  # default
+        max_matches = 0
+
+        for transition_type, patterns in context_patterns.items():
+            matches = sum(1 for pattern in patterns if pattern in sentence_lower)
+            if matches > max_matches:
+                max_matches = matches
+                best_type = transition_type
+
+        # Select an appropriate transition
+        transition = random.choice(self.academic_transitions[best_type])
+
+        return f"{transition} {sentence}"
+
+    def convert_to_passive_advanced(self, sentence: str) -> str:
+        """Advanced passive voice conversion with better grammatical accuracy."""
+        try:
+            doc = self.nlp(sentence)
+
+            # Find suitable active voice patterns
+            for token in doc:
+                if (token.pos_ == 'VERB' and
+                        token.dep_ == 'ROOT' and
+                        token.tag_ in ['VBD', 'VBZ', 'VBP']):
+
+                    # Find the subject and object
+                    subj = None
+                    obj = None
+
+                    for child in token.children:
+                        if child.dep_ == 'nsubj':
+                            subj = child
+                        elif child.dep_ in ['dobj', 'pobj']:
+                            obj = child
+
+                    if subj and obj:
+                        # Create the passive transformation
+                        verb_base = token.lemma_
+
+                        # Choose the auxiliary verb
+                        aux = 'was' if subj.tag_ in ['NN', 'NNP'] else 'were'
+                        if token.tag_ in ['VBZ', 'VBP']:  # Present tense
+                            aux = 'is' if subj.tag_ in ['NN', 'NNP'] else 'are'
+
+                        # Create the past participle (simple heuristic)
+                        if verb_base.endswith('e'):
+                            past_participle = verb_base + 'd'
+                        elif verb_base in ['go', 'do', 'be', 'have']:
+                            # Irregular verbs
+                            irregular_map = {'go': 'gone', 'do': 'done', 'be': 'been', 'have': 'had'}
+                            past_participle = irregular_map.get(verb_base, verb_base + 'ed')
+                        else:
+                            past_participle = verb_base + 'ed'
+
+                        # Construct the passive phrase
+                        passive_phrase = f"{obj.text} {aux} {past_participle} by {subj.text}"
+
+                        # Replace in the original sentence
+                        original_phrase = f"{subj.text} {token.text} {obj.text}"
+                        if original_phrase in sentence:
+                            return sentence.replace(original_phrase, passive_phrase)
+
+            return sentence
+
+        except Exception as e:
+            logger.warning(f"Error in advanced passive conversion: {e}")
+            return sentence
+
+    def get_advanced_transformation_stats(self, original_text: str, transformed_text: str) -> Dict[str, Union[int, float]]:
+        """Get comprehensive transformation statistics with ML analysis."""
+        orig_tokens = word_tokenize(original_text)
+        trans_tokens = word_tokenize(transformed_text)
+        orig_sents = sent_tokenize(original_text)
+        trans_sents = sent_tokenize(transformed_text)
+
+        # Calculate basic metrics
+        stats = {
+            'original_word_count': len(orig_tokens),
+            'transformed_word_count': len(trans_tokens),
+            'original_sentence_count': len(orig_sents),
+            'transformed_sentence_count': len(trans_sents),
+            'word_change_ratio': len(trans_tokens) / len(orig_tokens) if orig_tokens else 0,
+            'sentence_change_ratio': len(trans_sents) / len(orig_sents) if orig_sents else 0,
+            'character_count_original': len(original_text),
+            'character_count_transformed': len(transformed_text),
+        }
+
+        # Add ML-based analysis
+        try:
+            # Semantic similarity
+            if hasattr(self, 'advanced_humanizer') and self.advanced_humanizer.sentence_model:
+                embeddings = self.advanced_humanizer.sentence_model.encode([original_text, transformed_text])
+                stats['semantic_similarity'] = float(util.cos_sim(embeddings[0], embeddings[1]).item())
+
+            # AI detection metrics
+            original_segments = self.markdown_preserver.segment_text(original_text)
+            transformed_segments = self.markdown_preserver.segment_text(transformed_text)
+
+            orig_ai_scores = [seg.ai_probability for seg in original_segments if seg.segment_type == 'text']
+            trans_ai_scores = [seg.ai_probability for seg in transformed_segments if seg.segment_type == 'text']
+
+            if orig_ai_scores and trans_ai_scores:
+                stats['original_ai_probability'] = np.mean(orig_ai_scores)
+                stats['transformed_ai_probability'] = np.mean(trans_ai_scores)
+                stats['ai_detection_improvement'] = stats['original_ai_probability'] - stats['transformed_ai_probability']
+
+        except Exception as e:
+            logger.warning(f"Error calculating advanced stats: {e}")
+
+        return stats
+
+    def _humanize_advanced_simple(self, text: str, use_passive: bool, use_synonyms: bool, use_paraphrasing: bool) -> str:
+        """Simple advanced transformation without formatting preservation."""
+        paragraphs = text.split('\n\n')
+        transformed_paragraphs = []
+
+        for paragraph in paragraphs:
+            if paragraph.strip():
+                transformed = self._transform_text_segment_advanced(
+                    paragraph, use_passive, use_synonyms, use_paraphrasing
+                )
+                transformed_paragraphs.append(transformed)
+            else:
+                transformed_paragraphs.append(paragraph)
+
+        return '\n\n'.join(transformed_paragraphs)
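The six checks in `_calculate_ai_probability` each contribute 1/6 to the score, and `get_advanced_transformation_stats` surfaces the before/after averages. A minimal sketch of reading those numbers (assuming the models load; output values will vary with sampling):

```python
# Sketch: inspect the heuristic AI-probability scores before and after humanizing.
from transformer.app import AdvancedAcademicTextHumanizer

humanizer = AdvancedAcademicTextHumanizer(seed=42)  # seed makes the random choices repeatable
original = ("Moreover, the methodology demonstrates comprehensive results. "
            "Furthermore, it does not utilize redundant components.")
transformed = humanizer.humanize_text(original, use_synonyms=True, use_paraphrasing=True)

stats = humanizer.get_advanced_transformation_stats(original, transformed)
print(stats.get("original_ai_probability"), stats.get("transformed_ai_probability"))
print(stats.get("semantic_similarity"))  # present when the sentence model loaded
```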