Commit 756883e (parent: c444d4f): initial commit

app.py CHANGED
@@ -5,8 +5,14 @@ import os
 from datetime import datetime
 import torch
 import nltk
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, ElectraTokenizer, ElectraForTokenClassification
+from transformers import (
+    T5Tokenizer,
+    T5ForConditionalGeneration,
+    ElectraTokenizer,
+    ElectraForTokenClassification
+)
 import torch.nn as nn
+from tqdm import tqdm
 
 # Download NLTK data
 try:
@@ -14,6 +20,304 @@ try:
 except LookupError:
     nltk.download('punkt')
 
+class HuggingFaceT5GEDInference:
+    def __init__(self, model_name="Zlovoblachko/REAlEC_2step_model_testing",
+                 ged_model_name="Zlovoblachko/11tag-electra-grammar-stage2", device=None):
+        """
+        Initialize the inference class for T5-GED model from HuggingFace
+
+        Args:
+            model_name: HuggingFace model name/path for the T5-GED model
+            ged_model_name: HuggingFace model name/path for the GED model
+            device: Device to run inference on (cuda/cpu)
+        """
+        self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        # Load GED model and tokenizer (same as training)
+        print(f"Loading GED model from HuggingFace: {ged_model_name}...")
+        self.ged_model, self.ged_tokenizer = self._load_ged_model(ged_model_name)
+
+        # Load T5 model and tokenizer from HuggingFace
+        print(f"Loading T5 model from HuggingFace: {model_name}...")
+        self.t5_tokenizer = T5Tokenizer.from_pretrained(model_name)
+        self.t5_model = T5ForConditionalGeneration.from_pretrained(model_name)
+        self.t5_model.to(self.device)
+
+        # Create GED encoder (copy of T5 encoder)
+        self.ged_encoder = T5ForConditionalGeneration.from_pretrained(model_name).encoder
+        self.ged_encoder.to(self.device)
+
+        # Create gating mechanism
+        encoder_hidden_size = self.t5_model.config.d_model
+        self.gate = nn.Linear(2 * encoder_hidden_size, 1)
+        self.gate.to(self.device)
+
+        # Try to load GED components from HuggingFace
+        try:
+            print("Loading GED components...")
+            from huggingface_hub import hf_hub_download
+            ged_components_path = hf_hub_download(
+                repo_id=model_name,
+                filename="ged_components.pt",
+                cache_dir=None
+            )
+            ged_components = torch.load(ged_components_path, map_location=self.device)
+            self.ged_encoder.load_state_dict(ged_components["ged_encoder"])
+            self.gate.load_state_dict(ged_components["gate"])
+            print("GED components loaded successfully!")
+        except Exception as e:
+            print(f"Warning: Could not load GED components: {e}")
+            print("Using default initialization for GED encoder and gate.")
+
+        # Set to evaluation mode
+        self.t5_model.eval()
+        self.ged_encoder.eval()
+        self.gate.eval()
+
+    def _load_ged_model(self, model_name):
+        """Load GED model and tokenizer from HuggingFace"""
+        tokenizer = ElectraTokenizer.from_pretrained(model_name)
+        model = ElectraForTokenClassification.from_pretrained(model_name)
+        model.to(self.device)
+        model.eval()
+        return model, tokenizer
+
+    def _get_ged_predictions(self, text):
+        """Get GED predictions for input text - exact same as training preprocessing"""
+        inputs = self.ged_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(self.device)
+        with torch.no_grad():
+            outputs = self.ged_model(**inputs)
+        logits = outputs.logits
+        predictions = torch.argmax(logits, dim=2)
+        token_predictions = predictions[0].cpu().numpy().tolist()
+        tokens = self.ged_tokenizer.convert_ids_to_tokens(inputs.input_ids[0])
+
+        ged_tags = []
+        for token, pred in zip(tokens, token_predictions):
+            if token.startswith("##") or token in ["[CLS]", "[SEP]", "[PAD]"]:
+                continue
+            ged_tags.append(str(pred))
+
+        return " ".join(ged_tags), tokens, token_predictions
+
+    def _get_error_spans(self, text):
+        """Extract error spans with simplified categories for display"""
+        ged_tags_str, tokens, predictions = self._get_ged_predictions(text)
+
+        error_spans = []
+        clean_tokens = []
+
+        for token, pred in zip(tokens, predictions):
+            if token.startswith("##") or token in ["[CLS]", "[SEP]", "[PAD]"]:
+                continue
+            clean_tokens.append(token)
+
+            if pred != 0:  # 0 is correct, others are various error types
+                # Simplify the 11-tag system to basic categories for user display
+                if pred in [1, 2, 3, 4]:  # Various replacement/substitution errors
+                    error_type = "Grammar"
+                elif pred in [5, 6]:  # Missing elements
+                    error_type = "Missing"
+                elif pred in [7, 8]:  # Unnecessary elements
+                    error_type = "Unnecessary"
+                elif pred in [9, 10]:  # Other error types
+                    error_type = "Usage"
+                else:
+                    error_type = "Error"
+
+                error_spans.append({
+                    "token": token,
+                    "type": error_type,
+                    "position": len(clean_tokens) - 1
+                })
+
+        return error_spans
+
+    def _preprocess_inputs(self, text, max_length=128):
+        """Preprocess input text exactly as during training"""
+        # Get GED predictions
+        ged_tags, _, _ = self._get_ged_predictions(text)
+
+        # Tokenize source text (same as training)
+        src_tokens = self.t5_tokenizer(
+            text,
+            truncation=True,
+            max_length=max_length,
+            return_tensors="pt"
+        )
+
+        # Tokenize GED tags (same as training)
+        ged_tokens = self.t5_tokenizer(
+            ged_tags,
+            truncation=True,
+            max_length=max_length,
+            return_tensors="pt"
+        )
+
+        return {
+            "input_ids": src_tokens.input_ids.to(self.device),
+            "attention_mask": src_tokens.attention_mask.to(self.device),
+            "ged_input_ids": ged_tokens.input_ids.to(self.device),
+            "ged_attention_mask": ged_tokens.attention_mask.to(self.device)
+        }
+
+    def _forward_with_ged(self, input_ids, attention_mask, ged_input_ids, ged_attention_mask, max_length=200):
+        """
+        Forward pass with GED integration - replicates T5WithGED.forward() logic
+        """
+        # Get source encoder outputs
+        src_encoder_outputs = self.t5_model.encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            return_dict=True
+        )
+
+        # Get GED encoder outputs
+        ged_encoder_outputs = self.ged_encoder(
+            input_ids=ged_input_ids,
+            attention_mask=ged_attention_mask,
+            return_dict=True
+        )
+
+        # Get hidden states
+        src_hidden_states = src_encoder_outputs.last_hidden_state
+        ged_hidden_states = ged_encoder_outputs.last_hidden_state
+
+        # Combine hidden states (same as training)
+        min_len = min(src_hidden_states.size(1), ged_hidden_states.size(1))
+        combined = torch.cat([
+            src_hidden_states[:, :min_len, :],
+            ged_hidden_states[:, :min_len, :]
+        ], dim=2)
+
+        # Apply gating mechanism
+        gate_scores = torch.sigmoid(self.gate(combined))
+        combined_hidden = (
+            gate_scores * src_hidden_states[:, :min_len, :] +
+            (1 - gate_scores) * ged_hidden_states[:, :min_len, :]
+        )
+
+        # Update encoder outputs
+        src_encoder_outputs.last_hidden_state = combined_hidden
+
+        # Generate using T5 decoder
+        decoder_outputs = self.t5_model.generate(
+            encoder_outputs=src_encoder_outputs,
+            max_length=max_length,
+            do_sample=False,
+            num_beams=1
+        )
+
+        return decoder_outputs
+
+    def correct_text(self, text, max_length=200):
+        """
+        Correct grammatical errors in input text
+
+        Args:
+            text: Input text to correct
+            max_length: Maximum length for generation
+
+        Returns:
+            Corrected text as string
+        """
+        # Preprocess inputs exactly as training
+        inputs = self._preprocess_inputs(text)
+
+        # Generate correction using GED-enhanced model
+        with torch.no_grad():
+            generated_ids = self._forward_with_ged(
+                input_ids=inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
+                ged_input_ids=inputs["ged_input_ids"],
+                ged_attention_mask=inputs["ged_attention_mask"],
+                max_length=max_length
+            )
+
+        # Decode output
+        corrected_text = self.t5_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+        return corrected_text
+
+    def analyze_text(self, text):
+        """Enhanced analysis method for Gradio integration"""
+        if not text.strip():
+            return "Model not available or empty text", ""
+
+        try:
+            # Get corrected text
+            corrected_text = self.correct_text(text)
+
+            # Get error spans
+            error_spans = self._get_error_spans(text)
+
+            # Generate HTML output
+            html_output = self.generate_html_analysis(text, corrected_text, error_spans)
+
+            return corrected_text, html_output
+
+        except Exception as e:
+            return f"Error during analysis: {str(e)}", ""
+
+    def generate_html_analysis(self, original, corrected, error_spans):
+        """Generate enhanced HTML analysis output"""
+        # Create highlighted original text
+        highlighted_original = original
+        if error_spans:
+            # Sort by position in reverse to avoid index shifting
+            sorted_spans = sorted(error_spans, key=lambda x: x['position'], reverse=True)
+
+            # Simple highlighting - in a more sophisticated version, you'd map token positions to character positions
+            for span in sorted_spans:
+                token = span['token']
+                error_type = span['type']
+
+                # Color coding for different error types
+                color_map = {
+                    "Grammar": "#ffebee",      # Light red
+                    "Missing": "#e8f5e8",      # Light green
+                    "Unnecessary": "#fff3e0",  # Light orange
+                    "Usage": "#e3f2fd"         # Light blue
+                }
+
+                color = color_map.get(error_type, "#f5f5f5")
+
+                # Simple token replacement (basic highlighting)
+                if token in highlighted_original:
+                    highlighted_original = highlighted_original.replace(
+                        token,
+                        f"<span style='background-color: {color}; padding: 1px 3px; border-radius: 3px; margin: 0 1px;' title='{error_type}'>{token}</span>",
+                        1
+                    )
+
+        html = f"""
+        <div style='font-family: Arial, sans-serif; line-height: 1.6; padding: 20px; border: 1px solid #ddd; border-radius: 8px; background-color: #f9f9f9;'>
+            <h3 style='color: #333; margin-top: 0;'>Grammar Analysis Results</h3>
+
+            <div style='margin: 15px 0;'>
+                <h4 style='color: #555;'>Original Text with Error Highlighting:</h4>
+                <div style='padding: 10px; background-color: #fff; border: 1px solid #ddd; border-radius: 4px;'>{highlighted_original}</div>
+            </div>
+
+            <div style='margin: 15px 0;'>
+                <h4 style='color: #28a745;'>Corrected Text:</h4>
+                <p style='padding: 10px; background-color: #d4edda; border: 1px solid #c3e6cb; border-radius: 4px;'>{corrected}</p>
+            </div>
+
+            <div style='margin: 15px 0;'>
+                <h4 style='color: #333;'>Error Summary:</h4>
+                <p style='color: #666;'>Found {len(error_spans)} potential issues</p>
+
+                <div style='margin-top: 10px;'>
+                    <span style='display: inline-block; margin: 2px 5px; padding: 2px 8px; background-color: #ffebee; border-radius: 12px; font-size: 12px;'>Grammar</span>
+                    <span style='display: inline-block; margin: 2px 5px; padding: 2px 8px; background-color: #e8f5e8; border-radius: 12px; font-size: 12px;'>Missing</span>
+                    <span style='display: inline-block; margin: 2px 5px; padding: 2px 8px; background-color: #fff3e0; border-radius: 12px; font-size: 12px;'>Unnecessary</span>
+                    <span style='display: inline-block; margin: 2px 5px; padding: 2px 8px; background-color: #e3f2fd; border-radius: 12px; font-size: 12px;'>Usage</span>
+                </div>
+            </div>
+        </div>
+        """
+        return html
+
 # Initialize SQLite database for storing submissions and exercises
 def init_database():
     conn = sqlite3.connect('language_app.db')
@@ -74,110 +378,11 @@ def init_database():
     conn.commit()
     conn.close()
 
-#
-class SimpleGrammarChecker:
-    def __init__(self):
-        self.model_name = "Zlovoblachko/Realec-2step-ft-realec"
-        self.ged_model_name = "Zlovoblachko/4tag-electra-grammar-error-detection"
-        self.load_models()
-
-    def load_models(self):
-        try:
-            # Load T5 model
-            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-            self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
-
-            # Load GED model
-            self.ged_tokenizer = ElectraTokenizer.from_pretrained(self.ged_model_name)
-            self.ged_model = ElectraForTokenClassification.from_pretrained(self.ged_model_name)
-
-            print("Models loaded successfully!")
-        except Exception as e:
-            print(f"Error loading models: {e}")
-            self.model = None
-            self.ged_model = None
-
-    def analyze_text(self, text):
-        if not self.model or not text.strip():
-            return "Model not available or empty text", ""
-
-        try:
-            # Tokenize and generate correction
-            inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
-
-            with torch.no_grad():
-                outputs = self.model.generate(
-                    input_ids=inputs.input_ids,
-                    attention_mask=inputs.attention_mask,
-                    max_length=512,
-                    num_beams=4,
-                    early_stopping=True
-                )
-
-            corrected_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-            # Get GED predictions if available
-            error_spans = []
-            if self.ged_model:
-                error_spans = self.get_error_spans(text)
-
-            # Generate HTML output
-            html_output = self.generate_html_analysis(text, corrected_text, error_spans)
-
-            return corrected_text, html_output
-
-        except Exception as e:
-            return f"Error during analysis: {str(e)}", ""
-
-    def get_error_spans(self, text):
-        try:
-            inputs = self.ged_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
-
-            with torch.no_grad():
-                outputs = self.ged_model(**inputs)
-                predictions = torch.argmax(outputs.logits, dim=2)
-
-            tokens = self.ged_tokenizer.convert_ids_to_tokens(inputs.input_ids[0])
-            token_predictions = predictions[0].cpu().numpy().tolist()
-
-            error_spans = []
-            for i, (token, pred) in enumerate(zip(tokens, token_predictions)):
-                if token.startswith("##") or token in ["[CLS]", "[SEP]", "[PAD]"]:
-                    continue
-                if pred != 0:  # 0 is correct, 1=R, 2=M, 3=U
-                    error_type = ["C", "R", "M", "U"][pred]
-                    error_spans.append({"token": token, "type": error_type, "position": i})
-
-            return error_spans
-        except:
-            return []
-
-    def generate_html_analysis(self, original, corrected, error_spans):
-        html = f"""
-        <div style='font-family: Arial, sans-serif; line-height: 1.6; padding: 20px; border: 1px solid #ddd; border-radius: 8px; background-color: #f9f9f9;'>
-            <h3 style='color: #333; margin-top: 0;'>Grammar Analysis Results</h3>
-
-            <div style='margin: 15px 0;'>
-                <h4 style='color: #555;'>Original Text:</h4>
-                <p style='padding: 10px; background-color: #fff; border: 1px solid #ddd; border-radius: 4px;'>{original}</p>
-            </div>
-
-            <div style='margin: 15px 0;'>
-                <h4 style='color: #28a745;'>Corrected Text:</h4>
-                <p style='padding: 10px; background-color: #d4edda; border: 1px solid #c3e6cb; border-radius: 4px;'>{corrected}</p>
-            </div>
-
-            <div style='margin: 15px 0;'>
-                <h4 style='color: #333;'>Error Analysis:</h4>
-                <p style='color: #666;'>Found {len(error_spans)} potential errors</p>
-            </div>
-        </div>
-        """
-        return html
-
-# Initialize components
+# Initialize database and components
 init_database()
-
+print("Initializing enhanced grammar checker...")
+grammar_checker = HuggingFaceT5GEDInference()
+print("Grammar checker initialized successfully!")
 
 # Gradio Interface Functions
 def analyze_student_writing(text, student_name, task_title="General Writing Task"):
@@ -188,7 +393,7 @@ def analyze_student_writing(text, student_name, task_title="General Writing Task"):
     if not student_name.strip():
         return "Please enter your name.", ""
 
-    # Analyze text
+    # Analyze text with enhanced model
    corrected_text, html_analysis = grammar_checker.analyze_text(text)
 
     # Store in database
@@ -220,7 +425,7 @@ def analyze_student_writing(text, student_name, task_title="General Writing Task"):
     return corrected_text, html_analysis
 
 def create_exercise_from_text(text, exercise_title="Grammar Exercise"):
-    """Create an exercise from text with errors"""
+    """Create an exercise from text with errors using enhanced analysis"""
     if not text.strip():
         return "Please enter text to create an exercise.", ""
 
@@ -257,6 +462,7 @@ def create_exercise_from_text(text, exercise_title="Grammar Exercise"):
     exercise_html = f"""
     <div style='font-family: Arial, sans-serif; padding: 20px; border: 1px solid #ddd; border-radius: 8px;'>
         <h3>{exercise_title}</h3>
+        <p><strong>Exercise ID: {exercise_id}</strong></p>
         <p><strong>Instructions:</strong> Correct the grammatical errors in the following sentences:</p>
         <ol>
     """
@@ -266,10 +472,10 @@ def create_exercise_from_text(text, exercise_title="Grammar Exercise"):
 
     exercise_html += "</ol></div>"
 
-    return f"Exercise created with {len(exercise_sentences)} sentences!", exercise_html
+    return f"Exercise created with {len(exercise_sentences)} sentences! Exercise ID: {exercise_id}", exercise_html
 
 def attempt_exercise(exercise_id, student_responses, student_name):
-    """Submit exercise attempt and get score"""
+    """Submit exercise attempt and get score using enhanced analysis"""
     if not student_name.strip():
         return "Please enter your name.", ""
 
@@ -296,19 +502,22 @@ def attempt_exercise(exercise_id, student_responses, student_name):
     if len(responses) != len(exercise_sentences):
         return f"Please provide exactly {len(exercise_sentences)} responses (one per line).", ""
 
-    # Calculate score
+    # Calculate score using enhanced analysis
     correct_count = 0
     feedback = []
 
     for i, (sentence_data, response) in enumerate(zip(exercise_sentences, responses), 1):
         correct_answer = sentence_data['corrected']
-
+
+        # Use the model to check if the response is correct
+        response_corrected, _ = grammar_checker.analyze_text(response)
+        is_correct = response_corrected.strip() == response.strip()  # No further corrections needed
 
         if is_correct:
             correct_count += 1
-            feedback.append(f"✅ Sentence {i}:
+            feedback.append(f"✅ Sentence {i}: Excellent! No errors detected.")
         else:
-            feedback.append(f"❌ Sentence {i}: Your answer: '{response}' |
+            feedback.append(f"❌ Sentence {i}: Your answer: '{response}' | Suggested improvement: '{response_corrected}' | Expected: '{correct_answer}'")
 
     score = (correct_count / len(exercise_sentences)) * 100
 
@@ -386,9 +595,10 @@ def get_student_progress(student_name):
    return progress_html
 
 # Create Gradio Interface
-with gr.Blocks(title="Language Learning App - Grammar Checker", theme=gr.themes.Soft()) as app:
+with gr.Blocks(title="Language Learning App - Enhanced Grammar Checker", theme=gr.themes.Soft()) as app:
     gr.Markdown("# 📚 Language Learning Application")
     gr.Markdown("### AI-Powered Grammar Checking and Exercise Generation")
+    gr.Markdown("*Now featuring advanced T5-GED neural network with enhanced error detection*")
 
     with gr.Tabs():
         # Student Writing Analysis Tab
@@ -491,7 +701,7 @@ with gr.Blocks(title="Language Learning App - Grammar Checker", theme=gr.themes.Soft()) as app:
     3. **Exercise Practice**: Students can practice with generated exercises and get scored feedback
     4. **Progress Tracking**: View student progress across submissions and exercises
 
-    *Powered by advanced neural networks for grammar error detection and correction*
+    *Powered by advanced T5-GED neural networks for enhanced grammar error detection and correction*
     """)
 
 if __name__ == "__main__":
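This commit swaps the plain two-model SimpleGrammarChecker for the gated two-encoder HuggingFaceT5GEDInference. For trying the new inference path outside Gradio, a minimal smoke-test sketch follows; it assumes the class definition from this commit is in scope and that torch, transformers, and huggingface_hub are installed. The sample sentence is illustrative, not from the Space.

# Hypothetical smoke test for the inference class added in this commit
# (assumes HuggingFaceT5GEDInference from app.py is in scope).
checker = HuggingFaceT5GEDInference()   # downloads both models on first run

sample = "She go to school every days."           # illustrative input
corrected, html_report = checker.analyze_text(sample)

print(corrected)                                  # corrected sentence from the T5 decoder
print(checker._get_error_spans(sample))           # simplified token-level error spans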
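The core of the change is the gating step in _forward_with_ged: the ELECTRA-derived GED tag string gets its own T5 encoder, and a learned gate mixes the two encoder states token by token, fused = g * h_src + (1 - g) * h_ged with g = sigmoid(W[h_src; h_ged]). A self-contained toy sketch of that fusion, with random tensors standing in for real encoder outputs and illustrative shapes:

import torch
import torch.nn as nn

# Toy illustration of the gated fusion used in _forward_with_ged.
batch, seq_len, d_model = 2, 7, 512                # illustrative shapes
src_hidden = torch.randn(batch, seq_len, d_model)  # stands in for the source encoder states
ged_hidden = torch.randn(batch, seq_len, d_model)  # stands in for the GED encoder states

gate = nn.Linear(2 * d_model, 1)                   # same shape as self.gate in the commit
g = torch.sigmoid(gate(torch.cat([src_hidden, ged_hidden], dim=2)))  # (batch, seq_len, 1)

fused = g * src_hidden + (1 - g) * ged_hidden      # broadcast over the hidden dimension
print(fused.shape)                                 # torch.Size([2, 7, 512])

In the commit itself, both sequences are first truncated to min_len, since the source text and its GED tag string tokenize to different lengths.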