Commit 756883e (parent: c444d4f): initial commit

app.py CHANGED
@@ -5,8 +5,14 @@ import os
 from datetime import datetime
 import torch
 import nltk
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, ElectraTokenizer, ElectraForTokenClassification
+from transformers import (
+    T5Tokenizer,
+    T5ForConditionalGeneration,
+    ElectraTokenizer,
+    ElectraForTokenClassification
+)
 import torch.nn as nn
+from tqdm import tqdm
 
 # Download NLTK data
 try:
@@ -14,6 +20,304 @@ try:
 except LookupError:
     nltk.download('punkt')
 
+class HuggingFaceT5GEDInference:
+    def __init__(self, model_name="Zlovoblachko/REAlEC_2step_model_testing",
+                 ged_model_name="Zlovoblachko/11tag-electra-grammar-stage2", device=None):
+        """
+        Initialize the inference class for T5-GED model from HuggingFace
+
+        Args:
+            model_name: HuggingFace model name/path for the T5-GED model
+            ged_model_name: HuggingFace model name/path for the GED model
+            device: Device to run inference on (cuda/cpu)
+        """
+        self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        # Load GED model and tokenizer (same as training)
+        print(f"Loading GED model from HuggingFace: {ged_model_name}...")
+        self.ged_model, self.ged_tokenizer = self._load_ged_model(ged_model_name)
+
+        # Load T5 model and tokenizer from HuggingFace
+        print(f"Loading T5 model from HuggingFace: {model_name}...")
+        self.t5_tokenizer = T5Tokenizer.from_pretrained(model_name)
+        self.t5_model = T5ForConditionalGeneration.from_pretrained(model_name)
+        self.t5_model.to(self.device)
+
+        # Create GED encoder (copy of T5 encoder)
+        self.ged_encoder = T5ForConditionalGeneration.from_pretrained(model_name).encoder
+        self.ged_encoder.to(self.device)
+
+        # Create gating mechanism
+        encoder_hidden_size = self.t5_model.config.d_model
+        self.gate = nn.Linear(2 * encoder_hidden_size, 1)
+        self.gate.to(self.device)
+
+        # Try to load GED components from HuggingFace
+        try:
+            print("Loading GED components...")
+            from huggingface_hub import hf_hub_download
+            ged_components_path = hf_hub_download(
+                repo_id=model_name,
+                filename="ged_components.pt",
+                cache_dir=None
+            )
+            ged_components = torch.load(ged_components_path, map_location=self.device)
+            self.ged_encoder.load_state_dict(ged_components["ged_encoder"])
+            self.gate.load_state_dict(ged_components["gate"])
+            print("GED components loaded successfully!")
+        except Exception as e:
+            print(f"Warning: Could not load GED components: {e}")
+            print("Using default initialization for GED encoder and gate.")
+
+        # Set to evaluation mode
+        self.t5_model.eval()
+        self.ged_encoder.eval()
+        self.gate.eval()
+
+    def _load_ged_model(self, model_name):
+        """Load GED model and tokenizer from HuggingFace"""
+        tokenizer = ElectraTokenizer.from_pretrained(model_name)
+        model = ElectraForTokenClassification.from_pretrained(model_name)
+        model.to(self.device)
+        model.eval()
+        return model, tokenizer
+
+    def _get_ged_predictions(self, text):
+        """Get GED predictions for input text - exact same as training preprocessing"""
+        inputs = self.ged_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(self.device)
+        with torch.no_grad():
+            outputs = self.ged_model(**inputs)
+        logits = outputs.logits
+        predictions = torch.argmax(logits, dim=2)
+        token_predictions = predictions[0].cpu().numpy().tolist()
+        tokens = self.ged_tokenizer.convert_ids_to_tokens(inputs.input_ids[0])
+
+        ged_tags = []
+        for token, pred in zip(tokens, token_predictions):
+            if token.startswith("##") or token in ["[CLS]", "[SEP]", "[PAD]"]:
+                continue
+            ged_tags.append(str(pred))
+
+        return " ".join(ged_tags), tokens, token_predictions
+
+    def _get_error_spans(self, text):
+        """Extract error spans with simplified categories for display"""
+        ged_tags_str, tokens, predictions = self._get_ged_predictions(text)
+
+        error_spans = []
+        clean_tokens = []
+
+        for token, pred in zip(tokens, predictions):
+            if token.startswith("##") or token in ["[CLS]", "[SEP]", "[PAD]"]:
+                continue
+            clean_tokens.append(token)
+
+            if pred != 0:  # 0 is correct, others are various error types
+                # Simplify the 11-tag system to basic categories for user display
+                if pred in [1, 2, 3, 4]:  # Various replacement/substitution errors
+                    error_type = "Grammar"
+                elif pred in [5, 6]:  # Missing elements
+                    error_type = "Missing"
+                elif pred in [7, 8]:  # Unnecessary elements
+                    error_type = "Unnecessary"
+                elif pred in [9, 10]:  # Other error types
+                    error_type = "Usage"
+                else:
+                    error_type = "Error"
+
+                error_spans.append({
+                    "token": token,
+                    "type": error_type,
+                    "position": len(clean_tokens) - 1
+                })
+
+        return error_spans
+
+    def _preprocess_inputs(self, text, max_length=128):
+        """Preprocess input text exactly as during training"""
+        # Get GED predictions
+        ged_tags, _, _ = self._get_ged_predictions(text)
+
+        # Tokenize source text (same as training)
+        src_tokens = self.t5_tokenizer(
+            text,
+            truncation=True,
+            max_length=max_length,
+            return_tensors="pt"
+        )
+
+        # Tokenize GED tags (same as training)
+        ged_tokens = self.t5_tokenizer(
+            ged_tags,
+            truncation=True,
+            max_length=max_length,
+            return_tensors="pt"
+        )
+
+        return {
+            "input_ids": src_tokens.input_ids.to(self.device),
+            "attention_mask": src_tokens.attention_mask.to(self.device),
+            "ged_input_ids": ged_tokens.input_ids.to(self.device),
+            "ged_attention_mask": ged_tokens.attention_mask.to(self.device)
+        }
+
+    def _forward_with_ged(self, input_ids, attention_mask, ged_input_ids, ged_attention_mask, max_length=200):
+        """
+        Forward pass with GED integration - replicates T5WithGED.forward() logic
+        """
+        # Get source encoder outputs
+        src_encoder_outputs = self.t5_model.encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            return_dict=True
+        )
+
+        # Get GED encoder outputs
+        ged_encoder_outputs = self.ged_encoder(
+            input_ids=ged_input_ids,
+            attention_mask=ged_attention_mask,
+            return_dict=True
+        )
+
+        # Get hidden states
+        src_hidden_states = src_encoder_outputs.last_hidden_state
+        ged_hidden_states = ged_encoder_outputs.last_hidden_state
+
+        # Combine hidden states (same as training)
+        min_len = min(src_hidden_states.size(1), ged_hidden_states.size(1))
+        combined = torch.cat([
+            src_hidden_states[:, :min_len, :],
+            ged_hidden_states[:, :min_len, :]
+        ], dim=2)
+
+        # Apply gating mechanism
+        gate_scores = torch.sigmoid(self.gate(combined))
+        combined_hidden = (
+            gate_scores * src_hidden_states[:, :min_len, :] +
+            (1 - gate_scores) * ged_hidden_states[:, :min_len, :]
+        )
+
+        # Update encoder outputs
+        src_encoder_outputs.last_hidden_state = combined_hidden
+
+        # Generate using T5 decoder
+        decoder_outputs = self.t5_model.generate(
+            encoder_outputs=src_encoder_outputs,
+            max_length=max_length,
+            do_sample=False,
+            num_beams=1
+        )
+
+        return decoder_outputs
+
+    def correct_text(self, text, max_length=200):
+        """
+        Correct grammatical errors in input text
+
+        Args:
+            text: Input text to correct
+            max_length: Maximum length for generation
+
+        Returns:
+            Corrected text as string
+        """
+        # Preprocess inputs exactly as training
+        inputs = self._preprocess_inputs(text)
+
+        # Generate correction using GED-enhanced model
+        with torch.no_grad():
+            generated_ids = self._forward_with_ged(
+                input_ids=inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
+                ged_input_ids=inputs["ged_input_ids"],
+                ged_attention_mask=inputs["ged_attention_mask"],
+                max_length=max_length
+            )
+
+        # Decode output
+        corrected_text = self.t5_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+        return corrected_text
+
+    def analyze_text(self, text):
+        """Enhanced analysis method for Gradio integration"""
+        if not text.strip():
+            return "Model not available or empty text", ""
+
+        try:
+            # Get corrected text
+            corrected_text = self.correct_text(text)
+
+            # Get error spans
+            error_spans = self._get_error_spans(text)
+
+            # Generate HTML output
+            html_output = self.generate_html_analysis(text, corrected_text, error_spans)
+
+            return corrected_text, html_output
+
+        except Exception as e:
+            return f"Error during analysis: {str(e)}", ""
+
+    def generate_html_analysis(self, original, corrected, error_spans):
+        """Generate enhanced HTML analysis output"""
+        # Create highlighted original text
+        highlighted_original = original
+        if error_spans:
+            # Sort by position in reverse to avoid index shifting
+            sorted_spans = sorted(error_spans, key=lambda x: x['position'], reverse=True)
+
+            # Simple highlighting - in a more sophisticated version, you'd map token positions to character positions
+            for span in sorted_spans:
+                token = span['token']
+                error_type = span['type']
+
+                # Color coding for different error types
+                color_map = {
+                    "Grammar": "#ffebee",      # Light red
+                    "Missing": "#e8f5e8",      # Light green
+                    "Unnecessary": "#fff3e0",  # Light orange
+                    "Usage": "#e3f2fd"         # Light blue
+                }
+
+                color = color_map.get(error_type, "#f5f5f5")
+
+                # Simple token replacement (basic highlighting)
+                if token in highlighted_original:
+                    highlighted_original = highlighted_original.replace(
+                        token,
+                        f"<span style='background-color: {color}; padding: 1px 3px; border-radius: 3px; margin: 0 1px;' title='{error_type}'>{token}</span>",
+                        1
+                    )
+
+        html = f"""
+        <div style='font-family: Arial, sans-serif; line-height: 1.6; padding: 20px; border: 1px solid #ddd; border-radius: 8px; background-color: #f9f9f9;'>
+            <h3 style='color: #333; margin-top: 0;'>Grammar Analysis Results</h3>
+
+            <div style='margin: 15px 0;'>
+                <h4 style='color: #555;'>Original Text with Error Highlighting:</h4>
+                <div style='padding: 10px; background-color: #fff; border: 1px solid #ddd; border-radius: 4px;'>{highlighted_original}</div>
+            </div>
+
+            <div style='margin: 15px 0;'>
+                <h4 style='color: #28a745;'>Corrected Text:</h4>
+                <p style='padding: 10px; background-color: #d4edda; border: 1px solid #c3e6cb; border-radius: 4px;'>{corrected}</p>
+            </div>
+
+            <div style='margin: 15px 0;'>
+                <h4 style='color: #333;'>Error Summary:</h4>
+                <p style='color: #666;'>Found {len(error_spans)} potential issues</p>
+
+                <div style='margin-top: 10px;'>
+                    <span style='display: inline-block; margin: 2px 5px; padding: 2px 8px; background-color: #ffebee; border-radius: 12px; font-size: 12px;'>Grammar</span>
+                    <span style='display: inline-block; margin: 2px 5px; padding: 2px 8px; background-color: #e8f5e8; border-radius: 12px; font-size: 12px;'>Missing</span>
+                    <span style='display: inline-block; margin: 2px 5px; padding: 2px 8px; background-color: #fff3e0; border-radius: 12px; font-size: 12px;'>Unnecessary</span>
+                    <span style='display: inline-block; margin: 2px 5px; padding: 2px 8px; background-color: #e3f2fd; border-radius: 12px; font-size: 12px;'>Usage</span>
+                </div>
+            </div>
+        </div>
+        """
+        return html
+
 # Initialize SQLite database for storing submissions and exercises
 def init_database():
     conn = sqlite3.connect('language_app.db')
@@ -74,110 +378,11 @@ def init_database():
     conn.commit()
     conn.close()
 
-#
-class SimpleGrammarChecker:
-    def __init__(self):
-        self.model_name = "Zlovoblachko/Realec-2step-ft-realec"
-        self.ged_model_name = "Zlovoblachko/4tag-electra-grammar-error-detection"
-        self.load_models()
-
-    def load_models(self):
-        try:
-            # Load T5 model
-            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-            self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
-
-            # Load GED model
-            self.ged_tokenizer = ElectraTokenizer.from_pretrained(self.ged_model_name)
-            self.ged_model = ElectraForTokenClassification.from_pretrained(self.ged_model_name)
-
-            print("Models loaded successfully!")
-        except Exception as e:
-            print(f"Error loading models: {e}")
-            self.model = None
-            self.ged_model = None
-
-    def analyze_text(self, text):
-        if not self.model or not text.strip():
-            return "Model not available or empty text", ""
-
-        try:
-            # Tokenize and generate correction
-            inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
-
-            with torch.no_grad():
-                outputs = self.model.generate(
-                    input_ids=inputs.input_ids,
-                    attention_mask=inputs.attention_mask,
-                    max_length=512,
-                    num_beams=4,
-                    early_stopping=True
-                )
-
-            corrected_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-            # Get GED predictions if available
-            error_spans = []
-            if self.ged_model:
-                error_spans = self.get_error_spans(text)
-
-            # Generate HTML output
-            html_output = self.generate_html_analysis(text, corrected_text, error_spans)
-
-            return corrected_text, html_output
-
-        except Exception as e:
-            return f"Error during analysis: {str(e)}", ""
-
-    def get_error_spans(self, text):
-        try:
-            inputs = self.ged_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
-
-            with torch.no_grad():
-                outputs = self.ged_model(**inputs)
-                predictions = torch.argmax(outputs.logits, dim=2)
-
-            tokens = self.ged_tokenizer.convert_ids_to_tokens(inputs.input_ids[0])
-            token_predictions = predictions[0].cpu().numpy().tolist()
-
-            error_spans = []
-            for i, (token, pred) in enumerate(zip(tokens, token_predictions)):
-                if token.startswith("##") or token in ["[CLS]", "[SEP]", "[PAD]"]:
-                    continue
-                if pred != 0:  # 0 is correct, 1=R, 2=M, 3=U
-                    error_type = ["C", "R", "M", "U"][pred]
-                    error_spans.append({"token": token, "type": error_type, "position": i})
-
-            return error_spans
-        except:
-            return []
-
-    def generate_html_analysis(self, original, corrected, error_spans):
-        html = f"""
-        <div style='font-family: Arial, sans-serif; line-height: 1.6; padding: 20px; border: 1px solid #ddd; border-radius: 8px; background-color: #f9f9f9;'>
-            <h3 style='color: #333; margin-top: 0;'>Grammar Analysis Results</h3>
-
-            <div style='margin: 15px 0;'>
-                <h4 style='color: #555;'>Original Text:</h4>
-                <p style='padding: 10px; background-color: #fff; border: 1px solid #ddd; border-radius: 4px;'>{original}</p>
-            </div>
-
-            <div style='margin: 15px 0;'>
-                <h4 style='color: #28a745;'>Corrected Text:</h4>
-                <p style='padding: 10px; background-color: #d4edda; border: 1px solid #c3e6cb; border-radius: 4px;'>{corrected}</p>
-            </div>
-
-            <div style='margin: 15px 0;'>
-                <h4 style='color: #333;'>Error Analysis:</h4>
-                <p style='color: #666;'>Found {len(error_spans)} potential errors</p>
-            </div>
-        </div>
-        """
-        return html
-
-# Initialize components
+# Initialize database and components
 init_database()
-
+print("Initializing enhanced grammar checker...")
+grammar_checker = HuggingFaceT5GEDInference()
+print("Grammar checker initialized successfully!")
 
 # Gradio Interface Functions
 def analyze_student_writing(text, student_name, task_title="General Writing Task"):
@@ -188,7 +393,7 @@ def analyze_student_writing(text, student_name, task_title="General Writing Task"):
     if not student_name.strip():
         return "Please enter your name.", ""
 
-    # Analyze text
+    # Analyze text with enhanced model
    corrected_text, html_analysis = grammar_checker.analyze_text(text)
 
     # Store in database
@@ -220,7 +425,7 @@ def analyze_student_writing(text, student_name, task_title="General Writing Task"):
     return corrected_text, html_analysis
 
 def create_exercise_from_text(text, exercise_title="Grammar Exercise"):
-    """Create an exercise from text with errors"""
+    """Create an exercise from text with errors using enhanced analysis"""
     if not text.strip():
         return "Please enter text to create an exercise.", ""
 
@@ -257,6 +462,7 @@ def create_exercise_from_text(text, exercise_title="Grammar Exercise"):
     exercise_html = f"""
     <div style='font-family: Arial, sans-serif; padding: 20px; border: 1px solid #ddd; border-radius: 8px;'>
         <h3>{exercise_title}</h3>
+        <p><strong>Exercise ID: {exercise_id}</strong></p>
         <p><strong>Instructions:</strong> Correct the grammatical errors in the following sentences:</p>
         <ol>
     """
@@ -266,10 +472,10 @@ def create_exercise_from_text(text, exercise_title="Grammar Exercise"):
 
     exercise_html += "</ol></div>"
 
-    return f"Exercise created with {len(exercise_sentences)} sentences!", exercise_html
+    return f"Exercise created with {len(exercise_sentences)} sentences! Exercise ID: {exercise_id}", exercise_html
 
 def attempt_exercise(exercise_id, student_responses, student_name):
-    """Submit exercise attempt and get score"""
+    """Submit exercise attempt and get score using enhanced analysis"""
     if not student_name.strip():
         return "Please enter your name.", ""
 
@@ -296,19 +502,22 @@ def attempt_exercise(exercise_id, student_responses, student_name):
     if len(responses) != len(exercise_sentences):
         return f"Please provide exactly {len(exercise_sentences)} responses (one per line).", ""
 
-    # Calculate score
+    # Calculate score using enhanced analysis
     correct_count = 0
     feedback = []
 
     for i, (sentence_data, response) in enumerate(zip(exercise_sentences, responses), 1):
         correct_answer = sentence_data['corrected']
-
+
+        # Use the model to check if the response is correct
+        response_corrected, _ = grammar_checker.analyze_text(response)
+        is_correct = response_corrected.strip() == response.strip()  # No further corrections needed
 
         if is_correct:
             correct_count += 1
-            feedback.append(f"✅ Sentence {i}:
+            feedback.append(f"✅ Sentence {i}: Excellent! No errors detected.")
         else:
-            feedback.append(f"❌ Sentence {i}: Your answer: '{response}' |
+            feedback.append(f"❌ Sentence {i}: Your answer: '{response}' | Suggested improvement: '{response_corrected}' | Expected: '{correct_answer}'")
 
     score = (correct_count / len(exercise_sentences)) * 100
 
@@ -386,9 +595,10 @@ def get_student_progress(student_name):
    return progress_html
 
 # Create Gradio Interface
-with gr.Blocks(title="Language Learning App - Grammar Checker", theme=gr.themes.Soft()) as app:
+with gr.Blocks(title="Language Learning App - Enhanced Grammar Checker", theme=gr.themes.Soft()) as app:
     gr.Markdown("# 📚 Language Learning Application")
     gr.Markdown("### AI-Powered Grammar Checking and Exercise Generation")
+    gr.Markdown("*Now featuring advanced T5-GED neural network with enhanced error detection*")
 
     with gr.Tabs():
         # Student Writing Analysis Tab
@@ -491,7 +701,7 @@ with gr.Blocks(title="Language Learning App - Grammar Checker", theme=gr.themes.Soft()) as app:
     3. **Exercise Practice**: Students can practice with generated exercises and get scored feedback
     4. **Progress Tracking**: View student progress across submissions and exercises
 
-    *Powered by advanced neural networks for grammar error detection and correction*
+    *Powered by advanced T5-GED neural networks for enhanced grammar error detection and correction*
     """)
 
 if __name__ == "__main__":
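This commit swaps the plain two-model SimpleGrammarChecker for the gated two-encoder HuggingFaceT5GEDInference. For trying the new inference path outside Gradio, a minimal smoke-test sketch follows; it assumes the class definition from this commit is in scope and that torch, transformers, and huggingface_hub are installed. The sample sentence is illustrative, not from the Space.

# Hypothetical smoke test for the inference class added in this commit
# (assumes HuggingFaceT5GEDInference from app.py is in scope).
checker = HuggingFaceT5GEDInference()   # downloads both models on first run

sample = "She go to school every days."           # illustrative input
corrected, html_report = checker.analyze_text(sample)

print(corrected)                                  # corrected sentence from the T5 decoder
print(checker._get_error_spans(sample))           # simplified token-level error spans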
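The core of the change is the gating step in _forward_with_ged: the ELECTRA-derived GED tag string gets its own T5 encoder, and a learned gate mixes the two encoder states token by token, fused = g * h_src + (1 - g) * h_ged with g = sigmoid(W[h_src; h_ged]). A self-contained toy sketch of that fusion, with random tensors standing in for real encoder outputs and illustrative shapes:

import torch
import torch.nn as nn

# Toy illustration of the gated fusion used in _forward_with_ged.
batch, seq_len, d_model = 2, 7, 512                # illustrative shapes
src_hidden = torch.randn(batch, seq_len, d_model)  # stands in for the source encoder states
ged_hidden = torch.randn(batch, seq_len, d_model)  # stands in for the GED encoder states

gate = nn.Linear(2 * d_model, 1)                   # same shape as self.gate in the commit
g = torch.sigmoid(gate(torch.cat([src_hidden, ged_hidden], dim=2)))  # (batch, seq_len, 1)

fused = g * src_hidden + (1 - g) * ged_hidden      # broadcast over the hidden dimension
print(fused.shape)                                 # torch.Size([2, 7, 512])

In the commit itself, both sequences are first truncated to min_len, since the source text and its GED tag string tokenize to different lengths.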