Spaces:

Diezu
/

Test_gradio

Sleeping

App Files Files Community

Diezu committed on Feb 6

Commit

9de1a48

verified ·

1 Parent(s): 5db5590

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -19

app.py CHANGED Viewed

@@ -1,38 +1,50 @@
 import gradio as gr
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 def load_model(checkpoint_path):
-    # Load tokenizer và model từ Hugging Face Hub
     tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
     return model, tokenizer
 def correct_spelling(text):
-    # Thêm tiền tố nếu model yêu cầu (tuỳ vào cách model được huấn luyện)
-    input_text = text  # Thay đổi nếu cần
-    # Tokenize và generate
-    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
-    outputs = model.generate(
-        **inputs,
-        max_length=512,
-        num_beams=5,
-        early_stopping=True
-    )
     corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
     return corrected_text
-# Đường dẫn đến model của bạn trên Hugging Face Hub
-checkpoint_path = "Diezu/Batpho_v2"  # Thay bằng tên repository của bạn
 model, tokenizer = load_model(checkpoint_path)
-# Tạo giao diện Gradio
 demo = gr.Interface(
-    fn=correct_spelling,
     inputs=gr.Textbox(placeholder="Nhập văn bản có lỗi chính tả..."),
-    outputs=gr.Textbox(label="Văn bản đã sửa"),
     title="Demo Sửa Lỗi Chính Tả",
-    description="Sử dụng mô hình của bạn để sửa lỗi chính tả tiếng Việt."
 )
-demo.launch()

 import gradio as gr
+import difflib
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 def load_model(checkpoint_path):
     """Load a seq2seq model and its tokenizer from one checkpoint.

     Args:
         checkpoint_path: Identifier passed unchanged to both
             ``from_pretrained`` calls.

     Returns:
         A ``(model, tokenizer)`` tuple, in that order.
     """
     # The tokenizer is loaded first, then the model, as in the original.
     tok = AutoTokenizer.from_pretrained(checkpoint_path)
     seq2seq = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
     return seq2seq, tok
 def correct_spelling(text):
     """Run the module-level model on *text* and return the decoded output.

     The input is tokenized with truncation at 512 tokens. Generation uses
     beam search with 5 beams, early stopping and a maximum length of 512.
     Only the first returned sequence is decoded, with special tokens
     stripped. Relies on the module-level ``tokenizer`` and ``model``.

     Args:
         text: The text to feed to the model.

     Returns:
         The decoded text of the first generated sequence.
     """
     encoded = tokenizer(
         text,
         return_tensors="pt",
         truncation=True,
         max_length=512,
     )
     generated = model.generate(
         **encoded,
         num_beams=5,
         early_stopping=True,
         max_length=512,
     )
     return tokenizer.decode(generated[0], skip_special_tokens=True)
def find_spelling_errors(s1, s2):
    """Return an HTML fragment marking the word-level differences s1 -> s2.

    Both strings are split on whitespace and compared with
    ``difflib.SequenceMatcher``. Each opcode becomes one chunk:

    * ``replace`` -> the words from *s1*, red and underlined
    * ``delete``  -> the words from *s1*, orange with a line through them
    * ``insert``  -> the words from *s2*, green and bold
    * ``equal``   -> the words from *s1*, unstyled

    Every chunk is HTML-escaped before it is wrapped, because the result is
    rendered as HTML and both inputs are untrusted (raw user text and model
    output). Without escaping, ``<``, ``>`` or ``&`` in the text would break
    the markup or inject arbitrary HTML.

    Args:
        s1: The original text.
        s2: The corrected text.

    Returns:
        The chunks joined by single spaces; an empty string when both inputs
        contain no words.
    """
    # Function-scope import so the block does not depend on the file's
    # import header being edited.
    from html import escape

    span_styles = {
        "replace": "color: red; text-decoration: underline;",
        "delete": "color: orange; text-decoration: line-through;",
        "insert": "color: green; font-weight: bold;",
    }

    a = s1.split()
    b = s2.split()
    # autojunk=False: the default heuristic treats frequent words as junk
    # once a sequence reaches 200 items, which distorts word-level diffs of
    # longer texts.
    matcher = difflib.SequenceMatcher(None, a, b, autojunk=False)

    fragments = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        # Inserted words exist only in the corrected text; every other
        # opcode is displayed using the words of the original text.
        words = b[j1:j2] if tag == "insert" else a[i1:i2]
        chunk = escape(" ".join(words))
        style = span_styles.get(tag)
        if style is None:
            fragments.append(chunk)
        else:
            fragments.append(f'<span style="{style}">{chunk}</span>')
    return " ".join(fragments)
def process_text(text):
    """Correct *text* and build the highlighted diff against the original.

    Blank input (``None``, empty, or whitespace-only) short-circuits to two
    empty strings. Previously it was sent through generation and whatever
    the model produced was presented as a correction.

    Args:
        text: The raw text entered by the user.

    Returns:
        A ``(highlighted_html, corrected_text)`` tuple.
    """
    if not text or not text.strip():
        return "", ""
    corrected_text = correct_spelling(text)
    return find_spelling_errors(text, corrected_text), corrected_text
# Checkpoint identifier handed to load_model; model and tokenizer are
# module-level so the request handlers can reach them.
checkpoint_path = "Diezu/Batpho_v2"
model, tokenizer = load_model(checkpoint_path)

# One text input; the two outputs follow process_text's return order
# (highlighted HTML first, corrected text second).
demo = gr.Interface(
    title="Demo Sửa Lỗi Chính Tả",
    description="Sử dụng mô hình để sửa lỗi chính tả. Từ bị thay thế sẽ được gạch chân đỏ, từ bị xóa có màu cam gạch ngang, từ được thêm có màu xanh đậm.",
    fn=process_text,
    inputs=gr.Textbox(placeholder="Nhập văn bản có lỗi chính tả..."),
    outputs=[
        gr.HTML(label="Phát hiện lỗi"),
        gr.Textbox(label="Văn bản đã sửa"),
    ],
)
demo.launch()