Spaces:

bashyaldhiraj2067
/

xpu_5_nepali_text

Sleeping

App Files Files Community

bashyaldhiraj2067 committed on Mar 18

Commit

ba5f14a

verified ·

1 Parent(s): 8579004

Upload 2 files

Browse files

Files changed (2) hide show

app.py +71 -0
requirements.txt +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import gradio as gr
+import torch
+from transformers import AutoModelForSeq2SeqLM
+from huggingface_hub import InferenceClient
# Character-level vocabulary.
# Ids are purely positional: the four special tokens take ids 0-3 and every
# entry of `nepali_chars` follows in order, so nothing here may be reordered.
# NOTE(review): presumably this table has to match the vocabulary the
# checkpoint was trained with -- confirm before editing the character string.
special_tokens = ["<pad>", "<s>", "</s>", "<unk>"]
nepali_chars = list("अआइईउऊऋॠऌॡऎएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह्ािीुूृॄेैोौंंःँ।०१२३४५६७८९,.;?!़ॅंःॊॅऒऽॉड़ॐ॥ऑऱफ़ढ़")
char_vocab = special_tokens + nepali_chars
# A character listed more than once keeps the index of its LAST occurrence
# (later pairs overwrite earlier ones), which leaves the earlier indices
# without an entry in the reverse table.
char2id = dict(zip(char_vocab, range(len(char_vocab))))
id2char = dict(zip(char2id.values(), char2id.keys()))
class CharTokenizer:
    """Character-level tokenizer backed by explicit lookup tables.

    Args:
        char2id: Mapping from a single character (or special-token string)
            to its integer id. Must contain the key "<unk>".
        id2char: Reverse mapping from integer id to character / special
            token string.
    """

    # Control markers that must never leak into user-facing text.
    _SILENT_TOKENS = frozenset({"<pad>", "<s>", "</s>"})

    def __init__(self, char2id, id2char):
        self.char2id = char2id
        self.id2char = id2char

    def encode(self, text):
        """Return one id per character of `text`.

        Characters missing from `char2id` are mapped to the "<unk>" id.
        """
        return [self.char2id.get(char, self.char2id["<unk>"]) for char in text]

    def decode(self, tokens):
        """Return the raw concatenation of the symbols for `tokens`.

        Nothing is filtered: special tokens appear verbatim and ids missing
        from `id2char` are rendered as the literal "<unk>".
        """
        return "".join([self.id2char.get(token, "<unk>") for token in tokens])

    def decodex(self, tokens):
        """Return display text for `tokens`, hiding all control markers.

        * "<unk>" (or any id missing from `id2char`) becomes a single space,
          except at the first or last position or directly after another
          "<unk>", where it is dropped. Runs of unknowns therefore collapse
          to one space and the text never starts with a space from them.
        * "<pad>", "<s>" and "</s>" are dropped. Previously only "<pad>"
          was dropped, so the other two ended up as literal text in the
          result.
        * Every other symbol is emitted unchanged.
        """
        pieces = []
        last_index = len(tokens) - 1
        previous_was_unk = False
        for index, token in enumerate(tokens):
            symbol = self.id2char.get(token, "<unk>")
            is_unk = symbol == "<unk>"
            if is_unk:
                if 0 < index < last_index and not previous_was_unk:
                    pieces.append(" ")
            elif symbol not in self._SILENT_TOKENS:
                pieces.append(symbol)
            # Tracks the raw previous symbol, dropped or not, exactly like
            # looking up tokens[index - 1] on the next iteration.
            previous_was_unk = is_unk
        return "".join(pieces)
# Seq2seq checkpoint, fetched from the Hugging Face Hub by repository id.
model_name = "bashyaldhiraj2067/t5_char_nepali"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Module-wide tokenizer built on the character tables defined above.
tokenizer = CharTokenizer(char2id, id2char)
def correct_text(input_text, max_length=256):
    """Run the seq2seq model on `input_text` and return the decoded output.

    Args:
        input_text: Text to correct. It is encoded one character at a time
            with the module-level `tokenizer`.
        max_length: Forwarded to `model.generate` as `max_length`. Coerced
            to `int` so a float coming from a UI widget is accepted.

    Returns:
        The generated ids decoded with `tokenizer.decodex`, or "" when
        `input_text` is empty.
    """
    # An empty string would encode to [] and produce a float tensor with
    # zero columns, which is not a usable batch of token ids.
    if not input_text:
        return ""

    input_ids = tokenizer.encode(input_text)
    # Pin the dtype instead of relying on inference from the list contents.
    input_tensor = torch.tensor([input_ids], dtype=torch.long)
    with torch.no_grad():
        outputs = model.generate(
            input_tensor,
            max_length=int(max_length),
            return_dict_in_generate=True,
        )
    # Batch of one: take the only generated sequence.
    generated_tokens = outputs.sequences[0].tolist()
    return tokenizer.decodex(generated_tokens)
# Web UI: a text box and a max-length slider feed `correct_text`; the
# returned string is shown in a second text box.
# NOTE(review): the slider is created without an explicit `value`, so its
# initial position presumably differs from `correct_text`'s default of 256
# -- confirm the intended default.
demo = gr.Interface(
    title="Nepali Text Correction",
    description="Enter text with errors and get corrected output using a T5 model trained on Nepali text.",
    fn=correct_text,
    inputs=[
        gr.Textbox(label="Enter Nepali Text"),
        gr.Slider(minimum=50, maximum=256, step=10, label="Max Length"),
    ],
    outputs=gr.Textbox(label="Corrected Text"),
)

# Start the app only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+huggingface_hub==0.25.2
+torch
+transformers