File size: 3,276 Bytes
288b4d9
 
 
e2b6396
288b4d9
 
 
 
 
 
3ab7e14
 
 
 
 
 
 
 
 
e2b6396
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288b4d9
e2b6396
 
 
3ab7e14
 
 
 
 
288b4d9
 
 
3ab7e14
 
 
 
 
 
 
 
 
288b4d9
 
3ab7e14
 
 
 
288b4d9
e2b6396
 
 
 
288b4d9
 
 
 
 
 
 
e2b6396
 
 
 
 
 
288b4d9
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import re

# Load the model and tokenizer.
# NOTE: from_pretrained downloads weights from the Hugging Face Hub on first
# run (network side effect); subsequent runs use the local cache.
model_name = 'abinayam/gpt-2-tamil'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# System prompt: prepended to every request in correct_text, which appends
# "Input: ... Corrected:" and treats the model's continuation after the
# final "Corrected:" marker as the answer.
system_prompt = """You are an expert Tamil language model specializing in spelling and grammar correction. Your task is to:
1. Correct any spelling errors in the given text.
2. Fix grammatical mistakes, including proper application of sandhi rules.
3. Ensure the corrected text maintains the original meaning and context.
4. Provide the corrected version of the entire input text.

Remember to preserve the structure and intent of the original text while making necessary corrections."""

# Common error corrections: maps misspelling -> correct form. Applied as plain
# substring replacements in preprocess_text before the text reaches the model.
common_errors = {
    'பழங்கல்': 'பழங்கள்',  # common ல்/ள் confusion ("fruits")
    # Add more common spelling errors here
}

def apply_sandhi_rules(text):
    """Apply dative-suffix sandhi: insert the doubled hard consonant.

    After the dative suffix 'கு'/'க்கு', Tamil sandhi doubles the *following*
    vallinam (hard) consonant — க, ச, த or ப — e.g.:
        'கடைக்கு போனேன்'    -> 'கடைக்குப் போனேன்'
        'பள்ளிக்கு சென்றான்' -> 'பள்ளிக்குச் சென்றான்'

    Bug fix: the replacement previously hard-coded 'ப்' regardless of the
    following consonant; the inserted consonant must match it (consonant +
    pulli '\u0bcd').
    """
    text = re.sub(
        r'(கு|க்கு)\s+([பதகச])',
        lambda m: f"{m.group(1)}{m.group(2)}\u0bcd {m.group(2)}",
        text,
    )
    # Add more sandhi rules as needed
    return text

def preprocess_text(text):
    """Normalize the input by substituting each known misspelling.

    Uses the module-level ``common_errors`` mapping (wrong form -> correct
    form) and performs plain substring replacement.
    """
    corrected = text
    for wrong, right in common_errors.items():
        corrected = corrected.replace(wrong, right)
    return corrected

def postprocess_text(text):
    """Run rule-based sandhi fixes over the model's output and return it."""
    return apply_sandhi_rules(text)

def correct_text(input_text):
    """Correct spelling and grammar of Tamil ``input_text``.

    Pipeline: dictionary-based spelling fixes (preprocess_text) -> prompted
    generation with the GPT-2 model -> rule-based sandhi post-processing
    (postprocess_text). Returns the corrected text as a string.

    Note: sampling is enabled (do_sample=True), so output is non-deterministic.
    """
    # Fix known misspellings before the model sees the text.
    preprocessed_text = preprocess_text(input_text)

    # Build the prompt; the model's continuation after "Corrected:" is the answer.
    full_prompt = f"{system_prompt}\n\nInput: {preprocessed_text}\n\nCorrected:"

    # Tokenize the full prompt
    input_ids = tokenizer.encode(full_prompt, return_tensors='pt')

    # Generate corrected text (inference only — no gradients needed).
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_new_tokens=100,  # budget for the answer only, independent of prompt length
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token; silences generate() warning
        )

    # Decode the generated text (prompt + continuation).
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Keep only the continuation after the last "Corrected:" marker.
    corrected_text = generated_text.split("Corrected:")[-1].strip()

    # Apply sandhi rules to the model output.
    final_text = postprocess_text(corrected_text)

    return final_text

# Create the Gradio interface: a single textbox in, corrected text out,
# delegating all work to correct_text.
iface = gr.Interface(
    fn=correct_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter Tamil text here..."),
    outputs=gr.Textbox(label="Corrected Text"),
    title="Tamil Spell Corrector and Grammar Checker",
    description="This app uses the 'abinayam/gpt-2-tamil' model along with custom rules to correct spelling and grammar in Tamil text.",
    examples=[
        # Each example exercises a known error class: the misspelling in
        # common_errors, and missing dative-suffix sandhi.
        ["நான் நேற்று கடைக்கு போனேன். அங்கே நிறைய பழங்கல் வாங்கினேன்."],
        ["நான் பள்ளிகு செல்கிறேன்."],
        ["அவன் வீட்டுகு வந்தான்."]
    ]
)

# Launch the app (starts the local web server and blocks).
iface.launch()