import os
import gradio as gr
from transformers import pipeline
import spacy
import subprocess
import sys
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker
import re
import string
import random

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

# Initialize stopwords (currently unused by the pipeline below)
stop_words = set(stopwords.words("english"))

# POS tags and function words we don't want to replace
exclude_tags = {'PRP', 'PRP$', 'MD', 'VBZ', 'VBP', 'VBD', 'VBG', 'VBN', 'TO', 'IN', 'DT', 'CC'}
exclude_words = {'is', 'am', 'are', 'was', 'were', 'have', 'has', 'do', 'does', 'did', 'will', 'shall', 'should', 'would', 'could', 'can', 'may', 'might'}

# Initialize the English text-classification pipeline for AI detection
# (loaded here but not currently wired into the Gradio interface below)
pipeline_en = pipeline(task="text-classification", model="Hello-SimpleAI/chatgpt-detector-roberta")
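
# Illustrative call (label names come from the model card, e.g. 'Human'/'ChatGPT'):
#   pipeline_en("Some passage...")  # -> [{'label': ..., 'score': ...}]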

# Initialize the spell checker (loaded but not currently used by the interface)
spell = SpellChecker()

# Ensure the SpaCy model is installed, then load it
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Download with the same interpreter running this script; fail loudly on error
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
    nlp = spacy.load("en_core_web_sm")


def get_synonyms(word):
    """Find single-word synonyms for a given word via WordNet."""
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            name = lemma.name()
            # Skip multi-word lemmas, non-alphabetic entries, and the word itself
            if name.isalpha() and name.lower() != word.lower():
                synonyms.add(name)
    return synonyms
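
# Illustrative (depends on the installed WordNet data):
#   get_synonyms("quick") might return {"fast", "speedy", "agile", ...}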

def replace_with_synonyms(word, pos_tag):
    """Replace a word with a random synonym that keeps the original POS tag."""
    synonyms = get_synonyms(word)
    # Tagging a synonym in isolation is a heuristic: pos_tag sees no context,
    # so the filter is approximate but keeps most replacements grammatical
    filtered_synonyms = [syn for syn in synonyms if nltk.pos_tag([syn])[0][1] == pos_tag]
    if filtered_synonyms:
        return random.choice(filtered_synonyms)
    return word
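
# Illustrative (random choice, so results vary):
#   replace_with_synonyms("big", "JJ") might return "large" or "huge"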

def improve_paraphrasing_and_grammar(text):
    """Paraphrase the text, then clean up possessives, articles, spacing, and capitalization."""
    doc = nlp(text)
    corrected_text = []

    for sent in doc.sents:
        sentence = []
        for token in sent:
            # Replace words with synonyms, excluding function words and punctuation
            if token.tag_ not in exclude_tags and token.text.lower() not in exclude_words and token.text not in string.punctuation:
                sentence.append(replace_with_synonyms(token.text, token.tag_))
            else:
                sentence.append(token.text)

        corrected_text.append(' '.join(sentence))

    # Clean up the rejoined text; article fixing rejoins tokens with spaces,
    # so punctuation spacing must be repaired afterwards
    final_text = ' '.join(corrected_text)
    final_text = fix_possessives(final_text)
    final_text = fix_article_errors(final_text)
    final_text = fix_punctuation_spacing(final_text)
    final_text = capitalize_sentences(final_text)

    return final_text
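
# Illustrative usage (output varies because synonyms are sampled at random):
#   improve_paraphrasing_and_grammar("the quick fox jumped over a old fence .")
#   might yield "The speedy fox jumped over an old fence."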

def fix_punctuation_spacing(text):
    """Fix spaces before punctuation marks."""
    text = re.sub(r'\s+([,.!?])', r'\1', text)
    return text
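
# Illustrative: fix_punctuation_spacing("Hello , world !") -> "Hello, world!"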

def fix_possessives(text):
    """Correct possessives like 'John ' s' -> 'John's'."""
    return re.sub(r"(\w)\s?'\s?s", r"\1's", text)
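
# Illustrative: fix_possessives("John ' s dog") -> "John's dog"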

def capitalize_sentences(text):
    """Capitalize the first letter of each sentence without lowercasing the rest."""
    sentences = re.split(r'(?<=\w[.!?])\s+', text)
    # Join with a plain space: the split keeps each sentence's own punctuation,
    # so joining with '. ' would insert stray periods
    return ' '.join(s[0].upper() + s[1:] if s else s for s in sentences)

def fix_article_errors(text):
    """Correct 'a'/'an' usage based on the first letter of the following word."""
    doc = nlp(text)
    corrected = []
    for token in doc:
        # Guard the lookahead: nbor(1) raises IndexError on the final token
        if token.text in ('a', 'an') and token.i + 1 < len(doc):
            next_token = token.nbor(1)
            if token.text == "a" and next_token.text[0].lower() in "aeiou":
                corrected.append("an")
            elif token.text == "an" and next_token.text[0].lower() not in "aeiou":
                corrected.append("a")
            else:
                corrected.append(token.text)
        else:
            corrected.append(token.text)
    return ' '.join(corrected)
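
# Illustrative: fix_article_errors("a apple and an pear") -> "an apple and a pear"
# (letter-based heuristic, so exceptions like "an hour" or "a university" are missed)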

# Gradio app setup
def gradio_interface(text):
    """Gradio interface function to process the input text."""
    return improve_paraphrasing_and_grammar(text)

with gr.Blocks() as demo:
    gr.Markdown("## Text Paraphrasing and Grammar Correction")
    text_input = gr.Textbox(lines=10, label='Enter text for paraphrasing and grammar correction')
    text_output = gr.Textbox(lines=10, label='Corrected Text', interactive=False)
    submit_button = gr.Button("🔄 Paraphrase and Correct")
    
    submit_button.click(fn=gradio_interface, inputs=text_input, outputs=text_output)

# Launch the Gradio app
demo.launch(share=True)