import gradio as gr
from transformers import pipeline
import spacy
import subprocess
import sys
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from spellchecker import SpellChecker
import re
import string
import random
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
# Initialize stopwords
stop_words = set(stopwords.words("english"))
# Words we don't want to replace
exclude_tags = {'PRP', 'PRP$', 'MD', 'VBZ', 'VBP', 'VBD', 'VBG', 'VBN', 'TO', 'IN', 'DT', 'CC'}
exclude_words = {'is', 'am', 'are', 'was', 'were', 'have', 'has', 'do', 'does', 'did', 'will', 'shall', 'should', 'would', 'could', 'can', 'may', 'might'}
# Initialize the English text classification pipeline for AI detection
pipeline_en = pipeline(task="text-classification", model="Hello-SimpleAI/chatgpt-detector-roberta")
# Initialize the spell checker
spell = SpellChecker()
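# Note: pipeline_en and spell are initialized here but are not wired into the
# Gradio interface below; they are left in place for AI-detection and
# spell-checking extensions.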
# Ensure the SpaCy model is installed
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Download the model with the current interpreter, then retry loading
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
    nlp = spacy.load("en_core_web_sm")
def get_synonyms(word):
    """Return single-word, purely alphabetic WordNet synonyms of the given word."""
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            name = lemma.name()
            if "_" not in name and name.isalpha() and name.lower() != word.lower():
                synonyms.add(name)
    return synonyms
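# Example (exact results depend on the installed WordNet corpus):
#   get_synonyms("happy") -> {'felicitous', 'glad', 'well-chosen'}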
def replace_with_synonyms(word, pos_tag):
    """Replace a word with a random synonym that has the same POS tag."""
    synonyms = get_synonyms(word)
    # Filter by POS tag
    filtered_synonyms = [syn for syn in synonyms if nltk.pos_tag([syn])[0][1] == pos_tag]
    if filtered_synonyms:
        return random.choice(filtered_synonyms)
    return word
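# Tagging a candidate synonym in isolation with nltk.pos_tag is only a heuristic:
# without sentence context the tagger can mislabel a word, so occasional
# mismatched replacements may slip through.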
def improve_paraphrasing_and_grammar(text):
    """Paraphrase and correct grammatical errors in the text."""
    doc = nlp(text)
    corrected_text = []
    for sent in doc.sents:
        sentence = []
        for token in sent:
            # Replace words with synonyms, skipping excluded POS tags, function words, and punctuation
            if token.tag_ not in exclude_tags and token.text.lower() not in exclude_words and token.text not in string.punctuation:
                synonym = replace_with_synonyms(token.text, token.tag_)
                sentence.append(synonym if synonym else token.text)
            else:
                sentence.append(token.text)
        corrected_text.append(' '.join(sentence))
    # Rejoin sentences, then fix possessives, punctuation spacing, capitalization, and articles
    final_text = ' '.join(corrected_text)
    final_text = fix_possessives(final_text)
    final_text = fix_punctuation_spacing(final_text)
    final_text = capitalize_sentences(final_text)
    final_text = fix_article_errors(final_text)
    return final_text
def fix_punctuation_spacing(text):
    """Remove stray spaces before punctuation marks."""
    return re.sub(r'\s+([,.!?])', r'\1', text)
def fix_possessives(text):
    """Collapse tokenized possessives, e.g. "John ' s" -> "John's"."""
    # The trailing \b prevents matching into a following word that starts with 's'
    return re.sub(r"(\w)\s?'\s?s\b", r"\1's", text)
def capitalize_sentences(text):
    """Capitalize the first letter of each sentence without lowercasing the rest."""
    sentences = re.split(r'(?<=\w[.!?])\s+', text)
    # Join with a plain space: the split keeps each sentence's own end punctuation
    return ' '.join(s[0].upper() + s[1:] if s else s for s in sentences)
def fix_article_errors(text):
    """Correct 'a'/'an' usage based on the first letter of the following word."""
    doc = nlp(text)
    corrected = []
    for token in doc:
        # Guard with token.i + 1 < len(doc) so nbor(1) cannot raise at the last token
        if token.text in ('a', 'an') and token.i + 1 < len(doc):
            next_token = token.nbor(1)
            if token.text == "a" and next_token.text[0].lower() in "aeiou":
                corrected.append("an")
            elif token.text == "an" and next_token.text[0].lower() not in "aeiou":
                corrected.append("a")
            else:
                corrected.append(token.text)
        else:
            corrected.append(token.text)
    return ' '.join(corrected)
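# Note: this check is spelling-based, not sound-based, so exceptions such as
# "an hour" or "a university" are not handled correctly.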
# Gradio app setup
def gradio_interface(text):
    """Gradio callback: paraphrase and correct the input text."""
    return improve_paraphrasing_and_grammar(text)
with gr.Blocks() as demo:
    gr.Markdown("## Text Paraphrasing and Grammar Correction")
    text_input = gr.Textbox(lines=10, label='Enter text for paraphrasing and grammar correction')
    text_output = gr.Textbox(lines=10, label='Corrected Text', interactive=False)
    submit_button = gr.Button("Paraphrase and Correct")
    submit_button.click(fn=gradio_interface, inputs=text_input, outputs=text_output)
# Launch the Gradio app
demo.launch(share=True)
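# share=True additionally requests a temporary public gradio.live URL
# alongside the local server.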