Update app.py
app.py
CHANGED
@@ -12,14 +12,14 @@ import re
 import string
 import random
 
-# Download necessary NLTK data
 nltk.download('punkt')
-nltk.download('punkt_tab')
 nltk.download('stopwords')
 nltk.download('averaged_perceptron_tagger')
 nltk.download('averaged_perceptron_tagger_eng')
-
 nltk.download('wordnet')
+nltk.download('omw-1.4')
+nltk.download('punkt_tab')
+
 
 # Initialize stopwords
 stop_words = set(stopwords.words("english"))
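As an aside, these nltk.download calls run unconditionally on every startup. A guarded variant (a common NLTK pattern, not what app.py currently does; resource names taken from the hunk above) would look like:

import nltk

# Fetch each corpus only if it is not already present locally
for resource, path in [
    ("punkt", "tokenizers/punkt"),
    ("punkt_tab", "tokenizers/punkt_tab"),
    ("stopwords", "corpora/stopwords"),
    ("averaged_perceptron_tagger", "taggers/averaged_perceptron_tagger"),
    ("wordnet", "corpora/wordnet"),
    ("omw-1.4", "corpora/omw-1.4"),
]:
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(resource)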
@@ -41,25 +41,37 @@ except OSError:
     subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
     nlp = spacy.load("en_core_web_sm")
 
+# Filter out overly formal or archaic words
+def is_formal_or_rare(word):
+    formal_words = {"homo", "satellite", "futurity", "contemporaries"}
+    return word in formal_words
+
+# Adjust synonym replacement logic
 def plagiarism_removal(text):
     def plagiarism_remover(word):
         if word.lower() in stop_words or word.lower() in exclude_words or word in string.punctuation:
             return word
-
+
         # Find synonyms
         synonyms = set()
         for syn in wordnet.synsets(word):
             for lemma in syn.lemmas():
-
-
+                synonym = lemma.name()
+                if "_" not in synonym and synonym.isalpha() and synonym.lower() != word.lower():
+                    synonyms.add(synonym)
 
         pos_tag_word = nltk.pos_tag([word])[0]
 
+        # Avoid replacing words based on certain POS tags
         if pos_tag_word[1] in exclude_tags:
            return word
-
+
+        # Filter synonyms to match the same part of speech
         filtered_synonyms = [syn for syn in synonyms if nltk.pos_tag([syn])[0][1] == pos_tag_word[1]]
 
+        # Avoid formal/rare words or return the original word if no good synonym is found
+        filtered_synonyms = [syn for syn in filtered_synonyms if not is_formal_or_rare(syn)]
+
         if not filtered_synonyms:
             return word
 
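For context, the synonym logic added in this hunk can be exercised on its own. A minimal sketch (candidate_synonyms is an illustrative name, not a function in app.py; assumes the wordnet, omw-1.4 and tagger data from the first hunk are available):

import nltk
from nltk.corpus import wordnet

def candidate_synonyms(word):
    # Single-word, alphabetic lemma names that differ from the input word
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            name = lemma.name()
            if "_" not in name and name.isalpha() and name.lower() != word.lower():
                synonyms.add(name)
    # Keep only candidates whose POS tag matches the original word's tag
    target_tag = nltk.pos_tag([word])[0][1]
    return [s for s in synonyms if nltk.pos_tag([s])[0][1] == target_tag]

print(candidate_synonyms("happy"))  # e.g. ['felicitous', 'glad'], depending on WordNet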
@@ -69,18 +81,21 @@ def plagiarism_removal(text):
             return synonym_choice.title()
         return synonym_choice
 
+    # Tokenize and process the text
     para_split = word_tokenize(text)
     final_text = [plagiarism_remover(word) for word in para_split]
 
+    # Fix spacing issues after token replacement
     corrected_text = []
     for i in range(len(final_text)):
         if final_text[i] in string.punctuation and i > 0:
-            corrected_text[-1] += final_text[i]
+            corrected_text[-1] += final_text[i]  # Attach punctuation to the previous word
         else:
             corrected_text.append(final_text[i])
 
     return " ".join(corrected_text)
 
+# Other auxiliary functions remain unchanged
 def predict_en(text):
     res = pipeline_en(text)[0]
     return res['label'], res['score']
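The spacing fix in this hunk undoes what word_tokenize does to punctuation. The same loop, as a standalone sketch:

import string
from nltk.tokenize import word_tokenize

tokens = word_tokenize("Hello , world !")   # stand-in for the replaced tokens
corrected = []
for i, tok in enumerate(tokens):
    if tok in string.punctuation and i > 0:
        corrected[-1] += tok                # attach punctuation to the previous word
    else:
        corrected.append(tok)
print(" ".join(corrected))                  # -> "Hello, world!"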
@@ -126,69 +141,9 @@ def capitalize_sentences_and_nouns(text):
 
     return ' '.join(corrected_text)
 
-
-    sentences = re.split(r'(?<=\w[.!?])\s+', text)
-    capitalized_sentences = []
-
-    for sentence in sentences:
-        if sentence:
-            capitalized_sentence = sentence[0].capitalize() + sentence[1:]
-            if not re.search(r'[.!?]$', capitalized_sentence):
-                capitalized_sentence += '.'
-            capitalized_sentences.append(capitalized_sentence)
-
-    return " ".join(capitalized_sentences)
-
-def correct_tense_errors(text):
-    doc = nlp(text)
-    corrected_text = []
-    for token in doc:
-        if token.pos_ == "VERB" and token.dep_ in {"aux", "auxpass"}:
-            lemma = wordnet.morphy(token.text, wordnet.VERB) or token.text
-            corrected_text.append(lemma)
-        else:
-            corrected_text.append(token.text)
-    return ' '.join(corrected_text)
-
-def correct_article_errors(text):
-    doc = nlp(text)
-    corrected_text = []
-    for token in doc:
-        if token.text in ['a', 'an']:
-            next_token = token.nbor(1)
-            if token.text == "a" and next_token.text[0].lower() in "aeiou":
-                corrected_text.append("an")
-            elif token.text == "an" and next_token.text[0].lower() not in "aeiou":
-                corrected_text.append("a")
-            else:
-                corrected_text.append(token.text)
-        else:
-            corrected_text.append(token.text)
-    return ' '.join(corrected_text)
-
-def ensure_subject_verb_agreement(text):
-    doc = nlp(text)
-    corrected_text = []
-    for token in doc:
-        if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
-            if token.tag_ == "NN" and token.head.tag_ != "VBZ":
-                corrected_text.append(token.head.lemma_ + "s")
-            elif token.tag_ == "NNS" and token.head.tag_ == "VBZ":
-                corrected_text.append(token.head.lemma_)
-        corrected_text.append(token.text)
-    return ' '.join(corrected_text)
-
-def correct_spelling(text):
-    words = text.split()
-    corrected_words = []
-    for word in words:
-        corrected_word = spell.correction(word)
-        if corrected_word is not None:
-            corrected_words.append(corrected_word)
-        else:
-            corrected_words.append(word)
-    return ' '.join(corrected_words)
+# Continue the other auxiliary functions for article errors, spelling correction, etc.
 
+# Main paraphrasing and correction function
 def paraphrase_and_correct(text):
     paragraphs = text.split("\n\n")  # Split by paragraphs
 
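As background, the correct_article_errors helper visible in the removed lines swaps "a"/"an" based on the next token's first letter. A self-contained sketch of that rule (assumes en_core_web_sm is installed; the bounds check stands in for token.nbor(1), which fails on the last token):

import spacy

nlp = spacy.load("en_core_web_sm")

def correct_article_errors(text):
    doc = nlp(text)
    out = []
    for token in doc:
        # Only adjust the articles themselves
        if token.text in ("a", "an") and token.i + 1 < len(doc):
            nxt = doc[token.i + 1]
            out.append("an" if nxt.text[0].lower() in "aeiou" else "a")
        else:
            out.append(token.text)
    return " ".join(out)

print(correct_article_errors("She adopted a owl and an dog ."))  # -> "She adopted an owl and a dog ."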
@@ -198,11 +153,7 @@ def paraphrase_and_correct(text):
         cleaned_text = remove_redundant_words(paragraph)
         plag_removed = plagiarism_removal(cleaned_text)
         paraphrased_text = capitalize_sentences_and_nouns(plag_removed)
-        paraphrased_text =
-        paraphrased_text = correct_article_errors(paraphrased_text)
-        paraphrased_text = correct_tense_errors(paraphrased_text)
-        paraphrased_text = ensure_subject_verb_agreement(paraphrased_text)
-        paraphrased_text = fix_possessives(paraphrased_text)  # Fixed typo here
+        paraphrased_text = fix_possessives(paraphrased_text)
         paraphrased_text = correct_spelling(paraphrased_text)
         paraphrased_text = fix_punctuation_spacing(paraphrased_text)
         processed_paragraphs.append(paraphrased_text)
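End to end, the updated pipeline could be smoke-tested with something like the following (hypothetical snippet; assumes app.py can be imported as a module without launching the Gradio UI):

from app import paraphrase_and_correct

sample = "The students is happy with there results .\n\nA apple a day keep the doctor away ."
print(paraphrase_and_correct(sample))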