sashdev committed · verified
Commit f487093 · Parent: 8e3461e

Update app.py

Files changed (1): app.py (+75 −24)
app.py CHANGED
@@ -12,6 +12,7 @@ import re
 import string
 import random
 
+# Download necessary NLTK data
 nltk.download('punkt')
 nltk.download('stopwords')
 nltk.download('averaged_perceptron_tagger')
@@ -20,7 +21,6 @@ nltk.download('wordnet')
 nltk.download('omw-1.4')
 nltk.download('punkt_tab')
 
-
 # Initialize stopwords
 stop_words = set(stopwords.words("english"))
 
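Review note: the hunk above still calls every nltk.download() unconditionally at startup. A minimal sketch of a guard that skips the network round-trip when a corpus is already installed (package/path pairs below are illustrative, not a change in this commit):

import nltk

# Probe the local NLTK data path first; download only on a miss.
for pkg, path in [("punkt", "tokenizers/punkt"),
                  ("stopwords", "corpora/stopwords"),
                  ("wordnet", "corpora/wordnet")]:
    try:
        nltk.data.find(path)      # Raises LookupError if not installed
    except LookupError:
        nltk.download(pkg)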
@@ -41,37 +41,25 @@ except OSError:
     subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
     nlp = spacy.load("en_core_web_sm")
 
-# Filter out overly formal or archaic words
-def is_formal_or_rare(word):
-    formal_words = {"homo", "satellite", "futurity", "contemporaries"}
-    return word in formal_words
-
-# Adjust synonym replacement logic
 def plagiarism_removal(text):
     def plagiarism_remover(word):
         if word.lower() in stop_words or word.lower() in exclude_words or word in string.punctuation:
             return word
 
         # Find synonyms
         synonyms = set()
         for syn in wordnet.synsets(word):
             for lemma in syn.lemmas():
-                synonym = lemma.name()
-                if "_" not in synonym and synonym.isalpha() and synonym.lower() != word.lower():
-                    synonyms.add(synonym)
+                if "_" not in lemma.name() and lemma.name().isalpha() and lemma.name().lower() != word.lower():
+                    synonyms.add(lemma.name())
 
         pos_tag_word = nltk.pos_tag([word])[0]
 
-        # Avoid replacing words based on certain POS tags
         if pos_tag_word[1] in exclude_tags:
             return word
 
-        # Filter synonyms to match the same part of speech
         filtered_synonyms = [syn for syn in synonyms if nltk.pos_tag([syn])[0][1] == pos_tag_word[1]]
 
-        # Avoid formal/rare words or return the original word if no good synonym is found
-        filtered_synonyms = [syn for syn in filtered_synonyms if not is_formal_or_rare(syn)]
-
         if not filtered_synonyms:
             return word
 
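Review note: a self-contained sketch of the WordNet lookup this hunk rewrites, handy for checking what the filters let through (assumes the corpora downloaded above; the function name is illustrative):

from nltk.corpus import wordnet

def candidate_synonyms(word):
    # Same filters as the diff: single-token, alphabetic, not the word itself
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            name = lemma.name()
            if "_" not in name and name.isalpha() and name.lower() != word.lower():
                synonyms.add(name)
    return synonyms

print(candidate_synonyms("quick"))  # e.g. {'fast', 'speedy', 'agile', ...}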
@@ -81,21 +69,18 @@ def plagiarism_removal(text):
             return synonym_choice.title()
         return synonym_choice
 
-    # Tokenize and process the text
     para_split = word_tokenize(text)
     final_text = [plagiarism_remover(word) for word in para_split]
 
-    # Fix spacing issues after token replacement
     corrected_text = []
     for i in range(len(final_text)):
         if final_text[i] in string.punctuation and i > 0:
-            corrected_text[-1] += final_text[i]  # Attach punctuation to the previous word
+            corrected_text[-1] += final_text[i]
         else:
             corrected_text.append(final_text[i])
 
     return " ".join(corrected_text)
 
-# Other auxiliary functions remain unchanged
 def predict_en(text):
     res = pipeline_en(text)[0]
     return res['label'], res['score']
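Review note: the detokenization above only re-attaches punctuation, so spacing can differ slightly from the input. A quick smoke test (runs in app.py's environment; output varies because the synonym pick is random):

sample = "The quick brown fox jumps over the lazy dog."
print(plagiarism_removal(sample))  # e.g. "The speedy brown fox jumps over the lazy dog."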
@@ -141,9 +126,71 @@ def capitalize_sentences_and_nouns(text):
 
     return ' '.join(corrected_text)
 
-# Continue the other auxiliary functions for article errors, spelling correction, etc.
+def force_first_letter_capital(text):
+    sentences = re.split(r'(?<=\w[.!?])\s+', text)
+    capitalized_sentences = []
+
+    for sentence in sentences:
+        if sentence:
+            capitalized_sentence = sentence[0].capitalize() + sentence[1:]
+            if not re.search(r'[.!?]$', capitalized_sentence):
+                capitalized_sentence += '.'
+            capitalized_sentences.append(capitalized_sentence)
+
+    return " ".join(capitalized_sentences)
+
+def correct_tense_errors(text):
+    doc = nlp(text)
+    corrected_text = []
+    for token in doc:
+        # spaCy tags auxiliaries as AUX, so match both POS values
+        if token.pos_ in {"VERB", "AUX"} and token.dep_ in {"aux", "auxpass"}:
+            lemma = wordnet.morphy(token.text, wordnet.VERB) or token.text
+            corrected_text.append(lemma)
+        else:
+            corrected_text.append(token.text)
+    return ' '.join(corrected_text)
+
+def correct_article_errors(text):
+    doc = nlp(text)
+    corrected_text = []
+    for token in doc:
+        # Guard nbor(): it raises IndexError on the last token
+        if token.text in ['a', 'an'] and token.i + 1 < len(doc):
+            next_token = token.nbor(1)
+            if token.text == "a" and next_token.text[0].lower() in "aeiou":
+                corrected_text.append("an")
+            elif token.text == "an" and next_token.text[0].lower() not in "aeiou":
+                corrected_text.append("a")
+            else:
+                corrected_text.append(token.text)
+        else:
+            corrected_text.append(token.text)
+    return ' '.join(corrected_text)
+
+def ensure_subject_verb_agreement(text):
+    doc = nlp(text)
+    # Replace the verb in place instead of appending it next to the subject
+    corrected_text = [token.text for token in doc]
+    for token in doc:
+        if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
+            # Singular noun subject with a non-3rd-person-singular verb: use the -s form
+            if token.tag_ == "NN" and token.head.tag_ != "VBZ":
+                corrected_text[token.head.i] = token.head.lemma_ + "s"
+            # Plural subject with a 3rd-person-singular verb: fall back to the lemma
+            elif token.tag_ == "NNS" and token.head.tag_ == "VBZ":
+                corrected_text[token.head.i] = token.head.lemma_
+    return ' '.join(corrected_text)
+
+def correct_spelling(text):
+    words = word_tokenize(text)
+    corrected_words = []
+
+    for word in words:
+        candidates = spell.candidates(word)
+        if candidates:
+            corrected_words.append(candidates.pop())  # pop() returns an arbitrary candidate
+        else:
+            corrected_words.append(word)  # No candidates: keep the original word
+
+    return ' '.join(corrected_words)
 
-# Main paraphrasing and correction function
 def paraphrase_and_correct(text):
     paragraphs = text.split("\n\n")  # Split by paragraphs
 
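Review note: popping from spell.candidates() makes correct_spelling() nondeterministic. pyspellchecker also offers correction(), which returns the single most probable candidate; a sketch of the same loop built on it (the standalone SpellChecker() here stands in for the spell instance app.py already defines):

from spellchecker import SpellChecker

spell = SpellChecker()

def correct_spelling_deterministic(words):
    corrected = []
    for word in words:
        best = spell.correction(word)   # Most probable candidate, or None if unknown
        corrected.append(best if best else word)
    return corrected

print(correct_spelling_deterministic(["speling", "errror"]))  # e.g. ['spelling', 'error']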
@@ -153,8 +200,12 @@ def paraphrase_and_correct(text):
         cleaned_text = remove_redundant_words(paragraph)
         plag_removed = plagiarism_removal(cleaned_text)
         paraphrased_text = capitalize_sentences_and_nouns(plag_removed)
+        paraphrased_text = force_first_letter_capital(paraphrased_text)
+        paraphrased_text = correct_article_errors(paraphrased_text)
+        paraphrased_text = correct_tense_errors(paraphrased_text)
+        paraphrased_text = ensure_subject_verb_agreement(paraphrased_text)
         paraphrased_text = fix_possessives(paraphrased_text)
-        paraphrased_text = correct_spelling(paraphrased_text)
+        paraphrased_text = correct_spelling(paraphrased_text)  # Spelling correction
         paraphrased_text = fix_punctuation_spacing(paraphrased_text)
         processed_paragraphs.append(paraphrased_text)
 
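Review note: an end-to-end smoke test for the extended pipeline (illustrative; run inside app.py after the spaCy model, NLTK corpora, and spell checker are loaded):

text = "this is a example text. it have some speling error."
print(paraphrase_and_correct(text))
# Capitalization, articles, and spelling are adjusted; synonym choices vary run to run.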