sashdev commited on
Commit
8e3461e
·
verified ·
1 Parent(s): e8f77f7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -75
app.py CHANGED
@@ -12,14 +12,14 @@ import re
12
  import string
13
  import random
14
 
15
- # Download necessary NLTK data
16
  nltk.download('punkt')
17
- nltk.download('punkt_tab')
18
  nltk.download('stopwords')
19
  nltk.download('averaged_perceptron_tagger')
20
  nltk.download('averaged_perceptron_tagger_eng')
21
-
22
  nltk.download('wordnet')
 
 
 
23
 
24
  # Initialize stopwords
25
  stop_words = set(stopwords.words("english"))
@@ -41,25 +41,37 @@ except OSError:
41
  subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
42
  nlp = spacy.load("en_core_web_sm")
43
 
 
 
 
 
 
 
44
  def plagiarism_removal(text):
45
  def plagiarism_remover(word):
46
  if word.lower() in stop_words or word.lower() in exclude_words or word in string.punctuation:
47
  return word
48
-
49
  # Find synonyms
50
  synonyms = set()
51
  for syn in wordnet.synsets(word):
52
  for lemma in syn.lemmas():
53
- if "_" not in lemma.name() and lemma.name().isalpha() and lemma.name().lower() != word.lower():
54
- synonyms.add(lemma.name())
 
55
 
56
  pos_tag_word = nltk.pos_tag([word])[0]
57
 
 
58
  if pos_tag_word[1] in exclude_tags:
59
  return word
60
-
 
61
  filtered_synonyms = [syn for syn in synonyms if nltk.pos_tag([syn])[0][1] == pos_tag_word[1]]
62
 
 
 
 
63
  if not filtered_synonyms:
64
  return word
65
 
@@ -69,18 +81,21 @@ def plagiarism_removal(text):
69
  return synonym_choice.title()
70
  return synonym_choice
71
 
 
72
  para_split = word_tokenize(text)
73
  final_text = [plagiarism_remover(word) for word in para_split]
74
 
 
75
  corrected_text = []
76
  for i in range(len(final_text)):
77
  if final_text[i] in string.punctuation and i > 0:
78
- corrected_text[-1] += final_text[i]
79
  else:
80
  corrected_text.append(final_text[i])
81
 
82
  return " ".join(corrected_text)
83
 
 
84
  def predict_en(text):
85
  res = pipeline_en(text)[0]
86
  return res['label'], res['score']
@@ -126,69 +141,9 @@ def capitalize_sentences_and_nouns(text):
126
 
127
  return ' '.join(corrected_text)
128
 
129
- def force_first_letter_capital(text):
130
- sentences = re.split(r'(?<=\w[.!?])\s+', text)
131
- capitalized_sentences = []
132
-
133
- for sentence in sentences:
134
- if sentence:
135
- capitalized_sentence = sentence[0].capitalize() + sentence[1:]
136
- if not re.search(r'[.!?]$', capitalized_sentence):
137
- capitalized_sentence += '.'
138
- capitalized_sentences.append(capitalized_sentence)
139
-
140
- return " ".join(capitalized_sentences)
141
-
142
- def correct_tense_errors(text):
143
- doc = nlp(text)
144
- corrected_text = []
145
- for token in doc:
146
- if token.pos_ == "VERB" and token.dep_ in {"aux", "auxpass"}:
147
- lemma = wordnet.morphy(token.text, wordnet.VERB) or token.text
148
- corrected_text.append(lemma)
149
- else:
150
- corrected_text.append(token.text)
151
- return ' '.join(corrected_text)
152
-
153
- def correct_article_errors(text):
154
- doc = nlp(text)
155
- corrected_text = []
156
- for token in doc:
157
- if token.text in ['a', 'an']:
158
- next_token = token.nbor(1)
159
- if token.text == "a" and next_token.text[0].lower() in "aeiou":
160
- corrected_text.append("an")
161
- elif token.text == "an" and next_token.text[0].lower() not in "aeiou":
162
- corrected_text.append("a")
163
- else:
164
- corrected_text.append(token.text)
165
- else:
166
- corrected_text.append(token.text)
167
- return ' '.join(corrected_text)
168
-
169
- def ensure_subject_verb_agreement(text):
170
- doc = nlp(text)
171
- corrected_text = []
172
- for token in doc:
173
- if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
174
- if token.tag_ == "NN" and token.head.tag_ != "VBZ":
175
- corrected_text.append(token.head.lemma_ + "s")
176
- elif token.tag_ == "NNS" and token.head.tag_ == "VBZ":
177
- corrected_text.append(token.head.lemma_)
178
- corrected_text.append(token.text)
179
- return ' '.join(corrected_text)
180
-
181
- def correct_spelling(text):
182
- words = text.split()
183
- corrected_words = []
184
- for word in words:
185
- corrected_word = spell.correction(word)
186
- if corrected_word is not None:
187
- corrected_words.append(corrected_word)
188
- else:
189
- corrected_words.append(word)
190
- return ' '.join(corrected_words)
191
 
 
192
  def paraphrase_and_correct(text):
193
  paragraphs = text.split("\n\n") # Split by paragraphs
194
 
@@ -198,11 +153,7 @@ def paraphrase_and_correct(text):
198
  cleaned_text = remove_redundant_words(paragraph)
199
  plag_removed = plagiarism_removal(cleaned_text)
200
  paraphrased_text = capitalize_sentences_and_nouns(plag_removed)
201
- paraphrased_text = force_first_letter_capital(paraphrased_text)
202
- paraphrased_text = correct_article_errors(paraphrased_text)
203
- paraphrased_text = correct_tense_errors(paraphrased_text)
204
- paraphrased_text = ensure_subject_verb_agreement(paraphrased_text)
205
- paraphrased_text = fix_possessives(paraphrased_text) # Fixed typo here
206
  paraphrased_text = correct_spelling(paraphrased_text)
207
  paraphrased_text = fix_punctuation_spacing(paraphrased_text)
208
  processed_paragraphs.append(paraphrased_text)
 
12
  import string
13
  import random
14
 
 
15
  nltk.download('punkt')
 
16
  nltk.download('stopwords')
17
  nltk.download('averaged_perceptron_tagger')
18
  nltk.download('averaged_perceptron_tagger_eng')
 
19
  nltk.download('wordnet')
20
+ nltk.download('omw-1.4')
21
+ nltk.download('punkt_tab')
22
+
23
 
24
  # Initialize stopwords
25
  stop_words = set(stopwords.words("english"))
 
41
  subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
42
  nlp = spacy.load("en_core_web_sm")
43
 
44
+ # Filter out overly formal or archaic words
45
+ def is_formal_or_rare(word):
46
+ formal_words = {"homo", "satellite", "futurity", "contemporaries"}
47
+ return word in formal_words
48
+
49
+ # Adjust synonym replacement logic
50
  def plagiarism_removal(text):
51
  def plagiarism_remover(word):
52
  if word.lower() in stop_words or word.lower() in exclude_words or word in string.punctuation:
53
  return word
54
+
55
  # Find synonyms
56
  synonyms = set()
57
  for syn in wordnet.synsets(word):
58
  for lemma in syn.lemmas():
59
+ synonym = lemma.name()
60
+ if "_" not in synonym and synonym.isalpha() and synonym.lower() != word.lower():
61
+ synonyms.add(synonym)
62
 
63
  pos_tag_word = nltk.pos_tag([word])[0]
64
 
65
+ # Avoid replacing words based on certain POS tags
66
  if pos_tag_word[1] in exclude_tags:
67
  return word
68
+
69
+ # Filter synonyms to match the same part of speech
70
  filtered_synonyms = [syn for syn in synonyms if nltk.pos_tag([syn])[0][1] == pos_tag_word[1]]
71
 
72
+ # Avoid formal/rare words or return the original word if no good synonym is found
73
+ filtered_synonyms = [syn for syn in filtered_synonyms if not is_formal_or_rare(syn)]
74
+
75
  if not filtered_synonyms:
76
  return word
77
 
 
81
  return synonym_choice.title()
82
  return synonym_choice
83
 
84
+ # Tokenize and process the text
85
  para_split = word_tokenize(text)
86
  final_text = [plagiarism_remover(word) for word in para_split]
87
 
88
+ # Fix spacing issues after token replacement
89
  corrected_text = []
90
  for i in range(len(final_text)):
91
  if final_text[i] in string.punctuation and i > 0:
92
+ corrected_text[-1] += final_text[i] # Attach punctuation to the previous word
93
  else:
94
  corrected_text.append(final_text[i])
95
 
96
  return " ".join(corrected_text)
97
 
98
+ # Other auxiliary functions remain unchanged
99
  def predict_en(text):
100
  res = pipeline_en(text)[0]
101
  return res['label'], res['score']
 
141
 
142
  return ' '.join(corrected_text)
143
 
144
+ # Continue the other auxiliary functions for article errors, spelling correction, etc.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
+ # Main paraphrasing and correction function
147
  def paraphrase_and_correct(text):
148
  paragraphs = text.split("\n\n") # Split by paragraphs
149
 
 
153
  cleaned_text = remove_redundant_words(paragraph)
154
  plag_removed = plagiarism_removal(cleaned_text)
155
  paraphrased_text = capitalize_sentences_and_nouns(plag_removed)
156
+ paraphrased_text = fix_possessives(paraphrased_text)
 
 
 
 
157
  paraphrased_text = correct_spelling(paraphrased_text)
158
  paraphrased_text = fix_punctuation_spacing(paraphrased_text)
159
  processed_paragraphs.append(paraphrased_text)