sashtech committed on
Commit
051de31
·
verified ·
1 Parent(s): 5834cac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -1
app.py CHANGED
@@ -8,6 +8,73 @@ from nltk.corpus import wordnet
8
  from spellchecker import SpellChecker
9
  import re
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  # Initialize the English text classification pipeline for AI detection
12
  pipeline_en = pipeline(task="text-classification", model="Hello-SimpleAI/chatgpt-detector-roberta")
13
 
@@ -151,7 +218,8 @@ def correct_spelling(text):
151
  def paraphrase_and_correct(text):
152
  # Add synonym replacement here
153
  cleaned_text = remove_redundant_words(text)
154
- paraphrased_text = capitalize_sentences_and_nouns(cleaned_text)
 
155
  paraphrased_text = force_first_letter_capital(paraphrased_text)
156
  paraphrased_text = correct_article_errors(paraphrased_text)
157
  paraphrased_text = correct_tense_errors(paraphrased_text)
 
8
  from spellchecker import SpellChecker
9
  import re
10
 
11
+
12
+
13
# Download the NLTK data packages this module relies on.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# English stopwords, held in a set so membership tests are O(1).
# plagiarism_removal() looks this collection up as `stop_words`; it was
# previously bound only to the misspelled name `top_words`, which made the
# first call fail with NameError.
stop_words = set(stopwords.words("english"))
top_words = stop_words  # alias kept so any existing use of the old name still works
18
+
19
def plagiarism_removal(text):
    """Replace eligible words in *text* with randomly chosen WordNet synonyms.

    The text is split with ``word_tokenize``; each token is either kept or
    swapped for a synonym, and the tokens are then re-joined with single
    spaces, with punctuation glued back onto the neighbouring word.

    A token is kept unchanged when its lowercase form is in the module-level
    ``stop_words`` or ``exclude_words``, when it is a single punctuation
    character, when WordNet offers no purely alphabetic synonym for it, when
    its POS tag is in the module-level ``exclude_tags``, or when no synonym
    receives the same POS tag as the token itself.

    Args:
        text: The string to rewrite.

    Returns:
        The rewritten string. The synonym is picked with ``random.choice``,
        so the output varies between calls unless ``random`` is seeded.
    """
    # Set of single characters: an exact-membership test, unlike
    # `token in string.punctuation`, which is a substring test and therefore
    # also matches "" and multi-character runs such as "./".
    punctuation_chars = set(string.punctuation)
    # Opening brackets belong to the word that follows them, not the one before.
    opening_brackets = {"(", "[", "{"}

    def plagiarism_remover(word):
        """Return a same-POS synonym for *word*, or *word* itself if none fits."""
        lowered = word.lower()
        if lowered in stop_words or lowered in exclude_words or word in punctuation_chars:
            return word

        # Collect candidate synonyms from every synset of the word.
        synonyms = set()
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                name = lemma.name()
                # isalpha() also rejects names containing "_" or digits.
                if name.isalpha() and name.lower() != lowered:
                    synonyms.add(name)

        # Nothing to choose from: skip the (comparatively expensive) POS tagging.
        if not synonyms:
            return word

        word_tag = nltk.pos_tag([word])[0][1]

        # Avoid replacing certain parts of speech.
        if word_tag in exclude_tags:
            return word

        # Keep only synonyms tagged with the same POS as the original word.
        # Sorted so that the pick is reproducible when `random` is seeded
        # (set iteration order is not stable across interpreter runs).
        candidates = sorted(
            name for name in synonyms if nltk.pos_tag([name])[0][1] == word_tag
        )
        if not candidates:
            return word

        replacement = random.choice(candidates)

        # Retain the original's title-case capitalization.
        return replacement.title() if word.istitle() else replacement

    replaced_tokens = [plagiarism_remover(token) for token in word_tokenize(text)]

    # Re-attach punctuation: closing/inline punctuation joins the previous
    # piece, and whatever follows an opening bracket joins that bracket.
    pieces = []
    glue_to_previous = False
    for token in replaced_tokens:
        is_opening = token in opening_brackets
        if pieces and token in punctuation_chars and not is_opening:
            pieces[-1] += token
        elif glue_to_previous:
            pieces[-1] += token
        else:
            pieces.append(token)
        glue_to_previous = is_opening

    return " ".join(pieces)
67
+
68
+
69
+
70
+
71
+
72
+
73
+
74
# POS tags that plagiarism_removal() never replaces with a synonym.
exclude_tags = {
    'CC', 'DT', 'IN', 'MD', 'PRP', 'PRP$', 'TO',
    'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
}

# Individual words (matched in lowercase) that are always left as they are.
exclude_words = {
    'am', 'is', 'are', 'was', 'were',
    'have', 'has',
    'do', 'does', 'did',
    'will', 'shall', 'should', 'would', 'could',
    'can', 'may', 'might',
}
77
+
78
  # Initialize the English text classification pipeline for AI detection
79
  pipeline_en = pipeline(task="text-classification", model="Hello-SimpleAI/chatgpt-detector-roberta")
80
 
 
218
  def paraphrase_and_correct(text):
219
  # Add synonym replacement here
220
  cleaned_text = remove_redundant_words(text)
221
+ plag_removed=plagiarism_removal(cleaned_text)
222
+ paraphrased_text = capitalize_sentences_and_nouns(plag_removed)
223
  paraphrased_text = force_first_letter_capital(paraphrased_text)
224
  paraphrased_text = correct_article_errors(paraphrased_text)
225
  paraphrased_text = correct_tense_errors(paraphrased_text)