sashtech committed on
Commit 2bc5696 · verified · 1 Parent(s): 1bedf23

Update app.py

Files changed (1): app.py (+57, −40)
app.py CHANGED
@@ -3,6 +3,7 @@ import gradio as gr
  from transformers import pipeline
  import spacy
  import subprocess
+ import json
  import nltk
  from nltk.corpus import wordnet, stopwords  # Import stopwords here
  from spellchecker import SpellChecker
@@ -26,54 +27,70 @@ download_nltk_resources()
  
  top_words = set(stopwords.words("english"))  # More efficient as a set
  
- def plagiarism_removal(text):
-     def plagiarism_remover(word):
-         # Handle stopwords, punctuation, and excluded words
-         if word.lower() in top_words or word.lower() in exclude_words or word in string.punctuation:
-             return word
- 
-         # Find synonyms
-         synonyms = set()
+ import os
+ import json
+ 
+ # Path to the thesaurus file
+ thesaurus_file_path = 'en_thesaurus.jsonl'  # Ensure the file path is correct
+ 
+ # Function to load the thesaurus into a dictionary
+ def load_thesaurus(file_path):
+     thesaurus_dict = {}
+     try:
+         with open(file_path, 'r', encoding='utf-8') as file:
+             for line in file:
+                 # Parse each line as a JSON object
+                 entry = json.loads(line.strip())
+                 word = entry.get("word")
+                 synonyms = entry.get("synonyms", [])
+                 if word:
+                     thesaurus_dict[word] = synonyms
+     except Exception as e:
+         print(f"Error loading thesaurus: {e}")
+ 
+     return thesaurus_dict
+ 
+ # Load the thesaurus
+ synonym_dict = load_thesaurus(thesaurus_file_path)
+ 
+ # Modified plagiarism_remover function to use the loaded thesaurus
+ def plagiarism_remover(word):
+     # Handle stopwords, punctuation, and excluded words
+     if word.lower() in top_words or word.lower() in exclude_words or word in string.punctuation:
+         return word
+ 
+     # Check for synonyms in the custom thesaurus (as a set, so the WordNet fallback can extend it)
+     synonyms = set(synonym_dict.get(word.lower(), []))
+ 
+     # If no synonyms found in the custom thesaurus, use WordNet
+     if not synonyms:
          for syn in wordnet.synsets(word):
              for lemma in syn.lemmas():
                  # Exclude overly technical synonyms or words with underscores
                  if "_" not in lemma.name() and lemma.name().isalpha() and lemma.name().lower() != word.lower():
                      synonyms.add(lemma.name())
  
-         # Get part of speech for word and filter synonyms with the same POS
-         pos_tag_word = nltk.pos_tag([word])[0]
- 
-         # Avoid replacing certain parts of speech
-         if pos_tag_word[1] in exclude_tags:
-             return word
- 
-         filtered_synonyms = [syn for syn in synonyms if nltk.pos_tag([syn])[0][1] == pos_tag_word[1]]
- 
-         # Return original word if no appropriate synonyms found
-         if not filtered_synonyms:
-             return word
- 
-         # Select a random synonym from the filtered list
-         synonym_choice = random.choice(filtered_synonyms)
- 
-         # Retain original capitalization
-         if word.istitle():
-             return synonym_choice.title()
-         return synonym_choice
- 
-     # Tokenize, replace words, and join them back
-     para_split = nltk.word_tokenize(text)
-     final_text = [plagiarism_remover(word) for word in para_split]
- 
-     # Handle spacing around punctuation correctly
-     corrected_text = []
-     for i in range(len(final_text)):
-         if final_text[i] in string.punctuation and i > 0:
-             corrected_text[-1] += final_text[i]  # Append punctuation to previous word
-         else:
-             corrected_text.append(final_text[i])
- 
-     return " ".join(corrected_text)
+     # Get part of speech for word and filter synonyms with the same POS
+     pos_tag_word = nltk.pos_tag([word])[0]
+ 
+     # Avoid replacing certain parts of speech
+     if pos_tag_word[1] in exclude_tags:
+         return word
+ 
+     filtered_synonyms = [syn for syn in synonyms if nltk.pos_tag([syn])[0][1] == pos_tag_word[1]]
+ 
+     # Return original word if no appropriate synonyms found
+     if not filtered_synonyms:
+         return word
+ 
+     # Select a random synonym from the filtered list
+     synonym_choice = random.choice(filtered_synonyms)
+ 
+     # Retain original capitalization
+     if word.istitle():
+         return synonym_choice.title()
+     return synonym_choice
  
  
  # Words we don't want to replace
  exclude_tags = {'PRP', 'PRP$', 'MD', 'VBZ', 'VBP', 'VBD', 'VBG', 'VBN', 'TO', 'IN', 'DT', 'CC'}
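
A minimal usage sketch for the new thesaurus-backed path (not part of the commit): it assumes en_thesaurus.jsonl follows the per-line JSON schema implied by load_thesaurus, and that app.py's globals (synonym_dict, top_words, exclude_words, exclude_tags) are already defined; the sample entries and sentence below are illustrative only.

# Illustrative sketch. The entry.get("word") / entry.get("synonyms", []) calls
# above imply one JSON object per line in en_thesaurus.jsonl, e.g.:
#   {"word": "quick", "synonyms": ["fast", "speedy", "rapid"]}
#   {"word": "happy", "synonyms": ["glad", "cheerful"]}
#
# Since this commit drops the plagiarism_removal wrapper, a caller would
# presumably tokenize the input and rewrite it word by word, much as the
# removed wrapper did:

import nltk

sample = "The quick brown fox is happy."
tokens = nltk.word_tokenize(sample)
rewritten = " ".join(plagiarism_remover(word) for word in tokens)
print(rewritten)  # e.g. "The fast brown fox is glad ." (synonym choice is random)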