sashdev committed · verified
Commit f487093 · Parent: 8e3461e

Update app.py

Files changed (1): app.py (+75 −24)
app.py CHANGED
@@ -12,6 +12,7 @@ import re
 import string
 import random
 
+# Download necessary NLTK data
 nltk.download('punkt')
 nltk.download('stopwords')
 nltk.download('averaged_perceptron_tagger')
@@ -20,7 +21,6 @@ nltk.download('wordnet')
 nltk.download('omw-1.4')
 nltk.download('punkt_tab')
 
-
 # Initialize stopwords
 stop_words = set(stopwords.words("english"))
 
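Review note: the hunk above still calls every nltk.download() unconditionally at startup. A minimal sketch of a guard that skips the network round-trip when a corpus is already installed (package/path pairs below are illustrative, not a change in this commit):

import nltk

# Probe the local NLTK data path first; download only on a miss.
for pkg, path in [("punkt", "tokenizers/punkt"),
                  ("stopwords", "corpora/stopwords"),
                  ("wordnet", "corpora/wordnet")]:
    try:
        nltk.data.find(path)      # Raises LookupError if not installed
    except LookupError:
        nltk.download(pkg)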
@@ -41,37 +41,25 @@ except OSError:
     subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
     nlp = spacy.load("en_core_web_sm")
 
-# Filter out overly formal or archaic words
-def is_formal_or_rare(word):
-    formal_words = {"homo", "satellite", "futurity", "contemporaries"}
-    return word in formal_words
-
-# Adjust synonym replacement logic
 def plagiarism_removal(text):
     def plagiarism_remover(word):
         if word.lower() in stop_words or word.lower() in exclude_words or word in string.punctuation:
             return word
 
         # Find synonyms
         synonyms = set()
         for syn in wordnet.synsets(word):
             for lemma in syn.lemmas():
-                synonym = lemma.name()
-                if "_" not in synonym and synonym.isalpha() and synonym.lower() != word.lower():
-                    synonyms.add(synonym)
+                if "_" not in lemma.name() and lemma.name().isalpha() and lemma.name().lower() != word.lower():
+                    synonyms.add(lemma.name())
 
         pos_tag_word = nltk.pos_tag([word])[0]
 
-        # Avoid replacing words based on certain POS tags
         if pos_tag_word[1] in exclude_tags:
             return word
 
-        # Filter synonyms to match the same part of speech
         filtered_synonyms = [syn for syn in synonyms if nltk.pos_tag([syn])[0][1] == pos_tag_word[1]]
 
-        # Avoid formal/rare words or return the original word if no good synonym is found
-        filtered_synonyms = [syn for syn in filtered_synonyms if not is_formal_or_rare(syn)]
-
         if not filtered_synonyms:
             return word
 
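Review note: a self-contained sketch of the WordNet lookup this hunk rewrites, handy for checking what the filters let through (assumes the corpora downloaded above; the function name is illustrative):

from nltk.corpus import wordnet

def candidate_synonyms(word):
    # Same filters as the diff: single-token, alphabetic, not the word itself
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            name = lemma.name()
            if "_" not in name and name.isalpha() and name.lower() != word.lower():
                synonyms.add(name)
    return synonyms

print(candidate_synonyms("quick"))  # e.g. {'fast', 'speedy', 'agile', ...}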
@@ -81,21 +69,18 @@ def plagiarism_removal(text):
             return synonym_choice.title()
         return synonym_choice
 
-    # Tokenize and process the text
     para_split = word_tokenize(text)
     final_text = [plagiarism_remover(word) for word in para_split]
 
-    # Fix spacing issues after token replacement
     corrected_text = []
     for i in range(len(final_text)):
         if final_text[i] in string.punctuation and i > 0:
-            corrected_text[-1] += final_text[i]  # Attach punctuation to the previous word
+            corrected_text[-1] += final_text[i]
         else:
             corrected_text.append(final_text[i])
 
     return " ".join(corrected_text)
 
-# Other auxiliary functions remain unchanged
 def predict_en(text):
     res = pipeline_en(text)[0]
     return res['label'], res['score']
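Review note: the detokenization above only re-attaches punctuation, so spacing can differ slightly from the input. A quick smoke test (runs in app.py's environment; output varies because the synonym pick is random):

sample = "The quick brown fox jumps over the lazy dog."
print(plagiarism_removal(sample))  # e.g. "The speedy brown fox jumps over the lazy dog."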
@@ -141,9 +126,71 @@ def capitalize_sentences_and_nouns(text):
 
     return ' '.join(corrected_text)
 
-# Continue the other auxiliary functions for article errors, spelling correction, etc.
+def force_first_letter_capital(text):
+    sentences = re.split(r'(?<=\w[.!?])\s+', text)
+    capitalized_sentences = []
+
+    for sentence in sentences:
+        if sentence:
+            capitalized_sentence = sentence[0].capitalize() + sentence[1:]
+            if not re.search(r'[.!?]$', capitalized_sentence):
+                capitalized_sentence += '.'
+            capitalized_sentences.append(capitalized_sentence)
+
+    return " ".join(capitalized_sentences)
+
+def correct_tense_errors(text):
+    doc = nlp(text)
+    corrected_text = []
+    for token in doc:
+        # spaCy tags auxiliaries as AUX, so match both POS values
+        if token.pos_ in {"VERB", "AUX"} and token.dep_ in {"aux", "auxpass"}:
+            lemma = wordnet.morphy(token.text, wordnet.VERB) or token.text
+            corrected_text.append(lemma)
+        else:
+            corrected_text.append(token.text)
+    return ' '.join(corrected_text)
+
+def correct_article_errors(text):
+    doc = nlp(text)
+    corrected_text = []
+    for token in doc:
+        # Guard nbor(): it raises IndexError on the last token
+        if token.text in ['a', 'an'] and token.i + 1 < len(doc):
+            next_token = token.nbor(1)
+            if token.text == "a" and next_token.text[0].lower() in "aeiou":
+                corrected_text.append("an")
+            elif token.text == "an" and next_token.text[0].lower() not in "aeiou":
+                corrected_text.append("a")
+            else:
+                corrected_text.append(token.text)
+        else:
+            corrected_text.append(token.text)
+    return ' '.join(corrected_text)
+
+def ensure_subject_verb_agreement(text):
+    doc = nlp(text)
+    # Replace the verb in place instead of appending it next to the subject
+    corrected_text = [token.text for token in doc]
+    for token in doc:
+        if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
+            # Singular noun subject with a non-3rd-person-singular verb: use the -s form
+            if token.tag_ == "NN" and token.head.tag_ != "VBZ":
+                corrected_text[token.head.i] = token.head.lemma_ + "s"
+            # Plural subject with a 3rd-person-singular verb: fall back to the lemma
+            elif token.tag_ == "NNS" and token.head.tag_ == "VBZ":
+                corrected_text[token.head.i] = token.head.lemma_
+    return ' '.join(corrected_text)
+
+def correct_spelling(text):
+    words = word_tokenize(text)
+    corrected_words = []
+
+    for word in words:
+        candidates = spell.candidates(word)
+        if candidates:
+            corrected_words.append(candidates.pop())  # pop() returns an arbitrary candidate
+        else:
+            corrected_words.append(word)  # No candidates: keep the original word
+
+    return ' '.join(corrected_words)
 
-# Main paraphrasing and correction function
 def paraphrase_and_correct(text):
     paragraphs = text.split("\n\n")  # Split by paragraphs
 
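Review note: popping from spell.candidates() makes correct_spelling() nondeterministic. pyspellchecker also offers correction(), which returns the single most probable candidate; a sketch of the same loop built on it (the standalone SpellChecker() here stands in for the spell instance app.py already defines):

from spellchecker import SpellChecker

spell = SpellChecker()

def correct_spelling_deterministic(words):
    corrected = []
    for word in words:
        best = spell.correction(word)   # Most probable candidate, or None if unknown
        corrected.append(best if best else word)
    return corrected

print(correct_spelling_deterministic(["speling", "errror"]))  # e.g. ['spelling', 'error']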
@@ -153,8 +200,12 @@ def paraphrase_and_correct(text):
         cleaned_text = remove_redundant_words(paragraph)
         plag_removed = plagiarism_removal(cleaned_text)
         paraphrased_text = capitalize_sentences_and_nouns(plag_removed)
+        paraphrased_text = force_first_letter_capital(paraphrased_text)
+        paraphrased_text = correct_article_errors(paraphrased_text)
+        paraphrased_text = correct_tense_errors(paraphrased_text)
+        paraphrased_text = ensure_subject_verb_agreement(paraphrased_text)
         paraphrased_text = fix_possessives(paraphrased_text)
-        paraphrased_text = correct_spelling(paraphrased_text)
+        paraphrased_text = correct_spelling(paraphrased_text)  # Spelling correction
         paraphrased_text = fix_punctuation_spacing(paraphrased_text)
         processed_paragraphs.append(paraphrased_text)
 
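Review note: an end-to-end smoke test for the extended pipeline (illustrative; run inside app.py after the spaCy model, NLTK corpora, and spell checker are loaded):

text = "this is a example text. it have some speling error."
print(paraphrase_and_correct(text))
# Capitalization, articles, and spelling are adjusted; synonym choices vary run to run.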