Spaces:

Towhidul
/

PECCAVI

Runtime error

App Files Community

Towhidul committed on Mar 27, 2024

Commit

7861958

verified ·

1 Parent(s): 793fbc5

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -12

app.py CHANGED Viewed

@@ -67,14 +67,14 @@ def paraphrase(
 def remove_punctuations(text):
-    # Remove punctuations while preserving hyphenated words
-    return re.sub(r'(?<!\w)-|-(?!\w)', ' ', re.sub(r'[^\w\s-]', '', text))
 def tokenize(sentence):
     # Remove punctuations using the updated function and tokenize the sentence into words
     cleaned_sentence = remove_punctuations(sentence)
-    return cleaned_sentence.split()
 def generate_bigrams(words):
     # Generate bigrams from a list of words
@@ -100,15 +100,21 @@ def find_matching_words(sentence1, sentence2):
     matching_words = []
     for i, bigram in enumerate(bigrams2):
         if hash_bigram(bigram) in hashed_bigrams_set:
-            word1_idx = sentence2.find(bigram[0], sum(len(word) for word in sentence2.split()[:i]))
-            word2_idx = sentence2.find(bigram[1], word1_idx + len(bigram[0]))
-            matching_words.append((sentence2[word1_idx:word1_idx+len(bigram[0])], sentence2[word2_idx:word2_idx+len(bigram[1])]))
     return matching_words
 def remove_overlapping(input_set):
     sorted_set = sorted(input_set, key=len, reverse=True)
     output_set = set()
@@ -185,18 +191,19 @@ for paraphrase in paraphrases:
         for i, bigram in enumerate(matching_bigrams):
             if i == 0:
-                combined_word += ' '.join(bigram)
             elif bigram[0] == matching_bigrams[i-1][1]:
-                combined_word += ' ' + bigram[1]
             else:
-                combined_words.append(combined_word)
-                combined_word = ' '.join(bigram)
         # Append the last combined word
-        combined_words.append(combined_word)
         return combined_words
     # Combine matching bigrams into single words
     combined_words = combine_matching_bigrams(matching_words)
     combined_words_list.append(combined_words)

 def remove_punctuations(text):
+    # Remove punctuations while preserving hyphenated words, commas, and full stops
+    return re.sub(r'(?<!\w)-|-(?!\w)', ' ', re.sub(r'[^\w\s\-,\.]', '', text))
 def tokenize(sentence):
     # Remove punctuations using the updated function and tokenize the sentence into words
     cleaned_sentence = remove_punctuations(sentence)
+    # Also split on punctuation marks to handle cases where words are adjacent to punctuation
+    return re.findall(r"[\w'-]+|[.,;!?]", cleaned_sentence)
 def generate_bigrams(words):
     # Generate bigrams from a list of words
     matching_words = []
     for i, bigram in enumerate(bigrams2):
         if hash_bigram(bigram) in hashed_bigrams_set:
+            # Check if the entire bigram exists in the sentence
+            if bigram[0] in words2 and bigram[1] in words2:
+                # Find the start index of the first word of the bigram in the sentence
+                word1_idx = words2.index(bigram[0])
+                # Find the start index of the second word of the bigram in the sentence, starting from the index following the first word
+                word2_idx = words2.index(bigram[1], word1_idx + 1)
+                # Append the matching words to the list
+                matching_words.append((words2[word1_idx], words2[word2_idx]))
     return matching_words
 def remove_overlapping(input_set):
     sorted_set = sorted(input_set, key=len, reverse=True)
     output_set = set()
         for i, bigram in enumerate(matching_bigrams):
             if i == 0:
+                combined_word += bigram[0] + ('' if bigram[1] in string.punctuation else ' ') + bigram[1]
             elif bigram[0] == matching_bigrams[i-1][1]:
+                combined_word += bigram[1] if bigram[1] in string.punctuation else ' ' + bigram[1]
             else:
+                combined_words.append(combined_word.strip())
+                combined_word = bigram[0] + ('' if bigram[1] in string.punctuation else ' ') + bigram[1]
         # Append the last combined word
+        combined_words.append(combined_word.strip())
         return combined_words
     # Combine matching bigrams into single words
     combined_words = combine_matching_bigrams(matching_words)
     combined_words_list.append(combined_words)