Update app.py
Browse files
app.py
CHANGED
@@ -67,14 +67,14 @@ def paraphrase(
|
|
67 |
|
68 |
|
69 |
def remove_punctuations(text):
|
70 |
-
# Remove punctuations while preserving hyphenated words
|
71 |
-
return re.sub(r'(?<!\w)-|-(?!\w)', ' ', re.sub(r'[^\w\s
|
72 |
|
73 |
def tokenize(sentence):
|
74 |
# Remove punctuations using the updated function and tokenize the sentence into words
|
75 |
cleaned_sentence = remove_punctuations(sentence)
|
76 |
-
|
77 |
-
|
78 |
|
79 |
def generate_bigrams(words):
|
80 |
# Generate bigrams from a list of words
|
@@ -100,15 +100,21 @@ def find_matching_words(sentence1, sentence2):
|
|
100 |
matching_words = []
|
101 |
for i, bigram in enumerate(bigrams2):
|
102 |
if hash_bigram(bigram) in hashed_bigrams_set:
|
103 |
-
|
104 |
-
|
105 |
-
|
|
|
|
|
|
|
|
|
|
|
106 |
|
107 |
return matching_words
|
108 |
|
109 |
|
110 |
|
111 |
|
|
|
112 |
def remove_overlapping(input_set):
|
113 |
sorted_set = sorted(input_set, key=len, reverse=True)
|
114 |
output_set = set()
|
@@ -185,18 +191,19 @@ for paraphrase in paraphrases:
|
|
185 |
|
186 |
for i, bigram in enumerate(matching_bigrams):
|
187 |
if i == 0:
|
188 |
-
combined_word += ' '
|
189 |
elif bigram[0] == matching_bigrams[i-1][1]:
|
190 |
-
combined_word += ' ' + bigram[1]
|
191 |
else:
|
192 |
-
combined_words.append(combined_word)
|
193 |
-
combined_word = ' '
|
194 |
|
195 |
# Append the last combined word
|
196 |
-
combined_words.append(combined_word)
|
197 |
|
198 |
return combined_words
|
199 |
|
|
|
200 |
# Combine matching bigrams into single words
|
201 |
combined_words = combine_matching_bigrams(matching_words)
|
202 |
combined_words_list.append(combined_words)
|
|
|
67 |
|
68 |
|
69 |
def remove_punctuations(text):
    """Strip punctuation from *text*, keeping commas, full stops and in-word hyphens.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* in which every character that is not a word
        character, whitespace, hyphen, comma or full stop has been deleted
        (underscores are deleted too), and every hyphen that is not
        directly between two word characters has been replaced by a single
        space. Whitespace is not collapsed, so runs of spaces may remain.
    """
    # `\w` also matches "_", so the underscore has to be listed explicitly;
    # otherwise it would survive even though it is punctuation.
    without_symbols = re.sub(r'[^\w\s\-,\.]|_', '', text)
    # A hyphen is kept only when it joins two word characters ("well-known");
    # leading, trailing or free-standing hyphens become a space.
    return re.sub(r'(?<!\w)-|-(?!\w)', ' ', without_symbols)
|
72 |
|
73 |
def tokenize(sentence):
    """Split *sentence* into a list of word and punctuation tokens.

    The sentence is first passed through ``remove_punctuations``. The cleaned
    text is then scanned for tokens, where a token is either a run of word
    characters, apostrophes and hyphens, or a single one of ``. , ; ! ?``.
    Punctuation that touches a word therefore becomes its own token.

    Args:
        sentence: The text to tokenize.

    Returns:
        The tokens in the order they occur in the cleaned sentence.
    """
    # First alternative: a word (may contain apostrophes or hyphens).
    # Second alternative: one stand-alone punctuation mark.
    token_pattern = r"[\w'-]+|[.,;!?]"
    return re.findall(token_pattern, remove_punctuations(sentence))
|
78 |
|
79 |
def generate_bigrams(words):
|
80 |
# Generate bigrams from a list of words
|
|
|
100 |
matching_words = []
|
101 |
for i, bigram in enumerate(bigrams2):
|
102 |
if hash_bigram(bigram) in hashed_bigrams_set:
|
103 |
+
# Check if the entire bigram exists in the sentence
|
104 |
+
if bigram[0] in words2 and bigram[1] in words2:
|
105 |
+
# Find the start index of the first word of the bigram in the sentence
|
106 |
+
word1_idx = words2.index(bigram[0])
|
107 |
+
# Find the start index of the second word of the bigram in the sentence, starting from the index following the first word
|
108 |
+
word2_idx = words2.index(bigram[1], word1_idx + 1)
|
109 |
+
# Append the matching words to the list
|
110 |
+
matching_words.append((words2[word1_idx], words2[word2_idx]))
|
111 |
|
112 |
return matching_words
|
113 |
|
114 |
|
115 |
|
116 |
|
117 |
+
|
118 |
def remove_overlapping(input_set):
|
119 |
sorted_set = sorted(input_set, key=len, reverse=True)
|
120 |
output_set = set()
|
|
|
191 |
|
192 |
for i, bigram in enumerate(matching_bigrams):
|
193 |
if i == 0:
|
194 |
+
combined_word += bigram[0] + ('' if bigram[1] in string.punctuation else ' ') + bigram[1]
|
195 |
elif bigram[0] == matching_bigrams[i-1][1]:
|
196 |
+
combined_word += bigram[1] if bigram[1] in string.punctuation else ' ' + bigram[1]
|
197 |
else:
|
198 |
+
combined_words.append(combined_word.strip())
|
199 |
+
combined_word = bigram[0] + ('' if bigram[1] in string.punctuation else ' ') + bigram[1]
|
200 |
|
201 |
# Append the last combined word
|
202 |
+
combined_words.append(combined_word.strip())
|
203 |
|
204 |
return combined_words
|
205 |
|
206 |
+
|
207 |
# Combine matching bigrams into single words
|
208 |
combined_words = combine_matching_bigrams(matching_words)
|
209 |
combined_words_list.append(combined_words)
|