Towhidul committed on
Commit
7861958
·
verified ·
1 Parent(s): 793fbc5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -12
app.py CHANGED
@@ -67,14 +67,14 @@ def paraphrase(
67
 
68
 
69
  def remove_punctuations(text):
70
- # Remove punctuations while preserving hyphenated words
71
- return re.sub(r'(?<!\w)-|-(?!\w)', ' ', re.sub(r'[^\w\s-]', '', text))
72
 
73
  def tokenize(sentence):
74
  # Remove punctuations using the updated function and tokenize the sentence into words
75
  cleaned_sentence = remove_punctuations(sentence)
76
- return cleaned_sentence.split()
77
-
78
 
79
  def generate_bigrams(words):
80
  # Generate bigrams from a list of words
@@ -100,15 +100,21 @@ def find_matching_words(sentence1, sentence2):
100
  matching_words = []
101
  for i, bigram in enumerate(bigrams2):
102
  if hash_bigram(bigram) in hashed_bigrams_set:
103
- word1_idx = sentence2.find(bigram[0], sum(len(word) for word in sentence2.split()[:i]))
104
- word2_idx = sentence2.find(bigram[1], word1_idx + len(bigram[0]))
105
- matching_words.append((sentence2[word1_idx:word1_idx+len(bigram[0])], sentence2[word2_idx:word2_idx+len(bigram[1])]))
 
 
 
 
 
106
 
107
  return matching_words
108
 
109
 
110
 
111
 
 
112
  def remove_overlapping(input_set):
113
  sorted_set = sorted(input_set, key=len, reverse=True)
114
  output_set = set()
@@ -185,18 +191,19 @@ for paraphrase in paraphrases:
185
 
186
  for i, bigram in enumerate(matching_bigrams):
187
  if i == 0:
188
- combined_word += ' '.join(bigram)
189
  elif bigram[0] == matching_bigrams[i-1][1]:
190
- combined_word += ' ' + bigram[1]
191
  else:
192
- combined_words.append(combined_word)
193
- combined_word = ' '.join(bigram)
194
 
195
  # Append the last combined word
196
- combined_words.append(combined_word)
197
 
198
  return combined_words
199
 
 
200
  # Combine matching bigrams into single words
201
  combined_words = combine_matching_bigrams(matching_words)
202
  combined_words_list.append(combined_words)
 
67
 
68
 
69
  def remove_punctuations(text):
70
+ # Remove punctuations while preserving hyphenated words, commas, and full stops
71
+ return re.sub(r'(?<!\w)-|-(?!\w)', ' ', re.sub(r'[^\w\s\-,\.]', '', text))
72
 
73
  def tokenize(sentence):
74
  # Remove punctuations using the updated function and tokenize the sentence into words
75
  cleaned_sentence = remove_punctuations(sentence)
76
+ # Also split on punctuation marks to handle cases where words are adjacent to punctuation
77
+ return re.findall(r"[\w'-]+|[.,;!?]", cleaned_sentence)
78
 
79
  def generate_bigrams(words):
80
  # Generate bigrams from a list of words
 
100
  matching_words = []
101
  for i, bigram in enumerate(bigrams2):
102
  if hash_bigram(bigram) in hashed_bigrams_set:
103
+ # Check if the entire bigram exists in the sentence
104
+ if bigram[0] in words2 and bigram[1] in words2:
105
+ # Find the start index of the first word of the bigram in the sentence
106
+ word1_idx = words2.index(bigram[0])
107
+ # Find the start index of the second word of the bigram in the sentence, starting from the index following the first word
108
+ word2_idx = words2.index(bigram[1], word1_idx + 1)
109
+ # Append the matching words to the list
110
+ matching_words.append((words2[word1_idx], words2[word2_idx]))
111
 
112
  return matching_words
113
 
114
 
115
 
116
 
117
+
118
  def remove_overlapping(input_set):
119
  sorted_set = sorted(input_set, key=len, reverse=True)
120
  output_set = set()
 
191
 
192
  for i, bigram in enumerate(matching_bigrams):
193
  if i == 0:
194
+ combined_word += bigram[0] + ('' if bigram[1] in string.punctuation else ' ') + bigram[1]
195
  elif bigram[0] == matching_bigrams[i-1][1]:
196
+ combined_word += bigram[1] if bigram[1] in string.punctuation else ' ' + bigram[1]
197
  else:
198
+ combined_words.append(combined_word.strip())
199
+ combined_word = bigram[0] + ('' if bigram[1] in string.punctuation else ' ') + bigram[1]
200
 
201
  # Append the last combined word
202
+ combined_words.append(combined_word.strip())
203
 
204
  return combined_words
205
 
206
+
207
  # Combine matching bigrams into single words
208
  combined_words = combine_matching_bigrams(matching_words)
209
  combined_words_list.append(combined_words)