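# lcs.py -- last commit "Update lcs.py" (3b4471f, verified) by jgyasu; 1.25 kB.
# (Header reconstructed from file-viewer page residue: picture / raw / history / blame links.)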
import re
from nltk.corpus import stopwords
def find_common_subsequences(sentence, str_list, max_n=5):
    """Find word n-grams of ``sentence`` that occur in every string of ``str_list``.

    All texts are lowercased, stripped of every character that is neither a
    word character nor whitespace, and filtered of English stop words (NLTK
    list) before comparison.  N-grams are then taken from the cleaned
    sentence, longest first (``max_n`` words down to 1) and left to right
    within each length.  An n-gram is kept when it appears, on word
    boundaries, in every cleaned string of ``str_list`` and is not already
    contained, as a run of whole words, in a previously kept (longer or
    equal) n-gram.

    Args:
        sentence: The reference text whose n-grams are tested.
        str_list: Iterable of texts that must all contain an n-gram for it to
            be reported.  If it is empty, every n-gram of ``sentence``
            qualifies (vacuous truth of ``all``).
        max_n: Largest n-gram length, in words, to consider.  Defaults to 5;
            values below 1 yield an empty result.

    Returns:
        A list of ``(index, phrase)`` tuples in discovery order, where
        ``index`` counts up from 1 and ``phrase`` is the space-joined n-gram.

    Note:
        ``stopwords.words('english')`` needs the NLTK stop-word corpus to be
        installed -- NOTE(review): confirm it is downloaded in deployment.
    """
    stop_words = set(stopwords.words('english'))

    def normalize(text):
        # Lowercase, drop punctuation, remove stop words, and re-join with
        # single spaces so that phrases can be compared as plain strings.
        text = re.sub(r'[^\w\s]', '', text.lower())
        return " ".join(word for word in text.split() if word not in stop_words)

    def is_present(phrase, texts):
        # Word-boundary search so "art" does not match inside "smart".
        pattern = re.compile(r'\b' + re.escape(phrase) + r'\b')
        return all(pattern.search(text) for text in texts)

    def is_covered(phrase, accepted):
        # Whole-word containment: padding both sides with a space makes the
        # substring test respect word boundaries.  A raw ``phrase in longer``
        # test would wrongly treat "cat" as covered by "concatenate strings".
        padded = f" {phrase} "
        return any(padded in f" {longer} " for longer in accepted)

    words = normalize(sentence).split()
    cleaned_texts = [normalize(text) for text in str_list]

    common_grams = []
    added_phrases = []

    # Longest n-grams first, so shorter ones nested in them are skipped.
    for n in range(max_n, 0, -1):
        for start in range(len(words) - n + 1):
            subseq = " ".join(words[start:start + n])
            if is_covered(subseq, added_phrases):
                continue
            if is_present(subseq, cleaned_texts):
                # The index is the 1-based rank of the phrase in the result.
                common_grams.append((len(common_grams) + 1, subseq))
                added_phrases.append(subseq)

    return common_grams