am_text_summary / train /text_analyzer.py
berito's picture
train code added
a608bb4
raw
history blame
745 Bytes
import re
class TextAnalyzer:
    """Hold a collection of sentences and expose them cleaned and tokenized.

    The sentences passed to the constructor are normalized immediately:
    selected punctuation marks are removed and whitespace is collapsed.
    The cleaned result is stored as a list in ``self.sentences``.
    """

    # Characters stripped from every sentence: the Ethiopic comma (፣),
    # the ASCII comma and both parentheses.
    _PUNCTUATION = re.compile(r'[፣,()]')
    # Any run of whitespace (spaces, tabs, newlines) is collapsed to one space.
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self, sentences):
        """Store *sentences* (an iterable of strings) and clean them."""
        self.sentences = sentences
        self.clean_sentences()

    def get_tokens(self):
        """Return a flat list of whitespace-separated words of all sentences."""
        return [word for sentence in self.sentences for word in sentence.split()]

    def get_sentences(self):
        """Return the list of cleaned sentences."""
        return self.sentences

    def clean_sentences(self):
        """Replace ``self.sentences`` with a cleaned list of the same length.

        For each sentence the punctuation in ``_PUNCTUATION`` is deleted
        first, then whitespace runs are collapsed and the ends are trimmed.
        Punctuation goes first so that gaps it leaves behind (e.g. ``"a , b"``)
        are collapsed as well. Sentences that end up empty are kept, so
        positions still line up with the original input.
        """
        strip_punctuation = self._PUNCTUATION.sub
        squeeze_whitespace = self._WHITESPACE.sub
        self.sentences = [
            squeeze_whitespace(' ', strip_punctuation('', sentence)).strip()
            for sentence in self.sentences
        ]