File size: 745 Bytes
a608bb4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import re


class TextAnalyzer:
    """Store a collection of sentences in cleaned form and tokenize them.

    Cleaning happens once, at construction time: the Ethiopic comma, the
    ASCII comma and round brackets are deleted, every run of whitespace is
    collapsed to a single space, and leading/trailing whitespace is removed.
    """

    # Characters deleted from every sentence: Ethiopic comma (U+1363),
    # ASCII comma, and opening/closing round brackets.
    _PUNCTUATION = re.compile(r'[፣,()]')
    # Any run of whitespace (spaces, tabs, newlines, ...).
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self, sentences):
        """Store *sentences* and clean them immediately.

        Args:
            sentences: An iterable of strings, one per sentence. A single
                string is accepted as well and treated as one sentence.
        """
        # Iterating a bare string yields its characters, which would turn
        # one sentence into a list of single-character "sentences".
        if isinstance(sentences, str):
            sentences = [sentences]
        self.sentences = sentences
        self.clean_sentences()

    def get_tokens(self):
        """Return all whitespace-separated words of all sentences, in order."""
        return [word for sentence in self.sentences for word in sentence.split()]

    def get_sentences(self):
        """Return the list of cleaned sentences."""
        return self.sentences

    def clean_sentences(self):
        """Replace ``self.sentences`` with a new list of cleaned sentences.

        Punctuation is deleted before whitespace is collapsed, so a gap left
        by a removed character surrounded by spaces shrinks to one space.
        """
        self.sentences = [
            self._WHITESPACE.sub(' ', self._PUNCTUATION.sub('', sentence)).strip()
            for sentence in self.sentences
        ]