File size: 2,659 Bytes
6ee47c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import pandas as pd
import re

class LawTxetPreProcessor():
    
    def __init__(self, law_texts: list) -> None:
        self._law_texets = law_texts
        self._law_name_df = pd.DataFrame(columns=["law_index", "law_name"])
        self._madeh_df = pd.DataFrame(columns=["law_index", "madeh_index", "madeh_text"])
        self._is_df = False
        
    def build_df(self):
        title_list = []
        madeh_list = []
        madeh_index = []
        law_index = []
        counter = 0
        for text in self._law_texets:
            title = self.title_extractor(text)
            title_list.append(title)
            temp_madeh_list = self.madeh_extractor(text, title == "قانون اساسی جمهوری اسلامی ایران")
            law_index.extend([counter for i in temp_madeh_list])
            madeh_index.extend([i+1  for i in range(len(temp_madeh_list))])
            madeh_list.extend(temp_madeh_list)
            counter += 1
        law_index_list = [i for i in range(counter)]
        self._madeh_df = pd.DataFrame({"law_index": law_index,
                                    "madeh_index": madeh_index,
                                    "madeh_text": madeh_list})
        self._law_name_df = pd.DataFrame({"law_index": law_index_list,
                                          "law_name": title_list})
        
    def title_extractor(self, law_text: str) -> str:
        first_newline_index = law_text.find('\n')
        return law_text[:first_newline_index]
    
    def madeh_extractor(self, law_text: str, is_asl:False)-> list:
        result = []
        pattern = r"(^.{0,1}اصل )" if is_asl else r"(^.{0,1}ماده)"
        removed_regex = r"❯.*\n"
        notvalid_pattern = r"(^.{0,1}ماده.*مکرر\n)"
        cleaned_text = re.sub(removed_regex, "", law_text)
        matches = re.finditer(pattern, cleaned_text, flags=re.MULTILINE)
        not_valid_matches = re.finditer(notvalid_pattern, cleaned_text, flags=re.MULTILINE)
        indices = [match.start() for match in matches]
        not_valid_indices = [match.start() for match in not_valid_matches]
        valid_indices = [item for item in indices if item not in not_valid_indices]
        for i in range(len(valid_indices)):
            start = valid_indices[i]
            if i != len(valid_indices)-1:
                end = valid_indices[i+1]
                result.append(cleaned_text[start:end])
            else:
                result.append(cleaned_text[start:])
        return result
    
    
    def get_df(self) -> pd.DataFrame:
        if not self._is_df:
            self.build_df()
        return self._law_name_df, self._madeh_df