File size: 2,521 Bytes
b19c8bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# %%
import re

import fitz
import pandas as pd

# %%
# PDF that the rest of the script parses.
document_path = "data/PROPUESTA-DE-BORRADOR-CONSTITUCIONAL-14.05.22-1.pdf"

# %%
# Used as the slice start on `re.split` results, dropping the piece that
# precedes the first heading.
skip_header_offset = 1

# Article heading: "<digits>.-", an optional space, "Artículo", then the
# shortest run of characters that ends in "." or "-".
_article_heading = r"\d+\.- ?Artículo.+?(?:\.|-)"
# Chapter heading: "CAPÍTULO (COM <digits>) " on one line plus the start of
# the following line (lazy match, stops before the next " \n").
_chapter_heading = r"CAPÍTULO \(COM \d+\) \n.+?"

# Each pattern wraps the heading in one capture group, so `split` keeps the
# headings in its output and `findall` returns them as plain strings.
regex_article = re.compile("(" + _article_heading + ")")
regex_chapters = re.compile(
    r"(?<=\n)" + "(" + _chapter_heading + ")" + r"(?= \n)"
)
# %%
# Read every page once, collecting the text of each page and recording, for
# every article heading found on a page, the page number it was seen on.
pdf_page_offset = 1  # first value produced by enumerate for the first page
page_texts = []
page_article = {}
with fitz.open(document_path) as pdf:
    for page_number, pdf_page in enumerate(pdf, start=pdf_page_offset):
        page_text = pdf_page.get_text()
        page_texts.append(page_text)
        # A heading found again on a later page is overwritten with that
        # later page number.
        headings_on_page = regex_article.findall(page_text)
        page_article.update(dict.fromkeys(headings_on_page, page_number))

# Full document text: all page texts concatenated in page order.
document = "".join(page_texts)

len(page_article)
# %%
# Map each chapter heading to the text that follows it. The split output
# contains both headings and the text between them; pieces before
# `skip_header_offset` are ignored.
chapters = {}
chapter_name = "header"
for piece in regex_chapters.split(document)[skip_header_offset:]:
    if not piece.startswith("CAPÍTULO"):
        # Body text: file it under the most recently seen heading.
        chapters[chapter_name] = piece
        continue
    # Heading: collapse its line break into ": " so it is a single line.
    chapter_name = piece.replace(" \n", ": ")
len(chapters), chapters.keys()
# %%
minimum_article_length = 65


def format_article(article):
    """Clean the raw text of one article and return it as a single line.

    Leading "-" and space characters are removed, then the text is split on
    "\\n \\n". In each piece, newlines and "*" characters are deleted and
    surrounding whitespace is trimmed. A piece is kept when it is longer than
    ``minimum_article_length`` characters or starts with "El Estado"; all
    other pieces are dropped. The kept pieces are joined with single spaces.
    """
    kept_pieces = []
    for raw_piece in article.lstrip('- ').split("\n \n"):
        cleaned = raw_piece.replace("\n", "").replace("*", "").strip()
        is_long_enough = len(cleaned) > minimum_article_length
        if is_long_enough or cleaned.startswith("El Estado"):
            kept_pieces.append(cleaned)
    return " ".join(kept_pieces)

# %%
# Build one record per article body: split each chapter on article headings
# and pair every body piece with the heading that precedes it.
chapter_articles = []
for chapter_title, chapter_text in chapters.items():
    current_heading = "header"
    pieces = regex_article.split(chapter_text)[skip_header_offset:]
    for piece in pieces:
        if regex_article.match(piece) is not None:
            # Heading piece: remember it for the body piece(s) that follow.
            current_heading = piece
        else:
            chapter_articles.append(
                {
                    "chapter_name": chapter_title,
                    # None when the heading was not found on any page.
                    "article_page": page_article.get(current_heading),
                    "article_name": current_heading,
                    "article": format_article(piece),
                }
            )
# %%
df_document = pd.DataFrame.from_dict(chapter_articles)

# Raw headings as captured by the article regex; both derived columns below
# are computed from this original Series.
raw_headings = df_document['article_name']

# Leading digits of the heading (kept as strings).
df_document["article_number"] = raw_headings.str.extract(r'(^\d+)', expand=False)

# Heading with the "<digits>.-" prefix removed and trailing "." / "-"
# characters stripped.
heading_without_number = raw_headings.str.extract(r'^\d+\.- ?(.*)', expand=False)
df_document["article_name"] = heading_without_number.str.rstrip(".-")
df_document.head()
# %%
# Write the table to CSV without the index column.
df_document.to_csv("data/articles.csv", index=False)
# %%