|
|
|
import re |
|
|
|
import fitz |
|
import pandas as pd |
|
|
|
|
|
# --- Parsing configuration -------------------------------------------------

# Heading of an article: a number, ".-", an optional space, the word
# "Artículo", then the shortest run of characters ending in "." or "-".
# The whole heading is one capture group, so `split()` keeps the headings
# in its output, interleaved with the text between them.
regex_article = re.compile(r"(\d+\.- ?Artículo.+?(?:\.|-))")

# Heading of a chapter: a "CAPÍTULO (COM <n>) " line plus the following line
# (up to, but excluding, its trailing " \n").  Must be preceded by a newline.
# Also a single capture group, for the same `split()` reason as above.
regex_chapters = re.compile(r"(?<=\n)(CAPÍTULO \(COM \d+\) \n.+?)(?= \n)")

# `regex.split()` returns the text that precedes the first heading as its
# first element; slicing with this offset drops that leading element.
skip_header_offset = 1

# Location of the PDF to parse.
document_path = "data/PROPUESTA-DE-BORRADOR-CONSTITUCIONAL-14.05.22-1.pdf"
|
|
|
# Read the PDF page by page, producing:
#   * `document`     - the text of every page concatenated in page order;
#   * `page_article` - article heading -> page number on which it was found.
#
# Page texts are collected in a list and joined once at the end instead of
# growing a string with `+=` inside the loop (repeated concatenation copies
# the accumulated text again and again).  The resulting `document` value is
# the same as with `+=`.
page_texts = []

page_article = {}

# `enumerate` starts counting here, so stored page numbers are 1-based.
pdf_page_offset = 1

with fitz.open(document_path) as doc:
    for page_idx, page in enumerate(doc, pdf_page_offset):
        text = page.get_text()
        page_texts.append(text)

        # Headings are searched per page so each one can be tied to a page.
        # If the same heading text shows up on several pages, the last page
        # wins because the assignment overwrites the earlier entry.
        articles = regex_article.findall(text)
        for article in articles:
            page_article[article] = page_idx

document = "".join(page_texts)

# Notebook-style inspection: number of distinct article headings found.
len(page_article)
|
|
|
# Split the full text into chapters: `chapters` maps a chapter title to the
# text that follows that title (up to the next chapter heading).
chapters = {}

# Title used for any body text seen before a heading has been encountered.
chapter_name = "header"

# Because the pattern has one capture group, the split output alternates
# between heading strings and the text that follows each heading.
splited_chapters = regex_chapters.split(document)

for chapter in splited_chapters[skip_header_offset:]:
    if not chapter.startswith("CAPÍTULO"):
        # Body text: file it under the most recently seen heading.
        chapters[chapter_name] = chapter
        continue

    # Heading: collapse its two lines into a single "first: second" title.
    chapter_name = ": ".join(chapter.split(" \n"))

# Notebook-style inspection: chapter count and the chapter titles.
len(chapters), chapters.keys()
|
|
|
# Default threshold: fragments with this many characters or fewer are
# dropped by `format_article` (unless they start with the keep-prefix).
minimum_article_length = 65


def format_article(article, *, min_length=None, keep_prefix="El Estado"):
    """Clean the raw text of one article and return it as a single string.

    The text is stripped of leading ``-``/space characters and split into
    fragments on ``"\\n \\n"``.  Each fragment has its newlines and ``*``
    characters removed and its surrounding whitespace trimmed.  A fragment
    is kept when it is longer than ``min_length`` characters or when it
    starts with ``keep_prefix``; every other fragment is discarded.  The
    kept fragments are joined with a single space.

    Args:
        article: Raw article text.
        min_length: Fragments whose cleaned length is less than or equal to
            this value are dropped.  ``None`` (the default) means "use the
            module-level ``minimum_article_length``", looked up at call time.
        keep_prefix: Prefix (or tuple of prefixes, as accepted by
            ``str.startswith``) that exempts a fragment from the length
            filter.

    Returns:
        The cleaned fragments joined by spaces; ``""`` if none survive.
    """
    if min_length is None:
        # Resolved at call time so later changes to the module constant
        # are still honoured, exactly as with the hard-coded lookup.
        min_length = minimum_article_length

    kept_fragments = []
    for fragment in article.lstrip("- ").split("\n \n"):
        cleaned = fragment.replace("\n", "").replace("*", "").strip()
        if len(cleaned) > min_length or cleaned.startswith(keep_prefix):
            kept_fragments.append(cleaned)

    return " ".join(kept_fragments)
|
|
|
|
|
# Build one record per article body found inside each chapter.
chapter_articles = []

for chapter_name, chapter in chapters.items():
    # Name attached to body text that appears before any heading is seen.
    article_name = "header"

    # One capture group in the pattern => headings are kept in the output,
    # interleaved with the text that follows each of them.
    splited_articles = regex_article.split(chapter)

    for article in splited_articles[skip_header_offset:]:
        if regex_article.match(article) is not None:
            # A heading: remember it for the body text that comes next.
            article_name = article
        else:
            # Body text: emit a record under the current heading.  The page
            # is None when the heading was not found on any single page.
            chapter_articles.append(
                dict(
                    chapter_name=chapter_name,
                    article_page=page_article.get(article_name),
                    article_name=article_name,
                    article=format_article(article),
                )
            )
|
|
|
# Turn the article records into a table and export it as CSV.
#
# The records are a list of dicts, so the DataFrame constructor is used
# directly (`DataFrame.from_dict` is documented for dict input).  Naming the
# columns explicitly keeps the column order fixed and means an empty record
# list still yields a frame with these columns, instead of failing with a
# KeyError on the column lookups below.
df_document = pd.DataFrame(
    chapter_articles,
    columns=["chapter_name", "article_page", "article_name", "article"],
)

# Leading digits of the raw heading (kept as text).  This must run before
# "article_name" is overwritten below, since it reads the raw heading.
df_document["article_number"] = (
    df_document["article_name"]
    .str.extract(r'(^\d+)', expand=False)
)

# Heading without its "<number>.-" prefix and without trailing "." / "-".
df_document["article_name"] = (
    df_document["article_name"]
    .str.extract(r'^\d+\.- ?(.*)', expand=False)
    .str.rstrip(".-")
)

# Notebook-style inspection of the first rows.
df_document.head()

df_document.to_csv("data/articles.csv", index=False)
|
|
|
|