Spaces:

mlnotes
/

borrador_constitucion_chile

App Files Files Community

borrador_constitucion_chile / pdf_to_text.py

palegre

Add application file beta.

b19c8bc over 2 years ago

history blame contribute delete

2.52 kB

	# %%
	import re

	import fitz
	import pandas as pd

	# %%
	# Location of the draft-constitution PDF that the script parses.
	document_path = "data/PROPUESTA-DE-BORRADOR-CONSTITUCIONAL-14.05.22-1.pdf"

	# %%
	# Number of leading items dropped from every `regex.split(...)` result:
	# the first item is whatever text precedes the first heading.
	skip_header_offset = 1
	# Article heading: digits, ".-", an optional space, the word "Artículo",
	# then the shortest run of characters that ends at the first "." or "-".
	# The alternation must be a bare `|`; written as `\|` it is a literal
	# pipe, so the group would only match the three-character text ".|-".
	# The pattern is wrapped in a capturing group so that `split()` keeps
	# the headings in its output.
	regex_article = re.compile(r"(\d+\.- ?Artículo.+?(?:\.|-))")
	# Chapter heading: a line "CAPÍTULO (COM <n>) " that starts right after a
	# newline, plus the following line up to (not including) its trailing
	# " \n". Also captured so that `split()` returns the headings.
	regex_chapters = re.compile(r"(?<=\n)(CAPÍTULO \(COM \d+\) \n.+?)(?= \n)")
	# %%
	# Maps each article heading to the page number on which it was matched.
	page_article = {}
	# Page numbers are reported starting from this value.
	pdf_page_offset = 1
	page_texts = []
	# Walk the PDF once, keeping every page's text and recording the page on
	# which each article heading appears.
	with fitz.open(document_path) as doc:
	    for page_idx, page in enumerate(doc, start=pdf_page_offset):
	        text = page.get_text()
	        page_texts.append(text)
	        # A heading matched again on a later page overwrites the earlier entry.
	        page_article.update(dict.fromkeys(regex_article.findall(text), page_idx))
	# Whole document text: pages concatenated in order, with no separator.
	document = "".join(page_texts)

	len(page_article)
	# %%
	# Maps "CAPÍTULO (COM n): <title>" to the text that follows that heading.
	chapters = {}
	chapter_name = "header"
	# Because the pattern is a capturing group, split() interleaves headings
	# with the text between them; the leading pre-heading item is skipped.
	splited_chapters = regex_chapters.split(document)
	for piece in splited_chapters[skip_header_offset:]:
	    if not piece.startswith("CAPÍTULO"):
	        # Body text: file it under the most recently seen heading.
	        chapters[chapter_name] = piece
	        continue
	    # Heading: collapse its two lines into a single "first: second" label.
	    chapter_name = piece.replace(" \n", ": ")
	len(chapters), chapters.keys()
	# %%
	# Paragraphs of this many characters or fewer are dropped by
	# format_article, unless they start with "El Estado".
	minimum_article_length = 65


	def format_article(article):
	    """Return the article body as one cleaned-up line of text.

	    Leading "-" and space characters are removed, then the text is cut
	    into paragraphs at each "\\n \\n". Every paragraph has its newlines
	    and asterisks deleted and its surrounding whitespace trimmed.
	    A paragraph is kept when it is longer than ``minimum_article_length``
	    or starts with "El Estado"; kept paragraphs are joined with one space.
	    """
	    kept = []
	    for paragraph in article.lstrip('- ').split("\n \n"):
	        cleaned = paragraph.replace("\n", "").replace("*", "").strip()
	        long_enough = len(cleaned) > minimum_article_length
	        if long_enough or cleaned.startswith("El Estado"):
	            kept.append(cleaned)
	    return " ".join(kept)


	# %%
	# One dict per article body, in document order.
	chapter_articles = []
	for chapter_name, chapter in chapters.items():
	    article_name = "header"
	    # split() on the capturing pattern alternates headings and bodies;
	    # the text before the chapter's first heading is skipped.
	    for fragment in regex_article.split(chapter)[skip_header_offset:]:
	        if regex_article.match(fragment) is not None:
	            # A heading: remember it for the body that follows.
	            article_name = fragment
	        else:
	            chapter_articles.append(
	                {
	                    "chapter_name": chapter_name,
	                    "article_page": page_article.get(article_name),
	                    "article_name": article_name,
	                    "article": format_article(fragment),
	                }
	            )
	# %%
	df_document = pd.DataFrame.from_dict(chapter_articles)

	# Work from the raw heading text for both derived columns.
	raw_names = df_document['article_name']
	# Leading digits of the heading become the article number.
	df_document["article_number"] = raw_names.str.extract(r'(^\d+)', expand=False)
	# Whatever follows the "<digits>.-" prefix, minus trailing "." and "-"
	# characters, becomes the article name.
	name_without_prefix = raw_names.str.extract(r'^\d+\.- ?(.*)', expand=False)
	df_document["article_name"] = name_without_prefix.str.rstrip(".-")
	df_document.head()
	# %%
	# Write the table to CSV without the index column.
	df_document.to_csv("data/articles.csv", index=False)
	# %%