Spaces:

drguilhermeapolinario
/

Flamengo

Running

Flamengo / data_cleaning.py

Guilherme Tell Benevenuto Apolinario

Upload data_cleaning.py

c0220f0 verified about 1 year ago

5.4 kB

	import chardet
	import pandas as pd
	import regex


	def iniciar(file_path):
	    """
	    Read a text file using the encoding that chardet detects for it.

	    The file is first read as raw bytes so chardet can guess the encoding,
	    then re-opened in text mode (which also normalises line endings) and
	    read in full.

	    Parameters:
	    - file_path (str): The path to the file to be read.

	    Returns:
	    - str: The decoded contents of the file.
	    """
	    # Detect the file encoding from its raw bytes.
	    with open(file_path, "rb") as file:
	        raw_data = file.read()
	    encoding = chardet.detect(raw_data)["encoding"]
	    print(f"Detected encoding: {encoding}")

	    # chardet reports None when it cannot make a guess (for instance on an
	    # empty file). Passing None to open() would silently use the platform
	    # default, so fall back to an explicit, platform-independent encoding.
	    if encoding is None:
	        encoding = "utf-8"

	    # Read the file with the detected encoding.
	    with open(file_path, "r", encoding=encoding) as file:
	        return file.read()


	def limpa_rci(texto):
	    """
	    Clean up the given text by removing unwanted parts.

	    A single multi-line regular expression, built from the ordered
	    alternatives below, is removed from the text. Afterwards every run of
	    three semicolons is dropped and every remaining pair of semicolons is
	    collapsed into one.

	    Args:
	        texto (str): The text to be cleaned.

	    Returns:
	        str: The cleaned text.
	    """
	    # Each entry is one top-level alternative; they are tried in this order.
	    # They are joined with a plain "|" (regex alternation). An escaped pipe
	    # would match a literal "|" character, and a doubled pipe would add an
	    # empty alternative in front of the later branches.
	    partes_indesejadas = (
	        r"(^e-SUS.+\nMIN(.+\n+){7}.+)",
	        r"((?<=Equipe;)\d+\s\-\s)",
	        r"(^CBO.+\n(.+\n+){8}.+)",
	        r"(^Não\sinf.+\n+Identif.+\nDes.+\nDes.+\nDes.+\nRes.+\n+Iden.+$)",
	        r"(^Não\sin.+\n+Ident.+Cor$)",
	        r"(^Não\sin.+\n+Ident.+\nDesc.+\nBra.+\nNat(.+\n+){34}.+)",
	        r"(^Inf.+\nDesc.+\nAdult.+\n(.+\n+){10}.+)",
	        r"(^Inf.+\n(Des.+\n){3}.+$)",
	        r"(^Inf.+\nDesc.+\n(Tem\salg.+\n){1}.+$)",
	        r"(^Inf.+\nA\slist.+\nDes(.+\n+){8}.+$)",
	        r"(^Cidadão.+\nDesc.+\nÉ.+\nPossu(.+\n))",
	        r"(^Não\sinf.+;0.+$)",
	        r"(;;;;;|;;;;|Ident.+(-|/)\s)",
	        r"((?<=Saída)\sde.+tro|.+(?=ativos))",
	        r"(^C.*(rin|card|resp).+\n.+\n.+[os]\s)",
	        r"(;;;\nT.+\nT.+$)",
	        r"(;Sim|(?<=[az];);|\ssabe;|\s.\sEnfisema)",
	        r"(^Participa.+$|Poss.+de\s|(?<=privado.;\d\d\d);.+|Poss.+de\s|(?<=privado.;\d\d\d\d);.+)",
	        r"((Desc.+\n){3}Resp.+$)",
	        r"(Etnia.+\nD.+\nNã.+$)",
	        r"(Nacio.+\nDesc.+\nB.+\nNat.+\nEst.+$)",
	        r"(Infor.+\nDesc.+\nCôn(.+\n){31}Inf.+$)",
	        r"Deseja\sinf.+$",
	        r"(?<=Ensino\s[fm][ué][nd]).+(\)|[eat][lso])",
	        r"(?<=Pré).+\)",
	        r"(Classe\sde\salfa.+-\s)",
	        r"((?<=Superior).+rado)",
	        r"((?<=Alfabet).+\))",
	        r"(Infor.+\nDesc.+\nEmpreg(.+\n){10}.+$)",
	        r"Intelec.+/\s",
	        r"((Está\s(com\s)?))",
	        r"(^Condiç.+\nD.+$)",
	        r"((Faz\suso.+e\s)|PIC;.+|Tem\s|ou\steve\s|Teve\s|internação.+\nUsa\splant.+$|\s/\sderrame|diagnóst.+de\s)",
	    )
	    rx_marc = "|".join(partes_indesejadas)

	    # Remove the unwanted parts.
	    texto = regex.sub(rx_marc, "", texto, flags=regex.MULTILINE)

	    # Drop runs of three semicolons, then collapse remaining pairs into one.
	    texto = regex.sub(r"(;;;)", "", texto, flags=regex.MULTILINE)
	    texto = regex.sub(r"(;;)", ";", texto, flags=regex.MULTILINE)
	    return texto


	def separa_grupos(texto):
	    """
	    Extract the named groups of interest from the input text.

	    The text is scanned with one verbose, multi-line regular expression
	    whose alternatives each carry a named group. The 'Data' and 'Head'
	    groups take part in the matching but are not collected.

	    Args:
	        texto (str): The input text from which groups are extracted.

	    Returns:
	        dict: Maps 'Idade', 'genero', 'cor', 'deficiencia' and 'doencas' to
	        the list of strings matched for that group, in order of appearance.
	        A group with no match maps to an empty list.
	    """
	    # One alternative per line, separated by a plain "|" (alternation).
	    # Whitespace inside the pattern is ignored because of regex.VERBOSE.
	    grupos = r"""
	    ((?P<Data>Data);\d+/\d+/\d+)|
	    ((?P<Head>[EPCS][aqri][uodí].+;)\w+)|
	    (?P<Idade>(^Menos.+(?=;0)|^[0-9][0-9]\s\w).+(?=;0))|
	    (?P<genero>(^(Masc.+|Fem.+)))|
	    (?P<cor>(^(Bra.+$|Pret.+$|Amar.+$|Par.+$|Indí.+$)))|
	    (?P<deficiencia>(^(Audi.+a;\d+|Fís.+a;\d+|Cogn.+a;\d+|Vis.+l;\d+\nOut.+$)))|
	    (?P<doencas>(^(hipert.+al;\d+|diab.+s;\d+|gesta.+e;\d+|acam.+o;\d+|domici.+o;\d+)))
	    """

	    # The flags are combined with the bitwise-or operator.
	    pattern = regex.compile(grupos, regex.VERBOSE | regex.MULTILINE)

	    grupos_enc = {
	        "Idade": [],
	        "genero": [],
	        "cor": [],
	        "deficiencia": [],
	        "doencas": [],
	    }

	    for match in pattern.finditer(texto):
	        for group_name, group_value in match.groupdict().items():
	            # Skip groups that did not take part in this match and groups
	            # that are not collected (Data, Head).
	            if group_value and group_name in grupos_enc:
	                grupos_enc[group_name].append(group_value)

	    return grupos_enc


	def criar_dataframe(grupos_encontrados):
	    """
	    Create one dataframe per group found in `grupos_encontrados`.

	    The "Idade" group yields the columns "Descrição", "Masculino" and
	    "Feminino", taken from the first three ';'-separated fields of each
	    value. A value with fewer than three fields gets None in the missing
	    columns instead of raising IndexError.

	    Every other group yields the columns "Descrição" and "Valor". Each
	    value is first split on newlines; in each resulting line the text after
	    the last ';' becomes "Valor" and the text before it "Descrição".

	    Args:
	        grupos_encontrados (dict): Maps each group name to its list of
	            matched strings.

	    Returns:
	        dict: Maps each group name to its dataframe. All cells are kept as
	        strings; no numeric conversion is done.
	    """
	    dataframes = {}
	    for grupo, valores in grupos_encontrados.items():
	        if grupo == "Idade":
	            linhas = []
	            for valor in valores:
	                campos = valor.split(";")[:3]
	                # Pad short rows so a missing count becomes None.
	                campos.extend([None] * (3 - len(campos)))
	                linhas.append(campos)
	            df = pd.DataFrame(
	                linhas, columns=["Descrição", "Masculino", "Feminino"]
	            )
	        else:
	            linhas = []
	            for valor in valores:
	                # A single match may span several newline-separated lines.
	                for parte in valor.split("\n"):
	                    # Split on the last ';' only. Without any ';' the whole
	                    # text is the value and the description is empty.
	                    descricao, _, quantidade = parte.rpartition(";")
	                    linhas.append((descricao, quantidade))
	            df = pd.DataFrame(linhas, columns=["Descrição", "Valor"])
	        dataframes[grupo] = df
	    return dataframes