from typing import Optional

import pandas as pd
from datasets import load_dataset
from hamilton.function_modifiers import config, extract_fields

from utils.util import get_proj_root


@config.when(loader="hf") |
|
def miws_dataset__hf(project_root: str = get_proj_root()) -> pd.DataFrame: |
|
"""Load the MIWS dataset using the HuggingFace Dataset library""" |
|
dataset = load_dataset(f"{project_root}/data2/", split="train") |
|
return dataset.to_pandas() |
|
|
|
|
|
@config.when(loader="pd") |
|
def miws_dataset__pd(project_root: str = get_proj_root()) -> pd.DataFrame: |
|
"""Load the MIWS dataset using the Pandas library""" |
|
return pd.read_csv(f"{project_root}/data/MIWS_Seed_Complete.csv", delimiter=',', quotechar='"') |
|
|
|
|
|
def clean_dataset(miws_dataset: pd.DataFrame) -> pd.DataFrame:
    """Remove duplicate rows and drop unnecessary columns."""
    # Keep only the columns used downstream, then drop exact duplicates.
    cleaned_df = miws_dataset[["ID", "Text", "Ideology", "Label", "Geographical_Location"]]
    return cleaned_df.drop_duplicates()


def filtered_dataset(clean_dataset: pd.DataFrame, n_rows_to_keep: Optional[int] = 400) -> pd.DataFrame:
    """Keep only the first `n_rows_to_keep` rows; pass None to keep the full dataset."""
    if n_rows_to_keep is None:
        return clean_dataset
    return clean_dataset.head(n_rows_to_keep)


@extract_fields(
    dict(
        ids=list[int],
        text_contents=list[str],
        ideologies=list[str],
        labels=list[str],
        locations=list[str],
    )
)
def dataset_rows(filtered_dataset: pd.DataFrame) -> dict:
    """Convert the dataframe into a dict of column lists, one per extracted field."""
    df_as_dict = filtered_dataset.to_dict(orient="list")
    return dict(
        ids=df_as_dict["ID"],
        text_contents=df_as_dict["Text"],
        ideologies=df_as_dict["Ideology"],
        labels=df_as_dict["Label"],
        locations=df_as_dict["Geographical_Location"],
    )
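

# Minimal usage sketch (not part of the dataflow): build a Hamilton driver over
# this module and request the extracted fields. The Builder API and the
# `loader` config key are as used above; the data paths and
# `utils.util.get_proj_root` are assumed to exist in the project layout.
if __name__ == "__main__":
    import sys

    from hamilton import driver

    dr = (
        driver.Builder()
        .with_config({"loader": "pd"})  # switch to "hf" to use the datasets-based loader
        .with_modules(sys.modules[__name__])
        .build()
    )
    # Hamilton resolves the chain miws_dataset -> clean_dataset -> filtered_dataset
    # -> dataset_rows and returns the requested extracted fields.
    results = dr.execute(["ids", "text_contents", "ideologies", "labels", "locations"])
    print({name: len(values) for name, values in results.items()})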