Spaces:

GIZ
/

NegotiateAI

Running on CPU Upgrade

App Files Files Community

NegotiateAI / src /data_processing /document_store_data.py

TeresaK

Upload 35 files

5d4054c verified 5 months ago

raw

history blame

2.94 kB

	import pandas as pd
	import ast
	import json

	# Input CSV that main() loads with pandas.
	DATASET = "data/inc_df_v6_small_4.csv"
	# Output CSV that main() writes after filtering.
	DATASET_PROCESSED = "data/inc_df.csv"
	# JSON file that main() loads; it is indexed by author-group names such as
	# "Members - Countries".
	MEMBERS = "data/authors_filter.json"


	def main(dataset_path=None, output_path=None, members_path=None):
	    """Build the processed dataset of member-authored rows and write it to CSV.

	    Reads the CSV at ``dataset_path``, stores the row index in a new
	    ``retriever_id`` column, keeps a fixed set of columns (renaming three of
	    them), parses the ``author`` column from its string form into lists,
	    keeps only rows with at least one author found under the two
	    "Members - ..." keys of the JSON file at ``members_path``, removes
	    authors found under the two non-member keys, and writes the result to
	    ``output_path`` without the index.

	    Args:
	        dataset_path: CSV to read; defaults to the module-level ``DATASET``.
	        output_path: CSV to write; defaults to ``DATASET_PROCESSED``.
	        members_path: JSON file of author groups; defaults to ``MEMBERS``.

	    Raises:
	        KeyError: If an expected column or JSON key is missing.
	        ValueError, SyntaxError: If an ``author`` cell is not a valid Python
	            literal (raised by ``ast.literal_eval``).
	    """
	    # Module constants are resolved at call time so that calling main() with
	    # no arguments behaves exactly as before.
	    dataset_path = DATASET if dataset_path is None else dataset_path
	    output_path = DATASET_PROCESSED if output_path is None else output_path
	    members_path = MEMBERS if members_path is None else members_path

	    # Parse the CSV once; its length is printed from the loaded frame.
	    df = pd.read_csv(dataset_path)
	    print(f"Length of dataset: {len(df)}")
	    df["retriever_id"] = df.index
	    columns = [
	        "retriever_id",
	        "description",
	        "href",
	        "draft_labs_list",
	        "authors_list",
	        "draft_allcats",
	        "doc_subtype",
	        "doc_type",
	        "text",
	        "round",
	    ]
	    df = df[columns].rename(
	        columns={
	            "draft_labs_list": "draft_labs",
	            "draft_allcats": "draft_cats",
	            "authors_list": "author",
	        }
	    )

	    ###Subselect for countries and country groups
	    # Explicit UTF-8: the names handled below contain non-ASCII characters, so
	    # the platform default encoding must not be relied on.
	    with open(members_path, "r", encoding="utf-8") as f:
	        authors = json.load(f)

	    # Names containing an apostrophe, mapped to an apostrophe-free placeholder.
	    # An unescaped apostrophe inside the single-quoted list repr stored in the
	    # CSV would make ast.literal_eval fail.
	    special_character_words_mapper = {
	        "Côte D'Ivoire": "Cote DIvoire",
	        "Ligue Camerounaise Des Droits De L'Homme": "Ligue Camerounaise Des Droits De LHomme",
	        "Association Pour L'Integration Et La Developpement Durable Au Burundi": "Association Pour LIntegration Et La Developpement Durable Au Burundi",
	    }

	    # Sets give O(1) membership tests in the per-row filters below.
	    member_names = {
	        special_character_words_mapper.get(name, name)
	        for key in (
	            "Members - Countries",
	            "Members - International and Regional State Associations",
	        )
	        for name in authors[key]
	    }
	    nonmember_names = {
	        name
	        for key in (
	            "Intergovernmental Negotiation Committee",
	            "Observers and Other Participants",
	        )
	        for name in authors[key]
	    }

	    # Sanitise the three known problematic single-author cells, then parse
	    # every cell into a list. A plain lookup replaces the former chained
	    # assignment (df["author"][mask] = ...), which pandas warns about and
	    # which has no effect under Copy-on-Write.
	    raw_cell_fixes = {
	        f"['{original}']": f"['{placeholder}']"
	        for original, placeholder in special_character_words_mapper.items()
	    }
	    df["author"] = df["author"].apply(
	        lambda cell: ast.literal_eval(raw_cell_fixes.get(cell, cell))
	    )

	    # Keep rows with at least one member author; copy so that the column
	    # assignments below act on an independent frame.
	    has_member = df["author"].apply(
	        lambda names: any(name in member_names for name in names)
	    )
	    df = df[has_member].copy()
	    df["author"] = df["author"].apply(
	        lambda names: [name for name in names if name not in nonmember_names]
	    )

	    # NOTE(review): this replace looks for "Côte DIvoire", but the placeholder
	    # introduced above is "Cote DIvoire" (no circumflex), so it appears to
	    # never match. Left unchanged because the intended final spelling is
	    # unclear -- confirm before altering the output data.
	    df["author"] = df["author"].apply(
	        lambda names: [name.replace("Côte DIvoire", "Cote D'Ivoire") for name in names]
	    )
	    df["draft_labs"] = df["draft_labs"].fillna("[]")

	    # Drop the leading "The" from AOSIS-only rows. The comparison must be
	    # against a list: the column was parsed above, so the earlier comparison
	    # with the string "['The Alliance ...']" could never match.
	    aosis_old = ["The Alliance Of Small Island States (AOSIS)"]
	    aosis_new = ["Alliance Of Small Island States (AOSIS)"]
	    df["author"] = df["author"].apply(
	        lambda names: list(aosis_new) if names == aosis_old else names
	    )

	    print(f"Filtered dataset to {len(df)} entries")
	    df.to_csv(output_path, index=False)


	# Run the preprocessing only when this file is executed as a script, not
	# when it is imported.
	if __name__ == "__main__":
	    main()