# NegotiateAI / src/data_processing/document_store_data.py
# Hugging Face file-viewer header (uploaded by TeresaK, "Upload 35 files",
# commit 5d4054c, verified, 2.94 kB) -- page residue, not part of the program.
import pandas as pd
import ast
import json
# Raw input CSV that main() reads with pandas.
DATASET = "data/inc_df_v6_small_4.csv"
# Output CSV that main() writes after filtering.
DATASET_PROCESSED = "data/inc_df.csv"
# JSON file that main() loads; it is indexed by author-group name and each
# group is iterated as a list of author names.
MEMBERS = "data/authors_filter.json"
def main():
    """Filter the raw document dataset down to member-authored entries.

    Steps, in order:

    1. Read ``DATASET`` and expose the row index as ``retriever_id``.
    2. Keep a fixed set of columns and shorten three column names.
    3. Load the author groups from ``MEMBERS``: the two "Members - ..."
       groups form the keep-list, the other two groups form the strip-list.
    4. Rewrite single-author cells whose name contains an apostrophe, then
       parse every ``author`` cell with ``ast.literal_eval``.
    5. Keep rows with at least one member author, remove non-member names
       from the remaining author lists and normalise the AOSIS name.
    6. Fill missing ``draft_labs`` with ``"[]"`` and write the result to
       ``DATASET_PROCESSED``.

    Raises:
        KeyError: if an expected column or author group is missing.
        ValueError, SyntaxError: if an ``author`` cell is not a valid
            Python literal.
    """
    # Read once; the file used to be parsed twice just to print its length.
    df = pd.read_csv(DATASET)
    print(f"Length of dataset: {len(df)}")

    df["retriever_id"] = df.index
    columns = [
        "retriever_id",
        "description",
        "href",
        "draft_labs_list",
        "authors_list",
        "draft_allcats",
        "doc_subtype",
        "doc_type",
        "text",
        "round",
    ]
    df = df[columns].rename(
        columns={
            "draft_labs_list": "draft_labs",
            "draft_allcats": "draft_cats",
            "authors_list": "author",
        }
    )

    # Subselect for countries and country groups.
    # Explicit encoding: the file holds non-ASCII names and the platform
    # default is not guaranteed to be UTF-8.
    with open(MEMBERS, "r", encoding="utf-8") as f:
        authors = json.load(f)

    # Names whose apostrophe would break the single-quoted list literal in
    # the ``author`` column, mapped to an apostrophe-free spelling.
    # NOTE(review): "C么te" looks like a mis-decoded "Côte" -- confirm against
    # the encoding actually used by the dataset and the members file.
    special_character_words_mapper = {
        "C么te D'Ivoire": "Cote DIvoire",
        "Ligue Camerounaise Des Droits De L'Homme": "Ligue Camerounaise Des Droits De LHomme",
        "Association Pour L'Integration Et La Developpement Durable Au Burundi": "Association Pour LIntegration Et La Developpement Durable Au Burundi",
    }

    # Sets give O(1) membership tests inside the per-row lambdas below.
    members = {
        special_character_words_mapper.get(name, name)
        for key in (
            "Members - Countries",
            "Members - International and Regional State Associations",
        )
        for name in authors[key]
    }
    nonmembers = {
        name
        for key in (
            "Intergovernmental Negotiation Committee",
            "Observers and Other Participants",
        )
        for name in authors[key]
    }

    # Patch the raw single-author cells before parsing. ``.loc`` replaces the
    # former chained assignment (``df["author"][mask] = ...``), which is not
    # guaranteed to write through to ``df``.
    for raw_name, safe_name in special_character_words_mapper.items():
        df.loc[df["author"] == f"['{raw_name}']", "author"] = f"['{safe_name}']"

    df["author"] = df["author"].apply(ast.literal_eval)

    # ``astype(bool)`` keeps the mask boolean even when the frame is empty.
    has_member = (
        df["author"]
        .apply(lambda names: any(name in members for name in names))
        .astype(bool)
    )
    # Copy so the column assignments below act on an independent frame.
    df = df[has_member].copy()

    aosis_old = "The Alliance Of Small Island States (AOSIS)"
    aosis_new = "Alliance Of Small Island States (AOSIS)"

    def clean_authors(names):
        """Drop non-member names and normalise spellings in one author list."""
        kept = [name for name in names if name not in nonmembers]
        # NOTE(review): the patch above yields "Cote DIvoire", which this
        # pattern does not match -- confirm which spelling is intended.
        kept = [name.replace("C么te DIvoire", "Cote D'Ivoire") for name in kept]
        # Previously the parsed list column was compared against a string,
        # which never matched, so this rename silently did nothing.
        return [aosis_new if name == aosis_old else name for name in kept]

    df["author"] = df["author"].apply(clean_authors)
    df["draft_labs"] = df["draft_labs"].fillna("[]")

    print(f"Filtered dataset to {len(df)} entries")
    df.to_csv(DATASET_PROCESSED, index=False)
# Run the preprocessing only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()