# Spaces: Running on CPU Upgrade
import ast
import json

import pandas as pd
# Raw input CSV; main() reads it with pandas.
DATASET = "data/inc_df_v6_small_4.csv"
# Output CSV; main() writes the filtered rows here.
DATASET_PROCESSED = "data/inc_df.csv"
# JSON file mapping author-category names to lists of author names;
# main() uses it to decide which authors count as members.
MEMBERS = "data/authors_filter.json"
def main(dataset_path=None, members_path=None, output_path=None):
    """Filter the raw dataset down to member-authored rows and save it as CSV.

    Steps: read the CSV, keep a fixed set of columns (renaming three of
    them), parse the stringified author lists, keep only rows with at least
    one author from the two "Members - ..." categories of the authors JSON,
    strip authors listed in the non-member categories, normalise a few
    author names, and write the result.

    Args:
        dataset_path: CSV to read. Defaults to the module constant DATASET.
        members_path: JSON file mapping category names to lists of author
            names. Defaults to the module constant MEMBERS.
        output_path: CSV to write. Defaults to the module constant
            DATASET_PROCESSED.

    Raises:
        KeyError: if an expected column or JSON category is missing.
        ValueError, SyntaxError: from ast.literal_eval when an author cell
            is not a valid Python literal.
    """
    # Resolve defaults lazily so explicit arguments never touch the constants.
    dataset_path = DATASET if dataset_path is None else dataset_path
    members_path = MEMBERS if members_path is None else members_path
    output_path = DATASET_PROCESSED if output_path is None else output_path

    # Read the file once and reuse the frame for the length report.
    df = pd.read_csv(dataset_path)
    print(f"Length of dataset: {len(df)}")

    # Preserve the original row position as a stable id before any filtering.
    df["retriever_id"] = df.index

    columns = [
        "retriever_id",
        "description",
        "href",
        "draft_labs_list",
        "authors_list",
        "draft_allcats",
        "doc_subtype",
        "doc_type",
        "text",
        "round",
    ]
    df = df[columns].rename(
        columns={
            "draft_labs_list": "draft_labs",
            "draft_allcats": "draft_cats",
            "authors_list": "author",
        }
    )

    # Subselect for countries and country groups.
    # JSON is UTF-8 by specification; be explicit because names are non-ASCII.
    with open(members_path, "r", encoding="utf-8") as f:
        authors = json.load(f)

    # Names containing an apostrophe break the single-quoted list literals
    # stored in the CSV, so they are mapped to apostrophe-free spellings.
    # NOTE(review): "么" looks like a mis-decoded "ô"; confirm these keys
    # match the bytes actually present in the dataset and the JSON file.
    special_character_words_mapper = {
        "C么te D'Ivoire": "Cote DIvoire",
        "Ligue Camerounaise Des Droits De L'Homme": "Ligue Camerounaise Des Droits De LHomme",
        "Association Pour L'Integration Et La Developpement Durable Au Burundi": "Association Pour LIntegration Et La Developpement Durable Au Burundi",
    }

    # Sets give O(1) membership tests in the per-row filters below.
    members = {
        special_character_words_mapper.get(name, name)
        for key in (
            "Members - Countries",
            "Members - International and Regional State Associations",
        )
        for name in authors[key]
    }
    nonmembers = {
        name
        for key in (
            "Intergovernmental Negotiation Committee",
            "Observers and Other Participants",
        )
        for name in authors[key]
    }

    # Repair single-author cells such as "['X D'Y']" so literal_eval can
    # parse them. Only exact whole-cell matches are rewritten.
    cell_repairs = {
        f"['{raw}']": f"['{clean}']"
        for raw, clean in special_character_words_mapper.items()
    }
    df["author"] = df["author"].map(lambda cell: cell_repairs.get(cell, cell))
    df["author"] = df["author"].apply(ast.literal_eval)

    # Keep rows with at least one member author; copy so the assignments
    # below modify an independent frame rather than a slice of the original.
    has_member = df["author"].apply(
        lambda names: any(name in members for name in names)
    ).astype(bool)
    df = df[has_member].copy()

    # The AOSIS rename is applied per list item. The previous whole-column
    # comparison against a string could never match once the cells had been
    # parsed into lists, so the rename silently did nothing.
    author_renames = {
        "The Alliance Of Small Island States (AOSIS)": "Alliance Of Small Island States (AOSIS)",
    }

    def normalise(name):
        # NOTE(review): this looks for the accented, apostrophe-free form,
        # which the cell repair above does not produce; confirm intent.
        name = name.replace("C么te DIvoire", "Cote D'Ivoire")
        return author_renames.get(name, name)

    df["author"] = df["author"].apply(
        lambda names: [normalise(name) for name in names if name not in nonmembers]
    )

    # Missing label lists are stored as the string form of an empty list.
    df["draft_labs"] = df["draft_labs"].fillna("[]")

    print(f"Filtered dataset to {len(df)} entries")
    df.to_csv(output_path, index=False)
# Run the preprocessing only when executed as a script, not on import.
if __name__ == "__main__":
    main()