"""Filter the raw dataset down to rows authored by at least one member.

The script reads ``DATASET``, keeps and renames a fixed set of columns, parses
the stringified author lists, keeps only rows with at least one author listed
under the member groups of ``MEMBERS``, removes non-member co-authors from the
remaining rows and writes the result to ``DATASET_PROCESSED``.
"""

import ast
import json

import pandas as pd

DATASET = "data/inc_df_v6_small.csv"
DATASET_PROCESSED = "data/inc_df.csv"
MEMBERS = "data/authors_filter.json"

# Columns kept from the raw CSV (plus the generated ``retriever_id``), in
# output order.
_COLUMNS = [
    "retriever_id",
    "description",
    "href",
    "draft_labs_list",
    "authors_list",
    "draft_allcats",
    "doc_subtype",
    "doc_type",
    "text",
    "round",
]

# Raw column name -> column name used in the processed file.
_RENAMES = {
    "draft_labs_list": "draft_labs",
    "draft_allcats": "draft_cats",
    "authors_list": "author",
}

# Keys of the MEMBERS JSON whose names count as members / non-members.
_MEMBER_KEYS = (
    "Members - Countries",
    "Members - International and Regional State Associations",
)
_NONMEMBER_KEYS = (
    "Intergovernmental Negotiation Committee",
    "Observers and Other Participants",
)

# Names containing an apostrophe -> apostrophe-free spelling. A raw cell such
# as "['Côte D'Ivoire']" is not a valid Python literal (the apostrophe ends the
# single-quoted string early), so it must be rewritten before
# ``ast.literal_eval`` can parse it.
_APOSTROPHE_FREE = {
    "Côte D'Ivoire": "Côte DIvoire",
    "Ligue Camerounaise Des Droits De L'Homme": "Ligue Camerounaise Des Droits De LHomme",
    "Association Pour L'Integration Et La Developpement Durable Au Burundi": "Association Pour LIntegration Et La Developpement Durable Au Burundi",
}

# Author name that is normalised to a spelling without the leading "The".
_AOSIS_OLD = "The Alliance Of Small Island States (AOSIS)"
_AOSIS_NEW = "Alliance Of Small Island States (AOSIS)"


def _names_under(authors, keys):
    """Return the names listed under *keys* in *authors* as one flat list."""
    return [name for key in keys for name in authors[key]]


def _tidy_authors(names, nonmembers):
    """Return *names* without non-members and with final spellings applied.

    Args:
        names: Parsed author list of one row.
        nonmembers: Set of names to remove from the list.

    Returns:
        A new list; the input list is not modified.
    """
    # NOTE(review): the restored spelling has a space before the apostrophe
    # ("D 'Ivoire"). That looks like a typo, but the string is written to the
    # output file, so it is kept unchanged here - confirm before altering it.
    kept = [
        name.replace("Côte DIvoire", "Côte D 'Ivoire")
        for name in names
        if name not in nonmembers
    ]
    # The original code compared the column of parsed lists against the
    # *string* "['The Alliance ...']", which never matches, so the rename was
    # silently skipped. Compare against the parsed list instead.
    if kept == [_AOSIS_OLD]:
        return [_AOSIS_NEW]
    return kept


def _keep_member_rows(df, members, nonmembers):
    """Return the rows of *df* that have at least one member author.

    Args:
        df: Frame with an ``author`` column holding stringified Python lists
            and a ``draft_labs`` column.
        members: Iterable of member names as spelled in the MEMBERS file.
        nonmembers: Iterable of non-member names as spelled in the MEMBERS
            file.

    Returns:
        A new frame (the input is not modified) in which ``author`` holds
        lists without non-member names and missing ``draft_labs`` values are
        replaced by the string ``"[]"``.

    Raises:
        ValueError, SyntaxError: If an ``author`` cell cannot be parsed by
            ``ast.literal_eval``.
    """
    # Sets make the per-row membership tests O(1). Member names get the same
    # apostrophe-free spelling as the rewritten cells so they still match.
    member_names = {_APOSTROPHE_FREE.get(name, name) for name in members}
    nonmember_names = set(nonmembers)

    # Work on a private copy of the column and assign through a boolean mask
    # directly; chained assignment (``df["author"][mask] = ...``) does not
    # update the frame when pandas copy-on-write is active.
    cells = df["author"].copy()
    for name, fixed in _APOSTROPHE_FREE.items():
        cells[cells == f"['{name}']"] = f"['{fixed}']"

    parsed = cells.apply(ast.literal_eval)
    # ``astype(bool)`` keeps the mask boolean even when the frame is empty.
    has_member = parsed.apply(
        lambda names: any(name in member_names for name in names)
    ).astype(bool)

    # ``.copy()`` so the column assignments below act on an independent frame.
    result = df.loc[has_member].copy()
    result["author"] = parsed[has_member].apply(
        lambda names: _tidy_authors(names, nonmember_names)
    )
    result["draft_labs"] = result["draft_labs"].fillna("[]")
    return result


def main() -> None:
    """Build ``DATASET_PROCESSED`` from ``DATASET`` and ``MEMBERS``.

    Prints the number of rows before and after filtering and writes the
    filtered frame as CSV without the index column.
    """
    # Read once; the original parsed the whole file twice just to print its
    # length.
    df = pd.read_csv(DATASET)
    print(f"Length of dataset: {len(df)}")

    # Preserve the original row position as an identifier before filtering.
    df["retriever_id"] = df.index
    df = df[_COLUMNS].rename(columns=_RENAMES)

    # Subselect for countries and country groups.
    # The file contains non-ASCII names, so do not rely on the platform's
    # default encoding.
    with open(MEMBERS, "r", encoding="utf-8") as f:
        authors = json.load(f)
    members = _names_under(authors, _MEMBER_KEYS)
    nonmembers = _names_under(authors, _NONMEMBER_KEYS)

    df = _keep_member_rows(df, members, nonmembers)

    print(f"Filtered dataset to {len(df)} entries")
    df.to_csv(DATASET_PROCESSED, index=False)


if __name__ == "__main__":
    main()