Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 2,945 Bytes
5d4054c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
import pandas as pd
import ast
import json
# Input CSV that main() loads with pandas.read_csv.
DATASET = "data/inc_df_v6_small.csv"
# Output CSV that main() writes (without the index column) after filtering.
DATASET_PROCESSED = "data/inc_df.csv"
# JSON file that main() loads; it is indexed by author-group name and each
# group is iterated as a list of author names.
MEMBERS = "data/authors_filter.json"
def main():
    """Filter the raw dataset down to member-authored documents and save it.

    Reads the CSV at ``DATASET``, keeps a fixed set of columns (renaming three
    of them), parses the stringified author lists, keeps only rows that have
    at least one author from the two "Members - ..." groups of the ``MEMBERS``
    JSON file, removes non-member names from the remaining author lists, and
    writes the result to ``DATASET_PROCESSED`` without the index column.

    Raises:
        KeyError: if an expected column or author group is missing.
        ValueError, SyntaxError: if an author cell is not a valid Python
            literal (raised by ``ast.literal_eval``).
    """
    # Read the file once and reuse the frame for the size report.
    df = pd.read_csv(DATASET)
    print(f"Length of dataset: {len(df)}")

    # Record each row's original index before any filtering happens.
    df["retriever_id"] = df.index

    columns = [
        "retriever_id",
        "description",
        "href",
        "draft_labs_list",
        "authors_list",
        "draft_allcats",
        "doc_subtype",
        "doc_type",
        "text",
        "round",
    ]
    # rename() returns a new frame, so the column assignments below never
    # write into a slice of the frame that was read from disk.
    df = df[columns].rename(
        columns={
            "draft_labs_list": "draft_labs",
            "draft_allcats": "draft_cats",
            "authors_list": "author",
        }
    )

    # --- Subselect for countries and country groups ---
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(MEMBERS, "r", encoding="utf-8") as f:
        authors = json.load(f)

    # Names containing an apostrophe break the single-quoted list literals in
    # the author column, so they are mapped to apostrophe-free spellings.
    # NOTE(review): "C么te" looks like a mis-decoded "Côte"; the literals are
    # kept exactly as found because they must match the data - confirm.
    special_character_words_mapper = {
        "C么te D'Ivoire": "C么te DIvoire",
        "Ligue Camerounaise Des Droits De L'Homme": "Ligue Camerounaise Des Droits De LHomme",
        "Association Pour L'Integration Et La Developpement Durable Au Burundi": "Association Pour LIntegration Et La Developpement Durable Au Burundi",
    }

    # Sets give constant-time membership tests in the per-row lambdas below
    # (assumes author names are hashable, i.e. plain strings).
    members = {
        special_character_words_mapper.get(name, name)
        for group in (
            "Members - Countries",
            "Members - International and Regional State Associations",
        )
        for name in authors[group]
    }
    nonmembers = {
        name
        for group in (
            "Intergovernmental Negotiation Committee",
            "Observers and Other Participants",
        )
        for name in authors[group]
    }

    # Patch the raw (still stringified) single-author cells so that
    # ast.literal_eval can parse them. Built from the mapper above so the two
    # cannot drift apart; only exact whole-cell matches are replaced.
    raw_cell_fixes = {
        f"['{name}']": f"['{fixed}']"
        for name, fixed in special_character_words_mapper.items()
    }
    df["author"] = df["author"].map(lambda cell: raw_cell_fixes.get(cell, cell))

    # Turn each "['A', 'B']" string into a real Python list.
    df["author"] = df["author"].apply(ast.literal_eval)

    # Keep rows with at least one member author. The explicit copy makes the
    # filtered frame independent, so later assignments are unambiguous.
    has_member = df["author"].apply(
        lambda names: any(name in members for name in names)
    )
    df = df[has_member.astype(bool)].copy()

    def _clean_authors(names):
        """Drop non-member names and normalise the remaining spellings."""
        # NOTE(review): the restored spelling has a space before the
        # apostrophe ("D 'Ivoire"); kept as found - confirm it is intended.
        kept = [
            name.replace("C么te DIvoire", "C么te D 'Ivoire")
            for name in names
            if name not in nonmembers
        ]
        # The column holds lists at this point, so the rename has to compare
        # against a list; comparing against the string "['The Alliance ...']"
        # can never match.
        # NOTE(review): if the members file lists this name without "The",
        # the rename must happen before the member filter instead - confirm.
        if kept == ["The Alliance Of Small Island States (AOSIS)"]:
            kept = ["Alliance Of Small Island States (AOSIS)"]
        return kept

    df["author"] = df["author"].apply(_clean_authors)

    # Missing label lists are stored as the string form of an empty list.
    df["draft_labs"] = df["draft_labs"].fillna("[]")

    print(f"Filtered dataset to {len(df)} entries")
    df.to_csv(DATASET_PROCESSED, index=False)
# Run the preprocessing only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|