# File size: 2,945 Bytes
# 5d4054c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import pandas as pd
import ast
import json

# Input CSV that main() loads with pandas.
DATASET = "data/inc_df_v6_small.csv"
# Output CSV that main() writes once the rows have been filtered.
DATASET_PROCESSED = "data/inc_df.csv"
# JSON file mapping group names to lists of author names; main() reads it to
# decide which authors count as members and which are dropped.
MEMBERS = "data/authors_filter.json"


def _flatten_groups(authors, keys):
    """Return one flat list with the entries of ``authors[key]`` for each key, in order."""
    return [name for key in keys for name in authors[key]]


def _clean_author_list(names, nonmembers):
    """Drop non-member names from one author list and normalise the names that remain."""
    cleaned = []
    for name in names:
        if name in nonmembers:
            continue
        # Put an apostrophe back into the name that had it stripped before parsing.
        name = name.replace("C么te DIvoire", "C么te D 'Ivoire")
        # Canonicalise the AOSIS spelling. This has to happen per list element:
        # the column holds lists at this point, so comparing the whole cell to
        # a string (as the previous code did) can never match.
        if name == "The Alliance Of Small Island States (AOSIS)":
            name = "Alliance Of Small Island States (AOSIS)"
        cleaned.append(name)
    return cleaned


def main():
    """Filter the raw dataset down to member-authored rows and write it to disk.

    Reads ``DATASET``, keeps a fixed set of columns (renaming three of them),
    parses the stringified author lists, keeps only rows with at least one
    author listed under the two "Members - ..." groups of ``MEMBERS``, removes
    non-member names from the remaining author lists, and writes the result to
    ``DATASET_PROCESSED`` without the index.

    Raises:
        FileNotFoundError: if ``DATASET`` or ``MEMBERS`` does not exist.
        KeyError: if an expected column or JSON group key is missing.
        ValueError, SyntaxError: if an author cell is not a valid Python literal.
    """
    # Read the CSV once; the length report uses the same frame.
    df = pd.read_csv(DATASET)
    print(f"Length of dataset: {len(df)}")

    # Keep the original row position as a stable identifier before filtering.
    df["retriever_id"] = df.index
    columns = [
        "retriever_id",
        "description",
        "href",
        "draft_labs_list",
        "authors_list",
        "draft_allcats",
        "doc_subtype",
        "doc_type",
        "text",
        "round",
    ]
    df = df[columns].rename(
        columns={
            "draft_labs_list": "draft_labs",
            "draft_allcats": "draft_cats",
            "authors_list": "author",
        }
    )

    # Subselect for countries and country groups.
    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used -- confirm it matches the encoding of the JSON file.
    with open(MEMBERS, "r") as f:
        authors = json.load(f)

    # Names containing an apostrophe, mapped to an apostrophe-free spelling.
    special_character_words_mapper = {
        "C么te D'Ivoire": "C么te DIvoire",
        "Ligue Camerounaise Des Droits De L'Homme": "Ligue Camerounaise Des Droits De LHomme",
        "Association Pour L'Integration Et La Developpement Durable Au Burundi": "Association Pour LIntegration Et La Developpement Durable Au Burundi",
    }

    # Sets give constant-time membership tests in the per-row lambdas below.
    members = {
        special_character_words_mapper.get(member, member)
        for member in _flatten_groups(
            authors,
            [
                "Members - Countries",
                "Members - International and Regional State Associations",
            ],
        )
    }
    nonmembers = set(
        _flatten_groups(
            authors,
            [
                "Intergovernmental Negotiation Committee",
                "Observers and Other Participants",
            ],
        )
    )

    # A cell such as "['X D'Y']" has an unescaped apostrophe inside a
    # single-quoted literal and cannot be parsed, so swap those exact cells
    # for their apostrophe-free form first. Series.replace avoids the chained
    # ``df[col][mask] = value`` assignment, which is not guaranteed to write
    # through to the frame.
    unparsable_cells = {
        f"['{raw}']": f"['{safe}']"
        for raw, safe in special_character_words_mapper.items()
    }
    df["author"] = df["author"].replace(unparsable_cells)
    df["author"] = df["author"].apply(ast.literal_eval)

    # Keep rows with at least one member author; copy so the assignments
    # below act on an independent frame rather than a slice.
    has_member = df["author"].apply(
        lambda names: any(name in members for name in names)
    )
    df = df[has_member].copy()

    df["author"] = df["author"].apply(
        lambda names: _clean_author_list(names, nonmembers)
    )
    df["draft_labs"] = df["draft_labs"].fillna("[]")

    print(f"Filtered dataset to {len(df)} entries")
    df.to_csv(DATASET_PROCESSED, index=False)

# Run the preprocessing only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()