# Hugging Face Space: GIZ/NegotiateAI (running on CPU Upgrade)
# File size: 2,945 bytes
# Revision: 5d4054c
import pandas as pd
import ast
import json

# Input CSV that main() loads with pandas.
DATASET = "data/inc_df_v6_small.csv"
# Output CSV that main() writes after filtering and renaming columns.
DATASET_PROCESSED = "data/inc_df.csv"
# JSON file mapping author-group names (e.g. "Members - Countries") to lists
# of author names; main() uses it to decide which rows and authors to keep.
MEMBERS = "data/authors_filter.json"


def _collect_authors(authors, group_names):
    """Return the flattened list of author names stored under *group_names*."""
    return [name for group in group_names for name in authors[group]]


def _clean_author_list(names, nonmembers):
    """Drop non-member authors from *names* and normalise the remaining names."""
    cleaned = []
    for name in names:
        if name in nonmembers:
            continue
        # Restore the apostrophe that was stripped before parsing.
        # NOTE(review): the restored spelling "Côte D 'Ivoire" (space before
        # the apostrophe) is kept exactly as the original code wrote it; it
        # looks like a typo for "Côte D'Ivoire" -- confirm what downstream
        # consumers expect before changing it.
        name = name.replace("Côte DIvoire", "Côte D 'Ivoire")
        if name == "The Alliance Of Small Island States (AOSIS)":
            name = "Alliance Of Small Island States (AOSIS)"
        cleaned.append(name)
    return cleaned


def main() -> None:
    """Filter the raw dataset down to member-authored rows and save it.

    Reads ``DATASET``, keeps a fixed subset of columns (renaming three of
    them), parses the stringified author lists, keeps only rows with at least
    one author from the member groups listed in ``MEMBERS``, removes
    non-member authors from the remaining rows, and writes the result to
    ``DATASET_PROCESSED``.

    Raises:
        KeyError: if an expected column or author group is missing.
        ValueError, SyntaxError: if an author cell cannot be parsed by
            ``ast.literal_eval``.
    """
    # Read once; the original loaded the CSV twice just to print its length.
    df = pd.read_csv(DATASET)
    print(f"Length of dataset: {len(df)}")
    df["retriever_id"] = df.index
    columns = [
        "retriever_id",
        "description",
        "href",
        "draft_labs_list",
        "authors_list",
        "draft_allcats",
        "doc_subtype",
        "doc_type",
        "text",
        "round",
    ]

    # rename() returns a new frame, so later assignments never write into a
    # slice of the original DataFrame.
    df = df[columns].rename(
        columns={
            "draft_labs_list": "draft_labs",
            "draft_allcats": "draft_cats",
            "authors_list": "author",
        }
    )

    # Subselect for countries and country groups.
    # Explicit encoding: the author names contain non-ASCII characters.
    with open(MEMBERS, "r", encoding="utf-8") as f:
        authors = json.load(f)

    # Names containing an apostrophe break the single-quoted list literal in
    # the CSV, so the apostrophe is removed before parsing.
    special_character_words_mapper = {
        "Côte D'Ivoire": "Côte DIvoire",
        "Ligue Camerounaise Des Droits De L'Homme": "Ligue Camerounaise Des Droits De LHomme",
        "Association Pour L'Integration Et La Developpement Durable Au Burundi": "Association Pour LIntegration Et La Developpement Durable Au Burundi",
    }
    members = {
        special_character_words_mapper.get(name, name)
        for name in _collect_authors(
            authors,
            [
                "Members - Countries",
                "Members - International and Regional State Associations",
            ],
        )
    }
    nonmembers = set(
        _collect_authors(
            authors,
            [
                "Intergovernmental Negotiation Committee",
                "Observers and Other Participants",
            ],
        )
    )

    # Same exact-match, single-author replacements as before, derived from the
    # mapper instead of being spelled out a second time, and applied without
    # chained indexing (which is not guaranteed to write back to the frame).
    raw_author_fixes = {
        f"['{name}']": f"['{sanitized}']"
        for name, sanitized in special_character_words_mapper.items()
    }
    df["author"] = df["author"].map(lambda raw: raw_author_fixes.get(raw, raw))

    df["author"] = df["author"].apply(ast.literal_eval)
    has_member = df["author"].apply(
        lambda names: any(name in members for name in names)
    )
    # copy() so the assignments below modify an independent frame.
    df = df[has_member].copy()

    # The AOSIS rename now happens per list item inside _clean_author_list.
    # The original compared the already-parsed list column against a string
    # literal, which can never match, so the rename was silently skipped.
    df["author"] = df["author"].apply(
        lambda names: _clean_author_list(names, nonmembers)
    )
    df["draft_labs"] = df["draft_labs"].fillna("[]")

    print(f"Filtered dataset to {len(df)} entries")
    df.to_csv(DATASET_PROCESSED, index=False)


# Run the preprocessing only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()