babel_machine / label_dicts.py
kovacsvi
minor, minor-media label name split
c14e676
import os
import json
import pandas as pd
from google.cloud import bigquery
# Build a BigQuery client from service-account credentials. The key is read as
# a JSON string from the GCP_SERVICE_ACCOUNT_JSON environment variable; if the
# variable is not set, the os.environ lookup raises KeyError.
service_account_info = json.loads(os.environ["GCP_SERVICE_ACCOUNT_JSON"])
client = bigquery.Client.from_service_account_info(service_account_info)
# Fetch every row of the codebook table into a DataFrame. This is module-level
# code, so the query is executed as soon as the module is loaded. The helpers
# below read the "task", "index", "code" and "name" columns of the result.
query = "SELECT * FROM `upheld-magpie-314312.codebooks.codebooks_full`"
query_job = client.query(query)
df_codebooks = query_job.result().to_dataframe()
def get_label_names(df, task):
    """Return a ``{code: name}`` dict for the rows of *df* belonging to *task*.

    Rows are selected by equality on the ``task`` column and visited in
    ascending ``index`` order, which is therefore the insertion order of the
    returned dict. A task with no matching rows yields an empty dict.
    """
    belongs_to_task = df["task"] == task
    ordered_rows = df.loc[belongs_to_task].sort_values(by="index")
    return {
        code: name
        for code, name in zip(ordered_rows["code"], ordered_rows["name"])
    }
def get_num_dict(df, task):
    """Return an ``{index: code}`` dict for the rows of *df* belonging to *task*.

    Rows are selected by equality on the ``task`` column and visited in
    ascending ``index`` order, which is therefore the insertion order of the
    returned dict. A task with no matching rows yields an empty dict.
    """
    belongs_to_task = df["task"] == task
    ordered_rows = df.loc[belongs_to_task].sort_values(by="index")
    return {
        position: code
        for position, code in zip(ordered_rows["index"], ordered_rows["code"])
    }
# NOTE: this mirrors the logic used in the Babel pipeline.
# key: task type (as stored in codebooks.codebooks_full)
# value: tuple of (name of the label_names variable, name of the num_dict variable)
# TODO: replace all of these module-level variables with a single dict-like object
# Maps each task key to the pair of module-level variable names under which its
# label-name dict and its num dict are published. Most names are simply the
# task key plus a suffix; the exceptions are commented inline.
task_names = {
    "CAP": ("CAP_LABEL_NAMES", "CAP_NUM_DICT"),
    # variable names abbreviate MINOR to MIN
    "CAP_MINOR": ("CAP_MIN_LABEL_NAMES", "CAP_MIN_NUM_DICT"),
    "CAP_MEDIA": ("CAP_MEDIA_LABEL_NAMES", "CAP_MEDIA_NUM_DICT"),
    "CAP_MEDIA2": ("CAP_MEDIA2_LABEL_NAMES", "CAP_MEDIA2_NUM_DICT"),
    # variable names abbreviate MINOR to MIN
    "CAP_MINOR_MEDIA": ("CAP_MIN_MEDIA_LABEL_NAMES", "CAP_MIN_MEDIA_NUM_DICT"),
    "MANIFESTO": ("MANIFESTO_LABEL_NAMES", "MANIFESTO_NUM_DICT"),
    "SENTIMENT": ("SENTIMENT_LABEL_NAMES", "SENTIMENT_NUM_DICT"),
    "EMOTION6": ("EMOTION6_LABEL_NAMES", "EMOTION6_NUM_DICT"),
    # the EMOTION9 task is published under the EMOTION9_V2_* names ...
    "EMOTION9": ("EMOTION9_V2_LABEL_NAMES", "EMOTION9_V2_NUM_DICT"),
    # ... while the EMOTION9_LEGACY task uses the plain EMOTION9_* names
    "EMOTION9_LEGACY": ("EMOTION9_LABEL_NAMES", "EMOTION9_NUM_DICT"),
    "ILLFRAMES_MIGRATION": ("ILLFRAMES_MIGRATION_LABEL_NAMES", "ILLFRAMES_MIGRATION_NUM_DICT"),
    "ILLFRAMES_COVID": ("ILLFRAMES_COVID_LABEL_NAMES", "ILLFRAMES_COVID_NUM_DICT"),
    "ILLFRAMES_WAR": ("ILLFRAMES_WAR_LABEL_NAMES", "ILLFRAMES_WAR_NUM_DICT"),
    "ONTOLISST": ("ONTOLISST_LABEL_NAMES", "ONTOLISST_NUM_DICT"),
}
# Publish each task's codebook as two module-level variables: the code -> name
# dict and the index -> code dict, bound under the names listed in task_names.
for task, var_names in task_names.items():
    label_name_var, num_dict_var = var_names
    globals().update(
        {
            label_name_var: get_label_names(df_codebooks, task),
            num_dict_var: get_num_dict(df_codebooks, task),
        }
    )
# making it prettier: keep only the text after the last "- " of each label name
def _strip_label_prefix(label_names):
    """Return a copy of *label_names* with each name cut to the text after its last "- ".

    Names that do not contain "- " are kept unchanged.
    """
    return {
        code: label_name.rpartition("- ")[2]
        for code, label_name in label_names.items()
    }


CAP_MIN_LABEL_NAMES = _strip_label_prefix(CAP_MIN_LABEL_NAMES)
CAP_MIN_MEDIA_LABEL_NAMES = _strip_label_prefix(CAP_MIN_MEDIA_LABEL_NAMES)