Spaces:
Running
Running
import os | |
import json | |
import pandas as pd | |
from google.cloud import bigquery | |
# Build an authenticated BigQuery client. The service-account key is read as a
# JSON string from the GCP_SERVICE_ACCOUNT_JSON environment variable; if the
# variable is unset, the lookup raises KeyError when this module is executed.
service_account_info = json.loads(os.environ["GCP_SERVICE_ACCOUNT_JSON"])
client = bigquery.Client.from_service_account_info(service_account_info)

# Pull the complete codebook table into a DataFrame once, at module load.
# Everything below derives its lookup dictionaries from `df_codebooks`.
query = "SELECT * FROM `upheld-magpie-314312.codebooks.codebooks_full`"
query_job = client.query(query)
df_codebooks = query_job.result().to_dataframe()
def get_label_names(df: pd.DataFrame, task: str) -> dict:
    """Return a ``code -> name`` mapping for one task of the codebook.

    Args:
        df: Codebook table; must contain the columns ``task``, ``index``,
            ``code`` and ``name``.
        task: Value of the ``task`` column to select.

    Returns:
        A dict keyed by ``code`` with the matching ``name`` as value, inserted
        in ascending ``index`` order. If a code occurs more than once, the row
        with the highest ``index`` wins. An unknown task yields an empty dict.
    """
    task_df = df[df["task"] == task].sort_values(by="index")
    label_names_dict = dict(zip(task_df["code"], task_df["name"]))
    return label_names_dict
def get_num_dict(df: pd.DataFrame, task: str) -> dict:
    """Return an ``index -> code`` mapping for one task of the codebook.

    Args:
        df: Codebook table; must contain the columns ``task``, ``index`` and
            ``code``.
        task: Value of the ``task`` column to select.

    Returns:
        A dict keyed by the row's ``index`` value with the matching ``code`` as
        value, inserted in ascending ``index`` order. An unknown task yields an
        empty dict.
    """
    task_df = df[df["task"] == task].sort_values(by="index")
    num_dict = dict(zip(task_df["index"], task_df["code"]))
    return num_dict
# NOTE: this is the same logic as in the Babel pipeline.
# key:   task type (as in codebooks.codebooks_full)
# value: tuple of (label-names variable name, num-dict variable name)
# The variable names do not always mirror the task key: CAP_MINOR* maps to
# CAP_MIN*, EMOTION9 maps to EMOTION9_V2_*, and EMOTION9_LEGACY maps to the
# plain EMOTION9_* names.
# TODO: we could replace all of these variables with one dict-like object.
task_names = {
    "CAP": ("CAP_LABEL_NAMES", "CAP_NUM_DICT"),
    "CAP_MINOR": ("CAP_MIN_LABEL_NAMES", "CAP_MIN_NUM_DICT"),
    "CAP_MEDIA": ("CAP_MEDIA_LABEL_NAMES", "CAP_MEDIA_NUM_DICT"),
    "CAP_MEDIA2": ("CAP_MEDIA2_LABEL_NAMES", "CAP_MEDIA2_NUM_DICT"),
    "CAP_MINOR_MEDIA": ("CAP_MIN_MEDIA_LABEL_NAMES", "CAP_MIN_MEDIA_NUM_DICT"),
    "MANIFESTO": ("MANIFESTO_LABEL_NAMES", "MANIFESTO_NUM_DICT"),
    "SENTIMENT": ("SENTIMENT_LABEL_NAMES", "SENTIMENT_NUM_DICT"),
    "EMOTION6": ("EMOTION6_LABEL_NAMES", "EMOTION6_NUM_DICT"),
    "EMOTION9": ("EMOTION9_V2_LABEL_NAMES", "EMOTION9_V2_NUM_DICT"),
    "EMOTION9_LEGACY": ("EMOTION9_LABEL_NAMES", "EMOTION9_NUM_DICT"),
    "ILLFRAMES_MIGRATION": (
        "ILLFRAMES_MIGRATION_LABEL_NAMES",
        "ILLFRAMES_MIGRATION_NUM_DICT",
    ),
    "ILLFRAMES_COVID": ("ILLFRAMES_COVID_LABEL_NAMES", "ILLFRAMES_COVID_NUM_DICT"),
    "ILLFRAMES_WAR": ("ILLFRAMES_WAR_LABEL_NAMES", "ILLFRAMES_WAR_NUM_DICT"),
    "ONTOLISST": ("ONTOLISST_LABEL_NAMES", "ONTOLISST_NUM_DICT"),
}
# Materialise one pair of module-level lookup dictionaries per task, e.g.
# CAP_LABEL_NAMES / CAP_NUM_DICT. The names are created dynamically through
# globals(), so they will not be visible to static analysis tools.
for task, var_names in task_names.items():
    label_name_var, num_dict_var = var_names
    globals()[label_name_var] = get_label_names(df_codebooks, task)
    globals()[num_dict_var] = get_num_dict(df_codebooks, task)
# Make the CAP minor-topic labels prettier: keep only the text after the last
# "- " separator. Names that do not contain the separator are left unchanged.
CAP_MIN_LABEL_NAMES = {
    code: label_name.split("- ")[-1]
    for code, label_name in CAP_MIN_LABEL_NAMES.items()
}
CAP_MIN_MEDIA_LABEL_NAMES = {
    code: label_name.split("- ")[-1]
    for code, label_name in CAP_MIN_MEDIA_LABEL_NAMES.items()
}