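"""Gradio demo for CAP (Comparative Agendas Project) topic classification.

The app picks a poltextlab XLM-RoBERTa model based on the selected language
and domain, runs a TorchScript copy of it on CPU, and returns the CAP
major-topic labels ranked by probability.
"""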
import gradio as gr
import os
import time
import torch
import numpy as np
from transformers import AutoTokenizer
from huggingface_hub import HfApi

from label_dicts import CAP_NUM_DICT, CAP_LABEL_NAMES
from utils import is_disk_full, release_model
HF_TOKEN = os.environ["hf_read"]  # read-only Hub token (expected in the environment, e.g. a Space secret)
languages = [
    "English",
    "Multilingual"
]

# UI display name -> suffix used in the poltextlab model repo names
domains = {
    "media": "media",
    "social media": "social",
    "parliamentary speech": "parlspeech",
    "legislative documents": "legislative",
    "executive speech": "execspeech",
    "executive order": "execorder",
    "party programs": "party",
    "judiciary": "judiciary",
    "budget": "budget",
    "public opinion": "publicopinion",
    "local government agenda": "localgovernment"
}

def check_huggingface_path(checkpoint_path: str):
    """Return True if the checkpoint exists on the Hugging Face Hub."""
    try:
        hf_api = HfApi(token=HF_TOKEN)
        hf_api.model_info(checkpoint_path, token=HF_TOKEN)
        return True
    except Exception:
        return False

def build_huggingface_path(language: str, domain: str):
    language = language.lower()
    base_path = "xlm-roberta-large"
    lang_path = f"poltextlab/{base_path}-{language}-cap-v3"
    # Custom mapping: a few English domains have dedicated models;
    # everything else falls back to the pooled multilingual model.
    if language in ["english"]:
        if domain in ["media", "legislative"]:
            return f"poltextlab/{base_path}-{language}-{domain}-cap-v4"
        elif domain in ["social"]:
            return f"poltextlab/{base_path}-{language}-{domain}-cap-v3"
        return lang_path
    else:
        if domain in ["social"]:
            return f"poltextlab/{base_path}-{domain}-cap-v3"
        return "poltextlab/xlm-roberta-large-pooled-cap-v3"

def predict(text, model_id, tokenizer_id):
    device = torch.device("cpu")

    # Load the TorchScript export of the selected model from persistent storage.
    t0 = time.perf_counter()
    jit_model_path = f"/data/jit_models/{model_id.replace('/', '_')}.pt"
    model = torch.jit.load(jit_model_path).to(device)
    model.eval()
    print(f"Model loading: {time.perf_counter() - t0:.3f}s")

    t1 = time.perf_counter()
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
    print(f"Tokenizer loading: {time.perf_counter() - t1:.3f}s")

    t2 = time.perf_counter()
    inputs = tokenizer(
        text,
        max_length=64,
        truncation=True,
        padding=True,
        return_tensors="pt"
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    print(f"Tokenization: {time.perf_counter() - t2:.3f}s")

    t3 = time.perf_counter()
    with torch.no_grad():
        output = model(inputs["input_ids"], inputs["attention_mask"])
        logits = output["logits"]
    print(f"Inference: {time.perf_counter() - t3:.3f}s")

    # Hand the model back to the helper in utils so memory is freed between requests.
    release_model(model, model_id)

    t4 = time.perf_counter()
    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
    output_pred = {
        f"[{CAP_NUM_DICT[i]}] {CAP_LABEL_NAMES[CAP_NUM_DICT[i]]}": probs[i]
        for i in np.argsort(probs)[::-1]
    }
    output_info = (
        '<p style="text-align: center; display: block">Prediction was made '
        f'using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
    )
    print(f"Post-processing: {time.perf_counter() - t4:.3f}s")
    return output_pred, output_info

def predict_cap(text, language, domain):
    domain = domains[domain]
    model_id = build_huggingface_path(language, domain)
    tokenizer_id = "xlm-roberta-large"

    # Clear cached models if persistent storage is running out of space.
    if is_disk_full():
        os.system('rm -rf /data/models*')
        os.system('rm -r ~/.cache/huggingface/hub')

    return predict(text, model_id, tokenizer_id)

demo = gr.Interface(
    title="CAP Babel Demo",
    fn=predict_cap,
    inputs=[
        gr.Textbox(lines=6, label="Input"),
        gr.Dropdown(languages, label="Language", value=languages[-1]),
        gr.Dropdown(list(domains.keys()), label="Domain", value=list(domains.keys())[0])
    ],
    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])
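
# Hugging Face Spaces launches `demo` automatically; the explicit launch() is
# only needed when the script is run locally.
if __name__ == "__main__":
    demo.launch()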