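# Setup script for an Argilla instance (e.g. an Argilla Hugging Face Space):
# it waits until the Argilla server answers on port 6900, then seeds it with
# four example datasets covering text classification, summarization,
# programmatic (weak) labeling, and token classification.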
import sys
import time

import pandas as pd
import requests

import argilla as rg
from argilla.labeling.text_classification import Rule, add_rules
from datasets import load_dataset


def load_datasets():
    # this code runs once the Argilla endpoint is available
    print("Argilla is available! Loading datasets")
    api_key = sys.argv[-1]
    rg.init(api_key=api_key, workspace="team")
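    # 1) sentiment classification dataset with model explanations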
    # load dataset from json
    my_dataframe = pd.read_json(
        "https://raw.githubusercontent.com/recognai/datasets/main/sst-sentimentclassification.json"
    )

    # convert pandas dataframe to DatasetForTextClassification
    dataset_rg = rg.DatasetForTextClassification.from_pandas(my_dataframe)

    # Define labeling schema to avoid UI user modification
    settings = rg.TextClassificationSettings(label_schema=["POSITIVE", "NEGATIVE"])
    rg.configure_dataset(name="sst-sentiment-explainability", settings=settings)

    # log the dataset
    rg.log(
        dataset_rg,
        name="sst-sentiment-explainability",
        tags={
            "description": "The sst2 sentiment dataset with predictions from a pretrained pipeline and explanations from Transformers Interpret."
        },
    )
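    # 2) text summarization dataset: load 100 examples of the news-summary
    # dataset from the Hub and read them in as a Text2Text dataset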
    dataset = load_dataset("argilla/news-summary", split="train").select(range(100))
    dataset_rg = rg.read_datasets(dataset, task="Text2Text")

    # log the dataset
    rg.log(
        dataset_rg,
        name="news-text-summarization",
        tags={
            "description": "A text summarization dataset with news pieces and their predicted summaries."
        },
    )
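    # 3) text classification dataset for programmatic (weak) labeling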
    # Read dataset from Hub
    dataset_rg = rg.read_datasets(
        load_dataset("argilla/agnews_weak_labeling", split="train"),
        task="TextClassification",
    )

    # Define labeling schema to avoid UI user modification
    settings = rg.TextClassificationSettings(
        label_schema=["World", "Sports", "Sci/Tech", "Business"]
    )
    rg.configure_dataset(name="news-programmatic-labeling", settings=settings)

    # log the dataset
    rg.log(
        dataset_rg,
        name="news-programmatic-labeling",
        tags={
            "description": "The AG News with programmatic labeling rules (see weak labeling mode in the UI)."
        },
    )
    # define queries and patterns for each category (using ES DSL)
    queries = [
        (["money", "financ*", "dollar*"], "Business"),
        (["war", "gov*", "minister*", "conflict"], "World"),
        (["*ball", "sport*", "game", "play*"], "Sports"),
        (["sci*", "techno*", "computer*", "software", "web"], "Sci/Tech"),
    ]

    # define rules
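    # one Rule per query term, each voting for its category,
    # e.g. Rule(query="money", label="Business")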
    rules = [
        Rule(query=term, label=label)
        for terms, label in queries
        for term in terms
    ]

    # add rules to the dataset
    add_rules(dataset="news-programmatic-labeling", rules=rules)
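    # 4) token classification (NER) dataset for model monitoring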
    # load dataset from the hub
    dataset = load_dataset("argilla/gutenberg_spacy-ner", split="train")

    # read in the dataset, assuming it's a dataset for token classification
    dataset_rg = rg.read_datasets(dataset, task="TokenClassification")

    # Define labeling schema to avoid UI user modification
    labels = [
        "CARDINAL", "DATE", "EVENT", "FAC", "GPE", "LANGUAGE", "LAW", "LOC",
        "MONEY", "NORP", "ORDINAL", "ORG", "PERCENT", "PERSON", "PRODUCT",
        "QUANTITY", "TIME", "WORK_OF_ART",
    ]
    settings = rg.TokenClassificationSettings(label_schema=labels)
    rg.configure_dataset(name="gutenberg_spacy-ner-monitoring", settings=settings)

    # log the dataset
    rg.log(
        dataset_rg,
        name="gutenberg_spacy-ner-monitoring",
        tags={
            "description": "A dataset containing text from books with predictions from two spaCy NER pre-trained models."
        },
    )
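

# poll the Argilla endpoint every 10 seconds until it responds with HTTP 200,
# then load the datasets once and stop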
if __name__ == "__main__":
    while True:
        try:
            response = requests.get("http://0.0.0.0:6900/")
            if response.status_code == 200:
                load_datasets()
                break
            else:
                time.sleep(10)
        except Exception as e:
            print(e)
            time.sleep(10)