Spaces:
Sleeping
Sleeping
from transformers import BertTokenizerFast, BertConfig | |
from typing import Dict, List, Union, Tuple | |
def num_unique_labels(
    dataset: Dict[str, Union[List[str], List[List[str]]]]
) -> Tuple[int, int]:
    """
    Count the distinct NER tags and the distinct intent labels in a dataset.

    Args:
        dataset (dict): Mapping with an 'entities' key (one list of NER tags
            per sample) and an 'intent' key (one intent label per sample).
            Other keys, such as 'text', are not read by this function.

    Returns:
        Tuple[int, int]: (number of unique NER tags, number of unique intent
        labels).
    """
    # Flatten the per-sample tag lists straight into a set; an intermediate
    # list is not needed just to deduplicate.
    unique_tags = {tag for tags in dataset['entities'] for tag in tags}
    unique_intents = set(dataset['intent'])
    return len(unique_tags), len(unique_intents)
def ner_labels_to_ids() -> Dict[str, int]:
    """
    Build the lookup table from NER tag names to integer class IDs.

    Returns:
        Dict[str, int]: 'O' maps to 0; the B-/I- tags for DATE, TIME, TASK
        and DUR map to 1 through 8, in that order.
    """
    # A label's position in this tuple is its ID, so the order must not change.
    ordered_labels = (
        'O',
        'B-DATE', 'I-DATE',
        'B-TIME', 'I-TIME',
        'B-TASK', 'I-TASK',
        'B-DUR', 'I-DUR',
    )
    return {label: index for index, label in enumerate(ordered_labels)}
def ner_ids_to_labels(ner_labels_to_ids) -> Dict[int, str]:
    """
    Invert a NER label-to-ID mapping so labels can be looked up by ID.

    Args:
        ner_labels_to_ids (dict): Mapping from NER label to numeric ID.

    Returns:
        Dict[int, str]: Mapping from numeric ID back to its NER label. If
        several labels share an ID, the last one iterated is kept.
    """
    id_to_label = {}
    for label, label_id in ner_labels_to_ids.items():
        id_to_label[label_id] = label
    return id_to_label
def intent_labels_to_ids() -> Dict[str, int]:
    """
    Build the lookup table from intent label strings to integer class IDs.

    Returns:
        Dict[str, int]: Five intent labels mapped to the IDs 0 through 4.
    """
    # NOTE(review): every key contains literal single quotes inside the
    # string. Presumably this mirrors how intents are stored in the dataset;
    # confirm before normalizing them.
    # A label's position in this tuple is its ID, so the order must not change.
    ordered_intents = (
        "'Schedule Appointment'",
        "'Schedule Meeting'",
        "'Set Alarm'",
        "'Set Reminder'",
        "'Set Timer'",
    )
    return {intent: index for index, intent in enumerate(ordered_intents)}
def intent_ids_to_labels(intent_labels_to_ids) -> Dict[int, str]:
    """
    Invert an intent label-to-ID mapping so labels can be looked up by ID.

    Args:
        intent_labels_to_ids (dict): Mapping from intent label to numeric ID.

    Returns:
        Dict[int, str]: Mapping from numeric ID back to its intent label. If
        several labels share an ID, the last one iterated is kept.
    """
    id_to_intent = {}
    for intent, intent_id in intent_labels_to_ids.items():
        id_to_intent[intent_id] = intent
    return id_to_intent
def tokenizer() -> BertTokenizerFast:
    """
    Load the fast BERT tokenizer for the 'bert-base-uncased' checkpoint.

    Returns:
        BertTokenizerFast: The object produced by
        BertTokenizerFast.from_pretrained('bert-base-uncased'). A new call to
        from_pretrained is made on every invocation.
    """
    return BertTokenizerFast.from_pretrained('bert-base-uncased')
def bert_config() -> BertConfig:
    """
    Load the BERT configuration for the 'bert-base-uncased' checkpoint.

    Returns:
        BertConfig: The object produced by
        BertConfig.from_pretrained('bert-base-uncased'). A new call to
        from_pretrained is made on every invocation.
    """
    return BertConfig.from_pretrained('bert-base-uncased')
def structure_data(dataset):
    """
    Convert an iterable of per-sample records into a dict of parallel lists.

    Args:
        dataset: Iterable of mappings, each with 'text', 'entities' and
            'intent' keys. 'entities' must be a string; it is split on
            whitespace into a list of tags.

    Returns:
        dict: {'text': [...], 'entities': [[...], ...], 'intent': [...]},
        with one entry per sample in iteration order.
    """
    texts, tag_sequences, intents = [], [], []
    # Single pass, so one-shot iterables (e.g. generators) are supported.
    for record in dataset:
        texts.append(record['text'])
        tag_sequences.append(record['entities'].split())
        intents.append(record['intent'])
    return {'text': texts, 'entities': tag_sequences, 'intent': intents}