Spaces:

Kamaljp
/

transformers_universe

Runtime error

File size: 8,949 Bytes

import streamlit as st
from transformers import (
    AutoTokenizer,
    XLNetTokenizer
) 
import pathlib
import json

# Page setup: set_page_config is issued before any other Streamlit call.
st.set_page_config(layout='wide')
st.title("Transformers library For NLP Tasks : Structured by Topics")
st.write("lets start with the architectures of models")

# One-line description per architecture family. The keys double as the
# options of the radio button below.
neural_net_models = {
    'encoder': "responsible for understanding the input text.",
    'decoder': "designed to generate new texts answering queries.",
    'encoder-decoder': "understand and generate text & have emergent behaviour",
    'convolution': "used for image recognition and processing.",
}
model_types = list(neural_net_models)

# Let the reader pick a family, then print its description as a sentence.
archs = st.radio("model architectures".capitalize(), model_types)
st.write(f"{archs.capitalize()} are {neural_net_models[archs]}")

# Example model families grouped by AI domain and, within each domain, by the
# architecture type they use. Not every domain has every architecture type.
domains = {
    "computer_vision": {
        "encoder": ['vit', 'swin', 'segformer', 'beit'],
        "decoder": ['imagegpt'],
        "encoder-decoder": ['detr'],
        "convolution": ['convnext'],
    },
    "nlp": {
        # "distilbert" (one "l") matches the spelling used in the tokenizer
        # table further down this script.
        "encoder": ["bert", "roberta", "albert", "distilbert",
                    "deberta", "longformer"],
        "decoder": ["gpt-2", "xlnet", "gpt-j", "opt", "bloom"],
        "encoder-decoder": ["bart", "pegasus", "t5"],
    },
    "audio": {
        "encoder": ["wav2vec2", "hubert"],
        "encoder-decoder": ["speech2text", "whisper"],
    },
    "multimodal": {
        "encoder": ["visualbert", "vilt", "clip", "owl-vit"],
        "encoder-decoder": ["trocr", "donut"],
    },
    "reinforcement": {
        "decoder": ["trajectory transformer", "decision transformer"],
    },
}

st.write("Lets look at the Individual domains")

# The domain names are the radio options; the selected domain's
# architecture -> models mapping is rendered as-is.
domain_list = list(domains)

doms = st.radio("domains of ai".capitalize(), domain_list)

st.write(domains[doms])

st.write("Now comes the Tokenizers, the Entry Points")

# Tokenization algorithms covered by the app. For each algorithm:
#   "base"  -> model families listed as using it
#   "intro" -> link to the paper that introduced it
tokenizer_algos = {
    "byte_pair": {
        "base": ['gpt', 'gpt-2(byte_level)'],
        "intro": "https://arxiv.org/abs/1508.07909",
    },
    "wordpiece": {
        "base": ['bert', 'distilbert', 'electra'],
        "intro": "https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf",
    },
    "unigram": {
        "base": ['not_used'],
        "intro": "https://arxiv.org/pdf/1804.10959.pdf",
    },
    "sentencepiece": {
        "base": ["xlm", "albert", "xlnet", "marian", "t5"],
        "intro": "https://arxiv.org/pdf/1808.06226.pdf",
    },
}
tokenizer_items = list(tokenizer_algos)

# Show the details of whichever algorithm the reader selects.
algos = st.radio("tokenizer algos".capitalize(), tokenizer_items)
st.write(tokenizer_algos[algos])

st.write("""We will work on 3 types of tokenizers on a single sentence 
         to see how their output differs, by first encoding and decoding them too.""")

# List the exact checkpoint ids that are loaded below, so the page agrees
# with the code (the third checkpoint is XLNet, not XLM).
st.markdown("""### Models in Review:
    - gpt2
    - bert-base-uncased
    - xlnet-base-cased""")

input_sentence = "This is a sample sentence for testing tokenizers"

# Checkpoint ids, one per tokenization algorithm demonstrated below.
gpt2_model = "gpt2"
bert_model = "bert-base-uncased"
# NOTE: the xlm_* variable names are kept as they are because later parts of
# the script use them; the checkpoint behind them is XLNet.
xlm_model = "xlnet-base-cased"

# NOTE(review): these from_pretrained calls sit at module level, so they run
# each time the script executes; consider caching if that proves slow.
gpt2_tokenizer = AutoTokenizer.from_pretrained(gpt2_model)
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model)
xlm_tokenizer = XLNetTokenizer.from_pretrained(xlm_model)

st.markdown("#### The input sentence is")
st.write("The Sample Sentence: ", input_sentence)

# Split the same sentence into tokens with each tokenizer for comparison.
gpt2_tokenize = gpt2_tokenizer.tokenize(input_sentence)
bert_tokenize = bert_tokenizer.tokenize(input_sentence)
xlm_tokenize = xlm_tokenizer.tokenize(input_sentence)

# One collapsed expander per tokenizer: first the code that produced the
# tokens, then the tokens themselves.
with st.expander(label="Byte Pair Tokenizer", expanded=False):
    st.write("gpt2_tokenize = gpt2_tokenizer.tokenize(input_sentence)")
    st.write(gpt2_tokenize)
with st.expander(label="Word Piece Tokenizer", expanded=False):
    st.write("bert_tokenize = bert_tokenizer.tokenize(input_sentence)")
    st.write(bert_tokenize)
with st.expander(label="SentencePiece Tokenizer", expanded=False):
    st.write("xlm_tokenize = xlm_tokenizer.tokenize(input_sentence)")
    st.write(xlm_tokenize)

# Summary of the tokenizer call parameters exercised by the trials below.
st.markdown("""#### Tokenizer Options:
            There are following parameters in Tokenizer object are most used
    - padding =  'longest'(True), 'max_length', 'do_not_pad'(False)
    - truncation =  'longest_first'(True), 'only_second', 'only_first',
             'do_not_truncate'(False)
    - max_length = <= model_max_length """)
# Refer to https://huggingface.co/docs/transformers/pad_truncation

# Maximum input length reported by each tokenizer. The third value is a
# fixed placeholder string rather than a value read from the tokenizer.
gpt2_max_length = gpt2_tokenizer.model_max_length
bert_max_length = bert_tokenizer.model_max_length
xlm_max_length = "Not Speced"

st.markdown("""We also need the model max length, which is the 
         what the model is configured with.""")
st.write("GPT: ", gpt2_max_length)
st.write("Bert: ", bert_max_length)
# The checkpoint behind the xlm_* names is xlnet-base-cased, so label it XLNet.
st.write("XLNet: ", xlm_max_length)

# Two sentences of different length, used by every trial below.
sent1 = "This app is talking about the variety of Tokenizers and their outputs"
sent2 = """Tokenizers do one thing, bring out numbers from text. The better numbers far better
        the results"""

st.write("We will be working with the following sentences.")
st.write("Sentence1: ", sent1)
st.write("Sentence2: ", sent2)

st.markdown("#### Tokenization in Action. Using GPT Tokenizer")

# Trial 1: the two sentences are passed as two separate arguments.
st.markdown("""##### Trial-1:
    > No parameter provided
    > Sentences are given with comma seperation""")
gpt2_encode = gpt2_tokenizer(sent1, sent2)
st.write(gpt2_encode)

# Trial 2: the two sentences are passed as one list.
st.markdown("""##### Trial-2:
    > No parameter provided
    > Sentences are made into a List""")
gpt2_encode = gpt2_tokenizer([sent1, sent2])
st.write("gpt2_encode = gpt2_tokenizer([sent1, sent2])")
st.write(gpt2_encode)

# Padding needs a pad token: reuse the end-of-sequence token id for it.
# Alternative left by the author: register a dedicated '[PAD]' token with
# gpt2_tokenizer.add_special_tokens({'pad_token': '[PAD]'}).
gpt2_tokenizer.pad_token_id = gpt2_tokenizer.eos_token_id

# Trial 3: same list input, now with padding enabled.
st.markdown("""##### Trial-3:
    > Need to add pad token to tokenizer, if the model doesn't have.
    > padding = True
    > Sentences are made into a List""")
gpt2_encode = gpt2_tokenizer([sent1, sent2], padding=True)
st.write("gpt2_encode = gpt2_tokenizer([sent1, sent2], padding=True)")
st.write(gpt2_encode)

# Trial 4: pad up to an explicit max_length.
st.markdown("""##### Trial-4:
    > Need to add pad token to tokenizer, if the model doesn't have.
    > padding = max_length (requires max_length = int)
    > Sentences are made into a List""")
# padding='max_length' is the strategy the heading describes. padding=True
# is the 'longest' strategy (see the options list earlier in this script),
# under which max_length would play no part in padding.
# NOTE(review): truncation is not requested here, so an input longer than
# 15 tokens is presumably left at its full length - confirm against the
# Hugging Face padding/truncation guide.
gpt2_encode = gpt2_tokenizer([sent1, sent2],
                             padding='max_length',
                             max_length=15)
# The echoed snippet mirrors the call above, including the closing parenthesis.
st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
                             padding='max_length',
                             max_length=15)""")

st.write(gpt2_encode)

# Trial 5: truncation with the sentences passed as two separate arguments.
st.markdown("""##### Trial-5:
    > truncate = True (requires max_length = int)
    > Sentences are seperated by a comma
    Will see total output of 12 token, 6 per sentence""")
gpt2_encode = gpt2_tokenizer(sent1, sent2, truncation=True, max_length=12)
# Echo the call, then show what it returned.
st.write("""gpt2_encode = gpt2_tokenizer(sent1, sent2,
                             truncation=True,
                              max_length=12)""")
st.write(gpt2_encode)

# Trial 6: same truncation settings, sentences passed as one list.
st.markdown("""##### Trial-6:
    > truncate = True (requires max_length = int)
    > Sentences are made into a list 
    Will have longest first""")
gpt2_encode = gpt2_tokenizer([sent1, sent2], truncation=True, max_length=12)
# Echo the call, then show what it returned.
st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
                             truncation=True,
                              max_length=12)""")
st.write(gpt2_encode)

# Trial 7: 'only_first' truncation strategy on a list of sentences.
st.markdown("""##### Trial-7:
    > truncate = only_first 
    > Sentences are made into a list 
    Will have only 8 tokens """)
gpt2_encode = gpt2_tokenizer([sent1, sent2], truncation='only_first', max_length=8)
# Echo the call, then show what it returned.
st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
                             truncation='only_first',
                              max_length=8)""")
st.write(gpt2_encode)

# Trial 8: truncation switched off while max_length is still supplied.
st.markdown("""##### Trial-8:
    > truncate = False (only_second, is erroring out) 
    > Sentences are made into a list 
    No Truncation, 2 ids list""")
gpt2_encode = gpt2_tokenizer([sent1, sent2], truncation=False, max_length=7)
# Echo the call, then show what it returned.
st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2],
                             truncation=False,
                             max_length=7)""")
st.write(gpt2_encode)

# task_arch.json is expected to sit in the same directory as this script.
curr_dir = pathlib.Path(__file__).parent.resolve()
file_loc = (curr_dir / "task_arch.json").resolve()

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(file_loc, 'r', encoding='utf-8') as arch:
    data = json.load(arch)

# Top-level keys of the JSON are the task names offered to the reader.
tasks = list(data.keys())
st.markdown("#### Lets dive into the model architectures...")

task = st.radio("The NLP tasks", tasks)

# Each task entry is read for the keys 'architectures', 'AutoModelClass',
# 'dataset' and 'model_used'.
task_data = data[task]

num_models = len(task_data['architectures'])

# The slider starts at 4 architectures. When a task lists 4 or fewer there is
# nothing to choose, and a fixed minimum of 4 could exceed max_value, so all
# architectures are shown without a slider in that case.
min_archs = 4
if num_models > min_archs:
    show_archs = st.slider("How many archs to Show",
                           min_value=min_archs, max_value=num_models)
else:
    show_archs = num_models

# Same record as task_data, with the architecture list cut to the chosen size.
pruned_data = {
    "architectures": task_data['architectures'][:show_archs],
    "AutoModelClass": task_data["AutoModelClass"],
    "dataset": task_data["dataset"],
    "model_used": task_data["model_used"]
}

st.write(pruned_data)