"""
Created on Sun Jul 17 06:45:53 PM EDT 2022

author: Ryan Hildebrandt, github.com/ryancahildebrandt
"""

import pandas as pd
import sklearn as sk
import sklearn.feature_extraction
import streamlit as st
import tensorflow_hub as hub

from sentence_transformers import SentenceTransformer as snt


# NOTE(review): st.cache is the legacy Streamlit caching API; newer releases
# replace it with st.cache_data -- confirm the pinned Streamlit version.
@st.cache(persist=True)
def model_tfidf(in_text, ngram_range):
    r"""
    Embed documents as a dense TF-IDF matrix.

    Fits a scikit-learn ``TfidfVectorizer`` on ``in_text`` and transforms the
    same documents in one step. Case is preserved (``lowercase=False``); every
    other vectorizer option keeps its scikit-learn default, including
    ``token_pattern='(?u)\b\w\w+\b'``.

    Args:
        in_text: Iterable of raw text documents handed to ``fit_transform``.
        ngram_range: ``(min_n, max_n)`` tuple forwarded unchanged to the
            vectorizer's ``ngram_range`` option.

    Returns:
        The document-term matrix converted to a dense array with
        ``toarray()``, one row per input document.
    """
    vectorizer = sk.feature_extraction.text.TfidfVectorizer(
        lowercase=False,
        ngram_range=ngram_range,
    )
    return vectorizer.fit_transform(in_text).toarray()


# NOTE(review): st.cache is the legacy Streamlit caching API; newer releases
# replace it with st.cache_data -- confirm the pinned Streamlit version.
@st.cache(persist=True)
def model_hash(in_text, ngram_range):
    r"""
    Embed documents with the hashing trick as a dense matrix.

    Runs a scikit-learn ``HashingVectorizer`` over ``in_text``. Case is
    preserved (``lowercase=False``); every other option keeps its scikit-learn
    default, including ``token_pattern='(?u)\b\w\w+\b'`` and
    ``n_features=1048576``.

    Args:
        in_text: Iterable of raw text documents handed to ``fit_transform``.
        ngram_range: ``(min_n, max_n)`` tuple forwarded unchanged to the
            vectorizer's ``ngram_range`` option.

    Returns:
        The hashed document-term matrix converted to a dense array with
        ``toarray()``, one row per input document.
    """
    vectorizer = sk.feature_extraction.text.HashingVectorizer(
        lowercase=False,
        ngram_range=ngram_range,
    )
    # NOTE(review): with the default n_features the dense result has
    # 1,048,576 columns per document, so memory grows quickly with the number
    # of documents -- confirm callers only pass small inputs.
    return vectorizer.fit_transform(in_text).toarray()


# NOTE(review): st.cache is the legacy Streamlit caching API; newer releases
# replace it with st.cache_data -- confirm the pinned Streamlit version.
@st.cache(persist=True)
def model_count(in_text, ngram_range):
    r"""
    Embed documents as a dense matrix of raw n-gram counts.

    Fits a scikit-learn ``CountVectorizer`` on ``in_text`` and transforms the
    same documents in one step. Case is preserved (``lowercase=False``); every
    other vectorizer option keeps its scikit-learn default, including
    ``token_pattern='(?u)\b\w\w+\b'``.

    Args:
        in_text: Iterable of raw text documents handed to ``fit_transform``.
        ngram_range: ``(min_n, max_n)`` tuple forwarded unchanged to the
            vectorizer's ``ngram_range`` option.

    Returns:
        The document-term count matrix converted to a dense array with
        ``toarray()``, one row per input document.
    """
    vectorizer = sk.feature_extraction.text.CountVectorizer(
        lowercase=False,
        ngram_range=ngram_range,
    )
    return vectorizer.fit_transform(in_text).toarray()


# NOTE(review): st.cache is the legacy Streamlit caching API; newer releases
# replace it with st.cache_data -- confirm the pinned Streamlit version.
@st.cache(persist=True)
def model_use(in_text):
    """
    Embed text with the Universal Sentence Encoder (v4) from TensorFlow Hub.

    The encoder is loaded with ``hub.load`` inside the function, so it is
    loaded again whenever the function body actually runs.

    Args:
        in_text: Input passed straight to the loaded encoder -- presumably a
            list of strings; confirm against the caller.

    Returns:
        The encoder output converted with ``.numpy()``.
    """
    encoder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    embeddings = encoder(in_text)
    return embeddings.numpy()


# NOTE(review): st.cache is the legacy Streamlit caching API; newer releases
# replace it with st.cache_data -- confirm the pinned Streamlit version.
@st.cache(persist=True)
def model_snt(in_text, model_name):
    """
    Embed text with a sentence-transformers model selected by name.

    A ``SentenceTransformer`` is constructed from ``model_name`` inside the
    function, so it is built again whenever the function body actually runs.

    Args:
        in_text: Input passed straight to ``encode`` -- presumably a list of
            strings; confirm against the caller.
        model_name: Model identifier handed to the ``SentenceTransformer``
            constructor.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``in_text``.
    """
    encoder = snt(model_name)
    return encoder.encode(in_text)


# Sentence-transformers model names. Presumably the valid ``model_name``
# choices for ``model_snt`` -- confirm against the caller.
st_available_models = [
    "all-MiniLM-L12-v1",
    "all-MiniLM-L12-v2",
    "all-MiniLM-L6-v1",
    "all-MiniLM-L6-v2",
    "all-distilroberta-v1",
    "all-mpnet-base-v1",
    "all-mpnet-base-v2",
    "all-roberta-large-v1",
    "average_word_embeddings_glove.6B.300d",
    "average_word_embeddings_komninos",
    "distiluse-base-multilingual-cased-v1",
    "distiluse-base-multilingual-cased-v2",
    "gtr-t5-base",
    "gtr-t5-large",
    "gtr-t5-xl",
    "gtr-t5-xxl",
    "msmarco-bert-base-dot-v5",
    "msmarco-distilbert-base-tas-b",
    "msmarco-distilbert-dot-v5",
    "multi-qa-MiniLM-L6-cos-v1",
    "multi-qa-MiniLM-L6-dot-v1",
    "multi-qa-distilbert-cos-v1",
    "multi-qa-distilbert-dot-v1",
    "multi-qa-mpnet-base-cos-v1",
    "multi-qa-mpnet-base-dot-v1",
    "paraphrase-MiniLM-L12-v2",
    "paraphrase-MiniLM-L3-v2",
    "paraphrase-MiniLM-L6-v2",
    "paraphrase-TinyBERT-L6-v2",
    "paraphrase-albert-small-v2",
    "paraphrase-distilroberta-base-v2",
    "paraphrase-mpnet-base-v2",
    "paraphrase-multilingual-MiniLM-L12-v2",
    "paraphrase-multilingual-mpnet-base-v2",
    "sentence-t5-base",
    "sentence-t5-large",
    "sentence-t5-xl",
    "sentence-t5-xxl",
]