|
import streamlit as st |
|
import skops.hub_utils as hub_utils |
|
import pandas as pd |
|
import re |
|
from nltk.tokenize import word_tokenize |
|
|
|
|
|
def features(sentence, index):
    """Build the per-token feature dict for the segmentation model.

    Args:
        sentence: list of token strings (output of ``word_tokenize``).
        index: position of the token to featurize within ``sentence``.

    Returns:
        dict of word-shape features: the token itself, position flags,
        length, 1-5 char prefixes/suffixes, up to four previous/next
        tokens, and a digit flag.

    FIX: the original referenced helpers ``prvwords_1..4`` /
    ``nextwords_1..4`` that are not defined anywhere in this file, so
    every call raised ``NameError``. They are replaced by an inline
    neighbor lookup that returns '' when the offset falls outside the
    sentence — the conventional padding for context-word features.
    NOTE(review): '' as the out-of-range value is assumed to match the
    hub model's training features — confirm against the model card.
    """
    word = sentence[index]

    def neighbor(offset):
        # Token at index+offset, or '' when that position is out of range.
        j = index + offset
        return sentence[j] if 0 <= j < len(sentence) else ''

    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'lword': len(word),
        # Slicing never raises, even when the token is shorter than the prefix.
        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'prefix-4': word[:4],
        'prefix-5': word[:5],
        # word[-1] assumes a non-empty token; word_tokenize never yields ''.
        'suffix-1': word[-1],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'suffix-4': word[-4:],
        'suffix-5': word[-5:],
        'prev_word_4': neighbor(-4),
        'prev_word_3': neighbor(-3),
        'prev_word_2': neighbor(-2),
        'prev_word_1': neighbor(-1),
        'next_word_1': neighbor(1),
        'next_word_2': neighbor(2),
        'next_word_3': neighbor(3),
        'next_word_4': neighbor(4),
        'is_numeric': word.isdigit(),
    }
|
|
|
|
|
def prepare_text(text):
    """Normalize raw input before tokenization.

    Every non-word, non-space character (punctuation, symbols) is padded
    with a space on each side so the tokenizer treats it as its own
    token; runs of whitespace are then collapsed to single spaces and
    the ends trimmed.

    Args:
        text: raw input string.

    Returns:
        The normalized string.
    """
    spaced_out = re.sub(r'([^\w\s\d])', r' \1 ', text)
    return re.sub(r'\s+', ' ', spaced_out).strip()
|
|
|
|
|
# --- Streamlit UI: read text, featurize each token, query the hub model ---
user_text = st.text_input("Enter some text:")

if user_text:
    # Normalize and split the input into tokens.
    cleaned = prepare_text(user_text)
    tokens = word_tokenize(cleaned)

    # One feature row per token, assembled into the DataFrame the model expects.
    rows = [features(tokens, pos) for pos, _ in enumerate(tokens)]
    frame = pd.DataFrame(rows)

    # Run inference via skops against the hosted segmentation model.
    res = hub_utils.get_model_output(
        "Alshargi/arabic-msa-dialects-segmentation", frame
    )
    st.write("Model Output:", res)
else:
    st.write("Please enter some text.")
|
|