File size: 2,224 Bytes
958fa20
27b7dcb
 
59515e7
27b7dcb
419c706
59515e7
419c706
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59515e7
419c706
59515e7
419c706
59515e7
419c706
 
59515e7
419c706
59515e7
 
1f97e75
59515e7
 
 
 
 
 
 
 
 
 
 
27b7dcb
 
 
 
 
 
59515e7
27b7dcb
 
e21a911
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import streamlit as st
import skops.hub_utils as hub_utils
import pandas as pd
import re
from nltk.tokenize import word_tokenize

# Define feature functions
def features(sentence, index):
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'lword': len(sentence[index]),
        'prefix-1': sentence[index][:1],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'prefix-4': sentence[index][:4],
        'prefix-5': sentence[index][:5],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'suffix-4': sentence[index][-4:],
        'suffix-5': sentence[index][-5:],
        'prev_word_4': prvwords_4(sentence, index),
        'prev_word_3': prvwords_3(sentence, index),
        'prev_word_2': prvwords_2(sentence, index),
        'prev_word_1': prvwords_1(sentence, index),
        'next_word_1':  nextwords_1(sentence, index),
        'next_word_2':  nextwords_2(sentence, index),
        'next_word_3':  nextwords_3(sentence, index),
        'next_word_4':  nextwords_4(sentence, index),
        'is_numeric': sentence[index].isdigit(),
    }

# Normalize raw input before tokenization
def prepare_text(text):
    """Pad every punctuation/symbol character with spaces, then collapse
    all whitespace runs to single spaces and strip the ends.

    E.g. ``"hello,world"`` -> ``"hello , world"``.
    """
    # Wrap each non-word, non-space character in spaces so the tokenizer
    # sees it as a separate token.
    spaced = re.sub(r'([^\w\s\d])', r' \1 ', text)
    # split()/join collapses runs of whitespace and drops leading/trailing
    # blanks in one pass (equivalent to sub(r'\s+', ' ', ...).strip()).
    return ' '.join(spaced.split())

# --- Streamlit UI: read text, featurize it, and show the model output ----
text_input = st.text_input("Enter some text:")

# Guard clause: prompt the user until something is entered.
if not text_input:
    st.write("Please enter some text.")
else:
    # Normalize and tokenize the raw input.
    cleaned = prepare_text(text_input)
    tokens = word_tokenize(cleaned)

    # One feature row per token; row order follows token order.
    feature_rows = [features(tokens, position) for position in range(len(tokens))]
    frame = pd.DataFrame(feature_rows)

    # Run the hosted model from the skops Hub and render its output.
    model_id = "Alshargi/arabic-msa-dialects-segmentation"
    model_output = hub_utils.get_model_output(model_id, frame)
    st.write("Model Output:", model_output)