File size: 4,302 Bytes
360d3d5 1f61de5 8586ba1 360d3d5 8586ba1 360d3d5 8586ba1 1f61de5 8586ba1 360d3d5 8586ba1 63e6e27 8586ba1 484364a f7c5c6f 8586ba1 f7c5c6f 8586ba1 f7c5c6f 360d3d5 8586ba1 1f61de5 360d3d5 8586ba1 f7c5c6f 8586ba1 1f61de5 8586ba1 1f61de5 8586ba1 1f61de5 8586ba1 1f61de5 8586ba1 1f61de5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
import streamlit as st
import joblib
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification
import skops.hub_utils as hub_utils
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download NLTK resources including the Arabic stopwords.
# NOTE(review): these downloads run at import time on every app start;
# they are no-ops once cached locally by NLTK.
nltk.download('stopwords')  # stopword corpora (includes Arabic)
nltk.download('punkt')      # tokenizer models used by word_tokenize
# Module-level Arabic stopword set, built once at import time.
arabic_stopwords = set(stopwords.words('arabic'))
# Library of Congress Classification (LCC) top-level classes, keyed by the
# integer index the trained classifier emits (see check_TOP).
TOP_labels = {
    0: 'A GENERAL WORKS',
    1: 'B PHILOSOPHY. PSYCHOLOGY. RELIGION',
    2: 'C AUXILIARY SCIENCES OF HISTORY',
    3: 'D WORLD HISTORY AND HISTORY OF EUROPE, ASIA, AFRICA, AUSTRALIA, NEW ZEALAND, ETC.',
    4: 'E HISTORY OF THE AMERICAS CONTENANT',
    5: 'F HISTORY OF THE AMERICAS LOCAL',
    6: 'G GEOGRAPHY. ANTHROPOLOGY. RECREATION',
    7: 'H SOCIAL SCIENCES',
    8: 'J POLITICAL SCIENCE',
    9: 'K LAW',
    10: 'L EDUCATION',
    11: 'M MUSIC',
    12: 'N FINE ARTS',
    13: 'P LANGUAGE AND LITERATURE',
    14: 'Q SCIENCE',
    15: 'R MEDICINE',
    16: 'S AGRICULTURE',
    17: 'T TECHNOLOGY',
    18: 'U MILITARY SCIENCE',
    19: 'V NAVAL SCIENCE',
    20: 'W MEDICINE AND RELATED SUBJECTS',
    21: 'Z BIBLIOGRAPHY. LIBRARY SCIENCE. INFORMATION RESOURCES'
}
# Load the pre-trained top-level classification pipeline from disk at import
# time. The three artifacts form a classic scikit-learn text pipeline:
# CountVectorizer -> TfidfTransformer -> classifier (used in check_TOP).
# NOTE(review): paths are relative to the working directory — these loads
# fail with FileNotFoundError if the app is launched from elsewhere.
# Load CountVectorizer (raw-text -> token-count matrix)
loaded_count_vect_top = joblib.load('models/top_count_vectorizer_apr17.pkl')
print("_top count_vectorizer model loaded")
# Load TfidfTransformer (count matrix -> tf-idf weights)
loaded_tf_transformer_top = joblib.load('models/top_tfidf_transformer_apr17.pkl')
print("_top tfidf_transformer model loaded")
# Load the saved classifier (must expose predict and predict_proba)
loaded_model_top = joblib.load('models/top_trained_model_apr17.pkl')
print("_top trained_model model loaded")
def remove_tashkeel(text):
    """Strip Arabic diacritics (tashkeel) from *text*.

    Removes the eight tashkeel marks U+064B..U+0652 (fathatan, dammatan,
    kasratan, fatha, damma, kasra, shadda, sukun); every other character
    is returned unchanged.

    :param text: input string (may be empty).
    :return: *text* with all tashkeel marks deleted.
    """
    # Explicit code points instead of a literal containing the raw marks:
    # the original literal had been corrupted by an encoding round-trip
    # (mojibake), silently breaking diacritic removal. str.translate does
    # the deletion in one C-level pass instead of eight chained .replace().
    tashkeel = "\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652"
    return text.translate(str.maketrans('', '', tashkeel))
def remove_arabic_stopwords(text):
    """Return *text* with Arabic stopwords removed.

    Tokenization is plain whitespace splitting; matching is exact (a word
    carrying tashkeel will not match its bare stopword form).

    :param text: input string.
    :return: space-joined string of the non-stopword tokens.
    """
    # Reuse the module-level set built once at import time. The original
    # rebuilt set(stopwords.words('arabic')) on every call, re-reading the
    # NLTK corpus per invocation — wasted I/O inside the request path.
    return ' '.join(word for word in text.split() if word not in arabic_stopwords)
def check_TOP(to_predict):
    """Classify *to_predict* into a top-level LCC class.

    Runs the loaded CountVectorizer -> TfidfTransformer -> classifier
    pipeline on the tashkeel-stripped input.

    :param to_predict: Arabic text to classify.
    :return: tuple ``(best, top_predictions)`` where ``best`` is the
        winning label formatted as "<label> N#: <index>" and
        ``top_predictions`` is a list of the four highest-probability
        classes formatted as "% <prob> <label>".
    """
    cleaned = remove_tashkeel(to_predict)
    counts = loaded_count_vect_top.transform([cleaned])
    weights = loaded_tf_transformer_top.transform(counts)

    # Winning class index and its human-readable label.
    predicted_idx = loaded_model_top.predict(weights)[0]
    best = "{} N#: {}".format(TOP_labels[predicted_idx], predicted_idx)

    # Per-class probabilities as percentages, ranked high-to-low.
    percents = loaded_model_top.predict_proba(weights)[0] * 100
    ranking = np.argsort(percents)[::-1]
    top_predictions = []
    for idx in ranking[:4]:
        top_predictions.append('% {} {}'.format(round(percents[idx], 4), TOP_labels[idx]))
    return best, top_predictions
def get_final_result(text):
    """Print the top-level classification and the four most probable
    classes for *text*.

    Side effects only (writes to stdout); returns None.

    :param text: Arabic text to classify.
    """
    top_result, top_predictions = check_TOP(remove_arabic_stopwords(text))
    print("Text: ", text)
    print("Top:", top_result)
    # top_result looks like "A GENERAL WORKS N#: 0", so the first
    # space-separated token is the single-letter LCC class code.
    if top_result.split(" ")[0] == "A":
        # NOTE(review): check_subCategory_A is not defined anywhere in this
        # file — this branch raises NameError whenever the winning class is
        # "A". Either the helper was dropped from the file or this call
        # should be removed; confirm against the project's other modules.
        sub_result, sub_top_predictions = check_subCategory_A(remove_arabic_stopwords(text))
        print("Sub:", sub_result)
        print()
    print("------------")
    print("Top Predictions:")
    for prediction in top_predictions:
        print(prediction)
    print()
def process_text(text_input):
    """Run the segmentation model over *text_input* and return its output.

    :param text_input: user-supplied text; any falsy value short-circuits.
    :return: the hub_utils model output, or the prompt string
        "Please enter some text." when *text_input* is empty/falsy.
    """
    # Guard clause: nothing to process.
    if not text_input:
        return "Please enter some text."
    # NOTE(review): feature extraction is a placeholder — the DataFrame is
    # always empty, so the model receives no features from the input text.
    features_list = []
    data = pd.DataFrame(features_list)
    # Fetch the segmentation model from the Hugging Face Hub.
    model_id = "Alshargi/arabic-msa-dialects-segmentation"
    model = AutoModelForSequenceClassification.from_pretrained(model_id)
    # Delegate inference to skops' hub utilities and return its result.
    return hub_utils.get_model_output(model, data)
def main():
    """Streamlit entry point: read text, run the model, display the output."""
    st.title("Arabic Segmentation Model Output with Streamlit")
    # Text input
    input_text = st.text_input("Enter your text:")
    # Process the text when the button is clicked.
    if st.button("Process"):
        output = process_text(input_text)
        st.write("Model Output:")
        # Bug fix: the original computed `output` and then displayed the
        # result of the undefined name `prepare_text` (NameError as soon
        # as the button was pressed). Show the computed output instead.
        st.write(output)


if __name__ == "__main__":
    main()
|