File size: 4,302 Bytes
360d3d5
1f61de5
8586ba1
360d3d5
8586ba1
360d3d5
8586ba1
1f61de5
8586ba1
 
360d3d5
8586ba1
 
63e6e27
8586ba1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
484364a
 
 
 
f7c5c6f
 
8586ba1
 
f7c5c6f
8586ba1
f7c5c6f
 
360d3d5
8586ba1
1f61de5
360d3d5
8586ba1
f7c5c6f
 
 
 
 
8586ba1
1f61de5
8586ba1
 
1f61de5
 
8586ba1
1f61de5
 
 
8586ba1
1f61de5
8586ba1
 
1f61de5
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146

import streamlit as st
import joblib
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification
import skops.hub_utils as hub_utils
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK resources including the Arabic stopwords
# (runs at import time; effectively a no-op after the first run since NLTK
# caches the downloaded data locally)
nltk.download('stopwords')
nltk.download('punkt')  # tokenizer models required by word_tokenize
# Module-level stopword set, built once so helpers can reuse it per call.
arabic_stopwords = set(stopwords.words('arabic'))

# Maps the classifier's output index to a Library of Congress Classification
# top-level class (letter + caption). The letters are intentionally
# non-contiguous: I, O, X and Y are unused in LCC, and W is the NLM medicine
# schedule — presumably the model was trained on exactly these 22 classes in
# this order (TODO: confirm against loaded_model_top.classes_).
TOP_labels = {
    0: 'A  GENERAL WORKS',
    1: 'B  PHILOSOPHY. PSYCHOLOGY. RELIGION',
    2: 'C  AUXILIARY SCIENCES OF HISTORY',
    3: 'D  WORLD HISTORY AND HISTORY OF EUROPE, ASIA, AFRICA, AUSTRALIA, NEW ZEALAND, ETC.',
    4: 'E  HISTORY OF THE AMERICAS CONTENANT',
    5: 'F  HISTORY OF THE AMERICAS LOCAL',
    6: 'G  GEOGRAPHY. ANTHROPOLOGY. RECREATION',
    7: 'H  SOCIAL SCIENCES',
    8: 'J  POLITICAL SCIENCE',
    9: 'K  LAW',
    10: 'L  EDUCATION',
    11: 'M  MUSIC',
    12: 'N  FINE ARTS',
    13: 'P  LANGUAGE AND LITERATURE',
    14: 'Q  SCIENCE',
    15: 'R  MEDICINE',
    16: 'S  AGRICULTURE',
    17: 'T  TECHNOLOGY',
    18: 'U  MILITARY SCIENCE',
    19: 'V  NAVAL SCIENCE',
    20: 'W  MEDICINE AND RELATED SUBJECTS',
    21: 'Z  BIBLIOGRAPHY. LIBRARY SCIENCE. INFORMATION RESOURCES'
}


# Load models
# NOTE: these joblib.load calls run at import time and will raise
# FileNotFoundError if the 'models/' directory is not present relative to the
# working directory the script is launched from.
# Load CountVectorizer (text -> token counts; must match training vocabulary)
loaded_count_vect_top = joblib.load('models/top_count_vectorizer_apr17.pkl')
print("_top count_vectorizer model loaded")

# Load TfidfTransformer (token counts -> tf-idf weights fitted at training time)
loaded_tf_transformer_top = joblib.load('models/top_tfidf_transformer_apr17.pkl')
print("_top tfidf_transformer model loaded")

# Load the saved model (the trained top-level classifier)
loaded_model_top = joblib.load('models/top_trained_model_apr17.pkl')
print("_top trained_model model loaded")


# Translation table that deletes every Arabic diacritic (tashkeel) character.
# Built once at import time; str.translate then strips all of them in a single
# C-level pass instead of one .replace() scan per diacritic.
_TASHKEEL_TABLE = str.maketrans('', '', "ู‘ูŽู‹ููŒููู’")


def remove_tashkeel(text):
    """Return *text* with Arabic diacritics (tashkeel) removed.

    Parameters
    ----------
    text : str
        Arbitrary (possibly empty) string.

    Returns
    -------
    str
        The input with shadda, fatha/fathatan, damma/dammatan,
        kasra/kasratan and sukun characters deleted.
    """
    return text.translate(_TASHKEEL_TABLE)


def remove_arabic_stopwords(text):
    """Return *text* with Arabic stopwords removed.

    Reuses the module-level ``arabic_stopwords`` set built once at import
    time, instead of rebuilding the set from the NLTK corpus on every call
    as the previous implementation did.

    Parameters
    ----------
    text : str
        Whitespace-separated Arabic text.

    Returns
    -------
    str
        The input words, minus stopwords, re-joined with single spaces.
    """
    filtered_words = [word for word in text.split() if word not in arabic_stopwords]
    return ' '.join(filtered_words)


def check_TOP(to_predict):
    """Classify *to_predict* into a top-level LCC category.

    Parameters
    ----------
    to_predict : str
        Arabic text (the caller is expected to have removed stopwords).

    Returns
    -------
    tuple[str, list[str]]
        The winning label formatted as ``"<name>  N#: <number>"`` and the
        four most probable labels, each prefixed with its probability in
        percent.
    """
    # Vectorize with the same CountVectorizer / TfidfTransformer pair that
    # was fitted at training time.
    p_count = loaded_count_vect_top.transform([remove_tashkeel(to_predict)])
    p_tfidf = loaded_tf_transformer_top.transform(p_count)

    # Predict the top-level category.
    top_number = loaded_model_top.predict(p_tfidf)[0]

    # Get category details for the winner.
    top_name = TOP_labels[top_number]
    themaxresX = f"{top_name}  N#: {top_number}"

    # Per-class probabilities in percent. Column i of predict_proba
    # corresponds to loaded_model_top.classes_[i], which is NOT guaranteed to
    # equal i — map through classes_ rather than indexing TOP_labels by the
    # raw column position (the previous code assumed classes_ == range(22)).
    probabilities = loaded_model_top.predict_proba(p_tfidf)[0] * 100
    class_labels = loaded_model_top.classes_

    # Top four predictions, sorted by descending probability.
    sorted_indices = np.argsort(probabilities)[::-1]
    top_predictions = [
        '% {}  {}'.format(round(probabilities[i], 4), TOP_labels[class_labels[i]])
        for i in sorted_indices[:4]
    ]

    return themaxresX, top_predictions


def get_final_result(text):
    """Print the top-level classification of *text* and its top predictions.

    Side effects only: results go to stdout, nothing is returned.

    Parameters
    ----------
    text : str
        Raw Arabic input text; stopwords are stripped before classification.
    """
    top_result, top_predictions = check_TOP(remove_arabic_stopwords(text))
    print("Text: ", text)
    print("Top:", top_result)

    # Drill into subcategories only for class A, and only when the
    # subcategory classifier actually exists in this module. The previous
    # code called check_subCategory_A unconditionally, but that function is
    # not defined in this file, so every class-A hit raised NameError.
    if top_result.split("  ")[0] == "A":
        sub_classifier = globals().get("check_subCategory_A")
        if sub_classifier is not None:
            sub_result, sub_top_predictions = sub_classifier(remove_arabic_stopwords(text))
            print("Sub:", sub_result)

    print()
    print("------------")
    print("Top Predictions:")
    for prediction in top_predictions:
        print(prediction)
    print()


    

def process_text(text_input):
    """Run the hosted segmentation model on *text_input* and return its raw output.

    Returns the model output from hub_utils, or the string
    "Please enter some text." when the input is empty/falsy.
    """
    if text_input:
        # Extract features
        # NOTE(review): feature extraction is a stub — features_list is always
        # empty here, so `data` is an empty DataFrame. Presumably a features()
        # helper exists elsewhere and should populate this; TODO confirm.
        features_list = []  # Assuming features function is defined elsewhere
        data = pd.DataFrame(features_list)

        # Load the model from the Hub
        # NOTE(review): the model is downloaded/instantiated on every call;
        # consider caching it (e.g. st.cache_resource) to avoid per-request reloads.
        model_id = "Alshargi/arabic-msa-dialects-segmentation"
        model = AutoModelForSequenceClassification.from_pretrained(model_id)

        # Get model output using hub_utils
        res = hub_utils.get_model_output(model, data)

        # Return the model output
        return res
    else:
        return "Please enter some text."


def main():
    """Streamlit entry point: read text, run the segmentation model, show the result."""
    st.title("Arabic Segmentation Model Output with Streamlit")

    # Text input
    input_text = st.text_input("Enter your text:")

    # Process the text when the button is clicked.
    if st.button("Process"):
        # Bug fix: the original called an undefined prepare_text() (NameError
        # the moment the button was pressed) and discarded process_text()'s
        # result — display the computed output instead.
        output = process_text(input_text)
        st.write("Model Output:")
        st.write(output)


if __name__ == "__main__":
    main()