Alshargi committed on
Commit 8586ba1 · verified · 1 Parent(s): 141635c

Update app.py

Files changed (1):
  app.py +111 -161
app.py CHANGED
@@ -1,195 +1,145 @@
 
  import streamlit as st
- import skops.hub_utils as hub_utils
  import pandas as pd
  from transformers import AutoModelForSequenceClassification
- import re
- from nltk.tokenize import word_tokenize
  import nltk
 
-
  nltk.download('punkt')
-
-
-
- def nextwords_1(ww, inx):
-     try:
-         return '' if inx == len(ww) - 1 else ww[inx + 1]
-     except:
-         pass
-
-     return ''
-
- def nextwords_2(ww, inx):
-     try:
-         return '' if inx == len(ww) - 2 else ww[inx + 2]
-     except:
-         pass
-
-     return ''
-
-
-
- def nextwords_3(ww, inx):
-     try:
-         return '' if inx == len(ww) - 3 else ww[inx + 3]
-     except:
-         pass
-
-     return ''
-
- def nextwords_4(ww, inx):
-     try:
-         return '' if inx == len(ww) - 4 else ww[inx + 4]
-     except:
-         pass
-
-     return ''
-
-
- def prvwords_1(ww, inx):
-     try:
-         return '' if inx == 0 else ww[inx - 1]
-     except:
-         pass
-
-     return ''
-
-
- def prvwords_2(ww, inx):
-     try:
-         return '' if inx == 0 else ww[inx - 2]
-     except:
-         pass
-
-     return ''
-
- def prvwords_3(ww, inx):
-     try:
-         return '' if inx == 0 else ww[inx - 3]
-     except:
-         pass
-
-     return ''
-
-
- def prvwords_4(ww, inx):
-     try:
-         return '' if inx == 0 else ww[inx - 4]
-     except:
-         pass
-
-     return ''
-
-
-
-
- # Define feature functions
- def features(sentence, index):
-     return {
-         'word': sentence[index],
-         'is_first': index == 0,
-         'is_last': index == len(sentence) - 1,
-         'lword': len(sentence[index]),
-         'prefix-1': sentence[index][:1],
-         'prefix-2': sentence[index][:2],
-         'prefix-3': sentence[index][:3],
-         'prefix-4': sentence[index][:4],
-         'prefix-5': sentence[index][:5],
-         'suffix-1': sentence[index][-1],
-         'suffix-2': sentence[index][-2:],
-         'suffix-3': sentence[index][-3:],
-         'suffix-4': sentence[index][-4:],
-         'suffix-5': sentence[index][-5:],
-         'prev_word_4': prvwords_4(sentence, index),
-         'prev_word_3': prvwords_3(sentence, index),
-         'prev_word_2': prvwords_2(sentence, index),
-         'prev_word_1': prvwords_1(sentence, index),
-         'next_word_1': nextwords_1(sentence, index),
-         'next_word_2': nextwords_2(sentence, index),
-         'next_word_3': nextwords_3(sentence, index),
-         'next_word_4': nextwords_4(sentence, index),
-         'is_numeric': sentence[index].isdigit(),
-     }
-
-
- def prepare_text(text):
-     # Define regular expression pattern to match symbols and punctuation from any language
-     symbol_pattern = r'([^\w\s\d])'  # Capture non-word, non-space, non-digit characters
-     prepared_text = re.sub(symbol_pattern, r' \1 ', text)
-     prepared_text = re.sub(r'\s+', ' ', prepared_text)
-
-     return prepared_text.strip()  # Remove leading and trailing spaces
-
-
-
- def rebuildxx(ww, xres):
-     numprfx = xres.count('p')
-     numsufx = xres.count('f')
-     resfinal = ''
-     if numprfx != 0 and numsufx != 0:
-         resfinal = "{}+{}+{}".format(ww[:numprfx], ww[numprfx:-numsufx], ww[-numsufx:])
-     if numprfx == 0 and numsufx == 0:
-         #resfinal = "{}+{}+{}".format("", ww, "")
-         resfinal = "{}".format(ww)
-
-     if numprfx == 0 and numsufx != 0:
-         #resfinal = "{}+{}+{}".format("", ww[:-numsufx], ww[-numsufx:])
-         resfinal = "{}+{}".format(ww[:-numsufx], ww[-numsufx:])
-
-     if numprfx != 0 and numsufx == 0:
-         #resfinal = "{}+{}+{}".format(ww[:numprfx], ww[numprfx:], "")
-         resfinal = "{}+{}".format(ww[:numprfx], ww[numprfx:])
-
-     return resfinal
 
 
 
 
- # Define the function for processing user input
  def process_text(text_input):
      if text_input:
-         # Prepare text (define this function)
-         prepared_text = prepare_text(text_input)  # Assuming prepare_text function is defined elsewhere
-
-         # Tokenize text
-         tokenized_text = word_tokenize(prepared_text)  # Assuming word_tokenize function is imported
-
-         # Extract features (define this function)
-         features_list = [features(tokenized_text, i) for i in range(len(tokenized_text))]  # Assuming features function is defined elsewhere
-
-         # Create a DataFrame with the features
          data = pd.DataFrame(features_list)
-
          # Load the model from the Hub
          model_id = "Alshargi/arabic-msa-dialects-segmentation"
          model = AutoModelForSequenceClassification.from_pretrained(model_id)
-
          # Get model output using hub_utils
          res = hub_utils.get_model_output(model, data)
-
          # Return the model output
          return res
      else:
          return "Please enter some text."
 
 
  def main():
-     st.title("Arabic segmintation Model Output with Streamlit")
-
      # Text input
      input_text = st.text_input("Enter your text:")
-
      # Process the text when a button is clicked
      if st.button("Process"):
          output = process_text(input_text)
-         gg = word_tokenize(prepare_text(input_text))
-         cc = ""
-         for x, y in zip(gg, output):
-             cc += rebuildxx(x, y) + " "
-
-         #print(cc)
          st.write("Model Output:")
-         st.write(cc)
 
  if __name__ == "__main__":
      main()
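
For readers skimming the removed half of the diff: `rebuildxx` reassembles a token from the model's predicted tag by counting 'p' (prefix) and 'f' (suffix) characters and joining the slices with '+'. Below is a minimal runnable sketch of that behaviour; the tag strings in the example are hypothetical, since the diff does not show the real label format, and the simplified function is not part of this commit.

# Sketch of the removed rebuildxx helper (simplified, behaviour-equivalent).
def rebuildxx(ww, xres):
    numprfx = xres.count('p')   # predicted prefix length in characters
    numsufx = xres.count('f')   # predicted suffix length in characters
    if numprfx and numsufx:
        return "{}+{}+{}".format(ww[:numprfx], ww[numprfx:-numsufx], ww[-numsufx:])
    if numsufx:
        return "{}+{}".format(ww[:-numsufx], ww[-numsufx:])
    if numprfx:
        return "{}+{}".format(ww[:numprfx], ww[numprfx:])
    return ww

# Hypothetical tags: 'p' marks a prefix character, 'f' a suffix character.
print(rebuildxx("والكتاب", "ppsssss"))  # -> وا+لكتاب
print(rebuildxx("كتابهم", "ssssff"))    # -> كتاب+هم
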
 
 
  import streamlit as st
+ import joblib
  import pandas as pd
+ import numpy as np
  from transformers import AutoModelForSequenceClassification
+ import skops.hub_utils as hub_utils
  import nltk
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
 
+ # Download NLTK resources, including the Arabic stopwords
+ nltk.download('stopwords')
  nltk.download('punkt')
+ arabic_stopwords = set(stopwords.words('arabic'))
+
+ TOP_labels = {
+     0: 'A GENERAL WORKS',
+     1: 'B PHILOSOPHY. PSYCHOLOGY. RELIGION',
+     2: 'C AUXILIARY SCIENCES OF HISTORY',
+     3: 'D WORLD HISTORY AND HISTORY OF EUROPE, ASIA, AFRICA, AUSTRALIA, NEW ZEALAND, ETC.',
+     4: 'E HISTORY OF THE AMERICAS CONTENANT',
+     5: 'F HISTORY OF THE AMERICAS LOCAL',
+     6: 'G GEOGRAPHY. ANTHROPOLOGY. RECREATION',
+     7: 'H SOCIAL SCIENCES',
+     8: 'J POLITICAL SCIENCE',
+     9: 'K LAW',
+     10: 'L EDUCATION',
+     11: 'M MUSIC',
+     12: 'N FINE ARTS',
+     13: 'P LANGUAGE AND LITERATURE',
+     14: 'Q SCIENCE',
+     15: 'R MEDICINE',
+     16: 'S AGRICULTURE',
+     17: 'T TECHNOLOGY',
+     18: 'U MILITARY SCIENCE',
+     19: 'V NAVAL SCIENCE',
+     20: 'W MEDICINE AND RELATED SUBJECTS',
+     21: 'Z BIBLIOGRAPHY. LIBRARY SCIENCE. INFORMATION RESOURCES'
+ }
+
+
+ # Load models
+ # Load CountVectorizer
+ loaded_count_vect_top = joblib.load('models/top_count_vectorizer_apr17.pkl')
+ print("_top count_vectorizer model loaded")
+
+ # Load TfidfTransformer
+ loaded_tf_transformer_top = joblib.load('models/top_tfidf_transformer_apr17.pkl')
+ print("_top tfidf_transformer model loaded")
+
+ # Load the saved model
+ loaded_model_top = joblib.load('models/top_trained_model_apr17.pkl')
+ print("_top trained_model model loaded")
+
+
+ def remove_tashkeel(text):
+     tashkeel = "ّ َ ً ُ ٌ ِ ٍ ْ".replace(" ", "")
+     for char in tashkeel:
+         text = text.replace(char, '')
+     return text
+
+
+ def remove_arabic_stopwords(text):
+     arabic_stopwords = set(stopwords.words('arabic'))
+     words = text.split()
+     filtered_words = [word for word in words if word not in arabic_stopwords]
+     return ' '.join(filtered_words)
+
+
+ def check_TOP(to_predict):
+     p_count = loaded_count_vect_top.transform([remove_tashkeel(to_predict)])
+     p_tfidf = loaded_tf_transformer_top.transform(p_count)
+
+     # Predict the subcategory
+     top_number = loaded_model_top.predict(p_tfidf)[0]
+
+     # Get subcategory details
+     top_name = TOP_labels[top_number]
+     themaxresX = f"{top_name} N#: {top_number}"
+
+     # Get predicted probabilities for each subcategory
+     probabilities = loaded_model_top.predict_proba(p_tfidf)[0] * 100
+
+     # Sort the probabilities and get top predictions
+     sorted_indices = np.argsort(probabilities)[::-1]  # Sort in descending order
+     top_predictions = ['% {} {}'.format(round(probabilities[i], 4), TOP_labels[i]) for i in sorted_indices[:4]]
+
+     return themaxresX, top_predictions
+
+
+ def get_final_result(text):
+     top_result, top_predictions = check_TOP(remove_arabic_stopwords(text))
+     print("Text: ", text)
+     print("Top:", top_result)
+
+     if top_result.split(" ")[0] == "A":
+         sub_result, sub_top_predictions = check_subCategory_A(remove_arabic_stopwords(text))
+         print("Sub:", sub_result)
+
+     print()
+     print("------------")
+     print("Top Predictions:")
+     for prediction in top_predictions:
+         print(prediction)
+     print()
 
 
 
 
  def process_text(text_input):
      if text_input:
+         # Extract features
+         features_list = []  # Assuming features function is defined elsewhere
          data = pd.DataFrame(features_list)
+
          # Load the model from the Hub
          model_id = "Alshargi/arabic-msa-dialects-segmentation"
          model = AutoModelForSequenceClassification.from_pretrained(model_id)
+
          # Get model output using hub_utils
          res = hub_utils.get_model_output(model, data)
+
          # Return the model output
          return res
      else:
          return "Please enter some text."
 
+
  def main():
+     st.title("Arabic Segmentation Model Output with Streamlit")
+
      # Text input
      input_text = st.text_input("Enter your text:")
+
      # Process the text when a button is clicked
      if st.button("Process"):
          output = process_text(input_text)
+         result = prepare_text(input_text)
          st.write("Model Output:")
+         st.write(result)
+
 
  if __name__ == "__main__":
      main()
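
One thing worth noting about the new version as committed: `main()` still calls `prepare_text`, which this commit removes, and `process_text` now builds its DataFrame from an empty `features_list`, so `hub_utils.get_model_output` receives no rows. Below is a minimal sketch of how the removed tokenize-and-featurize step could be wired back in; `token_features` and `build_feature_frame` are hypothetical stand-ins introduced for illustration and are not part of the commit.

# Hypothetical repair sketch: rebuild the per-token feature rows that the new
# process_text no longer produces.
import re
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize

nltk.download('punkt')

def prepare_text(text):
    # Space out punctuation, then collapse whitespace (mirrors the removed helper).
    text = re.sub(r'([^\w\s\d])', r' \1 ', text)
    return re.sub(r'\s+', ' ', text).strip()

def token_features(tokens, i):
    # Stand-in for the removed features() function, which built many more features.
    return {'word': tokens[i], 'is_first': i == 0, 'is_last': i == len(tokens) - 1}

def build_feature_frame(text_input):
    tokens = word_tokenize(prepare_text(text_input))
    return pd.DataFrame([token_features(tokens, i) for i in range(len(tokens))])

# process_text could then call build_feature_frame(text_input) instead of
# starting from an empty features_list.
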