Update app.py
app.py
CHANGED
@@ -1,195 +1,145 @@
Before:

 import streamlit as st
-import
 import pandas as pd
 from transformers import AutoModelForSequenceClassification
-import
-from nltk.tokenize import word_tokenize
 import nltk

 nltk.download('punkt')
-… (removed helper code not legible in the page extract)
-        return ''
-… (removed helper code not legible in the page extract)
-        'prev_word_4': prvwords_4(sentence, index),
-        'prev_word_3': prvwords_3(sentence, index),
-        'prev_word_2': prvwords_2(sentence, index),
-        'prev_word_1': prvwords_1(sentence, index),
-        'next_word_1': nextwords_1(sentence, index),
-        'next_word_2': nextwords_2(sentence, index),
-        'next_word_3': nextwords_3(sentence, index),
-        'next_word_4': nextwords_4(sentence, index),
-        'is_numeric': sentence[index].isdigit(),
-    }
-
-
-def prepare_text(text):
-    # Match symbols and punctuation from any language
-    symbol_pattern = r'([^\w\s\d])'  # capture non-word, non-space, non-digit characters
-    prepared_text = re.sub(symbol_pattern, r' \1 ', text)
-    prepared_text = re.sub(r'\s+', ' ', prepared_text)
-
-    return prepared_text.strip()  # remove leading and trailing spaces
-
-
-def rebuildxx(ww, xres):
-    numprfx = xres.count('p')
-    numsufx = xres.count('f')
-    resfinal = ''
-    if numprfx != 0 and numsufx != 0:
-        resfinal = "{}+{}+{}".format(ww[:numprfx], ww[numprfx:-numsufx], ww[-numsufx:])
-    if numprfx == 0 and numsufx == 0:
-        resfinal = "{}".format(ww)
-    if numprfx == 0 and numsufx != 0:
-        resfinal = "{}+{}".format(ww[:-numsufx], ww[-numsufx:])
-    if numprfx != 0 and numsufx == 0:
-        resfinal = "{}+{}".format(ww[:numprfx], ww[numprfx:])
-    return resfinal

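Note: the removed rebuildxx helper reads its second argument as per-character tags, counts the 'p' (prefix) and 'f' (suffix) characters, and rejoins the word with '+' separators. A minimal sketch of its behavior, using made-up inputs:

    # Tag string "ppsssssff" has two 'p' and two 'f' tags,
    # so the word splits as prefix + stem + suffix.
    assert rebuildxx("unhappily", "ppsssssff") == "un+happi+ly"
    # With no 'p' or 'f' tags, the word comes back unchanged.
    assert rebuildxx("word", "ssss") == "word"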
-# Define the function for processing user input
 def process_text(text_input):
     if text_input:
-        # Tokenize text
-        tokenized_text = word_tokenize(prepared_text)  # assuming word_tokenize is imported
-
-        # Extract features for each token
-        features_list = [features(tokenized_text, i) for i in range(len(tokenized_text))]  # assuming features is defined elsewhere
-
-        # Create a DataFrame with the features
         data = pd.DataFrame(features_list)

         # Load the model from the Hub
         model_id = "Alshargi/arabic-msa-dialects-segmentation"
         model = AutoModelForSequenceClassification.from_pretrained(model_id)

         # Get model output using hub_utils
         res = hub_utils.get_model_output(model, data)

         # Return the model output
         return res
     else:
         return "Please enter some text."

 def main():
-    st.title("Arabic
-
     # Text input
     input_text = st.text_input("Enter your text:")
-
     # Process the text when a button is clicked
     if st.button("Process"):
         output = process_text(input_text)
-
-        cc = ""
-        for x, y in zip(gg, output):
-            cc += rebuildxx(x, y) + " "
-
         st.write("Model Output:")
-        st.write(

 if __name__ == "__main__":
     main()
After:

 import streamlit as st
+import joblib
 import pandas as pd
+import numpy as np
 from transformers import AutoModelForSequenceClassification
+import skops.hub_utils as hub_utils
 import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize

+# Download NLTK resources, including the Arabic stopword list
+nltk.download('stopwords')
 nltk.download('punkt')
+arabic_stopwords = set(stopwords.words('arabic'))
+
+TOP_labels = {
+    0: 'A GENERAL WORKS',
+    1: 'B PHILOSOPHY. PSYCHOLOGY. RELIGION',
+    2: 'C AUXILIARY SCIENCES OF HISTORY',
+    3: 'D WORLD HISTORY AND HISTORY OF EUROPE, ASIA, AFRICA, AUSTRALIA, NEW ZEALAND, ETC.',
+    4: 'E HISTORY OF THE AMERICAS CONTINENT',
+    5: 'F HISTORY OF THE AMERICAS LOCAL',
+    6: 'G GEOGRAPHY. ANTHROPOLOGY. RECREATION',
+    7: 'H SOCIAL SCIENCES',
+    8: 'J POLITICAL SCIENCE',
+    9: 'K LAW',
+    10: 'L EDUCATION',
+    11: 'M MUSIC',
+    12: 'N FINE ARTS',
+    13: 'P LANGUAGE AND LITERATURE',
+    14: 'Q SCIENCE',
+    15: 'R MEDICINE',
+    16: 'S AGRICULTURE',
+    17: 'T TECHNOLOGY',
+    18: 'U MILITARY SCIENCE',
+    19: 'V NAVAL SCIENCE',
+    20: 'W MEDICINE AND RELATED SUBJECTS',
+    21: 'Z BIBLIOGRAPHY. LIBRARY SCIENCE. INFORMATION RESOURCES'
+}
+
+
+# Load models
+# Load CountVectorizer
+loaded_count_vect_top = joblib.load('models/top_count_vectorizer_apr17.pkl')
+print("_top count_vectorizer model loaded")
+
+# Load TfidfTransformer
+loaded_tf_transformer_top = joblib.load('models/top_tfidf_transformer_apr17.pkl')
+print("_top tfidf_transformer model loaded")
+
+# Load the saved classifier
+loaded_model_top = joblib.load('models/top_trained_model_apr17.pkl')
+print("_top trained_model model loaded")
+
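For context, a sketch of how these three pickles could have been produced. Only the file names come from this commit; the classifier choice (LogisticRegression) and the toy training data are assumptions:

    import joblib
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.linear_model import LogisticRegression

    texts = ["sample document one", "sample document two"]  # placeholder corpus
    labels = [0, 13]  # integer labels keyed to TOP_labels

    count_vect = CountVectorizer()
    X_counts = count_vect.fit_transform(texts)
    tf_transformer = TfidfTransformer()
    X_tfidf = tf_transformer.fit_transform(X_counts)
    clf = LogisticRegression(max_iter=1000).fit(X_tfidf, labels)

    joblib.dump(count_vect, 'models/top_count_vectorizer_apr17.pkl')
    joblib.dump(tf_transformer, 'models/top_tfidf_transformer_apr17.pkl')
    joblib.dump(clf, 'models/top_trained_model_apr17.pkl')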
+def remove_tashkeel(text):
+    # Arabic diacritics (tashkeel): fathatan, dammatan, kasratan,
+    # fatha, damma, kasra, shadda, sukun
+    tashkeel = "ًٌٍَُِّْ"
+    for char in tashkeel:
+        text = text.replace(char, '')
+    return text
+
+
+def remove_arabic_stopwords(text):
+    arabic_stopwords = set(stopwords.words('arabic'))
+    words = text.split()
+    filtered_words = [word for word in words if word not in arabic_stopwords]
+    return ' '.join(filtered_words)
+
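A quick check of remove_tashkeel on a fully vocalized word (the sample input is illustrative, not from the commit):

    print(remove_tashkeel("كَتَبَ"))  # -> "كتب", short-vowel marks stripped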
+def check_TOP(to_predict):
+    # Vectorize the cleaned text, then apply the TF-IDF weighting
+    p_count = loaded_count_vect_top.transform([remove_tashkeel(to_predict)])
+    p_tfidf = loaded_tf_transformer_top.transform(p_count)
+
+    # Predict the top-level category
+    top_number = loaded_model_top.predict(p_tfidf)[0]
+
+    # Get category details
+    top_name = TOP_labels[top_number]
+    themaxresX = f"{top_name} N#: {top_number}"
+
+    # Get predicted probabilities for each category
+    probabilities = loaded_model_top.predict_proba(p_tfidf)[0] * 100
+
+    # Sort the probabilities in descending order and keep the four best predictions
+    sorted_indices = np.argsort(probabilities)[::-1]
+    top_predictions = ['% {} {}'.format(round(probabilities[i], 4), TOP_labels[i]) for i in sorted_indices[:4]]
+
+    return themaxresX, top_predictions
+
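Usage sketch, assuming the pickles above loaded successfully (the sample text is illustrative):

    best, top4 = check_TOP(remove_arabic_stopwords("نص عربي للتجربة"))
    print(best)   # e.g. "P LANGUAGE AND LITERATURE N#: 13"
    for line in top4:
        print(line)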
+def get_final_result(text):
+    top_result, top_predictions = check_TOP(remove_arabic_stopwords(text))
+    print("Text: ", text)
+    print("Top:", top_result)
+
+    # Drill into the A subcategory model when the top-level label starts with "A"
+    if top_result.split(" ")[0] == "A":
+        sub_result, sub_top_predictions = check_subCategory_A(remove_arabic_stopwords(text))
+        print("Sub:", sub_result)
+
+    print()
+    print("------------")
+    print("Top Predictions:")
+    for prediction in top_predictions:
+        print(prediction)
+    print()

 def process_text(text_input):
     if text_input:
+        # Extract features
+        features_list = []  # Assuming features function is defined elsewhere
         data = pd.DataFrame(features_list)
+
         # Load the model from the Hub
         model_id = "Alshargi/arabic-msa-dialects-segmentation"
         model = AutoModelForSequenceClassification.from_pretrained(model_id)
+
         # Get model output using hub_utils
         res = hub_utils.get_model_output(model, data)
+
         # Return the model output
         return res
     else:
         return "Please enter some text."

+
 def main():
+    st.title("Arabic Segmentation Model Output with Streamlit")
+
     # Text input
     input_text = st.text_input("Enter your text:")
+
     # Process the text when a button is clicked
     if st.button("Process"):
         output = process_text(input_text)
+        result = prepare_text(input_text)
         st.write("Model Output:")
+        st.write(result)
+

 if __name__ == "__main__":
     main()
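Two references in the new file appear unresolved as committed: main() calls prepare_text, whose definition this commit removes, and get_final_result calls check_subCategory_A, which is not defined in app.py. Note also that process_text builds its DataFrame from an empty features_list, so the segmentation path is effectively stubbed out. Re-adding the deleted helper verbatim would at least resolve the prepare_text NameError:

    import re  # prepare_text depends on the re module

    def prepare_text(text):
        # Pad symbols/punctuation with spaces, then collapse repeated whitespace
        symbol_pattern = r'([^\w\s\d])'
        prepared_text = re.sub(symbol_pattern, r' \1 ', text)
        prepared_text = re.sub(r'\s+', ' ', prepared_text)
        return prepared_text.strip()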