Spaces:

Alshargi
/

sam

Sleeping

App Files Files Community

Alshargi committed on Apr 25, 2024

Commit

419c706

verified ·

1 Parent(s): fa74430

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -10

app.py CHANGED Viewed

@@ -1,20 +1,111 @@
 import streamlit as st
-from transformers import pipeline, AutoConfig
-# Load the model configuration from config.json
-config = AutoConfig.from_pretrained("Alshargi/arabic-msa-dialects-segmentation")
-# Load the model using the configuration
-model = pipeline("text-classification", model="Alshargi/arabic-msa-dialects-segmentation", config=config)
 # Slider to select a value
 x = st.slider('Select a value')
-# Display the squared value
-st.write(x, 'squared is', x * x)
-# Make prediction using the loaded model
-prediction = model(x)
 # Display the prediction
-st.write("Prediction:", prediction)

 import streamlit as st
+import joblib
+from nltk import word_tokenize
+#import string, re
+def features(sentence, index):
+    """Build the feature dict describing the token at ``index``.
+
+    Args:
+        sentence: Sequence of token strings.
+        index: Position in ``sentence`` of the token to describe.
+
+    Returns:
+        A dict with the token itself, first/last position flags, the token
+        length, its leading and trailing 1-5 characters, the values returned
+        by the ``prvwords_N`` / ``nextwords_N`` helpers for N = 1..4, and
+        whether the token consists only of digits.
+
+    Raises:
+        IndexError: If ``index`` is outside ``sentence``.
+    """
+    # NOTE(review): prvwords_1..4 and nextwords_1..4 are defined outside this
+    # block; presumably they return the neighbouring token at that distance --
+    # confirm against their definitions.
+    word = sentence[index]
+    return {
+        'word': word,
+        'is_first': index == 0,
+        'is_last': index == len(sentence) - 1,
+        'lword': len(word),
+        'prefix-1': word[:1],
+        'prefix-2': word[:2],
+        'prefix-3': word[:3],
+        'prefix-4': word[:4],
+        'prefix-5': word[:5],
+        # Sliced like the other affixes, so an empty token gives '' rather
+        # than raising IndexError
+        'suffix-1': word[-1:],
+        'suffix-2': word[-2:],
+        'suffix-3': word[-3:],
+        'suffix-4': word[-4:],
+        'suffix-5': word[-5:],
+        'prev_word_4': prvwords_4(sentence, index),
+        'prev_word_3': prvwords_3(sentence, index),
+        'prev_word_2': prvwords_2(sentence, index),
+        'prev_word_1': prvwords_1(sentence, index),
+        'next_word_1': nextwords_1(sentence, index),
+        'next_word_2': nextwords_2(sentence, index),
+        'next_word_3': nextwords_3(sentence, index),
+        'next_word_4': nextwords_4(sentence, index),
+        'is_numeric': word.isdigit(),
+    }
+def rebuildxx(ww, xres):
+    """Split ``ww`` into prefix, stem and suffix joined by ``+``.
+
+    The number of ``'p'`` characters in ``xres`` is used as the prefix length
+    and the number of ``'f'`` characters as the suffix length; every other
+    character of ``xres`` is ignored.
+
+    Args:
+        ww: The word to segment.
+        xres: Tag string for the word (any object with a ``count`` method).
+
+    Returns:
+        The non-empty segments of ``ww`` joined with ``+``, or ``ww`` itself
+        when no split applies.
+    """
+    # Clamp both lengths to the word so an over-long tag string can never
+    # make the prefix and suffix slices overlap and duplicate characters.
+    numprfx = min(xres.count('p'), len(ww))
+    numsufx = min(xres.count('f'), len(ww) - numprfx)
+    # Explicit end index: ww[a:-0] would be an empty slice, not "to the end"
+    stem_end = len(ww) - numsufx
+    segments = (ww[:numprfx], ww[numprfx:stem_end], ww[stem_end:])
+    # Skip empty segments so no leading, trailing or doubled '+' is emitted
+    return "+".join(part for part in segments if part)
+import re
+def prepare_text(text):
+    """Isolate symbols with spaces and normalise whitespace.
+
+    Every character that is not a word character, whitespace or a digit is
+    surrounded by single spaces, runs of whitespace are collapsed to one
+    space, and leading/trailing whitespace is removed. The underscore counts
+    as a word character and is therefore left attached.
+
+    Args:
+        text: The raw input string.
+
+    Returns:
+        The spaced and trimmed string.
+    """
+    # Pad each symbol on both sides so it becomes a separate token
+    padded = re.sub(r'([^\w\s\d])', r' \1 ', text)
+    # Padding can create double spaces; squeeze them and trim the edges
+    return re.sub(r'\s+', ' ', padded).strip()
+# Load the pickled segmentation model once per server process. Streamlit
+# re-executes this script on every widget interaction, so an uncached
+# joblib.load would re-read the file on each rerun.
+# NOTE(review): joblib.load unpickles arbitrary objects -- only ship a
+# trusted model file with the app.
+@st.cache_resource
+def load_model():
+    """Return the classifier stored in the bundled pickle file."""
+    model = joblib.load('arabic-msa-dialects-segmentation-v1.pkl')
+    print("loaded")
+    return model
+clf = load_model()
+# Text box for the sentence to segment (a slider returns a number, which
+# has no .replace() and cannot be tokenized)
+x = st.text_input('Enter a sentence')
+# Drop the Arabic comma and question mark before tokenizing
+dd = x.replace("،", "").replace("؟", "")
+gg = word_tokenize(dd)
+cc = ""
+if gg:
+    # One feature dict per token; prediction is skipped for empty input
+    result = clf.predict([features(gg, index) for index in range(len(gg))])
+    cc = " ".join(rebuildxx(word, tags) for word, tags in zip(gg, result))
 # Display the prediction
+st.write("Prediction:", cc)