Alshargi committed on
Commit
419c706
·
verified ·
1 Parent(s): fa74430

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -10
app.py CHANGED
@@ -1,20 +1,111 @@
1
  import streamlit as st
2
- from transformers import pipeline, AutoConfig
3
 
4
- # Load the model configuration from config.json
5
- config = AutoConfig.from_pretrained("Alshargi/arabic-msa-dialects-segmentation")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- # Load the model using the configuration
8
- model = pipeline("text-classification", model="Alshargi/arabic-msa-dialects-segmentation", config=config)
9
 
10
  # Slider to select a value
11
  x = st.slider('Select a value')
 
 
 
 
 
 
 
12
 
13
- # Display the squared value
14
- st.write(x, 'squared is', x * x)
15
 
16
- # Make prediction using the loaded model
17
- prediction = model(x)
 
 
18
 
19
  # Display the prediction
20
- st.write("Prediction:", prediction)
 
 
 
1
  import streamlit as st
 
2
 
3
+ import joblib
4
+ from nltk import word_tokenize
5
+
6
+
7
+
8
+
9
+ #import string, re
10
def features(sentence, index):
    """Build the feature dictionary for the token at ``sentence[index]``.

    Args:
        sentence: Sequence of token strings.
        index: Position in ``sentence`` of the token to describe.

    Returns:
        A dict holding the token itself, first/last position flags, the
        token length, its leading and trailing substrings of 1-5 characters,
        the values returned by the ``prvwords_*`` / ``nextwords_*`` helpers,
        and whether the token consists only of digits.
    """
    # NOTE(review): prvwords_1..4 and nextwords_1..4 are not defined in the
    # visible part of this file. They must be supplied elsewhere (presumably
    # the same helpers used when the model was trained -- confirm), otherwise
    # this function raises NameError.
    word = sentence[index]
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'lword': len(word),

        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'prefix-4': word[:4],
        'prefix-5': word[:5],

        # Slice rather than word[-1] so an empty token yields '' (like the
        # other suffix features) instead of raising IndexError.
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'suffix-4': word[-4:],
        'suffix-5': word[-5:],

        'prev_word_4': prvwords_4(sentence, index),
        'prev_word_3': prvwords_3(sentence, index),
        'prev_word_2': prvwords_2(sentence, index),
        'prev_word_1': prvwords_1(sentence, index),

        'next_word_1': nextwords_1(sentence, index),
        'next_word_2': nextwords_2(sentence, index),
        'next_word_3': nextwords_3(sentence, index),
        'next_word_4': nextwords_4(sentence, index),

        'is_numeric': word.isdigit(),
    }
42
+
43
+
44
+
45
+
46
def rebuildxx(ww, xres):
    """Split ``ww`` into prefix, stem and suffix joined by ``'+'``.

    The prefix length is the number of ``'p'`` characters in ``xres`` and
    the suffix length is the number of ``'f'`` characters in ``xres``.
    A part whose length is zero is omitted together with its separator.

    Args:
        ww: The word to segment.
        xres: Label string for the word (presumably the classifier output
            for this token -- confirm against the caller).

    Returns:
        ``prefix+stem+suffix``, ``prefix+stem``, ``stem+suffix`` or the
        unchanged word, depending on which counts are non-zero.
    """
    numprfx = xres.count('p')
    numsufx = xres.count('f')

    parts = []
    if numprfx:
        parts.append(ww[:numprfx])
    # A suffix length of 0 must not be written as ww[a:-0], which is empty.
    parts.append(ww[numprfx:-numsufx] if numsufx else ww[numprfx:])
    if numsufx:
        parts.append(ww[-numsufx:])
    return "+".join(parts)
65
+
66
+
67
+
68
+ import re
69
+
70
def prepare_text(text):
    """Surround every symbol in ``text`` with spaces and tidy whitespace.

    A symbol is any character that is neither a word character (``\\w``,
    which covers letters, digits and the underscore) nor whitespace. Runs of
    whitespace are then collapsed to a single space and the ends trimmed.

    Args:
        text: The string to prepare.

    Returns:
        The prepared string with symbols separated from neighbouring text.
    """
    # \w already matches digits, so no separate \d is needed in the class.
    symbol_pattern = r'([^\w\s])'
    spaced = re.sub(symbol_pattern, r' \1 ', text)
    return re.sub(r'\s+', ' ', spaced).strip()
77
+
78
+
79
+
80
+
81
# Load the trained segmentation model.
# NOTE: joblib files are pickle-based and can execute code when loaded;
# only load a model file that comes from a trusted source.
clf = joblib.load('arabic-msa-dialects-segmentation-v1.pkl')
print("loaded")

keepall = []  # not used below; kept so the module-level name still exists

# Text box for the sentence to segment. The previous st.slider returned a
# number, so the str.replace calls below failed with AttributeError.
x = st.text_input('Enter text to segment')
themaxres = x  # not used below; kept so the module-level name still exists

# Drop the Arabic comma and question mark before tokenising.
dd = x.replace("،", "").replace("؟", "")
gg = word_tokenize(dd)

# Only call the model when there is at least one token to classify.
if gg:
    result = clf.predict([features(gg, index) for index in range(len(gg))])
    cc = " ".join(rebuildxx(token, label) for token, label in zip(gg, result))
else:
    cc = ""

# Display the prediction
st.write("Prediction:", cc)
110
+
111
+