Update app.py
app.py CHANGED
@@ -7,7 +7,23 @@ import torch
 import numpy as np
 from collections import Counter
 import os
-
+# Add Arabic stop words
+ARABIC_STOP_WORDS = {
+    'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
+    'أو', 'و', 'ف', 'ل', 'ب', 'ك', 'كي', 'ان', 'هذا', 'هذه', 'ذلك',
+    'تلك', 'هؤلاء', 'هم', 'هن', 'هو', 'هي', 'نحن', 'انت', 'انتم',
+    'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل', 'بعض', 'غير', 'حول',
+    'عند', 'قد', 'لقد', 'لم', 'لن', 'لو', 'ما', 'ماذا', 'متى', 'كيف',
+    'اين', 'لماذا', 'الذي', 'التي', 'الذين', 'اللاتي', 'اللواتي',
+    'الان', 'بين', 'فوق', 'تحت', 'امام', 'خلف', 'حول', 'قبل', 'بعد',
+    'و', 'أن', 'هل', 'لك', 'كم', 'له', 'لي', 'من', 'هو', 'هي', 'قوة',
+    'كما', 'لها', 'منذ', 'فقد', 'هنا', 'ليس', 'لهم', 'حيث', 'هناك',
+    'جدا', 'ذات', 'ضمن', 'انه', 'لدى', 'عليه', 'مثل', 'لكن', 'عند',
+    'أما', 'هذه', 'لأن', 'فهو', 'وكان', 'لدي', 'فكان', 'ولو', 'لكي',
+    'فيه', 'تلك', 'فهم', 'وهو', 'وهي', 'يلي', 'يفيد', 'ومن', 'هكذا',
+    'انه', 'ضمن', 'انها', 'جميع', 'الذي', 'قبل', 'بعد', 'حول', 'ايضا',
+    'لازم', 'حاجة', 'على', 'يجب', 'صار', 'صارت', 'تحت', 'ضد'
+}
 # Configure page
 st.set_page_config(
     page_title="Arabic Poem Analysis",
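Hoisting ARABIC_STOP_WORDS to module scope builds the set once at import time instead of on every call to clean_arabic_text, and keeps the per-word membership test O(1). A minimal standalone sketch of the pattern, with a shortened word set and a hypothetical function name (remove_stop_words is not from app.py):

# Illustrative only: app.py defines the full set shown in the hunk above.
ARABIC_STOP_WORDS = {'في', 'من', 'على', 'عن'}

def remove_stop_words(text: str) -> str:
    # Same rule as clean_arabic_text: drop stop words and single-character tokens.
    return ' '.join(w for w in text.split() if w not in ARABIC_STOP_WORDS and len(w) > 1)

print(remove_stop_words('نظرة في ديوان الشعر'))  # -> 'نظرة ديوان الشعر'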
@@ -54,23 +70,6 @@ def split_text(text, max_length=512):
     return chunks

 def clean_arabic_text(text):
-    # Add Arabic stop words
-    ARABIC_STOP_WORDS = {
-        'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
-        'أو', 'و', 'ف', 'ل', 'ب', 'ك', 'كي', 'ان', 'هذا', 'هذه', 'ذلك',
-        'تلك', 'هؤلاء', 'هم', 'هن', 'هو', 'هي', 'نحن', 'انت', 'انتم',
-        'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل', 'بعض', 'غير', 'حول',
-        'عند', 'قد', 'لقد', 'لم', 'لن', 'لو', 'ما', 'ماذا', 'متى', 'كيف',
-        'اين', 'لماذا', 'الذي', 'التي', 'الذين', 'اللاتي', 'اللواتي',
-        'الان', 'بين', 'فوق', 'تحت', 'امام', 'خلف', 'حول', 'قبل', 'بعد',
-        'و', 'أن', 'هل', 'لك', 'كم', 'له', 'لي', 'من', 'هو', 'هي', 'قوة',
-        'كما', 'لها', 'منذ', 'فقد', 'هنا', 'ليس', 'لهم', 'حيث', 'هناك',
-        'جدا', 'ذات', 'ضمن', 'انه', 'لدى', 'عليه', 'مثل', 'لكن', 'عند',
-        'أما', 'هذه', 'لأن', 'فهو', 'وكان', 'لدي', 'فكان', 'ولو', 'لكي',
-        'فيه', 'تلك', 'فهم', 'وهو', 'وهي', 'يلي', 'يفيد', 'ومن', 'هكذا',
-        'انه', 'ضمن', 'انها', 'جميع', 'الذي', 'قبل', 'بعد', 'حول', 'ايضا',
-        'لازم', 'حاجة', 'على', 'يجب', 'صار', 'صارت', 'تحت', 'ضد'
-    }
     """Clean Arabic text by removing stop words and normalizing."""
     words = text.split()
     cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
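With the constant hoisted, clean_arabic_text keeps only the filtering and normalization logic shown above. A hedged sketch of batch cleaning before topic modeling; the DataFrame and its 'text' column are hypothetical stand-ins for whatever the uploaded file actually contains, and clean_arabic_text is assumed to be in scope from app.py:

import pandas as pd

# Hypothetical sample rows; in the app, df is built from the uploaded file.
df = pd.DataFrame({'text': ['قصيدة في الليل على البحر', 'بيت من الشعر القديم']})

# Apply the app's cleaning function to every row before topic modeling.
df['clean_text'] = df['text'].apply(clean_arabic_text)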
@@ -319,21 +318,21 @@ if uploaded_file is not None:

     if topic_strategy == "Manual":
         n_documents = len(df)
-        max_topics =
+        max_topics = min(500, n_documents // 50)
+        min_topics = 5

         n_topics = st.slider(
             "Number of Topics",
-            min_value=
+            min_value=min_topics,
             max_value=max_topics,
-            value=
+            value=default_topics,
             help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
         )

         st.info(f"""
         💡 For your dataset of {n_documents:,} documents:
-        -
-        -
-        - Recommended range: {max(2, max_topics//5)}-{max_topics//2}
+        - Available topic range: {min_topics}-{max_topics}
+        - Recommended range: {max_topics//10}-{max_topics//3} for optimal coherence
         """)

     with col2:
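The new slider passes value=default_topics, but none of the added lines define default_topics; unless it is assigned elsewhere in app.py, selecting the Manual strategy would raise a NameError. The added bound max_topics = min(500, n_documents // 50) can also fall below min_topics (it is 0 for fewer than 50 documents and under 5 for fewer than 250), which would give the slider an inconsistent range. A minimal sketch of one way to derive all three values with clamping; the helper name and the // 100 default policy are assumptions, not code from app.py:

def topic_slider_bounds(n_documents: int, floor: int = 5, cap: int = 500):
    """Return (min_topics, max_topics, default_topics) for the topic slider."""
    max_topics = max(floor, min(cap, n_documents // 50))               # never below the slider floor
    default_topics = min(max_topics, max(floor, n_documents // 100))   # clamped starting value
    return floor, max_topics, default_topics

# Example: topic_slider_bounds(12000) -> (5, 240, 120); the results can feed
# st.slider(..., min_value=min_topics, max_value=max_topics, value=default_topics).
min_topics, max_topics, default_topics = topic_slider_bounds(12000)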