kambris committed on
Commit f496437 · verified · 1 Parent(s): 58609ca

Update app.py

Files changed (1)
  1. app.py +23 -24
app.py CHANGED
@@ -7,7 +7,23 @@ import torch
 import numpy as np
 from collections import Counter
 import os
-
+# Add Arabic stop words
+ARABIC_STOP_WORDS = {
+    'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
+    'أو', 'و', 'ف', 'ل', 'ب', 'ك', 'لل', 'ال', 'هذا', 'هذه', 'ذلك',
+    'تلك', 'هؤلاء', 'هم', 'هن', 'هو', 'هي', 'نحن', 'انت', 'انتم',
+    'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل', 'بعض', 'غير', 'حول',
+    'عند', 'قد', 'لقد', 'لم', 'لن', 'لو', 'ما', 'ماذا', 'متى', 'كيف',
+    'اين', 'لماذا', 'الذي', 'التي', 'الذين', 'اللاتي', 'اللواتي',
+    'الان', 'بين', 'فوق', 'تحت', 'امام', 'خلف', 'حين', 'قبل', 'بعد',
+    'و', 'أن', 'في', 'كل', 'لم', 'لن', 'له', 'من', 'هو', 'هي', 'قوة',
+    'كما', 'لها', 'منذ', 'وقد', 'ولا', 'نفس', 'ولم', 'حيث', 'هناك',
+    'جدا', 'ذات', 'ضمن', 'انه', 'لدى', 'عليه', 'مثل', 'وله', 'عند',
+    'أما', 'هذه', 'وأن', 'وكل', 'وقال', 'لدي', 'وكان', 'فيه', 'وهي',
+    'وهو', 'تلك', 'كلم', 'لكن', 'وفي', 'وقف', 'ولقد', 'ومن', 'وهذا',
+    'اول', 'ضمن', 'انها', 'جميع', 'الذي', 'قبل', 'بعد', 'حول', 'ايضا',
+    'لازم', 'حاجة', 'علي', 'يجب', 'صار', 'صارت', 'تحت', 'ضد'
+}
 # Configure page
 st.set_page_config(
     page_title="Arabic Poem Analysis",
@@ -54,23 +70,6 @@ def split_text(text, max_length=512):
     return chunks
 
 def clean_arabic_text(text):
-    # Add Arabic stop words
-    ARABIC_STOP_WORDS = {
-        'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
-        'أو', 'و', 'ف', 'ل', 'ب', 'ك', 'لل', 'ال', 'هذا', 'هذه', 'ذلك',
-        'تلك', 'هؤلاء', 'هم', 'هن', 'هو', 'هي', 'نحن', 'انت', 'انتم',
-        'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل', 'بعض', 'غير', 'حول',
-        'عند', 'قد', 'لقد', 'لم', 'لن', 'لو', 'ما', 'ماذا', 'متى', 'كيف',
-        'اين', 'لماذا', 'الذي', 'التي', 'الذين', 'اللاتي', 'اللواتي',
-        'الان', 'بين', 'فوق', 'تحت', 'امام', 'خلف', 'حين', 'قبل', 'بعد',
-        'و', 'أن', 'في', 'كل', 'لم', 'لن', 'له', 'من', 'هو', 'هي', 'قوة',
-        'كما', 'لها', 'منذ', 'وقد', 'ولا', 'نفس', 'ولم', 'حيث', 'هناك',
-        'جدا', 'ذات', 'ضمن', 'انه', 'لدى', 'عليه', 'مثل', 'وله', 'عند',
-        'أما', 'هذه', 'وأن', 'وكل', 'وقال', 'لدي', 'وكان', 'فيه', 'وهي',
-        'وهو', 'تلك', 'كلم', 'لكن', 'وفي', 'وقف', 'ولقد', 'ومن', 'وهذا',
-        'اول', 'ضمن', 'انها', 'جميع', 'الذي', 'قبل', 'بعد', 'حول', 'ايضا',
-        'لازم', 'حاجة', 'علي', 'يجب', 'صار', 'صارت', 'تحت', 'ضد'
-    }
     """Clean Arabic text by removing stop words and normalizing."""
     words = text.split()
     cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
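
Net effect of the two hunks above: the stop-word set is no longer rebuilt inside clean_arabic_text on every call (where it also sat above the triple-quoted string, so that string never actually served as the docstring); it is now a module-level constant that the function simply reads. A minimal sketch of how the function reads after the move; the abbreviated set and the final join are assumptions, since only the filter line is visible in the hunk:

# Minimal sketch, not the full app.py. ARABIC_STOP_WORDS is abbreviated here;
# in the commit it is the full module-level set added in the first hunk.
ARABIC_STOP_WORDS = {'في', 'من', 'إلى', 'على'}

def clean_arabic_text(text):
    """Clean Arabic text by removing stop words and normalizing."""
    words = text.split()
    # Keep words that are neither stop words nor single characters
    cleaned_words = [w for w in words if w not in ARABIC_STOP_WORDS and len(w) > 1]
    return ' '.join(cleaned_words)  # assumed; the return lies outside the visible hunk
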
@@ -319,21 +318,21 @@ if uploaded_file is not None:
 
         if topic_strategy == "Manual":
             n_documents = len(df)
-            max_topics = max(3, min(50, n_documents // 20))
+            max_topics = min(500, n_documents // 50)
+            min_topics = 5
 
             n_topics = st.slider(
                 "Number of Topics",
-                min_value=2,
+                min_value=min_topics,
                 max_value=max_topics,
-                value=min(20, max_topics),
+                value=default_topics,
                 help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
             )
 
             st.info(f"""
             💡 For your dataset of {n_documents:,} documents:
-            - Minimum topics: 2
-            - Maximum topics: {max_topics}
-            - Recommended range: {max(2, max_topics//5)}-{max_topics//2}
+            - Available topic range: {min_topics}-{max_topics}
+            - Recommended range: {max_topics//10}-{max_topics//3} for optimal coherence
             """)
 
     with col2:
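
This hunk rescales the manual topic selector with corpus size: the ceiling rises from 50 to min(500, n_documents // 50), the floor from 2 to min_topics = 5, and the info box now derives its recommended range from max_topics. Note that default_topics, passed as the slider's value, is not defined anywhere in the visible hunk; the sketch below assumes a plausible clamp into the new bounds purely for illustration:

# Illustrative sketch only; default_topics is referenced by the new slider call
# but is not defined in the visible hunk, so this definition is an assumption.
import streamlit as st

n_documents = 12000                                          # example corpus size
max_topics = min(500, n_documents // 50)                     # 240 for this example
min_topics = 5
default_topics = max(min_topics, min(50, max_topics // 5))   # assumed default value

n_topics = st.slider(
    "Number of Topics",
    min_value=min_topics,
    max_value=max_topics,
    value=default_topics,
    help=f"Select the desired number of topics (max {max_topics} based on dataset size)",
)
# For this example the info box's recommended range would be
# max_topics//10 to max_topics//3, i.e. 24 to 80 topics.
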
 