kambris committed
Commit db1f2f7 · verified
1 Parent(s): 78da58a

Update app.py

Files changed (1)
  1. app.py +144 -143
app.py CHANGED
@@ -13,8 +13,8 @@ st.set_page_config(
     page_icon="📚",
     layout="wide"
 )
-@st.cache_resource
 
+@st.cache_resource
 def load_models():
     """Load and cache the models to prevent reloading"""
     tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
@@ -55,20 +55,21 @@ def split_text(text, max_length=512):
 def clean_arabic_text(text):
     # Add Arabic stop words
     ARABIC_STOP_WORDS = {
-        'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
-        'أو', 'و', 'ف', 'ل', 'ب', 'ك', 'لل', 'ال', 'هذا', 'هذه', 'ذلك',
-        'تلك', 'هؤلاء', 'هم', 'هن', 'هو', 'هي', 'نحن', 'انت', 'انتم',
-        'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل', 'بعض', 'غير', 'حول',
-        'عند', 'قد', 'لقد', 'لم', 'لن', 'لو', 'ما', 'ماذا', 'متى', 'كيف',
-        'اين', 'لماذا', 'الذي', 'التي', 'الذين', 'اللاتي', 'اللواتي',
-        'الان', 'بين', 'فوق', 'تحت', 'امام', 'خلف', 'حين', 'قبل', 'بعد',
-        'و', 'أن', 'في', 'كل', 'لم', 'لن', 'له', 'من', 'هو', 'هي', 'قوة',
-        'كما', 'لها', 'منذ', 'وقد', 'ولا', 'نفس', 'ولم', 'حيث', 'هناك',
-        'جدا', 'ذات', 'ضمن', 'انه', 'لدى', 'عليه', 'مثل', 'وله', 'عند',
-        'أما', 'هذه', 'وأن', 'وكل', 'وقال', 'لدي', 'وكان', 'فيه', 'وهي',
-        'وهو', 'تلك', 'كلم', 'لكن', 'وفي', 'وقف', 'ولقد', 'ومن', 'وهذا',
-        'اول', 'ضمن', 'انها', 'جميع', 'الذي', 'قبل', 'بعد', 'حول', 'ايضا',
-        'لازم', 'حاجة', 'علي', 'يجب', 'صار', 'صارت', 'تحت', 'ضد'}
+        'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
+        'أو', 'و', 'ف', 'ل', 'ب', 'ك', 'لل', 'ال', 'هذا', 'هذه', 'ذلك',
+        'تلك', 'هؤلاء', 'هم', 'هن', 'هو', 'هي', 'نحن', 'انت', 'انتم',
+        'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل', 'بعض', 'غير', 'حول',
+        'عند', 'قد', 'لقد', 'لم', 'لن', 'لو', 'ما', 'ماذا', 'متى', 'كيف',
+        'اين', 'لماذا', 'الذي', 'التي', 'الذين', 'اللاتي', 'اللواتي',
+        'الان', 'بين', 'فوق', 'تحت', 'امام', 'خلف', 'حين', 'قبل', 'بعد',
+        'و', 'أن', 'في', 'كل', 'لم', 'لن', 'له', 'من', 'هو', 'هي', 'قوة',
+        'كما', 'لها', 'منذ', 'وقد', 'ولا', 'نفس', 'ولم', 'حيث', 'هناك',
+        'جدا', 'ذات', 'ضمن', 'انه', 'لدى', 'عليه', 'مثل', 'وله', 'عند',
+        'أما', 'هذه', 'وأن', 'وكل', 'وقال', 'لدي', 'وكان', 'فيه', 'وهي',
+        'وهو', 'تلك', 'كلم', 'لكن', 'وفي', 'وقف', 'ولقد', 'ومن', 'وهذا',
+        'اول', 'ضمن', 'انها', 'جميع', 'الذي', 'قبل', 'بعد', 'حول', 'ايضا',
+        'لازم', 'حاجة', 'علي', 'يجب', 'صار', 'صارت', 'تحت', 'ضد'
+    }
     """Clean Arabic text by removing stop words and normalizing."""
     words = text.split()
     cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
@@ -270,142 +271,142 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
             continue
 
     return summaries, topic_model
-
-# Load models
-try:
-    bert_tokenizer, bert_model, emotion_classifier = load_models()
-    st.success("Models loaded successfully!")
-except Exception as e:
-    st.error(f"Error loading models: {str(e)}")
-    st.stop()
 
+# Load models
+try:
+    bert_tokenizer, bert_model, emotion_classifier = load_models()
+    st.success("Models loaded successfully!")
+except Exception as e:
+    st.error(f"Error loading models: {str(e)}")
+    st.stop()
-# Main app interface
-st.title("📚 Arabic Poem Analysis")
-st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
 
+# Main app interface
+st.title("📚 Arabic Poem Analysis")
+st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
-# File upload
-uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
 
-if uploaded_file is not None:
-    try:
-        # Read the file
-        if uploaded_file.name.endswith('.csv'):
-            df = pd.read_csv(uploaded_file)
-        else:
-            df = pd.read_excel(uploaded_file)
-
-        # Validate columns
-        required_columns = ['country', 'poem']
-        if not all(col in df.columns for col in required_columns):
-            st.error("File must contain 'country' and 'poem' columns.")
-            st.stop()
-
-        # Clean data
-        df['country'] = df['country'].str.strip()
-        df = df.dropna(subset=['country', 'poem'])
-
-        # Add topic modeling controls
-        st.subheader("Topic Modeling Settings")
-        col1, col2 = st.columns(2)
+# File upload
+uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
+
+if uploaded_file is not None:
+    try:
+        # Read the file
+        if uploaded_file.name.endswith('.csv'):
+            df = pd.read_csv(uploaded_file)
+        else:
+            df = pd.read_excel(uploaded_file)
+
+        # Validate columns
+        required_columns = ['country', 'poem']
+        if not all(col in df.columns for col in required_columns):
+            st.error("File must contain 'country' and 'poem' columns.")
+            st.stop()
+
+        # Clean data
+        df['country'] = df['country'].str.strip()
+        df = df.dropna(subset=['country', 'poem'])
+
+        # Add topic modeling controls
+        st.subheader("Topic Modeling Settings")
+        col1, col2 = st.columns(2)
 
-        with col1:
-            topic_strategy = st.radio(
-                "Topic Number Strategy",
-                ["Auto", "Manual"],
-                help="Choose whether to let the model determine the optimal number of topics or set it manually"
-            )
+        with col1:
+            topic_strategy = st.radio(
+                "Topic Number Strategy",
+                ["Auto", "Manual"],
+                help="Choose whether to let the model determine the optimal number of topics or set it manually"
+            )
 
-            if topic_strategy == "Manual":
-                # Calculate reasonable max topics based on dataset size
-                n_documents = len(df)
-                max_topics = max(2, min(50, n_documents // 20)) # Ensure minimum of 2
-
-                n_topics = st.slider(
-                    "Number of Topics",
-                    min_value=2,
-                    max_value=max_topics,
-                    value=min(20, max_topics),
-                    help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
-                )
-
-                st.info(f"""
-                💡 For your dataset of {n_documents:,} documents:
-                - Minimum topics: 2
-                - Maximum topics: {max_topics}
-                - Recommended range: {max(2, max_topics//5)}-{max_topics//2}
-                """)
-
-        with col2:
-            top_n = st.number_input(
-                "Number of top topics/emotions to display:",
-                min_value=1,
-                max_value=100,
-                value=10
+            if topic_strategy == "Manual":
+                n_documents = len(df)
+                max_topics = max(2, min(50, n_documents // 20))
+
+                n_topics = st.slider(
+                    "Number of Topics",
+                    min_value=2,
+                    max_value=max_topics,
+                    value=min(20, max_topics),
+                    help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
+                )
+
+                st.info(f"""
+                💡 For your dataset of {n_documents:,} documents:
+                - Minimum topics: 2
+                - Maximum topics: {max_topics}
+                - Recommended range: {max(2, max_topics//5)}-{max_topics//2}
+                """)
+
+        with col2:
+            top_n = st.number_input(
+                "Number of top topics/emotions to display:",
+                min_value=1,
+                max_value=100,
+                value=10
             )
 
-            min_topic_size = st.slider(
-                "Minimum Topic Size",
-                min_value=10,
-                max_value=100,
-                value=30,
-                help="Minimum number of documents required to form a topic"
+            min_topic_size = st.slider(
+                "Minimum Topic Size",
+                min_value=10,
+                max_value=100,
+                value=30,
+                help="Minimum number of documents required to form a topic"
             )
 
-        if st.button("Process Data"):
-            with st.spinner("Processing your data..."):
-                summaries, topic_model = process_and_summarize(
-                    df,
-                    bert_tokenizer,
-                    bert_model,
-                    emotion_classifier,
-                    top_n=top_n,
-                    topic_strategy=topic_strategy,
-                    n_topics=n_topics if topic_strategy == "Manual" else None,
-                    min_topic_size=min_topic_size
-                )
+        if st.button("Process Data"):
+            with st.spinner("Processing your data..."):
+                summaries, topic_model = process_and_summarize(
+                    df,
+                    bert_tokenizer,
+                    bert_model,
+                    emotion_classifier,
+                    top_n=top_n,
+                    topic_strategy=topic_strategy,
+                    n_topics=n_topics if topic_strategy == "Manual" else None,
+                    min_topic_size=min_topic_size
+                )
 
-                if summaries:
-                    st.success("Analysis complete!")
-
-                    # Display results in tabs
-                    tab1, tab2 = st.tabs(["Country Summaries", "Global Topics"])
-
-                    with tab1:
-                        for summary in summaries:
-                            with st.expander(f"📝 {summary['country']} ({summary['total_poems']} poems)"):
-                                col1, col2 = st.columns(2)
-
-                                with col1:
-                                    st.subheader("Top Topics")
-                                    for topic in summary['top_topics']:
-                                        st.write(f"• {topic['topic']}: {topic['count']} poems")
-
-                                with col2:
-                                    st.subheader("Emotions")
-                                    for emotion in summary['top_emotions']:
-                                        st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
-
-                    with tab2:
-                        st.subheader("Global Topic Distribution")
-                        topic_info = topic_model.get_topic_info()
-                        for _, row in topic_info.iterrows():
-                            if row['Topic'] == -1:
-                                topic_name = "Miscellaneous"
-                            else:
-                                words = topic_model.get_topic(row['Topic'])
-                                topic_name = " | ".join([word for word, _ in words[:5]])
-                            st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
-
-    except Exception as e:
-        st.error(f"Error processing file: {str(e)}")
+                if summaries:
+                    st.success("Analysis complete!")
+
+                    # Display results in tabs
+                    tab1, tab2 = st.tabs(["Country Summaries", "Global Topics"])
+
+                    with tab1:
+                        for summary in summaries:
+                            with st.expander(f"📝 {summary['country']} ({summary['total_poems']} poems)"):
+                                col1, col2 = st.columns(2)
+
+                                with col1:
+                                    st.subheader("Top Topics")
+                                    for topic in summary['top_topics']:
+                                        st.write(f"• {topic['topic']}: {topic['count']} poems")
+
+                                with col2:
+                                    st.subheader("Emotions")
+                                    for emotion in summary['top_emotions']:
+                                        st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
+
+                    with tab2:
+                        st.subheader("Global Topic Distribution")
+                        topic_info = topic_model.get_topic_info()
+                        for _, row in topic_info.iterrows():
+                            if row['Topic'] == -1:
+                                topic_name = "Miscellaneous"
+                            else:
+                                words = topic_model.get_topic(row['Topic'])
+                                topic_name = " | ".join([word for word, _ in words[:5]])
+                            st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
+
+    except Exception as e:
+        st.error(f"Error processing file: {str(e)}")
 
-else:
-    st.info("👆 Upload a file to get started!")
-
-    # Example format
-    st.write("### Expected File Format:")
-    example_df = pd.DataFrame({
-        'country': ['Egypt', 'Palestine'],
-        'poem': ['قصيدة مصرية', 'قصيدة فلسطينية']
-    })
-    st.dataframe(example_df)
+else:
+    st.info("👆 Upload a file to get started!")
+
+    # Example format
+    st.write("### Expected File Format:")
+    example_df = pd.DataFrame({
+        'country': ['Egypt', 'Palestine'],
+        'poem': ['قصيدة مصرية', 'قصيدة فلسطينية']
+    })
+    st.dataframe(example_df)
+
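Note on the first hunk: st.cache_resource caches only the function it directly decorates, so the decorator has to sit immediately above load_models() for the models to be loaded once and reused across Streamlit reruns. A minimal sketch of that pattern, assuming the sequence-classification head is the right model class for the checkpoint shown in the diff:

import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification

@st.cache_resource  # first call loads the weights; later script reruns reuse the same objects
def load_models():
    """Load and cache the models to prevent reloading."""
    name = "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment"  # checkpoint from the diff
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForSequenceClassification.from_pretrained(name)  # model class is an assumption
    return tokenizer, model

tokenizer, model = load_models()  # cheap after the first run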