kambris committed on
Commit f496437 · verified · 1 Parent(s): 58609ca

Update app.py

Files changed (1)
  1. app.py +23 -24
app.py CHANGED
@@ -7,7 +7,23 @@ import torch
 import numpy as np
 from collections import Counter
 import os
-
+# Add Arabic stop words
+ARABIC_STOP_WORDS = {
+    'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
+    'أو', 'و', 'ف', 'ل', 'ب', 'ك', 'لل', 'ال', 'هذا', 'هذه', 'ذلك',
+    'تلك', 'هؤلاء', 'هم', 'هن', 'هو', 'هي', 'نحن', 'انت', 'انتم',
+    'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل', 'بعض', 'غير', 'حول',
+    'عند', 'قد', 'لقد', 'لم', 'لن', 'لو', 'ما', 'ماذا', 'متى', 'كيف',
+    'اين', 'لماذا', 'الذي', 'التي', 'الذين', 'اللاتي', 'اللواتي',
+    'الان', 'بين', 'فوق', 'تحت', 'امام', 'خلف', 'حين', 'قبل', 'بعد',
+    'و', 'أن', 'في', 'كل', 'لم', 'لن', 'له', 'من', 'هو', 'هي', 'قوة',
+    'كما', 'لها', 'منذ', 'وقد', 'ولا', 'نفس', 'ولم', 'حيث', 'هناك',
+    'جدا', 'ذات', 'ضمن', 'انه', 'لدى', 'عليه', 'مثل', 'وله', 'عند',
+    'أما', 'هذه', 'وأن', 'وكل', 'وقال', 'لدي', 'وكان', 'فيه', 'وهي',
+    'وهو', 'تلك', 'كلم', 'لكن', 'وفي', 'وقف', 'ولقد', 'ومن', 'وهذا',
+    'اول', 'ضمن', 'انها', 'جميع', 'الذي', 'قبل', 'بعد', 'حول', 'ايضا',
+    'لازم', 'حاجة', 'علي', 'يجب', 'صار', 'صارت', 'تحت', 'ضد'
+}
 # Configure page
 st.set_page_config(
     page_title="Arabic Poem Analysis",
@@ -54,23 +70,6 @@ def split_text(text, max_length=512):
     return chunks
 
 def clean_arabic_text(text):
-    # Add Arabic stop words
-    ARABIC_STOP_WORDS = {
-        'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
-        'أو', 'و', 'ف', 'ل', 'ب', 'ك', 'لل', 'ال', 'هذا', 'هذه', 'ذلك',
-        'تلك', 'هؤلاء', 'هم', 'هن', 'هو', 'هي', 'نحن', 'انت', 'انتم',
-        'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل', 'بعض', 'غير', 'حول',
-        'عند', 'قد', 'لقد', 'لم', 'لن', 'لو', 'ما', 'ماذا', 'متى', 'كيف',
-        'اين', 'لماذا', 'الذي', 'التي', 'الذين', 'اللاتي', 'اللواتي',
-        'الان', 'بين', 'فوق', 'تحت', 'امام', 'خلف', 'حين', 'قبل', 'بعد',
-        'و', 'أن', 'في', 'كل', 'لم', 'لن', 'له', 'من', 'هو', 'هي', 'قوة',
-        'كما', 'لها', 'منذ', 'وقد', 'ولا', 'نفس', 'ولم', 'حيث', 'هناك',
-        'جدا', 'ذات', 'ضمن', 'انه', 'لدى', 'عليه', 'مثل', 'وله', 'عند',
-        'أما', 'هذه', 'وأن', 'وكل', 'وقال', 'لدي', 'وكان', 'فيه', 'وهي',
-        'وهو', 'تلك', 'كلم', 'لكن', 'وفي', 'وقف', 'ولقد', 'ومن', 'وهذا',
-        'اول', 'ضمن', 'انها', 'جميع', 'الذي', 'قبل', 'بعد', 'حول', 'ايضا',
-        'لازم', 'حاجة', 'علي', 'يجب', 'صار', 'صارت', 'تحت', 'ضد'
-    }
     """Clean Arabic text by removing stop words and normalizing."""
     words = text.split()
     cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
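
Net effect of the two hunks above: the stop-word set is no longer rebuilt inside clean_arabic_text on every call (where it also sat above the triple-quoted string, so that string never actually served as the docstring); it is now a module-level constant that the function simply reads. A minimal sketch of how the function reads after the move; the abbreviated set and the final join are assumptions, since only the filter line is visible in the hunk:

# Minimal sketch, not the full app.py. ARABIC_STOP_WORDS is abbreviated here;
# in the commit it is the full module-level set added in the first hunk.
ARABIC_STOP_WORDS = {'في', 'من', 'إلى', 'على'}

def clean_arabic_text(text):
    """Clean Arabic text by removing stop words and normalizing."""
    words = text.split()
    # Keep words that are neither stop words nor single characters
    cleaned_words = [w for w in words if w not in ARABIC_STOP_WORDS and len(w) > 1]
    return ' '.join(cleaned_words)  # assumed; the return lies outside the visible hunk
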
@@ -319,21 +318,21 @@ if uploaded_file is not None:
 
         if topic_strategy == "Manual":
             n_documents = len(df)
-            max_topics = max(3, min(50, n_documents // 20))
+            max_topics = min(500, n_documents // 50)
+            min_topics = 5
 
             n_topics = st.slider(
                 "Number of Topics",
-                min_value=2,
+                min_value=min_topics,
                 max_value=max_topics,
-                value=min(20, max_topics),
+                value=default_topics,
                 help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
             )
 
             st.info(f"""
             💡 For your dataset of {n_documents:,} documents:
-            - Minimum topics: 2
-            - Maximum topics: {max_topics}
-            - Recommended range: {max(2, max_topics//5)}-{max_topics//2}
+            - Available topic range: {min_topics}-{max_topics}
+            - Recommended range: {max_topics//10}-{max_topics//3} for optimal coherence
             """)
 
     with col2:
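
This hunk rescales the manual topic selector with corpus size: the ceiling rises from 50 to min(500, n_documents // 50), the floor from 2 to min_topics = 5, and the info box now derives its recommended range from max_topics. Note that default_topics, passed as the slider's value, is not defined anywhere in the visible hunk; the sketch below assumes a plausible clamp into the new bounds purely for illustration:

# Illustrative sketch only; default_topics is referenced by the new slider call
# but is not defined in the visible hunk, so this definition is an assumption.
import streamlit as st

n_documents = 12000                                          # example corpus size
max_topics = min(500, n_documents // 50)                     # 240 for this example
min_topics = 5
default_topics = max(min_topics, min(50, max_topics // 5))   # assumed default value

n_topics = st.slider(
    "Number of Topics",
    min_value=min_topics,
    max_value=max_topics,
    value=default_topics,
    help=f"Select the desired number of topics (max {max_topics} based on dataset size)",
)
# For this example the info box's recommended range would be
# max_topics//10 to max_topics//3, i.e. 24 to 80 topics.
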
 