kambris committed on
Commit
5fce9bd
·
verified ·
1 Parent(s): 254cb4f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -73
app.py CHANGED
@@ -8,100 +8,59 @@ import torch
8
  tokenizer = T5Tokenizer.from_pretrained("UBC-NLP/araT5-base")
9
  model = T5ForConditionalGeneration.from_pretrained("UBC-NLP/araT5-base")
10
 
11
- # Emotion classification pipeline (updated model for Arabic emotion classification)
12
  emotion_classifier = pipeline("text-classification", model="aubmindlab/bert-base-arabertv2")
13
 
14
  # Function to get embeddings from ARAT5 for topic modeling
15
  def generate_embeddings(texts):
16
- # Tokenize the Arabic text for ARAT5
17
  inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
18
  with torch.no_grad():
19
- # Use ARAT5 to generate embeddings
20
  outputs = model.encoder(input_ids=inputs['input_ids'])
21
- # Extract the embeddings (mean of hidden states for simplicity)
22
- return outputs.last_hidden_state.mean(dim=1).numpy()
23
 
24
- # Function to process the CSV file and return emotion and topic model
25
  def process_file(uploaded_file):
26
- # Load CSV
27
- df = pd.read_csv(uploaded_file)
28
-
29
- # Display basic info about the CSV
30
- st.write("CSV Loaded Successfully!")
31
- st.write(f"Data Preview: {df.head()}")
 
 
 
 
 
 
 
 
 
32
 
33
- # Ensure 'date' column is in datetime format and extract the year
34
- df['date'] = pd.to_datetime(df['date'], errors='coerce') # Replace 'date' with your actual column name
 
35
  df['year'] = df['date'].dt.year
36
 
37
- # Modify this to use the 'poem' column that contains the Arabic poems
38
- texts = df['poem'].dropna().tolist() # Replace 'poem' with your actual column name
39
-
40
- # Emotion Classification: Classify emotions for each poem (Arabic)
41
  emotions = [emotion_classifier(text)[0]['label'] for text in texts]
42
  df['emotion'] = emotions
43
 
44
- # Topic Modeling using ARAT5 embeddings
45
  embeddings = generate_embeddings(texts)
46
  topic_model = BERTopic()
47
  topics, _ = topic_model.fit_transform(embeddings)
48
  df['topic'] = topics
49
-
50
- # Return the processed dataframe
51
  return df
52
 
53
  # Streamlit App
54
- st.title("Arabic Poem Topic Modeling & Emotion Classification with ARAT5")
55
- st.write("Upload a CSV file to perform topic modeling and emotion classification on Arabic poems.")
56
-
57
- # File upload widget
58
- uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
59
 
60
- # If file is uploaded, process and display results
61
  if uploaded_file is not None:
62
- result_df = process_file(uploaded_file)
63
-
64
- # Show date selection widgets
65
- st.write("### Filter by Date Range")
66
- start_date = st.date_input("Start Date", value=pd.to_datetime(result_df['date'].min()))
67
- end_date = st.date_input("End Date", value=pd.to_datetime(result_df['date'].max()))
68
-
69
- # Filter data based on selected date range
70
- filtered_df = result_df[(result_df['date'] >= start_date) & (result_df['date'] <= end_date)]
71
-
72
- # Display filtered data
73
- st.write(f"Filtered Data (Poems from {start_date} to {end_date}):")
74
- st.write(filtered_df[['poet_name', 'era', 'poem', 'emotion', 'topic', 'date']])
75
-
76
- # Create buttons to show different summaries
77
- summary_type = st.radio("Select Summary Type:",
78
- ("Emotion and Topic Summary by Date Range",
79
- "Global Emotion and Topic Summary"))
80
-
81
- # Display the selected summary
82
- if summary_type == "Emotion and Topic Summary by Date Range":
83
- st.write("Emotion and Topic Summary for Selected Date Range:")
84
-
85
- # Emotion Distribution in Date Range
86
- emotion_counts = filtered_df['emotion'].value_counts()
87
- st.write("Emotion Counts in Date Range:")
88
- st.write(emotion_counts)
89
-
90
- # Topic Distribution in Date Range
91
- topic_counts = filtered_df['topic'].value_counts()
92
- st.write("Topic Counts in Date Range:")
93
- st.write(topic_counts)
94
-
95
- # Visualize emotion distribution over the selected range (optional)
96
- st.bar_chart(emotion_counts, use_container_width=True)
97
-
98
- # Visualize topic distribution over the selected range (optional)
99
- st.bar_chart(topic_counts, use_container_width=True)
100
-
101
- elif summary_type == "Global Emotion and Topic Summary":
102
- st.write("Global Emotion and Topic Summary (All Poems):")
103
- global_emotion_count = result_df['emotion'].value_counts().to_dict()
104
- global_topic_count = result_df['topic'].value_counts().to_dict()
105
-
106
- st.write(f"Emotion Distribution: {global_emotion_count}")
107
- st.write(f"Topic Distribution: {global_topic_count}")
 
@st.cache_resource
def _load_models():
    """Load the AraT5 tokenizer/model and the classification pipeline once.

    Streamlit re-executes the whole script on every user interaction, so
    loading the checkpoints at module level would repeat the (slow) load on
    each rerun. ``st.cache_resource`` keeps a single shared copy instead.

    Returns:
        A ``(tokenizer, model, emotion_classifier)`` tuple.
    """
    t5_tokenizer = T5Tokenizer.from_pretrained("UBC-NLP/araT5-base")
    t5_model = T5ForConditionalGeneration.from_pretrained("UBC-NLP/araT5-base")

    # Emotion classification pipeline
    # NOTE(review): "aubmindlab/bert-base-arabertv2" looks like a base
    # (pre-trained) checkpoint rather than one fine-tuned for emotion labels;
    # confirm that it actually ships a trained classification head.
    classifier = pipeline("text-classification", model="aubmindlab/bert-base-arabertv2")
    return t5_tokenizer, t5_model, classifier


# Module-level names are preserved because the functions below read them.
tokenizer, model, emotion_classifier = _load_models()
13
 
# Function to get embeddings from ARAT5 for topic modeling
def generate_embeddings(texts, batch_size=16):
    """Return one mean-pooled AraT5 encoder embedding per input text.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of texts encoded per forward pass. Encoding in
            small batches keeps peak memory bounded for large uploads.

    Returns:
        A NumPy array with one row per text and one column per encoder
        hidden dimension. An empty input yields an array with zero rows.
    """
    texts = list(texts)
    if not texts:
        # The tokenizer cannot handle an empty batch; return an empty matrix
        # with the right width so callers can still inspect the shape.
        return torch.empty((0, model.config.d_model)).numpy()

    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
            attention_mask = inputs['attention_mask']
            # Pass the mask so real tokens do not attend to padding.
            outputs = model.encoder(input_ids=inputs['input_ids'], attention_mask=attention_mask)
            hidden_states = outputs[0]

            # Average only over real tokens; otherwise the embedding of a
            # short text would depend on how much padding its batch needed.
            mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled_batches.append((hidden_states * mask).sum(dim=1) / token_counts)

    embeddings = torch.cat(pooled_batches, dim=0).numpy()
    return embeddings
21
 
# Function to process the CSV or Excel file
def process_file(uploaded_file):
    """Load an uploaded poem table and annotate it with emotions and topics.

    Args:
        uploaded_file: File-like upload exposing a ``name`` attribute; must
            be a ``.csv`` or ``.xlsx`` file with ``date`` and ``poem`` columns.

    Returns:
        A DataFrame restricted to rows that have both a parseable date and a
        poem, with added ``year``, ``emotion`` and ``topic`` columns. Returns
        ``None`` (after reporting via ``st.error``) when the format is
        unsupported, a required column is missing, or no usable row remains.
    """
    # Determine the file type (case-insensitively, so "POEMS.CSV" works too)
    filename = uploaded_file.name.lower()
    if filename.endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    elif filename.endswith(".xlsx"):
        df = pd.read_excel(uploaded_file)
    else:
        st.error("Unsupported file format.")
        return None

    # Validate required columns
    required_columns = ['date', 'poem']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        st.error(f"Missing columns: {', '.join(missing_columns)}")
        return None

    # Process the file
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    # Drop rows lacking a date OR a poem from the frame itself, so the
    # per-poem results below line up with the rows one-to-one. Copy to avoid
    # writing into a view of the original frame.
    df = df.dropna(subset=['date', 'poem']).copy()
    if df.empty:
        st.error("No rows with both a valid date and a poem were found.")
        return None
    df['year'] = df['date'].dt.year

    texts = df['poem'].astype(str).tolist()

    # Truncate so poems longer than the classifier's input limit do not fail.
    predictions = emotion_classifier(texts, truncation=True, max_length=512)
    df['emotion'] = [prediction['label'] for prediction in predictions]

    embeddings = generate_embeddings(texts)
    topic_model = BERTopic()
    # BERTopic expects the documents first; the precomputed vectors go in
    # through the ``embeddings`` argument.
    topics, _ = topic_model.fit_transform(texts, embeddings)
    df['topic'] = topics

    return df
54
 
# Streamlit App: page title, upload widget, and a preview of the results.
st.title("Arabic Poem Topic Modeling & Emotion Classification")
uploaded_file = st.file_uploader(
    "Choose a file",
    type=["csv", "xlsx"],
)

# Nothing is processed until the user has actually picked a file.
if uploaded_file is not None:
    try:
        result_df = process_file(uploaded_file)
        # process_file returns None after it has already reported a problem.
        if result_df is not None:
            st.write("Data successfully processed!")
            preview = result_df.head()
            st.write(preview)
    except Exception as exc:
        # Top-level UI boundary: surface any failure on the page.
        st.error(f"Error: {exc}")