kambris committed on
Commit
729733d
·
verified ·
1 Parent(s): cebfb12

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -19
app.py CHANGED
@@ -170,9 +170,9 @@ def classify_emotion(text, classifier):
170
  return "LABEL_2"
171
 
172
  def get_embedding_for_text(text, tokenizer, model):
173
- """Get embedding for complete text."""
174
  chunks = split_text(text)
175
  chunk_embeddings = []
 
176
 
177
  for chunk in chunks:
178
  try:
@@ -189,18 +189,15 @@ def get_embedding_for_text(text, tokenizer, model):
189
  outputs = model(**inputs)
190
 
191
  embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
192
- chunk_embeddings.append(embedding[0])
193
  except Exception as e:
194
- st.warning(f"Error processing chunk: {str(e)}")
195
  continue
196
 
197
  if chunk_embeddings:
198
- # Convert to numpy array and ensure 2D shape
199
- chunk_embeddings = np.array(chunk_embeddings)
200
- if len(chunk_embeddings.shape) == 1:
201
- chunk_embeddings = chunk_embeddings.reshape(1, -1)
202
- return chunk_embeddings
203
- return np.zeros((1, model.config.hidden_size))
204
 
205
  def format_topics(topic_model, topic_counts):
206
  """Format topics for display."""
@@ -237,6 +234,7 @@ def format_emotions(emotion_counts):
237
 
238
  def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
239
  summaries = []
 
240
 
241
  topic_model_params = {
242
  "language": "arabic",
@@ -253,7 +251,7 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
253
  topic_model_params["nr_topics"] = "auto"
254
 
255
  topic_model = BERTopic(
256
- embedding_model=None, # Changed from bert_model to None
257
  **topic_model_params
258
  )
259
 
@@ -270,26 +268,29 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
270
 
271
  texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
272
  all_emotions = []
 
273
 
274
  # Generate embeddings
275
- embeddings = []
276
  for i, text in enumerate(texts):
277
  try:
278
  embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
279
- if embedding is not None and not np.isnan(embedding).any():
280
  embeddings.append(embedding)
281
-
282
  progress = (i + 1) / len(texts) * 0.4
283
  progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
284
  except Exception as e:
285
  st.warning(f"Error processing poem {i+1} in {country}: {str(e)}")
286
  continue
287
 
288
- # Convert embeddings to numpy array
289
- embeddings = np.array(embeddings)
290
-
 
 
 
 
291
  # Process emotions
292
- for i, text in enumerate(texts):
293
  try:
294
  emotion = classify_emotion(text, emotion_classifier)
295
  all_emotions.append(emotion)
@@ -305,8 +306,7 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
305
  continue
306
 
307
  # Ensure texts and embeddings match
308
- if len(embeddings) != len(texts):
309
- texts = texts[:len(embeddings)]
310
 
311
  # Fit and transform the topic model
312
  topics, probs = topic_model.fit_transform(texts, embeddings)
@@ -329,6 +329,7 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
329
 
330
  return summaries, topic_model
331
 
 
332
  try:
333
  bert_tokenizer, bert_model, emotion_classifier = load_models()
334
  st.success("Models loaded successfully!")
 
170
  return "LABEL_2"
171
 
172
  def get_embedding_for_text(text, tokenizer, model):
 
173
  chunks = split_text(text)
174
  chunk_embeddings = []
175
+ embedding_size = model.config.hidden_size
176
 
177
  for chunk in chunks:
178
  try:
 
189
  outputs = model(**inputs)
190
 
191
  embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
192
+ chunk_embeddings.append(embedding.reshape(-1))
193
  except Exception as e:
 
194
  continue
195
 
196
  if chunk_embeddings:
197
+ # Ensure consistent shape
198
+ final_embedding = np.mean(chunk_embeddings, axis=0)
199
+ return final_embedding.reshape(-1)
200
+ return np.zeros(embedding_size)
 
 
201
 
202
  def format_topics(topic_model, topic_counts):
203
  """Format topics for display."""
 
234
 
235
  def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
236
  summaries = []
237
+ embedding_size = bert_model.config.hidden_size
238
 
239
  topic_model_params = {
240
  "language": "arabic",
 
251
  topic_model_params["nr_topics"] = "auto"
252
 
253
  topic_model = BERTopic(
254
+ embedding_model=None,
255
  **topic_model_params
256
  )
257
 
 
268
 
269
  texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
270
  all_emotions = []
271
+ embeddings = []
272
 
273
  # Generate embeddings
 
274
  for i, text in enumerate(texts):
275
  try:
276
  embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
277
+ if embedding is not None and embedding.shape[0] == embedding_size:
278
  embeddings.append(embedding)
 
279
  progress = (i + 1) / len(texts) * 0.4
280
  progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
281
  except Exception as e:
282
  st.warning(f"Error processing poem {i+1} in {country}: {str(e)}")
283
  continue
284
 
285
+ # Convert to numpy array and ensure 2D shape
286
+ if embeddings:
287
+ embeddings = np.vstack(embeddings)
288
+ else:
289
+ st.warning(f"No valid embeddings generated for {country}")
290
+ continue
291
+
292
  # Process emotions
293
+ for i, text in enumerate(texts[:len(embeddings)]):
294
  try:
295
  emotion = classify_emotion(text, emotion_classifier)
296
  all_emotions.append(emotion)
 
306
  continue
307
 
308
  # Ensure texts and embeddings match
309
+ texts = texts[:len(embeddings)]
 
310
 
311
  # Fit and transform the topic model
312
  topics, probs = topic_model.fit_transform(texts, embeddings)
 
329
 
330
  return summaries, topic_model
331
 
332
+
333
  try:
334
  bert_tokenizer, bert_model, emotion_classifier = load_models()
335
  st.success("Models loaded successfully!")