kambris committed on
Commit
3f0f6de
·
verified ·
1 Parent(s): 6f973fa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -53
app.py CHANGED
@@ -1,50 +1,43 @@
1
  import streamlit as st
2
  import pandas as pd
3
- from transformers import T5Tokenizer, T5ForConditionalGeneration, BertTokenizer, BertModel
4
  from bertopic import BERTopic
5
  import torch
6
- import numpy as np
7
 
8
- # Initialize ARAT5 model and tokenizer for topic modeling
9
- tokenizer = T5Tokenizer.from_pretrained("UBC-NLP/araT5-base")
10
- model = T5ForConditionalGeneration.from_pretrained("UBC-NLP/araT5-base")
11
 
12
- # Initialize BERT tokenizer and model for feature extraction
13
- bert_tokenizer = BertTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
14
- bert_model = BertModel.from_pretrained("aubmindlab/bert-base-arabertv2")
15
 
16
- # Function to get embeddings from ARAT5 for topic modeling
17
  def generate_embeddings(texts):
18
- embeddings = []
19
-
20
- for text in texts:
21
- # Tokenize the text with truncation set to False
22
- # We are using the BertTokenizer directly without using the pipeline
23
- tokens = bert_tokenizer.encode(text, add_special_tokens=True, truncation=False, padding=False)
24
-
25
- # Split the tokens into chunks of size 512 (maximum length)
26
- chunked_texts = [tokens[i:i + 512] for i in range(0, len(tokens), 512)]
27
-
28
- poem_embeddings = []
29
-
30
- for chunk in chunked_texts:
31
- # Convert the chunk to a tensor and prepare the input for BERT model
32
- inputs = torch.tensor(chunk).unsqueeze(0) # Adding batch dimension
33
- with torch.no_grad():
34
- outputs = bert_model(inputs)
35
- # Get the embeddings from the last hidden state (mean of all token embeddings)
36
- chunk_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
37
 
38
- poem_embeddings.append(chunk_embedding)
 
 
 
 
39
 
40
- # Average the embeddings of all chunks (optional, can also concatenate them)
41
- final_embedding = np.mean(np.array(poem_embeddings), axis=0)
42
- embeddings.append(final_embedding)
 
 
 
 
 
43
 
 
44
  return embeddings
45
 
46
- # Function to process the CSV or Excel file
47
- def process_file(uploaded_file):
48
  # Determine the file type
49
  if uploaded_file.name.endswith(".csv"):
50
  df = pd.read_csv(uploaded_file)
@@ -52,39 +45,75 @@ def process_file(uploaded_file):
52
  df = pd.read_excel(uploaded_file)
53
  else:
54
  st.error("Unsupported file format.")
55
- return None
56
 
57
  # Validate required columns
58
  required_columns = ['country', 'poem']
59
  missing_columns = [col for col in required_columns if col not in df.columns]
60
  if missing_columns:
61
  st.error(f"Missing columns: {', '.join(missing_columns)}")
62
- return None
63
-
64
- # Process the file
 
65
  df = df.dropna(subset=['country', 'poem'])
66
-
67
- texts = df['poem'].dropna().tolist()
68
 
69
- # Generate embeddings for all poems
70
- embeddings = generate_embeddings(texts)
71
-
72
- # Perform topic modeling with BERTopic
73
  topic_model = BERTopic()
74
- topics, _ = topic_model.fit_transform(embeddings)
75
- df['topic'] = topics
76
-
77
- return df
 
 
 
 
 
78
 
79
- # Streamlit App
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  st.title("Arabic Poem Topic Modeling & Emotion Classification")
 
 
81
  uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
82
 
83
  if uploaded_file is not None:
84
  try:
85
- result_df = process_file(uploaded_file)
86
- if result_df is not None:
87
- st.write("Data successfully processed!")
88
- st.write(result_df.head())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  except Exception as e:
90
  st.error(f"Error: {e}")
 
1
  import streamlit as st
2
  import pandas as pd
3
+ from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
4
  from bertopic import BERTopic
5
  import torch
6
+ from collections import Counter
7
 
8
# Load the AraBERT tokenizer and models once and reuse them across reruns.
# Streamlit re-executes this script on every widget interaction, so without
# caching each interaction would reload every model from scratch.
@st.cache_resource
def _load_models():
    """Load the tokenizer, embedding model and emotion classifier.

    Returns:
        A ``(tokenizer, embedding_model, classification_model, classifier)``
        tuple; ``classifier`` is a ``text-classification`` pipeline built
        from ``classification_model`` and ``tokenizer``.
    """
    checkpoint = "aubmindlab/bert-base-arabertv2"

    # Tokenizer and encoder used for embeddings
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    embedding_model = AutoModel.from_pretrained(checkpoint)

    # Model used for emotion classification.
    # NOTE(review): this is the same base checkpoint used for embeddings.
    # Unless it ships a fine-tuned classification head, the predicted labels
    # are not trained emotion categories -- confirm, or swap in a checkpoint
    # fine-tuned for emotion classification.
    classification_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    classifier = pipeline("text-classification", model=classification_model, tokenizer=tokenizer)
    return tokenizer, embedding_model, classification_model, classifier


bert_tokenizer, bert_model, emotion_model, emotion_classifier = _load_models()
15
 
16
# Function to generate embeddings using AraBERT
def generate_embeddings(texts, max_length=512):
    """Embed each text with AraBERT, returning one vector per input text.

    Each text is tokenized on its own (so no padding tokens are ever fed to
    the model), split into chunks that fit within ``max_length`` once the
    tokenizer's special tokens are added, and every chunk is wrapped with
    those special tokens before being passed to the model. A chunk vector is
    the mean of the chunk's last hidden states; a text vector is the mean of
    its chunk vectors.

    Args:
        texts: Iterable of texts to embed; each item is converted with
            ``str()`` before tokenization.
        max_length: Maximum number of token ids (special tokens included)
            passed to the model in one forward call.

    Returns:
        A list with exactly one NumPy array of shape ``(1, hidden_size)`` per
        input text, in input order.

    Raises:
        ValueError: If ``max_length`` leaves no room for text tokens.
    """
    # Leave room for the special tokens wrapped around every chunk.
    body_len = max_length - bert_tokenizer.num_special_tokens_to_add(pair=False)
    if body_len <= 0:
        raise ValueError("max_length is too small to hold the tokenizer's special tokens")

    embeddings = []
    for text in texts:
        token_ids = bert_tokenizer.encode(str(text), add_special_tokens=False)

        # An empty text still produces one (special-tokens-only) chunk so the
        # output stays aligned one-to-one with the input.
        chunk_vectors = []
        for start in range(0, max(len(token_ids), 1), body_len):
            chunk_ids = bert_tokenizer.build_inputs_with_special_tokens(
                token_ids[start:start + body_len]
            )
            input_tensor = torch.tensor([chunk_ids])  # batch of one chunk
            with torch.no_grad():
                outputs = bert_model(input_ids=input_tensor)
            chunk_vectors.append(outputs.last_hidden_state.mean(dim=1))

        # Average the chunk vectors so each text yields a single embedding.
        text_vector = torch.cat(chunk_vectors, dim=0).mean(dim=0, keepdim=True)
        embeddings.append(text_vector.numpy())

    return embeddings
38
 
39
# Function to process the uploaded file and summarize by country
def process_and_summarize(uploaded_file, top_n=50):
    """Load an uploaded poem file and summarize topics and emotions per country.

    The file must provide ``country`` and ``poem`` columns. One BERTopic
    model is fitted on every poem in the file, so the topic ids reported for
    each country refer to the same topics as the returned model.

    Args:
        uploaded_file: Upload object exposing ``name``; the extension selects
            the parser (``.csv`` or ``.xlsx``).
        top_n: Maximum number of topics and of emotions listed per country.

    Returns:
        ``(summaries, topic_model)``, where ``summaries`` is a list of dicts
        with keys ``country``, ``total_poems``, ``top_topics`` and
        ``top_emotions`` (the last two are ``Counter.most_common`` lists),
        or ``(None, None)`` after a problem was reported with ``st.error``.

    Raises:
        ValueError: If ``generate_embeddings`` does not return exactly one
            embedding per poem.
    """
    import numpy as np  # local import: this module does not import numpy

    # Determine the file type
    file_name = uploaded_file.name.lower()
    if file_name.endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    elif file_name.endswith(".xlsx"):
        df = pd.read_excel(uploaded_file)
    else:
        st.error("Unsupported file format.")
        return None, None

    # Validate required columns
    required_columns = ['country', 'poem']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        st.error(f"Missing columns: {', '.join(missing_columns)}")
        return None, None

    # Drop incomplete rows first, then normalize: `.str` accessors fail on
    # columns that are not string-typed (e.g. numeric or all-NaN columns).
    df = df.dropna(subset=required_columns).copy()
    df['country'] = df['country'].astype(str).str.strip()
    df['poem'] = df['poem'].astype(str)
    if df.empty:
        st.error("No rows with both a country and a poem were found.")
        return None, None

    # Fit a single topic model on all poems so topic ids are comparable
    # across countries and match the model handed back to the caller.
    st.info("Generating embeddings and topics for all poems...")
    texts = df['poem'].tolist()
    raw_embeddings = generate_embeddings(texts)
    embedding_matrix = np.asarray(raw_embeddings).reshape(len(raw_embeddings), -1)
    if embedding_matrix.shape[0] != len(texts):
        raise ValueError(
            f"Expected {len(texts)} embeddings (one per poem), "
            f"got {embedding_matrix.shape[0]}"
        )

    topic_model = BERTopic()
    # BERTopic expects the documents first; precomputed vectors go in
    # through the `embeddings` keyword.
    topics, _ = topic_model.fit_transform(texts, embeddings=embedding_matrix)
    df['topic'] = list(topics)

    # Group by country
    summaries = []
    for country, group in df.groupby('country'):
        st.info(f"Processing poems for {country}...")
        poems = group['poem'].tolist()

        # Classify emotions; truncate so long poems fit the model's input.
        st.info(f"Classifying emotions for {country}...")
        emotions = [
            emotion_classifier(poem, truncation=True, max_length=512)[0]['label']
            for poem in poems
        ]

        # Aggregate topics and emotions
        summaries.append({
            'country': country,
            'total_poems': len(poems),
            'top_topics': Counter(group['topic'].tolist()).most_common(top_n),
            'top_emotions': Counter(emotions).most_common(top_n),
        })

    return summaries, topic_model
91
+
92
# Streamlit App Interface
st.title("Arabic Poem Topic Modeling & Emotion Classification")
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")

uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])


def _render_country_summary(entry, limit):
    """Write one country's poem count, top topics and top emotions to the page."""
    st.write(f"### {entry['country']}")
    st.write(f"Total Poems: {entry['total_poems']}")
    st.write(f"Top {limit} Topics:")
    st.write(entry['top_topics'])
    st.write(f"Top {limit} Emotions:")
    st.write(entry['top_emotions'])


# Nothing is processed until a file has been uploaded.
if uploaded_file is not None:
    try:
        top_n = st.number_input(
            "Select the number of top topics/emotions to display:",
            min_value=1,
            max_value=100,
            value=50,
        )

        summaries, topic_model = process_and_summarize(uploaded_file, top_n=top_n)
        if summaries is not None:
            st.success("Data successfully processed!")

            # Display summary for each country
            for country_summary in summaries:
                _render_country_summary(country_summary, top_n)

            # Display overall topics
            st.write("### Global Topic Information:")
            st.write(topic_model.get_topic_info())
    except Exception as exc:
        # Top-level boundary: show any failure in the page instead of a traceback.
        st.error(f"Error: {exc}")