MarMont committed on
Commit
12bb295
1 Parent(s): 802e30e

separate function for cleaning

Browse files
Files changed (1) hide show
  1. app.py +10 -3
app.py CHANGED
@@ -185,9 +185,7 @@ def get_topic_value(row, i):
185
  except Exception as e:
186
  print(e)
187
 
188
- def full_lda(df):
189
-
190
- print('cleaning')
191
  df.rename(columns = {'tweet':'original_tweets'}, inplace = True)
192
 
193
  # Apply the function above and get tweets free of emoji's
@@ -246,6 +244,13 @@ def full_lda(df):
246
  # Apply tokenizer
247
  df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)
248
 
 
 
 
 
 
 
 
249
  print('base model setup')
250
  # Create a id2word dictionary
251
  global id2word
@@ -532,11 +537,13 @@ def main(dataset, model, progress=gr.Progress(track_tqdm=True)):
532
  print(df)
533
 
534
  if model == 'LDA':
 
535
  print('doing lda')
536
  top_tweets = full_lda(df)
537
  print('done lda')
538
  place_data = 'test'
539
  else:
 
540
  base_bertopic(df)
541
  top_tweets = optimized_bertopic()
542
 
 
185
  except Exception as e:
186
  print(e)
187
 
188
+ def cleaning(df):
 
 
189
  df.rename(columns = {'tweet':'original_tweets'}, inplace = True)
190
 
191
  # Apply the function above and get tweets free of emoji's
 
244
  # Apply tokenizer
245
  df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)
246
 
247
+ return df
248
+
249
+ def full_lda(df):
250
+
251
+ print('cleaning')
252
+
253
+
254
  print('base model setup')
255
  # Create a id2word dictionary
256
  global id2word
 
537
  print(df)
538
 
539
  if model == 'LDA':
540
+ df = cleaning(df)
541
  print('doing lda')
542
  top_tweets = full_lda(df)
543
  print('done lda')
544
  place_data = 'test'
545
  else:
546
+ df = cleaning(df)
547
  base_bertopic(df)
548
  top_tweets = optimized_bertopic()
549