faizhalas committed on
Commit
1e9dacc
·
verified ·
1 Parent(s): e90198a

Update pages/1 Scattertext.py

Browse files
Files changed (1) hide show
  1. pages/1 Scattertext.py +7 -4
pages/1 Scattertext.py CHANGED
@@ -1,4 +1,5 @@
1
  import streamlit as st
 
2
  import scattertext as stx
3
  import pandas as pd
4
  import re
@@ -117,7 +118,7 @@ def clean_csv(extype):
117
 
118
  #===stopword removal===
119
  stop = stopwords.words('english')
120
- paper[ColCho].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
121
 
122
  #===lemmatize===
123
  lemmatizer = WordNetLemmatizer()
@@ -125,7 +126,7 @@ def clean_csv(extype):
125
  words = text.split()
126
  words = [lemmatizer.lemmatize(word) for word in words]
127
  return ' '.join(words)
128
- paper[ColCho].apply(lemmatize_words)
129
 
130
  words_rmv = [word.strip() for word in words_to_remove.split(";")]
131
  remove_set = set(words_rmv)
@@ -178,7 +179,7 @@ def running_scattertext(cat_col, catname, noncatname):
178
  st.toast('Process completed', icon='🎉')
179
  time.sleep(1)
180
  st.toast('Visualizing', icon='⏳')
181
- st.components.v1.html(html, height = 1200, scrolling = True)
182
 
183
  except ValueError:
184
  st.warning('Please decrease the Minimum term count in the advanced settings.', icon="⚠️")
@@ -191,11 +192,13 @@ def df_w2w(search_terms1, search_terms2):
191
  for term in search_terms1:
192
  dfs1 = pd.concat([dfs1, paper[paper[selected_col[0]].str.contains(r'\b' + term + r'\b', case=False, na=False)]], ignore_index=True)
193
  dfs1['Topic'] = 'First Term'
 
194
 
195
  dfs2 = pd.DataFrame()
196
  for term in search_terms2:
197
  dfs2 = pd.concat([dfs2, paper[paper[selected_col[0]].str.contains(r'\b' + term + r'\b', case=False, na=False)]], ignore_index=True)
198
  dfs2['Topic'] = 'Second Term'
 
199
  filtered_df = pd.concat([dfs1, dfs2], ignore_index=True)
200
 
201
  return dfs1, dfs2, filtered_df
@@ -350,7 +353,7 @@ if uploaded_file is not None:
350
  st.write('You only have data in ', (MAX))
351
 
352
  with tab2:
353
- st.markdown('**Kessler, J.S. (2017). Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ.** https://doi.org/10.48550/arXiv.1703.00565')
354
 
355
  with tab3:
356
  st.markdown('**Marrone, M., & Linnenluecke, M.K. (2020). Interdisciplinary Research Maps: A new technique for visualizing research topics. PLoS ONE, 15.** https://doi.org/10.1371/journal.pone.0242283')
 
1
  import streamlit as st
2
+ import streamlit.components.v1 as components
3
  import scattertext as stx
4
  import pandas as pd
5
  import re
 
118
 
119
  #===stopword removal===
120
  stop = stopwords.words('english')
121
+ paper[ColCho] = paper[ColCho].apply(lambda x: ' '.join([word for word in x.split() if word not in stop]))
122
 
123
  #===lemmatize===
124
  lemmatizer = WordNetLemmatizer()
 
126
  words = text.split()
127
  words = [lemmatizer.lemmatize(word) for word in words]
128
  return ' '.join(words)
129
+ paper[ColCho] = paper[ColCho].apply(lemmatize_words)
130
 
131
  words_rmv = [word.strip() for word in words_to_remove.split(";")]
132
  remove_set = set(words_rmv)
 
179
  st.toast('Process completed', icon='🎉')
180
  time.sleep(1)
181
  st.toast('Visualizing', icon='⏳')
182
+ components.html(html, height = 1200, scrolling = True)
183
 
184
  except ValueError:
185
  st.warning('Please decrease the Minimum term count in the advanced settings.', icon="⚠️")
 
192
  for term in search_terms1:
193
  dfs1 = pd.concat([dfs1, paper[paper[selected_col[0]].str.contains(r'\b' + term + r'\b', case=False, na=False)]], ignore_index=True)
194
  dfs1['Topic'] = 'First Term'
195
+ dfs1 = dfs1.drop_duplicates()
196
 
197
  dfs2 = pd.DataFrame()
198
  for term in search_terms2:
199
  dfs2 = pd.concat([dfs2, paper[paper[selected_col[0]].str.contains(r'\b' + term + r'\b', case=False, na=False)]], ignore_index=True)
200
  dfs2['Topic'] = 'Second Term'
201
+ dfs2 = dfs2.drop_duplicates()
202
  filtered_df = pd.concat([dfs1, dfs2], ignore_index=True)
203
 
204
  return dfs1, dfs2, filtered_df
 
353
  st.write('You only have data in ', (MAX))
354
 
355
  with tab2:
356
+ st.markdown('**Jason Kessler. 2017. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. In Proceedings of ACL 2017, System Demonstrations, pages 85–90, Vancouver, Canada. Association for Computational Linguistics.** https://doi.org/10.48550/arXiv.1703.00565')
357
 
358
  with tab3:
359
  st.markdown('**Marrone, M., & Linnenluecke, M.K. (2020). Interdisciplinary Research Maps: A new technique for visualizing research topics. PLoS ONE, 15.** https://doi.org/10.1371/journal.pone.0242283')