Ahtisham1583 committed
Commit 9465f1d · Parent(s): 020842d

Update app.py

Files changed (1): app.py (+8, -8)
app.py CHANGED
@@ -12,24 +12,24 @@ import nltk
 nltk.download('wordnet')
 
 # load the dataset
-dataset = pandas.read_csv('authors.csv')
+dataset = pandas.read_csv('covid_abstracts.csv')
 dataset.head()
 
 #Fetch wordcount for each abstract
-dataset['word_count'] = dataset['name'].apply(lambda x: len(str(x).split(" ")))
-dataset[['name','word_count']].head()
+dataset['word_count'] = dataset['title'].apply(lambda x: len(str(x).split(" ")))
+dataset[['title','word_count']].head()
 
 ##Descriptive statistics of word counts
 dataset.word_count.describe()
 
 #Identify common words
-freq = pandas.Series(' '.join(dataset['name'].astype(str)).split()).value_counts()[:20]
+freq = pandas.Series(' '.join(dataset['title'].astype(str)).split()).value_counts()[:20]
 
-#freq = pandas.Series(' '.join(dataset['name']).split()).value_counts()[:20]
+#freq = pandas.Series(' '.join(dataset['title']).split()).value_counts()[:20]
 freq
 
 #Identify uncommon words
-freq1 = pandas.Series(' '.join(dataset['name'].astype(str)).split()).value_counts()[-20:]
+freq1 = pandas.Series(' '.join(dataset['title'].astype(str)).split()).value_counts()[-20:]
 
 #freq1 = pandas.Series(' '.join(dataset
 # ['name']).split()).value_counts()[-20:]
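
Note on the .astype(str) cast kept in the new freq/freq1 lines: read_csv turns missing titles into NaN floats, and ' '.join() refuses non-strings, which is why the commented-out variants without the cast fail. A minimal sketch in Python, on toy data since the covid_abstracts.csv schema is assumed rather than verified:

import pandas

# Toy frame with one missing title, mimicking what read_csv yields for blanks.
dataset = pandas.DataFrame({'title': ['covid incidence rates', float('nan')]})

try:
    ' '.join(dataset['title'])            # TypeError: NaN is a float, not a str
except TypeError as e:
    print('raw join fails:', e)

# Casting first maps NaN to the literal string 'nan', so the join succeeds.
freq = pandas.Series(' '.join(dataset['title'].astype(str)).split()).value_counts()
print(freq.head())

The trade-off is that 'nan' then shows up as a token in the counts; dropping NaN rows first would avoid that.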
@@ -69,7 +69,7 @@ print(new_words)
 corpus = []
 for i in range(0, 3847):
     #Remove punctuations
-    text = re.sub('[^a-zA-Z]', ' ', dataset['name'][i])
+    text = re.sub('[^a-zA-Z]', ' ', dataset['title'][i])
 
     #Convert to lowercase
     text = text.lower()
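
Note on the loop this hunk touches: range(0, 3847) hard-codes what is presumably the row count of covid_abstracts.csv, so the rename works only while that number matches the file. A hedged sketch of the same cleaning step tied to the frame's actual length (toy data, and only the two steps shown in the diff):

import re
import pandas

dataset = pandas.DataFrame({'title': ['COVID-19 incidence!', 'Vaccine efficacy?']})

corpus = []
for i in range(len(dataset)):                  # instead of range(0, 3847)
    #Remove punctuations; str() also guards against NaN titles
    text = re.sub('[^a-zA-Z]', ' ', str(dataset['title'][i]))
    #Convert to lowercase
    text = text.lower()
    corpus.append(text)

print(corpus)   # ['covid    incidence ', 'vaccine efficacy ']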
@@ -280,7 +280,7 @@ from gensim.models import word2vec
 tokenized_sentences = [sentence.split() for sentence in corpus]
 model = word2vec.Word2Vec(tokenized_sentences, min_count=1)
 
-model.wv.most_similar(positive=["jianan"])
+model.wv.most_similar(positive=["incidence"])
 
 import nltk
 #nltk.download('omw-1.4')
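
Note on the new most_similar query: min_count=1 keeps every token, so "incidence" survives into the vocabulary here, but most_similar raises a KeyError for any out-of-vocabulary word. A guarded version of the call, as a sketch on a toy corpus (gensim 4.x API assumed):

from gensim.models import word2vec

corpus = ['covid incidence rates rose', 'incidence of infection fell']
tokenized_sentences = [sentence.split() for sentence in corpus]
model = word2vec.Word2Vec(tokenized_sentences, min_count=1)

token = 'incidence'
if token in model.wv:                       # membership check avoids KeyError
    print(model.wv.most_similar(positive=[token]))
else:
    print(repr(token), 'not in vocabulary')

On a real corpus, min_count=1 also inflates the vocabulary with one-off typos; gensim's default of 5 is usually the safer starting point.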
 