Commit
·
9465f1d
1
Parent(s):
020842d
Update app.py
Browse files
app.py
CHANGED
@@ -12,24 +12,24 @@ import nltk
|
|
12 |
nltk.download('wordnet')
|
13 |
|
14 |
# load the dataset
|
15 |
-
dataset = pandas.read_csv('
|
16 |
dataset.head()
|
17 |
|
18 |
#Fetch wordcount for each abstract
|
19 |
-
dataset['word_count'] = dataset['
|
20 |
-
dataset[['
|
21 |
|
22 |
##Descriptive statistics of word counts
|
23 |
dataset.word_count.describe()
|
24 |
|
25 |
#Identify common words
|
26 |
-
freq = pandas.Series(' '.join(dataset['
|
27 |
|
28 |
-
#freq = pandas.Series(' '.join(dataset['
|
29 |
freq
|
30 |
|
31 |
#Identify uncommon words
|
32 |
-
freq1 = pandas.Series(' '.join(dataset['
|
33 |
|
34 |
#freq1 = pandas.Series(' '.join(dataset
|
35 |
# ['name']).split()).value_counts()[-20:]
|
@@ -69,7 +69,7 @@ print(new_words)
|
|
69 |
corpus = []
|
70 |
for i in range(0, 3847):
|
71 |
#Remove punctuations
|
72 |
-
text = re.sub('[^a-zA-Z]', ' ', dataset['
|
73 |
|
74 |
#Convert to lowercase
|
75 |
text = text.lower()
|
@@ -280,7 +280,7 @@ from gensim.models import word2vec
|
|
280 |
tokenized_sentences = [sentence.split() for sentence in corpus]
|
281 |
model = word2vec.Word2Vec(tokenized_sentences, min_count=1)
|
282 |
|
283 |
-
model.wv.most_similar(positive=["
|
284 |
|
285 |
import nltk
|
286 |
#nltk.download('omw-1.4')
|
|
|
12 |
nltk.download('wordnet')
|
13 |
|
14 |
# load the dataset
|
15 |
+
dataset = pandas.read_csv('covid_abstracts.csv')
|
16 |
dataset.head()
|
17 |
|
18 |
#Fetch wordcount for each abstract
|
19 |
+
dataset['word_count'] = dataset['title'].apply(lambda x: len(str(x).split(" ")))
|
20 |
+
dataset[['title','word_count']].head()
|
21 |
|
22 |
##Descriptive statistics of word counts
|
23 |
dataset.word_count.describe()
|
24 |
|
25 |
#Identify common words
|
26 |
+
freq = pandas.Series(' '.join(dataset['title'].astype(str)).split()).value_counts()[:20]
|
27 |
|
28 |
+
#freq = pandas.Series(' '.join(dataset['title']).split()).value_counts()[:20]
|
29 |
freq
|
30 |
|
31 |
#Identify uncommon words
|
32 |
+
freq1 = pandas.Series(' '.join(dataset['title'].astype(str)).split()).value_counts()[-20:]
|
33 |
|
34 |
#freq1 = pandas.Series(' '.join(dataset
|
35 |
# ['name']).split()).value_counts()[-20:]
|
|
|
69 |
corpus = []
|
70 |
for i in range(0, 3847):
|
71 |
#Remove punctuations
|
72 |
+
text = re.sub('[^a-zA-Z]', ' ', dataset['title'][i])
|
73 |
|
74 |
#Convert to lowercase
|
75 |
text = text.lower()
|
|
|
280 |
tokenized_sentences = [sentence.split() for sentence in corpus]
|
281 |
model = word2vec.Word2Vec(tokenized_sentences, min_count=1)
|
282 |
|
283 |
+
model.wv.most_similar(positive=["incidence"])
|
284 |
|
285 |
import nltk
|
286 |
#nltk.download('omw-1.4')
|