Spaces:

Ahtisham1583
/

Project_KeyExtraction-NLP

Sleeping

Ahtisham1583 committed on Dec 6, 2023

Commit

9465f1d

1 Parent(s): 020842d

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -12,24 +12,24 @@ import nltk
 nltk.download('wordnet')
 # load the dataset
-dataset = pandas.read_csv('authors.csv')
 dataset.head()
 #Fetch wordcount for each abstract
-dataset['word_count'] = dataset['name'].apply(lambda x: len(str(x).split(" ")))
-dataset[['name','word_count']].head()
 ##Descriptive statistics of word counts
 dataset.word_count.describe()
 #Identify common words
-freq = pandas.Series(' '.join(dataset['name'].astype(str)).split()).value_counts()[:20]
-#freq = pandas.Series(' '.join(dataset['name']).split()).value_counts()[:20]
 freq
 #Identify uncommon words
-freq1 = pandas.Series(' '.join(dataset['name'].astype(str)).split()).value_counts()[-20:]
 #freq1 =  pandas.Series(' '.join(dataset
 #         ['name']).split()).value_counts()[-20:]
@@ -69,7 +69,7 @@ print(new_words)
 corpus = []
 for i in range(0, 3847):
     #Remove punctuations
-    text = re.sub('[^a-zA-Z]', ' ', dataset['name'][i])
     #Convert to lowercase
     text = text.lower()
@@ -280,7 +280,7 @@ from gensim.models import word2vec
 tokenized_sentences = [sentence.split() for sentence in corpus]
 model = word2vec.Word2Vec(tokenized_sentences, min_count=1)
-model.wv.most_similar(positive=["jianan"])
 import nltk
 #nltk.download('omw-1.4')

 nltk.download('wordnet')
 # load the dataset
+dataset = pandas.read_csv('covid_abstracts.csv')
 dataset.head()
 #Fetch wordcount for each abstract
+dataset['word_count'] = dataset['title'].apply(lambda x: len(str(x).split(" ")))
+dataset[['title','word_count']].head()
 ##Descriptive statistics of word counts
 dataset.word_count.describe()
 #Identify common words
+freq = pandas.Series(' '.join(dataset['title'].astype(str)).split()).value_counts()[:20]
+#freq = pandas.Series(' '.join(dataset['title']).split()).value_counts()[:20]
 freq
 #Identify uncommon words
+freq1 = pandas.Series(' '.join(dataset['title'].astype(str)).split()).value_counts()[-20:]
 #freq1 =  pandas.Series(' '.join(dataset
 #         ['name']).split()).value_counts()[-20:]
 corpus = []
 for i in range(0, 3847):
     #Remove punctuations
+    text = re.sub('[^a-zA-Z]', ' ', dataset['title'][i])
     #Convert to lowercase
     text = text.lower()
 tokenized_sentences = [sentence.split() for sentence in corpus]
 model = word2vec.Word2Vec(tokenized_sentences, min_count=1)
+model.wv.most_similar(positive=["incidence"])
 import nltk
 #nltk.download('omw-1.4')