|
|
|
"""Project_KeyExtraction-NLP.ipynb |
|
|
|
Automatically generated by Colaboratory. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1adCS5In25XQnEQ53D2H9CjaX7jL9yz6Z |
|
""" |
|
|
|
import pandas |
|
import nltk |
|
nltk.download('wordnet') |
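# Load the dataset of COVID-19 abstracts (expects covid_abstracts.csv with a 'title' column).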
|
|
|
|
|
dataset = pandas.read_csv('covid_abstracts.csv') |
|
dataset.head() |
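# Add a word count per title as a quick size indicator.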
|
|
|
|
|
dataset['word_count'] = dataset['title'].apply(lambda x: len(str(x).split(" "))) |
|
dataset[['title','word_count']].head() |
|
|
|
|
|
dataset.word_count.describe() |
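# The 20 most frequent raw tokens in the titles (before any cleaning).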
|
|
|
|
|
freq = pandas.Series(' '.join(dataset['title'].astype(str)).split()).value_counts()[:20] |
|
|
|
|
|
freq |
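# The 20 least frequent raw tokens.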
|
|
|
|
|
freq1 = pandas.Series(' '.join(dataset['title'].astype(str)).split()).value_counts()[-20:] |
|
|
|
|
|
|
|
freq1 |
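# Quick comparison of stemming vs. lemmatization on a sample term.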
|
|
|
from nltk.stem.porter import PorterStemmer |
|
from nltk.stem.wordnet import WordNetLemmatizer |
|
lem = WordNetLemmatizer() |
|
stem = PorterStemmer() |
|
word = "cryptogenic" |
|
print("stemming:",stem.stem(word)) |
|
print("lemmatization:", lem.lemmatize(word, "v")) |
|
|
|
# The WordNet corpus was already downloaded above; recent NLTK releases may also
# need the Open Multilingual WordNet for WordNetLemmatizer.
nltk.download('omw-1.4')
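# Preprocessing setup: cleaning regexes, NLTK stop words (plus domain-specific additions),
# and the WordNet lemmatizer.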
|
|
|
|
|
import re |
|
import nltk |
|
nltk.download('stopwords') |
|
from nltk.corpus import stopwords |
|
from nltk.stem.porter import PorterStemmer |
|
from nltk.tokenize import RegexpTokenizer |
|
|
|
from nltk.stem.wordnet import WordNetLemmatizer |
|
|
|
|
|
stop_words = set(stopwords.words("english")) |
|
|
|
new_words = ["using", "show", "result", "large", "also", "iv", "one", "two", "new", "previously", "shown"] |
|
stop_words = stop_words.union(new_words) |
|
|
|
print(stop_words) |
|
|
|
print(new_words) |
|
|
|
# Clean every title: keep letters only, lowercase, strip leftover tag-like and
# digit/non-word runs, then drop stop words and lemmatize what remains.
lem = WordNetLemmatizer()
corpus = []
for i in range(len(dataset)):
    # Keep letters only
    text = re.sub('[^a-zA-Z]', ' ', str(dataset['title'][i]))
    text = text.lower()

    # Remove any remaining markup-like tags and digit/non-word runs
    text = re.sub("</?.*?>", " <> ", text)
    text = re.sub("(\\d|\\W)+", " ", text)

    # Tokenize, remove stop words, and lemmatize
    text = text.split()
    text = [lem.lemmatize(word) for word in text if word not in stop_words]
    text = " ".join(text)
    corpus.append(text)
|
|
|
|
|
corpus[222] |
|
|
|
|
|
corpus[300] |
|
|
|
|
|
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Word cloud of the 100 most frequent terms in the cleaned corpus
wordcloud = WordCloud(
    background_color='white',
    stopwords=stop_words,
    max_words=100,
    max_font_size=50,
    random_state=42
).generate(" ".join(corpus))

fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
fig.savefig("word1.png", dpi=900)
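# Bag-of-words candidates: CountVectorizer over uni-, bi- and tri-grams, first with
# the built-in English stop-word list and then with a custom list.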
|
from sklearn.feature_extraction.text import CountVectorizer |
|
import re |
|
|
|
|
|
|
|
|
|
|
|
cv = CountVectorizer(max_df=0.8, stop_words='english', max_features=10000, ngram_range=(1, 3)) |
|
X = cv.fit_transform(corpus) |
|
|
|
|
|
custom_stop_words = ['same', 'hers', 'they', 'with', 'if', 'y', 'iv', 'new', ...] |
|
cv = CountVectorizer(max_df=0.8, stop_words=custom_stop_words, max_features=10000, ngram_range=(1, 3)) |
|
X = cv.fit_transform(corpus) |
|
|
|
|
|
|
|
|
|
|
|
|
|
from sklearn.feature_extraction.text import CountVectorizer |
|
|
|
cv = CountVectorizer(max_df=0.8, stop_words='english', max_features=10000, ngram_range=(1,3)) |
|
X = cv.fit_transform(corpus) |
|
|
|
custom_stop_words = ['from', 'to', 'against', 'each', 'own', ...] |
|
cv = CountVectorizer(max_df=0.8, stop_words=custom_stop_words, max_features=10000, ngram_range=(1,3)) |
|
X = cv.fit_transform(corpus) |
|
|
|
list(cv.vocabulary_.keys())[:10] |
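# Most frequent single words across the cleaned corpus, plotted below with seaborn.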
|
|
|
|
|
def get_top_n_words(corpus, n=None):
    """Return the n most frequent unigrams in the corpus with their counts."""
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
|
|
|
top_words = get_top_n_words(corpus, n=20) |
|
top_df = pandas.DataFrame(top_words) |
|
top_df.columns=["Word", "Freq"] |
|
|
|
import seaborn as sns |
|
sns.set(rc={'figure.figsize':(13,8)}) |
|
g = sns.barplot(x="Word", y="Freq", data=top_df) |
|
g.set_xticklabels(g.get_xticklabels(), rotation=30) |
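# Most frequent bi-grams.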
|
|
|
|
|
def get_top_n2_words(corpus, n=None):
    vec1 = CountVectorizer(ngram_range=(2, 2), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
|
top2_words = get_top_n2_words(corpus, n=20) |
|
top2_df = pandas.DataFrame(top2_words) |
|
top2_df.columns=["Bi-gram", "Freq"] |
|
print(top2_df) |
|
|
|
import seaborn as sns |
|
sns.set(rc={'figure.figsize':(13,8)}) |
|
h=sns.barplot(x="Bi-gram", y="Freq", data=top2_df) |
|
h.set_xticklabels(h.get_xticklabels(), rotation=45) |
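# Most frequent tri-grams.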
|
|
|
|
|
def get_top_n3_words(corpus, n=None):
    vec1 = CountVectorizer(ngram_range=(3, 3), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
|
top3_words = get_top_n3_words(corpus, n=20) |
|
top3_df = pandas.DataFrame(top3_words) |
|
top3_df.columns=["Tri-gram", "Freq"] |
|
print(top3_df) |
|
|
|
import seaborn as sns |
|
sns.set(rc={'figure.figsize':(13,8)}) |
|
j=sns.barplot(x="Tri-gram", y="Freq", data=top3_df) |
|
j.set_xticklabels(j.get_xticklabels(), rotation=45) |
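# TF-IDF weighting on top of the count matrix; each document's terms can then be
# ranked to extract its keywords.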
|
|
|
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer |
|
|
|
|
|
|
|
|
|
cv = CountVectorizer(max_df=0.8, stop_words='english', max_features=10000, ngram_range=(1, 3)) |
|
|
|
|
|
X = cv.fit_transform(corpus) |
|
|
|
|
|
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True) |
|
tfidf_transformer.fit(X) |
|
|
|
|
|
feature_names = cv.get_feature_names_out() |
|
|
|
|
|
doc = corpus[82] |
|
|
|
|
|
tf_idf_vector = tfidf_transformer.transform(cv.transform([doc])) |
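# Helpers to sort the sparse TF-IDF vector and return the top-n terms with their scores.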
|
|
|
|
|
|
|
|
|
from scipy.sparse import coo_matrix |
|
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
|
|
|
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""

    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    for idx, score in sorted_items:
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]

    return results
|
|
|
sorted_items=sort_coo(tf_idf_vector.tocoo()) |
|
|
|
keywords=extract_topn_from_vector(feature_names,sorted_items,10) |
|
|
|
|
|
print("\nAbstract:") |
|
print(doc) |
|
print("\nKeywords:") |
|
for k in keywords:
    print(k, keywords[k])
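# Train a small Word2Vec model on the tokenized titles and look at the terms most
# similar to "incidence".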
|
|
|
from gensim.models import word2vec |
|
tokenized_sentences = [sentence.split() for sentence in corpus] |
|
model = word2vec.Word2Vec(tokenized_sentences, min_count=1) |
|
|
|
model.wv.most_similar(positive=["incidence"]) |
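# WordNet exploration: synsets and definitions for a sample word.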
|
|
|
import nltk |
|
|
|
from nltk.corpus import wordnet as wn |
|
|
|
wn.synsets('car') |
|
|
|
wn.synset('car.n.01').definition() |
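# Small Gradio demo: type a word, get the definition of its first WordNet synset.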
|
import gradio as gr |
|
from nltk.corpus import wordnet as wn |
|
|
|
|
|
def get_synset_definition(word):
    synsets = wn.synsets(word)
    if synsets:
        first_synset = synsets[0]
        return first_synset.definition()
    else:
        return "No synsets found for the given word."
|
|
|
|
|
iface = gr.Interface(
    fn=get_synset_definition,
    inputs=gr.Textbox(),
    outputs=gr.Textbox(),
    live=True,
    title="WordNet Synset Definition",
    description="Enter a word to get the definition of its first WordNet synset.",
)
|
|
|
|
|
iface.launch() |
|
|
|
|