Ahtisham1583 committed
Commit c4c0fad · 1 Parent(s): 0243f95

Update app.py

Files changed (1)
  1. app.py +271 -0
app.py CHANGED
@@ -1,3 +1,273 @@
+ # -*- coding: utf-8 -*-
+ """Project_KeyExtraction-NLP.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1adCS5In25XQnEQ53D2H9CjaX7jL9yz6Z
+ """
+
+ import pandas
+ # load the dataset
+ dataset = pandas.read_csv('/content/drive/MyDrive/dataset/authors.csv')
+ dataset.head()
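+ # Note: the CSV path above is a Colab/Google Drive mount from the original notebook; it is
+ # expected to point at an authors.csv with a 'name' column and must be adjusted when run elsewhere.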
+
+ # Fetch word count for each author name
+ dataset['word_count'] = dataset['name'].apply(lambda x: len(str(x).split(" ")))
+ dataset[['name','word_count']].head()
+
+ ## Descriptive statistics of word counts
+ dataset.word_count.describe()
+
+ # Identify common words
+ freq = pandas.Series(' '.join(dataset['name']).split()).value_counts()[:20]
+ freq
+
+ # Identify uncommon words
+ freq1 = pandas.Series(' '.join(dataset['name']).split()).value_counts()[-20:]
+ freq1
+
+ from nltk.stem.porter import PorterStemmer
+ from nltk.stem.wordnet import WordNetLemmatizer
+ lem = WordNetLemmatizer()
+ stem = PorterStemmer()
+ word = "cryptogenic"
+ print("stemming:", stem.stem(word))
+ print("lemmatization:", lem.lemmatize(word, "v"))
+
+ import nltk
+ nltk.download('wordnet')
+
+ # Libraries for text preprocessing
+ import re
+ import nltk
+ nltk.download('stopwords')
+ from nltk.corpus import stopwords
+ from nltk.stem.porter import PorterStemmer
+ from nltk.tokenize import RegexpTokenizer
+ # nltk.download('wordnet')
+ from nltk.stem.wordnet import WordNetLemmatizer
+
+ ## Creating a list of stop words and adding custom stopwords
+ stop_words = set(stopwords.words("english"))
+ ## Creating a list of custom stopwords
+ new_words = ["using", "show", "result", "large", "also", "iv", "one", "two", "new", "previously", "shown"]
+ stop_words = stop_words.union(new_words)
+
+ print(stop_words)
+
+ print(new_words)
+
+ corpus = []
+ for i in range(len(dataset)):  # one cleaned string per row (3,847 rows in the original notebook)
+     # Remove punctuation
+     text = re.sub('[^a-zA-Z]', ' ', dataset['name'][i])
+
+     # Convert to lowercase
+     text = text.lower()
+
+     # Remove tags
+     text = re.sub("</?.*?>", " <> ", text)
+
+     # Remove special characters and digits
+     text = re.sub("(\\d|\\W)+", " ", text)
+
+     # Convert from string to list
+     text = text.split()
+
+     # Stemming
+     ps = PorterStemmer()
+     # Lemmatisation
+     lem = WordNetLemmatizer()
+     text = [lem.lemmatize(word) for word in text if word not in stop_words]
+     text = " ".join(text)
+     corpus.append(text)
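+ # Note: the PorterStemmer instance (ps) created above is never applied; only WordNet
+ # lemmatization and stop-word filtering are used to normalize the tokens.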
+
+ # View corpus item
+ corpus[222]
+
+ # View corpus item
+ corpus[300]
+
+ # Word cloud
+ from os import path
+ from PIL import Image
+ from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
+ import matplotlib.pyplot as plt
+
+ wordcloud = WordCloud(
+     background_color='white',
+     stopwords=stop_words,
+     max_words=100,
+     max_font_size=50,
+     random_state=42
+ ).generate(str(corpus))
+ print(wordcloud)
+ fig = plt.figure(1)
+ plt.imshow(wordcloud)
+ plt.axis('off')
+ plt.show()
+ fig.savefig("word1.png", dpi=900)
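+ # The cloud is generated from str(corpus), i.e. the string form of the entire cleaned list, with
+ # the stop-word set applied again by WordCloud itself.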
+
+ from sklearn.feature_extraction.text import CountVectorizer
+ import re
+ cv = CountVectorizer(max_df=0.8, stop_words=stop_words, max_features=10000, ngram_range=(1,3))
+ X = cv.fit_transform(corpus)
+
+ from sklearn.feature_extraction.text import CountVectorizer
+
+ cv = CountVectorizer(max_df=0.8, stop_words='english', max_features=10000, ngram_range=(1,3))
+ X = cv.fit_transform(corpus)
+
+ custom_stop_words = ['from', 'to', 'against', 'each', 'own', ...]  # Add your custom stop words
+ cv = CountVectorizer(max_df=0.8, stop_words=custom_stop_words, max_features=10000, ngram_range=(1,3))
+ X = cv.fit_transform(corpus)
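+ # The vectorizer is built three times (the custom stop_words set, sklearn's built-in 'english'
+ # list, then a hand-written list); only the last cv/X pair is used afterwards. The literal ... is
+ # a placeholder meant to be filled in with real stop words.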
+
+ list(cv.vocabulary_.keys())[:10]
+
+ # Most frequently occurring words
+ def get_top_n_words(corpus, n=None):
+     vec = CountVectorizer().fit(corpus)
+     bag_of_words = vec.transform(corpus)
+     sum_words = bag_of_words.sum(axis=0)
+     words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
+     words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
+     return words_freq[:n]
+ # Convert the most frequent words to a dataframe for a bar plot
+ top_words = get_top_n_words(corpus, n=20)
+ top_df = pandas.DataFrame(top_words)
+ top_df.columns = ["Word", "Freq"]
+ # Bar plot of the most frequent words
+ import seaborn as sns
+ sns.set(rc={'figure.figsize':(13,8)})
+ g = sns.barplot(x="Word", y="Freq", data=top_df)
+ g.set_xticklabels(g.get_xticklabels(), rotation=30)
+
+ # Most frequently occurring bi-grams
+ def get_top_n2_words(corpus, n=None):
+     vec1 = CountVectorizer(ngram_range=(2,2), max_features=2000).fit(corpus)
+     bag_of_words = vec1.transform(corpus)
+     sum_words = bag_of_words.sum(axis=0)
+     words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
+     words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
+     return words_freq[:n]
+ top2_words = get_top_n2_words(corpus, n=20)
+ top2_df = pandas.DataFrame(top2_words)
+ top2_df.columns = ["Bi-gram", "Freq"]
+ print(top2_df)
+ # Bar plot of the most frequent bi-grams
+ import seaborn as sns
+ sns.set(rc={'figure.figsize':(13,8)})
+ h = sns.barplot(x="Bi-gram", y="Freq", data=top2_df)
+ h.set_xticklabels(h.get_xticklabels(), rotation=45)
+
+ # Most frequently occurring tri-grams
+ def get_top_n3_words(corpus, n=None):
+     vec1 = CountVectorizer(ngram_range=(3,3), max_features=2000).fit(corpus)
+     bag_of_words = vec1.transform(corpus)
+     sum_words = bag_of_words.sum(axis=0)
+     words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
+     words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
+     return words_freq[:n]
+ top3_words = get_top_n3_words(corpus, n=20)
+ top3_df = pandas.DataFrame(top3_words)
+ top3_df.columns = ["Tri-gram", "Freq"]
+ print(top3_df)
+ # Bar plot of the most frequent tri-grams
+ import seaborn as sns
+ sns.set(rc={'figure.figsize':(13,8)})
+ j = sns.barplot(x="Tri-gram", y="Freq", data=top3_df)
+ j.set_xticklabels(j.get_xticklabels(), rotation=45)
+
+ from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
+
+ # Assuming you already have the 'corpus' defined
+
+ # Create a CountVectorizer
+ cv = CountVectorizer(max_df=0.8, stop_words='english', max_features=10000, ngram_range=(1, 3))
+
+ # Fit and transform the corpus
+ X = cv.fit_transform(corpus)
+
+ # Create a TfidfTransformer and fit it to the CountVectorizer output
+ tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
+ tfidf_transformer.fit(X)
+
+ # Get feature names from CountVectorizer
+ feature_names = cv.get_feature_names_out()
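+ # get_feature_names_out() is available from scikit-learn 1.0 onwards; on older versions the
+ # equivalent call is cv.get_feature_names().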
+
+ # Fetch document for which keywords need to be extracted
+ doc = corpus[82]
+
+ # Generate tf-idf for the given document
+ tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))
+
+ # Now you can proceed with your further code
+
+ # Function for sorting tf_idf in descending order
+ from scipy.sparse import coo_matrix
+ def sort_coo(coo_matrix):
+     tuples = zip(coo_matrix.col, coo_matrix.data)
+     return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
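+ # sort_coo() flattens the single-row sparse tf-idf vector into (column_index, score) pairs sorted
+ # by score in descending order; the parameter name shadows the (otherwise unused) scipy import.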
+
+ def extract_topn_from_vector(feature_names, sorted_items, topn=10):
+     """Get the feature names and tf-idf scores of the top n items."""
+
+     # use only the top n items from the vector
+     sorted_items = sorted_items[:topn]
+
+     score_vals = []
+     feature_vals = []
+
+     # word index and corresponding tf-idf score
+     for idx, score in sorted_items:
+         # keep track of the feature name and its corresponding score
+         score_vals.append(round(score, 3))
+         feature_vals.append(feature_names[idx])
+
+     # create a dict of feature -> score
+     # results = zip(feature_vals, score_vals)
+     results = {}
+     for idx in range(len(feature_vals)):
+         results[feature_vals[idx]] = score_vals[idx]
+
+     return results
+ # sort the tf-idf vector by descending order of scores
+ sorted_items = sort_coo(tf_idf_vector.tocoo())
+ # extract only the top n; n here is 10
+ keywords = extract_topn_from_vector(feature_names, sorted_items, 10)
+
+ # now print the results
+ print("\nAbstract:")
+ print(doc)
+ print("\nKeywords:")
+ for k in keywords:
+     print(k, keywords[k])
+
+ from gensim.models import word2vec
+ tokenized_sentences = [sentence.split() for sentence in corpus]
+ model = word2vec.Word2Vec(tokenized_sentences, min_count=1)
+
+ model.wv.most_similar(positive=["jianan"])
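+ # min_count=1 keeps every token in the Word2Vec vocabulary; most_similar() raises a KeyError if
+ # the queried token ("jianan" here, presumably a name token from the dataset) does not occur in
+ # the cleaned corpus.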
+
+ import nltk
+ # nltk.download('omw-1.4')
+ from nltk.corpus import wordnet as wn
+
+ wn.synsets('car')
+
+ wn.synset('car.n.01').definition()
  import gradio as gr
  from nltk.corpus import wordnet as wn

@@ -22,3 +292,4 @@ iface = gr.Interface(

  # Launch the Gradio interface
  iface.launch()
+