Sa-m committed on
Commit fb1ce50 · verified · Parent: 1df8bb0

Update app.py

Files changed (1)
  1. app.py +20 -416
app.py CHANGED
@@ -1,415 +1,3 @@
- # """
- # # MANIFESTO ANALYSIS
- # """
-
- # ##IMPORTING LIBRARIES
- # import random
- # import matplotlib.pyplot as plt
- # import nltk
- # from nltk.tokenize import word_tokenize,sent_tokenize
- # from nltk.corpus import stopwords
- # from nltk.stem.porter import PorterStemmer
- # from nltk.stem import WordNetLemmatizer
- # from nltk.corpus import stopwords
- # from nltk.tokenize import word_tokenize
- # from nltk.probability import FreqDist
- # from cleantext import clean
- # import textract
- # import urllib.request
- # import nltk.corpus
- # from nltk.text import Text
- # import io
- # from io import StringIO,BytesIO
- # import sys
- # import pandas as pd
- # import cv2
- # import re
- # from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
- # from textblob import TextBlob
- # from PIL import Image
- # import os
- # import gradio as gr
- # from zipfile import ZipFile
- # import contractions
- # import unidecode
-
- # nltk.download('punkt_tab')
- # nltk.download('stopwords')
- # nltk.download('punkt')
- # nltk.download('wordnet')
- # nltk.download('words')
-
-
- # """## PARSING FILES"""
-
- # #def Parsing(parsed_text):
- # #parsed_text=parsed_text.name
- # #raw_party =parser.from_file(parsed_text)
- # # raw_party = raw_party['content'],cache_examples=True
- # # return clean(raw_party)
-
-
- # def Parsing(parsed_text):
- # parsed_text=parsed_text.name
- # raw_party =textract.process(parsed_text, encoding='ascii',method='pdfminer')
- # return clean(raw_party)
-
-
- # #Added more stopwords to avoid irrelevant terms
- # stop_words = set(stopwords.words('english'))
- # stop_words.update('ask','much','thank','etc.', 'e', 'We', 'In', 'ed','pa', 'This','also', 'A', 'fu','To','5','ing', 'er', '2')
-
- # """## PREPROCESSING"""
-
- # def clean_text(text):
- # '''
- # The function which returns clean text
- # '''
- # text = text.encode("ascii", errors="ignore").decode("ascii") # remove non-asciicharacters
- # text=unidecode.unidecode(text)# diacritics remove
- # text=contractions.fix(text) # contraction fix
- # text = re.sub(r"\n", " ", text)
- # text = re.sub(r"\n\n", " ", text)
- # text = re.sub(r"\t", " ", text)
- # text = re.sub(r"/ ", " ", text)
- # text = text.strip(" ")
- # text = re.sub(" +", " ", text).strip() # get rid of multiple spaces and replace with a single
-
- # text = [word for word in text.split() if word not in stop_words]
- # text = ' '.join(text)
- # return text
-
- # # text_Party=clean_text(raw_party)
-
- # def Preprocess(textParty):
- # '''
- # Removing special characters extra spaces
- # '''
- # text1Party = re.sub('[^A-Za-z0-9]+', ' ', textParty)
- # #Removing all stop words
- # pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
- # text2Party = pattern.sub('', text1Party)
- # # fdist_cong = FreqDist(word_tokens_cong)
- # return text2Party
-
-
-
-
-
- # '''
- # Using Concordance, you can see each time a word is used, along with its
- # immediate context. It can give you a peek into how a word is being used
- # at the sentence level and what words are used with it
- # '''
- # def conc(text_Party,strng):
- # word_tokens_party = word_tokenize(text_Party)
- # moby = Text(word_tokens_party)
- # resultList = []
- # for i in range(0,1):
- # save_stdout = sys.stdout
- # result = StringIO()
- # sys.stdout = result
- # moby.concordance(strng,lines=4,width=82)
- # sys.stdout = save_stdout
- # s=result.getvalue().splitlines()
- # return result.getvalue()
-
- # def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin = 10, right_margin = 10,numLins=4):
- # """
- # Function to get all the phases that contain the target word in a text/passage tar_passage.
- # Workaround to save the output given by nltk Concordance function
-
- # str target_word, str tar_passage int left_margin int right_margin --> list of str
- # left_margin and right_margin allocate the number of words/pununciation before and after target word
- # Left margin will take note of the beginning of the text
- # """
- # ## Create list of tokens using nltk function
- # tokens = nltk.word_tokenize(tar_passage)
-
- # ## Create the text of tokens
- # text = nltk.Text(tokens)
-
- # ## Collect all the index or offset position of the target word
- # c = nltk.ConcordanceIndex(text.tokens, key = lambda s: s.lower())
-
- # ## Collect the range of the words that is within the target word by using text.tokens[start;end].
- # ## The map function is use so that when the offset position - the target range < 0, it will be default to zero
- # concordance_txt = ([text.tokens[list(map(lambda x: x-5 if (x-left_margin)>0 else 0,[offset]))[0]:offset+right_margin] for offset in c.offsets(target_word)])
-
- # ## join the sentences for each of the target phrase and return it
- # result = [''.join([x.replace("Y","")+' ' for x in con_sub]) for con_sub in concordance_txt][:-1]
- # result=result[:numLins+1]
-
- # res='\n\n'.join(result)
- # return res
-
-
- # def normalize(d, target=1.0):
- # raw = sum(d.values())
- # factor = target/raw
- # return {key:value*factor for key,value in d.items()}
-
- # def fDistance(text2Party):
- # '''
- # Most frequent words search
- # '''
- # word_tokens_party = word_tokenize(text2Party) #Tokenizing
- # fdistance = FreqDist(word_tokens_party).most_common(10)
- # mem={}
- # for x in fdistance:
- # mem[x[0]]=x[1]
- # return normalize(mem)
-
- # def fDistancePlot(text2Party,plotN=15):
- # '''
- # Most Frequent Words Visualization
- # '''
- # word_tokens_party = word_tokenize(text2Party) #Tokenizing
- # fdistance = FreqDist(word_tokens_party)
- # plt.title('Frequency Distribution')
- # plt.axis('off')
- # plt.figure(figsize=(4,3))
- # fdistance.plot(plotN)
- # plt.tight_layout()
- # buf = BytesIO()
- # plt.savefig(buf)
- # buf.seek(0)
- # img1 = Image.open(buf)
- # plt.clf()
- # return img1
-
-
- # def DispersionPlot(textParty):
- # '''
- # Dispersion PLot
- # '''
- # word_tokens_party = word_tokenize(textParty) #Tokenizing
- # moby = Text(word_tokens_party)
- # fdistance = FreqDist(word_tokens_party)
- # word_Lst=[]
- # for x in range(5):
- # word_Lst.append(fdistance.most_common(6)[x][0])
-
- # plt.axis('off')
- # plt.figure(figsize=(4,3))
- # plt.title('Dispersion Plot')
- # moby.dispersion_plot(word_Lst)
- # plt.plot(color="#EF6D6D")
- # plt.tight_layout()
- # buf = BytesIO()
- # plt.savefig(buf)
- # buf.seek(0)
- # img = Image.open(buf)
- # plt.clf()
- # return img
-
-
- # def getSubjectivity(text):
-
- # '''
- # Create a function to get the polarity
- # '''
- # return TextBlob(text).sentiment.subjectivity
-
-
- # def getPolarity(text):
- # '''
- # Create a function to get the polarity
- # '''
- # return TextBlob(text).sentiment.polarity
-
-
- # def getAnalysis(score):
- # if score < 0:
- # return 'Negative'
- # elif score == 0:
- # return 'Neutral'
- # else:
- # return 'Positive'
- # def Original_Image(path):
- # img= cv2.imread(path)
- # img= cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
- # return img
-
- # def Image_Processed(path):
- # '''
- # Reading the image file
- # '''
- # img= cv2.imread(path)
- # img= cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-
- # #Thresholding
- # ret, bw_img = cv2.threshold(img, 124, 255, cv2.THRESH_BINARY)
-
- # return bw_img
-
- # def word_cloud(orgIm,mask_img,text_Party_pr,maxWord=2000,colorGener=True,
- # contCol='white',bckColor='white'):
- # '''
- # #Generating word cloud
- # '''
- # mask =mask_img
- # # Create and generate a word cloud image:
- # wordcloud = WordCloud(max_words=maxWord, background_color=bckColor,
- # mask=mask,
- # colormap='nipy_spectral_r',
- # contour_color=contCol,
- # width=800, height=800,
- # margin=2,
- # contour_width=3).generate(text_Party_pr)
-
- # # create coloring from image
-
-
- # plt.axis("off")
- # if colorGener==True:
- # image_colors = ImageColorGenerator(orgIm)
- # plt.imshow(wordcloud.recolor(color_func= image_colors),interpolation="bilinear")
-
-
- # else:
- # plt.imshow(wordcloud)
-
-
-
-
- # def word_cloud_generator(parsed_text_name,text_Party):
- # parsed=parsed_text_name.lower()
-
- # if 'bjp' in parsed:
- # orgImg=Original_Image('bjpImg2.jpeg')
- # bwImg=Image_Processed('bjpImg2.jpeg')
- # plt.figure(figsize=(6,5))
- # word_cloud(orgImg,bwImg,text_Party,maxWord=3000,colorGener=True,
- # contCol='white', bckColor='black')
- # plt.tight_layout()
- # buf = BytesIO()
- # plt.savefig(buf)
- # buf.seek(0)
- # img1 = Image.open(buf)
- # plt.clf()
- # return img1
-
-
- # elif 'congress' in parsed:
- # orgImg=Original_Image('congress3.jpeg')
- # bwImg=Image_Processed('congress3.jpeg')
- # plt.figure(figsize=(5,4))
- # word_cloud(orgImg,bwImg,text_Party,maxWord=3000,colorGener=True)
-
- # plt.tight_layout()
- # buf = BytesIO()
- # plt.savefig(buf)
- # buf.seek(0)
- # img2 = Image.open(buf)
- # plt.clf()
- # return img2
- # #congrsMain.jpg
-
-
- # elif 'aap' in parsed:
- # orgImg=Original_Image('aapMain2.jpg')
- # bwImg=Image_Processed('aapMain2.jpg')
- # plt.figure(figsize=(5,4))
- # word_cloud(orgImg,bwImg,text_Party,maxWord=3000,colorGener=False,contCol='black')
-
- # plt.tight_layout()
- # buf = BytesIO()
- # plt.savefig(buf)
- # buf.seek(0)
- # img3 = Image.open(buf)
- # plt.clf()
- # return img3
-
- # else :
- # wordcloud = WordCloud(max_words=2000, background_color="white",mode="RGB").generate(text_Party)
- # plt.figure(figsize=(5,5))
- # plt.imshow(wordcloud, interpolation="bilinear")
- # plt.axis("off")
- # plt.tight_layout()
- # buf = BytesIO()
- # plt.savefig(buf)
- # buf.seek(0)
- # img4 = Image.open(buf)
- # plt.clf()
- # return img4
-
-
-
- # '''
- # url = "http://library.bjp.org/jspui/bitstream/123456789/2988/1/BJP-Election-english-2019.pdf"
- # path_input = "./Bjp_Manifesto_2019.pdf"
- # urllib.request.urlretrieve(url, filename=path_input)
-
- # url="https://drive.google.com/uc?id=1BLCiy_BWilfVdrUH8kbO-44DJevwO5CG&export=download"
- # path_input = "./Aap_Manifesto_2019.pdf"
- # urllib.request.urlretrieve(url, filename=path_input)
-
- # url="https://drive.google.com/uc?id=1HVZvTtYntl0YKLnE0cwu0CvAIRhXOv60&export=download"
- # path_input = "./Congress_Manifesto_2019.pdf"
- # urllib.request.urlretrieve(url, filename=path_input)
- # '''
- # def analysis(Manifesto,Search):
- # raw_party = Parsing(Manifesto)
- # text_Party=clean_text(raw_party)
- # text_Party= Preprocess(text_Party)
-
- # df = pd.DataFrame(raw_party.split('\n'), columns=['Content'])
- # df['Subjectivity'] = df['Content'].apply(getSubjectivity)
- # df['Polarity'] = df['Content'].apply(getPolarity)
- # df['Analysis on Polarity'] = df['Polarity'].apply(getAnalysis)
- # df['Analysis on Subjectivity'] = df['Subjectivity'].apply(getAnalysis)
- # plt.title('Sentiment Analysis')
- # plt.xlabel('Sentiment')
- # plt.ylabel('Counts')
- # plt.figure(figsize=(4,3))
- # df['Analysis on Polarity'].value_counts().plot(kind ='bar',color="#FF9F45")
- # plt.tight_layout()
- # buf = BytesIO()
- # plt.savefig(buf)
- # buf.seek(0)
- # img1 = Image.open(buf)
- # plt.clf()
-
- # plt.figure(figsize=(4,3))
- # df['Analysis on Subjectivity'].value_counts().plot(kind ='bar',color="#B667F1")
- # plt.tight_layout()
- # buf = BytesIO()
- # plt.savefig(buf)
- # buf.seek(0)
- # img2 = Image.open(buf)
- # plt.clf()
-
- # img3 = word_cloud_generator(Manifesto.name,text_Party)
-
- # fdist_Party=fDistance(text_Party)
- # img4=fDistancePlot(text_Party)
- # img5=DispersionPlot(text_Party)
- # #concordance(text_Party,Search)
- # searChRes=get_all_phases_containing_tar_wrd(Search,text_Party)
- # searChRes=searChRes.replace(Search,"\u0332".join(Search))
- # plt.close('all')
- # return searChRes,fdist_Party,img1,img2,img3,img4,img5
-
-
- # Search_txt= "text"
- # filePdf = "file"
- # text = gr.Textbox(label='Context Based Search')
- # mfw=gr.Label(label="Most Relevant Topics")
- # plot1=gr.Image(label='Sentiment Analysis')
- # plot2=gr.Image(label='Subjectivity Analysis')
- # plot3=gr.Image(label='Word Cloud')
- # plot4=gr.Image(label='Frequency Distribution')
- # plot5=gr.Image(label='Dispersion Plot')
-
- # io=gr.Interface(fn=analysis, inputs=[filePdf,Search_txt], outputs=[text,mfw,plot1,plot2,plot3,plot4,plot5], title='Manifesto Analysis',examples=[['Example/AAP_Manifesto_2019.pdf','government'],['Example/Bjp_Manifesto_2019.pdf','environment'],['Example/Congress_Manifesto_2019.pdf','safety']],theme='peach')
- # io.launch(debug=True,share=False)
-
-
- # #allow_screenshot=False,allow_flagging="never",
- # #examples=[['manifestos/Bjp_Manifesto_2019.pdf','modi'],['AAP_Manifesto_2019.pdf','delhi'],['manifestos/Congress_Manifesto_2019.pdf','safety']])
-
-
  """
  # MANIFESTO ANALYSIS
  """
@@ -546,7 +134,7 @@ def conc(text_Party,strng):
  s=result.getvalue().splitlines()
  return result.getvalue()
 
- def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin = 10, right_margin = 10,numLins=4):
+ def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin = 10, right_margin = 10, numLins=4):
  """
  Function to get all the phases that contain the target word in a text/passage tar_passage.
  Workaround to save the output given by nltk Concordance function
@@ -555,10 +143,14 @@ def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin = 10
  left_margin and right_margin allocate the number of words/pununciation before and after target word
  Left margin will take note of the beginning of the text
  """
- ## Create list of tokens using nltk function
+ # Handle empty or None search terms
+ if not target_word or target_word.strip() == "":
+ return "Please enter a search term"
+
+ # Create list of tokens using nltk function
  tokens = nltk.word_tokenize(tar_passage)
 
- ## Create the text of tokens
+ # Create the text of tokens
  text = nltk.Text(tokens)
 
  ## Collect all the index or offset position of the target word
@@ -849,8 +441,21 @@ def analysis(Manifesto, Search):
  Search: Search term entered by the user
  '''
  try:
+ # Check if a file was uploaded
+ if Manifesto is None:
+ return "Please upload a PDF file", {}, None, None, None, None, None, "No file uploaded"
+
+ # Handle empty search term
+ if Search is None or Search.strip() == "":
+ Search = "government" # Default search term
+
  # Process the uploaded PDF
  raw_party = Parsing(Manifesto)
+
+ # Check if parsing was successful
+ if raw_party.startswith("Error"):
+ return raw_party, {}, None, None, None, None, None, "Error generating summary due to parsing failure"
+
  text_Party = clean_text(raw_party)
  text_Party_processed = Preprocess(text_Party)
 
@@ -978,4 +583,3 @@ demo.launch(debug=True, share=False)
  # io=gr.Interface(fn=analysis, inputs=[filePdf,Search_txt], outputs=[text,mfw,plot1,plot2,plot3,plot4,plot5], title='Manifesto Analysis',examples=[['Example/AAP_Manifesto_2019.pdf','government'],['Example/Bjp_Manifesto_2019.pdf','environment'],['Example/Congress_Manifesto_2019.pdf','safety']])
  # io.launch(debug=True,share=False)
 
-
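
For reference, a minimal, self-contained sketch of the validation pattern this commit introduces in get_all_phases_containing_tar_wrd() and analysis(). The helper names below (safe_search, safe_analysis) are hypothetical stand-ins used only for illustration; the real functions in app.py additionally tokenize the passage with nltk and return the full Gradio output tuple.

def safe_search(target_word, passage):
    # Same guard the commit adds before tokenizing: reject empty or
    # whitespace-only search terms.
    if not target_word or target_word.strip() == "":
        return "Please enter a search term"
    return f"searching for {target_word!r} in {len(passage.split())} tokens"

def safe_analysis(manifesto, search):
    # Same guards the commit adds at the top of analysis(): require an
    # uploaded file and fall back to a default term when the search box is empty.
    if manifesto is None:
        return "Please upload a PDF file"
    if search is None or search.strip() == "":
        search = "government"  # default term used by the commit
    return safe_search(search, "stub passage text for the sketch")

print(safe_analysis(None, "environment"))   # -> "Please upload a PDF file"
print(safe_analysis("manifesto.pdf", "  ")) # falls back to the default term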