Sa-m commited on
Commit
029ffc9
·
verified ·
1 Parent(s): 8d7d358

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +561 -13
app.py CHANGED
@@ -1,3 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
  # MANIFESTO ANALYSIS
3
  """
@@ -32,6 +445,14 @@ import gradio as gr
32
  from zipfile import ZipFile
33
  import contractions
34
  import unidecode
 
 
 
 
 
 
 
 
35
 
36
  nltk.download('punkt_tab')
37
  nltk.download('stopwords')
@@ -39,6 +460,18 @@ nltk.download('punkt')
39
  nltk.download('wordnet')
40
  nltk.download('words')
41
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  """## PARSING FILES"""
44
 
@@ -149,16 +582,77 @@ def normalize(d, target=1.0):
149
  factor = target/raw
150
  return {key:value*factor for key,value in d.items()}
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  def fDistance(text2Party):
153
  '''
154
- Most frequent words search
155
  '''
 
156
  word_tokens_party = word_tokenize(text2Party) #Tokenizing
157
  fdistance = FreqDist(word_tokens_party).most_common(10)
158
  mem={}
159
  for x in fdistance:
160
  mem[x[0]]=x[1]
161
- return normalize(mem)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
  def fDistancePlot(text2Party,plotN=15):
164
  '''
@@ -352,7 +846,10 @@ urllib.request.urlretrieve(url, filename=path_input)
352
  def analysis(Manifesto,Search):
353
  raw_party = Parsing(Manifesto)
354
  text_Party=clean_text(raw_party)
355
- text_Party= Preprocess(text_Party)
 
 
 
356
 
357
  df = pd.DataFrame(raw_party.split('\n'), columns=['Content'])
358
  df['Subjectivity'] = df['Content'].apply(getSubjectivity)
@@ -380,30 +877,81 @@ def analysis(Manifesto,Search):
380
  img2 = Image.open(buf)
381
  plt.clf()
382
 
383
- img3 = word_cloud_generator(Manifesto.name,text_Party)
 
 
 
 
384
 
385
- fdist_Party=fDistance(text_Party)
386
- img4=fDistancePlot(text_Party)
387
- img5=DispersionPlot(text_Party)
388
- #concordance(text_Party,Search)
389
- searChRes=get_all_phases_containing_tar_wrd(Search,text_Party)
390
  searChRes=searChRes.replace(Search,"\u0332".join(Search))
391
  plt.close('all')
392
- return searChRes,fdist_Party,img1,img2,img3,img4,img5
393
 
394
 
395
  Search_txt= "text"
396
  filePdf = "file"
397
  text = gr.Textbox(label='Context Based Search')
398
- mfw=gr.Label(label="Most Relevant Topics")
399
  plot1=gr.Image(label='Sentiment Analysis')
400
  plot2=gr.Image(label='Subjectivity Analysis')
401
  plot3=gr.Image(label='Word Cloud')
402
  plot4=gr.Image(label='Frequency Distribution')
403
  plot5=gr.Image(label='Dispersion Plot')
 
404
 
405
- io=gr.Interface(fn=analysis, inputs=[filePdf,Search_txt], outputs=[text,mfw,plot1,plot2,plot3,plot4,plot5], title='Manifesto Analysis',examples=[['Example/AAP_Manifesto_2019.pdf','government'],['Example/Bjp_Manifesto_2019.pdf','environment'],['Example/Congress_Manifesto_2019.pdf','safety']],theme='peach')
406
- io.launch(debug=True,share=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
407
 
408
 
409
  #allow_screenshot=False,allow_flagging="never",
 
1
+ # """
2
+ # # MANIFESTO ANALYSIS
3
+ # """
4
+
5
+ # ##IMPORTING LIBRARIES
6
+ # import random
7
+ # import matplotlib.pyplot as plt
8
+ # import nltk
9
+ # from nltk.tokenize import word_tokenize,sent_tokenize
10
+ # from nltk.corpus import stopwords
11
+ # from nltk.stem.porter import PorterStemmer
12
+ # from nltk.stem import WordNetLemmatizer
13
+ # from nltk.corpus import stopwords
14
+ # from nltk.tokenize import word_tokenize
15
+ # from nltk.probability import FreqDist
16
+ # from cleantext import clean
17
+ # import textract
18
+ # import urllib.request
19
+ # import nltk.corpus
20
+ # from nltk.text import Text
21
+ # import io
22
+ # from io import StringIO,BytesIO
23
+ # import sys
24
+ # import pandas as pd
25
+ # import cv2
26
+ # import re
27
+ # from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
28
+ # from textblob import TextBlob
29
+ # from PIL import Image
30
+ # import os
31
+ # import gradio as gr
32
+ # from zipfile import ZipFile
33
+ # import contractions
34
+ # import unidecode
35
+
36
+ # nltk.download('punkt_tab')
37
+ # nltk.download('stopwords')
38
+ # nltk.download('punkt')
39
+ # nltk.download('wordnet')
40
+ # nltk.download('words')
41
+
42
+
43
+ # """## PARSING FILES"""
44
+
45
+ # #def Parsing(parsed_text):
46
+ # #parsed_text=parsed_text.name
47
+ # #raw_party =parser.from_file(parsed_text)
48
+ # # raw_party = raw_party['content'],cache_examples=True
49
+ # # return clean(raw_party)
50
+
51
+
52
+ # def Parsing(parsed_text):
53
+ # parsed_text=parsed_text.name
54
+ # raw_party =textract.process(parsed_text, encoding='ascii',method='pdfminer')
55
+ # return clean(raw_party)
56
+
57
+
58
+ # #Added more stopwords to avoid irrelevant terms
59
+ # stop_words = set(stopwords.words('english'))
60
+ # stop_words.update('ask','much','thank','etc.', 'e', 'We', 'In', 'ed','pa', 'This','also', 'A', 'fu','To','5','ing', 'er', '2')
61
+
62
+ # """## PREPROCESSING"""
63
+
64
+ # def clean_text(text):
65
+ # '''
66
+ # The function which returns clean text
67
+ # '''
68
+ # text = text.encode("ascii", errors="ignore").decode("ascii") # remove non-asciicharacters
69
+ # text=unidecode.unidecode(text)# diacritics remove
70
+ # text=contractions.fix(text) # contraction fix
71
+ # text = re.sub(r"\n", " ", text)
72
+ # text = re.sub(r"\n\n", " ", text)
73
+ # text = re.sub(r"\t", " ", text)
74
+ # text = re.sub(r"/ ", " ", text)
75
+ # text = text.strip(" ")
76
+ # text = re.sub(" +", " ", text).strip() # get rid of multiple spaces and replace with a single
77
+
78
+ # text = [word for word in text.split() if word not in stop_words]
79
+ # text = ' '.join(text)
80
+ # return text
81
+
82
+ # # text_Party=clean_text(raw_party)
83
+
84
+ # def Preprocess(textParty):
85
+ # '''
86
+ # Removing special characters extra spaces
87
+ # '''
88
+ # text1Party = re.sub('[^A-Za-z0-9]+', ' ', textParty)
89
+ # #Removing all stop words
90
+ # pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
91
+ # text2Party = pattern.sub('', text1Party)
92
+ # # fdist_cong = FreqDist(word_tokens_cong)
93
+ # return text2Party
94
+
95
+
96
+
97
+
98
+
99
+ # '''
100
+ # Using Concordance, you can see each time a word is used, along with its
101
+ # immediate context. It can give you a peek into how a word is being used
102
+ # at the sentence level and what words are used with it
103
+ # '''
104
+ # def conc(text_Party,strng):
105
+ # word_tokens_party = word_tokenize(text_Party)
106
+ # moby = Text(word_tokens_party)
107
+ # resultList = []
108
+ # for i in range(0,1):
109
+ # save_stdout = sys.stdout
110
+ # result = StringIO()
111
+ # sys.stdout = result
112
+ # moby.concordance(strng,lines=4,width=82)
113
+ # sys.stdout = save_stdout
114
+ # s=result.getvalue().splitlines()
115
+ # return result.getvalue()
116
+
117
+ # def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin = 10, right_margin = 10,numLins=4):
118
+ # """
119
+ # Function to get all the phases that contain the target word in a text/passage tar_passage.
120
+ # Workaround to save the output given by nltk Concordance function
121
+
122
+ # str target_word, str tar_passage int left_margin int right_margin --> list of str
123
+ # left_margin and right_margin allocate the number of words/pununciation before and after target word
124
+ # Left margin will take note of the beginning of the text
125
+ # """
126
+ # ## Create list of tokens using nltk function
127
+ # tokens = nltk.word_tokenize(tar_passage)
128
+
129
+ # ## Create the text of tokens
130
+ # text = nltk.Text(tokens)
131
+
132
+ # ## Collect all the index or offset position of the target word
133
+ # c = nltk.ConcordanceIndex(text.tokens, key = lambda s: s.lower())
134
+
135
+ # ## Collect the range of the words that is within the target word by using text.tokens[start;end].
136
+ # ## The map function is use so that when the offset position - the target range < 0, it will be default to zero
137
+ # concordance_txt = ([text.tokens[list(map(lambda x: x-5 if (x-left_margin)>0 else 0,[offset]))[0]:offset+right_margin] for offset in c.offsets(target_word)])
138
+
139
+ # ## join the sentences for each of the target phrase and return it
140
+ # result = [''.join([x.replace("Y","")+' ' for x in con_sub]) for con_sub in concordance_txt][:-1]
141
+ # result=result[:numLins+1]
142
+
143
+ # res='\n\n'.join(result)
144
+ # return res
145
+
146
+
147
+ # def normalize(d, target=1.0):
148
+ # raw = sum(d.values())
149
+ # factor = target/raw
150
+ # return {key:value*factor for key,value in d.items()}
151
+
152
+ # def fDistance(text2Party):
153
+ # '''
154
+ # Most frequent words search
155
+ # '''
156
+ # word_tokens_party = word_tokenize(text2Party) #Tokenizing
157
+ # fdistance = FreqDist(word_tokens_party).most_common(10)
158
+ # mem={}
159
+ # for x in fdistance:
160
+ # mem[x[0]]=x[1]
161
+ # return normalize(mem)
162
+
163
+ # def fDistancePlot(text2Party,plotN=15):
164
+ # '''
165
+ # Most Frequent Words Visualization
166
+ # '''
167
+ # word_tokens_party = word_tokenize(text2Party) #Tokenizing
168
+ # fdistance = FreqDist(word_tokens_party)
169
+ # plt.title('Frequency Distribution')
170
+ # plt.axis('off')
171
+ # plt.figure(figsize=(4,3))
172
+ # fdistance.plot(plotN)
173
+ # plt.tight_layout()
174
+ # buf = BytesIO()
175
+ # plt.savefig(buf)
176
+ # buf.seek(0)
177
+ # img1 = Image.open(buf)
178
+ # plt.clf()
179
+ # return img1
180
+
181
+
182
+ # def DispersionPlot(textParty):
183
+ # '''
184
+ # Dispersion PLot
185
+ # '''
186
+ # word_tokens_party = word_tokenize(textParty) #Tokenizing
187
+ # moby = Text(word_tokens_party)
188
+ # fdistance = FreqDist(word_tokens_party)
189
+ # word_Lst=[]
190
+ # for x in range(5):
191
+ # word_Lst.append(fdistance.most_common(6)[x][0])
192
+
193
+ # plt.axis('off')
194
+ # plt.figure(figsize=(4,3))
195
+ # plt.title('Dispersion Plot')
196
+ # moby.dispersion_plot(word_Lst)
197
+ # plt.plot(color="#EF6D6D")
198
+ # plt.tight_layout()
199
+ # buf = BytesIO()
200
+ # plt.savefig(buf)
201
+ # buf.seek(0)
202
+ # img = Image.open(buf)
203
+ # plt.clf()
204
+ # return img
205
+
206
+
207
+ # def getSubjectivity(text):
208
+
209
+ # '''
210
+ # Create a function to get the polarity
211
+ # '''
212
+ # return TextBlob(text).sentiment.subjectivity
213
+
214
+
215
+ # def getPolarity(text):
216
+ # '''
217
+ # Create a function to get the polarity
218
+ # '''
219
+ # return TextBlob(text).sentiment.polarity
220
+
221
+
222
+ # def getAnalysis(score):
223
+ # if score < 0:
224
+ # return 'Negative'
225
+ # elif score == 0:
226
+ # return 'Neutral'
227
+ # else:
228
+ # return 'Positive'
229
+ # def Original_Image(path):
230
+ # img= cv2.imread(path)
231
+ # img= cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
232
+ # return img
233
+
234
+ # def Image_Processed(path):
235
+ # '''
236
+ # Reading the image file
237
+ # '''
238
+ # img= cv2.imread(path)
239
+ # img= cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
240
+
241
+ # #Thresholding
242
+ # ret, bw_img = cv2.threshold(img, 124, 255, cv2.THRESH_BINARY)
243
+
244
+ # return bw_img
245
+
246
+ # def word_cloud(orgIm,mask_img,text_Party_pr,maxWord=2000,colorGener=True,
247
+ # contCol='white',bckColor='white'):
248
+ # '''
249
+ # #Generating word cloud
250
+ # '''
251
+ # mask =mask_img
252
+ # # Create and generate a word cloud image:
253
+ # wordcloud = WordCloud(max_words=maxWord, background_color=bckColor,
254
+ # mask=mask,
255
+ # colormap='nipy_spectral_r',
256
+ # contour_color=contCol,
257
+ # width=800, height=800,
258
+ # margin=2,
259
+ # contour_width=3).generate(text_Party_pr)
260
+
261
+ # # create coloring from image
262
+
263
+
264
+ # plt.axis("off")
265
+ # if colorGener==True:
266
+ # image_colors = ImageColorGenerator(orgIm)
267
+ # plt.imshow(wordcloud.recolor(color_func= image_colors),interpolation="bilinear")
268
+
269
+
270
+ # else:
271
+ # plt.imshow(wordcloud)
272
+
273
+
274
+
275
+
276
+ # def word_cloud_generator(parsed_text_name,text_Party):
277
+ # parsed=parsed_text_name.lower()
278
+
279
+ # if 'bjp' in parsed:
280
+ # orgImg=Original_Image('bjpImg2.jpeg')
281
+ # bwImg=Image_Processed('bjpImg2.jpeg')
282
+ # plt.figure(figsize=(6,5))
283
+ # word_cloud(orgImg,bwImg,text_Party,maxWord=3000,colorGener=True,
284
+ # contCol='white', bckColor='black')
285
+ # plt.tight_layout()
286
+ # buf = BytesIO()
287
+ # plt.savefig(buf)
288
+ # buf.seek(0)
289
+ # img1 = Image.open(buf)
290
+ # plt.clf()
291
+ # return img1
292
+
293
+
294
+ # elif 'congress' in parsed:
295
+ # orgImg=Original_Image('congress3.jpeg')
296
+ # bwImg=Image_Processed('congress3.jpeg')
297
+ # plt.figure(figsize=(5,4))
298
+ # word_cloud(orgImg,bwImg,text_Party,maxWord=3000,colorGener=True)
299
+
300
+ # plt.tight_layout()
301
+ # buf = BytesIO()
302
+ # plt.savefig(buf)
303
+ # buf.seek(0)
304
+ # img2 = Image.open(buf)
305
+ # plt.clf()
306
+ # return img2
307
+ # #congrsMain.jpg
308
+
309
+
310
+ # elif 'aap' in parsed:
311
+ # orgImg=Original_Image('aapMain2.jpg')
312
+ # bwImg=Image_Processed('aapMain2.jpg')
313
+ # plt.figure(figsize=(5,4))
314
+ # word_cloud(orgImg,bwImg,text_Party,maxWord=3000,colorGener=False,contCol='black')
315
+
316
+ # plt.tight_layout()
317
+ # buf = BytesIO()
318
+ # plt.savefig(buf)
319
+ # buf.seek(0)
320
+ # img3 = Image.open(buf)
321
+ # plt.clf()
322
+ # return img3
323
+
324
+ # else :
325
+ # wordcloud = WordCloud(max_words=2000, background_color="white",mode="RGB").generate(text_Party)
326
+ # plt.figure(figsize=(5,5))
327
+ # plt.imshow(wordcloud, interpolation="bilinear")
328
+ # plt.axis("off")
329
+ # plt.tight_layout()
330
+ # buf = BytesIO()
331
+ # plt.savefig(buf)
332
+ # buf.seek(0)
333
+ # img4 = Image.open(buf)
334
+ # plt.clf()
335
+ # return img4
336
+
337
+
338
+
339
+ # '''
340
+ # url = "http://library.bjp.org/jspui/bitstream/123456789/2988/1/BJP-Election-english-2019.pdf"
341
+ # path_input = "./Bjp_Manifesto_2019.pdf"
342
+ # urllib.request.urlretrieve(url, filename=path_input)
343
+
344
+ # url="https://drive.google.com/uc?id=1BLCiy_BWilfVdrUH8kbO-44DJevwO5CG&export=download"
345
+ # path_input = "./Aap_Manifesto_2019.pdf"
346
+ # urllib.request.urlretrieve(url, filename=path_input)
347
+
348
+ # url="https://drive.google.com/uc?id=1HVZvTtYntl0YKLnE0cwu0CvAIRhXOv60&export=download"
349
+ # path_input = "./Congress_Manifesto_2019.pdf"
350
+ # urllib.request.urlretrieve(url, filename=path_input)
351
+ # '''
352
+ # def analysis(Manifesto,Search):
353
+ # raw_party = Parsing(Manifesto)
354
+ # text_Party=clean_text(raw_party)
355
+ # text_Party= Preprocess(text_Party)
356
+
357
+ # df = pd.DataFrame(raw_party.split('\n'), columns=['Content'])
358
+ # df['Subjectivity'] = df['Content'].apply(getSubjectivity)
359
+ # df['Polarity'] = df['Content'].apply(getPolarity)
360
+ # df['Analysis on Polarity'] = df['Polarity'].apply(getAnalysis)
361
+ # df['Analysis on Subjectivity'] = df['Subjectivity'].apply(getAnalysis)
362
+ # plt.title('Sentiment Analysis')
363
+ # plt.xlabel('Sentiment')
364
+ # plt.ylabel('Counts')
365
+ # plt.figure(figsize=(4,3))
366
+ # df['Analysis on Polarity'].value_counts().plot(kind ='bar',color="#FF9F45")
367
+ # plt.tight_layout()
368
+ # buf = BytesIO()
369
+ # plt.savefig(buf)
370
+ # buf.seek(0)
371
+ # img1 = Image.open(buf)
372
+ # plt.clf()
373
+
374
+ # plt.figure(figsize=(4,3))
375
+ # df['Analysis on Subjectivity'].value_counts().plot(kind ='bar',color="#B667F1")
376
+ # plt.tight_layout()
377
+ # buf = BytesIO()
378
+ # plt.savefig(buf)
379
+ # buf.seek(0)
380
+ # img2 = Image.open(buf)
381
+ # plt.clf()
382
+
383
+ # img3 = word_cloud_generator(Manifesto.name,text_Party)
384
+
385
+ # fdist_Party=fDistance(text_Party)
386
+ # img4=fDistancePlot(text_Party)
387
+ # img5=DispersionPlot(text_Party)
388
+ # #concordance(text_Party,Search)
389
+ # searChRes=get_all_phases_containing_tar_wrd(Search,text_Party)
390
+ # searChRes=searChRes.replace(Search,"\u0332".join(Search))
391
+ # plt.close('all')
392
+ # return searChRes,fdist_Party,img1,img2,img3,img4,img5
393
+
394
+
395
+ # Search_txt= "text"
396
+ # filePdf = "file"
397
+ # text = gr.Textbox(label='Context Based Search')
398
+ # mfw=gr.Label(label="Most Relevant Topics")
399
+ # plot1=gr.Image(label='Sentiment Analysis')
400
+ # plot2=gr.Image(label='Subjectivity Analysis')
401
+ # plot3=gr.Image(label='Word Cloud')
402
+ # plot4=gr.Image(label='Frequency Distribution')
403
+ # plot5=gr.Image(label='Dispersion Plot')
404
+
405
+ # io=gr.Interface(fn=analysis, inputs=[filePdf,Search_txt], outputs=[text,mfw,plot1,plot2,plot3,plot4,plot5], title='Manifesto Analysis',examples=[['Example/AAP_Manifesto_2019.pdf','government'],['Example/Bjp_Manifesto_2019.pdf','environment'],['Example/Congress_Manifesto_2019.pdf','safety']],theme='peach')
406
+ # io.launch(debug=True,share=False)
407
+
408
+
409
+ # #allow_screenshot=False,allow_flagging="never",
410
+ # #examples=[['manifestos/Bjp_Manifesto_2019.pdf','modi'],['AAP_Manifesto_2019.pdf','delhi'],['manifestos/Congress_Manifesto_2019.pdf','safety']])
411
+
412
+
413
+
414
  """
415
  # MANIFESTO ANALYSIS
416
  """
 
445
  from zipfile import ZipFile
446
  import contractions
447
  import unidecode
448
+ import groq
449
+ import json
450
+ from dotenv import load_dotenv
451
+ from sklearn.feature_extraction.text import TfidfVectorizer
452
+ from collections import Counter
453
+
454
+ # Load environment variables from .env file
455
+ load_dotenv()
456
 
457
  nltk.download('punkt_tab')
458
  nltk.download('stopwords')
 
460
  nltk.download('wordnet')
461
  nltk.download('words')
462
 
# Build the optional Groq client used for LLM summarization.
# `groq_client` stays None whenever no API key is configured or the client
# cannot be constructed; callers treat None as "summarization disabled".
groq_client = None
try:
    groq_api_key = os.getenv("GROQ_API_KEY")
    if not groq_api_key:
        print("Warning: GROQ_API_KEY not found in environment variables. Summarization will be disabled.")
    else:
        groq_client = groq.Groq(api_key=groq_api_key)
except Exception as e:
    # Best-effort start-up: report the problem and keep the app running
    # without the summarization feature.
    print(f"Error initializing Groq client: {e}")
    groq_client = None
474
+
475
 
476
  """## PARSING FILES"""
477
 
 
582
  factor = target/raw
583
  return {key:value*factor for key,value in d.items()}
584
 
585
+
def generate_summary(text, max_length=1000, *, max_input_chars=10000,
                     model="llama3-8b-8192"):
    """
    Generate a summary of the manifesto text using the Groq chat API.

    Parameters:
        text: manifesto text to summarize.
        max_length: accepted for backward compatibility; it is not used by
            this function (the response size is bounded by ``max_tokens``
            in the API call below).
        max_input_chars: keyword-only; the text is cut to this many
            characters before being sent, so the prompt fits in the model's
            context window. Defaults to the previously hard-coded 10000.
        model: keyword-only; Groq model identifier. Defaults to the
            previously hard-coded "llama3-8b-8192".

    Returns:
        The summary string on success. On failure a human-readable message
        is returned instead of raising, because the value is shown directly
        in the UI.
    """
    # Nothing to summarize: do not spend an API call on an empty prompt
    # (the model would otherwise invent a summary from no content).
    if not text or not text.strip():
        return "No manifesto text was available to summarize."

    if not groq_client:
        return "Summarization is not available. Please set up your GROQ_API_KEY in the .env file."

    # Truncate text if it's too long to fit in the context window.
    prompt_text = text[:max_input_chars]

    try:
        completion = groq_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that summarizes political manifestos. Provide a concise, objective summary that captures the key policy proposals, themes, and promises in the manifesto."},
                {"role": "user", "content": f"Please summarize the following political manifesto text in about 300-500 words, focusing on the main policy areas, promises, and themes:\n\n{prompt_text}"}
            ],
            temperature=0.3,  # Lower temperature for more focused output
            max_tokens=800,   # Limit response length
        )
        return completion.choices[0].message.content
    except Exception as e:
        # Deliberate best-effort: surface the failure as text in the UI
        # rather than failing the whole analysis.
        return f"Error generating summary: {str(e)}. Please check your API key and connection."
612
+
def fDistance(text2Party):
    '''
    Most frequent words search, blended with TF-IDF to surface more
    relevant words.

    The ten most common tokens (raw counts) are combined with the mean
    TF-IDF score of up to fifteen vocabulary terms, weighted 0.3 / 0.7.
    The ten best combined scores are passed through normalize() and
    returned as a {word: score} dict.

    Returns an empty dict when the text contains no tokens. When TF-IDF
    cannot build a vocabulary (e.g. the text holds only stop words) the
    result is based on raw frequency alone.
    '''
    # Traditional frequency distribution
    word_tokens_party = word_tokenize(text2Party)  # Tokenizing
    mem = dict(FreqDist(word_tokens_party).most_common(10))
    if not mem:
        # No tokens at all: nothing to rank (and normalize() would divide
        # by a zero total).
        return {}

    # Enhanced with TF-IDF for better relevance; each sentence is treated
    # as one document.
    sentences = sent_tokenize(text2Party)
    vectorizer = TfidfVectorizer(max_features=15, stop_words='english')
    try:
        tfidf_matrix = vectorizer.fit_transform(sentences)
    except ValueError:
        # scikit-learn raises ValueError when the vocabulary ends up empty;
        # fall back to the frequency-only ranking instead of crashing.
        tfidf_scores = {}
    else:
        feature_names = vectorizer.get_feature_names_out()
        # Average TF-IDF score of each word across all sentences. The
        # matrix has at most 15 columns, so densifying it is cheap and
        # avoids one sparse lookup per (sentence, word) pair.
        mean_scores = tfidf_matrix.toarray().mean(axis=0)
        tfidf_scores = {
            str(word): float(score)
            for word, score in zip(feature_names, mean_scores)
        }

    # Sort by score and keep the top words
    sorted_tfidf = dict(
        sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)[:10]
    )

    # Loop-invariant maxima used to scale both scores into [0, 1].
    max_freq = max(mem.values())
    max_tfidf = max(sorted_tfidf.values(), default=0)

    # Combine traditional frequency with TF-IDF (TF-IDF weighted higher).
    # NOTE(review): frequency keys keep their original case while the
    # vectorizer's vocabulary is presumably lower-cased by default, so a
    # capitalised token and its lower-case form are scored as separate
    # entries - confirm whether that is intended.
    combined_scores = {}
    for word in set(mem) | set(sorted_tfidf):
        freq_score = mem.get(word, 0) / max_freq
        tfidf_score = sorted_tfidf.get(word, 0) / max_tfidf if max_tfidf else 0
        combined_scores[word] = (freq_score * 0.3) + (tfidf_score * 0.7)

    # Get top 10 words by combined score
    top_words = dict(
        sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:10]
    )

    return normalize(top_words)
656
 
657
  def fDistancePlot(text2Party,plotN=15):
658
  '''
 
846
  def analysis(Manifesto,Search):
847
  raw_party = Parsing(Manifesto)
848
  text_Party=clean_text(raw_party)
849
+ text_Party_processed = Preprocess(text_Party)
850
+
851
+ # Generate summary using LLM
852
+ summary = generate_summary(raw_party)
853
 
854
  df = pd.DataFrame(raw_party.split('\n'), columns=['Content'])
855
  df['Subjectivity'] = df['Content'].apply(getSubjectivity)
 
877
  img2 = Image.open(buf)
878
  plt.clf()
879
 
880
+ img3 = word_cloud_generator(Manifesto.name,text_Party_processed)
881
+
882
+ fdist_Party=fDistance(text_Party_processed)
883
+ img4=fDistancePlot(text_Party_processed)
884
+ img5=DispersionPlot(text_Party_processed)
885
 
886
+ searChRes=get_all_phases_containing_tar_wrd(Search,text_Party_processed)
 
 
 
 
887
  searChRes=searChRes.replace(Search,"\u0332".join(Search))
888
  plt.close('all')
889
+ return searChRes,fdist_Party,img1,img2,img3,img4,img5,summary
890
 
891
 
892
  Search_txt= "text"
893
  filePdf = "file"
894
  text = gr.Textbox(label='Context Based Search')
895
+ mfw=gr.Label(label="Most Relevant Topics (LLM Enhanced)")
896
  plot1=gr.Image(label='Sentiment Analysis')
897
  plot2=gr.Image(label='Subjectivity Analysis')
898
  plot3=gr.Image(label='Word Cloud')
899
  plot4=gr.Image(label='Frequency Distribution')
900
  plot5=gr.Image(label='Dispersion Plot')
901
+ summary_output = gr.Textbox(label='AI-Generated Summary', lines=10)
902
 
# Blocks-based UI. The output components (summary_output, text, mfw,
# plot1..plot5) are instantiated above, outside this Blocks context, so each
# one must be placed explicitly with .render(); a bare reference to the
# variable is a no-op expression and leaves the component out of the layout.
with gr.Blocks(title='Manifesto Analysis', theme='peach') as demo:
    gr.Markdown("# Manifesto Analysis with LLM Enhancement")
    gr.Markdown("### Analyze political manifestos with advanced NLP and LLM techniques")

    # Inputs
    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="Upload Manifesto PDF", file_types=[".pdf"])
            search_input = gr.Textbox(label="Search Term", placeholder="Enter a term to search in the manifesto")
            submit_btn = gr.Button("Analyze Manifesto")

    # Outputs, grouped in tabs
    with gr.Tabs():
        with gr.TabItem("Summary"):
            summary_output.render()

        with gr.TabItem("Search Results"):
            text.render()

        with gr.TabItem("Key Topics"):
            mfw.render()

        with gr.TabItem("Visualizations"):
            with gr.Row():
                with gr.Column(scale=1):
                    plot3.render()
                with gr.Column(scale=1):
                    plot4.render()

            with gr.Row():
                with gr.Column(scale=1):
                    plot1.render()
                with gr.Column(scale=1):
                    plot2.render()

            with gr.Row():
                plot5.render()

    # The output order must match the tuple returned by analysis().
    submit_btn.click(
        fn=analysis,
        inputs=[file_input, search_input],
        outputs=[text, mfw, plot1, plot2, plot3, plot4, plot5, summary_output]
    )

    gr.Examples(
        examples=[
            ['Example/AAP_Manifesto_2019.pdf', 'government'],
            ['Example/Bjp_Manifesto_2019.pdf', 'environment'],
            ['Example/Congress_Manifesto_2019.pdf', 'safety']
        ],
        inputs=[file_input, search_input]
    )

demo.launch(debug=True, share=False)
955
 
956
 
957
  #allow_screenshot=False,allow_flagging="never",