Sa-m committed
Commit bcf08f8 · verified
1 Parent(s): c58aa0b

Update app.py

Files changed (1)
  1. app.py +175 -541
app.py CHANGED
@@ -1,624 +1,258 @@
1
- """
2
- # MANIFESTO ANALYSIS
3
- """
4
-
5
- ##IMPORTING LIBRARIES
6
  import random
7
  import matplotlib.pyplot as plt
8
  import nltk
9
- from nltk.tokenize import word_tokenize,sent_tokenize
10
  from nltk.corpus import stopwords
11
- from nltk.stem.porter import PorterStemmer
12
  from nltk.stem import WordNetLemmatizer
13
- from nltk.corpus import stopwords
14
- from nltk.tokenize import word_tokenize
15
  from nltk.probability import FreqDist
16
  from cleantext import clean
17
  import textract
18
  import urllib.request
19
- import nltk.corpus
20
- from nltk.text import Text
21
- import io
22
- from io import StringIO,BytesIO
23
- import sys
24
  import pandas as pd
25
  import cv2
26
  import re
27
- from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
28
  from textblob import TextBlob
29
  from PIL import Image
30
  import os
31
  import gradio as gr
32
- from zipfile import ZipFile
33
- import contractions
34
- import unidecode
35
  import groq
36
  import json
37
- from dotenv import load_dotenv
38
- from sklearn.feature_extraction.text import TfidfVectorizer
39
- from collections import Counter
40
  import numpy as np
41
 
42
- # # Load environment variables from .env file
43
- # load_dotenv()
44
-
45
- nltk.download('punkt_tab')
46
- nltk.download('stopwords')
47
- nltk.download('punkt')
48
- nltk.download('wordnet')
49
- nltk.download('words')
50
-
51
- # Initialize Groq client for LLM capabilities
52
- try:
53
- groq_api_key = GROQ_API_KEY
54
- if groq_api_key:
55
- groq_client = groq.Groq(api_key=groq_api_key)
56
- else:
57
- print("Warning: GROQ_API_KEY not found in environment variables. Summarization will be disabled.")
58
- groq_client = None
59
- except Exception as e:
60
- print(f"Error initializing Groq client: {e}")
61
- groq_client = None
62
 
 
 
63
 
64
- """## PARSING FILES"""
 
 
65
 
66
- #def Parsing(parsed_text):
67
- #parsed_text=parsed_text.name
68
- #raw_party =parser.from_file(parsed_text)
69
- # raw_party = raw_party['content'],cache_examples=True
70
- # return clean(raw_party)
71
-
72
-
73
- def Parsing(parsed_text):
74
- '''
75
- Process a PDF file and extract its text content
76
- parsed_text: Can be a file object with a 'name' attribute or a file path string
77
- '''
78
- try:
79
- # Handle different input types
80
- if hasattr(parsed_text, 'name'):
81
- file_path = parsed_text.name
82
- else:
83
- file_path = parsed_text
84
-
85
- # Extract text from PDF
86
- raw_party = textract.process(file_path, encoding='ascii', method='pdfminer')
87
- return clean(raw_party)
88
- except Exception as e:
89
- print(f"Error parsing PDF: {str(e)}")
90
- return f"Error parsing PDF: {str(e)}"
91
-
92
-
93
- #Added more stopwords to avoid irrelevant terms
94
  stop_words = set(stopwords.words('english'))
95
stop_words.update(['ask', 'much', 'thank', 'etc.', 'e', 'We', 'In', 'ed', 'pa', 'This', 'also', 'A', 'fu', 'To', '5', 'ing', 'er', '2'])
96
 
97
- """## PREPROCESSING"""
 
 
 
 
 
 
 
 
 
 
 
98
 
99
  def clean_text(text):
100
- '''
101
- The function which returns clean text
102
- '''
103
- text = text.encode("ascii", errors="ignore").decode("ascii") # remove non-ASCII characters
104
- text=unidecode.unidecode(text)# diacritics remove
105
- text=contractions.fix(text) # contraction fix
106
- text = re.sub(r"\n", " ", text)
107
- text = re.sub(r"\n\n", " ", text)
108
- text = re.sub(r"\t", " ", text)
109
- text = re.sub(r"/ ", " ", text)
110
- text = text.strip(" ")
111
- text = re.sub(" +", " ", text).strip() # get rid of multiple spaces and replace with a single
112
-
113
- text = [word for word in text.split() if word not in stop_words]
114
- text = ' '.join(text)
115
- return text
116
-
117
- # text_Party=clean_text(raw_party)
118
 
119
  def Preprocess(textParty):
120
- '''
121
- Removing special characters extra spaces
122
- '''
123
- text1Party = re.sub('[^A-Za-z0-9]+', ' ', textParty)
124
- #Removing all stop words
125
- pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
126
- text2Party = pattern.sub('', text1Party)
127
- # fdist_cong = FreqDist(word_tokens_cong)
128
- return text2Party
129
-
130
-
131
-
132
-
133
-
134
- '''
135
- Using Concordance, you can see each time a word is used, along with its
136
- immediate context. It can give you a peek into how a word is being used
137
- at the sentence level and what words are used with it
138
- '''
139
- def conc(text_Party,strng):
140
- word_tokens_party = word_tokenize(text_Party)
141
- moby = Text(word_tokens_party)
142
- resultList = []
143
- for i in range(0,1):
144
- save_stdout = sys.stdout
145
- result = StringIO()
146
- sys.stdout = result
147
- moby.concordance(strng,lines=4,width=82)
148
- sys.stdout = save_stdout
149
- s=result.getvalue().splitlines()
150
- return result.getvalue()
151
-
152
- def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin = 10, right_margin = 10, numLins=4):
153
- """
154
- Function to get all the phrases that contain the target word in a text/passage tar_passage.
155
- Workaround to save the output given by nltk Concordance function
156
-
157
- str target_word, str tar_passage int left_margin int right_margin --> list of str
158
- left_margin and right_margin set the number of words/punctuation tokens kept before and after the target word
159
- Left margin will take note of the beginning of the text
160
- """
161
- # Handle empty or None search terms
162
- if not target_word or target_word.strip() == "":
163
- return "Please enter a search term"
164
-
165
- # Create list of tokens using nltk function
166
- tokens = nltk.word_tokenize(tar_passage)
167
-
168
- # Create the text of tokens
169
- text = nltk.Text(tokens)
170
-
171
- ## Collect all the index or offset position of the target word
172
- c = nltk.ConcordanceIndex(text.tokens, key = lambda s: s.lower())
173
-
174
- ## Collect the range of the words that is within the target word by using text.tokens[start;end].
175
- ## The map function is use so that when the offset position - the target range < 0, it will be default to zero
176
- concordance_txt = ([text.tokens[list(map(lambda x: x-5 if (x-left_margin)>0 else 0,[offset]))[0]:offset+right_margin] for offset in c.offsets(target_word)])
177
-
178
- ## join the sentences for each of the target phrase and return it
179
- result = [''.join([x.replace("Y","")+' ' for x in con_sub]) for con_sub in concordance_txt][:-1]
180
- result=result[:numLins+1]
181
-
182
- res='\n\n'.join(result)
183
- return res
184
-
185
 
186
- def normalize(d, target=1.0):
187
- raw = sum(d.values())
188
- factor = target/raw
189
- return {key:value*factor for key,value in d.items()}
190
-
191
-
192
- def generate_summary(text, max_length=1000):
193
- """
194
- Generate a summary of the manifesto text using Groq LLM
195
- """
196
  if not groq_client:
197
  return "Summarization is not available. Please set up your GROQ_API_KEY in the .env file."
198
-
199
- # Truncate text if it's too long to fit in context window
200
  if len(text) > 10000:
201
  text = text[:10000]
202
-
203
  try:
204
- # Use Groq's LLaMA 3 model for summarization
205
  completion = groq_client.chat.completions.create(
206
- model="llama3-8b-8192", # Using LLaMA 3 8B model
207
  messages=[
208
  {"role": "system", "content": "You are a helpful assistant that summarizes political manifestos. Provide a concise, objective summary that captures the key policy proposals, themes, and promises in the manifesto."},
209
  {"role": "user", "content": f"Please summarize the following political manifesto text in about 300-500 words, focusing on the main policy areas, promises, and themes:\n\n{text}"}
210
  ],
211
- temperature=0.3, # Lower temperature for more focused output
212
- max_tokens=800, # Limit response length
213
  )
214
-
215
  return completion.choices[0].message.content
216
  except Exception as e:
217
- return f"Error generating summary: {str(e)}. Please check your API key and connection."
218
 
219
  def fDistance(text2Party):
220
- '''
221
- Most frequent words search using TF-IDF to find more relevant words
222
- '''
223
- # Traditional frequency distribution
224
- word_tokens_party = word_tokenize(text2Party) #Tokenizing
225
- fdistance = FreqDist(word_tokens_party).most_common(10)
226
- mem={}
227
- for x in fdistance:
228
- mem[x[0]]=x[1]
229
 
230
- # Enhanced with TF-IDF for better relevance
231
- sentences = sent_tokenize(text2Party)
232
-
233
- # Use TF-IDF to find more relevant words
234
- vectorizer = TfidfVectorizer(max_features=15, stop_words='english')
235
- tfidf_matrix = vectorizer.fit_transform(sentences)
236
-
237
- # Get feature names (words)
238
- feature_names = vectorizer.get_feature_names_out()
239
-
240
- # Calculate average TF-IDF score for each word across all sentences
241
- tfidf_scores = {}
242
- for i, word in enumerate(feature_names):
243
- scores = [tfidf_matrix[j, i] for j in range(len(sentences)) if i < tfidf_matrix[j].shape[1]]
244
- if scores:
245
- tfidf_scores[word] = sum(scores) / len(scores)
246
-
247
- # Sort by score and get top words
248
- sorted_tfidf = dict(sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)[:10])
249
-
250
- # Combine traditional frequency with TF-IDF for better results
251
- combined_scores = {}
252
- for word in set(list(mem.keys()) + list(sorted_tfidf.keys())):
253
- # Normalize and combine both scores (with more weight to TF-IDF)
254
- freq_score = mem.get(word, 0) / max(mem.values()) if mem else 0
255
- tfidf_score = sorted_tfidf.get(word, 0) / max(sorted_tfidf.values()) if sorted_tfidf else 0
256
- combined_scores[word] = (freq_score * 0.3) + (tfidf_score * 0.7) # Weight TF-IDF higher
257
-
258
- # Get top 10 words by combined score
259
- top_words = dict(sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:10])
260
-
261
- return normalize(top_words)
262
-
263
- def fDistancePlot(text2Party,plotN=15):
264
- '''
265
- Most Frequent Words Visualization
266
- '''
267
- word_tokens_party = word_tokenize(text2Party) #Tokenizing
268
- fdistance = FreqDist(word_tokens_party)
269
- plt.title('Frequency Distribution')
270
- plt.axis('off')
271
- plt.figure(figsize=(4,3))
272
- fdistance.plot(plotN)
273
- plt.tight_layout()
274
- buf = BytesIO()
275
- plt.savefig(buf)
276
- buf.seek(0)
277
- img1 = np.array(Image.open(buf))
278
- plt.clf()
279
- return img1
280
-
281
-
282
- def DispersionPlot(textParty):
283
- '''
284
- Dispersion Plot
285
- '''
286
- word_tokens_party = word_tokenize(textParty) #Tokenizing
287
- moby = Text(word_tokens_party)
288
- fdistance = FreqDist(word_tokens_party)
289
- word_Lst=[]
290
- for x in range(5):
291
- word_Lst.append(fdistance.most_common(6)[x][0])
292
-
293
- plt.axis('off')
294
- plt.figure(figsize=(4,3))
295
- plt.title('Dispersion Plot')
296
- moby.dispersion_plot(word_Lst)
297
- plt.plot(color="#EF6D6D")
298
- plt.tight_layout()
299
- buf = BytesIO()
300
- plt.savefig(buf)
301
- buf.seek(0)
302
- img = Image.open(buf)
303
- plt.clf()
304
- return img
305
-
306
-
307
- def getSubjectivity(text):
308
-
309
- '''
310
- Create a function to get the subjectivity
311
- '''
312
- return TextBlob(text).sentiment.subjectivity
313
-
314
-
315
- def getPolarity(text):
316
- '''
317
- Create a function to get the polarity
318
- '''
319
- return TextBlob(text).sentiment.polarity
320
-
321
-
322
- def getAnalysis(score):
323
- if score < 0:
324
- return 'Negative'
325
- elif score == 0:
326
- return 'Neutral'
327
- else:
328
- return 'Positive'
329
- def Original_Image(path):
330
- img= cv2.imread(path)
331
- img= cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
332
- return img
333
-
334
- def Image_Processed(path):
335
- '''
336
- Reading the image file
337
- '''
338
- img= cv2.imread(path)
339
- img= cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
340
-
341
- #Thresholding
342
- ret, bw_img = cv2.threshold(img, 124, 255, cv2.THRESH_BINARY)
343
-
344
- return bw_img
345
 
346
- def word_cloud(orgIm,mask_img,text_Party_pr,maxWord=2000,colorGener=True,
347
- contCol='white',bckColor='white'):
348
- '''
349
- #Generating word cloud
350
- '''
351
- mask =mask_img
352
- # Create and generate a word cloud image:
353
- wordcloud = WordCloud(max_words=maxWord, background_color=bckColor,
354
- mask=mask,
355
- colormap='nipy_spectral_r',
356
- contour_color=contCol,
357
- width=800, height=800,
358
- margin=2,
359
- contour_width=3).generate(text_Party_pr)
360
 
361
- # create coloring from image
362
 
363
-
364
- plt.axis("off")
365
- if colorGener==True:
366
- image_colors = ImageColorGenerator(orgIm)
367
- plt.imshow(wordcloud.recolor(color_func= image_colors),interpolation="bilinear")
368
-
369
-
370
- else:
371
- plt.imshow(wordcloud)
372
-
373
-
374
-
375
-
376
- def word_cloud_generator(parsed_text_name,text_Party):
377
- parsed=parsed_text_name.lower()
378
 
379
- if 'bjp' in parsed:
380
- orgImg=Original_Image('bjpImg2.jpeg')
381
- bwImg=Image_Processed('bjpImg2.jpeg')
382
- plt.figure(figsize=(6,5))
383
- word_cloud(orgImg,bwImg,text_Party,maxWord=3000,colorGener=True,
384
- contCol='white', bckColor='black')
385
- plt.tight_layout()
386
- buf = BytesIO()
387
- plt.savefig(buf)
388
- buf.seek(0)
389
- img1 = Image.open(buf)
390
- plt.clf()
391
- return img1
392
 
393
-
394
- elif 'congress' in parsed:
395
- orgImg=Original_Image('congress3.jpeg')
396
- bwImg=Image_Processed('congress3.jpeg')
397
- plt.figure(figsize=(5,4))
398
- word_cloud(orgImg,bwImg,text_Party,maxWord=3000,colorGener=True)
399
-
400
- plt.tight_layout()
401
- buf = BytesIO()
402
- plt.savefig(buf)
403
- buf.seek(0)
404
- img2 = Image.open(buf)
405
- plt.clf()
406
- return img2
407
- #congrsMain.jpg
408
-
409
-
410
- elif 'aap' in parsed:
411
- orgImg=Original_Image('aapMain2.jpg')
412
- bwImg=Image_Processed('aapMain2.jpg')
413
- plt.figure(figsize=(5,4))
414
- word_cloud(orgImg,bwImg,text_Party,maxWord=3000,colorGener=False,contCol='black')
415
-
416
- plt.tight_layout()
417
- buf = BytesIO()
418
- plt.savefig(buf)
419
- buf.seek(0)
420
- img3 = Image.open(buf)
421
- plt.clf()
422
- return img3
423
-
424
- else :
425
- wordcloud = WordCloud(max_words=2000, background_color="white",mode="RGB").generate(text_Party)
426
- plt.figure(figsize=(5,5))
427
- plt.imshow(wordcloud, interpolation="bilinear")
428
- plt.axis("off")
429
- plt.tight_layout()
430
- buf = BytesIO()
431
- plt.savefig(buf)
432
- buf.seek(0)
433
- img4 = Image.open(buf)
434
- plt.clf()
435
- return img4
436
 
 
 
 
 
 
 
 
437
 
 
 
 
438
 
439
- '''
440
- url = "http://library.bjp.org/jspui/bitstream/123456789/2988/1/BJP-Election-english-2019.pdf"
441
- path_input = "./Bjp_Manifesto_2019.pdf"
442
- urllib.request.urlretrieve(url, filename=path_input)
443
 
444
- url="https://drive.google.com/uc?id=1BLCiy_BWilfVdrUH8kbO-44DJevwO5CG&export=download"
445
- path_input = "./Aap_Manifesto_2019.pdf"
446
- urllib.request.urlretrieve(url, filename=path_input)
 
 
447
 
448
- url="https://drive.google.com/uc?id=1HVZvTtYntl0YKLnE0cwu0CvAIRhXOv60&export=download"
449
- path_input = "./Congress_Manifesto_2019.pdf"
450
- urllib.request.urlretrieve(url, filename=path_input)
451
- '''
452
- def analysis(Manifesto, Search):
453
- '''
454
- Main analysis function that processes the manifesto and generates all outputs
455
- Manifesto: PDF file uploaded by the user
456
- Search: Search term entered by the user
457
- '''
458
- try:
459
- print(f"Analysis function called with: Manifesto={Manifesto}, Search={Search}")
460
-
461
- # Check if a file was uploaded
462
- if Manifesto is None:
463
- print("No file uploaded")
464
- return "Please upload a PDF file", {}, None, None, None, None, None, "No file uploaded"
465
-
466
- # Handle empty search term
467
- if Search is None or Search.strip() == "":
468
- Search = "government" # Default search term
469
- print(f"Using default search term: {Search}")
470
- else:
471
- print(f"Using provided search term: {Search}")
472
-
473
- # Process the uploaded PDF
474
- print(f"Processing file: {Manifesto.name if hasattr(Manifesto, 'name') else Manifesto}")
475
- raw_party = Parsing(Manifesto)
476
-
477
- # Check if parsing was successful
478
- if isinstance(raw_party, str) and raw_party.startswith("Error"):
479
- print(f"Parsing error: {raw_party}")
480
- return raw_party, {}, None, None, None, None, None, "Error generating summary due to parsing failure"
481
-
482
- print("Parsing successful, cleaning text...")
483
- text_Party = clean_text(raw_party)
484
- text_Party_processed = Preprocess(text_Party)
485
 
486
- # Generate summary using LLM
487
- print("Generating summary...")
488
- summary = generate_summary(raw_party)
489
 
490
- # Sentiment analysis
491
- print("Performing sentiment analysis...")
492
- df = pd.DataFrame(raw_party.split('\n'), columns=['Content'])
493
- df['Subjectivity'] = df['Content'].apply(getSubjectivity)
494
- df['Polarity'] = df['Content'].apply(getPolarity)
495
- df['Analysis on Polarity'] = df['Polarity'].apply(getAnalysis)
496
- df['Analysis on Subjectivity'] = df['Subjectivity'].apply(getAnalysis)
497
-
498
- # Generate sentiment analysis plot
499
- print("Generating sentiment analysis plot...")
500
- plt.title('Sentiment Analysis')
501
- plt.xlabel('Sentiment')
502
- plt.ylabel('Counts')
503
- plt.figure(figsize=(4,3))
504
- df['Analysis on Polarity'].value_counts().plot(kind ='bar',color="#FF9F45")
505
- plt.tight_layout()
506
- buf = BytesIO()
507
- plt.savefig(buf)
508
- buf.seek(0)
509
- img1 = Image.open(buf)
510
- plt.clf()
511
-
512
- # Generate subjectivity analysis plot
513
- print("Generating subjectivity analysis plot...")
514
- plt.figure(figsize=(4,3))
515
- df['Analysis on Subjectivity'].value_counts().plot(kind ='bar',color="#B667F1")
516
- plt.tight_layout()
517
- buf = BytesIO()
518
- plt.savefig(buf)
519
- buf.seek(0)
520
- img2 = Image.open(buf)
521
- plt.clf()
522
-
523
- # Generate word cloud
524
- print("Generating word cloud...")
525
- img3 = word_cloud_generator(Manifesto.name, text_Party_processed)
526
-
527
- # Generate frequency distribution and dispersion plots
528
- print("Generating frequency distribution...")
529
- fdist_Party = fDistance(text_Party_processed)
530
- img4 = fDistancePlot(text_Party_processed)
531
-
532
- print("Generating dispersion plot...")
533
- img5 = DispersionPlot(text_Party_processed)
534
-
535
- # Search for the term in the text
536
- print(f"Searching for term: {Search}")
537
- searChRes = get_all_phases_containing_tar_wrd(Search, text_Party_processed)
538
- searChRes = searChRes.replace(Search, "\u0332".join(Search))
539
-
540
- plt.close('all')
541
- print("Analysis completed successfully")
542
- return searChRes, fdist_Party, img1, img2, img3, img4, img5, summary
543
-
544
- except Exception as e:
545
- error_message = f"Error analyzing manifesto: {str(e)}"
546
- print(error_message)
547
- import traceback
548
- traceback.print_exc()
549
- # Return placeholder values in case of error
550
- return error_message, {}, None, None, None, None, None, "Error generating summary. Please check the console for details."
551
 
 
 
 
 
 
552
 
553
- Search_txt= "text"
 
554
  filePdf = "file"
555
- text = gr.Textbox(label='Context Based Search')
556
- mfw=gr.Label(label="Most Relevant Topics (LLM Enhanced)")
557
- plot1=gr.Image(label='Sentiment Analysis')
558
- plot2=gr.Image(label='Subjectivity Analysis')
559
- plot3=gr.Image(label='Word Cloud')
560
- plot4=gr.Image(label='Frequency Distribution')
561
- plot5=gr.Image(label='Dispersion Plot')
562
- summary_output = gr.Textbox(label='AI-Generated Summary', lines=10)
563
 
564
  with gr.Blocks(title='Manifesto Analysis') as demo:
565
  gr.Markdown("# Manifesto Analysis with LLM Enhancement")
566
- gr.Markdown("### Analyze political manifestos with advanced NLP and LLM techniques")
567
-
568
  with gr.Row():
569
- with gr.Column(scale=1):
570
  file_input = gr.File(label="Upload Manifesto PDF", file_types=[".pdf"])
571
  search_input = gr.Textbox(label="Search Term", placeholder="Enter a term to search in the manifesto")
572
  submit_btn = gr.Button("Analyze Manifesto")
573
-
574
  with gr.Tabs():
575
- with gr.TabItem("Summary"):
576
- summary_output
577
-
578
- with gr.TabItem("Search Results"):
579
- text
580
-
581
- with gr.TabItem("Key Topics"):
582
- mfw
583
-
584
  with gr.TabItem("Visualizations"):
585
  with gr.Row():
586
- with gr.Column(scale=1):
587
- plot3
588
- with gr.Column(scale=1):
589
- plot4
590
-
591
- with gr.Row():
592
- with gr.Column(scale=1):
593
- plot1
594
- with gr.Column(scale=1):
595
- plot2
596
-
597
  with gr.Row():
598
- plot5
599
-
 
600
  submit_btn.click(
601
  fn=analysis,
602
  inputs=[file_input, search_input],
603
- outputs=[text, mfw, plot1, plot2, plot3, plot4, plot5, summary_output]
 
604
  )
605
-
606
- # Add a debug print to verify the button is connected
607
- print("Button connected to analysis function")
608
-
609
  gr.Examples(
610
  examples=[
611
- ['Example/AAP_Manifesto_2019.pdf', 'government'],
612
- ['Example/Bjp_Manifesto_2019.pdf', 'environment'],
613
- ['Example/Congress_Manifesto_2019.pdf', 'safety']
614
  ],
615
  inputs=[file_input, search_input]
616
  )
617
 
618
- demo.launch(debug=True, share=False, show_error=True)
619
-
620
-
621
- # Old interface code replaced by the Blocks implementation above
622
- # io=gr.Interface(fn=analysis, inputs=[filePdf,Search_txt], outputs=[text,mfw,plot1,plot2,plot3,plot4,plot5], title='Manifesto Analysis',examples=[['Example/AAP_Manifesto_2019.pdf','government'],['Example/Bjp_Manifesto_2019.pdf','environment'],['Example/Congress_Manifesto_2019.pdf','safety']])
623
- # io.launch(debug=True,share=False)
624
-
 
 
 
 
 
 
1
  import random
2
  import matplotlib.pyplot as plt
3
  import nltk
4
+ from nltk.tokenize import word_tokenize, sent_tokenize
5
  from nltk.corpus import stopwords
 
6
  from nltk.stem import WordNetLemmatizer
 
 
7
  from nltk.probability import FreqDist
8
  from cleantext import clean
9
  import textract
10
  import urllib.request
11
+ from io import BytesIO
12
+ import sys
 
 
 
13
  import pandas as pd
14
  import cv2
15
  import re
16
+ from wordcloud import WordCloud, ImageColorGenerator
17
  from textblob import TextBlob
18
  from PIL import Image
19
  import os
20
  import gradio as gr
21
+ from dotenv import load_dotenv
 
 
22
  import groq
23
  import json
24
+ import traceback
+ import unidecode
+ import contractions
+ from nltk.text import Text
+ from sklearn.feature_extraction.text import TfidfVectorizer
 
 
25
  import numpy as np
26
 
27
+ # Load environment variables
28
+ load_dotenv()
29
 
30
+ # Download NLTK resources
31
+ nltk.download(['punkt', 'punkt_tab', 'stopwords', 'wordnet', 'words'])
32
 
33
+ # Initialize Groq client
34
+ groq_api_key = os.getenv("GROQ_API_KEY")
35
+ groq_client = groq.Groq(api_key=groq_api_key) if groq_api_key else None
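+ # Example .env entry read by load_dotenv() above (the key name comes from the
+ # error message used later in generate_summary; the value is a placeholder):
+ # GROQ_API_KEY=your_groq_api_key_here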
36
 
37
+ # Stopwords customization
 
38
  stop_words = set(stopwords.words('english'))
39
stop_words.update(['ask', 'much', 'thank', 'etc.', 'e', 'We', 'In', 'ed', 'pa', 'This', 'also', 'A', 'fu', 'To', '5', 'ing', 'er', '2'])
40
 
41
+ # --- Parsing & Preprocessing Functions ---
42
+ def Parsing(parsed_text):
43
+ try:
44
+ if hasattr(parsed_text, 'name'):
45
+ file_path = parsed_text.name
46
+ else:
47
+ file_path = parsed_text
48
+ raw_party = textract.process(file_path, encoding='ascii', method='pdfminer')
49
+ return clean(raw_party)
50
+ except Exception as e:
51
+ print(f"Error parsing PDF: {e}")
52
+ return f"Error parsing PDF: {e}"
53
 
54
  def clean_text(text):
55
+ text = text.encode("ascii", errors="ignore").decode("ascii")
56
+ text = unidecode.unidecode(text)
57
+ text = contractions.fix(text)
58
+ text = re.sub(r"\n", " ", text)
59
+ text = re.sub(r"\t", " ", text)
60
+ text = re.sub(r"/ ", " ", text)
61
+ text = text.strip()
62
+ text = re.sub(" +", " ", text).strip()
63
+ text = [word for word in text.split() if word not in stop_words]
64
+ return ' '.join(text)
65
 
66
  def Preprocess(textParty):
67
+ text1Party = re.sub('[^A-Za-z0-9]+', ' ', textParty)
68
+ pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
69
+ text2Party = pattern.sub('', text1Party)
70
+ return text2Party
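+ # Illustrative example (not in the original file): Preprocess("Healthcare for all, by 2025!")
+ # returns roughly "Healthcare 2025" once punctuation and lowercase stopwords are stripped.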
 
 
71
 
72
+ # --- Core Analysis Functions ---
73
+ def generate_summary(text):
 
74
  if not groq_client:
75
  return "Summarization is not available. Please set up your GROQ_API_KEY in the .env file."
 
 
76
  if len(text) > 10000:
77
  text = text[:10000]
 
78
  try:
 
79
  completion = groq_client.chat.completions.create(
80
+ model="llama3-8b-8192",
81
  messages=[
82
  {"role": "system", "content": "You are a helpful assistant that summarizes political manifestos. Provide a concise, objective summary that captures the key policy proposals, themes, and promises in the manifesto."},
83
  {"role": "user", "content": f"Please summarize the following political manifesto text in about 300-500 words, focusing on the main policy areas, promises, and themes:\n\n{text}"}
84
  ],
85
+ temperature=0.3,
86
+ max_tokens=800
87
  )
 
88
  return completion.choices[0].message.content
89
  except Exception as e:
90
+ return f"Error generating summary: {str(e)}"
91
 
92
  def fDistance(text2Party):
93
+ word_tokens_party = word_tokenize(text2Party)
94
+ fdistance = FreqDist(word_tokens_party).most_common(10)
95
+ mem = {x[0]: x[1] for x in fdistance}
 
 
 
 
 
 
96
 
97
+ vectorizer = TfidfVectorizer(max_features=15, stop_words='english')
98
+ tfidf_matrix = vectorizer.fit_transform(sent_tokenize(text2Party))
99
+ feature_names = vectorizer.get_feature_names_out()
100
+
101
+ tfidf_scores = {}
102
+ for i, word in enumerate(feature_names):
103
+ scores = [tfidf_matrix[j, i] for j in range(len(sent_tokenize(text2Party))) if i < tfidf_matrix[j].shape[1]]
104
+ if scores:
105
+ tfidf_scores[word] = sum(scores) / len(scores)
106
+
107
+ combined_scores = {}
108
+ for word in set(list(mem.keys()) + list(tfidf_scores.keys())):
109
+ freq_score = mem.get(word, 0) / max(mem.values()) if mem else 0
110
+ tfidf_score = tfidf_scores.get(word, 0) / max(tfidf_scores.values()) if tfidf_scores else 0
111
+ combined_scores[word] = (freq_score * 0.3) + (tfidf_score * 0.7)
112
+
113
+ top_words = dict(sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:10])
114
+ return normalize(top_words)
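+ # Worked example of the weighting above (illustrative): a word with a normalized
+ # frequency score of 0.5 and a normalized TF-IDF score of 1.0 gets 0.5*0.3 + 1.0*0.7 = 0.85.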
 
 
115
 
116
+ def normalize(d, target=1.0):
117
+ raw = sum(d.values())
118
+ factor = target / raw if raw != 0 else 0
119
+ return {key: value * factor for key, value in d.items()}
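+ # Retained (lightly simplified) from the previous version of app.py: analysis() below
+ # still calls this concordance-style helper, so it has to stay defined. Margins and the
+ # default line count mirror the original values.
+ def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin=10, right_margin=10, numLins=4):
+     if not target_word or target_word.strip() == "":
+         return "Please enter a search term"
+     tokens = nltk.word_tokenize(tar_passage)
+     text = nltk.Text(tokens)
+     # index every occurrence of the target word, matching case-insensitively
+     c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
+     concordance_txt = [text.tokens[max(offset - left_margin, 0):offset + right_margin]
+                        for offset in c.offsets(target_word)]
+     result = [' '.join(snippet) for snippet in concordance_txt][:numLins + 1]
+     return '\n\n'.join(result)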
 
 
120
 
121
+ # --- Visualization Functions with Error Handling ---
122
+ def safe_plot(func, *args, **kwargs):
123
+ try:
124
+ plt.clf()
125
+ func(*args, **kwargs)
126
+ buf = BytesIO()
127
+ plt.savefig(buf, format='png')
128
+ buf.seek(0)
129
+ return Image.open(buf)
130
+ except Exception as e:
131
+ print(f"Plotting error: {e}")
132
+ return None
133
 
134
+ def fDistancePlot(text2Party):
135
+ return safe_plot(lambda: FreqDist(word_tokenize(text2Party)).plot(15, title='Frequency Distribution'))
136
 
137
+ def DispersionPlot(textParty):
138
+ tokens = word_tokenize(textParty)
139
+ moby = Text(tokens)
140
+ common_words = [item[0] for item in FreqDist(tokens).most_common(5)]
141
+ return safe_plot(lambda: moby.dispersion_plot(common_words, title='Dispersion Plot'))
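+ # Note: some NLTK releases define Text.dispersion_plot(words) without a title keyword;
+ # if that raises a TypeError here, safe_plot() catches it and returns None.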
 
 
142
 
143
+ def word_cloud_generator(parsed_text_name, text_Party):
144
+ try:
145
+ parsed = parsed_text_name.lower()
146
+ if 'bjp' in parsed:
147
+ mask_path = 'bjpImg2.jpeg'
148
+ elif 'congress' in parsed:
149
+ mask_path = 'congress3.jpeg'
150
+ elif 'aap' in parsed:
151
+ mask_path = 'aapMain2.jpg'
152
+ else:
153
+ mask_path = None
154
+
155
+ if mask_path and os.path.exists(mask_path):
156
+ orgImg = Image.open(mask_path)
157
+ mask = np.array(orgImg)
158
+ wordcloud = WordCloud(max_words=3000, mask=mask).generate(text_Party)
159
+ plt.imshow(wordcloud)
160
+ else:
161
+ wordcloud = WordCloud(max_words=2000).generate(text_Party)
162
+ plt.imshow(wordcloud)
163
+ plt.axis("off")
164
+ buf = BytesIO()
165
+ plt.savefig(buf, format='png')
166
+ buf.seek(0)
167
+ return Image.open(buf)
168
+ except Exception as e:
169
+ print(f"Word cloud error: {e}")
170
+ return None
171
 
172
+ # --- Main Analysis Function ---
173
+ def analysis(Manifesto, Search):
174
+ try:
175
+ if Manifesto is None:
176
+ return "No file uploaded", {}, None, None, None, None, None, "No file uploaded"
177
+ if Search is None or Search.strip() == "":
178
+ Search = "government"
179
 
180
+ raw_party = Parsing(Manifesto)
181
+ if isinstance(raw_party, str) and raw_party.startswith("Error"):
182
+ return raw_party, {}, None, None, None, None, None, "Parsing failed"
183
 
184
+ text_Party = clean_text(raw_party)
185
+ text_Party_processed = Preprocess(text_Party)
186
+ summary = generate_summary(raw_party)
 
187
 
188
+ df = pd.DataFrame([{'Content': text_Party_processed}], columns=['Content'])
189
+ df['Subjectivity'] = df['Content'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
190
+ df['Polarity'] = df['Content'].apply(lambda x: TextBlob(x).sentiment.polarity)
191
+ df['Polarity_Label'] = df['Polarity'].apply(lambda x: 'Positive' if x > 0 else 'Negative' if x < 0 else 'Neutral')
192
+ df['Subjectivity_Label'] = df['Subjectivity'].apply(lambda x: 'High' if x > 0.5 else 'Low')
193
 
194
+ # Generate Plots with Safe Plotting
195
+ sentiment_plot = safe_plot(lambda: df['Polarity_Label'].value_counts().plot(kind='bar', color="#FF9F45", title='Sentiment Analysis'))
196
+ subjectivity_plot = safe_plot(lambda: df['Subjectivity_Label'].value_counts().plot(kind='bar', color="#B667F1", title='Subjectivity Analysis'))
197
+ freq_plot = fDistancePlot(text_Party_processed)
198
+ dispersion_plot = DispersionPlot(text_Party_processed)
199
+ wordcloud = word_cloud_generator(Manifesto.name, text_Party_processed)
200
 
201
+ fdist_Party = fDistance(text_Party_processed)
202
+ searChRes = get_all_phases_containing_tar_wrd(Search, text_Party_processed)
 
203
 
204
+ return searChRes, fdist_Party, sentiment_plot, subjectivity_plot, wordcloud, freq_plot, dispersion_plot, summary
205
 
206
+ except Exception as e:
207
+ error_msg = f"Critical error: {str(e)}"
208
+ print(error_msg)
209
+ traceback.print_exc()
210
+ return error_msg, {}, None, None, None, None, None, "Analysis failed"
211
 
212
+ # --- Gradio Interface ---
213
+ Search_txt = "text"
214
  filePdf = "file"
 
 
 
 
 
 
 
 
215
 
216
  with gr.Blocks(title='Manifesto Analysis') as demo:
217
  gr.Markdown("# Manifesto Analysis with LLM Enhancement")
 
 
218
  with gr.Row():
219
+ with gr.Column():
220
  file_input = gr.File(label="Upload Manifesto PDF", file_types=[".pdf"])
221
  search_input = gr.Textbox(label="Search Term", placeholder="Enter a term to search in the manifesto")
222
  submit_btn = gr.Button("Analyze Manifesto")
 
223
  with gr.Tabs():
224
+ with gr.TabItem("Summary"): gr.Textbox(label='AI-Generated Summary', lines=10)
225
+ with gr.TabItem("Search Results"): gr.Textbox(label='Context Based Search')
226
+ with gr.TabItem("Key Topics"): gr.Label(label="Most Relevant Topics (LLM Enhanced)")
 
 
 
 
 
 
227
  with gr.TabItem("Visualizations"):
228
  with gr.Row():
229
+ plot1 = gr.Image(label='Sentiment Analysis')
+ plot2 = gr.Image(label='Subjectivity Analysis')
 
 
 
 
 
 
 
 
 
 
230
  with gr.Row():
231
+ plot3 = gr.Image(label='Word Cloud')
+ plot4 = gr.Image(label='Frequency Distribution')
232
+ plot5 = gr.Image(label='Dispersion Plot')
233
+
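+ # The outputs passed to submit_btn.click below must be the same component objects
+ # created in the tabs above; instantiating fresh components inside the outputs list
+ # would render a second, disconnected set of widgets that never gets updated.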
234
  submit_btn.click(
235
  fn=analysis,
236
  inputs=[file_input, search_input],
237
+ outputs=[text, mfw, plot1, plot2, plot3, plot4, plot5, summary_output]
247
  )
248
+
 
 
 
249
  gr.Examples(
250
  examples=[
251
+ ["Example/AAP_Manifesto_2019.pdf", "government"],
252
+ ["Example/Bjp_Manifesto_2019.pdf", "environment"],
253
+ ["Example/Congress_Manifesto_2019.pdf", "safety"]
254
  ],
255
  inputs=[file_input, search_input]
256
  )
257
 
258
+ demo.launch(debug=True, share=False, show_error=True)