Update app.py
app.py
CHANGED
@@ -1,624 +1,258 @@
-"""
-# MANIFESTO ANALYSIS
-"""
-
-##IMPORTING LIBRARIES
 import random
 import matplotlib.pyplot as plt
 import nltk
-from nltk.tokenize import word_tokenize,sent_tokenize
 from nltk.corpus import stopwords
-from nltk.stem.porter import PorterStemmer
 from nltk.stem import WordNetLemmatizer
-from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize
 from nltk.probability import FreqDist
 from cleantext import clean
 import textract
 import urllib.request
-import
-
-import io
-from io import StringIO,BytesIO
-import sys
 import pandas as pd
 import cv2
 import re
-from wordcloud import WordCloud,
 from textblob import TextBlob
 from PIL import Image
 import os
 import gradio as gr
-from
-import contractions
-import unidecode
 import groq
 import json
-
-from sklearn.feature_extraction.text import TfidfVectorizer
-from collections import Counter
 import numpy as np
 
-#
-
-nltk.download('punkt_tab')
-nltk.download('stopwords')
-nltk.download('punkt')
-nltk.download('wordnet')
-nltk.download('words')
-
-# Initialize Groq client for LLM capabilities
-try:
-    groq_api_key = GROQ_API_KEY
-    if groq_api_key:
-        groq_client = groq.Groq(api_key=groq_api_key)
-    else:
-        print("Warning: GROQ_API_KEY not found in environment variables. Summarization will be disabled.")
-        groq_client = None
-except Exception as e:
-    print(f"Error initializing Groq client: {e}")
-    groq_client = None
 
-
-#
-#parsed_text=parsed_text.name
-#raw_party =parser.from_file(parsed_text)
-# raw_party = raw_party['content'],cache_examples=True
-# return clean(raw_party)
-
-def Parsing(parsed_text):
-    '''
-    Process a PDF file and extract its text content
-    parsed_text: Can be a file object with a 'name' attribute or a file path string
-    '''
-    try:
-        # Handle different input types
-        if hasattr(parsed_text, 'name'):
-            file_path = parsed_text.name
-        else:
-            file_path = parsed_text
-
-        # Extract text from PDF
-        raw_party = textract.process(file_path, encoding='ascii', method='pdfminer')
-        return clean(raw_party)
-    except Exception as e:
-        print(f"Error parsing PDF: {str(e)}")
-        return f"Error parsing PDF: {str(e)}"
-
-#Added more stopwords to avoid irrelevant terms
 stop_words = set(stopwords.words('english'))
 stop_words.update('ask','much','thank','etc.', 'e', 'We', 'In', 'ed','pa', 'This','also', 'A', 'fu','To','5','ing', 'er', '2')
 
 def clean_text(text):
-
-    text = text.strip(" ")
-    text = re.sub(" +", " ", text).strip() # get rid of multiple spaces and replace with a single
-    text = [word for word in text.split() if word not in stop_words]
-    text = ' '.join(text)
-    return text
-
-# text_Party=clean_text(raw_party)
 
 def Preprocess(textParty):
-
-    #Removing all stop words
-    pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
-    text2Party = pattern.sub('', text1Party)
-    # fdist_cong = FreqDist(word_tokens_cong)
-    return text2Party
-
-'''
-Using Concordance, you can see each time a word is used, along with its
-immediate context. It can give you a peek into how a word is being used
-at the sentence level and what words are used with it
-'''
-def conc(text_Party,strng):
-    word_tokens_party = word_tokenize(text_Party)
-    moby = Text(word_tokens_party)
-    resultList = []
-    for i in range(0,1):
-        save_stdout = sys.stdout
-        result = StringIO()
-        sys.stdout = result
-        moby.concordance(strng,lines=4,width=82)
-        sys.stdout = save_stdout
-        s=result.getvalue().splitlines()
-    return result.getvalue()
-
-def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin = 10, right_margin = 10, numLins=4):
-    """
-    Function to get all the phases that contain the target word in a text/passage tar_passage.
-    Workaround to save the output given by nltk Concordance function
-
-    str target_word, str tar_passage int left_margin int right_margin --> list of str
-    left_margin and right_margin allocate the number of words/pununciation before and after target word
-    Left margin will take note of the beginning of the text
-    """
-    # Handle empty or None search terms
-    if not target_word or target_word.strip() == "":
-        return "Please enter a search term"
-
-    # Create list of tokens using nltk function
-    tokens = nltk.word_tokenize(tar_passage)
-
-    # Create the text of tokens
-    text = nltk.Text(tokens)
-
-    ## Collect all the index or offset position of the target word
-    c = nltk.ConcordanceIndex(text.tokens, key = lambda s: s.lower())
-
-    ## Collect the range of the words that is within the target word by using text.tokens[start;end].
-    ## The map function is use so that when the offset position - the target range < 0, it will be default to zero
-    concordance_txt = ([text.tokens[list(map(lambda x: x-5 if (x-left_margin)>0 else 0,[offset]))[0]:offset+right_margin] for offset in c.offsets(target_word)])
-
-    ## join the sentences for each of the target phrase and return it
-    result = [''.join([x.replace("Y","")+' ' for x in con_sub]) for con_sub in concordance_txt][:-1]
-    result=result[:numLins+1]
-
-    res='\n\n'.join(result)
-    return res
 
-
-    factor = target/raw
-    return {key:value*factor for key,value in d.items()}
-
-def generate_summary(text, max_length=1000):
-    """
-    Generate a summary of the manifesto text using Groq LLM
-    """
     if not groq_client:
         return "Summarization is not available. Please set up your GROQ_API_KEY in the .env file."
-
-    # Truncate text if it's too long to fit in context window
     if len(text) > 10000:
         text = text[:10000]
-
     try:
-        # Use Groq's LLaMA 3 model for summarization
         completion = groq_client.chat.completions.create(
-            model="llama3-8b-8192",
             messages=[
                 {"role": "system", "content": "You are a helpful assistant that summarizes political manifestos. Provide a concise, objective summary that captures the key policy proposals, themes, and promises in the manifesto."},
                 {"role": "user", "content": f"Please summarize the following political manifesto text in about 300-500 words, focusing on the main policy areas, promises, and themes:\n\n{text}"}
             ],
-            temperature=0.3,
-            max_tokens=800
         )
         return completion.choices[0].message.content
     except Exception as e:
-        return f"Error generating summary: {str(e)}
 
 def fDistance(text2Party):
-
-    # Traditional frequency distribution
-    word_tokens_party = word_tokenize(text2Party) #Tokenizing
-    fdistance = FreqDist(word_tokens_party).most_common(10)
-    mem={}
-    for x in fdistance:
-        mem[x[0]]=x[1]
 
-
-    sorted_tfidf = dict(sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)[:10])
-
-    # Combine traditional frequency with TF-IDF for better results
-    combined_scores = {}
-    for word in set(list(mem.keys()) + list(sorted_tfidf.keys())):
-        # Normalize and combine both scores (with more weight to TF-IDF)
-        freq_score = mem.get(word, 0) / max(mem.values()) if mem else 0
-        tfidf_score = sorted_tfidf.get(word, 0) / max(sorted_tfidf.values()) if sorted_tfidf else 0
-        combined_scores[word] = (freq_score * 0.3) + (tfidf_score * 0.7) # Weight TF-IDF higher
-
-    # Get top 10 words by combined score
-    top_words = dict(sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:10])
-
-    return normalize(top_words)
-
-def fDistancePlot(text2Party,plotN=15):
-    '''
-    Most Frequent Words Visualization
-    '''
-    word_tokens_party = word_tokenize(text2Party) #Tokenizing
-    fdistance = FreqDist(word_tokens_party)
-    plt.title('Frequency Distribution')
-    plt.axis('off')
-    plt.figure(figsize=(4,3))
-    fdistance.plot(plotN)
-    plt.tight_layout()
-    buf = BytesIO()
-    plt.savefig(buf)
-    buf.seek(0)
-    img1 = np.array(Image.open(buf))
-    plt.clf()
-    return img1
-
-def DispersionPlot(textParty):
-    '''
-    Dispersion PLot
-    '''
-    word_tokens_party = word_tokenize(textParty) #Tokenizing
-    moby = Text(word_tokens_party)
-    fdistance = FreqDist(word_tokens_party)
-    word_Lst=[]
-    for x in range(5):
-        word_Lst.append(fdistance.most_common(6)[x][0])
-
-    plt.axis('off')
-    plt.figure(figsize=(4,3))
-    plt.title('Dispersion Plot')
-    moby.dispersion_plot(word_Lst)
-    plt.plot(color="#EF6D6D")
-    plt.tight_layout()
-    buf = BytesIO()
-    plt.savefig(buf)
-    buf.seek(0)
-    img = Image.open(buf)
-    plt.clf()
-    return img
-
-def getSubjectivity(text):
-    '''
-    Create a function to get the polarity
-    '''
-    return TextBlob(text).sentiment.subjectivity
-
-def getPolarity(text):
-    '''
-    Create a function to get the polarity
-    '''
-    return TextBlob(text).sentiment.polarity
-
-def getAnalysis(score):
-    if score < 0:
-        return 'Negative'
-    elif score == 0:
-        return 'Neutral'
-    else:
-        return 'Positive'
-
-def Original_Image(path):
-    img= cv2.imread(path)
-    img= cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-    return img
-
-def Image_Processed(path):
-    '''
-    Reading the image file
-    '''
-    img= cv2.imread(path)
-    img= cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-
-    #Thresholding
-    ret, bw_img = cv2.threshold(img, 124, 255, cv2.THRESH_BINARY)
-
-    return bw_img
 
-def
-
-    '''
-    mask =mask_img
-    # Create and generate a word cloud image:
-    wordcloud = WordCloud(max_words=maxWord, background_color=bckColor,
-                          mask=mask,
-                          colormap='nipy_spectral_r',
-                          contour_color=contCol,
-                          width=800, height=800,
-                          margin=2,
-                          contour_width=3).generate(text_Party_pr)
 
-
-    if colorGener==True:
-        image_colors = ImageColorGenerator(orgIm)
-        plt.imshow(wordcloud.recolor(color_func= image_colors),interpolation="bilinear")
-
-    else:
-        plt.imshow(wordcloud)
-
-def word_cloud_generator(parsed_text_name,text_Party):
-    parsed=parsed_text_name.lower()
 
-
-        contCol='white', bckColor='black')
-        plt.tight_layout()
-        buf = BytesIO()
-        plt.savefig(buf)
-        buf.seek(0)
-        img1 = Image.open(buf)
-        plt.clf()
-        return img1
 
-
-        plt.clf()
-        return img3
-
-    else :
-        wordcloud = WordCloud(max_words=2000, background_color="white",mode="RGB").generate(text_Party)
-        plt.figure(figsize=(5,5))
-        plt.imshow(wordcloud, interpolation="bilinear")
-        plt.axis("off")
-        plt.tight_layout()
-        buf = BytesIO()
-        plt.savefig(buf)
-        buf.seek(0)
-        img4 = Image.open(buf)
-        plt.clf()
-        return img4
 
-
-    urllib.request.urlretrieve(url, filename=path_input)
 
-
-    Main analysis function that processes the manifesto and generates all outputs
-    Manifesto: PDF file uploaded by the user
-    Search: Search term entered by the user
-    '''
-    try:
-        print(f"Analysis function called with: Manifesto={Manifesto}, Search={Search}")
-
-        # Check if a file was uploaded
-        if Manifesto is None:
-            print("No file uploaded")
-            return "Please upload a PDF file", {}, None, None, None, None, None, "No file uploaded"
-
-        # Handle empty search term
-        if Search is None or Search.strip() == "":
-            Search = "government" # Default search term
-            print(f"Using default search term: {Search}")
-        else:
-            print(f"Using provided search term: {Search}")
-
-        # Process the uploaded PDF
-        print(f"Processing file: {Manifesto.name if hasattr(Manifesto, 'name') else Manifesto}")
-        raw_party = Parsing(Manifesto)
-
-        # Check if parsing was successful
-        if isinstance(raw_party, str) and raw_party.startswith("Error"):
-            print(f"Parsing error: {raw_party}")
-            return raw_party, {}, None, None, None, None, None, "Error generating summary due to parsing failure"
-
-        print("Parsing successful, cleaning text...")
-        text_Party = clean_text(raw_party)
-        text_Party_processed = Preprocess(text_Party)
-
-        summary = generate_summary(raw_party)
-
-        print("Performing sentiment analysis...")
-        df = pd.DataFrame(raw_party.split('\n'), columns=['Content'])
-        df['Subjectivity'] = df['Content'].apply(getSubjectivity)
-        df['Polarity'] = df['Content'].apply(getPolarity)
-        df['Analysis on Polarity'] = df['Polarity'].apply(getAnalysis)
-        df['Analysis on Subjectivity'] = df['Subjectivity'].apply(getAnalysis)
-
-        # Generate sentiment analysis plot
-        print("Generating sentiment analysis plot...")
-        plt.title('Sentiment Analysis')
-        plt.xlabel('Sentiment')
-        plt.ylabel('Counts')
-        plt.figure(figsize=(4,3))
-        df['Analysis on Polarity'].value_counts().plot(kind ='bar',color="#FF9F45")
-        plt.tight_layout()
-        buf = BytesIO()
-        plt.savefig(buf)
-        buf.seek(0)
-        img1 = Image.open(buf)
-        plt.clf()
-
-        # Generate subjectivity analysis plot
-        print("Generating subjectivity analysis plot...")
-        plt.figure(figsize=(4,3))
-        df['Analysis on Subjectivity'].value_counts().plot(kind ='bar',color="#B667F1")
-        plt.tight_layout()
-        buf = BytesIO()
-        plt.savefig(buf)
-        buf.seek(0)
-        img2 = Image.open(buf)
-        plt.clf()
-
-        # Generate word cloud
-        print("Generating word cloud...")
-        img3 = word_cloud_generator(Manifesto.name, text_Party_processed)
-
-        # Generate frequency distribution and dispersion plots
-        print("Generating frequency distribution...")
-        fdist_Party = fDistance(text_Party_processed)
-        img4 = fDistancePlot(text_Party_processed)
-
-        print("Generating dispersion plot...")
-        img5 = DispersionPlot(text_Party_processed)
-
-        # Search for the term in the text
-        print(f"Searching for term: {Search}")
-        searChRes = get_all_phases_containing_tar_wrd(Search, text_Party_processed)
-        searChRes = searChRes.replace(Search, "\u0332".join(Search))
-
-        plt.close('all')
-        print("Analysis completed successfully")
-        return searChRes, fdist_Party, img1, img2, img3, img4, img5, summary
-
-    except Exception as e:
-        error_message = f"Error analyzing manifesto: {str(e)}"
-        print(error_message)
-        import traceback
-        traceback.print_exc()
-        # Return placeholder values in case of error
-        return error_message, {}, None, None, None, None, None, "Error generating summary. Please check the console for details."
 
 filePdf = "file"
-text = gr.Textbox(label='Context Based Search')
-mfw=gr.Label(label="Most Relevant Topics (LLM Enhanced)")
-plot1=gr.Image(label='Sentiment Analysis')
-plot2=gr.Image(label='Subjectivity Analysis')
-plot3=gr.Image(label='Word Cloud')
-plot4=gr.Image(label='Frequency Distribution')
-plot5=gr.Image(label='Dispersion Plot')
-summary_output = gr.Textbox(label='AI-Generated Summary', lines=10)
 
 with gr.Blocks(title='Manifesto Analysis') as demo:
     gr.Markdown("# Manifesto Analysis with LLM Enhancement")
-    gr.Markdown("### Analyze political manifestos with advanced NLP and LLM techniques")
-
     with gr.Row():
-        with gr.Column(
             file_input = gr.File(label="Upload Manifesto PDF", file_types=[".pdf"])
             search_input = gr.Textbox(label="Search Term", placeholder="Enter a term to search in the manifesto")
             submit_btn = gr.Button("Analyze Manifesto")
-
     with gr.Tabs():
-        with gr.TabItem("Summary"):
-
-        with gr.TabItem("Search Results"):
-            text
-
-        with gr.TabItem("Key Topics"):
-            mfw
-
         with gr.TabItem("Visualizations"):
             with gr.Row():
-                plot3
-                with gr.Column(scale=1):
-                    plot4
-
-            with gr.Row():
-                with gr.Column(scale=1):
-                    plot1
-                with gr.Column(scale=1):
-                    plot2
-
             with gr.Row():
-
     submit_btn.click(
         fn=analysis,
         inputs=[file_input, search_input],
-        outputs=[
     )
-
-    # Add a debug print to verify the button is connected
-    print("Button connected to analysis function")
-
     gr.Examples(
         examples=[
-            [
-            [
-            [
         ],
         inputs=[file_input, search_input]
     )
 
-demo.launch(debug=True, share=False, show_error=True)
-
-# Old interface code replaced by the Blocks implementation above
-# io=gr.Interface(fn=analysis, inputs=[filePdf,Search_txt], outputs=[text,mfw,plot1,plot2,plot3,plot4,plot5], title='Manifesto Analysis',examples=[['Example/AAP_Manifesto_2019.pdf','government'],['Example/Bjp_Manifesto_2019.pdf','environment'],['Example/Congress_Manifesto_2019.pdf','safety']])
-# io.launch(debug=True,share=False)
 import random
 import matplotlib.pyplot as plt
 import nltk
+from nltk.tokenize import word_tokenize, sent_tokenize
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 from nltk.probability import FreqDist
 from cleantext import clean
 import textract
 import urllib.request
+from io import BytesIO
+import sys
 import pandas as pd
 import cv2
 import re
+from wordcloud import WordCloud, ImageColorGenerator
 from textblob import TextBlob
 from PIL import Image
 import os
 import gradio as gr
+from dotenv import load_dotenv
 import groq
 import json
+import traceback
 import numpy as np
+import contractions                                           # used by clean_text()
+import unidecode                                              # used by clean_text()
+from nltk.text import Text                                    # used by DispersionPlot()
+from sklearn.feature_extraction.text import TfidfVectorizer   # used by fDistance()
 
+# Load environment variables
+load_dotenv()
 
+# Download NLTK resources
+nltk.download(['punkt', 'stopwords', 'wordnet', 'words'])
 
+# Initialize Groq client
+groq_api_key = os.getenv("GROQ_API_KEY")  # read the key loaded by load_dotenv()
+groq_client = groq.Groq(api_key=groq_api_key) if groq_api_key else None
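# A .env file alongside app.py (assumed location) is expected to provide the key, e.g.:
#   GROQ_API_KEY=your_key_here
# Without it, groq_client stays None and generate_summary() degrades gracefully.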
 
+# Stopwords customization
 stop_words = set(stopwords.words('english'))
 stop_words.update(['ask', 'much', 'thank', 'etc.', 'e', 'We', 'In', 'ed', 'pa', 'This', 'also', 'A', 'fu', 'To', '5', 'ing', 'er', '2'])  # wrapped in a list so whole words, not single characters, are added
 
+# --- Parsing & Preprocessing Functions ---
+def Parsing(parsed_text):
+    try:
+        # Accept either a Gradio file object (with .name) or a plain path string
+        if hasattr(parsed_text, 'name'):
+            file_path = parsed_text.name
+        else:
+            file_path = parsed_text
+        raw_party = textract.process(file_path, encoding='ascii', method='pdfminer')
+        return clean(raw_party)
+    except Exception as e:
+        print(f"Error parsing PDF: {e}")
+        return f"Error parsing PDF: {e}"
 
 def clean_text(text):
+    text = text.encode("ascii", errors="ignore").decode("ascii")
+    text = unidecode.unidecode(text)
+    text = contractions.fix(text)
+    text = re.sub(r"\n", " ", text)
+    text = re.sub(r"\t", " ", text)
+    text = re.sub(r"/ ", " ", text)
+    text = text.strip()
+    text = re.sub(" +", " ", text).strip()
+    text = [word for word in text.split() if word not in stop_words]
+    return ' '.join(text)
 
 def Preprocess(textParty):
+    text1Party = re.sub('[^A-Za-z0-9]+', ' ', textParty)
+    pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
+    text2Party = pattern.sub('', text1Party)
+    return text2Party
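# Example (hypothetical input, lower-case so the default NLTK stopword list applies):
#   clean_text("the party is committed\nto clean energy")  ->  "party committed clean energy"
#   Preprocess() then strips any remaining non-alphanumeric characters.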
 
+# --- Core Analysis Functions ---
+def generate_summary(text):
     if not groq_client:
         return "Summarization is not available. Please set up your GROQ_API_KEY in the .env file."
     if len(text) > 10000:
         text = text[:10000]
     try:
         completion = groq_client.chat.completions.create(
+            model="llama3-8b-8192",
             messages=[
                 {"role": "system", "content": "You are a helpful assistant that summarizes political manifestos. Provide a concise, objective summary that captures the key policy proposals, themes, and promises in the manifesto."},
                 {"role": "user", "content": f"Please summarize the following political manifesto text in about 300-500 words, focusing on the main policy areas, promises, and themes:\n\n{text}"}
             ],
+            temperature=0.3,
+            max_tokens=800
         )
         return completion.choices[0].message.content
     except Exception as e:
+        return f"Error generating summary: {str(e)}"
 
 def fDistance(text2Party):
+    word_tokens_party = word_tokenize(text2Party)
+    fdistance = FreqDist(word_tokens_party).most_common(10)
+    mem = {x[0]: x[1] for x in fdistance}
 
+    vectorizer = TfidfVectorizer(max_features=15, stop_words='english')
+    tfidf_matrix = vectorizer.fit_transform(sent_tokenize(text2Party))
+    feature_names = vectorizer.get_feature_names_out()
+
+    tfidf_scores = {}
+    for i, word in enumerate(feature_names):
+        scores = [tfidf_matrix[j, i] for j in range(len(sent_tokenize(text2Party))) if i < tfidf_matrix[j].shape[1]]
+        if scores:
+            tfidf_scores[word] = sum(scores) / len(scores)
+
+    combined_scores = {}
+    for word in set(list(mem.keys()) + list(tfidf_scores.keys())):
+        freq_score = mem.get(word, 0) / max(mem.values()) if mem else 0
+        tfidf_score = tfidf_scores.get(word, 0) / max(tfidf_scores.values()) if tfidf_scores else 0
+        combined_scores[word] = (freq_score * 0.3) + (tfidf_score * 0.7)
+
+    top_words = dict(sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:10])
+    return normalize(top_words)
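# Scoring sketch with hypothetical values: a word with the highest raw frequency
# (freq_score = 1.0) but a modest TF-IDF (tfidf_score = 0.4) scores
# 0.3 * 1.0 + 0.7 * 0.4 = 0.58, so the TF-IDF component dominates the ranking.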
 
+def normalize(d, target=1.0):
+    raw = sum(d.values())
+    factor = target / raw if raw != 0 else 0
+    return {key: value * factor for key, value in d.items()}
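# Example: normalize({'tax': 4, 'jobs': 1}) -> {'tax': 0.8, 'jobs': 0.2}
# (values rescaled to sum to the default target of 1.0)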
 
+# --- Visualization Functions with Error Handling ---
+def safe_plot(func, *args, **kwargs):
+    try:
+        plt.clf()
+        func(*args, **kwargs)
+        buf = BytesIO()
+        plt.savefig(buf, format='png')
+        buf.seek(0)
+        return Image.open(buf)
+    except Exception as e:
+        print(f"Plotting error: {e}")
+        return None
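# Usage sketch: safe_plot(lambda: plt.hist([1, 2, 2, 3])) returns a PIL Image,
# or None if the plotting callable raises.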
 
+def fDistancePlot(text2Party):
+    return safe_plot(lambda: FreqDist(word_tokenize(text2Party)).plot(15, title='Frequency Distribution'))
 
+def DispersionPlot(textParty):
+    tokens = word_tokenize(textParty)
+    moby = Text(tokens)
+    common_words = [item[0] for item in FreqDist(tokens).most_common(5)]
+    def _plot():
+        # Text.dispersion_plot() only accepts the word list, so set the title afterwards
+        moby.dispersion_plot(common_words)
+        plt.title('Dispersion Plot')
+    return safe_plot(_plot)
 
+def word_cloud_generator(parsed_text_name, text_Party):
+    try:
+        parsed = parsed_text_name.lower()
+        if 'bjp' in parsed:
+            mask_path = 'bjpImg2.jpeg'
+        elif 'congress' in parsed:
+            mask_path = 'congress3.jpeg'
+        elif 'aap' in parsed:
+            mask_path = 'aapMain2.jpg'
+        else:
+            mask_path = None
+
+        if mask_path and os.path.exists(mask_path):
+            orgImg = Image.open(mask_path)
+            mask = np.array(orgImg)
+            wordcloud = WordCloud(max_words=3000, mask=mask).generate(text_Party)
+            plt.imshow(wordcloud)
+        else:
+            wordcloud = WordCloud(max_words=2000).generate(text_Party)
+            plt.imshow(wordcloud)
+        plt.axis("off")
+        buf = BytesIO()
+        plt.savefig(buf, format='png')
+        buf.seek(0)
+        return Image.open(buf)
+    except Exception as e:
+        print(f"Word cloud error: {e}")
+        return None
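# analysis() below still calls get_all_phases_containing_tar_wrd(), which the updated
# file no longer defines. A minimal sketch, carried over (and simplified) from the
# removed version above, returning the phrases that surround the searched word:
def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin=10, right_margin=10, numLins=4):
    if not target_word or target_word.strip() == "":
        return "Please enter a search term"
    tokens = nltk.word_tokenize(tar_passage)
    text = nltk.Text(tokens)
    # Offsets of every (case-insensitive) occurrence of the target word
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
    concordance_txt = [
        text.tokens[max(offset - left_margin, 0):offset + right_margin]
        for offset in c.offsets(target_word)
    ]
    result = [' '.join(con_sub) for con_sub in concordance_txt][:numLins + 1]
    return '\n\n'.join(result)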
 
+# --- Main Analysis Function ---
+def analysis(Manifesto, Search):
+    try:
+        if Manifesto is None:
+            return "No file uploaded", {}, None, None, None, None, None, "No file uploaded"
+        if Search is None or Search.strip() == "":
+            Search = "government"
 
+        raw_party = Parsing(Manifesto)
+        if isinstance(raw_party, str) and raw_party.startswith("Error"):
+            return raw_party, {}, None, None, None, None, None, "Parsing failed"
 
+        text_Party = clean_text(raw_party)
+        text_Party_processed = Preprocess(text_Party)
+        summary = generate_summary(raw_party)
 
+        df = pd.DataFrame([{'Content': text_Party_processed}], columns=['Content'])
+        df['Subjectivity'] = df['Content'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
+        df['Polarity'] = df['Content'].apply(lambda x: TextBlob(x).sentiment.polarity)
+        df['Polarity_Label'] = df['Polarity'].apply(lambda x: 'Positive' if x > 0 else 'Negative' if x < 0 else 'Neutral')
+        df['Subjectivity_Label'] = df['Subjectivity'].apply(lambda x: 'High' if x > 0.5 else 'Low')
 
+        # Generate Plots with Safe Plotting
+        sentiment_plot = safe_plot(lambda: df['Polarity_Label'].value_counts().plot(kind='bar', color="#FF9F45", title='Sentiment Analysis'))
+        subjectivity_plot = safe_plot(lambda: df['Subjectivity_Label'].value_counts().plot(kind='bar', color="#B667F1", title='Subjectivity Analysis'))
+        freq_plot = fDistancePlot(text_Party_processed)
+        dispersion_plot = DispersionPlot(text_Party_processed)
+        wordcloud = word_cloud_generator(Manifesto.name, text_Party_processed)
 
+        fdist_Party = fDistance(text_Party_processed)
+        searChRes = get_all_phases_containing_tar_wrd(Search, text_Party_processed)
 
+        return searChRes, fdist_Party, sentiment_plot, subjectivity_plot, wordcloud, freq_plot, dispersion_plot, summary
 
+    except Exception as e:
+        error_msg = f"Critical error: {str(e)}"
+        print(error_msg)
+        traceback.print_exc()
+        return error_msg, {}, None, None, None, None, None, "Analysis failed"
 
+# --- Gradio Interface ---
+Search_txt = "text"
 filePdf = "file"
 
 with gr.Blocks(title='Manifesto Analysis') as demo:
     gr.Markdown("# Manifesto Analysis with LLM Enhancement")
     with gr.Row():
+        with gr.Column():
             file_input = gr.File(label="Upload Manifesto PDF", file_types=[".pdf"])
             search_input = gr.Textbox(label="Search Term", placeholder="Enter a term to search in the manifesto")
             submit_btn = gr.Button("Analyze Manifesto")
     with gr.Tabs():
+        with gr.TabItem("Summary"): gr.Textbox(label='AI-Generated Summary', lines=10)
+        with gr.TabItem("Search Results"): gr.Textbox(label='Context Based Search')
+        with gr.TabItem("Key Topics"): gr.Label(label="Most Relevant Topics (LLM Enhanced)")
         with gr.TabItem("Visualizations"):
             with gr.Row():
+                gr.Image(label='Sentiment Analysis'), gr.Image(label='Subjectivity Analysis')
             with gr.Row():
+                gr.Image(label='Word Cloud'), gr.Image(label='Frequency Distribution')
+                gr.Image(label='Dispersion Plot')
+
     submit_btn.click(
         fn=analysis,
         inputs=[file_input, search_input],
+        outputs=[
+            gr.Textbox(label='Context Based Search'),
+            gr.Label(label="Most Relevant Topics (LLM Enhanced)"),
+            gr.Image(label='Sentiment Analysis'),
+            gr.Image(label='Subjectivity Analysis'),
+            gr.Image(label='Word Cloud'),
+            gr.Image(label='Frequency Distribution'),
+            gr.Image(label='Dispersion Plot'),
+            gr.Textbox(label='AI-Generated Summary', lines=10)
+        ]
     )
+
     gr.Examples(
         examples=[
+            ["Example/AAP_Manifesto_2019.pdf", "government"],
+            ["Example/Bjp_Manifesto_2019.pdf", "environment"],
+            ["Example/Congress_Manifesto_2019.pdf", "safety"]
         ],
         inputs=[file_input, search_input]
     )
 
+demo.launch(debug=True, share=False, show_error=True)