Update app.py
app.py
CHANGED
@@ -1,3 +1,416 @@
+# """
+# # MANIFESTO ANALYSIS
+# """
+
+# ##IMPORTING LIBRARIES
+# import random
+# import matplotlib.pyplot as plt
+# import nltk
+# from nltk.tokenize import word_tokenize,sent_tokenize
+# from nltk.corpus import stopwords
+# from nltk.stem.porter import PorterStemmer
+# from nltk.stem import WordNetLemmatizer
+# from nltk.corpus import stopwords
+# from nltk.tokenize import word_tokenize
+# from nltk.probability import FreqDist
+# from cleantext import clean
+# import textract
+# import urllib.request
+# import nltk.corpus
+# from nltk.text import Text
+# import io
+# from io import StringIO,BytesIO
+# import sys
+# import pandas as pd
+# import cv2
+# import re
+# from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
+# from textblob import TextBlob
+# from PIL import Image
+# import os
+# import gradio as gr
+# from zipfile import ZipFile
+# import contractions
+# import unidecode
+
+# nltk.download('punkt_tab')
+# nltk.download('stopwords')
+# nltk.download('punkt')
+# nltk.download('wordnet')
+# nltk.download('words')
+
+
+# """## PARSING FILES"""
+
+# #def Parsing(parsed_text):
+# #parsed_text=parsed_text.name
+# #raw_party =parser.from_file(parsed_text)
+# # raw_party = raw_party['content'],cache_examples=True
+# # return clean(raw_party)
+
+
+# def Parsing(parsed_text):
+# parsed_text=parsed_text.name
+# raw_party =textract.process(parsed_text, encoding='ascii',method='pdfminer')
+# return clean(raw_party)
+
+
+# #Added more stopwords to avoid irrelevant terms
+# stop_words = set(stopwords.words('english'))
+# stop_words.update('ask','much','thank','etc.', 'e', 'We', 'In', 'ed','pa', 'This','also', 'A', 'fu','To','5','ing', 'er', '2')
+
+# """## PREPROCESSING"""
+
+# def clean_text(text):
+# '''
+# The function which returns clean text
+# '''
+# text = text.encode("ascii", errors="ignore").decode("ascii") # remove non-asciicharacters
+# text=unidecode.unidecode(text)# diacritics remove
+# text=contractions.fix(text) # contraction fix
+# text = re.sub(r"\n", " ", text)
+# text = re.sub(r"\n\n", " ", text)
+# text = re.sub(r"\t", " ", text)
+# text = re.sub(r"/ ", " ", text)
+# text = text.strip(" ")
+# text = re.sub(" +", " ", text).strip() # get rid of multiple spaces and replace with a single
+
+# text = [word for word in text.split() if word not in stop_words]
+# text = ' '.join(text)
+# return text
+
+# # text_Party=clean_text(raw_party)
+
+# def Preprocess(textParty):
+# '''
+# Removing special characters extra spaces
+# '''
+# text1Party = re.sub('[^A-Za-z0-9]+', ' ', textParty)
+# #Removing all stop words
+# pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
+# text2Party = pattern.sub('', text1Party)
+# # fdist_cong = FreqDist(word_tokens_cong)
+# return text2Party
+
+
+
+
+
+# '''
+# Using Concordance, you can see each time a word is used, along with its
+# immediate context. It can give you a peek into how a word is being used
+# at the sentence level and what words are used with it
+# '''
+# def conc(text_Party,strng):
+# word_tokens_party = word_tokenize(text_Party)
+# moby = Text(word_tokens_party)
+# resultList = []
+# for i in range(0,1):
+# save_stdout = sys.stdout
+# result = StringIO()
+# sys.stdout = result
+# moby.concordance(strng,lines=4,width=82)
+# sys.stdout = save_stdout
+# s=result.getvalue().splitlines()
+# return result.getvalue()
+
+# def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin = 10, right_margin = 10,numLins=4):
+# """
+# Function to get all the phases that contain the target word in a text/passage tar_passage.
+# Workaround to save the output given by nltk Concordance function
+
+# str target_word, str tar_passage int left_margin int right_margin --> list of str
+# left_margin and right_margin allocate the number of words/pununciation before and after target word
+# Left margin will take note of the beginning of the text
+# """
+# ## Create list of tokens using nltk function
+# tokens = nltk.word_tokenize(tar_passage)
+
+# ## Create the text of tokens
+# text = nltk.Text(tokens)
+
+# ## Collect all the index or offset position of the target word
+# c = nltk.ConcordanceIndex(text.tokens, key = lambda s: s.lower())
+
+# ## Collect the range of the words that is within the target word by using text.tokens[start;end].
+# ## The map function is use so that when the offset position - the target range < 0, it will be default to zero
+# concordance_txt = ([text.tokens[list(map(lambda x: x-5 if (x-left_margin)>0 else 0,[offset]))[0]:offset+right_margin] for offset in c.offsets(target_word)])
+
+# ## join the sentences for each of the target phrase and return it
+# result = [''.join([x.replace("Y","")+' ' for x in con_sub]) for con_sub in concordance_txt][:-1]
+# result=result[:numLins+1]
+
+# res='\n\n'.join(result)
+# return res
+
+
+# def normalize(d, target=1.0):
+# raw = sum(d.values())
+# factor = target/raw
+# return {key:value*factor for key,value in d.items()}
+
+# def fDistance(text2Party):
+# '''
+# Most frequent words search
+# '''
+# word_tokens_party = word_tokenize(text2Party) #Tokenizing
+# fdistance = FreqDist(word_tokens_party).most_common(10)
+# mem={}
+# for x in fdistance:
+# mem[x[0]]=x[1]
+# return normalize(mem)
+
+# def fDistancePlot(text2Party,plotN=15):
+# '''
+# Most Frequent Words Visualization
+# '''
+# word_tokens_party = word_tokenize(text2Party) #Tokenizing
+# fdistance = FreqDist(word_tokens_party)
+# plt.title('Frequency Distribution')
+# plt.axis('off')
+# plt.figure(figsize=(4,3))
+# fdistance.plot(plotN)
+# plt.tight_layout()
+# buf = BytesIO()
+# plt.savefig(buf)
+# buf.seek(0)
+# img1 = Image.open(buf)
+# plt.clf()
+# return img1
+
+
+# def DispersionPlot(textParty):
+# '''
+# Dispersion PLot
+# '''
+# word_tokens_party = word_tokenize(textParty) #Tokenizing
+# moby = Text(word_tokens_party)
+# fdistance = FreqDist(word_tokens_party)
+# word_Lst=[]
+# for x in range(5):
+# word_Lst.append(fdistance.most_common(6)[x][0])
+
+# plt.axis('off')
+# plt.figure(figsize=(4,3))
+# plt.title('Dispersion Plot')
+# moby.dispersion_plot(word_Lst)
+# plt.plot(color="#EF6D6D")
+# plt.tight_layout()
+# buf = BytesIO()
+# plt.savefig(buf)
+# buf.seek(0)
+# img = Image.open(buf)
+# plt.clf()
+# return img
+
+
+# def getSubjectivity(text):
+
+# '''
+# Create a function to get the polarity
+# '''
+# return TextBlob(text).sentiment.subjectivity
+
+
+# def getPolarity(text):
+# '''
+# Create a function to get the polarity
+# '''
+# return TextBlob(text).sentiment.polarity
+
+
+# def getAnalysis(score):
+# if score < 0:
+# return 'Negative'
+# elif score == 0:
+# return 'Neutral'
+# else:
+# return 'Positive'
+# def Original_Image(path):
+# img= cv2.imread(path)
+# img= cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+# return img
+
+# def Image_Processed(path):
+# '''
+# Reading the image file
+# '''
+# img= cv2.imread(path)
+# img= cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+# #Thresholding
+# ret, bw_img = cv2.threshold(img, 124, 255, cv2.THRESH_BINARY)
+
+# return bw_img
+
+# def word_cloud(orgIm,mask_img,text_Party_pr,maxWord=2000,colorGener=True,
+# contCol='white',bckColor='white'):
+# '''
+# #Generating word cloud
+# '''
+# mask =mask_img
+# # Create and generate a word cloud image:
+# wordcloud = WordCloud(max_words=maxWord, background_color=bckColor,
+# mask=mask,
+# colormap='nipy_spectral_r',
+# contour_color=contCol,
+# width=800, height=800,
+# margin=2,
+# contour_width=3).generate(text_Party_pr)
+
+# # create coloring from image
+
+
+# plt.axis("off")
+# if colorGener==True:
+# image_colors = ImageColorGenerator(orgIm)
+# plt.imshow(wordcloud.recolor(color_func= image_colors),interpolation="bilinear")
+
+
+# else:
+# plt.imshow(wordcloud)
+
+
+
+
+# def word_cloud_generator(parsed_text_name,text_Party):
+# parsed=parsed_text_name.lower()
+
+# if 'bjp' in parsed:
+# orgImg=Original_Image('bjpImg2.jpeg')
+# bwImg=Image_Processed('bjpImg2.jpeg')
+# plt.figure(figsize=(6,5))
+# word_cloud(orgImg,bwImg,text_Party,maxWord=3000,colorGener=True,
+# contCol='white', bckColor='black')
+# plt.tight_layout()
+# buf = BytesIO()
+# plt.savefig(buf)
+# buf.seek(0)
+# img1 = Image.open(buf)
+# plt.clf()
+# return img1
+
+
+# elif 'congress' in parsed:
+# orgImg=Original_Image('congress3.jpeg')
+# bwImg=Image_Processed('congress3.jpeg')
+# plt.figure(figsize=(5,4))
+# word_cloud(orgImg,bwImg,text_Party,maxWord=3000,colorGener=True)
+
+# plt.tight_layout()
+# buf = BytesIO()
+# plt.savefig(buf)
+# buf.seek(0)
+# img2 = Image.open(buf)
+# plt.clf()
+# return img2
+# #congrsMain.jpg
+
+
+# elif 'aap' in parsed:
+# orgImg=Original_Image('aapMain2.jpg')
+# bwImg=Image_Processed('aapMain2.jpg')
+# plt.figure(figsize=(5,4))
+# word_cloud(orgImg,bwImg,text_Party,maxWord=3000,colorGener=False,contCol='black')
+
+# plt.tight_layout()
+# buf = BytesIO()
+# plt.savefig(buf)
+# buf.seek(0)
+# img3 = Image.open(buf)
+# plt.clf()
+# return img3
+
+# else :
+# wordcloud = WordCloud(max_words=2000, background_color="white",mode="RGB").generate(text_Party)
+# plt.figure(figsize=(5,5))
+# plt.imshow(wordcloud, interpolation="bilinear")
+# plt.axis("off")
+# plt.tight_layout()
+# buf = BytesIO()
+# plt.savefig(buf)
+# buf.seek(0)
+# img4 = Image.open(buf)
+# plt.clf()
+# return img4
+
+
+
+# '''
+# url = "http://library.bjp.org/jspui/bitstream/123456789/2988/1/BJP-Election-english-2019.pdf"
+# path_input = "./Bjp_Manifesto_2019.pdf"
+# urllib.request.urlretrieve(url, filename=path_input)
+
+# url="https://drive.google.com/uc?id=1BLCiy_BWilfVdrUH8kbO-44DJevwO5CG&export=download"
+# path_input = "./Aap_Manifesto_2019.pdf"
+# urllib.request.urlretrieve(url, filename=path_input)
+
+# url="https://drive.google.com/uc?id=1HVZvTtYntl0YKLnE0cwu0CvAIRhXOv60&export=download"
+# path_input = "./Congress_Manifesto_2019.pdf"
+# urllib.request.urlretrieve(url, filename=path_input)
+# '''
+# def analysis(Manifesto,Search):
+# raw_party = Parsing(Manifesto)
+# text_Party=clean_text(raw_party)
+# text_Party= Preprocess(text_Party)
+
+# df = pd.DataFrame(raw_party.split('\n'), columns=['Content'])
+# df['Subjectivity'] = df['Content'].apply(getSubjectivity)
+# df['Polarity'] = df['Content'].apply(getPolarity)
+# df['Analysis on Polarity'] = df['Polarity'].apply(getAnalysis)
+# df['Analysis on Subjectivity'] = df['Subjectivity'].apply(getAnalysis)
+# plt.title('Sentiment Analysis')
+# plt.xlabel('Sentiment')
+# plt.ylabel('Counts')
+# plt.figure(figsize=(4,3))
+# df['Analysis on Polarity'].value_counts().plot(kind ='bar',color="#FF9F45")
+# plt.tight_layout()
+# buf = BytesIO()
+# plt.savefig(buf)
+# buf.seek(0)
+# img1 = Image.open(buf)
+# plt.clf()
+
+# plt.figure(figsize=(4,3))
+# df['Analysis on Subjectivity'].value_counts().plot(kind ='bar',color="#B667F1")
+# plt.tight_layout()
+# buf = BytesIO()
+# plt.savefig(buf)
+# buf.seek(0)
+# img2 = Image.open(buf)
+# plt.clf()
+
+# img3 = word_cloud_generator(Manifesto.name,text_Party)
+
+# fdist_Party=fDistance(text_Party)
+# img4=fDistancePlot(text_Party)
+# img5=DispersionPlot(text_Party)
+# #concordance(text_Party,Search)
+# searChRes=get_all_phases_containing_tar_wrd(Search,text_Party)
+# searChRes=searChRes.replace(Search,"\u0332".join(Search))
+# plt.close('all')
+# return searChRes,fdist_Party,img1,img2,img3,img4,img5
+
+
+# Search_txt= "text"
+# filePdf = "file"
+# text = gr.Textbox(label='Context Based Search')
+# mfw=gr.Label(label="Most Relevant Topics")
+# plot1=gr.Image(label='Sentiment Analysis')
+# plot2=gr.Image(label='Subjectivity Analysis')
+# plot3=gr.Image(label='Word Cloud')
+# plot4=gr.Image(label='Frequency Distribution')
+# plot5=gr.Image(label='Dispersion Plot')
+
+# io=gr.Interface(fn=analysis, inputs=[filePdf,Search_txt], outputs=[text,mfw,plot1,plot2,plot3,plot4,plot5], title='Manifesto Analysis',examples=[['Example/AAP_Manifesto_2019.pdf','government'],['Example/Bjp_Manifesto_2019.pdf','environment'],['Example/Congress_Manifesto_2019.pdf','safety']],theme='peach')
+# io.launch(debug=True,share=False)
+
+
+# #allow_screenshot=False,allow_flagging="never",
+# #examples=[['manifestos/Bjp_Manifesto_2019.pdf','modi'],['AAP_Manifesto_2019.pdf','delhi'],['manifestos/Congress_Manifesto_2019.pdf','safety']])
+
+
+
 """
 # MANIFESTO ANALYSIS
 """
@@ -32,6 +445,14 @@ import gradio as gr
 from zipfile import ZipFile
 import contractions
 import unidecode
+import groq
+import json
+from dotenv import load_dotenv
+from sklearn.feature_extraction.text import TfidfVectorizer
+from collections import Counter
+
+# Load environment variables from .env file
+load_dotenv()
 
 nltk.download('punkt_tab')
 nltk.download('stopwords')
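Note on dependencies (not part of this diff): the new imports above pull in packages beyond the original Space's set. Assuming a standard pip-based Space, the requirements file would likely need entries along these lines; the exact names below are assumptions, not taken from the commit:

    groq
    python-dotenv
    scikit-learn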
@@ -39,6 +460,18 @@ nltk.download('punkt')
 nltk.download('wordnet')
 nltk.download('words')
 
+# Initialize Groq client for LLM capabilities
+try:
+    groq_api_key = os.getenv("GROQ_API_KEY")
+    if groq_api_key:
+        groq_client = groq.Groq(api_key=groq_api_key)
+    else:
+        print("Warning: GROQ_API_KEY not found in environment variables. Summarization will be disabled.")
+        groq_client = None
+except Exception as e:
+    print(f"Error initializing Groq client: {e}")
+    groq_client = None
+
 
 """## PARSING FILES"""
 
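A minimal local-setup sketch for the client initialized above (not part of this diff): load_dotenv() reads a .env file next to app.py, so the key would typically be supplied as shown below, with the value being a placeholder:

    # .env (hypothetical example; never commit a real key)
    GROQ_API_KEY=your_groq_api_key_here

Without it, groq_client stays None and the summarization path added further down is disabled.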
@@ -149,16 +582,77 @@ def normalize(d, target=1.0):
     factor = target/raw
     return {key:value*factor for key,value in d.items()}
 
+
+def generate_summary(text, max_length=1000):
+    """
+    Generate a summary of the manifesto text using Groq LLM
+    """
+    if not groq_client:
+        return "Summarization is not available. Please set up your GROQ_API_KEY in the .env file."
+
+    # Truncate text if it's too long to fit in context window
+    if len(text) > 10000:
+        text = text[:10000]
+
+    try:
+        # Use Groq's LLaMA 3 model for summarization
+        completion = groq_client.chat.completions.create(
+            model="llama3-8b-8192",  # Using LLaMA 3 8B model
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant that summarizes political manifestos. Provide a concise, objective summary that captures the key policy proposals, themes, and promises in the manifesto."},
+                {"role": "user", "content": f"Please summarize the following political manifesto text in about 300-500 words, focusing on the main policy areas, promises, and themes:\n\n{text}"}
+            ],
+            temperature=0.3,  # Lower temperature for more focused output
+            max_tokens=800,  # Limit response length
+        )
+
+        return completion.choices[0].message.content
+    except Exception as e:
+        return f"Error generating summary: {str(e)}. Please check your API key and connection."
+
 def fDistance(text2Party):
     '''
-    Most frequent words search
+    Most frequent words search using TF-IDF to find more relevant words
     '''
+    # Traditional frequency distribution
     word_tokens_party = word_tokenize(text2Party) #Tokenizing
     fdistance = FreqDist(word_tokens_party).most_common(10)
     mem={}
     for x in fdistance:
         mem[x[0]]=x[1]
-    return normalize(mem)
+
+    # Enhanced with TF-IDF for better relevance
+    sentences = sent_tokenize(text2Party)
+
+    # Use TF-IDF to find more relevant words
+    vectorizer = TfidfVectorizer(max_features=15, stop_words='english')
+    tfidf_matrix = vectorizer.fit_transform(sentences)
+
+    # Get feature names (words)
+    feature_names = vectorizer.get_feature_names_out()
+
+    # Calculate average TF-IDF score for each word across all sentences
+    tfidf_scores = {}
+    for i, word in enumerate(feature_names):
+        scores = [tfidf_matrix[j, i] for j in range(len(sentences)) if i < tfidf_matrix[j].shape[1]]
+        if scores:
+            tfidf_scores[word] = sum(scores) / len(scores)
+
+    # Sort by score and get top words
+    sorted_tfidf = dict(sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)[:10])
+
+    # Combine traditional frequency with TF-IDF for better results
+    combined_scores = {}
+    for word in set(list(mem.keys()) + list(sorted_tfidf.keys())):
+        # Normalize and combine both scores (with more weight to TF-IDF)
+        freq_score = mem.get(word, 0) / max(mem.values()) if mem else 0
+        tfidf_score = sorted_tfidf.get(word, 0) / max(sorted_tfidf.values()) if sorted_tfidf else 0
+        combined_scores[word] = (freq_score * 0.3) + (tfidf_score * 0.7)  # Weight TF-IDF higher
+
+    # Get top 10 words by combined score
+    top_words = dict(sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:10])
+
+    return normalize(top_words)
 
 def fDistancePlot(text2Party,plotN=15):
     '''
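The reworked fDistance above blends raw word frequency with mean TF-IDF scores (weighted 0.3/0.7). A self-contained sketch of that blending idea on toy sentences (illustrative only, not the commit's code; the sentences and weights are assumptions):

    from collections import Counter
    from sklearn.feature_extraction.text import TfidfVectorizer

    sentences = ["health and jobs", "jobs for youth", "clean water and health"]
    freq = Counter(" ".join(sentences).split())                 # raw counts
    vec = TfidfVectorizer(stop_words='english')
    tfidf = vec.fit_transform(sentences)                        # sentence x term matrix
    avg = dict(zip(vec.get_feature_names_out(), tfidf.mean(axis=0).A1))
    blend = {w: 0.3 * freq.get(w, 0) / max(freq.values()) + 0.7 * avg.get(w, 0) / max(avg.values())
             for w in set(freq) | set(avg)}
    print(sorted(blend, key=blend.get, reverse=True)[:3])       # top blended terms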
@@ -352,7 +846,10 @@ urllib.request.urlretrieve(url, filename=path_input)
 def analysis(Manifesto,Search):
     raw_party = Parsing(Manifesto)
     text_Party=clean_text(raw_party)
-    text_Party= Preprocess(text_Party)
+    text_Party_processed = Preprocess(text_Party)
+
+    # Generate summary using LLM
+    summary = generate_summary(raw_party)
 
     df = pd.DataFrame(raw_party.split('\n'), columns=['Content'])
     df['Subjectivity'] = df['Content'].apply(getSubjectivity)
@@ -380,30 +877,81 @@ def analysis(Manifesto,Search):
     img2 = Image.open(buf)
     plt.clf()
 
-    img3 = word_cloud_generator(Manifesto.name,text_Party)
+    img3 = word_cloud_generator(Manifesto.name,text_Party_processed)
+
+    fdist_Party=fDistance(text_Party_processed)
+    img4=fDistancePlot(text_Party_processed)
+    img5=DispersionPlot(text_Party_processed)
 
-    fdist_Party=fDistance(text_Party)
-    img4=fDistancePlot(text_Party)
-    img5=DispersionPlot(text_Party)
-    #concordance(text_Party,Search)
-    searChRes=get_all_phases_containing_tar_wrd(Search,text_Party)
+    searChRes=get_all_phases_containing_tar_wrd(Search,text_Party_processed)
     searChRes=searChRes.replace(Search,"\u0332".join(Search))
     plt.close('all')
-    return searChRes,fdist_Party,img1,img2,img3,img4,img5
+    return searChRes,fdist_Party,img1,img2,img3,img4,img5,summary
 
 
 Search_txt= "text"
 filePdf = "file"
 text = gr.Textbox(label='Context Based Search')
-mfw=gr.Label(label="Most Relevant Topics")
+mfw=gr.Label(label="Most Relevant Topics (LLM Enhanced)")
 plot1=gr.Image(label='Sentiment Analysis')
 plot2=gr.Image(label='Subjectivity Analysis')
 plot3=gr.Image(label='Word Cloud')
 plot4=gr.Image(label='Frequency Distribution')
 plot5=gr.Image(label='Dispersion Plot')
+summary_output = gr.Textbox(label='AI-Generated Summary', lines=10)
 
-io=gr.Interface(fn=analysis, inputs=[filePdf,Search_txt], outputs=[text,mfw,plot1,plot2,plot3,plot4,plot5], title='Manifesto Analysis',examples=[['Example/AAP_Manifesto_2019.pdf','government'],['Example/Bjp_Manifesto_2019.pdf','environment'],['Example/Congress_Manifesto_2019.pdf','safety']],theme='peach')
-io.launch(debug=True,share=False)
+with gr.Blocks(title='Manifesto Analysis', theme='peach') as demo:
+    gr.Markdown("# Manifesto Analysis with LLM Enhancement")
+    gr.Markdown("### Analyze political manifestos with advanced NLP and LLM techniques")
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            file_input = gr.File(label="Upload Manifesto PDF", file_types=[".pdf"])
+            search_input = gr.Textbox(label="Search Term", placeholder="Enter a term to search in the manifesto")
+            submit_btn = gr.Button("Analyze Manifesto")
+
+    with gr.Tabs():
+        with gr.TabItem("Summary"):
+            summary_output
+
+        with gr.TabItem("Search Results"):
+            text
+
+        with gr.TabItem("Key Topics"):
+            mfw
+
+        with gr.TabItem("Visualizations"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    plot3
+                with gr.Column(scale=1):
+                    plot4
+
+            with gr.Row():
+                with gr.Column(scale=1):
+                    plot1
+                with gr.Column(scale=1):
+                    plot2
+
+            with gr.Row():
+                plot5
+
+    submit_btn.click(
+        fn=analysis,
+        inputs=[file_input, search_input],
+        outputs=[text, mfw, plot1, plot2, plot3, plot4, plot5, summary_output]
+    )
+
+    gr.Examples(
+        examples=[
+            ['Example/AAP_Manifesto_2019.pdf', 'government'],
+            ['Example/Bjp_Manifesto_2019.pdf', 'environment'],
+            ['Example/Congress_Manifesto_2019.pdf', 'safety']
+        ],
+        inputs=[file_input, search_input]
+    )
+
+demo.launch(debug=True, share=False)
 
 
 #allow_screenshot=False,allow_flagging="never",
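One caveat on the Blocks layout added above (an observation about the committed code, not something this diff addresses): summary_output, text, mfw and the plot components are instantiated before the `with gr.Blocks()` context, so the bare references inside the tabs may not actually place them on the page. In recent Gradio versions the usual pattern is to render such pre-built components explicitly, roughly:

    with gr.TabItem("Summary"):
        summary_output.render()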