Sa-m committed on
Commit fb1ce50 · verified · Parent: 1df8bb0

Update app.py

Files changed (1)
  1. app.py +20 -416
app.py CHANGED
@@ -1,415 +1,3 @@
- # """
- # # MANIFESTO ANALYSIS
- # """
-
- # ##IMPORTING LIBRARIES
- # import random
- # import matplotlib.pyplot as plt
- # import nltk
- # from nltk.tokenize import word_tokenize,sent_tokenize
- # from nltk.corpus import stopwords
- # from nltk.stem.porter import PorterStemmer
- # from nltk.stem import WordNetLemmatizer
- # from nltk.corpus import stopwords
- # from nltk.tokenize import word_tokenize
- # from nltk.probability import FreqDist
- # from cleantext import clean
- # import textract
- # import urllib.request
- # import nltk.corpus
- # from nltk.text import Text
- # import io
- # from io import StringIO,BytesIO
- # import sys
- # import pandas as pd
- # import cv2
- # import re
- # from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
- # from textblob import TextBlob
- # from PIL import Image
- # import os
- # import gradio as gr
- # from zipfile import ZipFile
- # import contractions
- # import unidecode
-
- # nltk.download('punkt_tab')
- # nltk.download('stopwords')
- # nltk.download('punkt')
- # nltk.download('wordnet')
- # nltk.download('words')
-
-
- # """## PARSING FILES"""
-
- # #def Parsing(parsed_text):
- # #parsed_text=parsed_text.name
- # #raw_party =parser.from_file(parsed_text)
- # # raw_party = raw_party['content'],cache_examples=True
- # # return clean(raw_party)
-
-
- # def Parsing(parsed_text):
- # parsed_text=parsed_text.name
- # raw_party =textract.process(parsed_text, encoding='ascii',method='pdfminer')
- # return clean(raw_party)
-
-
- # #Added more stopwords to avoid irrelevant terms
- # stop_words = set(stopwords.words('english'))
- # stop_words.update('ask','much','thank','etc.', 'e', 'We', 'In', 'ed','pa', 'This','also', 'A', 'fu','To','5','ing', 'er', '2')
-
- # """## PREPROCESSING"""
-
- # def clean_text(text):
- # '''
- # The function which returns clean text
- # '''
- # text = text.encode("ascii", errors="ignore").decode("ascii") # remove non-asciicharacters
- # text=unidecode.unidecode(text)# diacritics remove
- # text=contractions.fix(text) # contraction fix
- # text = re.sub(r"\n", " ", text)
- # text = re.sub(r"\n\n", " ", text)
- # text = re.sub(r"\t", " ", text)
- # text = re.sub(r"/ ", " ", text)
- # text = text.strip(" ")
- # text = re.sub(" +", " ", text).strip() # get rid of multiple spaces and replace with a single
-
- # text = [word for word in text.split() if word not in stop_words]
- # text = ' '.join(text)
- # return text
-
- # # text_Party=clean_text(raw_party)
-
- # def Preprocess(textParty):
- # '''
- # Removing special characters extra spaces
- # '''
- # text1Party = re.sub('[^A-Za-z0-9]+', ' ', textParty)
- # #Removing all stop words
- # pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
- # text2Party = pattern.sub('', text1Party)
- # # fdist_cong = FreqDist(word_tokens_cong)
- # return text2Party
-
-
-
-
-
- # '''
- # Using Concordance, you can see each time a word is used, along with its
- # immediate context. It can give you a peek into how a word is being used
- # at the sentence level and what words are used with it
- # '''
- # def conc(text_Party,strng):
- # word_tokens_party = word_tokenize(text_Party)
- # moby = Text(word_tokens_party)
- # resultList = []
- # for i in range(0,1):
- # save_stdout = sys.stdout
- # result = StringIO()
- # sys.stdout = result
- # moby.concordance(strng,lines=4,width=82)
- # sys.stdout = save_stdout
- # s=result.getvalue().splitlines()
- # return result.getvalue()
-
- # def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin = 10, right_margin = 10,numLins=4):
- # """
- # Function to get all the phases that contain the target word in a text/passage tar_passage.
- # Workaround to save the output given by nltk Concordance function
-
- # str target_word, str tar_passage int left_margin int right_margin --> list of str
- # left_margin and right_margin allocate the number of words/pununciation before and after target word
- # Left margin will take note of the beginning of the text
- # """
- # ## Create list of tokens using nltk function
- # tokens = nltk.word_tokenize(tar_passage)
-
- # ## Create the text of tokens
- # text = nltk.Text(tokens)
-
- # ## Collect all the index or offset position of the target word
- # c = nltk.ConcordanceIndex(text.tokens, key = lambda s: s.lower())
-
- # ## Collect the range of the words that is within the target word by using text.tokens[start;end].
- # ## The map function is use so that when the offset position - the target range < 0, it will be default to zero
- # concordance_txt = ([text.tokens[list(map(lambda x: x-5 if (x-left_margin)>0 else 0,[offset]))[0]:offset+right_margin] for offset in c.offsets(target_word)])
-
- # ## join the sentences for each of the target phrase and return it
- # result = [''.join([x.replace("Y","")+' ' for x in con_sub]) for con_sub in concordance_txt][:-1]
- # result=result[:numLins+1]
-
- # res='\n\n'.join(result)
- # return res
-
-
- # def normalize(d, target=1.0):
- # raw = sum(d.values())
- # factor = target/raw
- # return {key:value*factor for key,value in d.items()}
-
- # def fDistance(text2Party):
- # '''
- # Most frequent words search
- # '''
- # word_tokens_party = word_tokenize(text2Party) #Tokenizing
- # fdistance = FreqDist(word_tokens_party).most_common(10)
- # mem={}
- # for x in fdistance:
- # mem[x[0]]=x[1]
- # return normalize(mem)
-
- # def fDistancePlot(text2Party,plotN=15):
- # '''
- # Most Frequent Words Visualization
- # '''
- # word_tokens_party = word_tokenize(text2Party) #Tokenizing
- # fdistance = FreqDist(word_tokens_party)
- # plt.title('Frequency Distribution')
- # plt.axis('off')
- # plt.figure(figsize=(4,3))
- # fdistance.plot(plotN)
- # plt.tight_layout()
- # buf = BytesIO()
- # plt.savefig(buf)
- # buf.seek(0)
- # img1 = Image.open(buf)
- # plt.clf()
- # return img1
-
-
- # def DispersionPlot(textParty):
- # '''
- # Dispersion PLot
- # '''
- # word_tokens_party = word_tokenize(textParty) #Tokenizing
- # moby = Text(word_tokens_party)
- # fdistance = FreqDist(word_tokens_party)
- # word_Lst=[]
- # for x in range(5):
- # word_Lst.append(fdistance.most_common(6)[x][0])
-
- # plt.axis('off')
- # plt.figure(figsize=(4,3))
- # plt.title('Dispersion Plot')
- # moby.dispersion_plot(word_Lst)
- # plt.plot(color="#EF6D6D")
- # plt.tight_layout()
- # buf = BytesIO()
- # plt.savefig(buf)
- # buf.seek(0)
- # img = Image.open(buf)
- # plt.clf()
- # return img
-
-
- # def getSubjectivity(text):
-
- # '''
- # Create a function to get the polarity
- # '''
- # return TextBlob(text).sentiment.subjectivity
-
-
- # def getPolarity(text):
- # '''
- # Create a function to get the polarity
- # '''
- # return TextBlob(text).sentiment.polarity
-
-
- # def getAnalysis(score):
- # if score < 0:
- # return 'Negative'
- # elif score == 0:
- # return 'Neutral'
- # else:
- # return 'Positive'
- # def Original_Image(path):
- # img= cv2.imread(path)
- # img= cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
- # return img
-
- # def Image_Processed(path):
- # '''
- # Reading the image file
- # '''
- # img= cv2.imread(path)
- # img= cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-
- # #Thresholding
- # ret, bw_img = cv2.threshold(img, 124, 255, cv2.THRESH_BINARY)
-
- # return bw_img
-
- # def word_cloud(orgIm,mask_img,text_Party_pr,maxWord=2000,colorGener=True,
- # contCol='white',bckColor='white'):
- # '''
- # #Generating word cloud
- # '''
- # mask =mask_img
- # # Create and generate a word cloud image:
- # wordcloud = WordCloud(max_words=maxWord, background_color=bckColor,
- # mask=mask,
- # colormap='nipy_spectral_r',
- # contour_color=contCol,
- # width=800, height=800,
- # margin=2,
- # contour_width=3).generate(text_Party_pr)
-
- # # create coloring from image
-
-
- # plt.axis("off")
- # if colorGener==True:
- # image_colors = ImageColorGenerator(orgIm)
- # plt.imshow(wordcloud.recolor(color_func= image_colors),interpolation="bilinear")
-
-
- # else:
- # plt.imshow(wordcloud)
-
-
-
-
- # def word_cloud_generator(parsed_text_name,text_Party):
- # parsed=parsed_text_name.lower()
-
- # if 'bjp' in parsed:
- # orgImg=Original_Image('bjpImg2.jpeg')
- # bwImg=Image_Processed('bjpImg2.jpeg')
- # plt.figure(figsize=(6,5))
- # word_cloud(orgImg,bwImg,text_Party,maxWord=3000,colorGener=True,
- # contCol='white', bckColor='black')
- # plt.tight_layout()
- # buf = BytesIO()
- # plt.savefig(buf)
- # buf.seek(0)
- # img1 = Image.open(buf)
- # plt.clf()
- # return img1
-
-
- # elif 'congress' in parsed:
- # orgImg=Original_Image('congress3.jpeg')
- # bwImg=Image_Processed('congress3.jpeg')
- # plt.figure(figsize=(5,4))
- # word_cloud(orgImg,bwImg,text_Party,maxWord=3000,colorGener=True)
-
- # plt.tight_layout()
- # buf = BytesIO()
- # plt.savefig(buf)
- # buf.seek(0)
- # img2 = Image.open(buf)
- # plt.clf()
- # return img2
- # #congrsMain.jpg
-
-
- # elif 'aap' in parsed:
- # orgImg=Original_Image('aapMain2.jpg')
- # bwImg=Image_Processed('aapMain2.jpg')
- # plt.figure(figsize=(5,4))
- # word_cloud(orgImg,bwImg,text_Party,maxWord=3000,colorGener=False,contCol='black')
-
- # plt.tight_layout()
- # buf = BytesIO()
- # plt.savefig(buf)
- # buf.seek(0)
- # img3 = Image.open(buf)
- # plt.clf()
- # return img3
-
- # else :
- # wordcloud = WordCloud(max_words=2000, background_color="white",mode="RGB").generate(text_Party)
- # plt.figure(figsize=(5,5))
- # plt.imshow(wordcloud, interpolation="bilinear")
- # plt.axis("off")
- # plt.tight_layout()
- # buf = BytesIO()
- # plt.savefig(buf)
- # buf.seek(0)
- # img4 = Image.open(buf)
- # plt.clf()
- # return img4
-
-
-
- # '''
- # url = "http://library.bjp.org/jspui/bitstream/123456789/2988/1/BJP-Election-english-2019.pdf"
- # path_input = "./Bjp_Manifesto_2019.pdf"
- # urllib.request.urlretrieve(url, filename=path_input)
-
- # url="https://drive.google.com/uc?id=1BLCiy_BWilfVdrUH8kbO-44DJevwO5CG&export=download"
- # path_input = "./Aap_Manifesto_2019.pdf"
- # urllib.request.urlretrieve(url, filename=path_input)
-
- # url="https://drive.google.com/uc?id=1HVZvTtYntl0YKLnE0cwu0CvAIRhXOv60&export=download"
- # path_input = "./Congress_Manifesto_2019.pdf"
- # urllib.request.urlretrieve(url, filename=path_input)
- # '''
- # def analysis(Manifesto,Search):
- # raw_party = Parsing(Manifesto)
- # text_Party=clean_text(raw_party)
- # text_Party= Preprocess(text_Party)
-
- # df = pd.DataFrame(raw_party.split('\n'), columns=['Content'])
- # df['Subjectivity'] = df['Content'].apply(getSubjectivity)
- # df['Polarity'] = df['Content'].apply(getPolarity)
- # df['Analysis on Polarity'] = df['Polarity'].apply(getAnalysis)
- # df['Analysis on Subjectivity'] = df['Subjectivity'].apply(getAnalysis)
- # plt.title('Sentiment Analysis')
- # plt.xlabel('Sentiment')
- # plt.ylabel('Counts')
- # plt.figure(figsize=(4,3))
- # df['Analysis on Polarity'].value_counts().plot(kind ='bar',color="#FF9F45")
- # plt.tight_layout()
- # buf = BytesIO()
- # plt.savefig(buf)
- # buf.seek(0)
- # img1 = Image.open(buf)
- # plt.clf()
-
- # plt.figure(figsize=(4,3))
- # df['Analysis on Subjectivity'].value_counts().plot(kind ='bar',color="#B667F1")
- # plt.tight_layout()
- # buf = BytesIO()
- # plt.savefig(buf)
- # buf.seek(0)
- # img2 = Image.open(buf)
- # plt.clf()
-
- # img3 = word_cloud_generator(Manifesto.name,text_Party)
-
- # fdist_Party=fDistance(text_Party)
- # img4=fDistancePlot(text_Party)
- # img5=DispersionPlot(text_Party)
- # #concordance(text_Party,Search)
- # searChRes=get_all_phases_containing_tar_wrd(Search,text_Party)
- # searChRes=searChRes.replace(Search,"\u0332".join(Search))
- # plt.close('all')
- # return searChRes,fdist_Party,img1,img2,img3,img4,img5
-
-
- # Search_txt= "text"
- # filePdf = "file"
- # text = gr.Textbox(label='Context Based Search')
- # mfw=gr.Label(label="Most Relevant Topics")
- # plot1=gr.Image(label='Sentiment Analysis')
- # plot2=gr.Image(label='Subjectivity Analysis')
- # plot3=gr.Image(label='Word Cloud')
- # plot4=gr.Image(label='Frequency Distribution')
- # plot5=gr.Image(label='Dispersion Plot')
-
- # io=gr.Interface(fn=analysis, inputs=[filePdf,Search_txt], outputs=[text,mfw,plot1,plot2,plot3,plot4,plot5], title='Manifesto Analysis',examples=[['Example/AAP_Manifesto_2019.pdf','government'],['Example/Bjp_Manifesto_2019.pdf','environment'],['Example/Congress_Manifesto_2019.pdf','safety']],theme='peach')
- # io.launch(debug=True,share=False)
-
-
- # #allow_screenshot=False,allow_flagging="never",
- # #examples=[['manifestos/Bjp_Manifesto_2019.pdf','modi'],['AAP_Manifesto_2019.pdf','delhi'],['manifestos/Congress_Manifesto_2019.pdf','safety']])
-
-
  """
  # MANIFESTO ANALYSIS
  """
@@ -546,7 +134,7 @@ def conc(text_Party,strng):
  s=result.getvalue().splitlines()
  return result.getvalue()
 
- def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin = 10, right_margin = 10,numLins=4):
+ def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin = 10, right_margin = 10, numLins=4):
  """
  Function to get all the phases that contain the target word in a text/passage tar_passage.
  Workaround to save the output given by nltk Concordance function
@@ -555,10 +143,14 @@ def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin = 10
  left_margin and right_margin allocate the number of words/pununciation before and after target word
  Left margin will take note of the beginning of the text
  """
- ## Create list of tokens using nltk function
+ # Handle empty or None search terms
+ if not target_word or target_word.strip() == "":
+ return "Please enter a search term"
+
+ # Create list of tokens using nltk function
  tokens = nltk.word_tokenize(tar_passage)
 
- ## Create the text of tokens
+ # Create the text of tokens
  text = nltk.Text(tokens)
 
  ## Collect all the index or offset position of the target word
@@ -849,8 +441,21 @@ def analysis(Manifesto, Search):
  Search: Search term entered by the user
  '''
  try:
+ # Check if a file was uploaded
+ if Manifesto is None:
+ return "Please upload a PDF file", {}, None, None, None, None, None, "No file uploaded"
+
+ # Handle empty search term
+ if Search is None or Search.strip() == "":
+ Search = "government" # Default search term
+
  # Process the uploaded PDF
  raw_party = Parsing(Manifesto)
+
+ # Check if parsing was successful
+ if raw_party.startswith("Error"):
+ return raw_party, {}, None, None, None, None, None, "Error generating summary due to parsing failure"
+
  text_Party = clean_text(raw_party)
  text_Party_processed = Preprocess(text_Party)
 
@@ -978,4 +583,3 @@ demo.launch(debug=True, share=False)
  # io=gr.Interface(fn=analysis, inputs=[filePdf,Search_txt], outputs=[text,mfw,plot1,plot2,plot3,plot4,plot5], title='Manifesto Analysis',examples=[['Example/AAP_Manifesto_2019.pdf','government'],['Example/Bjp_Manifesto_2019.pdf','environment'],['Example/Congress_Manifesto_2019.pdf','safety']])
  # io.launch(debug=True,share=False)
 
-
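
For reference, a minimal, self-contained sketch of the validation pattern this commit introduces in get_all_phases_containing_tar_wrd() and analysis(). The helper names below (safe_search, safe_analysis) are hypothetical stand-ins used only for illustration; the real functions in app.py additionally tokenize the passage with nltk and return the full Gradio output tuple.

def safe_search(target_word, passage):
    # Same guard the commit adds before tokenizing: reject empty or
    # whitespace-only search terms.
    if not target_word or target_word.strip() == "":
        return "Please enter a search term"
    return f"searching for {target_word!r} in {len(passage.split())} tokens"

def safe_analysis(manifesto, search):
    # Same guards the commit adds at the top of analysis(): require an
    # uploaded file and fall back to a default term when the search box is empty.
    if manifesto is None:
        return "Please upload a PDF file"
    if search is None or search.strip() == "":
        search = "government"  # default term used by the commit
    return safe_search(search, "stub passage text for the sketch")

print(safe_analysis(None, "environment"))   # -> "Please upload a PDF file"
print(safe_analysis("manifesto.pdf", "  ")) # falls back to the default term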