""" | |
# MANIFESTO ANALYSIS | |
""" | |
## IMPORTING LIBRARIES
import random
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from cleantext import clean
import textract
import urllib.request
import nltk.corpus
from nltk.text import Text
import io
from io import StringIO, BytesIO
import sys
import pandas as pd
import cv2
import re
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from textblob import TextBlob
from PIL import Image
import os
import gradio as gr
from zipfile import ZipFile
import contractions
import unidecode
import groq
import json
from dotenv import load_dotenv
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import numpy as np
# Load environment variables from .env file
load_dotenv()

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('words')

# Initialize Groq client for LLM capabilities
try:
    groq_api_key = os.getenv("GROQ_API_KEY")
    if groq_api_key:
        groq_client = groq.Groq(api_key=groq_api_key)
    else:
        print("Warning: GROQ_API_KEY not found in environment variables. Summarization will be disabled.")
        groq_client = None
except Exception as e:
    print(f"Error initializing Groq client: {e}")
    groq_client = None
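# The client above expects the API key in a local .env file read by load_dotenv().
# A minimal sketch of that file (the value is a placeholder, not a real credential):
#
#     GROQ_API_KEY=your_groq_api_key_here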
"""## PARSING FILES""" | |
#def Parsing(parsed_text): | |
#parsed_text=parsed_text.name | |
#raw_party =parser.from_file(parsed_text) | |
# raw_party = raw_party['content'],cache_examples=True | |
# return clean(raw_party) | |
def Parsing(parsed_text): | |
''' | |
Process a PDF file and extract its text content | |
parsed_text: Can be a file object with a 'name' attribute or a file path string | |
''' | |
try: | |
# Handle different input types | |
if hasattr(parsed_text, 'name'): | |
file_path = parsed_text.name | |
else: | |
file_path = parsed_text | |
# Extract text from PDF | |
raw_party = textract.process(file_path, encoding='ascii', method='pdfminer') | |
return clean(raw_party) | |
except Exception as e: | |
print(f"Error parsing PDF: {str(e)}") | |
return f"Error parsing PDF: {str(e)}" | |
#Added more stopwords to avoid irrelevant terms | |
stop_words = set(stopwords.words('english')) | |
stop_words.update('ask','much','thank','etc.', 'e', 'We', 'In', 'ed','pa', 'This','also', 'A', 'fu','To','5','ing', 'er', '2') | |
"""## PREPROCESSING""" | |
def clean_text(text): | |
''' | |
The function which returns clean text | |
''' | |
text = text.encode("ascii", errors="ignore").decode("ascii") # remove non-asciicharacters | |
text=unidecode.unidecode(text)# diacritics remove | |
text=contractions.fix(text) # contraction fix | |
text = re.sub(r"\n", " ", text) | |
text = re.sub(r"\n\n", " ", text) | |
text = re.sub(r"\t", " ", text) | |
text = re.sub(r"/ ", " ", text) | |
text = text.strip(" ") | |
text = re.sub(" +", " ", text).strip() # get rid of multiple spaces and replace with a single | |
text = [word for word in text.split() if word not in stop_words] | |
text = ' '.join(text) | |
return text | |
# text_Party=clean_text(raw_party) | |
def Preprocess(textParty): | |
''' | |
Removing special characters extra spaces | |
''' | |
text1Party = re.sub('[^A-Za-z0-9]+', ' ', textParty) | |
#Removing all stop words | |
pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*') | |
text2Party = pattern.sub('', text1Party) | |
# fdist_cong = FreqDist(word_tokens_cong) | |
return text2Party | |
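# The two helpers above are meant to be chained: clean_text() normalises the raw
# extraction and Preprocess() strips punctuation and remaining stop words.
# Usage sketch (the sample string is illustrative, not taken from any manifesto):
#
#     sample = "We'll invest in clean energy,\n and we'll create new jobs."
#     print(Preprocess(clean_text(sample)))
#     # roughly: "invest clean energy create new jobs" (exact output depends on the stop-word list)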
'''
Using Concordance, you can see each time a word is used, along with its
immediate context. It can give you a peek into how a word is being used
at the sentence level and what words are used with it.
'''

def conc(text_Party, strng):
    word_tokens_party = word_tokenize(text_Party)
    moby = Text(word_tokens_party)
    # Capture the output that nltk's concordance() prints to stdout
    save_stdout = sys.stdout
    result = StringIO()
    sys.stdout = result
    moby.concordance(strng, lines=4, width=82)
    sys.stdout = save_stdout
    return result.getvalue()
def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin=10, right_margin=10, numLins=4):
    """
    Get all the phrases that contain the target word in a text/passage tar_passage.
    Workaround to save the output given by the nltk concordance function.
    str target_word, str tar_passage, int left_margin, int right_margin --> str
    left_margin and right_margin allocate the number of words/punctuation marks before
    and after the target word; the left margin is clamped at the beginning of the text.
    """
    # Handle empty or None search terms
    if not target_word or target_word.strip() == "":
        return "Please enter a search term"

    # Create list of tokens using nltk function
    tokens = nltk.word_tokenize(tar_passage)

    # Create the text of tokens
    text = nltk.Text(tokens)

    # Collect all the index or offset positions of the target word (case-insensitive)
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())

    # Collect the range of words around the target word; clamp the start index at zero
    # so that offsets near the beginning of the text do not go negative
    concordance_txt = [
        text.tokens[max(offset - left_margin, 0):offset + right_margin]
        for offset in c.offsets(target_word)
    ]

    # Join the words of each target phrase and return the first few matches
    result = [' '.join(x.replace("Y", "") for x in con_sub) for con_sub in concordance_txt][:-1]
    result = result[:numLins + 1]
    res = '\n\n'.join(result)
    return res
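# Usage sketch for the helper above (the passage and search term are made-up examples):
#
#     passage = "The party will cut taxes. Lower taxes will help small businesses grow."
#     print(get_all_phases_containing_tar_wrd("taxes", passage, left_margin=4, right_margin=4))
#     # prints short context snippets (about four tokens on either side) for the matched occurrences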
def normalize(d, target=1.0):
    raw = sum(d.values())
    if raw == 0:
        return d
    factor = target / raw
    return {key: value * factor for key, value in d.items()}
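# Worked example for normalize(): the values are rescaled so they sum to `target`.
# With d = {"jobs": 2, "health": 3} and target=1.0, the factor is 1/5, so the result
# is {"jobs": 0.4, "health": 0.6} (up to floating-point rounding).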
def generate_summary(text, max_length=1000):
    """
    Generate a summary of the manifesto text using the Groq LLM
    """
    if not groq_client:
        return "Summarization is not available. Please set up your GROQ_API_KEY in the .env file."

    # Truncate text if it's too long to fit in the context window
    if len(text) > 10000:
        text = text[:10000]

    try:
        # Use Groq's LLaMA 3 model for summarization
        completion = groq_client.chat.completions.create(
            model="llama3-8b-8192",  # LLaMA 3 8B model
            messages=[
                {"role": "system", "content": "You are a helpful assistant that summarizes political manifestos. Provide a concise, objective summary that captures the key policy proposals, themes, and promises in the manifesto."},
                {"role": "user", "content": f"Please summarize the following political manifesto text in about 300-500 words, focusing on the main policy areas, promises, and themes:\n\n{text}"}
            ],
            temperature=0.3,  # Lower temperature for more focused output
            max_tokens=800,   # Limit response length
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error generating summary: {str(e)}. Please check your API key and connection."
def fDistance(text2Party):
    '''
    Most frequent words search, combined with TF-IDF to surface more relevant words
    '''
    # Traditional frequency distribution
    word_tokens_party = word_tokenize(text2Party)  # Tokenizing
    fdistance = FreqDist(word_tokens_party).most_common(10)
    mem = {}
    for x in fdistance:
        mem[x[0]] = x[1]

    # Enhanced with TF-IDF for better relevance
    sentences = sent_tokenize(text2Party)

    # Use TF-IDF to find more relevant words
    vectorizer = TfidfVectorizer(max_features=15, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(sentences)

    # Get feature names (words)
    feature_names = vectorizer.get_feature_names_out()

    # Calculate the average TF-IDF score for each word across all sentences
    tfidf_scores = {}
    for i, word in enumerate(feature_names):
        scores = [tfidf_matrix[j, i] for j in range(len(sentences)) if i < tfidf_matrix[j].shape[1]]
        if scores:
            tfidf_scores[word] = sum(scores) / len(scores)

    # Sort by score and keep the top words
    sorted_tfidf = dict(sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)[:10])

    # Combine traditional frequency with TF-IDF for better results
    combined_scores = {}
    for word in set(list(mem.keys()) + list(sorted_tfidf.keys())):
        # Normalize and combine both scores (with more weight given to TF-IDF)
        freq_score = mem.get(word, 0) / max(mem.values()) if mem else 0
        tfidf_score = sorted_tfidf.get(word, 0) / max(sorted_tfidf.values()) if sorted_tfidf else 0
        combined_scores[word] = (freq_score * 0.3) + (tfidf_score * 0.7)  # Weight TF-IDF higher

    # Keep the top 10 words by combined score
    top_words = dict(sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:10])
    return normalize(top_words)
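# The blend above is a plain weighted average of two normalised scores:
#     combined = 0.3 * (count / max_count) + 0.7 * (tfidf / max_tfidf)
# Worked example (illustrative numbers): a word with the highest raw count
# (freq_score = 1.0) but only half the best TF-IDF score (tfidf_score = 0.5)
# ends up with combined = 0.3 * 1.0 + 0.7 * 0.5 = 0.65.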
def fDistancePlot(text2Party, plotN=15):
    '''
    Most Frequent Words Visualization
    '''
    word_tokens_party = word_tokenize(text2Party)  # Tokenizing
    fdistance = FreqDist(word_tokens_party)
    # Create the figure before drawing so the title is applied to the saved plot
    plt.figure(figsize=(4, 3))
    plt.title('Frequency Distribution')
    fdistance.plot(plotN)
    plt.tight_layout()
    buf = BytesIO()
    plt.savefig(buf)
    buf.seek(0)
    img1 = np.array(Image.open(buf))
    plt.clf()
    return img1
def DispersionPlot(textParty):
    '''
    Dispersion Plot
    '''
    word_tokens_party = word_tokenize(textParty)  # Tokenizing
    moby = Text(word_tokens_party)
    fdistance = FreqDist(word_tokens_party)
    # Take the five most common words for the dispersion plot
    word_Lst = [word for word, _ in fdistance.most_common(5)]

    plt.figure(figsize=(4, 3))
    plt.title('Dispersion Plot')
    moby.dispersion_plot(word_Lst)
    plt.tight_layout()
    buf = BytesIO()
    plt.savefig(buf)
    buf.seek(0)
    img = Image.open(buf)
    plt.clf()
    return img
def getSubjectivity(text):
    '''
    Return the subjectivity score of the text
    '''
    return TextBlob(text).sentiment.subjectivity

def getPolarity(text):
    '''
    Return the polarity score of the text
    '''
    return TextBlob(text).sentiment.polarity

def getAnalysis(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'
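# Quick illustration of the three helpers above (the sentence is an arbitrary example,
# and the exact TextBlob scores depend on its sentiment lexicon):
#
#     sample = "We will deliver better healthcare for every family."
#     print(getPolarity(sample), getSubjectivity(sample), getAnalysis(getPolarity(sample)))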
def Original_Image(path):
    img = cv2.imread(path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img

def Image_Processed(path):
    '''
    Read the image file and threshold it for use as a word-cloud mask
    '''
    img = cv2.imread(path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # Thresholding
    ret, bw_img = cv2.threshold(img, 124, 255, cv2.THRESH_BINARY)
    return bw_img

def word_cloud(orgIm, mask_img, text_Party_pr, maxWord=2000, colorGener=True,
               contCol='white', bckColor='white'):
    '''
    Generate a word cloud
    '''
    mask = mask_img
    # Create and generate a word cloud image
    wordcloud = WordCloud(max_words=maxWord, background_color=bckColor,
                          mask=mask,
                          colormap='nipy_spectral_r',
                          contour_color=contCol,
                          width=800, height=800,
                          margin=2,
                          contour_width=3).generate(text_Party_pr)

    # Create colouring from the original image
    plt.axis("off")
    if colorGener:
        image_colors = ImageColorGenerator(orgIm)
        plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
    else:
        plt.imshow(wordcloud)
def word_cloud_generator(parsed_text_name, text_Party):
    parsed = parsed_text_name.lower()
    if 'bjp' in parsed:
        orgImg = Original_Image('bjpImg2.jpeg')
        bwImg = Image_Processed('bjpImg2.jpeg')
        plt.figure(figsize=(6, 5))
        word_cloud(orgImg, bwImg, text_Party, maxWord=3000, colorGener=True,
                   contCol='white', bckColor='black')
        plt.tight_layout()
        buf = BytesIO()
        plt.savefig(buf)
        buf.seek(0)
        img1 = Image.open(buf)
        plt.clf()
        return img1
    elif 'congress' in parsed:
        orgImg = Original_Image('congress3.jpeg')
        bwImg = Image_Processed('congress3.jpeg')
        plt.figure(figsize=(5, 4))
        word_cloud(orgImg, bwImg, text_Party, maxWord=3000, colorGener=True)
        plt.tight_layout()
        buf = BytesIO()
        plt.savefig(buf)
        buf.seek(0)
        img2 = Image.open(buf)
        plt.clf()
        return img2
    # congrsMain.jpg
    elif 'aap' in parsed:
        orgImg = Original_Image('aapMain2.jpg')
        bwImg = Image_Processed('aapMain2.jpg')
        plt.figure(figsize=(5, 4))
        word_cloud(orgImg, bwImg, text_Party, maxWord=3000, colorGener=False, contCol='black')
        plt.tight_layout()
        buf = BytesIO()
        plt.savefig(buf)
        buf.seek(0)
        img3 = Image.open(buf)
        plt.clf()
        return img3
    else:
        wordcloud = WordCloud(max_words=2000, background_color="white", mode="RGB").generate(text_Party)
        plt.figure(figsize=(5, 5))
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        plt.tight_layout()
        buf = BytesIO()
        plt.savefig(buf)
        buf.seek(0)
        img4 = Image.open(buf)
        plt.clf()
        return img4
'''
url = "http://library.bjp.org/jspui/bitstream/123456789/2988/1/BJP-Election-english-2019.pdf"
path_input = "./Bjp_Manifesto_2019.pdf"
urllib.request.urlretrieve(url, filename=path_input)

url = "https://drive.google.com/uc?id=1BLCiy_BWilfVdrUH8kbO-44DJevwO5CG&export=download"
path_input = "./Aap_Manifesto_2019.pdf"
urllib.request.urlretrieve(url, filename=path_input)

url = "https://drive.google.com/uc?id=1HVZvTtYntl0YKLnE0cwu0CvAIRhXOv60&export=download"
path_input = "./Congress_Manifesto_2019.pdf"
urllib.request.urlretrieve(url, filename=path_input)
'''
def analysis(Manifesto, Search):
    '''
    Main analysis function that processes the manifesto and generates all outputs.
    Manifesto: PDF file uploaded by the user
    Search: search term entered by the user
    '''
    try:
        print(f"Analysis function called with: Manifesto={Manifesto}, Search={Search}")

        # Check if a file was uploaded
        if Manifesto is None:
            print("No file uploaded")
            return "Please upload a PDF file", {}, None, None, None, None, None, "No file uploaded"

        # Handle an empty search term
        if Search is None or Search.strip() == "":
            Search = "government"  # Default search term
            print(f"Using default search term: {Search}")
        else:
            print(f"Using provided search term: {Search}")

        # Process the uploaded PDF (Manifesto may be a file object or a plain path string)
        manifesto_path = Manifesto.name if hasattr(Manifesto, 'name') else str(Manifesto)
        print(f"Processing file: {manifesto_path}")
        raw_party = Parsing(Manifesto)

        # Check if parsing was successful
        if isinstance(raw_party, str) and raw_party.startswith("Error"):
            print(f"Parsing error: {raw_party}")
            return raw_party, {}, None, None, None, None, None, "Error generating summary due to parsing failure"

        print("Parsing successful, cleaning text...")
        text_Party = clean_text(raw_party)
        text_Party_processed = Preprocess(text_Party)

        # Generate summary using the LLM
        print("Generating summary...")
        summary = generate_summary(raw_party)

        # Sentiment analysis
        print("Performing sentiment analysis...")
        df = pd.DataFrame(raw_party.split('\n'), columns=['Content'])
        df['Subjectivity'] = df['Content'].apply(getSubjectivity)
        df['Polarity'] = df['Content'].apply(getPolarity)
        df['Analysis on Polarity'] = df['Polarity'].apply(getAnalysis)
        df['Analysis on Subjectivity'] = df['Subjectivity'].apply(getAnalysis)

        # Generate sentiment analysis plot
        print("Generating sentiment analysis plot...")
        plt.figure(figsize=(4, 3))
        df['Analysis on Polarity'].value_counts().plot(kind='bar', color="#FF9F45")
        plt.title('Sentiment Analysis')
        plt.xlabel('Sentiment')
        plt.ylabel('Counts')
        plt.tight_layout()
        buf = BytesIO()
        plt.savefig(buf)
        buf.seek(0)
        img1 = Image.open(buf)
        plt.clf()

        # Generate subjectivity analysis plot
        print("Generating subjectivity analysis plot...")
        plt.figure(figsize=(4, 3))
        df['Analysis on Subjectivity'].value_counts().plot(kind='bar', color="#B667F1")
        plt.tight_layout()
        buf = BytesIO()
        plt.savefig(buf)
        buf.seek(0)
        img2 = Image.open(buf)
        plt.clf()

        # Generate word cloud
        print("Generating word cloud...")
        img3 = word_cloud_generator(manifesto_path, text_Party_processed)

        # Generate frequency distribution and dispersion plots
        print("Generating frequency distribution...")
        fdist_Party = fDistance(text_Party_processed)
        img4 = fDistancePlot(text_Party_processed)

        print("Generating dispersion plot...")
        img5 = DispersionPlot(text_Party_processed)

        # Search for the term in the text
        print(f"Searching for term: {Search}")
        searChRes = get_all_phases_containing_tar_wrd(Search, text_Party_processed)
        searChRes = searChRes.replace(Search, "\u0332".join(Search))

        plt.close('all')
        print("Analysis completed successfully")
        return searChRes, fdist_Party, img1, img2, img3, img4, img5, summary

    except Exception as e:
        error_message = f"Error analyzing manifesto: {str(e)}"
        print(error_message)
        import traceback
        traceback.print_exc()
        # Return placeholder values in case of error
        return error_message, {}, None, None, None, None, None, "Error generating summary. Please check the console for details."
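# The function above is what the Gradio button calls, but it can also be exercised
# directly, e.g. in a notebook. Sketch (the search term is just an example; the PDF
# path matches one of the bundled examples below):
#
#     search_hits, top_topics, sentiment_img, subjectivity_img, cloud_img, freq_img, disp_img, summary = \
#         analysis('Example/AAP_Manifesto_2019.pdf', 'education')
#     print(summary)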
Search_txt = "text"
filePdf = "file"

text = gr.Textbox(label='Context Based Search')
mfw = gr.Label(label="Most Relevant Topics (LLM Enhanced)")
plot1 = gr.Image(label='Sentiment Analysis')
plot2 = gr.Image(label='Subjectivity Analysis')
plot3 = gr.Image(label='Word Cloud')
plot4 = gr.Image(label='Frequency Distribution')
plot5 = gr.Image(label='Dispersion Plot')
summary_output = gr.Textbox(label='AI-Generated Summary', lines=10)
with gr.Blocks(title='Manifesto Analysis') as demo:
    gr.Markdown("# Manifesto Analysis with LLM Enhancement")
    gr.Markdown("### Analyze political manifestos with advanced NLP and LLM techniques")

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="Upload Manifesto PDF", file_types=[".pdf"])
            search_input = gr.Textbox(label="Search Term", placeholder="Enter a term to search in the manifesto")
            submit_btn = gr.Button("Analyze Manifesto")

    with gr.Tabs():
        with gr.TabItem("Summary"):
            # Components created above the Blocks context must be rendered explicitly
            summary_output.render()
        with gr.TabItem("Search Results"):
            text.render()
        with gr.TabItem("Key Topics"):
            mfw.render()
        with gr.TabItem("Visualizations"):
            with gr.Row():
                with gr.Column(scale=1):
                    plot3.render()
                with gr.Column(scale=1):
                    plot4.render()
            with gr.Row():
                with gr.Column(scale=1):
                    plot1.render()
                with gr.Column(scale=1):
                    plot2.render()
            with gr.Row():
                plot5.render()

    submit_btn.click(
        fn=analysis,
        inputs=[file_input, search_input],
        outputs=[text, mfw, plot1, plot2, plot3, plot4, plot5, summary_output]
    )
    # Debug print to verify the button is connected
    print("Button connected to analysis function")

    gr.Examples(
        examples=[
            ['Example/AAP_Manifesto_2019.pdf', 'government'],
            ['Example/Bjp_Manifesto_2019.pdf', 'environment'],
            ['Example/Congress_Manifesto_2019.pdf', 'safety']
        ],
        inputs=[file_input, search_input]
    )

demo.launch(debug=True, share=False, show_error=True)

# Old interface code replaced by the Blocks implementation above
# io = gr.Interface(fn=analysis, inputs=[filePdf, Search_txt], outputs=[text, mfw, plot1, plot2, plot3, plot4, plot5], title='Manifesto Analysis', examples=[['Example/AAP_Manifesto_2019.pdf', 'government'], ['Example/Bjp_Manifesto_2019.pdf', 'environment'], ['Example/Congress_Manifesto_2019.pdf', 'safety']])
# io.launch(debug=True, share=False)