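"""Manifesto Analysis Gradio app.

Parses an uploaded political manifesto PDF, cleans and preprocesses the text,
generates an LLM summary via the Groq API, and renders sentiment, subjectivity,
frequency-distribution, dispersion and word-cloud visualisations plus a
concordance-style search over the manifesto text.
"""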
import random
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.text import Text
from nltk.probability import FreqDist
from cleantext import clean
import textract
import urllib.request
from io import BytesIO
import sys
import pandas as pd
import cv2
import re
from wordcloud import WordCloud, ImageColorGenerator
from textblob import TextBlob
from PIL import Image
import os
import gradio as gr
from dotenv import load_dotenv
import groq
import json
import traceback
import numpy as np
import unidecode
import contractions
from sklearn.feature_extraction.text import TfidfVectorizer
# Load environment variables
load_dotenv()

# Download NLTK resources
nltk.download(['stopwords', 'wordnet', 'words'])
nltk.download('punkt')
nltk.download('punkt_tab')

# Initialize Groq client
groq_api_key = os.getenv("GROQ_API_KEY")
groq_client = groq.Groq(api_key=groq_api_key) if groq_api_key else None
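# Note: the key is read from a .env file in the working directory; a minimal
# .env would contain a single line such as GROQ_API_KEY=<your key> (illustrative).
# Without it, summarization is disabled and the rest of the app still runs.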
# Stopwords customization
stop_words = set(stopwords.words('english'))
# Pass the extra words as a list: calling set.update() with bare strings would
# add individual characters rather than whole words.
stop_words.update(['ask', 'much', 'thank', 'etc.', 'e', 'We', 'In', 'ed', 'pa', 'This', 'also', 'A', 'fu', 'To', '5', 'ing', 'er', '2'])
# --- Parsing & Preprocessing Functions ---
def Parsing(parsed_text):
    try:
        # Gradio may pass a file object (with .name) or a plain path string
        if hasattr(parsed_text, 'name'):
            file_path = parsed_text.name
        else:
            file_path = parsed_text
        raw_party = textract.process(file_path, encoding='ascii', method='pdfminer')
        # textract returns bytes; decode to str before passing to clean()
        return clean(raw_party.decode('ascii', errors='ignore'))
    except Exception as e:
        print(f"Error parsing PDF: {e}")
        return f"Error parsing PDF: {e}"
def clean_text(text):
    text = text.encode("ascii", errors="ignore").decode("ascii")
    text = unidecode.unidecode(text)
    text = contractions.fix(text)
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\t", " ", text)
    text = re.sub(r"/ ", " ", text)
    text = text.strip()
    text = re.sub(" +", " ", text).strip()
    text = [word for word in text.split() if word not in stop_words]
    return ' '.join(text)
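# Rough example (assuming the NLTK English stopword list built above):
# clean_text("it's a test\n") expands the contraction to "it is a test",
# drops the stopwords "it", "is" and "a", and returns "test".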
def Preprocess(textParty):
    text1Party = re.sub('[^A-Za-z0-9]+', ' ', textParty)
    pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
    text2Party = pattern.sub('', text1Party)
    return text2Party
# --- Core Analysis Functions ---
def generate_summary(text):
    if not groq_client:
        return "Summarization is not available. Please set up your GROQ_API_KEY in the .env file."
    if len(text) > 10000:
        text = text[:10000]
    try:
        completion = groq_client.chat.completions.create(
            model="llama3-8b-8192",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that summarizes political manifestos. Provide a concise, objective summary that captures the key policy proposals, themes, and promises in the manifesto."},
                {"role": "user", "content": f"Please summarize the following political manifesto text in about 300-500 words, focusing on the main policy areas, promises, and themes:\n\n{text}"}
            ],
            temperature=0.3,
            max_tokens=800
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error generating summary: {str(e)}"
def fDistance(text2Party):
    word_tokens_party = word_tokenize(text2Party)
    fdistance = FreqDist(word_tokens_party).most_common(10)
    mem = {x[0]: x[1] for x in fdistance}
    # TF-IDF over sentences, tokenized once
    sentences = sent_tokenize(text2Party)
    vectorizer = TfidfVectorizer(max_features=15, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(sentences)
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = {}
    for i, word in enumerate(feature_names):
        scores = [tfidf_matrix[j, i] for j in range(len(sentences))]
        if scores:
            tfidf_scores[word] = sum(scores) / len(scores)
    # Blend raw frequency (30%) with average TF-IDF (70%)
    combined_scores = {}
    for word in set(list(mem.keys()) + list(tfidf_scores.keys())):
        freq_score = mem.get(word, 0) / max(mem.values()) if mem else 0
        tfidf_score = tfidf_scores.get(word, 0) / max(tfidf_scores.values()) if tfidf_scores else 0
        combined_scores[word] = (freq_score * 0.3) + (tfidf_score * 0.7)
    top_words = dict(sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:10])
    return normalize(top_words)
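# Illustrative weighting example: a word that tops both the raw-frequency and
# TF-IDF rankings gets a combined score of 0.3 * 1.0 + 0.7 * 1.0 = 1.0 before
# normalize() rescales the top-10 scores so they sum to 1.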
def normalize(d, target=1.0):
    raw = sum(d.values())
    factor = target / raw if raw != 0 else 0
    return {key: value * factor for key, value in d.items()}
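# e.g. normalize({'tax': 2.0, 'jobs': 2.0}) -> {'tax': 0.5, 'jobs': 0.5}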
# --- Visualization Functions with Error Handling ---
def safe_plot(func, *args, **kwargs):
    try:
        plt.clf()
        func(*args, **kwargs)
        buf = BytesIO()
        plt.savefig(buf, format='png')
        buf.seek(0)
        return Image.open(buf)
    except Exception as e:
        print(f"Plotting error: {e}")
        return None

def fDistancePlot(text2Party):
    return safe_plot(lambda: FreqDist(word_tokenize(text2Party)).plot(15, title='Frequency Distribution'))
def DispersionPlot(textParty):
    try:
        word_tokens_party = word_tokenize(textParty)
        moby = Text(word_tokens_party)
        fdistance = FreqDist(word_tokens_party)
        # Plot the dispersion of the five most frequent words
        word_Lst = [word for word, _ in fdistance.most_common(5)]
        plt.figure(figsize=(4, 3))
        plt.title('Dispersion Plot')
        moby.dispersion_plot(word_Lst)
        plt.tight_layout()
        buf = BytesIO()
        plt.savefig(buf, format='png')
        buf.seek(0)
        img = Image.open(buf)
        plt.clf()
        return img
    except Exception as e:
        print(f"Dispersion plot error: {e}")
        return None
def word_cloud_generator(parsed_text_name, text_Party):
    try:
        parsed = parsed_text_name.lower()
        # Pick a party-specific image mask when the filename identifies the party
        if 'bjp' in parsed:
            mask_path = 'bjpImg2.jpeg'
        elif 'congress' in parsed:
            mask_path = 'congress3.jpeg'
        elif 'aap' in parsed:
            mask_path = 'aapMain2.jpg'
        else:
            mask_path = None
        plt.clf()
        if mask_path and os.path.exists(mask_path):
            orgImg = Image.open(mask_path)
            mask = np.array(orgImg)
            wordcloud = WordCloud(max_words=3000, mask=mask).generate(text_Party)
        else:
            wordcloud = WordCloud(max_words=2000).generate(text_Party)
        plt.imshow(wordcloud)
        plt.axis("off")
        buf = BytesIO()
        plt.savefig(buf, format='png')
        buf.seek(0)
        return Image.open(buf)
    except Exception as e:
        print(f"Word cloud error: {e}")
        return None
def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin=10, right_margin=10, numLins=4):
    """Return up to numLins phrases from the passage that contain the target word."""
    if not target_word or target_word.strip() == "":
        return "Please enter a search term"
    tokens = nltk.word_tokenize(tar_passage)
    text = nltk.Text(tokens)
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
    offsets = c.offsets(target_word)
    concordance_txt = [
        text.tokens[max(0, offset - left_margin):offset + right_margin]
        for offset in offsets[:numLins]
    ]
    result = [' '.join(con_sub) for con_sub in concordance_txt]
    return '\n'.join(result)
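# Illustrative usage: get_all_phases_containing_tar_wrd('education', text_Party_processed)
# returns up to 4 newline-separated windows of roughly 10 tokens on either side
# of each occurrence of "education".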
# --- Main Analysis Function ---
def analysis(Manifesto, Search):
    try:
        if Manifesto is None:
            return "No file uploaded", {}, None, None, None, None, None, "No file uploaded"
        if not Search or Search.strip() == "":
            Search = "government"

        raw_party = Parsing(Manifesto)
        if isinstance(raw_party, str) and raw_party.startswith("Error"):
            return raw_party, {}, None, None, None, None, None, "Parsing failed"

        text_Party = clean_text(raw_party)
        text_Party_processed = Preprocess(text_Party)
        summary = generate_summary(raw_party)

        df = pd.DataFrame([{'Content': text_Party_processed}], columns=['Content'])
        df['Subjectivity'] = df['Content'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
        df['Polarity'] = df['Content'].apply(lambda x: TextBlob(x).sentiment.polarity)
        df['Polarity_Label'] = df['Polarity'].apply(lambda x: 'Positive' if x > 0 else 'Negative' if x < 0 else 'Neutral')
        df['Subjectivity_Label'] = df['Subjectivity'].apply(lambda x: 'High' if x > 0.5 else 'Low')

        # Generate plots with safe plotting
        sentiment_plot = safe_plot(lambda: df['Polarity_Label'].value_counts().plot(kind='bar', color="#FF9F45", title='Sentiment Analysis'))
        subjectivity_plot = safe_plot(lambda: df['Subjectivity_Label'].value_counts().plot(kind='bar', color="#B667F1", title='Subjectivity Analysis'))
        freq_plot = fDistancePlot(text_Party_processed)
        dispersion_plot = DispersionPlot(text_Party_processed)
        # The uploaded file may arrive as a file object or a plain path string
        manifesto_name = Manifesto.name if hasattr(Manifesto, 'name') else str(Manifesto)
        wordcloud = word_cloud_generator(manifesto_name, text_Party_processed)

        fdist_Party = fDistance(text_Party_processed)
        searChRes = get_all_phases_containing_tar_wrd(Search, text_Party_processed)

        return searChRes, fdist_Party, sentiment_plot, subjectivity_plot, wordcloud, freq_plot, dispersion_plot, summary
    except Exception as e:
        error_msg = f"Critical error: {str(e)}"
        print(error_msg)
        traceback.print_exc()
        return error_msg, {}, None, None, None, None, None, "Analysis failed"
# --- Gradio Interface ---
with gr.Blocks(title='Manifesto Analysis') as demo:
    gr.Markdown("# Manifesto Analysis with LLM Enhancement")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="Upload Manifesto PDF", file_types=[".pdf"])
            search_input = gr.Textbox(label="Search Term", placeholder="Enter a term to search in the manifesto")
            submit_btn = gr.Button("Analyze Manifesto")

    with gr.Tabs():
        with gr.TabItem("Summary"):
            summary_output = gr.Textbox(label='AI-Generated Summary', lines=10)
        with gr.TabItem("Search Results"):
            search_output = gr.Textbox(label='Context Based Search')
        with gr.TabItem("Key Topics"):
            topics_output = gr.Label(label="Most Relevant Topics (LLM Enhanced)")
        with gr.TabItem("Visualizations"):
            with gr.Row():
                sentiment_img = gr.Image(label='Sentiment Analysis')
                subjectivity_img = gr.Image(label='Subjectivity Analysis')
            with gr.Row():
                wordcloud_img = gr.Image(label='Word Cloud')
                freq_img = gr.Image(label='Frequency Distribution')
            dispersion_img = gr.Image(label='Dispersion Plot')

    # Wire the button to the components defined above; the output order
    # matches the tuple returned by analysis()
    submit_btn.click(
        fn=analysis,
        inputs=[file_input, search_input],
        outputs=[
            search_output,
            topics_output,
            sentiment_img,
            subjectivity_img,
            wordcloud_img,
            freq_img,
            dispersion_img,
            summary_output
        ]
    )
    gr.Examples(
        examples=[
            ["Example/AAP_Manifesto_2019.pdf", "government"],
            ["Example/Bjp_Manifesto_2019.pdf", "environment"],
            ["Example/Congress_Manifesto_2019.pdf", "safety"]
        ],
        inputs=[file_input, search_input]
    )

demo.launch(debug=True, share=False, show_error=True)