# app.py — PDF summarization app (Gemini API + optional DSPy chain-of-thought)
import os

from google import genai
import gradio as gr
import PyPDF2
import numpy as np

# DSPy is optional: when installed it is used for chain-of-thought prompting,
# otherwise the code falls back to plain Gemini calls (see fallback_predict).
try:
    import dspy
except ImportError:
    HAS_DSPY = False
else:
    HAS_DSPY = True
############################################# | |
# Load Gemini API key from environment variable | |
############################################# | |
# Read the Gemini API key from the environment and fail fast when it is missing,
# so misconfiguration surfaces at startup rather than on the first request.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise ValueError("Please set the GEMINI_API_KEY environment variable.")

# Single shared Gemini API client, authenticated with the secret key.
client = genai.Client(api_key=GEMINI_API_KEY)
############################################# | |
# Custom DSPy Prompt Signature Function | |
############################################# | |
def custom_dspy_prompt(text, mode="summarization"):
    """
    Build a chain-of-thought prompt for DSPy / Gemini.

    Modes:
      - "summarization": prompt for summarizing a single text chunk.
      - "overall": prompt for combining chunk summaries into one summary.
    Any other mode returns *text* unchanged.
    """
    # Dispatch table of prompt prefixes keyed by mode; the text is appended
    # after a blank line exactly as in the original f-string templates.
    templates = {
        "summarization": (
            "EffectiveDSPyCOT: Please provide a detailed, robust, and "
            "token-expansive summary using chain-of-thought reasoning. "
            "Preserve context and key details. Text:\n\n"
        ),
        "overall": (
            "EffectiveDSPyCOT: Combine the following chunk summaries into an "
            "overall comprehensive summary. Expand on details and maintain "
            "context with chain-of-thought reasoning. Summaries:\n\n"
        ),
    }
    prefix = templates.get(mode)
    if prefix is None:
        return text
    return prefix + text
############################################# | |
# Fallback Using Gemini's generate_content Method | |
############################################# | |
def fallback_predict(prompt, system_msg="You are a helpful assistant."):
    """
    Generate content with the Gemini API (generate_content method).

    The system message is prepended to the prompt because generate_content
    takes a single contents string here. On any failure the error is returned
    as a tagged string instead of raising, so callers always get text back.
    """
    try:
        combined = f"{system_msg}\n\n{prompt}"
        response = client.models.generate_content(
            model="gemini-2.0-flash",  # Adjust model name as needed.
            contents=combined,
        )
        return response.text
    except Exception as exc:
        return f"[Gemini fallback error]: {str(exc)}"
############################################# | |
# PDF Extraction and Improved Chunking | |
############################################# | |
def extract_text_from_pdf(pdf_path):
    """
    Return the concatenated text of every page of the PDF at *pdf_path*.

    Pages for which PyPDF2 yields no text are skipped; each extracted page
    is followed by a newline separator.
    """
    with open(pdf_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        page_texts = [page.extract_text() for page in reader.pages]
    return "".join(text + "\n" for text in page_texts if text)
def chunk_text(text, chunk_size=2000, overlap=300):
    """
    Split *text* into overlapping chunks of whitespace-separated words.

    Larger chunk size and overlap help maintain context across chunk
    boundaries when each chunk is summarized independently.

    Args:
        text: The document text to split.
        chunk_size: Maximum number of words per chunk (must be positive).
        overlap: Number of words shared between consecutive chunks
            (must be smaller than chunk_size).

    Returns:
        A list of chunk strings; empty list for empty/whitespace-only text.

    Raises:
        ValueError: If chunk_size is not positive or overlap >= chunk_size
            (which would make the loop never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = min(start + chunk_size, len(words))
        chunks.append(" ".join(words[start:end]))
        if end == len(words):
            # Last words consumed. Without this break, the overlap step could
            # emit one more chunk that is entirely contained in this one,
            # wasting an extra summarization API call per document.
            break
        start += chunk_size - overlap  # Advance with overlap
    return chunks
############################################# | |
# Summarizing a Single Chunk with Custom DSPy / Gemini | |
############################################# | |
def summarize_chunk(chunk):
    """
    Summarize one text chunk with the custom DSPy chain-of-thought prompt.

    When DSPy is installed its predictor is tried first; on import absence
    or any runtime failure the Gemini fallback is used instead.
    """
    prompt = custom_dspy_prompt(chunk, mode="summarization")
    if HAS_DSPY:
        try:
            return dspy.predict(prompt)
        except Exception:
            pass  # DSPy failed at runtime; fall through to Gemini below.
    return fallback_predict(prompt, system_msg="You are a helpful summarizer.")
############################################# | |
# Summarizing the Entire PDF | |
############################################# | |
def summarize_document(pdf_path):
    """
    Summarize an entire PDF document.

    Extracts the text, splits it into overlapping chunks, summarizes each
    chunk, then merges the chunk summaries into one overall summary.

    Returns:
        A (overall_summary, chunk_summaries) tuple, where chunk_summaries
        is the list of per-chunk summary strings.
    """
    document_text = extract_text_from_pdf(pdf_path)
    chunk_summaries = [summarize_chunk(piece) for piece in chunk_text(document_text)]

    overall_prompt = custom_dspy_prompt("\n\n".join(chunk_summaries), mode="overall")
    if HAS_DSPY:
        try:
            return dspy.predict(overall_prompt), chunk_summaries
        except Exception:
            pass  # Fall through to the Gemini fallback.
    overall = fallback_predict(
        overall_prompt,
        system_msg="You are a helpful assistant that summarizes documents.",
    )
    return overall, chunk_summaries
############################################# | |
# Enhanced Gradio Interface with Better UI Aesthetics (Summarization Only) | |
############################################# | |
# Raw CSS for gr.Blocks(css=...). Gradio injects this string inside a <style>
# element itself, so the string must NOT contain literal <style>/</style> tags:
# with the tags present the injected stylesheet is invalid and none of these
# rules take effect.
custom_css = """
body { background-color: #f4f7f9; }
.gradio-container { font-family: 'Arial', sans-serif; }
h1, h2, h3 { color: #333333; }
.tab-header { background-color: #ffffff; border-bottom: 2px solid #e0e0e0; }
.gr-button { background-color: #4CAF50; color: white; }
.gr-textbox { background-color: #ffffff; }
"""
# ---- Gradio UI: PDF upload -> overall + per-chunk summaries ----
with gr.Blocks(css=custom_css) as demo:
    # Page header.
    gr.Markdown(
        "## PDF Summarization Interface with Gemini API\n"
        "Upload a PDF document to get a robust, detailed summary using a custom DSPy chain-of-thought prompt.\n"
    )

    # Input row: file upload plus the trigger button.
    with gr.Row():
        pdf_input_sum = gr.File(label="Upload PDF for Summarization", file_types=['.pdf'])
        summarize_button = gr.Button("Summarize Document")

    # Output areas for the combined summary and the raw chunk summaries.
    overall_summary_output = gr.Textbox(label="Overall Document Summary", lines=8)
    chunk_summaries_output = gr.Textbox(label="Chunk Summaries", lines=10)

    def process_and_summarize(pdf_file):
        """Click handler: summarize the uploaded PDF, or report a missing file."""
        if pdf_file is None:
            return "No file uploaded.", "No file uploaded."
        overall, chunks = summarize_document(pdf_file.name)
        return overall, "\n\n".join(chunks)

    summarize_button.click(
        fn=process_and_summarize,
        inputs=pdf_input_sum,
        outputs=[overall_summary_output, chunk_summaries_output],
    )

demo.launch()