Omarrran commited on
Commit
bfdc429
·
verified ·
1 Parent(s): aaa015f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +174 -0
app.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# app.py
import os
from google import genai
import gradio as gr
import PyPDF2
import numpy as np  # NOTE(review): numpy appears unused in this file — confirm before removing

# Try importing DSPy for chain-of-thought reasoning
try:
    import dspy
    HAS_DSPY = True
except ImportError:
    # DSPy is optional; all call sites fall back to the Gemini API when absent.
    HAS_DSPY = False

#############################################
# Load Gemini API key from environment variable
#############################################
# Fail fast at import time: the app cannot do anything without a key.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise ValueError("Please set the GEMINI_API_KEY environment variable.")

# Initialize the Gemini API client with the secret key
client = genai.Client(api_key=GEMINI_API_KEY)
24
+
25
+ #############################################
26
+ # Custom DSPy Prompt Signature Function
27
+ #############################################
28
def custom_dspy_prompt(text, mode="summarization"):
    """
    Build a chain-of-thought prompt signature for DSPy.

    Modes:
        - "summarization": prompt for summarizing one text chunk.
        - "overall": prompt for combining chunk summaries.

    Any other mode returns *text* unchanged.
    """
    # Dispatch table of prompt prefixes keyed by mode.
    prefixes = {
        "summarization": (
            "EffectiveDSPyCOT: Please provide a detailed, robust, and "
            "token-expansive summary using chain-of-thought reasoning. "
            "Preserve context and key details. Text:\n\n"
        ),
        "overall": (
            "EffectiveDSPyCOT: Combine the following chunk summaries into "
            "an overall comprehensive summary. Expand on details and "
            "maintain context with chain-of-thought reasoning. Summaries:\n\n"
        ),
    }
    prefix = prefixes.get(mode)
    return text if prefix is None else prefix + text
43
+
44
+ #############################################
45
+ # Fallback Using Gemini's generate_content Method
46
+ #############################################
47
def fallback_predict(prompt, system_msg="You are a helpful assistant."):
    """
    Generate content via the Gemini API (generate_content method).

    The system message is prepended to the prompt. Any failure is
    reported as a bracketed error string instead of being raised, so
    callers always receive text.
    """
    try:
        combined = f"{system_msg}\n\n{prompt}"
        response = client.models.generate_content(
            model="gemini-2.0-flash",  # Adjust model name as needed.
            contents=combined,
        )
        return response.text
    except Exception as exc:
        return f"[Gemini fallback error]: {str(exc)}"
60
+
61
+ #############################################
62
+ # PDF Extraction and Improved Chunking
63
+ #############################################
64
def extract_text_from_pdf(pdf_path):
    """
    Return the concatenated text of every page of the PDF at *pdf_path*.

    Pages whose extraction yields nothing (e.g. image-only pages) are
    skipped; each extracted page's text is followed by a newline.
    """
    with open(pdf_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        page_texts = [page.extract_text() for page in reader.pages]
    return "".join(content + "\n" for content in page_texts if content)
76
+
77
def chunk_text(text, chunk_size=2000, overlap=300):
    """
    Split *text* into overlapping word chunks.

    Larger chunk size and overlap help maintain context across chunk
    boundaries when each chunk is summarized independently.

    Args:
        text: Source text; words are whatever ``str.split()`` yields.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        List of chunk strings (empty list for empty/whitespace text).

    Raises:
        ValueError: If ``overlap >= chunk_size`` — the window could
            never advance, which previously caused an infinite loop.
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = min(start + chunk_size, len(words))
        chunks.append(" ".join(words[start:end]))
        if end == len(words):
            # The final words are covered; advancing further would emit a
            # redundant trailing chunk fully contained in this one.
            break
        start += chunk_size - overlap  # Advance with overlap
    return chunks
91
+
92
+ #############################################
93
+ # Summarizing a Single Chunk with Custom DSPy / Gemini
94
+ #############################################
95
def summarize_chunk(chunk):
    """
    Summarize one text chunk.

    Prefers DSPy's prediction when the package is available; a DSPy
    failure (or its absence) falls back to the Gemini API.
    """
    prompt = custom_dspy_prompt(chunk, mode="summarization")
    if HAS_DSPY:
        try:
            return dspy.predict(prompt)
        except Exception:
            pass  # fall through to the Gemini fallback below
    return fallback_predict(prompt, system_msg="You are a helpful summarizer.")
109
+
110
+ #############################################
111
+ # Summarizing the Entire PDF
112
+ #############################################
113
def summarize_document(pdf_path):
    """
    Produce an overall summary of the PDF at *pdf_path*.

    Extracts the text, splits it into overlapping chunks, summarizes
    each chunk, then merges the chunk summaries into one document-level
    summary.

    Returns:
        Tuple ``(overall_summary, chunk_summaries)`` where
        ``chunk_summaries`` is the list of per-chunk summaries in order.
    """
    document_text = extract_text_from_pdf(pdf_path)
    chunk_summaries = [summarize_chunk(piece) for piece in chunk_text(document_text)]

    merge_prompt = custom_dspy_prompt("\n\n".join(chunk_summaries), mode="overall")
    if HAS_DSPY:
        try:
            overall = dspy.predict(merge_prompt)
        except Exception:
            overall = fallback_predict(merge_prompt, system_msg="You are a helpful assistant that summarizes documents.")
    else:
        overall = fallback_predict(merge_prompt, system_msg="You are a helpful assistant that summarizes documents.")

    return overall, chunk_summaries
136
+
137
#############################################
# Enhanced Gradio Interface with Better UI Aesthetics (Summarization Only)
#############################################
# gr.Blocks(css=...) expects raw CSS rules: Gradio injects the string into
# its own <style> element, so literal <style>/</style> tags here would
# corrupt the stylesheet and disable every rule below.
custom_css = """
body { background-color: #f4f7f9; }
.gradio-container { font-family: 'Arial', sans-serif; }
h1, h2, h3 { color: #333333; }
.tab-header { background-color: #ffffff; border-bottom: 2px solid #e0e0e0; }
.gr-button { background-color: #4CAF50; color: white; }
.gr-textbox { background-color: #ffffff; }
"""
150
+
151
# Build the summarization UI: a file upload, a trigger button, and two
# text outputs (overall summary + per-chunk summaries).
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("## PDF Summarization Interface with Gemini API\n"
                "Upload a PDF document to get a robust, detailed summary using a custom DSPy chain-of-thought prompt.\n")

    with gr.Row():
        pdf_input_sum = gr.File(label="Upload PDF for Summarization", file_types=['.pdf'])
        summarize_button = gr.Button("Summarize Document")
    overall_summary_output = gr.Textbox(label="Overall Document Summary", lines=8)
    chunk_summaries_output = gr.Textbox(label="Chunk Summaries", lines=10)

    def process_and_summarize(pdf_file):
        """Summarize the uploaded PDF; returns (overall summary, joined chunk summaries)."""
        if pdf_file is None:
            return "No file uploaded.", "No file uploaded."
        # Depending on the Gradio version/config, gr.File hands the callback
        # either a tempfile-like object (with .name) or a plain filepath string.
        file_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        try:
            overall, chunks = summarize_document(file_path)
        except Exception as exc:
            # Surface failures (unreadable PDF, API errors) in the UI
            # instead of an unhandled server-side traceback.
            message = f"Error while summarizing: {exc}"
            return message, message
        return overall, "\n\n".join(chunks)

    summarize_button.click(
        fn=process_and_summarize,
        inputs=pdf_input_sum,
        outputs=[overall_summary_output, chunk_summaries_output]
    )

demo.launch()