poemsforaphrodite committed
Commit 491af27 · verified · 1 Parent(s): e53356f

Update app.py

Files changed (1)
  1. app.py +2 -284
app.py CHANGED
@@ -1,10 +1,7 @@
  import os
- import gradio as gr
+ import streamlit as st
  from openai import OpenAI
  from PyPDF2 import PdfReader
- import requests
- from youtube_transcript_api import YouTubeTranscriptApi
- from urllib.parse import urlparse, parse_qs
  from pinecone import Pinecone
  import uuid
  from dotenv import load_dotenv
@@ -33,28 +30,6 @@ def process_pdf(file):
          text += page.extract_text() + "\n"
      return text

- def process_web_link(url):
-     response = requests.get(url)
-     return response.text
-
- def process_youtube_link(url):
-     video_id = extract_video_id(url)
-     transcript = YouTubeTranscriptApi.get_transcript(video_id)
-     return " ".join([entry['text'] for entry in transcript])
-
- def extract_video_id(url):
-     parsed_url = urlparse(url)
-     if parsed_url.hostname == 'youtu.be':
-         return parsed_url.path[1:]
-     if parsed_url.hostname in ('www.youtube.com', 'youtube.com'):
-         if parsed_url.path == '/watch':
-             return parse_qs(parsed_url.query)['v'][0]
-         if parsed_url.path[:7] == '/embed/':
-             return parsed_url.path.split('/')[2]
-         if parsed_url.path[:3] == '/v/':
-             return parsed_url.path.split('/')[2]
-     return None
-
  def process_upload(upload_type, file_or_link, file_name=None):
      print(f"Starting process_upload for {upload_type}")
      doc_id = str(uuid.uuid4())
@@ -63,251 +38,6 @@ def process_upload(upload_type, file_or_link, file_name=None):
      if upload_type == "PDF":
          content = process_pdf(file_or_link)
          doc_name = file_name or "Uploaded PDF"
-     elif upload_type == "Web Link":
-         content = process_web_link(file_or_link)
-         doc_name = file_or_link
-     elif upload_type == "YouTube Link":
-         content = process_youtube_link(file_or_link)
-         doc_name = f"YouTube: {file_or_link}"
-     else:
-         print("Invalid upload type")
-         return "Invalid upload type"
-
-     content_length = len(content)
-     print(f"Content extracted, length: {content_length}")
-
-     # Dynamically adjust chunk size based on content length
-     if content_length < 10000:
-         chunk_size = 1000
-     elif content_length < 100000:
-         chunk_size = 2000
-     else:
-         chunk_size = 4000
-     print(f"Using chunk size: {chunk_size}")
-
-     chunks = [content[i:i+chunk_size] for i in range(0, content_length, chunk_size)]
-
-     vectors = []
-     with ThreadPoolExecutor() as executor:
-         futures = [executor.submit(process_chunk, chunk, doc_id, i, upload_type, doc_name) for i, chunk in enumerate(chunks)]
-
-         for future in as_completed(futures):
-             vectors.append(future.result())
-             # Progress can be handled via logging or status messages
-
-     print(f"Generated {len(vectors)} vectors")
-
-     index.upsert(vectors=vectors)
-     print("Vectors upserted to Pinecone")
-
-     return f"Processing complete for {upload_type}. Document Name: {doc_name}"
-
- def process_chunk(chunk, doc_id, i, upload_type, doc_name):
-     embedding = get_embedding(chunk)
-     return (f"{doc_id}_{i}", embedding, {
-         "text": chunk,
-         "type": upload_type,
-         "doc_id": doc_id,
-         "doc_name": doc_name,
-         "chunk_index": i
-     })
-
- def get_relevant_context(query, top_k=5):
-     print(f"Getting relevant context for query: {query}")
-     query_embedding = get_embedding(query)
-
-     search_results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)
-     print(f"Found {len(search_results['matches'])} relevant results")
-
-     # Sort results by doc_id and chunk_index to maintain document structure
-     sorted_results = sorted(search_results['matches'], key=lambda x: (x['metadata']['doc_id'], x['metadata']['chunk_index']))
-
-     context = "\n".join([result['metadata']['text'] for result in sorted_results])
-     return context, sorted_results
-
- def chat_with_ai(message):
-     print(f"Chatting with AI, message: {message}")
-     context, results = get_relevant_context(message)
-     print(f"Retrieved context, length: {len(context)}")
-
-     messages = [
-         {"role": "system", "content": "You are a helpful assistant. Use the following information to answer the user's question, but don't mention the context directly in your response. If the information isn't in the context, say you don't know."},
-         {"role": "system", "content": f"Context: {context}"},
-         {"role": "user", "content": message}
-     ]
-
-     response = client.chat.completions.create(
-         model="gpt-4o-mini",
-         messages=messages
-     )
-     print("Received response from OpenAI")
-
-     ai_response = response.choices[0].message.content
-
-     # Prepare source information
-     sources = [
-         {
-             "doc_id": result['metadata']['doc_id'],
-             "doc_name": result['metadata']['doc_name'],
-             "chunk_index": result['metadata']['chunk_index'],
-             "text": result['metadata']['text'],
-             "type": result['metadata']['type']
-         }
-         for result in results
-     ]
-
-     return ai_response, sources
-
- def clear_database():
-     print("Clearing database...")
-     index.delete(delete_all=True)
-     print("Database cleared")
-     return "Database cleared successfully."
-
- # Gradio Interface Components
-
- def handle_uploads(pdf, web_link, youtube_link):
-     results = []
-     if pdf:
-         pdf_result = process_upload("PDF", pdf, pdf.name)
-         results.append(pdf_result)
-     if web_link:
-         web_result = process_upload("Web Link", web_link)
-         results.append(web_result)
-     if youtube_link:
-         youtube_result = process_upload("YouTube Link", youtube_link)
-         results.append(youtube_result)
-
-     if results:
-         return "\n".join([f"✅ {res}" for res in results])
-     else:
-         return "⚠️ No content uploaded. Please provide at least one input."
-
- def handle_chat(user_input, state):
-     if not user_input:
-         return "⚠️ Please enter a question.", state
-
-     response, sources = chat_with_ai(user_input)
-     state = sources  # Update state with sources
-     return f"**You:** {user_input}\n\n**AI:** {response}", state
-
- def handle_clear_database():
-     result = clear_database()
-     return result
-
- def display_sources(sources):
-     if not sources:
-         return "ℹ️ Ask a question to see source chunks here."
-
-     source_texts = []
-     for i, source in enumerate(sources, 1):
-         source_text = f"**Source {i} - {source['type']} ({source['doc_name']})**\n\n" \
-                       f"**Chunk Index:** {source['chunk_index']}\n" \
-                       f"{source['text']}\n\n---\n"
-         source_texts.append(source_text)
-     return "\n".join(source_texts)
-
- with gr.Blocks() as demo:
-     gr.Markdown("# 📄 Upload and Chat with PDFs, Web Links, and YouTube Videos")
-
-     with gr.Row():
-         with gr.Column(scale=1):
-             gr.Markdown("## 📤 Upload")
-
-             pdf_input = gr.File(label="Choose a PDF file", file_types=[".pdf"])
-             web_link_input = gr.Textbox(label="Enter a Web Link", placeholder="https://example.com")
-             youtube_link_input = gr.Textbox(label="Enter a YouTube Link", placeholder="https://youtube.com/watch?v=...")
-
-             upload_button = gr.Button("Process All")
-             clear_db_button = gr.Button("Clear Database")
-             upload_output = gr.Markdown()
-
-         with gr.Column(scale=1):
-             gr.Markdown("## 💬 Chat")
-             user_input = gr.Textbox(label="Ask a question about the uploaded content:", placeholder="Your question here...")
-             chat_button = gr.Button("Send")
-             chat_output = gr.Markdown()
-
-         with gr.Column(scale=1):
-             gr.Markdown("## 📚 Source Chunks")
-             sources_display = gr.Markdown()
-
-     # Hidden state to store sources
-     state = gr.State([])
-
-     # Define interactions
-     upload_button.click(handle_uploads, inputs=[pdf_input, web_link_input, youtube_link_input], outputs=upload_output)
-     clear_db_button.click(handle_clear_database, inputs=None, outputs=upload_output)
-     chat_button.click(handle_chat, inputs=[user_input, state], outputs=[chat_output, state])
-     state.change(display_sources, inputs=state, outputs=sources_display)
-
-     # Alternatively, use an event to update sources_display when state changes
-     def update_sources(sources):
-         return display_sources(sources)
-
-     chat_button.click(update_sources, inputs=state, outputs=sources_display)
-
- # Launch the Gradio app
- if __name__ == "__main__":
-     demo.launch()
-
-
- # Set up OpenAI client
- client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
-
- # Set up Pinecone
- pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
-
- index_name = "main"  # Your index name
- index = pc.Index(index_name)
-
- def get_embedding(text):
-     response = client.embeddings.create(input=text, model="text-embedding-3-large")
-     return response.data[0].embedding
-
- def process_pdf(file):
-     reader = PdfReader(file)
-     text = ""
-     for page in reader.pages:
-         text += page.extract_text() + "\n"
-     return text
-
- def process_web_link(url):
-     response = requests.get(url)
-     return response.text
-
- def process_youtube_link(url):
-     video_id = extract_video_id(url)
-     transcript = YouTubeTranscriptApi.get_transcript(video_id)
-     return " ".join([entry['text'] for entry in transcript])
-
- def extract_video_id(url):
-     parsed_url = urlparse(url)
-     if parsed_url.hostname == 'youtu.be':
-         return parsed_url.path[1:]
-     if parsed_url.hostname in ('www.youtube.com', 'youtube.com'):
-         if parsed_url.path == '/watch':
-             return parse_qs(parsed_url.query)['v'][0]
-         if parsed_url.path[:7] == '/embed/':
-             return parsed_url.path.split('/')[2]
-         if parsed_url.path[:3] == '/v/':
-             return parsed_url.path.split('/')[2]
-     return None
-
- def process_upload(upload_type, file_or_link, file_name=None):
-     print(f"Starting process_upload for {upload_type}")
-     doc_id = str(uuid.uuid4())
-     print(f"Generated doc_id: {doc_id}")
-
-     if upload_type == "PDF":
-         content = process_pdf(file_or_link)
-         doc_name = file_name or "Uploaded PDF"
-     elif upload_type == "Web Link":
-         content = process_web_link(file_or_link)
-         doc_name = file_or_link
-     elif upload_type == "YouTube Link":
-         content = process_youtube_link(file_or_link)
-         doc_name = f"YouTube: {file_or_link}"
      else:
          print("Invalid upload type")
          return "Invalid upload type"
@@ -407,7 +137,7 @@ def clear_database():

  # Streamlit UI
  st.set_page_config(layout="wide")
- st.title("Upload and Chat with PDFs, Web Links, and YouTube Videos")
+ st.title("Upload and Chat with PDFs")

  # Create three columns
  col1, col2, col3 = st.columns([1, 1, 1])
@@ -418,12 +148,6 @@ with col1:
      # PDF upload
      uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

-     # Web Link input
-     web_link = st.text_input("Enter a Web Link")
-
-     # YouTube Link input
-     youtube_link = st.text_input("Enter a YouTube Link")
-
      if st.button("Process All"):
          st.session_state.upload_progress = st.progress(0)
          with st.spinner("Processing uploads..."):
@@ -431,12 +155,6 @@ with col1:
              if uploaded_file:
                  pdf_result = process_upload("PDF", uploaded_file, uploaded_file.name)
                  results.append(pdf_result)
-             if web_link:
-                 web_result = process_upload("Web Link", web_link)
-                 results.append(web_result)
-             if youtube_link:
-                 youtube_result = process_upload("YouTube Link", youtube_link)
-                 results.append(youtube_result)

              if results:
                  for result in results:
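After this commit, app.py keeps only the Streamlit + PDF path. The unchanged middle of the file is not shown in the diff, so the following is a condensed sketch of the retained ingestion flow rather than the verbatim result: the helper names, chunk-size thresholds, embedding model, and the "main" index come from the diff context above, while the stripped-down Streamlit wiring and the single-threaded embedding loop are simplifying assumptions.

# Condensed sketch (not the verbatim post-commit file): PDF -> chunks -> embeddings -> Pinecone.
import os
import uuid

import streamlit as st
from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone
from PyPDF2 import PdfReader

load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
index = Pinecone(api_key=os.getenv("PINECONE_API_KEY")).Index("main")  # index name from the diff

def process_pdf(file):
    # Concatenate the extracted text of every page, one page per line.
    reader = PdfReader(file)
    return "\n".join(page.extract_text() or "" for page in reader.pages)

def get_embedding(text):
    # One embedding per chunk, using the model referenced in the diff context.
    return client.embeddings.create(input=text, model="text-embedding-3-large").data[0].embedding

def process_upload(file, file_name):
    content = process_pdf(file)
    # Chunk size scales with document length (1k / 2k / 4k chars), matching the thresholds in the diff.
    chunk_size = 1000 if len(content) < 10_000 else 2000 if len(content) < 100_000 else 4000
    chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
    doc_id = str(uuid.uuid4())
    vectors = [
        (f"{doc_id}_{i}", get_embedding(chunk),
         {"text": chunk, "type": "PDF", "doc_id": doc_id, "doc_name": file_name, "chunk_index": i})
        for i, chunk in enumerate(chunks)
    ]
    index.upsert(vectors=vectors)
    return f"Processing complete for PDF. Document Name: {file_name}"

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
if uploaded_file and st.button("Process All"):
    st.write(process_upload(uploaded_file, uploaded_file.name))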
 