SyedHasanCronosPMC committed on
Commit
b524645
·
verified ·
1 Parent(s): 40cedea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -27
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
  import gradio as gr
3
- from langchain.document_loaders import PyPDFLoader, YoutubeLoader
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain_openai import OpenAIEmbeddings
6
  from langchain_community.vectorstores import FAISS
@@ -12,61 +12,72 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or os.getenv("openai")
12
  if not OPENAI_API_KEY:
13
  raise ValueError("❌ OPENAI API Key not found. Please add it in Hugging Face secrets as 'OPENAI_API_KEY' or 'openai'.")
14
 
15
- # --- PROCESSING PIPELINE FUNCTION ---
16
- def process_inputs(pdf_file, youtube_url, query):
17
  docs = []
18
 
19
  # Load PDF
20
  try:
21
- pdf_path = pdf_file.name # ✅ Use .name to get the actual file path from Gradio
22
  pdf_loader = PyPDFLoader(pdf_path)
23
  docs.extend(pdf_loader.load())
24
  except Exception as e:
25
  return f"❌ Failed to load PDF: {e}"
26
 
27
- # Load YouTube Transcript
28
- try:
29
- yt_loader = YoutubeLoader.from_youtube_url(youtube_url, add_video_info=False)
30
- docs.extend(yt_loader.load())
31
- except Exception as e:
32
- return f"❌ Failed to load YouTube video: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  if not docs:
35
- return "❌ No documents could be loaded from the PDF or YouTube URL."
36
 
37
- # Split documents
38
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
39
  splits = splitter.split_documents(docs)
40
 
41
- # Embedding + Vector Store
42
- try:
43
- embedding = OpenAIEmbeddings(model="text-embedding-3-large", api_key=OPENAI_API_KEY)
44
- db = FAISS.from_documents(splits, embedding)
45
- except Exception as e:
46
- return f"❌ Embedding failed: {e}"
 
47
 
48
- # QA Chain
49
  try:
50
- llm = init_chat_model("gpt-4o-mini", model_provider="openai", api_key=OPENAI_API_KEY)
51
- qa = RetrievalQA.from_chain_type(llm, retriever=db.as_retriever())
52
  result = qa.invoke({"query": query})
53
  return result["result"]
54
  except Exception as e:
55
  return f"❌ Retrieval failed: {e}"
56
 
57
- # --- GRADIO APP ---
58
  with gr.Blocks() as demo:
59
- gr.Markdown("## 📚 Ask Questions from PDF + YouTube Transcript")
60
 
61
  with gr.Row():
62
  pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
63
- yt_input = gr.Textbox(label="YouTube URL", placeholder="https://www.youtube.com/watch?v=...")
 
64
 
65
- query_input = gr.Textbox(label="Your Question", placeholder="e.g., What did the PDF say about X?")
66
  output = gr.Textbox(label="Answer")
67
 
68
  run_button = gr.Button("Get Answer")
69
- run_button.click(fn=process_inputs, inputs=[pdf_input, yt_input, query_input], outputs=output)
70
 
71
  if __name__ == "__main__":
72
- demo.launch()
 
1
  import os
2
  import gradio as gr
3
+ from langchain.document_loaders import PyPDFLoader, YoutubeLoader, TextLoader
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain_openai import OpenAIEmbeddings
6
  from langchain_community.vectorstores import FAISS
 
12
  if not OPENAI_API_KEY:
13
  raise ValueError("❌ OPENAI API Key not found. Please add it in Hugging Face secrets as 'OPENAI_API_KEY' or 'openai'.")
14
 
15
+ # --- PROCESSING FUNCTION ---
16
+ def process_inputs(pdf_file, youtube_url, txt_file, query):
17
  docs = []
18
 
19
  # Load PDF
20
  try:
21
+ pdf_path = pdf_file.name
22
  pdf_loader = PyPDFLoader(pdf_path)
23
  docs.extend(pdf_loader.load())
24
  except Exception as e:
25
  return f"❌ Failed to load PDF: {e}"
26
 
27
+ # Load YouTube Transcript (optional)
28
+ yt_loaded = False
29
+ if youtube_url:
30
+ try:
31
+ yt_loader = YoutubeLoader.from_youtube_url(youtube_url, add_video_info=False)
32
+ docs.extend(yt_loader.load())
33
+ yt_loaded = True
34
+ except Exception as e:
35
+ print(f"⚠️ YouTube transcript not loaded: {e}")
36
+
37
+ # Load text transcript file (optional fallback)
38
+ if not yt_loaded and txt_file is not None:
39
+ try:
40
+ txt_path = txt_file.name
41
+ txt_loader = TextLoader(txt_path)
42
+ docs.extend(txt_loader.load())
43
+ except Exception as e:
44
+ return f"❌ Failed to load transcript file: {e}"
45
 
46
  if not docs:
47
+ return "❌ No documents could be loaded. Please check your inputs."
48
 
49
+ # Split text into chunks
50
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
51
  splits = splitter.split_documents(docs)
52
 
53
+ # Embed documents
54
+ embedding = OpenAIEmbeddings(model="text-embedding-3-large", api_key=OPENAI_API_KEY)
55
+ db = FAISS.from_documents(splits, embedding)
56
+
57
+ # Query using RetrievalQA
58
+ llm = init_chat_model("gpt-4o-mini", model_provider="openai", api_key=OPENAI_API_KEY)
59
+ qa = RetrievalQA.from_chain_type(llm, retriever=db.as_retriever())
60
 
 
61
  try:
 
 
62
  result = qa.invoke({"query": query})
63
  return result["result"]
64
  except Exception as e:
65
  return f"❌ Retrieval failed: {e}"
66
 
67
+ # --- GRADIO UI ---
68
  with gr.Blocks() as demo:
69
+ gr.Markdown("## 📚 Ask Questions from PDF + YouTube Transcript or .txt Upload")
70
 
71
  with gr.Row():
72
  pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
73
+ yt_input = gr.Textbox(label="YouTube URL (Optional)", placeholder="https://www.youtube.com/watch?v=...")
74
+ txt_input = gr.File(label="Upload Transcript .txt (Optional fallback)", file_types=[".txt"])
75
 
76
+ query_input = gr.Textbox(label="Your Question", placeholder="e.g., What did the document say about X?")
77
  output = gr.Textbox(label="Answer")
78
 
79
  run_button = gr.Button("Get Answer")
80
+ run_button.click(fn=process_inputs, inputs=[pdf_input, yt_input, txt_input, query_input], outputs=output)
81
 
82
  if __name__ == "__main__":
83
+ demo.launch()