# yougpt / rag_chain.py
# (Hugging Face Space file header — author harshpatel080503, commit c31dfff;
# commented out so the module is valid Python)
# rag_chain.py
import os
from dotenv import load_dotenv
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound, VideoUnavailable
import requests
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
# Load a local .env in development; in the deployed Hugging Face Space the
# same variables arrive as secret env vars, so this is a no-op there.
load_dotenv()

hf_token = os.getenv("HUGGINGFACEHUB_ACCESS_TOKEN")
openai_token = os.getenv("OPENAI_API_KEY")

# Re-export the tokens under the names the libraries expect, but only when
# they are actually set: `os.environ[...] = None` raises TypeError, which
# previously crashed the whole import when a secret was missing.
if hf_token:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token
if openai_token:
    os.environ["OPENAI_API_KEY"] = openai_token

# Hugging Face embeddings loaded from a pre-downloaded snapshot bundled with
# the app: HF_HOME points the hub cache at "Embedding Models", and
# local_files_only forbids any network download at startup.
os.environ['HF_HOME'] = 'Embedding Models'
embedding = HuggingFaceEmbeddings(
    model_name="Embedding Models/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf",
    model_kwargs={"local_files_only": True}
)
# Chat model served through OpenRouter's OpenAI-compatible endpoint (free
# tier of Meta's LLaMA 3.3 70B instruct). The client picks up
# OPENAI_API_KEY from the environment.
llm = ChatOpenAI(
    model="meta-llama/llama-3.3-70b-instruct:free",
    openai_api_base="https://openrouter.ai/api/v1",
)
# Prompt used by the RetrievalQA chain below: the retriever fills {context}
# with transcript chunks and the caller's query fills {question}.
qa_prompt = PromptTemplate(
    template="""
You are a helpful assistant answering questions based on YouTube video content.
Context:
{context}
Question:
{question}
Answer:""",
    input_variables=["context", "question"],
)
def fetch_transcript(video_id: str) -> str:
    """Fetch a YouTube video's transcript as a single plain-text string.

    Tries English first, then Hindi. Raises a generic ``Exception`` (kept
    generic for backward compatibility with existing callers) when the
    transcript is unavailable or any other fetch error occurs.
    """
    # NOTE(review): hard-coded public proxy — presumably to dodge YouTube
    # blocking the hosting IP; confirm it is still reachable and trusted.
    proxies = {
        "http": "http://219.65.73.81:80",
        "https": "http://219.65.73.81:80",
    }
    try:
        # Use the library's documented `proxies` parameter. The previous
        # monkey-patch (`YouTubeTranscriptApi._requests = session`) set a
        # private attribute the library never reads, so requests went out
        # without the proxy at all.
        transcript = YouTubeTranscriptApi.get_transcript(
            video_id, languages=["en", "hi"], proxies=proxies
        )
        return " ".join(t["text"] for t in transcript)
    except (TranscriptsDisabled, NoTranscriptFound, VideoUnavailable) as e:
        # Chain the cause so the original library error stays visible.
        raise Exception(f"Transcript not available for video_id {video_id}: {str(e)}") from e
    except Exception as e:
        raise Exception(f"Error fetching transcript: {str(e)}") from e
def build_chain(video_id: str) -> RetrievalQA:
    """Construct a RetrievalQA chain grounded in the video's transcript.

    The transcript is chunked, embedded into an in-memory FAISS index, and
    wired to the shared LLM with a conversation buffer so follow-up
    questions keep chat history.
    """
    transcript_text = fetch_transcript(video_id)

    # Chunk the transcript so each piece fits comfortably in the prompt.
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=500, chunk_overlap=50
    ).create_documents([transcript_text])

    # Embed the chunks and expose them through a similarity retriever.
    retriever = FAISS.from_documents(chunks, embedding).as_retriever()

    # Conversation history; output_key must match the chain's output key.
    history = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
        output_key="result",
    )

    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        memory=history,
        return_source_documents=True,
        output_key="result",
        chain_type_kwargs={"prompt": qa_prompt},
    )