harshpatel080503 committed on
Commit
f60fca3
·
verified ·
1 Parent(s): a75d25d

Update rag_chain.py

Browse files
Files changed (1) hide show
  1. rag_chain.py +22 -7
rag_chain.py CHANGED
@@ -2,7 +2,8 @@
2
 
3
  import os
4
  from dotenv import load_dotenv
5
- from youtube_transcript_api import YouTubeTranscriptApi
 
6
 
7
  from langchain.embeddings import HuggingFaceEmbeddings
8
  from langchain.vectorstores import FAISS
@@ -23,6 +24,7 @@ os.environ["OPENAI_API_KEY"] = openai_token
23
 
24
  # Hugging Face Embeddings
25
  os.environ['HF_HOME'] = 'Embedding Models'
 
26
  embedding = HuggingFaceEmbeddings(
27
  model_name="Embedding Models/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf",
28
  model_kwargs={"local_files_only": True}
@@ -49,14 +51,27 @@ Answer:""",
49
  input_variables=["context", "question"],
50
  )
51
 
52
- # Fetch transcript using YouTubeTranscriptApi
53
- def fetch_transcript(video_id: str) -> str:
54
- transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en", "hi"])
55
- return " ".join([t["text"] for t in transcript])
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  # Build RAG chain from transcript
58
- def build_chain(video_id: str) -> RetrievalQA:
59
- text = fetch_transcript(video_id)
60
 
61
  splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
62
  docs = splitter.create_documents([text])
 
2
 
3
  import os
4
  from dotenv import load_dotenv
5
+ from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound, VideoUnavailable
6
+ import requests
7
 
8
  from langchain.embeddings import HuggingFaceEmbeddings
9
  from langchain.vectorstores import FAISS
 
24
 
25
  # Hugging Face Embeddings
26
  os.environ['HF_HOME'] = 'Embedding Models'
27
+
28
  embedding = HuggingFaceEmbeddings(
29
  model_name="Embedding Models/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf",
30
  model_kwargs={"local_files_only": True}
 
51
  input_variables=["context", "question"],
52
  )
53
 
54
+ # Updated to optionally accept proxies
55
def fetch_transcript(video_id: str, proxies: dict = None) -> str:
    """Fetch a YouTube video's transcript and return it as a single string.

    English ("en") is requested first, with Hindi ("hi") as the fallback
    language.

    Args:
        video_id: ID of the YouTube video whose transcript is wanted.
        proxies: Optional mapping in the ``requests`` proxies format
            (e.g. ``{"https": "http://host:port"}``). A falsy value means
            no proxy is configured.

    Returns:
        The ``"text"`` field of every transcript entry, joined with single
        spaces.

    Raises:
        Exception: With a "Transcript not available ..." message when the
            transcript is disabled, not found, or the video is unavailable;
            with an "Error fetching transcript ..." message for any other
            failure. The underlying error is attached as ``__cause__``.
    """
    try:
        # Pass the proxies through the library's own ``proxies`` keyword
        # rather than assigning a session to ``YouTubeTranscriptApi._requests``.
        # The classmethod API used here is not known to read that attribute,
        # so the previous monkey patch left the proxy settings unused while
        # mutating class-level state and leaving a session unclosed.
        transcript = YouTubeTranscriptApi.get_transcript(
            video_id,
            languages=["en", "hi"],
            proxies=proxies or None,
        )
        return " ".join(entry["text"] for entry in transcript)
    except (TranscriptsDisabled, NoTranscriptFound, VideoUnavailable) as e:
        # Known "no transcript" conditions get their own message so callers
        # can tell them apart from transport or parsing failures.
        raise Exception(
            f"Transcript not available for video_id {video_id}: {str(e)}"
        ) from e
    except Exception as e:
        raise Exception(f"Error fetching transcript: {str(e)}") from e
71
 
72
  # Build RAG chain from transcript
73
+ def build_chain(video_id: str, proxies: dict = None) -> RetrievalQA:
74
+ text = fetch_transcript(video_id, proxies=proxies)
75
 
76
  splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
77
  docs = splitter.create_documents([text])