bstraehle committed
Commit d871888 · 1 Parent(s): 865d70c

Update app.py

Files changed (1)
  1. app.py +22 -21
app.py CHANGED
@@ -33,10 +33,10 @@ YOUTUBE_DIR = "/data/youtube"
 YOUTUBE_URL_1 = "https://www.youtube.com/watch?v=--khbXchTeE"
 YOUTUBE_URL_2 = "https://www.youtube.com/watch?v=hdhZwyf24mE"
 YOUTUBE_URL_3 = "https://www.youtube.com/watch?v=vw-KWfKwvTQ"
-
-YOUTUBE_URL_4 = "https://www.youtube.com/shorts/3x95mw35dJY"
-YOUTUBE_URL_5 = "https://www.youtube.com/shorts/zg-DS23wq0c"
-YOUTUBE_URL_6 = "https://www.youtube.com/shorts/cS4fyhKZ8bQ"
+YOUTUBE_URL_4 = "https://www.youtube.com/watch?v=kiHpqXNCPj8"
+YOUTUBE_URL_5 = "https://www.youtube.com/shorts/3x95mw35dJY"
+YOUTUBE_URL_6 = "https://www.youtube.com/shorts/zg-DS23wq0c"
+YOUTUBE_URL_7 = "https://www.youtube.com/shorts/cS4fyhKZ8bQ"
 
 MODEL_NAME = "gpt-4"
 
@@ -46,20 +46,21 @@ def invoke(openai_api_key, use_rag, prompt):
                      temperature = 0)
     if (use_rag):
         # Document loading, splitting, and storage
-        #loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,
-        #                                           YOUTUBE_URL_2,
-        #                                           YOUTUBE_URL_3,
-        #                                           YOUTUBE_URL_4,
-        #                                           YOUTUBE_URL_5,
-        #                                           YOUTUBE_URL_6], YOUTUBE_DIR),
-        #                       OpenAIWhisperParser())
-        #docs = loader.load()
-        #text_splitter = RecursiveCharacterTextSplitter(chunk_overlap = 150,
-        #                                               chunk_size = 1500)
-        #splits = text_splitter.split_documents(docs)
-        #vector_db = Chroma.from_documents(documents = splits,
-        #                                  embedding = OpenAIEmbeddings(),
-        #                                  persist_directory = CHROMA_DIR)
+        loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,
+                                                   YOUTUBE_URL_2,
+                                                   YOUTUBE_URL_3,
+                                                   YOUTUBE_URL_4,
+                                                   YOUTUBE_URL_5,
+                                                   YOUTUBE_URL_6,
+                                                   YOUTUBE_URL_7], YOUTUBE_DIR),
+                               OpenAIWhisperParser())
+        docs = loader.load()
+        text_splitter = RecursiveCharacterTextSplitter(chunk_overlap = 150,
+                                                       chunk_size = 1500)
+        splits = text_splitter.split_documents(docs)
+        vector_db = Chroma.from_documents(documents = splits,
+                                          embedding = OpenAIEmbeddings(),
+                                          persist_directory = CHROMA_DIR)
         # Document retrieval
         vector_db = Chroma(embedding_function = OpenAIEmbeddings(),
                            persist_directory = CHROMA_DIR)
@@ -78,9 +79,9 @@ description = """<strong>Overview:</strong> The app demonstrates how to use a La
 (YouTube videos, PDFs, URLs, or other <a href='https://raw.githubusercontent.com/bstraehle/ai-ml-dl/c38b224c196fc984aab6b6cc6bdc666f8f4fbcff/langchain/document-loaders.png'>data sources</a>).\n\n
 <strong>Instructions:</strong> Enter an OpenAI API key and perform LLM use cases (semantic search, sentiment analysis, summarization, translation, etc.) on YouTube videos about GPT-4.
 <ul style="list-style-type:square;">
-<li>Set "Retrieval Augmented Generation" to "<strong>False</strong>" and submit prompt "explain gpt-4". The LLM <strong>without</strong> RAG does not know the answer.</li>
-<li>Set "Retrieval Augmented Generation" to "<strong>True</strong>" and submit prompt "explain gpt-4". The LLM <strong>with</strong> RAG knows the answer.</li>
-<li>Experiment with different prompts, for example "explain gpt-4 in german", "list pros and cons of gpt-4", or "compare gtp-4 and gpt-3.5 in JSON".</li>
+<li>Set "Retrieval Augmented Generation" to "<strong>False</strong>" and submit prompt "Explain GPT-4". The LLM <strong>without</strong> RAG does not know the answer.</li>
+<li>Set "Retrieval Augmented Generation" to "<strong>True</strong>" and submit prompt "Explain GPT-4". The LLM <strong>with</strong> RAG knows the answer.</li>
+<li>Experiment with different prompts, for example "Explain GPT-4 in one sentence, output in German", "List pros and cons of GPT-4", or "Compare GPT-4 and Claude 2, output in JSON".</li>
 </ul>\n\n
 <strong>Technology:</strong> <a href='https://www.gradio.app/'>Gradio</a> UI using <a href='https://platform.openai.com/'>OpenAI</a> API via AI-first
 <a href='https://www.langchain.com/'>LangChain</a> toolkit with <a href='https://openai.com/research/whisper'>Whisper</a> (speech-to-text) and
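
Note on the pipeline this commit re-enables: GenericLoader downloads the audio of each YouTube URL into YOUTUBE_DIR, OpenAIWhisperParser transcribes it, RecursiveCharacterTextSplitter cuts the transcripts into 1500-character chunks with 150-character overlap, and Chroma.from_documents embeds and persists the chunks under CHROMA_DIR. Ingestion runs once; the store is then reopened for retrieval. The diff ends right after that reopen, so what follows is only a minimal sketch of how such a persisted store is typically queried with a retrieval QA chain in 0.0.x-era LangChain; the answer() function, the rag_chain name, and the CHROMA_DIR value are illustrative assumptions, not code from this commit.

# Minimal sketch (assumptions noted above); classic LangChain imports.
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

CHROMA_DIR = "/data/chroma"  # assumed value; the constant is defined outside this diff
MODEL_NAME = "gpt-4"

def answer(openai_api_key, prompt):
    # Reopen the vector store persisted by the ingestion step in the diff.
    vector_db = Chroma(embedding_function = OpenAIEmbeddings(openai_api_key = openai_api_key),
                       persist_directory = CHROMA_DIR)
    llm = ChatOpenAI(model_name = MODEL_NAME,
                     openai_api_key = openai_api_key,
                     temperature = 0)
    # Embed the prompt, fetch the most similar transcript chunks, and answer over them.
    rag_chain = RetrievalQA.from_chain_type(llm = llm,
                                            retriever = vector_db.as_retriever())
    return rag_chain.run(prompt)

Because Whisper transcription and embedding happen only at ingestion, the retrieval path embeds just the incoming prompt, so reopening vector_db from persist_directory on every call is cheap and never re-downloads the videos.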