Spaces:

acpotts
/

w8d2

Runtime error

App Files Files Community

acpotts committed on Oct 7, 2024

Commit

6a9b7c2

1 Parent(s): 0be6f64

dockerfile

Browse files

Files changed (2) hide show

Dockerfile +0 -2
app.py +58 -26

Dockerfile CHANGED Viewed

@@ -1,5 +1,3 @@
-AIM version:
 FROM python:3.9
 RUN useradd -m -u 1000 user
 USER user

 FROM python:3.9
 RUN useradd -m -u 1000 user
 USER user

app.py CHANGED Viewed

@@ -11,6 +11,9 @@ from langchain_core.prompts import PromptTemplate
 from langchain.schema.output_parser import StrOutputParser
 from langchain.schema.runnable import RunnablePassthrough
 from langchain.schema.runnable.config import RunnableConfig
 # GLOBAL SCOPE - ENTIRE APPLICATION HAS ACCESS TO VALUES SET IN THIS SCOPE #
 # ---- ENV VARIABLES ---- #
@@ -44,42 +47,73 @@ documents = text_loader.load()
 ### 2. CREATE TEXT SPLITTER AND SPLIT DOCUMENTS
 text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=30)
-split_documents = text_splitter.split_documents(documents)
 ### 3. LOAD HUGGINGFACE EMBEDDINGS
-hf_embeddings  = HuggingFaceEndpointEmbeddings(
-    model=HF_EMBED_ENDPOINT, #HERE
     task="feature-extraction",
     huggingfacehub_api_token=os.environ["HF_TOKEN"],
 )
-if os.path.exists("./data/vectorstore"):
-    vectorstore = FAISS.load_local(
-        "./data/vectorstore",
-        hf_embeddings,
-        allow_dangerous_deserialization=True # this is necessary to load the vectorstore from disk as it's stored as a `.pkl` file.
-    )
-    hf_retriever = vectorstore.as_retriever()
-    print("Loaded Vectorstore")
-else:
     print("Indexing Files")
-    os.makedirs("./data/vectorstore", exist_ok=True)
-    ### 4. INDEX FILES
-    ### NOTE: REMEMBER TO BATCH THE DOCUMENTS WITH MAXIMUM BATCH SIZE = 32
-    for i in range(0, len(split_documents), 32):
-        if i == 0:
-            vectorstore = FAISS.from_documents(split_documents[i:i+32], hf_embeddings)
-            continue
-        vectorstore.add_documents(split_documents[i:i+32])
-hf_retriever = vectorstore.as_retriever()
 # -- AUGMENTED -- #
 """
 1. Define a String Template
 2. Create a Prompt Template from the String Template
 """
-### 1. DEFINE STRING TEMPLATE
 RAG_PROMPT_TEMPLATE = """\
 <|start_header_id|>system<|end_header_id|>
 You are a helpful assistant. You answer user questions based on provided context. If you can't answer the question with the provided context, say you don't know.<|eot_id|>
@@ -94,16 +128,14 @@ Context:
 <|start_header_id|>assistant<|end_header_id|>
 """
-### 2. CREATE PROMPT TEMPLATE
 rag_prompt = PromptTemplate.from_template(RAG_PROMPT_TEMPLATE)
 # -- GENERATION -- #
 """
 1. Create a HuggingFaceEndpoint for the LLM
 """
 ### 1. CREATE HUGGINGFACE ENDPOINT FOR LLM
-hf_llm  = HuggingFaceEndpoint(
     endpoint_url=f"{HF_LLM_ENDPOINT}",
     max_new_tokens=512,
     top_k=10,
@@ -111,7 +143,7 @@ hf_llm  = HuggingFaceEndpoint(
     typical_p=0.95,
     temperature=0.01,
     repetition_penalty=1.03,
-    huggingfacehub_api_token=HF_TOKEN
 )
 @cl.author_rename

 from langchain.schema.output_parser import StrOutputParser
 from langchain.schema.runnable import RunnablePassthrough
 from langchain.schema.runnable.config import RunnableConfig
+from tqdm.asyncio import tqdm_asyncio
+import asyncio
+from tqdm.asyncio import tqdm
 # GLOBAL SCOPE - ENTIRE APPLICATION HAS ACCESS TO VALUES SET IN THIS SCOPE #
 # ---- ENV VARIABLES ---- #
 ### 2. CREATE TEXT SPLITTER AND SPLIT DOCUMENTS
 text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=30)
+split_documents = split_documents = text_splitter.split_documents(documents)
+print(len(split_documents))
 ### 3. LOAD HUGGINGFACE EMBEDDINGS
+hf_embeddings = HuggingFaceEndpointEmbeddings(
+    model=HF_EMBED_ENDPOINT,
     task="feature-extraction",
     huggingfacehub_api_token=os.environ["HF_TOKEN"],
 )
+async def add_documents_async(vectorstore, documents):  # Coroutine wrapper: add `documents` to `vectorstore`; returns None.
+    await vectorstore.aadd_documents(documents)  # Delegates to the store's own async add; its result is discarded.
+async def process_batch(vectorstore, batch, is_first_batch, pbar):
+    if is_first_batch:
+        result = await FAISS.afrom_documents(batch, hf_embeddings)
+    else:
+        await add_documents_async(vectorstore, batch)
+        result = vectorstore
+    pbar.update(len(batch))
+    return result
+async def main():
     print("Indexing Files")
+    vectorstore = None
+    batch_size = 32
+    batches = [split_documents[i:i+batch_size] for i in range(0, len(split_documents), batch_size)]
+    async def process_all_batches():
+        nonlocal vectorstore
+        tasks = []
+        pbars = []
+        for i, batch in enumerate(batches):
+            pbar = tqdm(total=len(batch), desc=f"Batch {i+1}/{len(batches)}", position=i)
+            pbars.append(pbar)
+            if i == 0:
+                vectorstore = await process_batch(None, batch, True, pbar)
+            else:
+                tasks.append(process_batch(vectorstore, batch, False, pbar))
+        if tasks:
+            await asyncio.gather(*tasks)
+        for pbar in pbars:
+            pbar.close()
+    await process_all_batches()
+    hf_retriever = vectorstore.as_retriever()
+    print("\nIndexing complete. Vectorstore is ready for use.")
+    return hf_retriever
+async def run():
+    retriever = await main()
+    return retriever
+hf_retriever = asyncio.run(run())  # Runs indexing to completion at module load. NOTE(review): asyncio.run() raises RuntimeError if an event loop is already running in this thread - confirm how this module gets imported.
 # -- AUGMENTED -- #
 """
 1. Define a String Template
 2. Create a Prompt Template from the String Template
 """
 RAG_PROMPT_TEMPLATE = """\
 <|start_header_id|>system<|end_header_id|>
 You are a helpful assistant. You answer user questions based on provided context. If you can't answer the question with the provided context, say you don't know.<|eot_id|>
 <|start_header_id|>assistant<|end_header_id|>
 """
 rag_prompt = PromptTemplate.from_template(RAG_PROMPT_TEMPLATE)
 # -- GENERATION -- #
 """
 1. Create a HuggingFaceEndpoint for the LLM
 """
 ### 1. CREATE HUGGINGFACE ENDPOINT FOR LLM
+hf_llm = HuggingFaceEndpoint(
     endpoint_url=f"{HF_LLM_ENDPOINT}",
     max_new_tokens=512,
     top_k=10,
     typical_p=0.95,
     temperature=0.01,
     repetition_penalty=1.03,
+    huggingfacehub_api_token=os.environ["HF_TOKEN"]
 )
 @cl.author_rename