Spaces:

Fecalisboa
/

lucIAna

Runtime error

App Files Community

Fecalisboa committed on May 30, 2024

Commit

8dabaa3

verified ·

1 Parent(s): ce1efe0

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -22

app.py CHANGED Viewed

@@ -20,7 +20,7 @@ api_token = os.getenv("HF_TOKEN")
-list_llm = ["meta-llama/Meta-Llama-3-8B-Instruct", "mistralai/Mistral-7B-Instruct-v0.3","google/flan-t5-base"]
 list_llm_simple = [os.path.basename(llm) for llm in list_llm]
 # Load PDF document and create doc splits
@@ -34,15 +34,36 @@ def load_doc(list_file_path, chunk_size, chunk_overlap):
     return doc_splits
 # Create vector database
-def create_db(splits, collection_name):
     embedding = HuggingFaceEmbeddings()
-    new_client = chromadb.EphemeralClient()
-    vectordb = Chroma.from_documents(
-        documents=splits,
-        embedding=embedding,
-        client=new_client,
-        collection_name=collection_name,
-    )
     return vectordb
 # Load vector database
@@ -67,14 +88,7 @@ def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db, pr
             max_new_tokens=max_tokens,
             top_k=top_k,
         )
-    elif llm_model == "mistralai/Mistral-7B-Instruct-v0.3":
-        llm = HuggingFaceEndpoint(
-            repo_id=llm_model,
-            huggingfacehub_api_token=api_token,
-            temperature=temperature,
-            max_new_tokens=max_tokens,
-            top_k=top_k,
-        )
     else:
         llm = HuggingFaceEndpoint(
@@ -122,14 +136,14 @@ def create_collection_name(filepath):
     return collection_name
 # Initialize database
-def initialize_database(list_file_obj, chunk_size, chunk_overlap, progress=gr.Progress()):
     list_file_path = [x.name for x in list_file_obj if x is not None]
     progress(0.1, desc="Creating collection name...")
     collection_name = create_collection_name(list_file_path[0])
     progress(0.25, desc="Loading document...")
     doc_splits = load_doc(list_file_path, chunk_size, chunk_overlap)
     progress(0.5, desc="Generating vector database...")
-    vector_db = create_db(doc_splits, collection_name)
     progress(0.9, desc="Done!")
     return vector_db, collection_name, "Complete!"
@@ -190,7 +204,7 @@ def demo():
         with gr.Tab("Step 2 - Process document"):
             with gr.Row():
-                db_btn = gr.Radio(["ChromaDB"], label="Vector database type", value="ChromaDB", type="index", info="Choose your vector database")
             with gr.Accordion("Advanced options - Document text splitter", open=False):
                 with gr.Row():
                     slider_chunk_size = gr.Slider(minimum=100, maximum=1000, value=600, step=20, label="Chunk size", info="Chunk size", interactive=True)
@@ -237,7 +251,7 @@ def demo():
         # Preprocessing events
         db_btn.click(initialize_database,
-            inputs=[document, slider_chunk_size, slider_chunk_overlap],
             outputs=[vector_db, collection_name, db_progress])
         qachain_btn.click(initialize_LLM,
             inputs=[llm_btn, slider_temperature, slider_maxtokens, slider_topk, vector_db],
@@ -261,6 +275,5 @@ def demo():
             queue=False)
     demo.queue().launch(debug=True)
 if __name__ == "__main__":
     demo()

+list_llm = ["meta-llama/Meta-Llama-3-8B-Instruct", "mistralai/Mistral-7B-Instruct-v0.3"]
 list_llm_simple = [os.path.basename(llm) for llm in list_llm]
 # Load PDF document and create doc splits
     return doc_splits
 # Create vector database
def create_db(splits, collection_name, db_type):
    """Build a vector store from document splits with the selected backend.

    Args:
        splits: Document chunks, forwarded as ``documents=`` to the backend's
            ``from_documents`` constructor.
        collection_name: Name of the collection; forwarded only to the
            ChromaDB and Milvus backends.
        db_type: Backend selector. Either one of the labels ``"ChromaDB"``,
            ``"FAISS"``, ``"ScaNN"``, ``"Milvus"``, or the integer position of
            that label in this order.

    Returns:
        The vector store object produced by the backend's ``from_documents``.

    Raises:
        ValueError: If ``db_type`` is neither a supported label nor a valid
            index into the supported labels.
    """
    # Same order as the choices of the "Vector database type" radio. That
    # radio is declared with type="index", so the selector can arrive as a
    # position instead of a label; accept both forms.
    supported = ("ChromaDB", "FAISS", "ScaNN", "Milvus")
    if isinstance(db_type, int) and not isinstance(db_type, bool):
        if 0 <= db_type < len(supported):
            db_type = supported[db_type]

    # Validate before building the embedding object so that a bad selector
    # fails immediately instead of after the embedding setup.
    if db_type not in supported:
        raise ValueError(f"Unsupported vector database type: {db_type}")

    embedding = HuggingFaceEmbeddings()

    # NOTE(review): the imports for FAISS, ScaNN and Milvus are not visible in
    # this part of the file -- confirm they exist at module level.
    if db_type == "ChromaDB":
        new_client = chromadb.EphemeralClient()
        return Chroma.from_documents(
            documents=splits,
            embedding=embedding,
            client=new_client,
            collection_name=collection_name,
        )
    if db_type == "FAISS":
        return FAISS.from_documents(documents=splits, embedding=embedding)
    if db_type == "ScaNN":
        return ScaNN.from_documents(documents=splits, embedding=embedding)
    # Only "Milvus" remains after the validation above.
    return Milvus.from_documents(
        documents=splits,
        embedding=embedding,
        collection_name=collection_name,
    )
 # Load vector database
             max_new_tokens=max_tokens,
             top_k=top_k,
         )
     else:
         llm = HuggingFaceEndpoint(
     return collection_name
 # Initialize database
def initialize_database(list_file_obj, chunk_size, chunk_overlap, db_type, progress=gr.Progress()):
    """Create a vector database from the uploaded documents.

    Args:
        list_file_obj: Uploaded file objects; each non-``None`` entry must
            expose a ``.name`` attribute holding its path.
        chunk_size: Forwarded to ``load_doc``.
        chunk_overlap: Forwarded to ``load_doc``.
        db_type: Vector database selector, forwarded to ``create_db``.
        progress: Progress reporter, called with a fraction and a ``desc``.

    Returns:
        A ``(vector_db, collection_name, status_message)`` tuple.

    Raises:
        gr.Error: If no document was supplied.
    """
    # A missing upload, or a list holding only None entries, would otherwise
    # fail with a bare TypeError/IndexError when the first path is read below.
    list_file_path = [x.name for x in (list_file_obj or []) if x is not None]
    if not list_file_path:
        raise gr.Error("Please upload at least one document before generating the vector database.")

    progress(0.1, desc="Creating collection name...")
    # The collection is named after the first uploaded file only.
    collection_name = create_collection_name(list_file_path[0])

    progress(0.25, desc="Loading document...")
    doc_splits = load_doc(list_file_path, chunk_size, chunk_overlap)

    progress(0.5, desc="Generating vector database...")
    vector_db = create_db(doc_splits, collection_name, db_type)

    progress(0.9, desc="Done!")
    return vector_db, collection_name, "Complete!"
         with gr.Tab("Step 2 - Process document"):
             with gr.Row():
+                db_btn = gr.Radio(["ChromaDB", "FAISS", "ScaNN", "Milvus"], label="Vector database type", value="ChromaDB", type="index", info="Choose your vector database")
             with gr.Accordion("Advanced options - Document text splitter", open=False):
                 with gr.Row():
                     slider_chunk_size = gr.Slider(minimum=100, maximum=1000, value=600, step=20, label="Chunk size", info="Chunk size", interactive=True)
         # Preprocessing events
         db_btn.click(initialize_database,
+            inputs=[document, slider_chunk_size, slider_chunk_overlap, db_btn],
             outputs=[vector_db, collection_name, db_progress])
         qachain_btn.click(initialize_LLM,
             inputs=[llm_btn, slider_temperature, slider_maxtokens, slider_topk, vector_db],
             queue=False)
     demo.queue().launch(debug=True)
 if __name__ == "__main__":
     demo()