Spaces:

PabloVD
/

CAMELSDocBot

Sleeping

App Files Files Community

PabloVD committed on Oct 30, 2024

Commit

bc84f5e

2 Parent(s): a08531a 127f3c4

Merge branch 'newbranch' into main

Browse files

Files changed (3) hide show

app.py +2 -1
requirements.txt +6 -14
worker.py +20 -13

app.py CHANGED Viewed

@@ -10,7 +10,8 @@ url = 'https://camels.readthedocs.io/_/downloads/en/latest/pdf/'
 r = requests.get(url, stream=True)
 document_path = Path('metadata.pdf')
 document_path.write_bytes(r.content)
-worker.process_document(str(document_path))
 def handle_prompt(message, history):
     bot_response = worker.process_prompt(message, history)

 r = requests.get(url, stream=True)
 document_path = Path('metadata.pdf')
 document_path.write_bytes(r.content)
+# document_path="2022GS.pdf"
+worker.process_document(document_path)
 def handle_prompt(message, history):
     bot_response = worker.process_prompt(message, history)

requirements.txt CHANGED Viewed

@@ -1,17 +1,9 @@
-Flask
-Flask_Cors
 pdf2image
 pypdf
 tiktoken
-pandas==1.5
-langchain==0.0.254
-atlassian-python-api==3.36.0
-chromadb==0.3.25
-huggingface-hub==0.16.4
-torch==2.0.1
-sentence-transformers==2.2.2
-InstructorEmbedding==1.0.0
-p4python==2023.1.2454917
-lxml==4.9.2
-bs4==0.0.1
-ibm-watson-machine-learning

 pdf2image
 pypdf
 tiktoken
+langchain
+langchain-community
+langchain-huggingface
+chromadb
+InstructorEmbedding
+huggingface_hub==0.25.2

worker.py CHANGED Viewed

@@ -1,14 +1,24 @@
 import torch
 from langchain.chains import RetrievalQA
-from langchain.embeddings import HuggingFaceInstructEmbeddings
-from langchain.document_loaders import PyPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.vectorstores import Chroma
-from langchain.llms import HuggingFaceHub
-import os
 # Check for GPU availability and set the appropriate device for computation.
 DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
 # Global variables
 conversation_retrieval_chain = None
@@ -20,10 +30,10 @@ embeddings = None
 def init_llm():
     global llm_hub, embeddings
     # Set up the environment variable for HuggingFace and initialize the desired model.
-    tokenfile = open("api_token.txt")
-    api_token = tokenfile.readline().replace("\n","")
-    tokenfile.close()
-    os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_token
     # repo name for the model
     # model_id = "tiiuae/falcon-7b-instruct"
@@ -32,16 +42,13 @@ def init_llm():
     # model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
     # load the model into the HuggingFaceHub
-    #llm_hub = HuggingFaceHub(repo_id=model_id, temperature=0.1, max_new_tokens=600, model_kwargs={"max_length":600})
-    llm_hub = HuggingFaceHub(repo_id=model_id, model_kwargs={"temperature": 0.1, "max_new_tokens": 600, "max_length": 600})
     llm_hub.client.api_url = 'https://api-inference.huggingface.co/models/'+model_id
     # llm_hub.invoke('foo bar')
     #Initialize embeddings using a pre-trained model to represent the text data.
     embedddings_model = "sentence-transformers/multi-qa-distilbert-cos-v1"
     # embedddings_model = "sentence-transformers/all-MiniLM-L6-v2"
-    # emb_model = SentenceTransformer(embedddings_model)
     embeddings = HuggingFaceInstructEmbeddings(
         model_name=embedddings_model,

 import torch
 from langchain.chains import RetrievalQA
+from langchain_community.embeddings import HuggingFaceInstructEmbeddings
+from langchain_community.document_loaders import PyPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import Chroma
+from langchain_huggingface import HuggingFaceEndpoint
+# import pip
+# def install(package):
+#     if hasattr(pip, 'main'):
+#         pip.main(['install', package])
+#     else:
+#         pip._internal.main(['install', package])
+# # Temporary fix for incompatibility between langchain_huggingface and sentence-transformers<2.6
+# install("sentence-transformers==2.2.2")
 # Check for GPU availability and set the appropriate device for computation.
 DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
+# DEVICE = "cpu"
 # Global variables
 conversation_retrieval_chain = None
 def init_llm():
     global llm_hub, embeddings
     # Set up the environment variable for HuggingFace and initialize the desired model.
+    # tokenfile = open("api_token.txt")
+    # api_token = tokenfile.readline().replace("\n","")
+    # tokenfile.close()
+    # os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_token
     # repo name for the model
     # model_id = "tiiuae/falcon-7b-instruct"
     # model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
     # load the model into the HuggingFaceHub
+    llm_hub = HuggingFaceEndpoint(repo_id=model_id, temperature=0.1, max_new_tokens=600, model_kwargs={"max_length":600})
     llm_hub.client.api_url = 'https://api-inference.huggingface.co/models/'+model_id
     # llm_hub.invoke('foo bar')
     #Initialize embeddings using a pre-trained model to represent the text data.
     embedddings_model = "sentence-transformers/multi-qa-distilbert-cos-v1"
     # embedddings_model = "sentence-transformers/all-MiniLM-L6-v2"
     embeddings = HuggingFaceInstructEmbeddings(
         model_name=embedddings_model,