Spaces:

supertskone
/

prompt-search-engine

Sleeping

supertskone committed on Jul 26, 2024

Commit

be3ffc8

unverified ·

2 Parent(s): a1bf242 8271f70

Merge branch 'main' of https://huggingface.co/spaces/supertskone/prompt-search-engine

Files changed (4) hide show

Dockerfile CHANGED Viewed

@@ -1,6 +1,26 @@
 # Use an official Python runtime as a parent image
 FROM python:3.12-slim
 # Set the working directory
 WORKDIR /app
@@ -8,7 +28,7 @@ WORKDIR /app
 COPY . /app
 # Install system dependencies
-RUN apt-get update && apt-get install -y \
     git \
     && rm -rf /var/lib/apt/lists/*
@@ -16,8 +36,16 @@ RUN apt-get update && apt-get install -y \
 RUN pip install --no-cache-dir --upgrade pip
 RUN pip install --no-cache-dir -r requirements.txt
-# Expose port 8501 for Streamlit
 EXPOSE 8501
-# Run the Streamlit app
-CMD ["streamlit", "run", "run.py", "--server.port=8501", "--server.address=0.0.0.0"]

 # Use an official Python runtime as a parent image
 FROM python:3.12-slim
+# Create a non-root user and switch to that user
+RUN useradd -m -u 1000 user
+# Set the home directory for the user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+# Set the working directory for the new user
+WORKDIR $HOME/app
+# Change ownership of the app directory
+COPY --chown=user . $HOME/app
+# Switch to the new user
+USER user
 # Set the working directory
 WORKDIR /app
 COPY . /app
 # Install system dependencies
+RUN apt-get update && apt-get install -u 0 -y \
     git \
     && rm -rf /var/lib/apt/lists/*
 RUN pip install --no-cache-dir --upgrade pip
 RUN pip install --no-cache-dir -r requirements.txt
+# Set environment variable for Hugging Face cache
+ENV TRANSFORMERS_CACHE=/app/cache
+# Create the cache directory
+RUN mkdir -p /app/cache/hub
+RUN chmod -R 777 /app/cache
+# Expose port 8501 for Streamlit and port 5000 for Flask
 EXPOSE 8501
+EXPOSE 5000
+# Run data loading, backend, and frontend
+CMD ["sh", "-c", "python load_data.py && python run.py & streamlit run ui/app.py --server.port=8501 --server.address=0.0.0.0"]

app/vectorizer.py CHANGED Viewed

@@ -9,6 +9,10 @@ from pinecone import Pinecone, ServerlessSpec
 # Disable parallelism for tokenizers
 os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -20,7 +24,7 @@ class Vectorizer:
         self.model = SentenceTransformer(model_name)
         self.prompts = []
         self.batch_size = batch_size
-        self.pinecone_index_name = "prompts-index"
         self._init_pinecone = init_pinecone
         self._setup_pinecone()
         self._load_prompts()

 # Disable parallelism for tokenizers
 os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+# Ensure the cache directory exists
+os.makedirs('/app/cache/hub', exist_ok=True)
+os.environ['HF_HOME'] = '/app/cache/hub'
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
         self.model = SentenceTransformer(model_name)
         self.prompts = []
         self.batch_size = batch_size
+        self.pinecone_index_name = "search-prompts-index"
         self._init_pinecone = init_pinecone
         self._setup_pinecone()
         self._load_prompts()

load_data.py CHANGED Viewed

@@ -1,5 +1,8 @@
 from app.vectorizer import Vectorizer
 if __name__ == "__main__":
     vectorizer = Vectorizer()
     vectorizer.store_from_dataset(store_data=True)  # Run this once to load the dataset into Pinecone

+import os
 from app.vectorizer import Vectorizer
+os.environ['HF_HOME'] = '/app/cache/hub'
 if __name__ == "__main__":
     vectorizer = Vectorizer()
     vectorizer.store_from_dataset(store_data=True)  # Run this once to load the dataset into Pinecone

ui/app.py CHANGED Viewed

@@ -14,7 +14,8 @@ n = st.number_input("Number of results:", min_value=1, max_value=20, value=5)
 if st.button("Search"):
     search_method = use_pinecone == 'Pinecone Vector Search'
-    response = requests.post("http://localhost:5000/search", json={"query": query, "n": n, "use_pinecone": search_method})
     # Log the response for debugging
     st.write("Response Status Code:", response.status_code)

 if st.button("Search"):
     search_method = use_pinecone == 'Pinecone Vector Search'
+    backend_url = "http://supertskone-prompt-search-engine.hf.space/search"
+    response = requests.post(backend_url, json={"query": query, "n": n, "use_pinecone": search_method})
     # Log the response for debugging
     st.write("Response Status Code:", response.status_code)