supertskone committed on
Commit
be3ffc8
·
unverified ·
2 Parent(s): a1bf242 8271f70

Merge branch 'main' of https://huggingface.co/spaces/supertskone/prompt-search-engine

Browse files
Files changed (4) hide show
  1. Dockerfile +32 -4
  2. app/vectorizer.py +5 -1
  3. load_data.py +3 -0
  4. ui/app.py +2 -1
Dockerfile CHANGED
@@ -1,6 +1,26 @@
1
  # Use an official Python runtime as a parent image
2
  FROM python:3.12-slim
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  # Set the working directory
5
  WORKDIR /app
6
 
@@ -8,7 +28,7 @@ WORKDIR /app
8
  COPY . /app
9
 
10
  # Install system dependencies
11
- RUN apt-get update && apt-get install -y \
12
  git \
13
  && rm -rf /var/lib/apt/lists/*
14
 
@@ -16,8 +36,16 @@ RUN apt-get update && apt-get install -y \
16
  RUN pip install --no-cache-dir --upgrade pip
17
  RUN pip install --no-cache-dir -r requirements.txt
18
 
19
- # Expose port 8501 for Streamlit
 
 
 
 
 
 
 
20
  EXPOSE 8501
 
21
 
22
- # Run the Streamlit app
23
- CMD ["streamlit", "run", "run.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
1
  # Use an official Python runtime as a parent image
2
  FROM python:3.12-slim
3
 
4
+ # Create a non-root user with UID 1000
5
+ RUN useradd -m -u 1000 user
6
+
7
+
8
+ # Set the home directory for the user
9
+ ENV HOME=/home/user \
10
+
11
+ PATH=/home/user/.local/bin:$PATH
12
+
13
+ # Set the working directory for the new user
14
+ WORKDIR $HOME/app
15
+
16
+
17
+ # Copy the application files into the app directory, owned by the new user
18
+ COPY --chown=user . $HOME/app
19
+
20
+
21
+ # Switch to the new user
22
+ USER user
23
+
24
  # Set the working directory
25
  WORKDIR /app
26
 
 
28
  COPY . /app
29
 
30
  # Install system dependencies
31
+ RUN apt-get update && apt-get install -u 0 -y \
32
  git \
33
  && rm -rf /var/lib/apt/lists/*
34
 
 
36
  RUN pip install --no-cache-dir --upgrade pip
37
  RUN pip install --no-cache-dir -r requirements.txt
38
 
39
+ # Set environment variable for Hugging Face cache
40
+ ENV TRANSFORMERS_CACHE=/app/cache
41
+
42
+ # Create the cache directory
43
+ RUN mkdir -p /app/cache/hub
44
+ RUN chmod -R 777 /app/cache
45
+
46
+ # Expose port 8501 for Streamlit and port 5000 for Flask
47
  EXPOSE 8501
48
+ EXPOSE 5000
49
 
50
+ # Run data loading, backend, and frontend
51
+ CMD ["sh", "-c", "python load_data.py && python run.py & streamlit run ui/app.py --server.port=8501 --server.address=0.0.0.0"]
app/vectorizer.py CHANGED
@@ -9,6 +9,10 @@ from pinecone import Pinecone, ServerlessSpec
9
  # Disable parallelism for tokenizers
10
  os.environ['TOKENIZERS_PARALLELISM'] = 'false'
11
 
 
 
 
 
12
  # Configure logging
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
@@ -20,7 +24,7 @@ class Vectorizer:
20
  self.model = SentenceTransformer(model_name)
21
  self.prompts = []
22
  self.batch_size = batch_size
23
- self.pinecone_index_name = "prompts-index"
24
  self._init_pinecone = init_pinecone
25
  self._setup_pinecone()
26
  self._load_prompts()
 
9
  # Disable parallelism for tokenizers
10
  os.environ['TOKENIZERS_PARALLELISM'] = 'false'
11
 
12
+ # Ensure the cache directory exists
13
+ os.makedirs('/app/cache/hub', exist_ok=True)
14
+ os.environ['HF_HOME'] = '/app/cache/hub'
15
+
16
  # Configure logging
17
  logging.basicConfig(level=logging.INFO)
18
  logger = logging.getLogger(__name__)
 
24
  self.model = SentenceTransformer(model_name)
25
  self.prompts = []
26
  self.batch_size = batch_size
27
+ self.pinecone_index_name = "search-prompts-index"
28
  self._init_pinecone = init_pinecone
29
  self._setup_pinecone()
30
  self._load_prompts()
load_data.py CHANGED
@@ -1,5 +1,8 @@
 
1
  from app.vectorizer import Vectorizer
2
 
 
 
3
  if __name__ == "__main__":
4
  vectorizer = Vectorizer()
5
  vectorizer.store_from_dataset(store_data=True) # Run this once to load the dataset into Pinecone
 
1
+ import os
2
  from app.vectorizer import Vectorizer
3
 
4
+ os.environ['HF_HOME'] = '/app/cache/hub'
5
+
6
  if __name__ == "__main__":
7
  vectorizer = Vectorizer()
8
  vectorizer.store_from_dataset(store_data=True) # Run this once to load the dataset into Pinecone
ui/app.py CHANGED
@@ -14,7 +14,8 @@ n = st.number_input("Number of results:", min_value=1, max_value=20, value=5)
14
 
15
  if st.button("Search"):
16
  search_method = use_pinecone == 'Pinecone Vector Search'
17
- response = requests.post("http://localhost:5000/search", json={"query": query, "n": n, "use_pinecone": search_method})
 
18
 
19
  # Log the response for debugging
20
  st.write("Response Status Code:", response.status_code)
 
14
 
15
  if st.button("Search"):
16
  search_method = use_pinecone == 'Pinecone Vector Search'
17
+ backend_url = "http://supertskone-prompt-search-engine.hf.space/search"
18
+ response = requests.post(backend_url, json={"query": query, "n": n, "use_pinecone": search_method})
19
 
20
  # Log the response for debugging
21
  st.write("Response Status Code:", response.status_code)