Spaces:

seanpedrickcase
/

data_text_search

Sleeping

App Files Files Community

seanpedrickcase committed on Aug 20, 2024

Commit

91bd588

1 Parent(s): b1c3d49

Updated Dockerfile and requirements files to create a smaller container

Browse files

Files changed (3) hide show

Dockerfile +22 -18
requirements_aws.txt +8 -0
search_funcs/convert_files_to_parquet.py +0 -33

Dockerfile CHANGED Viewed

@@ -1,43 +1,48 @@
 # First stage: build dependencies
-FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
 # Optional - install the Lambda web adapter in case you want to run with an AWS Lambda function URL
 # COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.3 /lambda-adapter /opt/extensions/lambda-adapter
-# Install wget
-RUN apt-get update && \
-	apt-get install -y wget && \
-	apt-get install -y curl && \
-	apt-get clean && rm -rf /var/lib/apt/lists/*
 # Create a directory for the model
-RUN mkdir /model
 WORKDIR /src
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-# Gradio needs to be installed after due to conflict with spacy in requirements
-RUN pip install --no-cache-dir gradio==4.37.2
-# Download the BGE embedding model during the build process. Create a directory for the model and download specific files using huggingface_hub
-RUN mkdir -p /model/minilm
 COPY download_model.py /src/download_model.py
 RUN python /src/download_model.py
 # Set up a new user named "user" with user ID 1000
 RUN useradd -m -u 1000 user
 # Change ownership of /home/user directory
 RUN chown -R user:user /home/user
-EXPOSE 7860
 # Make output folder
-RUN mkdir -p /home/user/app/output && chown -R user:user /home/user/app/output
-RUN mkdir -p /home/user/.cache/huggingface/hub && chown -R user:user /home/user/.cache/huggingface/hub
 # Switch to the "user" user
 USER user
@@ -54,7 +59,6 @@ ENV HOME=/home/user \
 	GRADIO_SERVER_PORT=7860 \
 	GRADIO_THEME=huggingface \
 	AWS_STS_REGIONAL_ENDPOINT=regional \
-	#GRADIO_ROOT_PATH=/data-text-search \
 	SYSTEM=spaces
 # Set the working directory to the user's home directory

 # First stage: build dependencies
+FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm AS builder
 # Optional - install the Lambda web adapter in case you want to run with an AWS Lambda function URL
 # COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.3 /lambda-adapter /opt/extensions/lambda-adapter
+# Update apt
+RUN apt-get update && rm -rf /var/lib/apt/lists/*
 # Create a directory for the model
+RUN mkdir -p /model /model/minilm /install
 WORKDIR /src
+COPY requirements_aws.txt .
+RUN pip install torch==2.4.0+cpu --target=/install --index-url https://download.pytorch.org/whl/cpu \
+&& pip install --no-cache-dir --target=/install sentence-transformers==3.0.1 --no-deps \
+&& pip install --no-cache-dir --target=/install -r requirements_aws.txt \
+&& pip install --no-cache-dir --target=/install gradio==4.41.0
+# Add /install to the PYTHONPATH
+ENV PYTHONPATH="/install:${PYTHONPATH}"
+# Download the embedding model during the build process. Create a directory for the model and download specific files using huggingface_hub
 COPY download_model.py /src/download_model.py
 RUN python /src/download_model.py
+# Stage 2: Final runtime image
+FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
 # Set up a new user named "user" with user ID 1000
 RUN useradd -m -u 1000 user
+# Copy installed packages from builder stage
+COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
 # Change ownership of /home/user directory
 RUN chown -R user:user /home/user
 # Make output folder
+RUN mkdir -p /home/user/app/output && mkdir -p /home/user/.cache/huggingface/hub && chown -R user:user /home/user
+# Copy models from the builder stage
+COPY --from=builder /model/minilm /home/user/app/model/minilm
 # Switch to the "user" user
 USER user
 	GRADIO_SERVER_PORT=7860 \
 	GRADIO_THEME=huggingface \
 	AWS_STS_REGIONAL_ENDPOINT=regional \
 	SYSTEM=spaces
 # Set the working directory to the user's home directory

requirements_aws.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+pandas==2.2.2
+polars==0.20.3
+pyarrow==14.0.2
+openpyxl==3.1.3
+spacy==3.7.5
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
+lxml==5.2.2
+boto3==1.34.158

search_funcs/convert_files_to_parquet.py DELETED Viewed

@@ -1,33 +0,0 @@
-# %%
-import pandas as pd
-import csv
-# %%
-# Define your file paths
-file_dir = "../"
-extracted_file_path = file_dir + "2022_08_case_notes.txt"
-parquet_file_path = file_dir + "2022_08_case_notes.parquet"
-# %%
-# Read the TXT file using the csv module and convert to DataFrame
-csv.field_size_limit(1000000)  # set to a higher value
-data_list = []
-with open(extracted_file_path, mode='r', encoding='iso-8859-1') as file:
-    csv_reader = csv.reader(file, delimiter=',')  # Change the delimiter if needed
-    for row in csv_reader:
-        data_list.append(row)
-# Filter rows that have the same number of columns as the header
-header = data_list[0]
-filtered_data = [row for row in data_list if len(row) == len(header)]
-# Convert list of rows to DataFrame
-casenotes = pd.DataFrame(filtered_data[1:], columns=header)  # Assuming first row is header
-print(casenotes.head())  # Display the first few rows of the DataFrame
-# %%
-casenotes.to_parquet(parquet_file_path)