seanpedrickcase
committed on
Commit
•
91bd588
1
Parent(s):
b1c3d49
Updated Dockerfile and requirements files to create a smaller container
Browse files- Dockerfile +22 -18
- requirements_aws.txt +8 -0
- search_funcs/convert_files_to_parquet.py +0 -33
Dockerfile
CHANGED
@@ -1,43 +1,48 @@
|
|
1 |
# First stage: build dependencies
|
2 |
-
FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
|
3 |
|
4 |
# Optional - install Lambda web adapter in case you want to run with an AWS Lambda function URL
|
5 |
# COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.3 /lambda-adapter /opt/extensions/lambda-adapter
|
6 |
|
7 |
-
#
|
8 |
-
RUN apt-get update &&
|
9 |
-
apt-get install -y wget && \
|
10 |
-
apt-get install -y curl && \
|
11 |
-
apt-get clean && rm -rf /var/lib/apt/lists/*
|
12 |
|
13 |
# Create a directory for the model
|
14 |
-
RUN mkdir /model
|
15 |
|
16 |
WORKDIR /src
|
17 |
|
18 |
-
COPY
|
19 |
|
20 |
-
RUN pip install --
|
|
|
|
|
|
|
21 |
|
22 |
-
#
|
23 |
-
|
24 |
|
25 |
-
# Download the
|
26 |
-
RUN mkdir -p /model/minilm
|
27 |
COPY download_model.py /src/download_model.py
|
28 |
RUN python /src/download_model.py
|
29 |
|
|
|
|
|
|
|
30 |
# Set up a new user named "user" with user ID 1000
|
31 |
RUN useradd -m -u 1000 user
|
32 |
|
|
|
|
|
|
|
33 |
# Change ownership of /home/user directory
|
34 |
RUN chown -R user:user /home/user
|
35 |
|
36 |
-
EXPOSE 7860
|
37 |
-
|
38 |
# Make output folder
|
39 |
-
RUN mkdir -p /home/user/app/output && chown -R user:user /home/user
|
40 |
-
|
|
|
|
|
41 |
|
42 |
# Switch to the "user" user
|
43 |
USER user
|
@@ -54,7 +59,6 @@ ENV HOME=/home/user \
|
|
54 |
GRADIO_SERVER_PORT=7860 \
|
55 |
GRADIO_THEME=huggingface \
|
56 |
AWS_STS_REGIONAL_ENDPOINT=regional \
|
57 |
-
#GRADIO_ROOT_PATH=/data-text-search \
|
58 |
SYSTEM=spaces
|
59 |
|
60 |
# Set the working directory to the user's home directory
|
|
|
1 |
# First stage: build dependencies
|
2 |
+
FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm AS builder
|
3 |
|
4 |
# Optional - install Lambda web adapter in case you want to run with an AWS Lambda function URL
|
5 |
# COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.3 /lambda-adapter /opt/extensions/lambda-adapter
|
6 |
|
7 |
+
# Update apt
|
8 |
+
RUN apt-get update && rm -rf /var/lib/apt/lists/*
|
|
|
|
|
|
|
9 |
|
10 |
# Create a directory for the model
|
11 |
+
RUN mkdir -p /model /model/minilm /install
|
12 |
|
13 |
WORKDIR /src
|
14 |
|
15 |
+
COPY requirements_aws.txt .
|
16 |
|
17 |
+
RUN pip install torch==2.4.0+cpu --target=/install --index-url https://download.pytorch.org/whl/cpu \
|
18 |
+
&& pip install --no-cache-dir --target=/install sentence-transformers==3.0.1 --no-deps \
|
19 |
+
&& pip install --no-cache-dir --target=/install -r requirements_aws.txt \
|
20 |
+
&& pip install --no-cache-dir --target=/install gradio==4.41.0
|
21 |
|
22 |
+
# Add /install to the PYTHONPATH
|
23 |
+
ENV PYTHONPATH="/install:${PYTHONPATH}"
|
24 |
|
25 |
+
# Download the embedding model during the build process. Create a directory for the model and download specific files using huggingface_hub
|
|
|
26 |
COPY download_model.py /src/download_model.py
|
27 |
RUN python /src/download_model.py
|
28 |
|
29 |
+
# Stage 2: Final runtime image
|
30 |
+
FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
|
31 |
+
|
32 |
# Set up a new user named "user" with user ID 1000
|
33 |
RUN useradd -m -u 1000 user
|
34 |
|
35 |
+
# Copy installed packages from builder stage
|
36 |
+
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
|
37 |
+
|
38 |
# Change ownership of /home/user directory
|
39 |
RUN chown -R user:user /home/user
|
40 |
|
|
|
|
|
41 |
# Make output folder
|
42 |
+
RUN mkdir -p /home/user/app/output && mkdir -p /home/user/.cache/huggingface/hub && chown -R user:user /home/user
|
43 |
+
|
44 |
+
# Copy models from the builder stage
|
45 |
+
COPY --from=builder /model/minilm /home/user/app/model/minilm
|
46 |
|
47 |
# Switch to the "user" user
|
48 |
USER user
|
|
|
59 |
GRADIO_SERVER_PORT=7860 \
|
60 |
GRADIO_THEME=huggingface \
|
61 |
AWS_STS_REGIONAL_ENDPOINT=regional \
|
|
|
62 |
SYSTEM=spaces
|
63 |
|
64 |
# Set the working directory to the user's home directory
|
requirements_aws.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas==2.2.2
|
2 |
+
polars==0.20.3
|
3 |
+
pyarrow==14.0.2
|
4 |
+
openpyxl==3.1.3
|
5 |
+
spacy==3.7.5
|
6 |
+
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
|
7 |
+
lxml==5.2.2
|
8 |
+
boto3==1.34.158
|
search_funcs/convert_files_to_parquet.py
DELETED
@@ -1,33 +0,0 @@
|
|
1 |
-
# %%
|
2 |
-
import pandas as pd
|
3 |
-
import csv
|
4 |
-
|
5 |
-
# %%
|
6 |
-
# Define your file paths
|
7 |
-
file_dir = "../"
|
8 |
-
extracted_file_path = file_dir + "2022_08_case_notes.txt"
|
9 |
-
parquet_file_path = file_dir + "2022_08_case_notes.parquet"
|
10 |
-
|
11 |
-
# %%
|
12 |
-
# Read the TXT file using the csv module and convert to DataFrame
|
13 |
-
csv.field_size_limit(1000000) # set to a higher value
|
14 |
-
|
15 |
-
data_list = []
|
16 |
-
with open(extracted_file_path, mode='r', encoding='iso-8859-1') as file:
|
17 |
-
csv_reader = csv.reader(file, delimiter=',') # Change the delimiter if needed
|
18 |
-
for row in csv_reader:
|
19 |
-
data_list.append(row)
|
20 |
-
|
21 |
-
# Filter rows that have the same number of columns as the header
|
22 |
-
header = data_list[0]
|
23 |
-
filtered_data = [row for row in data_list if len(row) == len(header)]
|
24 |
-
|
25 |
-
# Convert list of rows to DataFrame
|
26 |
-
casenotes = pd.DataFrame(filtered_data[1:], columns=header) # Assuming first row is header
|
27 |
-
|
28 |
-
print(casenotes.head()) # Display the first few rows of the DataFrame
|
29 |
-
|
30 |
-
# %%
|
31 |
-
casenotes.to_parquet(parquet_file_path)
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|