david-meltzer
committed on
Commit
·
d40f2bc
1
Parent(s):
4a1bdeb
initial commit
Browse files
- Dockerfile +9 -0
- Dockerrun.aws.json +14 -0
- app.py +93 -0
- faiss_index.pickle +3 -0
- faiss_index_small.pickle +3 -0
- requirements.txt +7 -0
- setup.py +26 -0
- vector_engine/.utils.py.swp +0 -0
- vector_engine/__init__.py +0 -0
- vector_engine/__pycache__/__init__.cpython-310.pyc +0 -0
- vector_engine/__pycache__/__init__.cpython-39.pyc +0 -0
- vector_engine/__pycache__/utils.cpython-310.pyc +0 -0
- vector_engine/__pycache__/utils.cpython-39.pyc +0 -0
- vector_engine/utils.py +41 -0
Dockerfile
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Image for the Streamlit search app, built on the slim CPython 3.9 base.
FROM python:3.9-slim-buster
# Copy the entire build context into /app and work from there.
COPY . /app
WORKDIR /app
# --no-cache-dir keeps pip's download cache out of the image layers.
RUN pip install --no-cache-dir --upgrade pip
# numpy is pinned and installed ahead of the remaining requirements.
RUN pip install --no-cache-dir numpy==1.24.3
RUN pip install --no-cache-dir -r requirements.txt
# Port the container publishes for the app.
EXPOSE 8501
# `streamlit run app.py`; CMD can be overridden to launch a different script.
ENTRYPOINT ["streamlit","run"]
CMD ["app.py"]
|
Dockerrun.aws.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"AWSEBDockerrunVersion": "1",
|
3 |
+
"Image": {
|
4 |
+
"Name": "dmeltzer/semanticv3",
|
5 |
+
"Update": "true"
|
6 |
+
},
|
7 |
+
"Ports": [
|
8 |
+
{
|
9 |
+
"ContainerPort": 8501,
|
10 |
+
"HostPort": 8501
|
11 |
+
}
|
12 |
+
],
|
13 |
+
"Logging": "/var/log/nginx"
|
14 |
+
}
|
app.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import faiss
|
2 |
+
import pickle
|
3 |
+
import datasets
|
4 |
+
import numpy as np
|
5 |
+
import requests
|
6 |
+
import streamlit as st
|
7 |
+
from vector_engine.utils import vector_search
|
8 |
+
from transformers import AutoModel, AutoTokenizer
|
9 |
+
|
10 |
+
from datasets import load_dataset
|
11 |
+
|
12 |
+
@st.cache
def read_data(dataset_repo='dhmeltzer/asks_validation_embedded'):
    """Load ``dataset_repo`` with ``load_dataset`` and return the result.

    Args:
        dataset_repo: Repository identifier passed straight to
            ``load_dataset``.

    Returns:
        Whatever ``load_dataset`` returns for that repository.
    """
    dataset = load_dataset(dataset_repo)
    return dataset
|
16 |
+
|
17 |
+
#@st.cache(allow_output_mutation=True)
|
18 |
+
#def load_bert_model(name="nli-distilbert-base"):
|
19 |
+
# """Instantiate a sentence-level DistilBERT model."""
|
20 |
+
# return AutoModel.from_pretrained(f'sentence-transformers/{name}')
|
21 |
+
#
|
22 |
+
#@st.cache(allow_output_mutation=True)
|
23 |
+
#def load_tokenizer(name="nli-distilbert-base"):
|
24 |
+
# return AutoTokenizer.from_pretrained(f'sentence-transformers/{name}')
|
25 |
+
|
26 |
+
@st.cache(allow_output_mutation=True)
def load_faiss_index(path_to_faiss="./faiss_index_small.pickle"):
    """Read a pickled, serialized Faiss index from disk and rebuild it.

    Args:
        path_to_faiss: Path of the pickle file holding the serialized index.

    Returns:
        The index produced by ``faiss.deserialize_index``.
    """
    # NOTE(review): pickle.load executes arbitrary code from the file, so the
    # pickle must come from a trusted source.
    with open(path_to_faiss, "rb") as handle:
        serialized = pickle.load(handle)
    index = faiss.deserialize_index(serialized)
    return index
|
32 |
+
|
33 |
+
def main():
    """Render the Streamlit search page and list the nearest dataset rows.

    Loads the dataset and the Faiss index, embeds the text from the search
    box through the Hugging Face feature-extraction endpoint, searches the
    index for the closest vectors and writes the ``title`` and ``selftext``
    of each hit to the page.
    """
    # Function-scope import: `os` is not among the file-level imports.
    import os

    # Load data and models
    data = read_data()
    faiss_index = load_faiss_index()

    model_id = "sentence-transformers/nli-distilbert-base"
    api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"

    # SECURITY: a bearer token used to be hard-coded on this line. Secrets do
    # not belong in source control (the committed token should be revoked).
    # The token is now read from the HF_API_TOKEN environment variable; when
    # it is unset the request is sent without an Authorization header.
    api_token = os.environ.get("HF_API_TOKEN")
    headers = {"Authorization": f"Bearer {api_token}"} if api_token else {}

    def query(texts):
        """POST ``texts`` to the embedding endpoint and return the parsed JSON."""
        response = requests.post(
            api_url,
            headers=headers,
            json={"inputs": texts, "options": {"wait_for_model": True}},
            timeout=60,  # never hang the page on a stalled connection
        )
        # Surface HTTP errors here instead of feeding an error payload to Faiss.
        response.raise_for_status()
        return response.json()

    st.title("Vector-based searches with Sentence Transformers and Faiss")

    # User search
    user_input = st.text_area("Search box", "ELI5 Dataset")

    # Filters
    st.sidebar.markdown("**Filters**")

    # The slider is still rendered, but its value is not applied to the
    # results yet (the score filter is not implemented).
    st.sidebar.slider("Citations", 0, 250, 0)
    num_results = st.sidebar.slider("Number of search results", 1, 50, 1)

    # Nothing to search for: skip the remote embedding call entirely.
    if not user_input:
        return

    try:
        vector = query([user_input])
    except requests.RequestException as err:
        st.error(f"Embedding request failed: {err}")
        return

    # Fetch results: row ids of the nearest neighbours
    _, ids = faiss_index.search(np.array(vector).astype("float32"), k=num_results)

    # NOTE(review): read_data() calls load_dataset() without selecting a
    # split; if that returns a split-keyed mapping, the integer indexing
    # below will fail -- confirm and select the split explicitly.
    frame = data
    st.write(user_input)

    # Get individual results
    for id_ in ids.flatten().tolist():
        # Faiss pads with -1 when it finds fewer than k neighbours; a negative
        # id must not be used as an index from the end of the dataset.
        if id_ < 0:
            continue
        row = frame[id_]
        st.write(f"**{row['title']}**\n**text**: {row['selftext']}\n")
|
90 |
+
|
91 |
+
|
92 |
+
if __name__ == "__main__":
|
93 |
+
main()
|
faiss_index.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d056c102fcaa817168f48ce69a5ce515e86d80131fb7e7a2866a983f6847185b
|
3 |
+
size 862585188
|
faiss_index_small.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b29154f9f51079670be7cf88767f2932a052f8d026bfcf0bba1da92ffc230bf1
|
3 |
+
size 7025732
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
datasets
|
2 |
+
torch==1.13
|
3 |
+
transformers==4.29.0
|
4 |
+
faiss-cpu
|
5 |
+
folium
|
6 |
+
streamlit==1.14.0
|
7 |
+
-e .
|
setup.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from setuptools import setup
|
2 |
+
from setuptools import find_namespace_packages
|
3 |
+
|
4 |
+
# Distribution metadata passed to setup() below.
common_kwargs = dict(
    version="0.1.0",
    license="MIT",
    author="David Meltzer",
    author_email="[email protected]",
    classifiers=[
        "Intended Audience :: Developers",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: MIT License",
        "Natural Language :: English",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3.9",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    python_requires=">=3.9",
    include_package_data=False,
)

setup(
    name="vector_engine",
    # `where` names the directory to search, not a package pattern:
    # "vector_engine.*" matches no directory, so no packages were collected.
    # Search the project root and select the package (and any subpackages)
    # with `include`; bytecode cache directories are filtered out because
    # namespace discovery treats every directory as a package.
    packages=find_namespace_packages(
        include=["vector_engine", "vector_engine.*"],
        exclude=["*.__pycache__", "*.__pycache__.*"],
    ),
    **common_kwargs
)
|
vector_engine/.utils.py.swp
ADDED
Binary file (12.3 kB). View file
|
|
vector_engine/__init__.py
ADDED
File without changes
|
vector_engine/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (154 Bytes). View file
|
|
vector_engine/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (161 Bytes). View file
|
|
vector_engine/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (1.93 kB). View file
|
|
vector_engine/__pycache__/utils.cpython-39.pyc
ADDED
Binary file (1.93 kB). View file
|
|
vector_engine/utils.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
import torch.nn.functional as F
|
4 |
+
from transformers import AutoTokenizer, AutoModel
|
5 |
+
|
6 |
+
#Mean Pooling - Take attention mask into account for correct averaging
|
7 |
+
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings, weighting each position by its mask.

    Args:
        model_output: Sequence whose first element holds the token embeddings.
        attention_mask: Mask tensor; it is expanded along a new last axis to
            the size of the token embeddings.

    Returns:
        Sum of the masked embeddings over dimension 1 divided by the sum of
        the mask over dimension 1 (clamped to at least 1e-9 so a fully
        masked row does not divide by zero).
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_total = (token_embeddings * mask).sum(dim=1)
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / mask_total
|
11 |
+
|
12 |
+
def vector_search(query, tokenizer, model, index, num_results=10):
    """Transform the query into vectors and find similar vectors in ``index``.

    The query is tokenized, run through ``model``, mean-pooled over the
    attention mask, L2-normalized and then searched for in ``index``.

    Args:
        query (str or iterable of str): A single query string, or several
            query strings to embed together.
        tokenizer: Callable invoked as ``tokenizer(texts, padding=True,
            truncation=True, return_tensors='pt')``; its result must provide
            an ``'attention_mask'`` entry and unpack into ``model``'s
            keyword arguments.
        model: Callable invoked as ``model(**encoded_input)``; the first
            element of its output is used as the token embeddings.
        index: Object exposing ``search(vectors, k=...)``.
        num_results (int): Number of results to return per query.

    Returns:
        D: First value returned by ``index.search`` (distances between
            results and query).
        I: Second value returned by ``index.search`` (ids of the results).
    """
    # list() on a bare string would split it into single characters, and each
    # character would then be embedded as its own query; wrap it instead.
    if isinstance(query, str):
        query = [query]
    else:
        query = list(query)

    encoded_input = tokenizer(query, padding=True, truncation=True, return_tensors='pt')

    # Inference only: no gradients are needed for the forward pass or pooling.
    with torch.no_grad():
        model_output = model(**encoded_input)
        vector = mean_pooling(model_output, encoded_input['attention_mask'])
        vector = F.normalize(vector, p=2, dim=1)

    # Move the embeddings to host memory as float32 before searching.
    embeddings = vector.detach().cpu().numpy().astype("float32")
    D, I = index.search(embeddings, k=num_results)
    return D, I
|
38 |
+
|
39 |
+
def id2details(df, I, column):
    """Return the ``column`` values of the rows listed in the first row of ``I``.

    Args:
        df: Object exposing ``select(indices)`` whose result supports
            ``[column]`` lookup.
        I: Indexable collection of id sequences; only ``I[0]`` is used.
        column: Name of the column to read from the selected rows.
    """
    first_query_ids = I[0]
    selected_rows = df.select(first_query_ids)
    return selected_rows[column]
|