david-meltzer committed
Commit d40f2bc · 1 Parent(s): 4a1bdeb

initial commit

Dockerfile ADDED
@@ -0,0 +1,9 @@
+ FROM python:3.9-slim-buster
+ COPY . /app
+ WORKDIR /app
+ RUN pip install --upgrade pip
+ RUN pip install numpy==1.24.3
+ RUN pip install -r requirements.txt
+ EXPOSE 8501
+ ENTRYPOINT ["streamlit","run"]
+ CMD ["app.py"]
Dockerrun.aws.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "AWSEBDockerrunVersion": "1",
+   "Image": {
+     "Name": "dmeltzer/semanticv3",
+     "Update": "true"
+   },
+   "Ports": [
+     {
+       "ContainerPort": 8501,
+       "HostPort": 8501
+     }
+   ],
+   "Logging": "/var/log/nginx"
+ }
app.py ADDED
@@ -0,0 +1,93 @@
+ import faiss
+ import pickle
+ import datasets
+ import numpy as np
+ import requests
+ import streamlit as st
+ from vector_engine.utils import vector_search
+ from transformers import AutoModel, AutoTokenizer
+
+ from datasets import load_dataset
+
+ @st.cache
+ def read_data(dataset_repo='dhmeltzer/asks_validation_embedded'):
+     """Read the data from huggingface."""
+     return load_dataset(dataset_repo)
+
+ #@st.cache(allow_output_mutation=True)
+ #def load_bert_model(name="nli-distilbert-base"):
+ #    """Instantiate a sentence-level DistilBERT model."""
+ #    return AutoModel.from_pretrained(f'sentence-transformers/{name}')
+ #
+ #@st.cache(allow_output_mutation=True)
+ #def load_tokenizer(name="nli-distilbert-base"):
+ #    return AutoTokenizer.from_pretrained(f'sentence-transformers/{name}')
+
+ @st.cache(allow_output_mutation=True)
+ def load_faiss_index(path_to_faiss="./faiss_index_small.pickle"):
+     """Load and deserialize the Faiss index."""
+     with open(path_to_faiss, "rb") as h:
+         data = pickle.load(h)
+     return faiss.deserialize_index(data)
+
+ def main():
+     # Load data and models
+     data = read_data()
+     #model = load_bert_model()
+     #tok = load_tokenizer()
+     faiss_index = load_faiss_index()
+
+     import requests
+
+     model_id = "sentence-transformers/nli-distilbert-base"
+
+     api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
+     headers = {"Authorization": "Bearer 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"}
+
+     def query(texts):
+         response = requests.post(api_url, headers=headers, json={"inputs": texts, "options": {"wait_for_model": True}})
+         return response.json()
+
+
+     st.title("Vector-based searches with Sentence Transformers and Faiss")
+
+     # User search
+     user_input = st.text_area("Search box", "ELI5 Dataset")
+
+     # Filters
+     st.sidebar.markdown("**Filters**")
+
+     filter_scores = st.sidebar.slider("Citations", 0, 250, 0)
+     num_results = st.sidebar.slider("Number of search results", 1, 50, 1)
+
+     vector = query([user_input])
+     # Fetch results
+     if user_input:
+         # Get paper IDs
+         _, I = faiss_index.search(np.array(vector).astype("float32"), k=num_results)
+         #D, I = vector_search([user_input], tok, model, faiss_index, num_results)
+
+         # Slice data on year
+         #frame = data[
+         #    (data.scores >= filter_scores)
+         #]
+
+         frame = data
+         st.write(user_input)
+         # Get individual results
+         for id_ in I.flatten().tolist():
+             f = frame[id_]
+             #if id_ in set(frame.id):
+             #    f = frame[(frame.id == id_)]
+             #else:
+             #    continue
+
+             st.write(
+                 f"""**{f['title']}**
+                 **text**: {f['selftext']}
+                 """
+             )
+
+
+ if __name__ == "__main__":
+     main()
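
Neither index pickle is generated anywhere in this commit, so the following is only a hedged sketch of how a file like faiss_index_small.pickle could be produced in a form that load_faiss_index() above can pickle.load and faiss.deserialize_index. The split name and the "embeddings" column are assumptions, not taken from the repo.

# Hypothetical build script for faiss_index_small.pickle (not part of this commit).
import pickle

import faiss
import numpy as np
from datasets import load_dataset

# Assumption: the embedded dataset exposes a precomputed "embeddings" column;
# the actual split and column names in dhmeltzer/asks_validation_embedded may differ.
ds = load_dataset("dhmeltzer/asks_validation_embedded", split="validation")
vectors = np.array(ds["embeddings"], dtype="float32")

# Exact L2 index, matching the plain faiss_index.search(...) call in app.py.
index = faiss.IndexFlatL2(vectors.shape[1])
index.add(vectors)

# serialize_index returns a uint8 array that pickle can store and
# faiss.deserialize_index (used by load_faiss_index) can restore.
with open("faiss_index_small.pickle", "wb") as h:
    pickle.dump(faiss.serialize_index(index), h)
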
faiss_index.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d056c102fcaa817168f48ce69a5ce515e86d80131fb7e7a2866a983f6847185b
+ size 862585188
faiss_index_small.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b29154f9f51079670be7cf88767f2932a052f8d026bfcf0bba1da92ffc230bf1
+ size 7025732
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ datasets
+ torch==1.13
+ transformers==4.29.0
+ faiss-cpu
+ folium
+ streamlit==1.14.0
+ -e .
setup.py ADDED
@@ -0,0 +1,26 @@
+ from setuptools import setup
+ from setuptools import find_namespace_packages
+
+ common_kwargs = dict(
+     version="0.1.0",
+     license="MIT",
+     author="David Meltzer",
+     author_email="[email protected]",
+     classifiers=[
+         "Intended Audience :: Developers",
+         "Intended Audience :: Science/Research",
+         "License :: OSI Approved :: MIT License",
+         "Natural Language :: English",
+         "Operating System :: OS Independent",
+         "Programming Language :: Python :: 3.9",
+         "Topic :: Scientific/Engineering :: Artificial Intelligence",
+     ],
+     python_requires=">=3.9",
+     include_package_data=False,
+ )
+
+ setup(
+     name="vector_engine",
+     packages=find_namespace_packages(include=["vector_engine", "vector_engine.*"]),
+     **common_kwargs
+ )
vector_engine/.utils.py.swp ADDED
Binary file (12.3 kB)
vector_engine/__init__.py ADDED
File without changes
vector_engine/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (154 Bytes)
vector_engine/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (161 Bytes)
vector_engine/__pycache__/utils.cpython-310.pyc ADDED
Binary file (1.93 kB)
vector_engine/__pycache__/utils.cpython-39.pyc ADDED
Binary file (1.93 kB)
vector_engine/utils.py ADDED
@@ -0,0 +1,41 @@
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ from transformers import AutoTokenizer, AutoModel
+
+ # Mean pooling - take the attention mask into account for correct averaging.
+ def mean_pooling(model_output, attention_mask):
+     token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
+     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+ def vector_search(query, tokenizer, model, index, num_results=10):
+     """Transforms the query to a vector using a pretrained, sentence-level
+     DistilBERT model and finds similar vectors using FAISS.
+     Args:
+         query (str): User query that should be more than a sentence long.
+         tokenizer/model (transformers): Pretrained tokenizer and sentence-level DistilBERT model.
+         index (faiss.Index): Deserialized FAISS index.
+         num_results (int): Number of results to return.
+     Returns:
+         D (:obj:`numpy.array` of `float`): Distance between results and query.
+         I (:obj:`numpy.array` of `int`): Paper ID of the results.
+
+     """
+     query = list(query)
+     encoded_input = tokenizer(query, padding=True, truncation=True, return_tensors='pt')
+
+     with torch.no_grad():
+         model_output = model(**encoded_input)
+
+     vector = mean_pooling(model_output, encoded_input['attention_mask'])
+     vector = F.normalize(vector, p=2, dim=1)
+
+
+     #vector = model.encode(list(query))
+     D, I = index.search(np.array(vector).astype("float32"), k=num_results)
+     return D, I
+
+ def id2details(df, I, column):
+     """Returns the paper titles based on the paper index."""
+     return df.select(I[0])[column]
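
For reference, a minimal usage sketch of these helpers (not part of the commit). It assumes the sentence-transformers/nli-distilbert-base checkpoint used elsewhere in the repo; the two toy documents and variable names are illustrative only.

# Minimal usage sketch for vector_engine.utils.
import faiss
import numpy as np
import torch
import torch.nn.functional as F
from datasets import Dataset
from transformers import AutoModel, AutoTokenizer

from vector_engine.utils import id2details, mean_pooling, vector_search

model_name = "sentence-transformers/nli-distilbert-base"  # same model app.py queries via the Inference API
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Two toy documents standing in for the embedded ELI5 data.
docs = Dataset.from_dict({
    "title": ["Why is the sky blue?", "How do vaccines work?"],
    "selftext": ["Rayleigh scattering question.", "Immunology question."],
})

# Embed the titles with the same tokenize -> mean-pool -> normalize path that vector_search uses.
encoded = tokenizer(docs["title"], padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    output = model(**encoded)
embeddings = F.normalize(mean_pooling(output, encoded["attention_mask"]), p=2, dim=1)

# Exact L2 index over the document embeddings.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings).astype("float32"))

D, I = vector_search(["why does the sky look blue"], tokenizer, model, index, num_results=1)
print(D, I)                          # distances and row ids of the nearest documents
print(id2details(docs, I, "title"))  # titles of the matched documents
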