Spaces:

dhmeltzer
/

semantic

Sleeping

App Files Files Community

david-meltzer committed on May 19, 2023

Commit

d40f2bc

1 Parent(s): 4a1bdeb

initial commit

Browse files

Files changed (14) hide show

Dockerfile +9 -0
Dockerrun.aws.json +14 -0
app.py +93 -0
faiss_index.pickle +3 -0
faiss_index_small.pickle +3 -0
requirements.txt +7 -0
setup.py +26 -0
vector_engine/.utils.py.swp +0 -0
vector_engine/__init__.py +0 -0
vector_engine/__pycache__/__init__.cpython-310.pyc +0 -0
vector_engine/__pycache__/__init__.cpython-39.pyc +0 -0
vector_engine/__pycache__/utils.cpython-310.pyc +0 -0
vector_engine/__pycache__/utils.cpython-39.pyc +0 -0
vector_engine/utils.py +41 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,9 @@

+# Container image for the Streamlit semantic-search app.
+FROM python:3.9-slim-buster
+# The whole repository is copied before installing because requirements.txt
+# ends with `-e .`, which needs setup.py and the package sources to be present.
+COPY . /app
+WORKDIR /app
+# --no-cache-dir keeps pip's download cache out of the image layers.
+RUN pip install --no-cache-dir --upgrade pip
+# NOTE(review): numpy is pinned and installed ahead of the other requirements,
+# presumably so they build against a known version - confirm.
+RUN pip install --no-cache-dir numpy==1.24.3
+RUN pip install --no-cache-dir -r requirements.txt
+# Port the app is served on (matches ContainerPort in Dockerrun.aws.json).
+EXPOSE 8501
+# ENTRYPOINT + CMD combine to `streamlit run app.py`; CMD can be overridden
+# at `docker run` time to launch a different script.
+ENTRYPOINT ["streamlit","run"]
+CMD ["app.py"]

Dockerrun.aws.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "AWSEBDockerrunVersion": "1",
+  "Image": {
+    "Name": "dmeltzer/semanticv3",
+    "Update": "true"
+  },
+  "Ports": [
+    {
+      "ContainerPort": 8501,
+      "HostPort": 8501
+    }
+  ],
+  "Logging": "/var/log/nginx"
+}

app.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import faiss
+import pickle
+import datasets
+import numpy as np
+import requests
+import streamlit as st
+from vector_engine.utils import vector_search
+from transformers import AutoModel, AutoTokenizer
+from datasets import load_dataset
+@st.cache
+def read_data(dataset_repo='dhmeltzer/asks_validation_embedded'):
+    """Download a dataset from the Hugging Face Hub.
+
+    Args:
+        dataset_repo: Hub repository id handed straight to ``load_dataset``.
+
+    Returns:
+        Whatever ``load_dataset`` returns for that repository. The call is
+        wrapped in ``st.cache`` so Streamlit reruns can reuse the result.
+    """
+    dataset = load_dataset(dataset_repo)
+    return dataset
+#@st.cache(allow_output_mutation=True)
+#def load_bert_model(name="nli-distilbert-base"):
+#    """Instantiate a sentence-level DistilBERT model."""
+#    return AutoModel.from_pretrained(f'sentence-transformers/{name}')
+#
+#@st.cache(allow_output_mutation=True)
+#def load_tokenizer(name="nli-distilbert-base"):
+#    return AutoTokenizer.from_pretrained(f'sentence-transformers/{name}')
+@st.cache(allow_output_mutation=True)
+def load_faiss_index(path_to_faiss="./faiss_index_small.pickle"):
+    """Rebuild a FAISS index from a pickled, serialized blob on disk.
+
+    Args:
+        path_to_faiss: Path of the pickle file holding the serialized index.
+
+    Returns:
+        The index object produced by ``faiss.deserialize_index``.
+    """
+    # The pickle only wraps the serialized index bytes; FAISS turns them back
+    # into a searchable index object.
+    with open(path_to_faiss, "rb") as pickle_file:
+        serialized_index = pickle.load(pickle_file)
+    index = faiss.deserialize_index(serialized_index)
+    return index
+_EMBEDDING_MODEL_ID = "sentence-transformers/nli-distilbert-base"
+_EMBEDDING_API_URL = (
+    "https://api-inference.huggingface.co/pipeline/feature-extraction/"
+    + _EMBEDDING_MODEL_ID
+)
+def _embed_texts(texts, timeout=120):
+    """Embed ``texts`` through the hosted feature-extraction endpoint.
+
+    The API token is read from the ``HF_API_TOKEN`` environment variable so
+    that no credential lives in the source tree; when the variable is unset
+    the request is sent without an Authorization header.
+
+    Args:
+        texts: List of strings to embed.
+        timeout: Seconds to wait for the HTTP response.
+
+    Returns:
+        The decoded JSON body of the response.
+
+    Raises:
+        requests.RequestException: On connection problems, timeouts, or a
+            non-2xx status code.
+    """
+    import os
+    token = os.environ.get("HF_API_TOKEN")
+    headers = {"Authorization": f"Bearer {token}"} if token else {}
+    response = requests.post(
+        _EMBEDDING_API_URL,
+        headers=headers,
+        json={"inputs": texts, "options": {"wait_for_model": True}},
+        timeout=timeout,
+    )
+    # Surface HTTP errors here instead of passing an error payload on to FAISS.
+    response.raise_for_status()
+    return response.json()
+def main():
+    """Render the Streamlit page: a search box, sidebar controls, and results.
+
+    The query text is embedded remotely, the nearest neighbours are looked up
+    in the FAISS index, and the ``title`` / ``selftext`` fields of each hit are
+    written to the page.
+    """
+    # Load the dataset and the FAISS index (both go through st.cache).
+    data = read_data()
+    faiss_index = load_faiss_index()
+    # load_dataset() without a split returns a DatasetDict, which cannot be
+    # indexed by row number, so pick a concrete split first.
+    # NOTE(review): assumes the FAISS ids are row positions in the first
+    # split of the dataset - confirm against how the index was built.
+    if isinstance(data, datasets.DatasetDict):
+        frame = next(iter(data.values()))
+    else:
+        frame = data
+    st.title("Vector-based searches with Sentence Transformers and Faiss")
+    # User search
+    user_input = st.text_area("Search box", "ELI5 Dataset")
+    # Filters
+    st.sidebar.markdown("**Filters**")
+    # NOTE: this slider is rendered but its value is not applied to the
+    # results yet, so the return value is deliberately not stored.
+    st.sidebar.slider("Citations", 0, 250, 0)
+    num_results = st.sidebar.slider("Number of search results", 1, 50, 1)
+    # Fetch results (only call the embedding API when there is a query).
+    if user_input:
+        try:
+            vector = _embed_texts([user_input])
+        except requests.RequestException as err:
+            st.error(f"Could not embed the query: {err}")
+            return
+        _, I = faiss_index.search(np.array(vector).astype("float32"), k=num_results)
+        st.write(user_input)
+        # Get individual results
+        for id_ in I.flatten().tolist():
+            # FAISS pads with -1 when fewer than k neighbours exist; without
+            # this guard -1 would silently select the last row.
+            if id_ < 0:
+                continue
+            row = frame[id_]
+            st.write(
+                f"""**{row['title']}**
+            **text**: {row['selftext']}
+            """
+            )
+if __name__ == "__main__":
+    main()

faiss_index.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d056c102fcaa817168f48ce69a5ce515e86d80131fb7e7a2866a983f6847185b
+size 862585188

faiss_index_small.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b29154f9f51079670be7cf88767f2932a052f8d026bfcf0bba1da92ffc230bf1
+size 7025732

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+# Direct imports of app.py and vector_engine/utils.py.
+datasets
+torch==1.13
+transformers==4.29.0
+faiss-cpu
+folium
+streamlit==1.14.0
+# app.py imports these directly, so declare them instead of relying on
+# transitive installs. numpy is left unpinned here; the Dockerfile pins it.
+numpy
+requests
+# Install the local vector_engine package in editable mode.
+-e .

setup.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from setuptools import setup
+from setuptools import find_namespace_packages
+# Metadata shared by the setup() call below.
+common_kwargs = dict(
+    version="0.1.0",
+    license="MIT",
+    author="David Meltzer",
+    # NOTE(review): this value looks like a scrape-time e-mail placeholder
+    # rather than a real address - restore the real one before publishing.
+    author_email="[email protected]",
+    classifiers=[
+        "Intended Audience :: Developers",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: MIT License",
+        "Natural Language :: English",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python :: 3.9",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    ],
+    python_requires=">=3.9",
+    include_package_data=False,
+)
+setup(
+    name="vector_engine",
+    # ``where`` is the directory to search, not a package glob, so the
+    # previous where="vector_engine.*" pointed at a non-existent directory
+    # and discovered no packages. Select the package with ``include``.
+    packages=find_namespace_packages(include=["vector_engine"]),
+    **common_kwargs
+)

vector_engine/.utils.py.swp ADDED Viewed

Binary file (12.3 kB). View file

vector_engine/__init__.py ADDED Viewed

File without changes

vector_engine/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (154 Bytes). View file

vector_engine/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (161 Bytes). View file

vector_engine/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (1.93 kB). View file

vector_engine/__pycache__/utils.cpython-39.pyc ADDED Viewed

Binary file (1.93 kB). View file

vector_engine/utils.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import numpy as np
+import torch
+import torch.nn.functional as F
+from transformers import AutoTokenizer, AutoModel
+def mean_pooling(model_output, attention_mask):
+    """Average token embeddings, weighting each token by its attention mask.
+
+    Args:
+        model_output: Indexable model result whose element 0 holds the token
+            embeddings (assumed shape ``(batch, tokens, hidden)`` - confirm).
+        attention_mask: Mask tensor that is expanded to the embeddings' size;
+            positions with 0 do not contribute to the average.
+
+    Returns:
+        Tensor of masked sums over dimension 1 divided by the per-row mask
+        totals (clamped to at least 1e-9 to avoid dividing by zero).
+    """
+    embeddings = model_output[0]
+    # Broadcast the mask over the last dimension so it matches the embeddings.
+    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
+    masked_sum = torch.sum(embeddings * mask, 1)
+    mask_total = torch.clamp(mask.sum(1), min=1e-9)
+    return masked_sum / mask_total
+def vector_search(query, tokenizer, model, index, num_results=10):
+    """Transform a query into a vector and find similar vectors with FAISS.
+
+    The query is tokenized, run through the model without gradients,
+    mean-pooled over tokens, L2-normalized, and handed to ``index.search``.
+
+    Args:
+        query (str or iterable of str): One query string, or several. A single
+            string is treated as one query, not as a sequence of characters.
+        tokenizer: Callable tokenizer; invoked with ``padding=True``,
+            ``truncation=True`` and ``return_tensors='pt'``.
+        model: Callable model; invoked with the tokenizer output as keyword
+            arguments.
+        index: Object exposing ``search(vectors, k=...)`` (a FAISS index).
+        num_results (int): Number of results to return per query.
+
+    Returns:
+        D: Distances between results and query, as returned by ``index.search``.
+        I: Ids of the results, as returned by ``index.search``.
+    """
+    # list("abc") would yield ["a", "b", "c"], so wrap a bare string instead.
+    texts = [query] if isinstance(query, str) else list(query)
+    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
+    with torch.no_grad():
+        model_output = model(**encoded_input)
+    vector = mean_pooling(model_output, encoded_input['attention_mask'])
+    vector = F.normalize(vector, p=2, dim=1)
+    # Convert explicitly via host memory; the index is searched with float32.
+    embeddings = vector.detach().cpu().numpy().astype("float32")
+    D, I = index.search(embeddings, k=num_results)
+    return D, I
+def id2details(df, I, column):
+    """Return ``column`` for the rows whose ids are listed in ``I[0]``.
+
+    Args:
+        df: Object exposing ``select(indices)`` whose result can be indexed
+            by column name.
+        I: Nested id container; only its first element is used.
+        column: Name of the column to read from the selected rows.
+    """
+    first_query_ids = I[0]
+    selected_rows = df.select(first_query_ids)
+    return selected_rows[column]