# NOTE: removed non-code scrape artifacts that were captured with this file
# (Hugging Face Spaces page chrome, git object hashes, and a line-number
# gutter). They were not part of the program.
import streamlit as st
from sentence_transformers import SentenceTransformer, util
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os
import time
def find_abstracts(soup):
    """Extract identifier, title, and abstract text from CSW records.

    Args:
        soup: a BeautifulSoup document containing ``csw:record`` elements,
            each expected to hold ``dc:identifier`` and ``dc:title`` tags
            (an ``dct:abstract`` tag is optional).

    Returns:
        Tuple of three parallel lists ``(id_list, title_list, abs_list)``.
        Records without an abstract get the placeholder string "NA".
    """
    id_list = []
    title_list = []
    abs_list = []
    for record in soup.find_all("csw:record"):
        # Renamed from `id`/`abs` to avoid shadowing the builtins.
        identifier = record.find("dc:identifier")
        abstract = record.find("dct:abstract")
        title = record.find("dc:title")
        id_list.append(identifier.text)
        title_list.append(title.text)
        # Keep the lists aligned even when a record has no abstract.
        abs_list.append(abstract.text if abstract is not None else "NA")
    return id_list, title_list, abs_list
def get_metadata():
    """Download NCEI Geoportal metadata and cache it locally.

    Fetches up to 5000 CSW records from the Geoportal opensearch endpoint,
    extracts identifier/title/abstract triples, writes them to
    ./ncei-metadata.csv, and returns them as a DataFrame.
    """
    url = "https://www.ncei.noaa.gov/metadata/geoportal/opensearch?f=csw&from=0&size=5000&sort=title.sort"
    response = requests.get(url)
    parsed = BeautifulSoup(response.text, "lxml")
    ids, titles, abstracts = find_abstracts(parsed)
    records = pd.DataFrame(
        list(zip(ids, titles, abstracts)),
        columns=["identifier", "title", "abstract"],
    )
    records.to_csv("./ncei-metadata.csv")
    return records
def show_model(query):
    """Rank archived dataset abstracts by semantic similarity to `query`.

    Loads the cached metadata CSV (refreshing it via get_metadata() when the
    cache is missing or more than a day old), embeds the query and every
    non-"NA" abstract with a sentence-transformers model, and scores them
    with a dot product.

    Args:
        query: natural-language search string (from the Streamlit text box).

    Returns:
        List of (abstract, score, title) tuples sorted by descending score.
    """
    path = "./ncei-metadata.csv"
    DAY = 86400  # seconds in one day; cache lifetime
    if os.path.exists(path):
        # Refresh the cache once it is older than a day.
        if time.time() - os.path.getmtime(path) > DAY:
            df = get_metadata()
        else:
            df = pd.read_csv(path)
    else:
        df = get_metadata()
    # Only records that actually have an abstract are searchable.
    docs_df = df[df["abstract"] != "NA"]
    docs = list(docs_df["abstract"])
    titles = list(docs_df["title"])
    # BUG FIX: the original re-read the query with input("Enter your query: "),
    # which clobbered the value passed in from the Streamlit widget and blocked
    # on stdin. The `query` parameter is now used directly.
    model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
    # Encode the query and all documents, then score with a dot product.
    query_emb = model.encode(query)
    doc_emb = model.encode(docs)
    scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
    # Pair each abstract with its score and title, best match first.
    doc_score_pairs = list(zip(docs, scores, titles))
    return sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
def main():
    """Render the Streamlit page and run a semantic search on user input."""
    st.title("Semantic Search for Datasets Using Sentence Transformers")
    st.write("A case study for the National Centers for Environmental Information (NCEI)")
    st.image("noaa_logo.png", width=150)
    st.write("## Goal: search for datasets in NCEI's Archive using natural language queries")
    st.write("[Repo](https://github.com/myrandaGoesToSpace/semantic-search-datasets)")
    st.image("pres-whatisnoaa.png")
    st.write("## The Problem Context")
    st.write("Uses service called OneStop for data search")
    st.write("**Problems:**")
    st.write("- Uses keyword search -- not robust to natural language queries")
    st.write("- Filtering options too specific for non-expert users")
    st.write("## The Model: [Sentence Transformers](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1)")
    st.image("pres-sentencetransformers.png")
    st.write("## Project Data")
    st.image("pres-metadata.png")
    st.write("## The Process")
    st.image("pres-creatingse.png")
    st.write("## Results and Demo")
    query = st.text_input("Enter your query:")
    if query != "":
        with st.spinner("Searching..."):
            results = show_model(query)
        # Show the ten best-matching datasets with their scores.
        for doc, score, title in results[:10]:
            st.write("Score: ", score)
            st.write("Title:", title)
            # BUG FIX: the original wrote the undefined name `abstract`
            # here (a NameError on the first search); the loop variable
            # holding the abstract text is `doc`.
            st.write("Abstract:", doc)
            st.write("---")
    st.image("pres-futureplans.png")
    st.write("## Critical Analysis")
    st.write("- seems to take a while to run on HuggingFace Space")
    st.write("- only embeds the first 5000 datasets")
    st.write("- calculates embeddings for datasets with each run")
# Guarded entry point; `streamlit run` executes the script as __main__,
# so behavior is unchanged while plain imports no longer trigger the app.
# (Also drops a stray " |" scrape artifact that trailed this line.)
if __name__ == "__main__":
    main()