Spaces:
Runtime error
Runtime error
File size: 2,421 Bytes
673bdce ff974ba 673bdce ff974ba 673bdce 93f465d b48f7ea fc36661 054a10b b48f7ea a01266f ee93acc 0deac40 ee93acc ff974ba ee93acc 054a10b 31a82df 6415a43 054a10b fc36661 a01266f 31a82df a01266f 054a10b fc36661 054a10b 0deac40 a01266f 673bdce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
import streamlit as st
from sentence_transformers import SentenceTransformer, util
from bs4 import BeautifulSoup
import pandas as pd
import requests
def find_abstracts(soup):
    """Collect identifier, title and abstract text from every CSW record.

    Args:
        soup: Parsed document exposing ``find_all`` (for example a
            BeautifulSoup tree). Each ``csw:record`` element it yields must
            expose ``find``, and every found element a ``text`` attribute.

    Returns:
        A tuple ``(id_list, title_list, abs_list)`` of three equal-length
        lists of strings, one entry per record, in document order. A field
        that is missing from a record is stored as the placeholder ``"NA"``.
    """
    id_list = []
    title_list = []
    abs_list = []

    def text_or_na(element):
        # ``find`` yields None when the tag is absent from the record; use
        # the same "NA" placeholder for every field so the three lists stay
        # aligned instead of raising AttributeError on ``None.text``.
        return element.text if element is not None else "NA"

    for record in soup.find_all("csw:record"):
        id_list.append(text_or_na(record.find("dc:identifier")))
        title_list.append(text_or_na(record.find("dc:title")))
        abs_list.append(text_or_na(record.find("dct:abstract")))

    return id_list, title_list, abs_list
def get_metadata():
    """Download NCEI Geoportal metadata and save it as a CSV file.

    Queries the Geoportal OpenSearch endpoint for CSW records (the URL asks
    for up to 5000, sorted by title), extracts identifier, title and abstract
    with ``find_abstracts`` and writes the table to ``./ncei-metadata.csv``.

    Returns:
        pandas.DataFrame: one row per record with the columns
        ``identifier``, ``title`` and ``abstract``.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Get the abstracts from Geoportal
    url = "https://www.ncei.noaa.gov/metadata/geoportal/opensearch?f=csw&from=0&size=5000&sort=title.sort"
    # Without a timeout a stalled connection would block the app forever.
    page = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx instead of parsing an error page and
    # overwriting the CSV with an empty table.
    page.raise_for_status()

    soup = BeautifulSoup(page.text, "lxml")
    id_list, title_list, abs_list = find_abstracts(soup)

    df = pd.DataFrame(
        list(zip(id_list, title_list, abs_list)),
        columns=["identifier", "title", "abstract"],
    )
    df.to_csv("./ncei-metadata.csv")
    return df
def show_model(query=None):
    """Placeholder for running the semantic-search model on a query.

    Args:
        query: The user's search text. Optional so that no-argument calls
            keep working; the value is currently unused.

    Returns:
        None. The function body does not produce any results yet.
    """
    return None
def main():
    """Render the presentation page and the query input box."""
    # Page header: title, subtitle and logo.
    st.title("Semantic Search for Datasets Using Sentence Transformers")
    st.write("A case study for the National Centers for Environmental Information (NCEI)")
    st.image("noaa_logo.png", width=150)

    # Project goal and link to the repository.
    st.write("## Goal: search for datasets in NCEI's Archive using natural language queries")
    st.write("[Repo](https://github.com/myrandaGoesToSpace/semantic-search-datasets)")
    st.image("pres-whatisnoaa.png")

    # Problem statement, written one line per call.
    problem_lines = (
        "## The Problem Context",
        "Uses service called OneStop for data search",
        "**Problems:**",
        "- Uses keyword search -- not robust to natural language queries",
        "- Filtering options too specific for non-expert users",
    )
    for line in problem_lines:
        st.write(line)
    # The slides pres-onestop.png and pres-problems.png are currently disabled.

    # Each remaining section is a heading followed by a single slide image.
    slides = (
        (
            "## The Model: [Sentence Transformers](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1)",
            "pres-sentencetransformers.png",
        ),
        ("## Project Data", "pres-metadata.png"),
        ("## The Process", "pres-creatingse.png"),
    )
    for heading, image_path in slides:
        st.write(heading)
        st.image(image_path)

    # Demo: read the user's query and pass it to the model hook.
    st.write("## Results and Demo")
    query = st.text_input("Enter your query:")
    search_results = show_model(query)  # result is not rendered yet

    st.image("pres-futureplans.png")
# Launch the app only when this file is executed as a script, so that
# importing the module does not render the page as a side effect.
if __name__ == "__main__":
    main()