# myshirk's picture
# Update app.py
# ff974ba
# raw
# history blame
# 2.42 kB
import streamlit as st
from sentence_transformers import SentenceTransformer, util
from bs4 import BeautifulSoup
import pandas as pd
import requests
def find_abstracts(soup):
    """Collect identifier, title and abstract text from every CSW record.

    Args:
        soup: Parsed document exposing ``find_all`` (a ``BeautifulSoup``
            object in this file) whose ``csw:record`` elements each expose
            ``find``.

    Returns:
        A tuple ``(id_list, title_list, abs_list)`` of three parallel lists
        of strings with one entry per record. A field whose element is
        missing from a record is stored as the placeholder ``"NA"``.
    """
    missing = "NA"
    id_list = []
    title_list = []
    abs_list = []
    # Tag name -> list that receives that tag's text for each record.
    fields = (
        ("dc:identifier", id_list),
        ("dc:title", title_list),
        ("dct:abstract", abs_list),
    )
    for record in soup.find_all("csw:record"):
        for tag, bucket in fields:
            node = record.find(tag)
            # ``find`` yields None for an absent element; every field gets
            # the same placeholder so the three lists stay aligned and a
            # record without an identifier or title no longer raises
            # AttributeError.
            bucket.append(node.text if node is not None else missing)
    return id_list, title_list, abs_list
def get_metadata():
    """Download metadata records from the NCEI Geoportal and save them as CSV.

    Fetches the Geoportal OpenSearch endpoint in CSW format (the query asks
    for ``size=5000`` records sorted by title), extracts each record's
    identifier, title and abstract with ``find_abstracts``, writes the
    resulting table to ``./ncei-metadata.csv`` and returns it.

    Returns:
        A ``pandas.DataFrame`` with the columns ``identifier``, ``title``
        and ``abstract``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # Get the abstracts from Geoportal
    URL = "https://www.ncei.noaa.gov/metadata/geoportal/opensearch?f=csw&from=0&size=5000&sort=title.sort"
    # Without a timeout a stalled connection would block this call forever.
    page = requests.get(URL, timeout=60)
    # Fail loudly on an HTTP error instead of parsing the error page and
    # overwriting the CSV with an empty table.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, "lxml")
    id_list, title_list, abs_list = find_abstracts(soup)
    df = pd.DataFrame(
        list(zip(id_list, title_list, abs_list)),
        columns=["identifier", "title", "abstract"],
    )
    df.to_csv("./ncei-metadata.csv")
    return df
def show_model(query=None):
    """Placeholder for the semantic-search step; currently does nothing.

    Args:
        query: Search text entered by the user. It is accepted so that the
            call ``show_model(query)`` made by ``main`` no longer raises
            ``TypeError``; the value is not used yet.

    Returns:
        None. No search is implemented in this function.
    """
    return None
def main():
    """Render the presentation page and the query box of the Streamlit app.

    Emits, top to bottom: the title and introduction, the problem context,
    three heading/slide pairs (model, data, process), the demo section with
    a text input whose value is passed to ``show_model``, and a closing
    slide. Nothing is returned.
    """
    # --- Title and introduction ---
    st.title("Semantic Search for Datasets Using Sentence Transformers")
    st.write("A case study for the National Centers for Environmental Information (NCEI)")
    st.image("noaa_logo.png", width=150)
    st.write("## Goal: search for datasets in NCEI's Archive using natural language queries")
    st.write("[Repo](https://github.com/myrandaGoesToSpace/semantic-search-datasets)")
    st.image("pres-whatisnoaa.png")

    # --- Problem context ---
    problem_lines = (
        "## The Problem Context",
        "Uses service called OneStop for data search",
        "**Problems:**",
        "- Uses keyword search -- not robust to natural language queries",
        "- Filtering options too specific for non-expert users",
    )
    for line in problem_lines:
        st.write(line)
    # The slides pres-onestop.png and pres-problems.png are currently not shown.

    # --- Model, data and process: one heading followed by one slide each ---
    slide_sections = (
        (
            "## The Model: [Sentence Transformers](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1)",
            "pres-sentencetransformers.png",
        ),
        ("## Project Data", "pres-metadata.png"),
        ("## The Process", "pres-creatingse.png"),
    )
    for heading, slide in slide_sections:
        st.write(heading)
        st.image(slide)

    # --- Demo ---
    st.write("## Results and Demo")
    user_query = st.text_input("Enter your query:")
    # The return value is stored but not displayed anywhere on the page.
    search_results = show_model(user_query)

    # --- Closing slide ---
    st.image("pres-futureplans.png")
# Build the page only when this file is executed as a script, so that
# importing it (for example to reuse get_metadata) does not render the app.
if __name__ == "__main__":
    main()