File size: 2,421 Bytes
673bdce
ff974ba
 
 
 
673bdce
ff974ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
673bdce
 
93f465d
 
b48f7ea
 
fc36661
054a10b
b48f7ea
a01266f
ee93acc
0deac40
 
ee93acc
 
ff974ba
ee93acc
 
054a10b
31a82df
6415a43
054a10b
fc36661
a01266f
31a82df
 
a01266f
054a10b
fc36661
054a10b
0deac40
 
 
 
a01266f
673bdce
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import streamlit as st
from sentence_transformers import SentenceTransformer, util
from bs4 import BeautifulSoup
import pandas as pd
import requests

def find_abstracts(soup):
  """Extract dataset identifiers, titles, and abstracts from a CSW response.

  Parameters
  ----------
  soup : BeautifulSoup
      Parsed CSW GetRecords XML containing ``<csw:record>`` elements.

  Returns
  -------
  tuple[list[str], list[str], list[str]]
      Parallel lists of identifiers, titles, and abstracts, one entry
      per record. Any missing field is recorded as "NA" so the three
      lists stay aligned.
  """
  id_list = []
  title_list = []
  abs_list = []

  for record in soup.find_all("csw:record"):
    # Renamed locals: `id` and `abs` shadowed Python builtins.
    identifier = record.find("dc:identifier")
    abstract = record.find("dct:abstract")
    title = record.find("dc:title")

    # Guard every field, not only the abstract: find() returns None for
    # a missing tag and calling .text on None raises AttributeError.
    id_list.append(identifier.text if identifier is not None else "NA")
    title_list.append(title.text if title is not None else "NA")
    abs_list.append(abstract.text if abstract is not None else "NA")

  return id_list, title_list, abs_list

def get_metadata():
  """Download dataset metadata from the NCEI Geoportal CSW endpoint.

  Fetches up to 5000 records, extracts identifier/title/abstract via
  find_abstracts, persists the result to ./ncei-metadata.csv, and
  returns it as a pandas DataFrame.
  """
  csw_url = (
      "https://www.ncei.noaa.gov/metadata/geoportal/opensearch"
      "?f=csw&from=0&size=5000&sort=title.sort"
  )

  response = requests.get(csw_url)
  parsed = BeautifulSoup(response.text, "lxml")

  identifiers, titles, abstracts = find_abstracts(parsed)
  frame = pd.DataFrame(
      {"identifier": identifiers, "title": titles, "abstract": abstracts}
  )
  frame.to_csv("./ncei-metadata.csv")

  return frame
  
def show_model(query=None):
  """Placeholder for the semantic-search inference step.

  Parameters
  ----------
  query : str | None, optional
      Free-text query entered by the user; currently ignored because
      model inference is not implemented yet.

  Returns
  -------
  None

  Bug fix: main() invokes show_model(query) with one positional
  argument, but the original signature accepted none, so every search
  raised TypeError. Accepting an optional `query` fixes that call and
  remains backward-compatible with zero-argument callers.
  """
  return None
  
def main():
  """Render the Streamlit presentation page for the NCEI semantic-search
  case study, ending with an interactive query box wired to show_model."""
  st.title("Semantic Search for Datasets Using Sentence Transformers")
  st.write("A case study for the National Centers for Environmental Information (NCEI)")
  st.image("noaa_logo.png", width=150)

  for text in (
      "## Goal: search for datasets in NCEI's Archive using natural language queries",
      "[Repo](https://github.com/myrandaGoesToSpace/semantic-search-datasets)",
  ):
    st.write(text)

  st.image("pres-whatisnoaa.png")

  for text in (
      "## The Problem Context",
      "Uses service called OneStop for data search",
      "**Problems:**",
      "- Uses keyword search -- not robust to natural language queries",
      "- Filtering options too specific for non-expert users",
  ):
    st.write(text)

  # Each remaining section pairs a heading with an illustrative slide image.
  st.write("## The Model: [Sentence Transformers](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1)")
  st.image("pres-sentencetransformers.png")

  st.write("## Project Data")
  st.image("pres-metadata.png")

  st.write("## The Process")
  st.image("pres-creatingse.png")

  st.write("## Results and Demo")

  query = st.text_input("Enter your query:")
  results = show_model(query)

  st.image("pres-futureplans.png")

# Guard the entry point so importing this module (e.g. to reuse
# find_abstracts/get_metadata) does not launch the Streamlit page.
if __name__ == "__main__":
  main()