import streamlit as st
from sentence_transformers import SentenceTransformer, util
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os
import time

def find_abstracts(soup):
  # Pull the identifier, title, and abstract out of each csw:record element
  id_list = []
  abs_list = []
  title_list = []

  for record in soup.find_all("csw:record"):
    identifier = record.find("dc:identifier")
    title = record.find("dc:title")
    abstract = record.find("dct:abstract")

    id_list.append(identifier.text)
    title_list.append(title.text)

    # Not every record has an abstract; use a placeholder to keep the lists aligned
    if abstract is not None:
      abs_list.append(abstract.text)
    else:
      abs_list.append("NA")

  return id_list, title_list, abs_list
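
# For reference, a record in the CSW response looks roughly like this
# (illustrative, abridged). Note that BeautifulSoup's lxml parser lowercases
# tag names, which is why find_all("csw:record") matches <csw:Record> elements:
#   <csw:record>
#     <dc:identifier>...</dc:identifier>
#     <dc:title>...</dc:title>
#     <dct:abstract>...</dct:abstract>
#   </csw:record>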

def get_metadata():
  # Fetch the first 5,000 metadata records from the NCEI Geoportal CSW endpoint
  URL = "https://www.ncei.noaa.gov/metadata/geoportal/opensearch?f=csw&from=0&size=5000&sort=title.sort"

  page = requests.get(URL)
  soup = BeautifulSoup(page.text, "lxml")

  id_list, title_list, abs_list = find_abstracts(soup)
  df = pd.DataFrame(list(zip(id_list, title_list, abs_list)), columns=["identifier", "title", "abstract"])

  # Cache the metadata locally so later runs can skip the network call
  df.to_csv("./ncei-metadata.csv", index=False)

  return df
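
# A possible extension (untested sketch, not called by the app): page through
# the catalog with the "from" offset instead of a single 5,000-record request,
# which would lift the current 5,000-dataset limit. This assumes the Geoportal
# endpoint honors "from" for paging; the names below are illustrative.
def get_all_metadata(page_size=5000, max_records=20000):
  frames = []
  for start in range(0, max_records, page_size):
    url = ("https://www.ncei.noaa.gov/metadata/geoportal/opensearch"
           f"?f=csw&from={start}&size={page_size}&sort=title.sort")
    soup = BeautifulSoup(requests.get(url).text, "lxml")
    id_list, title_list, abs_list = find_abstracts(soup)
    if not id_list:
      break  # ran out of records
    frames.append(pd.DataFrame(list(zip(id_list, title_list, abs_list)),
                               columns=["identifier", "title", "abstract"]))
  if not frames:
    return pd.DataFrame(columns=["identifier", "title", "abstract"])
  return pd.concat(frames, ignore_index=True)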
  
def show_model(query):
  path = "./ncei-metadata.csv"

  if os.path.exists(path):
    # Refresh the cached metadata if it is more than a day old
    last_modified = os.path.getmtime(path)
    now = time.time()
    DAY = 86400

    if now - last_modified > DAY:
      df = get_metadata()
    else:
      # keep_default_na=False so the literal "NA" placeholders written by
      # find_abstracts are not parsed back as NaN
      df = pd.read_csv(path, keep_default_na=False)
  else:
    df = get_metadata()

  # Use the abstracts as the documents, skipping records with no abstract
  docs_df = df[df["abstract"] != "NA"]
  docs = list(docs_df["abstract"])
  titles = list(docs_df["title"])

  # Load the pretrained model (tuned for semantic search)
  model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

  # Encode the query and all documents
  query_emb = model.encode(query)
  doc_emb = model.encode(docs)

  # Score every document against the query; this model produces normalized
  # embeddings, so the dot score is equivalent to cosine similarity
  scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

  # Pair each doc with its score and title, then sort by decreasing score
  doc_score_pairs = list(zip(docs, scores, titles))
  doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
  return doc_score_pairs
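
# A possible speed-up (untested sketch, not wired into the app): cache the
# model and the document embeddings across Streamlit reruns so they are not
# recomputed for every query. Assumes a Streamlit version that provides
# st.cache_resource / st.cache_data; the function names are illustrative.
@st.cache_resource
def load_model():
  return SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

@st.cache_data
def embed_docs(docs):
  return load_model().encode(docs)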
  
def main():
  st.title("Semantic Search for Datasets Using Sentence Transformers")
  st.write("A case study for the National Centers for Environmental Information (NCEI)")
  st.image("noaa_logo.png", width=150)
  
  st.write("## Goal: search for datasets in NCEI's Archive using natural language queries")
  st.write("[Repo](https://github.com/myrandaGoesToSpace/semantic-search-datasets)")
  
  st.image("pres-whatisnoaa.png")
  
  st.write("## The Problem Context")
  st.write("Uses service called OneStop for data search")
  st.write("**Problems:**")
  st.write("- Uses keyword search -- not robust to natural language queries")
  st.write("- Filtering options too specific for non-expert users")
  #st.image("pres-onestop.png")
  #st.image("pres-problems.png")
  
  st.write("## The Model: [Sentence Transformers](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1)")
  st.image("pres-sentencetransformers.png")
  
  st.write("## Project Data")
  st.image("pres-metadata.png")
  
  st.write("## The Process")
  st.image("pres-creatingse.png")
  
  st.write("## Results and Demo")
  
  query = st.text_input("Enter your query:")

  if query != "":
    with st.spinner("Searching..."):
      results = show_model(query)
      
        #Output passages & scores
    for doc, score, title in results[:10]:
      st.write("Score: ", score)
      st.write("Title:", title)
      st.write("Abstract:", abstract)
      st.write("---")

  
  

  
  st.image("pres-futureplans.png")
  
  st.write("## Critical Analysis")
  st.write("- seems to take a while to run on HuggingFace Space")
  st.write("- only embeds the first 5000 datasets")
  st.write("- calculates embeddings for datasets with each run")

main()
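
# To run locally (assuming this file is saved as app.py):
#   streamlit run app.py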