Spaces:

HarryLee
/

QueryExpansion

Runtime error

App Files Files Community

HarryLee committed on Feb 28, 2023

Commit

927223b

1 Parent(s): ae9bda3

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -33

app.py CHANGED Viewed

@@ -43,7 +43,6 @@ st.sidebar.success("Load Successfully!")
 if not torch.cuda.is_available():
     print("Warning: No GPU found. Please add GPU to your notebook")
 #We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
 bi_encoder = SentenceTransformer(option1)
 bi_encoder.max_seq_length = 256    #Truncate long passages to 256 tokens
@@ -52,38 +51,13 @@ top_k = 32                          #Number of passages we want to retrieve with
 #The bi-encoder will retrieve 100 documents. We use a cross-encoder, to re-rank the results list to improve the quality
 cross_encoder = CrossEncoder(option2)
-# As dataset, we use Simple English Wikipedia. Compared to the full English wikipedia, it has only
-# about 170k articles. We split these articles into paragraphs and encode them with the bi-encoder
-etsy_filepath = '000000000001.json'
-#if not os.path.exists(wikipedia_filepath):
-#    util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)
-passages = []
-'''
-with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as fIn:
-    for line in fIn:
-        data = json.loads(line.strip())
-        #Add all paragraphs
-        #passages.extend(data['paragraphs'])
-        #Only add the first paragraph
-        passages.append(data['paragraphs'][0])
-'''
-with open(etsy_filepath, 'r') as EtsyJson:
-  for line in EtsyJson:
-    data = json.loads(line.strip())
-    #passages.append(data['query'])
-    passages.append(data['title'])
-print("Passages:", len(passages))
-# We encode all passages into our vector space. This takes about 5 minutes (depends on your GPU speed)
-corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)
 # This function will search all wikipedia articles for passages that
 # answer the query

 if not torch.cuda.is_available():
     print("Warning: No GPU found. Please add GPU to your notebook")
 #We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
 bi_encoder = SentenceTransformer(option1)
 bi_encoder.max_seq_length = 256    #Truncate long passages to 256 tokens
 #The bi-encoder will retrieve 100 documents. We use a cross-encoder, to re-rank the results list to improve the quality
 cross_encoder = CrossEncoder(option2)
+# Load the pre-computed embeddings file
+embedding_cache_path = 'etsy-embeddings.pkl'
+print("Load pre-computed embeddings from disc")
+with open(embedding_cache_path, "rb") as fIn:
+  cache_data = pickle.load(fIn)
+  corpus_sentences = cache_data['sentences']
+  corpus_embeddings = cache_data['embeddings']
 # This function will search all wikipedia articles for passages that
 # answer the query