update
app.py
CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
 import streamlit as st
 import torch
 
+import math
 import os
 
 os.environ['KMP_DUPLICATE_LIB_OK']='True'
@@ -21,27 +22,27 @@ def load_model_and_tokenizer():
 
 @st.cache(allow_output_mutation=True)
 def load_sentence_data():
-    sentence_df = pd.read_csv("
+    sentence_df = pd.read_csv("sentence_data_858k.csv.gz")
 
     return sentence_df
 
 
 @st.cache(allow_output_mutation=True)
 def load_sentence_embeddings_and_index():
-    npz_comp = np.load("
+    npz_comp = np.load("sentence_embeddings_858k.npz")
     sentence_embeddings = npz_comp["arr_0"]
 
     faiss.normalize_L2(sentence_embeddings)
     D = 768
-    N =
-    Xt = sentence_embeddings[:
+    N = 857610
+    Xt = sentence_embeddings[:100000]
     X = sentence_embeddings
 
     # Param of PQ
     M = 16  # The number of sub-vectors. Typically this is 8, 16, 32, etc.
     nbits = 8  # bits per sub-vector. This is typically 8, so that each sub-vector is encoded by 1 byte
     # Param of IVF
-    nlist =
+    nlist = int(math.sqrt(N))  # The number of cells (space partitions). A typical value is sqrt(N)
     # Param of HNSW
     hnsw_m = 32  # The number of neighbors for HNSW. This is typically 32
 
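The constants introduced in this hunk (N, the Xt training slice, and nlist = sqrt(N)) feed the FAISS index that the rest of load_sentence_embeddings_and_index() builds, but that construction is outside the diff. The following is only a sketch, under stated assumptions, of how D, nlist, M, nbits, and hnsw_m are commonly combined into an IVF-PQ index with an HNSW coarse quantizer; the placeholder data, the scaled-down N, and the nprobe value are illustrative and not taken from this commit.

    import math
    import numpy as np
    import faiss

    D = 768                    # embedding dimension, as in the diff
    N = 20000                  # placeholder size; the app uses N = 857610
    M = 16                     # number of PQ sub-vectors
    nbits = 8                  # bits per PQ code, so one byte per sub-vector
    hnsw_m = 32                # neighbors per node in the HNSW graph
    nlist = int(math.sqrt(N))  # number of IVF cells, sqrt(N) as in the diff

    X = np.random.rand(N, D).astype("float32")  # placeholder embeddings
    faiss.normalize_L2(X)      # on unit vectors, L2 ranking matches cosine ranking
    Xt = X[:10000]             # training subset (the app trains on its first 100000 rows)

    quantizer = faiss.IndexHNSWFlat(D, hnsw_m)               # HNSW graph as the coarse quantizer
    index = faiss.IndexIVFPQ(quantizer, D, nlist, M, nbits)  # product quantization for stored vectors
    index.train(Xt)            # learn IVF centroids and PQ codebooks on the subset
    index.add(X)               # encode and store all vectors
    index.nprobe = 16          # IVF cells visited per query (illustrative value)

Choosing nlist near sqrt(N) roughly balances the coarse search over cells against the scan inside each cell, which is presumably why the commit recomputes it from the new, smaller N.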
@@ -102,7 +103,7 @@ if __name__ == "__main__":
     st.markdown("## AI-based Paraphrasing for Academic Writing")
 
     input_text = st.text_area("text input", "We saw difference in the results between A and B.", placeholder="Write something here...")
-    top_k = st.number_input('top_k (upperbound)', min_value=1, value=
+    top_k = st.number_input('top_k (upperbound)', min_value=1, value=200, step=1)
     input_words = st.text_input("exclude words (comma separated)", "see, saw")
 
     if st.button('search'):
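The top_k widget adjusted in this hunk bounds how many neighbours are requested from the index when 'search' is pressed. The handler itself is not part of this diff, so the sketch below is an assumption: the helper name, the encoder, and the "sentence" column are placeholders for illustration, not code from this commit.

    import numpy as np
    import faiss

    def search_similar_sentences(index, sentence_df, query_embedding, top_k, exclude_words):
        """Return up to top_k (score, sentence) pairs, skipping sentences with excluded words."""
        query = np.asarray(query_embedding, dtype="float32").reshape(1, -1)
        faiss.normalize_L2(query)              # match the normalization applied at index time
        scores, ids = index.search(query, int(top_k))
        results = []
        for score, idx in zip(scores[0], ids[0]):
            if idx == -1:                      # FAISS pads with -1 when fewer hits are available
                continue
            sentence = sentence_df.iloc[int(idx)]["sentence"]   # column name assumed
            if any(word and word in sentence for word in exclude_words):
                continue
            results.append((float(score), sentence))   # score is a distance or similarity, per the index metric
        return results

    # Possible wiring to the widgets in the diff (inside `if st.button('search'):`):
    # exclude_words = [w.strip() for w in input_words.split(",") if w.strip()]
    # hits = search_similar_sentences(index, sentence_df, encode(input_text), top_k, exclude_words)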
sentence_data_2m.csv.gz → sentence_data_858k.csv.gz
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:bdbb865ac3bf06d634a3a1b05ec3a85aff825496a2d1bb8598800f659448aeae
+size 41237822
sentence_embeddings_2m.npz → sentence_embeddings_858k.npz
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:4a9c20fa9e559139b368b388fd4ca092fef8cc27966685cfc6961a2a6e727622
+size 2438570561
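Both renamed files are Git LFS pointers: the `oid sha256:` line is the SHA-256 digest of the real file's content and `size` is its byte length. A small sketch (not part of this commit) of how a pulled file could be checked against the pointer above:

    import hashlib
    import os

    def verify_lfs_object(path, expected_oid, expected_size):
        """Check a downloaded LFS object against its pointer's sha256 oid and size."""
        if os.path.getsize(path) != expected_size:
            return False
        digest = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
        return digest.hexdigest() == expected_oid

    # e.g. verify_lfs_object("sentence_data_858k.csv.gz",
    #                        "bdbb865ac3bf06d634a3a1b05ec3a85aff825496a2d1bb8598800f659448aeae",
    #                        41237822)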