update
app.py
CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
 import streamlit as st
 import torch
 
+import math
 import os
 
 os.environ['KMP_DUPLICATE_LIB_OK']='True'
@@ -21,27 +22,27 @@ def load_model_and_tokenizer():
 
 @st.cache(allow_output_mutation=True)
 def load_sentence_data():
-    sentence_df = pd.read_csv("
+    sentence_df = pd.read_csv("sentence_data_858k.csv.gz")
 
     return sentence_df
 
 
 @st.cache(allow_output_mutation=True)
 def load_sentence_embeddings_and_index():
-    npz_comp = np.load("
+    npz_comp = np.load("sentence_embeddings_858k.npz")
     sentence_embeddings = npz_comp["arr_0"]
 
     faiss.normalize_L2(sentence_embeddings)
     D = 768
-    N =
-    Xt = sentence_embeddings[:
+    N = 857610
+    Xt = sentence_embeddings[:100000]
     X = sentence_embeddings
 
     # Param of PQ
     M = 16  # The number of sub-vectors. Typically this is 8, 16, 32, etc.
     nbits = 8  # bits per sub-vector. This is typically 8, so that each sub-vector is encoded by 1 byte
     # Param of IVF
-    nlist =
+    nlist = int(math.sqrt(N))  # The number of cells (space partitions). A typical value is sqrt(N)
     # Param of HNSW
     hnsw_m = 32  # The number of neighbors for HNSW. This is typically 32
 
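The constants introduced in this hunk (N, the Xt training slice, and nlist = sqrt(N)) feed the FAISS index that the rest of load_sentence_embeddings_and_index() builds, but that construction is outside the diff. The following is only a sketch, under stated assumptions, of how D, nlist, M, nbits, and hnsw_m are commonly combined into an IVF-PQ index with an HNSW coarse quantizer; the placeholder data, the scaled-down N, and the nprobe value are illustrative and not taken from this commit.

    import math
    import numpy as np
    import faiss

    D = 768                    # embedding dimension, as in the diff
    N = 20000                  # placeholder size; the app uses N = 857610
    M = 16                     # number of PQ sub-vectors
    nbits = 8                  # bits per PQ code, so one byte per sub-vector
    hnsw_m = 32                # neighbors per node in the HNSW graph
    nlist = int(math.sqrt(N))  # number of IVF cells, sqrt(N) as in the diff

    X = np.random.rand(N, D).astype("float32")  # placeholder embeddings
    faiss.normalize_L2(X)      # on unit vectors, L2 ranking matches cosine ranking
    Xt = X[:10000]             # training subset (the app trains on its first 100000 rows)

    quantizer = faiss.IndexHNSWFlat(D, hnsw_m)               # HNSW graph as the coarse quantizer
    index = faiss.IndexIVFPQ(quantizer, D, nlist, M, nbits)  # product quantization for stored vectors
    index.train(Xt)            # learn IVF centroids and PQ codebooks on the subset
    index.add(X)               # encode and store all vectors
    index.nprobe = 16          # IVF cells visited per query (illustrative value)

Choosing nlist near sqrt(N) roughly balances the coarse search over cells against the scan inside each cell, which is presumably why the commit recomputes it from the new, smaller N.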
@@ -102,7 +103,7 @@ if __name__ == "__main__":
     st.markdown("## AI-based Paraphrasing for Academic Writing")
 
     input_text = st.text_area("text input", "We saw difference in the results between A and B.", placeholder="Write something here...")
-    top_k = st.number_input('top_k (upperbound)', min_value=1, value=
+    top_k = st.number_input('top_k (upperbound)', min_value=1, value=200, step=1)
     input_words = st.text_input("exclude words (comma separated)", "see, saw")
 
     if st.button('search'):
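The top_k widget adjusted in this hunk bounds how many neighbours are requested from the index when 'search' is pressed. The handler itself is not part of this diff, so the sketch below is an assumption: the helper name, the encoder, and the "sentence" column are placeholders for illustration, not code from this commit.

    import numpy as np
    import faiss

    def search_similar_sentences(index, sentence_df, query_embedding, top_k, exclude_words):
        """Return up to top_k (score, sentence) pairs, skipping sentences with excluded words."""
        query = np.asarray(query_embedding, dtype="float32").reshape(1, -1)
        faiss.normalize_L2(query)              # match the normalization applied at index time
        scores, ids = index.search(query, int(top_k))
        results = []
        for score, idx in zip(scores[0], ids[0]):
            if idx == -1:                      # FAISS pads with -1 when fewer hits are available
                continue
            sentence = sentence_df.iloc[int(idx)]["sentence"]   # column name assumed
            if any(word and word in sentence for word in exclude_words):
                continue
            results.append((float(score), sentence))   # score is a distance or similarity, per the index metric
        return results

    # Possible wiring to the widgets in the diff (inside `if st.button('search'):`):
    # exclude_words = [w.strip() for w in input_words.split(",") if w.strip()]
    # hits = search_similar_sentences(index, sentence_df, encode(input_text), top_k, exclude_words)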
sentence_data_2m.csv.gz → sentence_data_858k.csv.gz
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:bdbb865ac3bf06d634a3a1b05ec3a85aff825496a2d1bb8598800f659448aeae
+size 41237822
sentence_embeddings_2m.npz → sentence_embeddings_858k.npz
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:4a9c20fa9e559139b368b388fd4ca092fef8cc27966685cfc6961a2a6e727622
+size 2438570561
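Both renamed files are Git LFS pointers: the `oid sha256:` line is the SHA-256 digest of the real file's content and `size` is its byte length. A small sketch (not part of this commit) of how a pulled file could be checked against the pointer above:

    import hashlib
    import os

    def verify_lfs_object(path, expected_oid, expected_size):
        """Check a downloaded LFS object against its pointer's sha256 oid and size."""
        if os.path.getsize(path) != expected_size:
            return False
        digest = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
        return digest.hexdigest() == expected_oid

    # e.g. verify_lfs_object("sentence_data_858k.csv.gz",
    #                        "bdbb865ac3bf06d634a3a1b05ec3a85aff825496a2d1bb8598800f659448aeae",
    #                        41237822)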