kaisugi committed on
Commit
60689d7
·
1 Parent(s): 980f078
app.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
5
  import streamlit as st
6
  import torch
7
 
 
8
  import os
9
 
10
  os.environ['KMP_DUPLICATE_LIB_OK']='True'
@@ -21,27 +22,27 @@ def load_model_and_tokenizer():
21
 
22
  @st.cache(allow_output_mutation=True)
23
  def load_sentence_data():
24
- sentence_df = pd.read_csv("sentence_data_2m.csv.gz")
25
 
26
  return sentence_df
27
 
28
 
29
  @st.cache(allow_output_mutation=True)
30
  def load_sentence_embeddings_and_index():
31
- npz_comp = np.load("sentence_embeddings_2m.npz")
32
  sentence_embeddings = npz_comp["arr_0"]
33
 
34
  faiss.normalize_L2(sentence_embeddings)
35
  D = 768
36
- N = 789188
37
- Xt = sentence_embeddings[:39000]
38
  X = sentence_embeddings
39
 
40
  # Param of PQ
41
  M = 16 # The number of sub-vector. Typically this is 8, 16, 32, etc.
42
  nbits = 8 # bits per sub-vector. This is typically 8, so that each sub-vec is encoded by 1 byte
43
  # Param of IVF
44
- nlist = 888 # The number of cells (space partition). Typical value is sqrt(N)
45
  # Param of HNSW
46
  hnsw_m = 32 # The number of neighbors for HNSW. This is typically 32
47
 
@@ -102,7 +103,7 @@ if __name__ == "__main__":
102
  st.markdown("## AI-based Paraphrasing for Academic Writing")
103
 
104
  input_text = st.text_area("text input", "We saw difference in the results between A and B.", placeholder="Write something here...")
105
- top_k = st.number_input('top_k (upperbound)', min_value=1, value=100, step=1)
106
  input_words = st.text_input("exclude words (comma separated)", "see, saw")
107
 
108
  if st.button('search'):
 
5
  import streamlit as st
6
  import torch
7
 
8
+ import math
9
  import os
10
 
11
  os.environ['KMP_DUPLICATE_LIB_OK']='True'
 
22
 
23
  @st.cache(allow_output_mutation=True)
24
  def load_sentence_data():
25
+ sentence_df = pd.read_csv("sentence_data_858k.csv.gz")
26
 
27
  return sentence_df
28
 
29
 
30
  @st.cache(allow_output_mutation=True)
31
  def load_sentence_embeddings_and_index():
32
+ npz_comp = np.load("sentence_embeddings_858k.npz")
33
  sentence_embeddings = npz_comp["arr_0"]
34
 
35
  faiss.normalize_L2(sentence_embeddings)
36
  D = 768
37
+ N = 857610
38
+ Xt = sentence_embeddings[:100000]
39
  X = sentence_embeddings
40
 
41
  # Param of PQ
42
  M = 16 # The number of sub-vector. Typically this is 8, 16, 32, etc.
43
  nbits = 8 # bits per sub-vector. This is typically 8, so that each sub-vec is encoded by 1 byte
44
  # Param of IVF
45
+ nlist = int(math.sqrt(N)) # The number of cells (space partition). Typical value is sqrt(N)
46
  # Param of HNSW
47
  hnsw_m = 32 # The number of neighbors for HNSW. This is typically 32
48
 
 
103
  st.markdown("## AI-based Paraphrasing for Academic Writing")
104
 
105
  input_text = st.text_area("text input", "We saw difference in the results between A and B.", placeholder="Write something here...")
106
+ top_k = st.number_input('top_k (upperbound)', min_value=1, value=200, step=1)
107
  input_words = st.text_input("exclude words (comma separated)", "see, saw")
108
 
109
  if st.button('search'):
sentence_data_2m.csv.gz β†’ sentence_data_858k.csv.gz RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:63c969cf0690e217d708cf1dca54a567b3e8b32c4d1b4f8581445a5bd3a5be0a
3
- size 105386557
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdbb865ac3bf06d634a3a1b05ec3a85aff825496a2d1bb8598800f659448aeae
3
+ size 41237822
sentence_embeddings_2m.npz β†’ sentence_embeddings_858k.npz RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe2c1296e1d676df6c0fc24b748523e254222ccf19bf5116d2cca0d93237709e
3
- size 6219635550
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a9c20fa9e559139b368b388fd4ca092fef8cc27966685cfc6961a2a6e727622
3
+ size 2438570561