Update app.py
app.py CHANGED
@@ -87,10 +87,10 @@ def preprocess_plain_text(text, window_size=3):
     # #break multi-headlines into a line each
     chunks = [phrase.strip() for line in lines for phrase in line.split(" ")]
 
-    #
+    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
 
-
+    # We split this article into paragraphs and then every paragraph into sentences
    paragraphs = []
    for paragraph in text.replace('\n', ' ').split("\n\n"):
        if len(paragraph.strip()) > 0:
@@ -106,15 +106,13 @@ def preprocess_plain_text(text, window_size=3):
     return passages
 
 
-def bi_encode(
+def bi_encode(passages):
     global bi_encoder
     # We use the Bi-Encoder to encode all passages, so that we can use it with sematic search
-    bi_encoder = SentenceTransformer(
+    bi_encoder = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
 
-    # quantize the model
-    # bi_encoder = quantize_dynamic(model, {Linear, Embedding})
 
-    # Compute the embeddings
+    # Compute the embeddings
     with st.spinner('Encoding passages into a vector space...'):
         corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)
 
@@ -142,8 +140,7 @@ def display_as_table(model, score='score'):
 st.title("Search Your Query Here")
 window_size = 3
 
-
-# This function will search all wikipedia articles for passages that answer the query
+# This will search articles for passages to answer the query
 def search_func(query):
     global bi_encoder, cross_encoder
 
@@ -157,7 +154,7 @@ def search_func(query):
 
     st.write(f"Document Header: {pdf_title}")
 
-    # Encode the query using the bi-encoder and find
+    # Encode the query using the bi-encoder and find relevant answers
     question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
     question_embedding = question_embedding.cpu()
     hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=2, score_function=util.dot_score)
@@ -224,7 +221,7 @@ if search:
     with st.spinner(
             text=f"Loading..........................."
     ):
-        bi_encoder, corpus_embeddings = bi_encode(
+        bi_encoder, corpus_embeddings = bi_encode(passages)
         cross_encoder = cross_encode()
 
     with st.spinner(
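The remaining hunks wire bi_encode(passages) into the app's retrieve-and-rerank flow: the bi-encoder embeds all passages once, util.semantic_search retrieves the top candidates for a query, and a cross-encoder re-scores them. Below is a minimal self-contained sketch of that flow, assuming the standard sentence-transformers pattern; the cross-encoder checkpoint is an assumption (cross_encode() is not shown in this diff), and the passages and query are illustrative.

from sentence_transformers import SentenceTransformer, CrossEncoder, util

passages = [
    "The bi-encoder embeds every passage once, up front.",
    "The cross-encoder re-scores only the retrieved candidates.",
]

# Bi-encoder: embed all passages into one tensor, as bi_encode(passages) now does.
bi_encoder = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True)

# Encode the query and retrieve the top-k passages by dot-product score.
query = "Which model re-scores the candidates?"
question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
hits = util.semantic_search(question_embedding, corpus_embeddings,
                            top_k=2, score_function=util.dot_score)[0]

# Cross-encoder re-ranks the retrieved pairs for higher precision.
# The checkpoint name here is an assumed stand-in for whatever cross_encode() loads.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
scores = cross_encoder.predict([[query, passages[hit["corpus_id"]]] for hit in hits])
for hit, score in sorted(zip(hits, scores), key=lambda pair: pair[1], reverse=True):
    print(round(float(score), 3), passages[hit["corpus_id"]])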