domenicrosati committed
Commit 1f1e9bd · 1 parent: 5cc7b84

don't use concat

Files changed (1)
  1. app.py (+21 -8)
app.py CHANGED
@@ -145,7 +145,7 @@ def init_models():
         "question-answering", model='sultan/BioM-ELECTRA-Large-SQuAD2-BioASQ8B',
         device=device
     )
-    reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-2-v2', device=device)
+    reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device=device)
     # queryexp_tokenizer = AutoTokenizer.from_pretrained("doc2query/all-with_prefix-t5-base-v1")
     # queryexp_model = AutoModelWithLMHead.from_pretrained("doc2query/all-with_prefix-t5-base-v1")
     return question_answerer, reranker, stop, device
@@ -211,6 +211,9 @@ st.markdown("""
 """, unsafe_allow_html=True)
 
 with st.expander("Settings (strictness, context limit, top hits)"):
+    concat_passages = st.radio(
+        "Concatenate passages as one long context?",
+        ('no', 'yes'))
     support_all = st.radio(
         "Use abstracts and titles as a ranking signal (if the words are matched in the abstract then the document is more relevant)?",
         ('yes', 'no'))
@@ -224,8 +227,8 @@ with st.expander("Settings (strictness, context limit, top hits)"):
     use_reranking = st.radio(
         "Use Reranking? Reranking will rerank the top hits using semantic similarity of document and query.",
         ('yes', 'no'))
-    top_hits_limit = st.slider('Top hits? How many documents to use for reranking. Larger is slower but higher quality', 10, 300, 50)
-    context_lim = st.slider('Context limit? How many documents to use for answering from. Larger is slower but higher quality', 10, 300, 10)
+    top_hits_limit = st.slider('Top hits? How many documents to use for reranking. Larger is slower but higher quality', 10, 300, 10)
+    context_lim = st.slider('Context limit? How many documents to use for answering from. Larger is slower but higher quality', 10, 300, 5)
 
 # def paraphrase(text, max_length=128):
 #     input_ids = queryexp_tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)
@@ -313,14 +316,24 @@ def run_query(query):
         scores = reranker.predict(sentence_pairs, batch_size=len(sentence_pairs), show_progress_bar=False)
         hits = {contexts[idx]: scores[idx] for idx in range(len(scores))}
         sorted_contexts = [k for k,v in sorted(hits.items(), key=lambda x: x[0], reverse=True)]
-        context = '\n---'.join(sorted_contexts[:context_limit])
+        contexts = sorted_contexts[:context_limit]
     else:
-        context = '\n---'.join(contexts[:context_limit])
+        contexts = contexts[:context_limit]
+
+    if concat_passages == 'yes':
+        context = '\n---'.join(contexts)
+        model_results = qa_model(question=query, context=context, top_k=10)
+    else:
+        context = ['\n---\n'+ctx for ctx in contexts]
+        model_results = qa_model(question=[query]*len(contexts), context=context)
 
     results = []
-    model_results = qa_model(question=query, context=query+'---'+context, top_k=10)
-    for result in model_results:
-        matched = matched_context(result['start'], result['end'], context)
+
+    for i, result in enumerate(model_results):
+        if concat_passages == 'yes':
+            matched = matched_context(result['start'], result['end'], context)
+        else:
+            matched = matched_context(result['start'], result['end'], context[i])
         support = find_source(result['answer'], orig_docs, matched)
         if not support:
             continue
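
The substance of the commit is the new non-concatenated branch: instead of joining every passage into one '---'-separated context, the question-answering pipeline is now called once per passage, so each answer's start/end offsets index into a single passage. A minimal sketch of that batched call, with an illustrative query and passages standing in for the app's reranked contexts:

from transformers import pipeline

# Same model the app loads in init_models()
qa_model = pipeline(
    "question-answering",
    model='sultan/BioM-ELECTRA-Large-SQuAD2-BioASQ8B',
)

query = "What does ACE2 bind to?"                        # illustrative query
contexts = ["First passage ...", "Second passage ..."]   # illustrative passages

# Parallel lists of questions and contexts yield one answer dict per pair,
# each with 'answer', 'score', and 'start'/'end' offsets into its own context.
model_results = qa_model(question=[query] * len(contexts), context=contexts)
for i, result in enumerate(model_results):
    print(result['score'], result['answer'], '<-', contexts[i][:40])

Note that the batched call leaves top_k at its default of one answer per passage, whereas the concatenated branch keeps top_k=10 over the single long context.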
 
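The reranker swap, ms-marco-MiniLM-L-2-v2 to ms-marco-MiniLM-L-6-v2, trades a little latency for a deeper cross-encoder that generally scores MS MARCO-style query/passage pairs more accurately. A minimal sketch of the scoring step, again with illustrative pairs:

from sentence_transformers import CrossEncoder

reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

query = "What does ACE2 bind to?"                        # illustrative query
contexts = ["First passage ...", "Second passage ..."]   # illustrative passages
sentence_pairs = [[query, ctx] for ctx in contexts]

# One relevance score per (query, passage) pair; higher means more relevant.
scores = reranker.predict(sentence_pairs, show_progress_bar=False)
reranked = [ctx for _, ctx in
            sorted(zip(scores, contexts), key=lambda x: x[0], reverse=True)]

The sketch orders passages by score; the unchanged line in the diff sorts hits.items() by its first element (the passage text, x[0]) rather than the score (x[1]), so sorting on the score value is presumably what was intended there.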