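# NOTE: the hosting page's status banner ("Spaces: Runtime error") preceded this file in the extracted
# text; it is not part of the module.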
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
This module provides access to the topics and relevance judgments (qrels) bundled with Anserini, as part of
Pyserini's Python search interface to Anserini. The main entry point of that search interface is the
``LuceneSearcher`` class, which wraps the Java class with the same name in Anserini.
"""
import logging | |
import os | |
from pyserini.util import get_cache_home | |
from pyserini.pyclass import autoclass | |
logger = logging.getLogger(__name__)

# Java package prefixes of the classes loaded below.
_LUCENE_SEARCH_PKG = 'org.apache.lucene.search'
_ANSERINI_EVAL_PKG = 'io.anserini.eval'
_ANSERINI_TOPICREADER_PKG = 'io.anserini.search.topicreader'
_ANSERINI_QUERY_PKG = 'io.anserini.search.query'

# Wrappers around Lucene classes
JQuery = autoclass(_LUCENE_SEARCH_PKG + '.Query')

# Wrappers around Anserini classes: evaluation resources
JQrels = autoclass(_ANSERINI_EVAL_PKG + '.Qrels')
JRelevanceJudgments = autoclass(_ANSERINI_EVAL_PKG + '.RelevanceJudgments')

# Wrappers around Anserini classes: topic readers
JTopicReader = autoclass(_ANSERINI_TOPICREADER_PKG + '.TopicReader')
JTopics = autoclass(_ANSERINI_TOPICREADER_PKG + '.Topics')

# Wrappers around Anserini classes: query generators
JQueryGenerator = autoclass(_ANSERINI_QUERY_PKG + '.QueryGenerator')
JBagOfWordsQueryGenerator = autoclass(_ANSERINI_QUERY_PKG + '.BagOfWordsQueryGenerator')
JDisjunctionMaxQueryGenerator = autoclass(_ANSERINI_QUERY_PKG + '.DisjunctionMaxQueryGenerator')
JCovid19QueryGenerator = autoclass(_ANSERINI_QUERY_PKG + '.Covid19QueryGenerator')
# Maps the symbolic topic-set names accepted by ``get_topics`` to the corresponding
# ``JTopics`` constants.
topics_mapping = {
    # TREC ad hoc, Robust, Core, and web/terabyte/million-query topics
    'trec1-adhoc': JTopics.TREC1_ADHOC,
    'trec2-adhoc': JTopics.TREC2_ADHOC,
    'trec3-adhoc': JTopics.TREC3_ADHOC,
    'robust04': JTopics.ROBUST04,
    'robust05': JTopics.ROBUST05,
    'core17': JTopics.CORE17,
    'core18': JTopics.CORE18,
    'wt10g': JTopics.WT10G,
    'trec2004-terabyte': JTopics.TREC2004_TERABYTE,
    'trec2005-terabyte': JTopics.TREC2005_TERABYTE,
    'trec2006-terabyte': JTopics.TREC2006_TERABYTE,
    'trec2007-million-query': JTopics.TREC2007_MILLION_QUERY,
    'trec2008-million-query': JTopics.TREC2008_MILLION_QUERY,
    'trec2009-million-query': JTopics.TREC2009_MILLION_QUERY,
    'trec2010-web': JTopics.TREC2010_WEB,
    'trec2011-web': JTopics.TREC2011_WEB,
    'trec2012-web': JTopics.TREC2012_WEB,
    'trec2013-web': JTopics.TREC2013_WEB,
    'trec2014-web': JTopics.TREC2014_WEB,
    # mb11-mb14 topics
    'mb11': JTopics.MB11,
    'mb12': JTopics.MB12,
    'mb13': JTopics.MB13,
    'mb14': JTopics.MB14,
    # car17 topics
    'car17v1.5-benchmarkY1test': JTopics.CAR17V15_BENCHMARK_Y1_TEST,
    'car17v2.0-benchmarkY1test': JTopics.CAR17V20_BENCHMARK_Y1_TEST,
    # dl19/dl20/dl21 topics, including the unicoil variants
    'dl19-doc': JTopics.TREC2019_DL_DOC,
    'dl19-doc-unicoil': JTopics.TREC2019_DL_DOC_UNICOIL,
    'dl19-doc-unicoil-noexp': JTopics.TREC2019_DL_DOC_UNICOIL_NOEXP,
    'dl19-passage': JTopics.TREC2019_DL_PASSAGE,
    'dl19-passage-unicoil': JTopics.TREC2019_DL_PASSAGE_UNICOIL,
    'dl19-passage-unicoil-noexp': JTopics.TREC2019_DL_PASSAGE_UNICOIL_NOEXP,
    'dl20': JTopics.TREC2020_DL,
    'dl20-unicoil': JTopics.TREC2020_DL_UNICOIL,
    'dl20-unicoil-noexp': JTopics.TREC2020_DL_UNICOIL_NOEXP,
    'dl21': JTopics.TREC2021_DL,
    'dl21-unicoil': JTopics.TREC2021_DL_UNICOIL,
    'dl21-unicoil-noexp': JTopics.TREC2021_DL_UNICOIL_NOEXP,
    # msmarco (v1) document and passage topics
    'msmarco-doc-dev': JTopics.MSMARCO_DOC_DEV,
    'msmarco-doc-dev-unicoil': JTopics.MSMARCO_DOC_DEV_UNICOIL,
    'msmarco-doc-dev-unicoil-noexp': JTopics.MSMARCO_DOC_DEV_UNICOIL_NOEXP,
    'msmarco-doc-test': JTopics.MSMARCO_DOC_TEST,
    'msmarco-passage-dev-subset': JTopics.MSMARCO_PASSAGE_DEV_SUBSET,
    'msmarco-passage-dev-subset-deepimpact': JTopics.MSMARCO_PASSAGE_DEV_SUBSET_DEEPIMPACT,
    'msmarco-passage-dev-subset-unicoil': JTopics.MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL,
    'msmarco-passage-dev-subset-unicoil-noexp': JTopics.MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL_NOEXP,
    'msmarco-passage-dev-subset-unicoil-tilde': JTopics.MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL_TILDE,
    'msmarco-passage-dev-subset-distill-splade-max': JTopics.MSMARCO_PASSAGE_DEV_SUBSET_DISTILL_SPLADE_MAX,
    'msmarco-passage-test-subset': JTopics.MSMARCO_PASSAGE_TEST_SUBSET,
    # msmarco-v2 document and passage topics
    'msmarco-v2-doc-dev': JTopics.MSMARCO_V2_DOC_DEV,
    'msmarco-v2-doc-dev-unicoil': JTopics.MSMARCO_V2_DOC_DEV_UNICOIL,
    'msmarco-v2-doc-dev-unicoil-noexp': JTopics.MSMARCO_V2_DOC_DEV_UNICOIL_NOEXP,
    'msmarco-v2-doc-dev2': JTopics.MSMARCO_V2_DOC_DEV2,
    'msmarco-v2-doc-dev2-unicoil': JTopics.MSMARCO_V2_DOC_DEV2_UNICOIL,
    'msmarco-v2-doc-dev2-unicoil-noexp': JTopics.MSMARCO_V2_DOC_DEV2_UNICOIL_NOEXP,
    'msmarco-v2-passage-dev': JTopics.MSMARCO_V2_PASSAGE_DEV,
    'msmarco-v2-passage-dev-unicoil': JTopics.MSMARCO_V2_PASSAGE_DEV_UNICOIL,
    'msmarco-v2-passage-dev-unicoil-noexp': JTopics.MSMARCO_V2_PASSAGE_DEV_UNICOIL_NOEXP,
    'msmarco-v2-passage-dev2': JTopics.MSMARCO_V2_PASSAGE_DEV2,
    'msmarco-v2-passage-dev2-unicoil': JTopics.MSMARCO_V2_PASSAGE_DEV2_UNICOIL,
    'msmarco-v2-passage-dev2-unicoil-noexp': JTopics.MSMARCO_V2_PASSAGE_DEV2_UNICOIL_NOEXP,
    # ntcir8 / clef2006 / trec2002 / fire2012 topics
    'ntcir8-zh': JTopics.NTCIR8_ZH,
    'clef2006-fr': JTopics.CLEF2006_FR,
    'trec2002-ar': JTopics.TREC2002_AR,
    'fire2012-bn': JTopics.FIRE2012_BN,
    'fire2012-hi': JTopics.FIRE2012_HI,
    'fire2012-en': JTopics.FIRE2012_EN,
    # covid rounds 1-5 topics, with their -udel variants
    'covid-round1': JTopics.COVID_ROUND1,
    'covid-round1-udel': JTopics.COVID_ROUND1_UDEL,
    'covid-round2': JTopics.COVID_ROUND2,
    'covid-round2-udel': JTopics.COVID_ROUND2_UDEL,
    'covid-round3': JTopics.COVID_ROUND3,
    'covid-round3-udel': JTopics.COVID_ROUND3_UDEL,
    'covid-round4': JTopics.COVID_ROUND4,
    'covid-round4-udel': JTopics.COVID_ROUND4_UDEL,
    'covid-round5': JTopics.COVID_ROUND5,
    'covid-round5-udel': JTopics.COVID_ROUND5_UDEL,
    # trec2018-2020 bl topics
    'trec2018-bl': JTopics.TREC2018_BL,
    'trec2019-bl': JTopics.TREC2019_BL,
    'trec2020-bl': JTopics.TREC2020_BL,
    # epidemic-qa topics
    'epidemic-qa-expert-prelim': JTopics.EPIDEMIC_QA_EXPERT_PRELIM,
    'epidemic-qa-consumer-prelim': JTopics.EPIDEMIC_QA_CONSUMER_PRELIM,
    # dpr and nq topics, including the gar-t5 variants
    'dpr-nq-dev': JTopics.DPR_NQ_DEV,
    'dpr-nq-test': JTopics.DPR_NQ_TEST,
    'dpr-trivia-dev': JTopics.DPR_TRIVIA_DEV,
    'dpr-trivia-test': JTopics.DPR_TRIVIA_TEST,
    'dpr-wq-test': JTopics.DPR_WQ_TEST,
    'dpr-squad-test': JTopics.DPR_SQUAD_TEST,
    'dpr-curated-test': JTopics.DPR_CURATED_TEST,
    'dpr-trivia-test-gar-t5-answers': JTopics.DPR_TRIVIA_TEST_GART5_ANSWERS,
    'dpr-trivia-test-gar-t5-titles': JTopics.DPR_TRIVIA_TEST_GART5_TITLES,
    'dpr-trivia-test-gar-t5-sentences': JTopics.DPR_TRIVIA_TEST_GART5_SENTENCES,
    'dpr-trivia-test-gar-t5-all': JTopics.DPR_TRIVIA_TEST_GART5_ALL,
    'nq-test-gar-t5-answers': JTopics.NQ_TEST_GART5_ANSWERS,
    'nq-test-gar-t5-titles': JTopics.NQ_TEST_GART5_TITLES,
    'nq-test-gar-t5-sentences': JTopics.NQ_TEST_GART5_SENTENCES,
    'nq-test-gar-t5-all': JTopics.NQ_TEST_GART5_ALL,
    'nq-dev': JTopics.NQ_DEV,
    'nq-test': JTopics.NQ_TEST,
    # mrtydi-v1.1 topics: train/dev/test for each language
    'mrtydi-v1.1-arabic-train': JTopics.MRTYDI_V11_AR_TRAIN,
    'mrtydi-v1.1-arabic-dev': JTopics.MRTYDI_V11_AR_DEV,
    'mrtydi-v1.1-arabic-test': JTopics.MRTYDI_V11_AR_TEST,
    'mrtydi-v1.1-bengali-train': JTopics.MRTYDI_V11_BN_TRAIN,
    'mrtydi-v1.1-bengali-dev': JTopics.MRTYDI_V11_BN_DEV,
    'mrtydi-v1.1-bengali-test': JTopics.MRTYDI_V11_BN_TEST,
    'mrtydi-v1.1-english-train': JTopics.MRTYDI_V11_EN_TRAIN,
    'mrtydi-v1.1-english-dev': JTopics.MRTYDI_V11_EN_DEV,
    'mrtydi-v1.1-english-test': JTopics.MRTYDI_V11_EN_TEST,
    'mrtydi-v1.1-finnish-train': JTopics.MRTYDI_V11_FI_TRAIN,
    'mrtydi-v1.1-finnish-dev': JTopics.MRTYDI_V11_FI_DEV,
    'mrtydi-v1.1-finnish-test': JTopics.MRTYDI_V11_FI_TEST,
    'mrtydi-v1.1-indonesian-train': JTopics.MRTYDI_V11_ID_TRAIN,
    'mrtydi-v1.1-indonesian-dev': JTopics.MRTYDI_V11_ID_DEV,
    'mrtydi-v1.1-indonesian-test': JTopics.MRTYDI_V11_ID_TEST,
    'mrtydi-v1.1-japanese-train': JTopics.MRTYDI_V11_JA_TRAIN,
    'mrtydi-v1.1-japanese-dev': JTopics.MRTYDI_V11_JA_DEV,
    'mrtydi-v1.1-japanese-test': JTopics.MRTYDI_V11_JA_TEST,
    'mrtydi-v1.1-korean-train': JTopics.MRTYDI_V11_KO_TRAIN,
    'mrtydi-v1.1-korean-dev': JTopics.MRTYDI_V11_KO_DEV,
    'mrtydi-v1.1-korean-test': JTopics.MRTYDI_V11_KO_TEST,
    'mrtydi-v1.1-russian-train': JTopics.MRTYDI_V11_RU_TRAIN,
    'mrtydi-v1.1-russian-dev': JTopics.MRTYDI_V11_RU_DEV,
    'mrtydi-v1.1-russian-test': JTopics.MRTYDI_V11_RU_TEST,
    'mrtydi-v1.1-swahili-train': JTopics.MRTYDI_V11_SW_TRAIN,
    'mrtydi-v1.1-swahili-dev': JTopics.MRTYDI_V11_SW_DEV,
    'mrtydi-v1.1-swahili-test': JTopics.MRTYDI_V11_SW_TEST,
    'mrtydi-v1.1-telugu-train': JTopics.MRTYDI_V11_TE_TRAIN,
    'mrtydi-v1.1-telugu-dev': JTopics.MRTYDI_V11_TE_DEV,
    'mrtydi-v1.1-telugu-test': JTopics.MRTYDI_V11_TE_TEST,
    'mrtydi-v1.1-thai-train': JTopics.MRTYDI_V11_TH_TRAIN,
    'mrtydi-v1.1-thai-dev': JTopics.MRTYDI_V11_TH_DEV,
    'mrtydi-v1.1-thai-test': JTopics.MRTYDI_V11_TH_TEST,
    # beir-v1.0.0 test topics
    'beir-v1.0.0-trec-covid-test': JTopics.BEIR_V1_0_0_TREC_COVID_TEST,
    'beir-v1.0.0-bioasq-test': JTopics.BEIR_V1_0_0_BIOASQ_TEST,
    'beir-v1.0.0-nfcorpus-test': JTopics.BEIR_V1_0_0_NFCORPUS_TEST,
    'beir-v1.0.0-nq-test': JTopics.BEIR_V1_0_0_NQ_TEST,
    'beir-v1.0.0-hotpotqa-test': JTopics.BEIR_V1_0_0_HOTPOTQA_TEST,
    'beir-v1.0.0-fiqa-test': JTopics.BEIR_V1_0_0_FIQA_TEST,
    'beir-v1.0.0-signal1m-test': JTopics.BEIR_V1_0_0_SIGNAL1M_TEST,
    'beir-v1.0.0-trec-news-test': JTopics.BEIR_V1_0_0_TREC_NEWS_TEST,
    'beir-v1.0.0-robust04-test': JTopics.BEIR_V1_0_0_ROBUST04_TEST,
    'beir-v1.0.0-arguana-test': JTopics.BEIR_V1_0_0_ARGUANA_TEST,
    'beir-v1.0.0-webis-touche2020-test': JTopics.BEIR_V1_0_0_WEBIS_TOUCHE2020_TEST,
    'beir-v1.0.0-cqadupstack-android-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_ANDROID_TEST,
    'beir-v1.0.0-cqadupstack-english-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_ENGLISH_TEST,
    'beir-v1.0.0-cqadupstack-gaming-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_GAMING_TEST,
    'beir-v1.0.0-cqadupstack-gis-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_GIS_TEST,
    'beir-v1.0.0-cqadupstack-mathematica-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_MATHEMATICA_TEST,
    'beir-v1.0.0-cqadupstack-physics-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_PHYSICS_TEST,
    'beir-v1.0.0-cqadupstack-programmers-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_PROGRAMMERS_TEST,
    'beir-v1.0.0-cqadupstack-stats-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_STATS_TEST,
    'beir-v1.0.0-cqadupstack-tex-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_TEX_TEST,
    'beir-v1.0.0-cqadupstack-unix-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_UNIX_TEST,
    'beir-v1.0.0-cqadupstack-webmasters-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_WEBMASTERS_TEST,
    'beir-v1.0.0-cqadupstack-wordpress-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_WORDPRESS_TEST,
    'beir-v1.0.0-quora-test': JTopics.BEIR_V1_0_0_QUORA_TEST,
    'beir-v1.0.0-dbpedia-entity-test': JTopics.BEIR_V1_0_0_DBPEDIA_ENTITY_TEST,
    'beir-v1.0.0-scidocs-test': JTopics.BEIR_V1_0_0_SCIDOCS_TEST,
    'beir-v1.0.0-fever-test': JTopics.BEIR_V1_0_0_FEVER_TEST,
    'beir-v1.0.0-climate-fever-test': JTopics.BEIR_V1_0_0_CLIMATE_FEVER_TEST,
    'beir-v1.0.0-scifact-test': JTopics.BEIR_V1_0_0_SCIFACT_TEST,
    # beir-v1.0.0 test topics, splade_distil_cocodenser_medium variants.
    # NOTE: the keys spell "distil" while the JTopics constants spell "DISTILL".
    'beir-v1.0.0-trec-covid-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_TREC_COVID_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-bioasq-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_BIOASQ_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-nfcorpus-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_NFCORPUS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-nq-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_NQ_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-hotpotqa-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_HOTPOTQA_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-fiqa-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_FIQA_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-signal1m-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_SIGNAL1M_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-trec-news-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_TREC_NEWS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-robust04-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_ROBUST04_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-arguana-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_ARGUANA_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-webis-touche2020-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_WEBIS_TOUCHE2020_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-cqadupstack-android-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_ANDROID_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-cqadupstack-english-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_ENGLISH_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-cqadupstack-gaming-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_GAMING_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-cqadupstack-gis-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_GIS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-cqadupstack-mathematica-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_MATHEMATICA_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-cqadupstack-physics-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_PHYSICS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-cqadupstack-programmers-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_PROGRAMMERS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-cqadupstack-stats-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_STATS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-cqadupstack-tex-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_TEX_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-cqadupstack-unix-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_UNIX_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-cqadupstack-webmasters-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_WEBMASTERS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-cqadupstack-wordpress-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_WORDPRESS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-quora-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_QUORA_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-dbpedia-entity-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_DBPEDIA_ENTITY_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-scidocs-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_SCIDOCS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-fever-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_FEVER_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-climate-fever-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CLIMATE_FEVER_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    'beir-v1.0.0-scifact-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_SCIFACT_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
    # hc4-v1.0 topics: title / desc / desc-title variants per language and split
    'hc4-v1.0-fa-dev-title': JTopics.HC4_V1_0_FA_DEV_TITLE,
    'hc4-v1.0-fa-dev-desc': JTopics.HC4_V1_0_FA_DEV_DESC,
    'hc4-v1.0-fa-dev-desc-title': JTopics.HC4_V1_0_FA_DEV_DESC_TITLE,
    'hc4-v1.0-fa-test-title': JTopics.HC4_V1_0_FA_TEST_TITLE,
    'hc4-v1.0-fa-test-desc': JTopics.HC4_V1_0_FA_TEST_DESC,
    'hc4-v1.0-fa-test-desc-title': JTopics.HC4_V1_0_FA_TEST_DESC_TITLE,
    'hc4-v1.0-fa-en-test-title': JTopics.HC4_V1_0_FA_EN_TEST_TITLE,
    'hc4-v1.0-fa-en-test-desc': JTopics.HC4_V1_0_FA_EN_TEST_DESC,
    'hc4-v1.0-fa-en-test-desc-title': JTopics.HC4_V1_0_FA_EN_TEST_DESC_TITLE,
    'hc4-v1.0-ru-dev-title': JTopics.HC4_V1_0_RU_DEV_TITLE,
    'hc4-v1.0-ru-dev-desc': JTopics.HC4_V1_0_RU_DEV_DESC,
    'hc4-v1.0-ru-dev-desc-title': JTopics.HC4_V1_0_RU_DEV_DESC_TITLE,
    'hc4-v1.0-ru-test-title': JTopics.HC4_V1_0_RU_TEST_TITLE,
    'hc4-v1.0-ru-test-desc': JTopics.HC4_V1_0_RU_TEST_DESC,
    'hc4-v1.0-ru-test-desc-title': JTopics.HC4_V1_0_RU_TEST_DESC_TITLE,
    'hc4-v1.0-ru-en-test-title': JTopics.HC4_V1_0_RU_EN_TEST_TITLE,
    'hc4-v1.0-ru-en-test-desc': JTopics.HC4_V1_0_RU_EN_TEST_DESC,
    'hc4-v1.0-ru-en-test-desc-title': JTopics.HC4_V1_0_RU_EN_TEST_DESC_TITLE,
    'hc4-v1.0-zh-dev-title': JTopics.HC4_V1_0_ZH_DEV_TITLE,
    'hc4-v1.0-zh-dev-desc': JTopics.HC4_V1_0_ZH_DEV_DESC,
    'hc4-v1.0-zh-dev-desc-title': JTopics.HC4_V1_0_ZH_DEV_DESC_TITLE,
    'hc4-v1.0-zh-test-title': JTopics.HC4_V1_0_ZH_TEST_TITLE,
    'hc4-v1.0-zh-test-desc': JTopics.HC4_V1_0_ZH_TEST_DESC,
    'hc4-v1.0-zh-test-desc-title': JTopics.HC4_V1_0_ZH_TEST_DESC_TITLE,
    'hc4-v1.0-zh-en-test-title': JTopics.HC4_V1_0_ZH_EN_TEST_TITLE,
    'hc4-v1.0-zh-en-test-desc': JTopics.HC4_V1_0_ZH_EN_TEST_DESC,
    'hc4-v1.0-zh-en-test-desc-title': JTopics.HC4_V1_0_ZH_EN_TEST_DESC_TITLE,
    # NeuCLIR 2022 topics
    'neuclir22-en-title': JTopics.NEUCLIR22_EN_TITLE,
    'neuclir22-en-desc': JTopics.NEUCLIR22_EN_DESC,
    'neuclir22-en-desc-title': JTopics.NEUCLIR22_EN_DESC_TITLE,
    'neuclir22-fa-ht-title': JTopics.NEUCLIR22_FA_HT_TITLE,
    'neuclir22-fa-ht-desc': JTopics.NEUCLIR22_FA_HT_DESC,
    'neuclir22-fa-ht-desc-title': JTopics.NEUCLIR22_FA_HT_DESC_TITLE,
    'neuclir22-fa-mt-title': JTopics.NEUCLIR22_FA_MT_TITLE,
    'neuclir22-fa-mt-desc': JTopics.NEUCLIR22_FA_MT_DESC,
    'neuclir22-fa-mt-desc-title': JTopics.NEUCLIR22_FA_MT_DESC_TITLE,
    'neuclir22-ru-ht-title': JTopics.NEUCLIR22_RU_HT_TITLE,
    'neuclir22-ru-ht-desc': JTopics.NEUCLIR22_RU_HT_DESC,
    'neuclir22-ru-ht-desc-title': JTopics.NEUCLIR22_RU_HT_DESC_TITLE,
    'neuclir22-ru-mt-title': JTopics.NEUCLIR22_RU_MT_TITLE,
    'neuclir22-ru-mt-desc': JTopics.NEUCLIR22_RU_MT_DESC,
    'neuclir22-ru-mt-desc-title': JTopics.NEUCLIR22_RU_MT_DESC_TITLE,
    'neuclir22-zh-ht-title': JTopics.NEUCLIR22_ZH_HT_TITLE,
    'neuclir22-zh-ht-desc': JTopics.NEUCLIR22_ZH_HT_DESC,
    'neuclir22-zh-ht-desc-title': JTopics.NEUCLIR22_ZH_HT_DESC_TITLE,
    'neuclir22-zh-mt-title': JTopics.NEUCLIR22_ZH_MT_TITLE,
    'neuclir22-zh-mt-desc': JTopics.NEUCLIR22_ZH_MT_DESC,
    'neuclir22-zh-mt-desc-title': JTopics.NEUCLIR22_ZH_MT_DESC_TITLE,
    # MIRACL topics
    'miracl-v1.0-ar-dev': JTopics.MIRACL_V10_AR_DEV,
    'miracl-v1.0-bn-dev': JTopics.MIRACL_V10_BN_DEV,
    'miracl-v1.0-en-dev': JTopics.MIRACL_V10_EN_DEV,
    'miracl-v1.0-es-dev': JTopics.MIRACL_V10_ES_DEV,
    'miracl-v1.0-fa-dev': JTopics.MIRACL_V10_FA_DEV,
    'miracl-v1.0-fi-dev': JTopics.MIRACL_V10_FI_DEV,
    'miracl-v1.0-fr-dev': JTopics.MIRACL_V10_FR_DEV,
    'miracl-v1.0-hi-dev': JTopics.MIRACL_V10_HI_DEV,
    'miracl-v1.0-id-dev': JTopics.MIRACL_V10_ID_DEV,
    'miracl-v1.0-ja-dev': JTopics.MIRACL_V10_JA_DEV,
    'miracl-v1.0-ko-dev': JTopics.MIRACL_V10_KO_DEV,
    'miracl-v1.0-ru-dev': JTopics.MIRACL_V10_RU_DEV,
    'miracl-v1.0-sw-dev': JTopics.MIRACL_V10_SW_DEV,
    'miracl-v1.0-te-dev': JTopics.MIRACL_V10_TE_DEV,
    'miracl-v1.0-th-dev': JTopics.MIRACL_V10_TH_DEV,
    'miracl-v1.0-zh-dev': JTopics.MIRACL_V10_ZH_DEV,
    'miracl-v1.0-de-dev': JTopics.MIRACL_V10_DE_DEV,
    'miracl-v1.0-yo-dev': JTopics.MIRACL_V10_YO_DEV,
}
# Maps the symbolic qrels names accepted by ``get_qrels_file`` / ``get_qrels`` to the
# corresponding ``JQrels`` constants.
qrels_mapping = {
    # TREC ad hoc, Robust, Core, and web/terabyte qrels
    'trec1-adhoc': JQrels.TREC1_ADHOC,
    'trec2-adhoc': JQrels.TREC2_ADHOC,
    'trec3-adhoc': JQrels.TREC3_ADHOC,
    'robust04': JQrels.ROBUST04,
    'robust05': JQrels.ROBUST05,
    'core17': JQrels.CORE17,
    'core18': JQrels.CORE18,
    'wt10g': JQrels.WT10G,
    'trec2004-terabyte': JQrels.TREC2004_TERABYTE,
    'trec2005-terabyte': JQrels.TREC2005_TERABYTE,
    'trec2006-terabyte': JQrels.TREC2006_TERABYTE,
    'trec2011-web': JQrels.TREC2011_WEB,
    'trec2012-web': JQrels.TREC2012_WEB,
    'trec2013-web': JQrels.TREC2013_WEB,
    'trec2014-web': JQrels.TREC2014_WEB,
    # mb11-mb14 qrels
    'mb11': JQrels.MB11,
    'mb12': JQrels.MB12,
    'mb13': JQrels.MB13,
    'mb14': JQrels.MB14,
    # car17 qrels
    'car17v1.5-benchmarkY1test': JQrels.CAR17V15_BENCHMARK_Y1_TEST,
    'car17v2.0-benchmarkY1test': JQrels.CAR17V20_BENCHMARK_Y1_TEST,
    # dl19/dl20/dl21 qrels (separate doc and passage entries)
    'dl19-doc': JQrels.TREC2019_DL_DOC,
    'dl19-passage': JQrels.TREC2019_DL_PASSAGE,
    'dl20-doc': JQrels.TREC2020_DL_DOC,
    'dl20-passage': JQrels.TREC2020_DL_PASSAGE,
    'dl21-doc': JQrels.TREC2021_DL_DOC,
    'dl21-passage': JQrels.TREC2021_DL_PASSAGE,
    # msmarco (v1 and v2) qrels
    'msmarco-doc-dev': JQrels.MSMARCO_DOC_DEV,
    'msmarco-passage-dev-subset': JQrels.MSMARCO_PASSAGE_DEV_SUBSET,
    'msmarco-v2-doc-dev': JQrels.MSMARCO_V2_DOC_DEV,
    'msmarco-v2-doc-dev2': JQrels.MSMARCO_V2_DOC_DEV2,
    'msmarco-v2-passage-dev': JQrels.MSMARCO_V2_PASSAGE_DEV,
    'msmarco-v2-passage-dev2': JQrels.MSMARCO_V2_PASSAGE_DEV2,
    # ntcir8 / clef2006 / trec2002 / fire2012 qrels
    'ntcir8-zh': JQrels.NTCIR8_ZH,
    'clef2006-fr': JQrels.CLEF2006_FR,
    'trec2002-ar': JQrels.TREC2002_AR,
    'fire2012-bn': JQrels.FIRE2012_BN,
    'fire2012-hi': JQrels.FIRE2012_HI,
    'fire2012-en': JQrels.FIRE2012_EN,
    # covid qrels
    'covid-complete': JQrels.COVID_COMPLETE,
    'covid-round1': JQrels.COVID_ROUND1,
    'covid-round2': JQrels.COVID_ROUND2,
    'covid-round3': JQrels.COVID_ROUND3,
    'covid-round3-cumulative': JQrels.COVID_ROUND3_CUMULATIVE,
    'covid-round4': JQrels.COVID_ROUND4,
    'covid-round4-cumulative': JQrels.COVID_ROUND4_CUMULATIVE,
    'covid-round5': JQrels.COVID_ROUND5,
    # trec2018-2020 bl qrels
    'trec2018-bl': JQrels.TREC2018_BL,
    'trec2019-bl': JQrels.TREC2019_BL,
    'trec2020-bl': JQrels.TREC2020_BL,
    # mrtydi-v1.1 qrels: train/dev/test for each language
    'mrtydi-v1.1-arabic-train': JQrels.MRTYDI_V11_AR_TRAIN,
    'mrtydi-v1.1-arabic-dev': JQrels.MRTYDI_V11_AR_DEV,
    'mrtydi-v1.1-arabic-test': JQrels.MRTYDI_V11_AR_TEST,
    'mrtydi-v1.1-bengali-train': JQrels.MRTYDI_V11_BN_TRAIN,
    'mrtydi-v1.1-bengali-dev': JQrels.MRTYDI_V11_BN_DEV,
    'mrtydi-v1.1-bengali-test': JQrels.MRTYDI_V11_BN_TEST,
    'mrtydi-v1.1-english-train': JQrels.MRTYDI_V11_EN_TRAIN,
    'mrtydi-v1.1-english-dev': JQrels.MRTYDI_V11_EN_DEV,
    'mrtydi-v1.1-english-test': JQrels.MRTYDI_V11_EN_TEST,
    'mrtydi-v1.1-finnish-train': JQrels.MRTYDI_V11_FI_TRAIN,
    'mrtydi-v1.1-finnish-dev': JQrels.MRTYDI_V11_FI_DEV,
    'mrtydi-v1.1-finnish-test': JQrels.MRTYDI_V11_FI_TEST,
    'mrtydi-v1.1-indonesian-train': JQrels.MRTYDI_V11_ID_TRAIN,
    'mrtydi-v1.1-indonesian-dev': JQrels.MRTYDI_V11_ID_DEV,
    'mrtydi-v1.1-indonesian-test': JQrels.MRTYDI_V11_ID_TEST,
    'mrtydi-v1.1-japanese-train': JQrels.MRTYDI_V11_JA_TRAIN,
    'mrtydi-v1.1-japanese-dev': JQrels.MRTYDI_V11_JA_DEV,
    'mrtydi-v1.1-japanese-test': JQrels.MRTYDI_V11_JA_TEST,
    'mrtydi-v1.1-korean-train': JQrels.MRTYDI_V11_KO_TRAIN,
    'mrtydi-v1.1-korean-dev': JQrels.MRTYDI_V11_KO_DEV,
    'mrtydi-v1.1-korean-test': JQrels.MRTYDI_V11_KO_TEST,
    'mrtydi-v1.1-russian-train': JQrels.MRTYDI_V11_RU_TRAIN,
    'mrtydi-v1.1-russian-dev': JQrels.MRTYDI_V11_RU_DEV,
    'mrtydi-v1.1-russian-test': JQrels.MRTYDI_V11_RU_TEST,
    'mrtydi-v1.1-swahili-train': JQrels.MRTYDI_V11_SW_TRAIN,
    'mrtydi-v1.1-swahili-dev': JQrels.MRTYDI_V11_SW_DEV,
    'mrtydi-v1.1-swahili-test': JQrels.MRTYDI_V11_SW_TEST,
    'mrtydi-v1.1-telugu-train': JQrels.MRTYDI_V11_TE_TRAIN,
    'mrtydi-v1.1-telugu-dev': JQrels.MRTYDI_V11_TE_DEV,
    'mrtydi-v1.1-telugu-test': JQrels.MRTYDI_V11_TE_TEST,
    'mrtydi-v1.1-thai-train': JQrels.MRTYDI_V11_TH_TRAIN,
    'mrtydi-v1.1-thai-dev': JQrels.MRTYDI_V11_TH_DEV,
    'mrtydi-v1.1-thai-test': JQrels.MRTYDI_V11_TH_TEST,
    # beir-v1.0.0 test qrels
    'beir-v1.0.0-trec-covid-test': JQrels.BEIR_V1_0_0_TREC_COVID_TEST,
    'beir-v1.0.0-bioasq-test': JQrels.BEIR_V1_0_0_BIOASQ_TEST,
    'beir-v1.0.0-nfcorpus-test': JQrels.BEIR_V1_0_0_NFCORPUS_TEST,
    'beir-v1.0.0-nq-test': JQrels.BEIR_V1_0_0_NQ_TEST,
    'beir-v1.0.0-hotpotqa-test': JQrels.BEIR_V1_0_0_HOTPOTQA_TEST,
    'beir-v1.0.0-fiqa-test': JQrels.BEIR_V1_0_0_FIQA_TEST,
    'beir-v1.0.0-signal1m-test': JQrels.BEIR_V1_0_0_SIGNAL1M_TEST,
    'beir-v1.0.0-trec-news-test': JQrels.BEIR_V1_0_0_TREC_NEWS_TEST,
    'beir-v1.0.0-robust04-test': JQrels.BEIR_V1_0_0_ROBUST04_TEST,
    'beir-v1.0.0-arguana-test': JQrels.BEIR_V1_0_0_ARGUANA_TEST,
    'beir-v1.0.0-webis-touche2020-test': JQrels.BEIR_V1_0_0_WEBIS_TOUCHE2020_TEST,
    'beir-v1.0.0-cqadupstack-android-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_ANDROID_TEST,
    'beir-v1.0.0-cqadupstack-english-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_ENGLISH_TEST,
    'beir-v1.0.0-cqadupstack-gaming-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_GAMING_TEST,
    'beir-v1.0.0-cqadupstack-gis-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_GIS_TEST,
    'beir-v1.0.0-cqadupstack-mathematica-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_MATHEMATICA_TEST,
    'beir-v1.0.0-cqadupstack-physics-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_PHYSICS_TEST,
    'beir-v1.0.0-cqadupstack-programmers-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_PROGRAMMERS_TEST,
    'beir-v1.0.0-cqadupstack-stats-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_STATS_TEST,
    'beir-v1.0.0-cqadupstack-tex-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_TEX_TEST,
    'beir-v1.0.0-cqadupstack-unix-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_UNIX_TEST,
    'beir-v1.0.0-cqadupstack-webmasters-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_WEBMASTERS_TEST,
    'beir-v1.0.0-cqadupstack-wordpress-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_WORDPRESS_TEST,
    'beir-v1.0.0-quora-test': JQrels.BEIR_V1_0_0_QUORA_TEST,
    'beir-v1.0.0-dbpedia-entity-test': JQrels.BEIR_V1_0_0_DBPEDIA_ENTITY_TEST,
    'beir-v1.0.0-scidocs-test': JQrels.BEIR_V1_0_0_SCIDOCS_TEST,
    'beir-v1.0.0-fever-test': JQrels.BEIR_V1_0_0_FEVER_TEST,
    'beir-v1.0.0-climate-fever-test': JQrels.BEIR_V1_0_0_CLIMATE_FEVER_TEST,
    'beir-v1.0.0-scifact-test': JQrels.BEIR_V1_0_0_SCIFACT_TEST,
    # hc4-v1.0 and hc4-neuclir22 qrels
    'hc4-v1.0-fa-dev': JQrels.HC4_V1_0_FA_DEV,
    'hc4-v1.0-fa-test': JQrels.HC4_V1_0_FA_TEST,
    'hc4-v1.0-ru-dev': JQrels.HC4_V1_0_RU_DEV,
    'hc4-v1.0-ru-test': JQrels.HC4_V1_0_RU_TEST,
    'hc4-v1.0-zh-dev': JQrels.HC4_V1_0_ZH_DEV,
    'hc4-v1.0-zh-test': JQrels.HC4_V1_0_ZH_TEST,
    'hc4-neuclir22-fa-test': JQrels.HC4_NEUCLIR22_FA_TEST,
    'hc4-neuclir22-ru-test': JQrels.HC4_NEUCLIR22_RU_TEST,
    'hc4-neuclir22-zh-test': JQrels.HC4_NEUCLIR22_ZH_TEST,
    # miracl-v1.0 dev qrels
    'miracl-v1.0-ar-dev': JQrels.MIRACL_V10_AR_DEV,
    'miracl-v1.0-bn-dev': JQrels.MIRACL_V10_BN_DEV,
    'miracl-v1.0-en-dev': JQrels.MIRACL_V10_EN_DEV,
    'miracl-v1.0-es-dev': JQrels.MIRACL_V10_ES_DEV,
    'miracl-v1.0-fa-dev': JQrels.MIRACL_V10_FA_DEV,
    'miracl-v1.0-fi-dev': JQrels.MIRACL_V10_FI_DEV,
    'miracl-v1.0-fr-dev': JQrels.MIRACL_V10_FR_DEV,
    'miracl-v1.0-hi-dev': JQrels.MIRACL_V10_HI_DEV,
    'miracl-v1.0-id-dev': JQrels.MIRACL_V10_ID_DEV,
    'miracl-v1.0-ja-dev': JQrels.MIRACL_V10_JA_DEV,
    'miracl-v1.0-ko-dev': JQrels.MIRACL_V10_KO_DEV,
    'miracl-v1.0-ru-dev': JQrels.MIRACL_V10_RU_DEV,
    'miracl-v1.0-sw-dev': JQrels.MIRACL_V10_SW_DEV,
    'miracl-v1.0-te-dev': JQrels.MIRACL_V10_TE_DEV,
    'miracl-v1.0-th-dev': JQrels.MIRACL_V10_TH_DEV,
    'miracl-v1.0-zh-dev': JQrels.MIRACL_V10_ZH_DEV,
    'miracl-v1.0-de-dev': JQrels.MIRACL_V10_DE_DEV,
    'miracl-v1.0-yo-dev': JQrels.MIRACL_V10_YO_DEV,
}
def get_topics(collection_name):
    """Load one of the registered topic sets as a Python dictionary.

    Parameters
    ----------
    collection_name : str
        Symbolic name of the topic set; must be a key of ``topics_mapping``.

    Returns
    -------
    result : dictionary
        Maps each topic id to a dictionary of that topic's fields. Ids consisting solely of
        decimal digits are converted to ``int``; all other ids are kept as strings.

    Raises
    ------
    ValueError
        If ``collection_name`` is not a key of ``topics_mapping``.
    """
    if collection_name not in topics_mapping:
        raise ValueError(f'Topic {collection_name} Not Found')

    topics = JTopicReader.getTopicsWithStringIds(topics_mapping[collection_name])

    t = {}
    for topic in topics.keySet().toArray():
        # isdecimal() rather than isdigit(): isdigit() also accepts characters such as
        # superscript digits, which int() rejects with a ValueError.
        topic_key = int(topic) if topic.isdecimal() else topic
        # Look the per-topic field map up once instead of once per field.
        fields = topics.get(topic)
        t[topic_key] = {key: fields.get(key) for key in fields.keySet().toArray()}
    return t
def get_topics_with_reader(reader_class, file):
    """Load topics from a file using an explicitly chosen topic reader.

    Parameters
    ----------
    reader_class
        Passed through to ``JTopicReader``; presumably identifies the ``TopicReader``
        implementation to use (verify against callers).
    file
        Passed through to ``JTopicReader``; presumably the path of the topics file
        (verify against callers).

    Returns
    -------
    result : dictionary
        Maps each topic id to a dictionary of that topic's fields. Ids consisting solely of
        decimal digits are converted to ``int``; all other ids are kept as strings.

    Raises
    ------
    ValueError
        If the reader returns ``None`` for the given ``reader_class`` and ``file``.
    """
    # Yes, this is an insanely ridiculous method name.
    topics = JTopicReader.getTopicsWithStringIdsFromFileWithTopicReaderClass(reader_class, file)
    if topics is None:
        raise ValueError(f'Unable to initialize TopicReader {reader_class} with file {file}!')

    t = {}
    for topic in topics.keySet().toArray():
        # isdecimal() rather than isdigit(): isdigit() also accepts characters such as
        # superscript digits, which int() rejects with a ValueError.
        topic_key = int(topic) if topic.isdecimal() else topic
        # Look the per-topic field map up once instead of once per field.
        fields = topics.get(topic)
        t[topic_key] = {key: fields.get(key) for key in fields.keySet().toArray()}
    return t
def get_qrels_file(collection_name):
    """Return the path of the locally cached qrels file, creating it on first use.

    Parameters
    ----------
    collection_name : str
        Symbolic name of the qrels; must be a key of ``qrels_mapping``.

    Returns
    -------
    path : str
        Path of the qrels file under the cache home directory.

    Raises
    ------
    FileNotFoundError
        If ``collection_name`` is not a key of ``qrels_mapping``.
    """
    if collection_name not in qrels_mapping:
        raise FileNotFoundError(f'no qrels file for {collection_name}')

    qrels = qrels_mapping[collection_name]
    target_path = os.path.join(get_cache_home(), qrels.path)
    if os.path.exists(target_path):
        return target_path

    # Fetch the content before touching the filesystem: if this call fails, no empty file
    # is left behind for later calls to mistake for a valid cached copy.
    qrels_content = JRelevanceJudgments.getQrelsResource(qrels)

    # exist_ok avoids failing when the directory appears between a check and the creation.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    with open(target_path, 'w') as file:
        file.write(qrels_content)
    return target_path
def get_qrels(collection_name):
    """Load the qrels registered under ``collection_name`` as a nested dictionary.

    Parameters
    ----------
    collection_name : str
        Symbolic name of the qrels, as accepted by ``get_qrels_file``.

    Returns
    -------
    result : dictionary
        Maps each query id to a dictionary from document id to judgement. Query and document
        ids consisting solely of decimal digits are converted to ``int``; judgements are kept
        as the strings read from the file.

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly four whitespace-separated fields.
    """
    file_path = get_qrels_file(collection_name)

    qrels = {}
    with open(file_path, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at the end of the file).
                continue
            qid, _, docid, judgement = fields
            # isdecimal() rather than isdigit(): isdigit() also accepts characters such as
            # superscript digits, which int() rejects with a ValueError.
            qrels_key = int(qid) if qid.isdecimal() else qid
            doc_key = int(docid) if docid.isdecimal() else docid
            qrels.setdefault(qrels_key, {})[doc_key] = judgement
    return qrels