Spaces:
Sleeping
Sleeping
from dataclasses import dataclass | |
from enum import Enum | |
from typing import Dict, List | |
from nlp4web_codebase.ir.data_loaders.dm import Document, Query, QRel | |
class Split(str, Enum):
    """Names of the dataset partitions.

    Members also subclass ``str``, so each one compares equal to its plain
    string value (``Split.train == "train"``).
    """

    train = "train"
    dev = "dev"
    test = "test"
@dataclass
class IRDataset:
    """A retrieval dataset: a corpus, its queries, and relevance judgments per split.

    The three fields are supplied through the dataclass-generated
    ``__init__``, positionally or by keyword.
    """

    # Documents of the collection.
    corpus: List[Document]
    # Queries of every split; ``get_split_queries`` filters them per split.
    queries: List[Query]
    # Relevance judgments, grouped by the split they belong to.
    split2qrels: Dict[Split, List[QRel]]

    def get_stats(self) -> Dict[str, int]:
        """Return the corpus size, the query count, and the qrel count per split.

        Returns:
            A dict with the keys ``"|corpus|"`` and ``"|queries|"`` plus one
            ``"|qrels-<split>|"`` key for each entry of ``split2qrels``.
        """
        stats = {"|corpus|": len(self.corpus), "|queries|": len(self.queries)}
        for split, qrels in self.split2qrels.items():
            # Format the member's value explicitly. Interpolating a
            # ``str``-mixin enum member directly yields "train" on older
            # Pythons but "Split.train" on 3.11+, which would make the keys
            # depend on the interpreter version.
            split_name = split.value if isinstance(split, Enum) else split
            stats[f"|qrels-{split_name}|"] = len(qrels)
        return stats

    def get_qrels_dict(self, split: Split) -> Dict[str, Dict[str, int]]:
        """Return the judgments of ``split`` as a nested mapping.

        Args:
            split: Key into ``split2qrels``.

        Returns:
            ``{query_id: {collection_id: relevance}}``. If the same
            (query, document) pair occurs more than once, the last
            occurrence wins.

        Raises:
            KeyError: If ``split2qrels`` has no entry for ``split``.
        """
        qrels_dict: Dict[str, Dict[str, int]] = {}
        for qrel in self.split2qrels[split]:
            qrels_dict.setdefault(qrel.query_id, {})[qrel.collection_id] = qrel.relevance
        return qrels_dict

    def get_split_queries(self, split: Split) -> List[Query]:
        """Return the queries that have at least one judgment in ``split``.

        The result keeps the order of ``self.queries``.

        Raises:
            KeyError: If ``split2qrels`` has no entry for ``split``.
        """
        # Collect the ids once so each membership test below is O(1).
        judged_ids = {qrel.query_id for qrel in self.split2qrels[split]}
        return [query for query in self.queries if query.query_id in judged_ids]