Spaces:

Tiggy115
/

NLP4Web_HW1

Sleeping

App Files Files Community

Kurt commited on Nov 10, 2024

Commit

b7fb5a8

1 Parent(s): 2b4cfa7

cool2

Browse files

Files changed (1) hide show

app.py +333 -7

app.py CHANGED Viewed

@@ -20,6 +20,10 @@ from nlp4web_codebase.nlp4web_codebase.ir.data_loaders.sciq import load_sciq
 from nlp4web_codebase.nlp4web_codebase.ir.models import BaseRetriever
 from typing import Type
 from abc import abstractmethod
 LANGUAGE = "english"
 word_splitter = re.compile(r"(?u)\b\w\w+\b").findall
@@ -308,6 +312,326 @@ class BM25Retriever(BaseInvertedIndexRetriever):
 bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
 bm25_retriever.retrieve("What type of diseases occur when the immune system attacks normal body cells?")
 class Hit(TypedDict):
   cid: str
   score: float
@@ -317,21 +641,23 @@ demo: Optional[gr.Interface] = None  # Assign your gradio demo to this variable
 return_type = List[Hit]
 ## YOUR_CODE_STARTS_HERE
-bm25_index = BM25Index.build_from_documents(
     documents=iter(sciq.corpus),
     ndocs=12160,
-    show_progress_bar=True
 )
-bm25_index.save("output/bm25_index")
 def search(query: str) -> List[Hit]:
-    bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
-    result = bm25_retriever.retrieve(query)
     l : return_type = []
     for cid, score in result.items():
-      docid = bm25_retriever.index.cid2docid[cid]
-      text = bm25_retriever.index.doc_texts[docid]
       l.append(Hit(cid=cid, score=score, text=text))

 from nlp4web_codebase.nlp4web_codebase.ir.models import BaseRetriever
 from typing import Type
 from abc import abstractmethod
+from nlp4web_codebase.nlp4web_codebase.ir.data_loaders import Split
+import pytrec_eval
+import numpy as np
+from scipy.sparse._csc import csc_matrix
 LANGUAGE = "english"
 word_splitter = re.compile(r"(?u)\b\w\w+\b").findall
 bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
 bm25_retriever.retrieve("What type of diseases occur when the immune system attacks normal body cells?")
+def evaluate_map(rankings: Dict[str, Dict[str, float]], split=Split.dev) -> float:
+  metric = "map_cut_10"
+  qrels = sciq.get_qrels_dict(split)
+  evaluator = pytrec_eval.RelevanceEvaluator(sciq.get_qrels_dict(split), (metric,))
+  qps = evaluator.evaluate(rankings)
+  return float(np.mean([qp[metric] for qp in qps.values()]))
+"""Example of using the pre-requisite code:"""
+# Loading dataset:
+from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
+sciq = load_sciq()
+counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
+# Building BM25 index and save:
+bm25_index = BM25Index.build_from_documents(
+    documents=iter(sciq.corpus),
+    ndocs=12160,
+    show_progress_bar=True
+)
+bm25_index.save("output/bm25_index")
+# Loading index and use BM25 retriever to retrieve:
+bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
+print(bm25_retriever.retrieve("What type of diseases occur when the immune system attacks normal body cells?"))  # the ranking
+plots_b: Dict[str, List[float]] = {
+    "X": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
+    "Y": []
+}
+plots_k1: Dict[str, List[float]] = {
+    "X": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
+    "Y": []
+}
+## YOUR_CODE_STARTS_HERE
+# Two steps should be involved:
+# Step 1. Fix k1 value to the default one 0.9,
+# go through all the candidate b values (0, 0.1, ..., 1.0),
+# and record in plots_b["Y"] the corresponding performances obtained via evaluate_map;
+# Step 2. Fix b to the best one in step 1. and do the same for k1.
+# Hint (on using the pre-requisite code):
+# - One can use the loaded sciq dataset directly (loaded in the pre-requisite code);
+# - One can build bm25_index with `BM25Index.build_from_documents`;
+# - One can use BM25Retriever to load the index and perform retrieval on the dev queries
+# (dev queries can be obtained via sciq.get_split_queries(Split.dev))
+k1 = 0.9
+b_list = []
+for b in plots_b["X"]:
+  bm25_index = BM25Index.build_from_documents(
+    documents=iter(sciq.corpus),
+    ndocs=12160,
+    show_progress_bar=True,
+    k1=k1,
+    b=b
+  )
+  bm25_index.save("output/bm25_index")
+  bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
+  rankings = {}
+  for query in sciq.get_split_queries(Split.dev):
+    ranking = bm25_retriever.retrieve(query=query.text)
+    rankings[query.query_id] = ranking
+  optimized_map = evaluate_map(rankings, split=Split.dev)
+  b_list.append(optimized_map)
+plots_b["Y"] = b_list
+b = plots_b["X"][np.argmax(plots_b["Y"])]
+k1_list = []
+for k1 in plots_k1["X"]:
+  bm25_index = BM25Index.build_from_documents(
+    documents=iter(sciq.corpus),
+    ndocs=12160,
+    show_progress_bar=True,
+    k1=k1,
+    b=b
+  )
+  bm25_index.save("output/bm25_index")
+  bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
+  rankings = {}
+  for query in sciq.get_split_queries(Split.dev):
+    ranking = bm25_retriever.retrieve(query=query.text)
+    rankings[query.query_id] = ranking
+  optimized_map = evaluate_map(rankings, split=Split.dev)
+  k1_list.append(optimized_map)
+plots_k1["Y"] = k1_list
+"""Let's check the effectiveness gain on test after this tuning on dev"""
+default_map = 0.7849
+best_b = plots_b["X"][np.argmax(plots_b["Y"])]
+best_k1 = plots_k1["X"][np.argmax(plots_k1["Y"])]
+bm25_index = BM25Index.build_from_documents(
+    documents=iter(sciq.corpus),
+    ndocs=12160,
+    show_progress_bar=True,
+    k1=best_k1,
+    b=best_b
+)
+bm25_index.save("output/bm25_index")
+bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
+rankings = {}
+for query in sciq.get_split_queries(Split.test):  # note this is now on test
+  ranking = bm25_retriever.retrieve(query=query.text)
+  rankings[query.query_id] = ranking
+optimized_map = evaluate_map(rankings, split=Split.test)  # note this is now on test
+print(default_map, optimized_map)
+"""## TASK2.2: implement `CSCBM25Index` (4 points)
+Implement `CSCBM25Index` by completing the missing code. Note that `CSCInvertedIndex` is similar to `InvertedIndex` which we talked about during the class. The main difference is posting lists are represented by a CSC sparse matrix.
+"""
+@dataclass
+class CSCInvertedIndex:
+    posting_lists_matrix: csc_matrix  # docid -> posting_list
+    vocab: Dict[str, int]
+    cid2docid: Dict[str, int]  # collection_id -> docid
+    collection_ids: List[str]  # docid -> collection_id
+    doc_texts: Optional[List[str]] = None  # docid -> document text
+    def save(self, output_dir: str) -> None:
+        os.makedirs(output_dir, exist_ok=True)
+        with open(os.path.join(output_dir, "index.pkl"), "wb") as f:
+            pickle.dump(self, f)
+    @classmethod
+    def from_saved(cls: Type[T], saved_dir: str) -> T:
+        index = cls(
+            posting_lists_matrix=None, vocab={}, cid2docid={}, collection_ids=[], doc_texts=None
+        )
+        with open(os.path.join(saved_dir, "index.pkl"), "rb") as f:
+            index = pickle.load(f)
+        return index
+@dataclass
+class CSCBM25Index(CSCInvertedIndex):
+    @staticmethod
+    def tokenize(text: str) -> List[str]:
+        return simple_tokenize(text)
+    @staticmethod
+    def cache_term_weights(
+        posting_lists: List[PostingList],
+        total_docs: int,
+        avgdl: float,
+        dfs: List[int],
+        dls: List[int],
+        k1: float,
+        b: float,
+    ) -> csc_matrix:
+        """Compute term weights and caching"""
+        ## YOUR_CODE_STARTS_HERE
+        data = []
+        indices = []
+        indptr = []
+        N = total_docs
+        for tid, posting_list in enumerate(
+            tqdm.tqdm(posting_lists, desc="Regularizing TFs")
+        ):
+            if indptr == []:
+              indptr.append(0)
+            #if dfs[tid] != len(posting_list.docid_postings):
+            #  print(dfs[tid], ", ", len(posting_list.docid_postings))
+            #if dfs[tid] == 0:
+            #  print(posting_list.docid_postings[0])
+            indptr.append(indptr[-1] + len(posting_list.docid_postings))
+            idf = CSCBM25Index.calc_idf(df=dfs[tid], N=N)
+            for i in range(len(posting_list.docid_postings)):
+                docid = posting_list.docid_postings[i]
+                indices.append(docid)
+                tf = posting_list.tweight_postings[i]
+                dl = dls[docid]
+                regularized_tf = CSCBM25Index.calc_regularized_tf(
+                    tf=tf, dl=dl, avgdl=avgdl, k1=k1, b=b
+                )
+                tf_idf = regularized_tf * idf
+                data.append(tf_idf)
+        posting_lists_matrix = csc_matrix((data, indices, indptr)).astype(np.float32)
+        print(posting_lists_matrix.shape)
+        return posting_lists_matrix
+        ## YOUR_CODE_ENDS_HERE
+    @staticmethod
+    def calc_regularized_tf(
+        tf: int, dl: float, avgdl: float, k1: float, b: float
+    ) -> float:
+        return tf / (tf + k1 * (1 - b + b * dl / avgdl))
+    @staticmethod
+    def calc_idf(df: int, N: int):
+        return math.log(1 + (N - df + 0.5) / (df + 0.5))
+    @classmethod
+    def build_from_documents(
+        cls: Type[CSCBM25Index],
+        documents: Iterable[Document],
+        store_raw: bool = True,
+        output_dir: Optional[str] = None,
+        ndocs: Optional[int] = None,
+        show_progress_bar: bool = True,
+        k1: float = 0.9,
+        b: float = 0.4,
+    ) -> CSCBM25Index:
+        # Counting TFs, DFs, doc_lengths, etc.:
+        counting = run_counting(
+            documents=documents,
+            tokenize_fn=CSCBM25Index.tokenize,
+            store_raw=store_raw,
+            ndocs=ndocs,
+            show_progress_bar=show_progress_bar,
+        )
+        # Compute term weights and caching:
+        posting_lists = counting.posting_lists
+        total_docs = len(counting.cid2docid)
+        posting_lists_matrix = CSCBM25Index.cache_term_weights(
+            posting_lists=posting_lists,
+            total_docs=total_docs,
+            avgdl=counting.avgdl,
+            dfs=counting.dfs,
+            dls=counting.dls,
+            k1=k1,
+            b=b,
+        )
+        # Assembly and save:
+        index = CSCBM25Index(
+            posting_lists_matrix=posting_lists_matrix,
+            vocab=counting.vocab,
+            cid2docid=counting.cid2docid,
+            collection_ids=counting.collection_ids,
+            doc_texts=counting.doc_texts,
+        )
+        return index
+csc_bm25_index = CSCBM25Index.build_from_documents(
+    documents=iter(sciq.corpus),
+    ndocs=12160,
+    show_progress_bar=True,
+    k1=best_k1,
+    b=best_b
+)
+csc_bm25_index.save("output/csc_bm25_index")
+class BaseCSCInvertedIndexRetriever(BaseRetriever):
+    @property
+    @abstractmethod
+    def index_class(self) -> Type[CSCInvertedIndex]:
+        pass
+    def __init__(self, index_dir: str) -> None:
+        self.index = self.index_class.from_saved(index_dir)
+    def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
+        ## YOUR_CODE_STARTS_HERE
+        toks = CSCBM25Index.tokenize(query)
+        target_docid = self.index.cid2docid[cid]
+        term_weights = {}
+        for tok in toks:
+            if tok not in self.index.vocab:
+                continue
+            tid = self.index.vocab[tok]
+            weight = self.index.posting_lists_matrix[target_docid, tid]
+            if weight == 0:
+              continue
+            term_weights[tok] = weight
+        return term_weights
+        ## YOUR_CODE_ENDS_HERE
+    def score(self, query: str, cid: str) -> float:
+        return sum(self.get_term_weights(query=query, cid=cid).values())
+    def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
+        ## YOUR_CODE_STARTS_HERE
+        toks = CSCBM25Index.tokenize(query)
+        docid2score: Dict[int, float] = {}
+        for tok in toks:
+            if tok not in self.index.vocab:
+                continue
+            tid = self.index.vocab[tok]
+            posting_list = self.index.posting_lists_matrix.getcol(tid)
+            indices = posting_list.indices
+            weights = posting_list.data
+            for docid, tweight in zip(indices, weights):
+              docid2score.setdefault(docid, 0)
+              docid2score[docid] += tweight
+        docid2score = dict(
+            sorted(docid2score.items(), key=lambda pair: pair[1], reverse=True)[:topk]
+        )
+        ranking = {
+            self.index.collection_ids[docid]: score
+            for docid, score in docid2score.items()
+        }
+        return ranking
+        ## YOUR_CODE_ENDS_HERE
+class CSCBM25Retriever(BaseCSCInvertedIndexRetriever):
+    @property
+    def index_class(self) -> Type[CSCBM25Index]:
+        return CSCBM25Index
 class Hit(TypedDict):
   cid: str
   score: float
 return_type = List[Hit]
 ## YOUR_CODE_STARTS_HERE
+csc_bm25_index = CSCBM25Index.build_from_documents(
     documents=iter(sciq.corpus),
     ndocs=12160,
+    show_progress_bar=True,
+    k1=best_k1,
+    b=best_b
 )
+csc_bm25_index.save("output/csc_bm25_index")
 def search(query: str) -> List[Hit]:
+    csc_bm25_index = CSCBM25Retriever(index_dir="output/csc_bm25_index")
+    result = csc_bm25_index.retrieve(query)
     l : return_type = []
     for cid, score in result.items():
+      docid = csc_bm25_index.index.cid2docid[cid]
+      text = csc_bm25_index.index.doc_texts[docid]
       l.append(Hit(cid=cid, score=score, text=text))