from __future__ import annotations from dataclasses import dataclass import pickle import os from typing import Iterable, Callable, List, Dict, Optional, Type, TypeVar from nlp4web_codebase.ir.data_loaders.dm import Document from collections import Counter import tqdm import re import nltk nltk.download("stopwords", quiet=True) from nltk.corpus import stopwords as nltk_stopwords LANGUAGE = "english" word_splitter = re.compile(r"(?u)\b\w\w+\b").findall stopwords = set(nltk_stopwords.words(LANGUAGE)) def word_splitting(text: str) -> List[str]: return word_splitter(text.lower()) def lemmatization(words: List[str]) -> List[str]: return words # We ignore lemmatization here for simplicity def simple_tokenize(text: str) -> List[str]: words = word_splitting(text) tokenized = list(filter(lambda w: w not in stopwords, words)) tokenized = lemmatization(tokenized) return tokenized T = TypeVar("T", bound="InvertedIndex") @dataclass class PostingList: term: str # The term docid_postings: List[int] # docid_postings[i] means the docid (int) of the i-th associated posting tweight_postings: List[float] # tweight_postings[i] means the term weight (float) of the i-th associated posting @dataclass class InvertedIndex: posting_lists: List[PostingList] # docid -> posting_list vocab: Dict[str, int] cid2docid: Dict[str, int] # collection_id -> docid collection_ids: List[str] # docid -> collection_id doc_texts: Optional[List[str]] = None # docid -> document text def save(self, output_dir: str) -> None: os.makedirs(output_dir, exist_ok=True) with open(os.path.join(output_dir, "index.pkl"), "wb") as f: pickle.dump(self, f) @classmethod def from_saved(cls: Type[T], saved_dir: str) -> T: index = cls( posting_lists=[], vocab={}, cid2docid={}, collection_ids=[], doc_texts=None ) with open(os.path.join(saved_dir, "index.pkl"), "rb") as f: index = pickle.load(f) return index # The output of the counting function: @dataclass class Counting: posting_lists: List[PostingList] vocab: Dict[str, int] cid2docid: Dict[str, int] collection_ids: List[str] dfs: List[int] # tid -> df dls: List[int] # docid -> doc length avgdl: float nterms: int doc_texts: Optional[List[str]] = None def run_counting( documents: Iterable[Document], tokenize_fn: Callable[[str], List[str]] = simple_tokenize, store_raw: bool = True, # store the document text in doc_texts ndocs: Optional[int] = None, show_progress_bar: bool = True, ) -> Counting: """Counting TFs, DFs, doc_lengths, etc.""" posting_lists: List[PostingList] = [] vocab: Dict[str, int] = {} cid2docid: Dict[str, int] = {} collection_ids: List[str] = [] dfs: List[int] = [] # tid -> df dls: List[int] = [] # docid -> doc length nterms: int = 0 doc_texts: Optional[List[str]] = [] for doc in tqdm.tqdm( documents, desc="Counting", total=ndocs, disable=not show_progress_bar, ): if doc.collection_id in cid2docid: continue collection_ids.append(doc.collection_id) docid = cid2docid.setdefault(doc.collection_id, len(cid2docid)) toks = tokenize_fn(doc.text) tok2tf = Counter(toks) dls.append(sum(tok2tf.values())) for tok, tf in tok2tf.items(): nterms += tf tid = vocab.get(tok, None) if tid is None: posting_lists.append( PostingList(term=tok, docid_postings=[], tweight_postings=[]) ) tid = vocab.setdefault(tok, len(vocab)) posting_lists[tid].docid_postings.append(docid) posting_lists[tid].tweight_postings.append(tf) if tid < len(dfs): dfs[tid] += 1 else: dfs.append(0) if store_raw: doc_texts.append(doc.text) else: doc_texts = None return Counting( posting_lists=posting_lists, vocab=vocab, cid2docid=cid2docid, collection_ids=collection_ids, dfs=dfs, dls=dls, avgdl=sum(dls) / len(dls), nterms=nterms, doc_texts=doc_texts, ) from nlp4web_codebase.ir.data_loaders.sciq import load_sciq sciq = load_sciq() counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus)) """### BM25 Index""" from dataclasses import asdict, dataclass import math import os from typing import Iterable, List, Optional, Type import tqdm from nlp4web_codebase.ir.data_loaders.dm import Document @dataclass class BM25Index(InvertedIndex): @staticmethod def tokenize(text: str) -> List[str]: return simple_tokenize(text) @staticmethod def cache_term_weights( posting_lists: List[PostingList], total_docs: int, avgdl: float, dfs: List[int], dls: List[int], k1: float, b: float, ) -> None: """Compute term weights and caching""" N = total_docs for tid, posting_list in enumerate( tqdm.tqdm(posting_lists, desc="Regularizing TFs") ): idf = BM25Index.calc_idf(df=dfs[tid], N=N) for i in range(len(posting_list.docid_postings)): docid = posting_list.docid_postings[i] tf = posting_list.tweight_postings[i] dl = dls[docid] regularized_tf = BM25Index.calc_regularized_tf( tf=tf, dl=dl, avgdl=avgdl, k1=k1, b=b ) posting_list.tweight_postings[i] = regularized_tf * idf @staticmethod def calc_regularized_tf( tf: int, dl: float, avgdl: float, k1: float, b: float ) -> float: return tf / (tf + k1 * (1 - b + b * dl / avgdl)) @staticmethod def calc_idf(df: int, N: int): return math.log(1 + (N - df + 0.5) / (df + 0.5)) @classmethod def build_from_documents( cls: Type[BM25Index], documents: Iterable[Document], store_raw: bool = True, output_dir: Optional[str] = None, ndocs: Optional[int] = None, show_progress_bar: bool = True, k1: float = 0.9, b: float = 0.4, ) -> BM25Index: # Counting TFs, DFs, doc_lengths, etc.: counting = run_counting( documents=documents, tokenize_fn=BM25Index.tokenize, store_raw=store_raw, ndocs=ndocs, show_progress_bar=show_progress_bar, ) # Compute term weights and caching: posting_lists = counting.posting_lists total_docs = len(counting.cid2docid) BM25Index.cache_term_weights( posting_lists=posting_lists, total_docs=total_docs, avgdl=counting.avgdl, dfs=counting.dfs, dls=counting.dls, k1=k1, b=b, ) # Assembly and save: index = BM25Index( posting_lists=posting_lists, vocab=counting.vocab, cid2docid=counting.cid2docid, collection_ids=counting.collection_ids, doc_texts=counting.doc_texts, ) return index bm25_index = BM25Index.build_from_documents( documents=iter(sciq.corpus), ndocs=12160, show_progress_bar=True, ) bm25_index.save("output/bm25_index") """### BM25 Retriever""" from nlp4web_codebase.ir.models import BaseRetriever from typing import Type from abc import abstractmethod class BaseInvertedIndexRetriever(BaseRetriever): @property @abstractmethod def index_class(self) -> Type[InvertedIndex]: pass def __init__(self, index_dir: str) -> None: self.index = self.index_class.from_saved(index_dir) def get_term_weights(self, query: str, cid: str) -> Dict[str, float]: toks = self.index.tokenize(query) target_docid = self.index.cid2docid[cid] term_weights = {} for tok in toks: if tok not in self.index.vocab: continue tid = self.index.vocab[tok] posting_list = self.index.posting_lists[tid] for docid, tweight in zip( posting_list.docid_postings, posting_list.tweight_postings ): if docid == target_docid: term_weights[tok] = tweight break return term_weights def score(self, query: str, cid: str) -> float: return sum(self.get_term_weights(query=query, cid=cid).values()) def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]: toks = self.index.tokenize(query) docid2score: Dict[int, float] = {} for tok in toks: if tok not in self.index.vocab: continue tid = self.index.vocab[tok] posting_list = self.index.posting_lists[tid] for docid, tweight in zip( posting_list.docid_postings, posting_list.tweight_postings ): docid2score.setdefault(docid, 0) docid2score[docid] += tweight docid2score = dict( sorted(docid2score.items(), key=lambda pair: pair[1], reverse=True)[:topk] ) return { self.index.collection_ids[docid]: score for docid, score in docid2score.items() } class BM25Retriever(BaseInvertedIndexRetriever): @property def index_class(self) -> Type[BM25Index]: return BM25Index bm25_retriever = BM25Retriever(index_dir="output/bm25_index") bm25_retriever.retrieve("What type of diseases occur when the immune system attacks normal body cells?") """# TASK1: tune b and k1 (4 points) Tune b and k1 on the **dev** split of SciQ using the metric MAP@10. The evaluation function (`evalaute_map`) is provided. Record the values in `plots_k1` and `plots_b`. Do it in a greedy manner: as the influence from b is larger, please first tune b (with k1 fixed to the default value 0.9) and use the best value of b to further tune k1. $${\displaystyle {\text{score}}(D,Q)=\sum _{i=1}^{n}{\text{IDF}}(q_{i})\cdot {\frac {f(q_{i},D)\cdot (k_{1}+1)}{f(q_{i},D)+k_{1}\cdot \left(1-b+b\cdot {\frac {|D|}{\text{avgdl}}}\right)}}}$$ """ from nlp4web_codebase.ir.data_loaders import Split import pytrec_eval import numpy as np def evaluate_map(rankings: Dict[str, Dict[str, float]], split=Split.dev) -> float: metric = "map_cut_10" qrels = sciq.get_qrels_dict(split) evaluator = pytrec_eval.RelevanceEvaluator(sciq.get_qrels_dict(split), (metric,)) qps = evaluator.evaluate(rankings) return float(np.mean([qp[metric] for qp in qps.values()])) """Example of using the pre-requisite code:""" # Loading dataset: from nlp4web_codebase.ir.data_loaders.sciq import load_sciq sciq = load_sciq() counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus)) # Building BM25 index and save: bm25_index = BM25Index.build_from_documents( documents=iter(sciq.corpus), ndocs=12160, show_progress_bar=True ) bm25_index.save("output/bm25_index") # Loading index and use BM25 retriever to retrieve: bm25_retriever = BM25Retriever(index_dir="output/bm25_index") print(bm25_retriever.retrieve("What type of diseases occur when the immune system attacks normal body cells?")) # the ranking print(bm25_retriever.retrieve("cells")) plots_b: Dict[str, List[float]] = { "X": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], "Y": [] } plots_k1: Dict[str, List[float]] = { "X": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], "Y": [] } ## YOUR_CODE_STARTS_HERE # Two steps should be involved: # Step 1. Fix k1 value to the default one 0.9, # go through all the candidate b values (0, 0.1, ..., 1.0), # and record in plots_b["Y"] the corresponding performances obtained via evaluate_map; # Step 2. Fix b to the best one in step 1. and do the same for k1. # Hint (on using the pre-requisite code): # - One can use the loaded sciq dataset directly (loaded in the pre-requisite code); # - One can build bm25_index with `BM25Index.build_from_documents`; # - One can use BM25Retriever to load the index and perform retrieval on the dev queries # (dev queries can be obtained via sciq.get_split_queries(Split.dev)) k1 = 0.9 for b in plots_b["X"]: # Building BM25 index and save: bm25_index = BM25Index.build_from_documents( documents=iter(sciq.corpus), ndocs=12160, show_progress_bar=True, k1=k1, b=b ) bm25_index.save("output/bm25_index") # Loading index and use BM25 retriever to retrieve: bm25_retriever = BM25Retriever(index_dir="output/bm25_index") rankings = {} for query in sciq.get_split_queries(Split.dev): ranking = bm25_retriever.retrieve(query=query.text) rankings[query.query_id] = ranking map = evaluate_map(rankings, split=Split.dev) plots_b["Y"].append(map) best_b = plots_b["X"][np.argmax(plots_b["Y"])] print("Best b:", best_b) for k1 in plots_k1["X"]: # Building BM25 index and save: bm25_index = BM25Index.build_from_documents( documents=iter(sciq.corpus), ndocs=12160, show_progress_bar=True, k1=k1, b=best_b ) bm25_index.save("output/bm25_index") # Loading index and use BM25 retriever to retrieve: bm25_retriever = BM25Retriever(index_dir="output/bm25_index") rankings = {} for query in sciq.get_split_queries(Split.dev): ranking = bm25_retriever.retrieve(query=query.text) rankings[query.query_id] = ranking map = evaluate_map(rankings, split=Split.dev) plots_k1["Y"].append(map) best_k1 = plots_k1["X"][np.argmax(plots_k1["Y"])] print("Best k1:", best_k1) ## YOU_CODE_ENDS_HERE ## TEST_CASES (should be close to 0.8135637188208616 and 0.7512916099773244) print(plots_k1["Y"][9]) print(plots_b["Y"][1]) ## RESULT_CHECKING_POINT print(plots_k1) print(plots_b) from matplotlib import pyplot as plt plt.plot(plots_b["X"], plots_b["Y"], label="b") plt.plot(plots_k1["X"], plots_k1["Y"], label="k1") plt.ylabel("MAP") plt.legend() plt.grid() plt.show() """Let's check the effectiveness gain on test after this tuning on dev""" default_map = 0.7849 best_b = plots_b["X"][np.argmax(plots_b["Y"])] best_k1 = plots_k1["X"][np.argmax(plots_k1["Y"])] bm25_index = BM25Index.build_from_documents( documents=iter(sciq.corpus), ndocs=12160, show_progress_bar=True, k1=best_k1, b=best_b ) bm25_index.save("output/bm25_index") bm25_retriever = BM25Retriever(index_dir="output/bm25_index") rankings = {} for query in sciq.get_split_queries(Split.test): # note this is now on test ranking = bm25_retriever.retrieve(query=query.text) rankings[query.query_id] = ranking optimized_map = evaluate_map(rankings, split=Split.test) # note this is now on test print(default_map, optimized_map) """# TASK2: CSC matrix and `CSCBM25Index` (12 points) Recall that we use Python lists to implement posting lists, mapping term IDs to the documents in which they appear. This is inefficient due to its naive design. Actually [Compressed Sparse Column matrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csc_matrix.html) is very suitable for storing the posting lists and can boost the efficiency. ## TASK2.1: learn about `scipy.sparse.csc_matrix` (2 point) Convert the matrix \begin{bmatrix} 0 & 1 & 0 & 3 \\ 10 & 2 & 1 & 0 \\ 0 & 0 & 0 & 9 \end{bmatrix} to a `csc_matrix` by specifying `data`, `indices`, `indptr` and `shape`. """ from scipy.sparse._csc import csc_matrix input_matrix = [[0, 1, 0, 3], [10, 2, 1, 0], [0, 0, 0, 9]] data = [10, 1, 2, 1, 3, 9] indices = [1, 0, 1, 1, 0, 2] indptr = [0, 1, 3, 4, 6] shape = (3, 4) ## YOUR_CODE_STARTS_HERE # Please assign the values to data, indices, indptr and shape # One can just do it in a hard-coded manner ## YOUR_CODE_ENDS_HERE output_matrix = csc_matrix((data, indices, indptr), shape=shape) output_matrix.A ## TEST_CASES (should be 3 and 11) print((output_matrix.indices + output_matrix.data).tolist()[2]) print((output_matrix.indices + output_matrix.data).tolist()[-1]) ## RESULT_CHECKING_POINT print((output_matrix.indices + output_matrix.data).tolist()) """## TASK2.2: implement `CSCBM25Index` (4 points) Implement `CSCBM25Index` by completing the missing code. Note that `CSCInvertedIndex` is similar to `InvertedIndex` which we talked about during the class. The main difference is posting lists are represented by a CSC sparse matrix. """ @dataclass class CSCInvertedIndex: posting_lists_matrix: csc_matrix # docid -> posting_list vocab: Dict[str, int] cid2docid: Dict[str, int] # collection_id -> docid collection_ids: List[str] # docid -> collection_id doc_texts: Optional[List[str]] = None # docid -> document text def save(self, output_dir: str) -> None: os.makedirs(output_dir, exist_ok=True) with open(os.path.join(output_dir, "index.pkl"), "wb") as f: pickle.dump(self, f) @classmethod def from_saved(cls: Type[T], saved_dir: str) -> T: index = cls( posting_lists_matrix=None, vocab={}, cid2docid={}, collection_ids=[], doc_texts=None ) with open(os.path.join(saved_dir, "index.pkl"), "rb") as f: index = pickle.load(f) return index @dataclass class CSCBM25Index(CSCInvertedIndex): @staticmethod def tokenize(text: str) -> List[str]: return simple_tokenize(text) @staticmethod def cache_term_weights( posting_lists: List[PostingList], total_docs: int, avgdl: float, dfs: List[int], dls: List[int], k1: float, b: float, ) -> csc_matrix: """Compute term weights and caching""" ## YOUR_CODE_STARTS_HERE data = [] indices = [] indptr = [0] N = total_docs for tid, posting_list in enumerate(tqdm.tqdm(posting_lists, desc="Regularizing TFs")): idf = CSCBM25Index.calc_idf(df=dfs[tid], N=N) for i in range(len(posting_list.docid_postings)): docid = posting_list.docid_postings[i] tf = posting_list.tweight_postings[i] dl = dls[docid] regularized_tf = CSCBM25Index.calc_regularized_tf( tf=tf, dl=dl, avgdl=avgdl, k1=k1, b=b ) data.append(regularized_tf * idf) indices.append(docid) indptr.append(len(data)) shape = (total_docs, len(posting_lists)) output_matrix = csc_matrix((data, indices, indptr), shape=shape, dtype=np.float32) return output_matrix ## YOUR_CODE_ENDS_HERE @staticmethod def calc_regularized_tf( tf: int, dl: float, avgdl: float, k1: float, b: float ) -> float: return tf / (tf + k1 * (1 - b + b * dl / avgdl)) @staticmethod def calc_idf(df: int, N: int): return math.log(1 + (N - df + 0.5) / (df + 0.5)) @classmethod def build_from_documents( cls: Type[CSCBM25Index], documents: Iterable[Document], store_raw: bool = True, output_dir: Optional[str] = None, ndocs: Optional[int] = None, show_progress_bar: bool = True, k1: float = 0.9, b: float = 0.4, ) -> CSCBM25Index: # Counting TFs, DFs, doc_lengths, etc.: counting = run_counting( documents=documents, tokenize_fn=CSCBM25Index.tokenize, store_raw=store_raw, ndocs=ndocs, show_progress_bar=show_progress_bar, ) # Compute term weights and caching: posting_lists = counting.posting_lists total_docs = len(counting.cid2docid) posting_lists_matrix = CSCBM25Index.cache_term_weights( posting_lists=posting_lists, total_docs=total_docs, avgdl=counting.avgdl, dfs=counting.dfs, dls=counting.dls, k1=k1, b=b, ) # Assembly and save: index = CSCBM25Index( posting_lists_matrix=posting_lists_matrix, vocab=counting.vocab, cid2docid=counting.cid2docid, collection_ids=counting.collection_ids, doc_texts=counting.doc_texts, ) return index csc_bm25_index = CSCBM25Index.build_from_documents( documents=iter(sciq.corpus), ndocs=12160, show_progress_bar=True, k1=best_k1, b=best_b ) csc_bm25_index.save("output/csc_bm25_index") ## TEST_CASES (should be 7 and 95) print(len(str(os.path.getsize("output/csc_bm25_index/index.pkl")))) print(os.path.getsize("output/csc_bm25_index/index.pkl") // int(1e5)) ## RESULT_CHECKING_POINT print(os.path.getsize("output/csc_bm25_index/index.pkl")) """We can compare the size of the CSC-based index with the Python-list-based index:""" print(os.path.getsize("output/bm25_index/index.pkl")) """## TASK2.3: implement `CSCInvertedIndexRetriever` (6 points) Implement `CSCInvertedIndexRetriever` by completing the missing code. """ class BaseCSCInvertedIndexRetriever(BaseRetriever): @property @abstractmethod def index_class(self) -> Type[CSCInvertedIndex]: pass def __init__(self, index_dir: str) -> None: self.index = self.index_class.from_saved(index_dir) def get_term_weights(self, query: str, cid: str) -> Dict[str, float]: ## YOUR_CODE_STARTS_HERE toks = self.index.tokenize(query) target_docid = self.index.cid2docid[cid] term_weights = {} for tok in toks: if tok not in self.index.vocab: continue tid = self.index.vocab[tok] posting_list = self.index.posting_lists_matrix.getcol(tid) tweight = posting_list.getrow(target_docid).A[0][0] if tweight != 0: term_weights[tok] = tweight return term_weights ## YOUR_CODE_ENDS_HERE def score(self, query: str, cid: str) -> float: return sum(self.get_term_weights(query=query, cid=cid).values()) def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]: ## YOUR_CODE_STARTS_HERE toks = self.index.tokenize(query) docid2score: Dict[int, float] = {} for tok in toks: if tok not in self.index.vocab: continue tid = self.index.vocab[tok] posting_list = self.index.posting_lists_matrix.getcol(tid) for i, post in enumerate(posting_list): tweight = post.A[0][0] docid2score.setdefault(i, 0) docid2score[i] += tweight docid2score = dict( sorted(docid2score.items(), key=lambda pair: pair[1], reverse=True)[:topk] ) return { self.index.collection_ids[docid]: score for docid, score in docid2score.items() } ## YOUR_CODE_ENDS_HERE class CSCBM25Retriever(BaseCSCInvertedIndexRetriever): @property def index_class(self) -> Type[CSCBM25Index]: return CSCBM25Index ## TEST_CASES (should be close to # {'theory': 3.1838157176971436, 'evolution': 3.488086223602295, 'natural': 2.629807710647583, 'selection': 3.552377462387085} # {'train-11632': 16.241527557373047, 'train-10931': 13.352127075195312, 'train-2006': 12.854086875915527, 'train-7040': 12.690572738647461, 'train-1719': 11.01913833618164, 'train-9875': 10.886155128479004, 'train-1971': 10.796306610107422, 'train-9882': 10.535819053649902, 'train-2018': 10.481085777282715, 'test-586': 10.478515625} #) csc_bm25_retriever = CSCBM25Retriever(index_dir="output/csc_bm25_index") query = "Who proposed the theory of evolution by natural selection?" # print(csc_bm25_retriever.get_term_weights(query=query, cid="train-2006")) print(csc_bm25_retriever.retrieve(query)) ## RESULT_CHECKING_POINT csc_bm25_retriever = CSCBM25Retriever(index_dir="output/csc_bm25_index") query = "What are the differences between immunodeficiency and autoimmune diseases?" print(csc_bm25_retriever.get_term_weights(query=query, cid="train-1691")) print(csc_bm25_retriever.retrieve("What are the differences between immunodeficiency and autoimmune diseases?")) """# TASK3: a search-engine demo based on Huggingface space (4 points) ## TASK3.1: create the gradio app (2 point) Create a gradio app to demo the BM25 search engine index on SciQ. The app should have a single input variable for the query (of type `str`) and a single output variable for the returned ranking (of type `List[Hit]` in the code below). Please use the BM25 system with default k1 and b values. Hint: it should use a "search" function of signature: ```python def search(query: str) -> List[Hit]: ... ``` """ import gradio as gr from typing import TypedDict class Hit(TypedDict): cid: str score: float text: str demo: Optional[gr.Interface] = None # Assign your gradio demo to this variable return_type = List[Hit] ## YOUR_CODE_STARTS_HERE def search(query: str) -> List[Hit]: bm25_retriever = CSCBM25Retriever(index_dir="output/csc_bm25_index") result = bm25_retriever.retrieve(query) output = [] for cid, score in result.items(): text = sciq.corpus[counting.cid2docid[cid]].text output.append({"cid": cid, "score": score, "text": text}) return output demo = gr.Interface( fn=search, inputs=gr.Textbox(lines=2, placeholder="query"), outputs=gr.Textbox(lines=10, label="result"), title="Hanifi's Search Engine" ) ## YOUR_CODE_ENDS_HERE demo.launch() ## TEST_CASES (result should be [{'cid': 'train-10966', 'score': 12.417802868109781, 'text': 'Bacteria can be used to make cheese from milk. The bacteria turn the milk sugars into lactic acid. The acid is what causes the milk to curdle to form cheese. Bacteria are also involved in producing other foods. Yogurt is made by using bacteria to ferment milk ( Figure below ). Fermenting cabbage with bacteria produces sauerkraut.'}, {'cid': 'train-0', 'score': 10.702840907292215, 'text': 'Mesophiles grow best in moderate temperature, typically between 25°C and 40°C (77°F and 104°F). Mesophiles are often found living in or on the bodies of humans or other animals. The optimal growth temperature of many pathogenic mesophiles is 37°C (98°F), the normal human body temperature. Mesophilic organisms have important uses in food preparation, including cheese, yogurt, beer and wine.'}, {'cid': 'dev-569', 'score': 9.78520518303728, 'text': 'A wide range of friendly bacteria live in the gut. Bacteria begin to populate the human digestive system right after birth. Gut bacteria include Lactobacillus , the bacteria commonly used in probiotic foods such as yogurt, and E. coli bacteria. About a third of all bacteria in the gut are members of the Bacteroides species. Bacteroides are key in helping us digest plant food.'}, {'cid': 'train-1133', 'score': 8.292180216871554, 'text': 'Osteoporosis is a disease in which bones lose mass and become more fragile than they should be. Osteoporosis also makes bones more likely to break. Two of the easiest ways to prevent osteoporosis are eating a healthy diet that has the right amount of calcium and vitamin D and to do some sort of weight-bearing exercise every day. Foods that are a good source of calcium include milk, yogurt, and cheese. Non-dairy sources of calcium include Chinese cabbage, kale, and broccoli. Many fruit juices, fruit drinks, tofu, and cereals have calcium added to them. It is recommended that teenagers get 1300 mg of calcium every day. For example, one cup (8 fl. oz. ) of milk provides about 300 mg of calcium, or about 30% of the daily requirement.'}, {'cid': 'train-5314', 'score': 8.211635318028303, 'text': 'Bacteria are often used to make cheese from milk. But making foods is not the only beneficial role of bacteria. For example, they also play an essential role in your gut!.'}, {'cid': 'train-6684', 'score': 8.168255107424818, 'text': 'Osteoporosis is a disease in which bones lose mass and become more fragile than they should be. Osteoporosis also makes bones more likely to break. Two of the easiest ways to prevent osteoporosis are eating a healthy diet that has the right amount of calcium and vitamin D and to do some sort of weight-bearing exercise every day. Foods that are a good source of calcium include milk, yogurt, and cheese. Non-dairy sources of calcium include Chinese cabbage, kale, and broccoli. Many fruit juices, fruit drinks, tofu, and cereals have calcium added to them. It is recommended that teenagers get 1300 mg of calcium every day. For example, one cup (8 fl. oz. ) of milk provides about 300 mg of calcium, or about 30% of the daily requirement. Other sources of calcium are pictured in the Figure below .'}, {'cid': 'train-7890', 'score': 7.930578384187305, 'text': 'Animals and some bacteria and fungi carry out lactic acid fermentation. Lactic acid is a waste product of this process. Our muscles perform lactic acid fermentation during strenuous exercise, since oxygen cannot be delivered to the muscles quickly enough. The buildup of lactic acid is believed to make your muscles sore after exercise. Bacteria that produce lactic acid are used to make cheese and yogurt. The lactic acid causes the proteins in milk to thicken. Lactic acid also causes tooth decay, because bacteria use the sugars in your mouth for energy.'}, {'cid': 'train-6916', 'score': 7.833677059320589, 'text': 'Yogurt is a good source of calcium. Yogurt also contains active cultures of "good" bacteria. Foods that contain these beneficial bacteria are sometimes called "probiotic. ".'}, {'cid': 'train-10029', 'score': 7.725028405457634, 'text': 'Humans have collected and grown mushrooms for food for thousands of years. Figure below shows some of the many types of mushrooms that people eat. Yeasts are used in bread baking and brewing alcoholic beverages. Other fungi are used in fermenting a wide variety of foods, including soy sauce, tempeh, and cheeses. Blue cheese has its distinctive appearance and flavor because of the fungus growing though it (see Figure below ).'}, {'cid': 'train-10983', 'score': 7.334055808872751, 'text': "No doubt you've had a sore throat before, and you've probably eaten cheese or yogurt. If so, then you've already encountered the amazing world of prokaryotes. Prokaryotes are single-celled organisms that lack a nucleus. They also lack other membrane-bound organelles. Prokaryotes are tiny. They can only be viewed with a microscope (see Figure below ). But they are the most numerous organisms on Earth. Without them, the world would be a very different place."}]) import requests import json headers = {"Content-Type": "application/json"} data = {"data": ["What type of organism is commonly used in preparation of foods such as cheese and yogurt?"]} response = requests.post(f"{demo.local_api_url.strip('/')}/call/predict", headers=headers, data=json.dumps(data)) event_id = response.json()["event_id"] response = requests.get(f"{demo.local_api_url.strip('/')}/call/predict/{event_id}", stream=True) lines = list(response.iter_lines()) print(eval(json.loads(lines[1].decode("UTF-8").replace("data:", ""))[0])) ## RESULT_CHECKING_POINT import requests import json headers = {"Content-Type": "application/json"} data = {"data": ["What are the differences between immunodeficiency and autoimmune diseases?"]} response = requests.post(f"{demo.local_api_url.strip('/')}/call/predict", headers=headers, data=json.dumps(data)) event_id = response.json()["event_id"] response = requests.get(f"{demo.local_api_url.strip('/')}/call/predict/{event_id}", stream=True) lines = list(response.iter_lines()) print(eval(json.loads(lines[1].decode("UTF-8").replace("data:", ""))[0])) """## TASK3.2: upload it to Huggingface Space (2 point) Upload your gradio app to Huggingface Space. Put your URL to the Space app in the variable `hf_space_url`. IMPORTANT!!! You can get this URL from: *Your Space page* -> *"three dots" on the top right* -> "embedd this space" -> "Direct URL" An example URL (not for our task) is: https://stabilityai-stable-diffusion-3-5-large.hf.space (from https://huggingface.co/spaces/stabilityai/stable-diffusion-3.5-large) """ hf_space_url: Optional[str] = None # Store your created Huggingface Space URL in this variable ## YOUR_CODE_STARTS_HERE hf_space_url = "https://huggingface.co/spaces/hanifi/NLP4WEB-HW1" ## YOUR_CODE_ENDS_HERE ## RESULT_CHECKING_POINT import requests import json print(hf_space_url) headers = {"Content-Type": "application/json"} data = {"data": ["What are the differences between immunodeficiency and autoimmune diseases?"]} response = requests.post(f"{hf_space_url.strip('/')}/call/predict", headers=headers, data=json.dumps(data)) event_id = response.json()["event_id"] response = requests.get(f"{hf_space_url.strip('/')}/call/predict/{event_id}", stream=True) lines = list(response.iter_lines()) print(eval(json.loads(lines[1].decode("UTF-8").replace("data:", ""))[0])) ## TEST_CASES (result should be [{'cid': 'train-5587', 'score': 26.74537329473182, 'text': 'The entropy change is positive as the solid state changes into the liquid state. If the transition went from the liquid to the solid state, the numerical value for would be the same, but the sign would be reversed since we are going from a less ordered to a more ordered situation.'}, {'cid': 'train-2', 'score': 25.93532475963942, 'text': 'Summary Changes of state are examples of phase changes, or phase transitions. All phase changes are accompanied by changes in the energy of a system. Changes from a more-ordered state to a less-ordered state (such as a liquid to a gas) areendothermic. Changes from a less-ordered state to a more-ordered state (such as a liquid to a solid) are always exothermic. The conversion of a solid to a liquid is called fusion (or melting). The energy required to melt 1 mol of a substance is its enthalpy of fusion (ΔHfus). The energy change required to vaporize 1 mol of a substance is the enthalpy of vaporization (ΔHvap). The direct conversion of a solid to a gas is sublimation. The amount of energy needed to sublime 1 mol of a substance is its enthalpy of sublimation (ΔHsub) and is the sum of the enthalpies of fusion and vaporization. Plots of the temperature of a substance versus heat added or versus heating time at a constant rate of heating are calledheating curves. Heating curves relate temperature changes to phase transitions. A superheated liquid, a liquid at a temperature and pressure at which it should be a gas, is not stable. A cooling curve is not exactly the reverse of the heating curve because many liquids do not freeze at the expected temperature. Instead, they form a supercooled liquid, a metastable liquid phase that exists below the normal melting point. Supercooled liquids usually crystallize on standing, or adding a seed crystal of the same or another substance can induce crystallization.'}, {'cid': 'train-1658', 'score': 19.0263955721366, 'text': 'There are many examples in the chemical world of changes in entropy. Phase transitions are one obvious example. When a substance makes a transition from the liquid state to the gaseous state, the particles have many more possible arrangements, because they are no longer confined to a specified volume in which they are close to each other; gas particles can move freely throughout their container. Vaporization represents an increase in entropy. In the opposite direction, a liquid loses entropy when it freezes to a solid. Because solids have very ordered structures, there are fewer possible arrangements of particles that would result in the properties associated with a solid.'}, {'cid': 'train-5603', 'score': 16.14918704233498, 'text': 'Chemical energy, the energy stored in molecules and atoms, is one type of potential energy. Certain reactions can cause this energy to be released as heat. Other reactions require an input of energy, in which case the products will store more potential energy than the reactants. When we studied phase changes, we saw a relationship between energy and the state of matter. To melt a solid or boil a liquid, energy needs to be added in order to break up the intermolecular forces holding particles together in more ordered states. The reverse processes, condensation and freezing, release energy, because more favorable intermolecular interactions are formed.'}, {'cid': 'train-8144', 'score': 13.369317026860408, 'text': 'Solid carbon dioxide is also called dry ice. That’s because when it gets warmer and changes state, it doesn’t change to a liquid by melting. Instead, it changes directly to a gas without going through the liquid state. The process in which a solid changes directly to a gas is called sublimation . It occurs when energy is added to a solid such as dry ice. You can watch dry ice changing directly to a gas in the video at this URL: http://www. youtube. com/watch?v=J8mDGwf-5x0 .'}, {'cid': 'train-844', 'score': 12.931270408607555, 'text': 'The water droplets of fog form from water vapor in the air. Fog disappears when the water droplets change back to water vapor. These changes are examples of changes of state. A change of state occurs whenever matter changes from one state to another. Common states of matter on Earth are solid, liquid, and gas. Matter may change back and forth between any two of these states.'}, {'cid': 'train-9811', 'score': 12.904636038613848, 'text': 'Start right above point on the temperature axis and follow the red line vertically. At very low pressure, the particles of the substance are far apart from one another and the substance is in the gas state. As the pressure is increased, the particles of the substance are forced closer and closer together. Eventually the particles are pushed so close together that attractive forces cause the substance to condense into the liquid state. Continually increasing the pressure on the liquid will eventually cause the substance to solidify. For the majority of substances, the solid state is denser than the liquid state and so putting a liquid under great pressure will cause it to turn into a solid. The line segment represents the process of sublimation, where the substance changes directly from a solid to a gas. At a sufficiently low pressure, the liquid phase does not exist. The point labeled is called the triple point . The triple point is the one condition of temperature and pressure where the solid, liquid, and vapor states of a substance can all coexist at equilibrium.'}, {'cid': 'train-8260', 'score': 12.876342252900347, 'text': 'Unlike a crystalline solid, an amorphous solid is a solid that lacks an ordered internal structure. Some examples of amorphous solids include rubber, plastic, and gels. Glass is a very important amorphous solid that is made by cooling a mixture of materials in such a way that it does not crystallize. Glass is sometimes referred to as a supercooled liquid rather than a solid. If you have ever watched a glassblower in action, you have noticed that he takes advantage of the fact that amorphous solids do not have a distinct melting point like crystalline solids do. Instead, as glass is heated, it slowly softens and can be shaped into all sorts of interesting forms. When a glass object shatters, it does so in a very irregular way, unlike crystalline solids, which always break into fragments that have the same shape as dictated by its crystal system.'}, {'cid': 'train-317', 'score': 12.82403749702155, 'text': 'An amorphous solid is a solid that lacks an ordered internal structure.'}, {'cid': 'train-6203', 'score': 12.76684203292532, 'text': 'Matter can exist in one of several different states, including a gas, liquid, or solid state. States of matter differ in the amount of energy their molecules have. When matter recycles, it changes state by gaining or losing energy.'}] import requests import json headers = {"Content-Type": "application/json"} data = {"data": ["Changes from a less-ordered state to a more-ordered state (such as a liquid to a solid) are always what?"]} response = requests.post(f"{hf_space_url.strip('/')}/call/predict", headers=headers, data=json.dumps(data)) event_id = response.json()["event_id"] response = requests.get(f"{hf_space_url.strip('/')}/call/predict/{event_id}", stream=True) lines = list(response.iter_lines()) print(eval(json.loads(lines[1].decode("UTF-8").replace("data:", ""))[0]))