Spaces:

ariansyahdedy
/

chat

Build error

File size: 5,446 Bytes

import asyncio
from rank_bm25 import BM25Okapi
# import nltk
import string
from typing import List, Set, Optional
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
import os

# Commented out this function that downloads NLTK resources.
# def download_nltk_resources():
#     """
#     Downloads required NLTK resources synchronously.
#     """
#     resources = ['punkt', 'stopwords', 'wordnet', 'omw-1.4']
#     nltk_data_path = "/tmp/nltk_data"
#     os.makedirs(nltk_data_path, exist_ok=True)
#     nltk.data.path.append(nltk_data_path)
#     for resource in resources:
#         try:
#             nltk.download(resource, download_dir=nltk_data_path, quiet=True)
#         except Exception as e:
#             print(f"Error downloading {resource}: {str(e)}")

class BM25_search:
    nltk_resources_downloaded = False

    def __init__(self, remove_stopwords: bool = True, perform_lemmatization: bool = False):
        """
        Initializes the BM25search.
        """
        # Commented out NLTK resource initialization
        # if not BM25_search.nltk_resources_downloaded:
        #     download_nltk_resources()
        #     BM25_search.nltk_resources_downloaded = True

        self.documents: List[str] = []
        self.doc_ids: List[str] = []
        self.tokenized_docs: List[List[str]] = []
        self.bm25: Optional[BM25Okapi] = None
        self.remove_stopwords = remove_stopwords
        self.perform_lemmatization = perform_lemmatization
        # Commented out NLTK-specific tools
        # self.stop_words: Set[str] = set(stopwords.words('english')) if remove_stopwords else set()
        # self.lemmatizer = WordNetLemmatizer() if perform_lemmatization else None

    def preprocess(self, text: str) -> List[str]:
        """
        Preprocesses the input text by lowercasing and removing punctuation.
        NLTK-related tokenization, stopword removal, and lemmatization are commented out.
        """
        text = text.lower().translate(str.maketrans('', '', string.punctuation))
        # tokens = nltk.word_tokenize(text)  # Commented out NLTK tokenization
        tokens = text.split()  # Basic tokenization as a fallback
        # if self.remove_stopwords:
        #     tokens = [token for token in tokens if token not in self.stop_words]
        # if self.perform_lemmatization and self.lemmatizer:
        #     tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
        return tokens

    def add_document(self, doc_id: str, new_doc: str) -> None:
        """
        Adds a new document to the corpus and updates the BM25 index.
        """
        processed_tokens = self.preprocess(new_doc)
        self.documents.append(new_doc)
        self.doc_ids.append(doc_id)
        self.tokenized_docs.append(processed_tokens)
        self.update_bm25()
        print(f"Added document ID: {doc_id}")

    async def remove_document(self, index: int) -> None:
        """
        Removes a document from the corpus based on its index and updates the BM25 index.
        """
        if 0 <= index < len(self.documents):
            removed_doc_id = self.doc_ids[index]
            del self.documents[index]
            del self.doc_ids[index]
            del self.tokenized_docs[index]
            self.update_bm25()
            print(f"Removed document ID: {removed_doc_id}")
        else:
            print(f"Index {index} is out of bounds.")

    def update_bm25(self) -> None:
        """
        Updates the BM25 index based on the current tokenized documents.
        """
        if self.tokenized_docs:
            self.bm25 = BM25Okapi(self.tokenized_docs)
            print("BM25 index has been initialized.")
        else:
            print("No documents to initialize BM25.")

    def get_scores(self, query: str) -> List[float]:
        """
        Computes BM25 scores for all documents based on the given query.
        """
        processed_query = self.preprocess(query)
        print(f"Tokenized Query: {processed_query}")
        if self.bm25:
            return self.bm25.get_scores(processed_query)
        else:
            print("BM25 is not initialized.")
            return []

    def get_top_n_docs(self, query: str, n: int = 5) -> List[str]:
        """
        Returns the top N documents for a given query.
        """
        processed_query = self.preprocess(query)
        if self.bm25:
            return self.bm25.get_top_n(processed_query, self.documents, n)
        else:
            print("BM25 is not initialized.")
            return []

    def clear_documents(self) -> None:
        """
        Clears all documents from the BM25 index.
        """
        self.documents = []
        self.doc_ids = []
        self.tokenized_docs = []
        self.bm25 = None
        print("BM25 documents cleared and index reset.")

    def get_document(self, doc_id: str) -> str:
        """
        Retrieves a document by its document ID.
        """
        try:
            index = self.doc_ids.index(doc_id)
            return self.documents[index]
        except ValueError:
            print(f"Document ID {doc_id} not found.")
            return ""

async def initialize_bm25_search(remove_stopwords: bool = True, perform_lemmatization: bool = False) -> BM25_search:
    """
    Initializes the BM25search.
    """
    # Removed NLTK resource download from async context
    return BM25_search(remove_stopwords, perform_lemmatization)