from concurrent.futures import ThreadPoolExecutor, as_completed

from clearml import PipelineDecorator
from loguru import logger
from qdrant_client.http import exceptions
from typing_extensions import Annotated

from llm_engineering.domain.base.nosql import NoSQLBaseDocument
from llm_engineering.domain.cleaned_documents import (
CleanedArticleDocument,
CleanedDocument,
CleanedPostDocument,
CleanedRepositoryDocument,
)


@PipelineDecorator.component(name="query_feature_store")
def query_feature_store() -> Annotated[list, "queried_cleaned_documents"]:
    """Pull every cleaned document from the feature store and flatten the per-collection results into one list."""

    logger.info("Querying feature store.")

    results = fetch_all_data()

    cleaned_documents = [doc for query_result in results.values() for doc in query_result]

    return cleaned_documents


def fetch_all_data() -> dict[str, list[NoSQLBaseDocument]]:
    """Fetch articles, posts, and repositories concurrently, one worker thread per collection."""

    with ThreadPoolExecutor() as executor:
        future_to_query = {
            executor.submit(__fetch_articles): "articles",
            executor.submit(__fetch_posts): "posts",
            executor.submit(__fetch_repositories): "repositories",
        }

        results = {}
        for future in as_completed(future_to_query):
            query_name = future_to_query[future]
            try:
                results[query_name] = future.result()
            except Exception:
                # One failed collection should not fail the whole step: log it and return an empty list for it.
                logger.exception(f"'{query_name}' request failed.")
                results[query_name] = []

    return results


def __fetch_articles() -> list[CleanedDocument]:
    return __fetch(CleanedArticleDocument)


def __fetch_posts() -> list[CleanedDocument]:
    return __fetch(CleanedPostDocument)


def __fetch_repositories() -> list[CleanedDocument]:
    return __fetch(CleanedRepositoryDocument)


def __fetch(cleaned_document_type: type[CleanedDocument], limit: int = 1) -> list[CleanedDocument]:
    """Page through the collection backing `cleaned_document_type`, `limit` documents per request."""

    try:
        cleaned_documents, next_offset = cleaned_document_type.bulk_find(limit=limit)
    except exceptions.UnexpectedResponse:
        # The collection does not exist yet; treat it as empty instead of raising.
        return []

    # Keep paginating until the store stops returning a next offset.
    while next_offset:
        documents, next_offset = cleaned_document_type.bulk_find(limit=limit, offset=next_offset)
        cleaned_documents.extend(documents)

    return cleaned_documents
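

# --- Illustrative local-run sketch (an assumption, not part of the original file) ---
# Shows one way to exercise the component with ClearML's decorator API; the
# pipeline name, project, and version below are hypothetical placeholders.
if __name__ == "__main__":
    # Execute the pipeline logic locally instead of enqueuing it on an agent.
    PipelineDecorator.run_locally()

    @PipelineDecorator.pipeline(name="query_feature_store_debug", project="llm_engineering", version="0.0.1")
    def _debug_pipeline() -> None:
        # The component call returns the queried cleaned documents.
        query_feature_store()

    _debug_pipeline()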