Daniel Foley committed
Commit 0716283 · 1 Parent(s): 706b16d

1.5 workers for parallel processing, to avoid rate limits

Files changed (1)
  1. RAG.py +21 -4
RAG.py CHANGED
@@ -15,6 +15,7 @@ from typing import Dict, Any, Optional, List, Tuple
 import logging
 import concurrent.futures
 import json
+from threading import Lock
 
 def retrieve(query: str,vectorstore:PineconeVectorStore, k: int = 100) -> Tuple[List[Document], List[float]]:
     start = time.time()
@@ -85,8 +86,20 @@ def process_single_document(doc: Document) -> Optional[Document]:
         )
         return None
 
-def rerank(documents: List[Document], query: str, max_workers: int = 3) -> List[Document]:
-    """Ingest more metadata and rerank documents using BM25 with parallel processing."""
+# Global state to track alternating behavior
+_use_two_workers = False
+_worker_lock = Lock()
+
+def get_current_worker_count() -> int:
+    """Thread-safe way to get and toggle the worker count between 1 and 2."""
+    global _use_two_workers
+    with _worker_lock:
+        current_workers = 2 if _use_two_workers else 1
+        _use_two_workers = not _use_two_workers  # Toggle for next time
+        return current_workers
+
+def rerank(documents: List[Document], query: str) -> List[Document]:
+    """Ingest more metadata and rerank documents using BM25 with alternating worker counts."""
     start = time.time()
     if not documents:
         return []
@@ -94,8 +107,12 @@ def rerank(documents: List[Document], query: str, max_workers: int = 3) -> List[
     meta_start = time.time()
     full_docs = []
 
+    # Get the worker count for this specific call
+    worker_count = get_current_worker_count()
+    logging.info(f"Processing with {worker_count} worker{'s' if worker_count > 1 else ''}")
+
     # Process documents in parallel using ThreadPoolExecutor
-    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+    with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
         # Submit all document processing tasks
         future_to_doc = {
             executor.submit(process_single_document, doc): doc
@@ -205,7 +222,7 @@ def RAG(llm: Any, query: str,vectorstore:PineconeVectorStore, top: int = 10, k:
     First, reason about the answer between <REASONING></REASONING> headers,
     based on the context determine if there is sufficient material for answering the exact question,
     return either <VALID>YES</VALID> or <VALID>NO</VALID>
-    then return a response between <RESPONSE></RESPONSE> headers, your response should be well formatted and an individual summary of each piece of relevant context:
+    then return a response between <RESPONSE></RESPONSE> headers:
     Here is an example
     <EXAMPLE>
     <QUERY>Are pineapples a good fuel for cars?</QUERY>
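
The net effect of the change is that successive rerank() calls alternate between 1 and 2 threads, which averages out to the "1.5 workers" mentioned in the commit message. The sketch below is a minimal, self-contained illustration of the toggling helper from this commit; the demo block at the bottom is hypothetical and not part of RAG.py.

# Minimal sketch of the alternating-worker idea; the __main__ demo is illustrative only.
from threading import Lock

_use_two_workers = False
_worker_lock = Lock()

def get_current_worker_count() -> int:
    """Thread-safe way to get and toggle the worker count between 1 and 2."""
    global _use_two_workers
    with _worker_lock:
        current_workers = 2 if _use_two_workers else 1
        _use_two_workers = not _use_two_workers  # Toggle for next time
        return current_workers

if __name__ == "__main__":
    # Six consecutive calls return 1, 2, 1, 2, 1, 2 -> 1.5 workers on average,
    # so roughly a third fewer concurrent requests than a fixed pool of 2.
    counts = [get_current_worker_count() for _ in range(6)]
    print(counts)                      # [1, 2, 1, 2, 1, 2]
    print(sum(counts) / len(counts))   # 1.5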