Daniel Foley committed
Commit 0716283 · 1 Parent(s): 706b16d

1.5 workers for parallel processing, to avoid rate limits

Files changed (1)
  1. RAG.py +21 -4
RAG.py CHANGED
@@ -15,6 +15,7 @@ from typing import Dict, Any, Optional, List, Tuple
 import logging
 import concurrent.futures
 import json
+from threading import Lock
 
 def retrieve(query: str,vectorstore:PineconeVectorStore, k: int = 100) -> Tuple[List[Document], List[float]]:
     start = time.time()
@@ -85,8 +86,20 @@ def process_single_document(doc: Document) -> Optional[Document]:
         )
         return None
 
-def rerank(documents: List[Document], query: str, max_workers: int = 3) -> List[Document]:
-    """Ingest more metadata and rerank documents using BM25 with parallel processing."""
+# Global state to track alternating behavior
+_use_two_workers = False
+_worker_lock = Lock()
+
+def get_current_worker_count() -> int:
+    """Thread-safe way to get and toggle the worker count between 1 and 2."""
+    global _use_two_workers
+    with _worker_lock:
+        current_workers = 2 if _use_two_workers else 1
+        _use_two_workers = not _use_two_workers  # Toggle for next time
+        return current_workers
+
+def rerank(documents: List[Document], query: str) -> List[Document]:
+    """Ingest more metadata and rerank documents using BM25 with alternating worker counts."""
     start = time.time()
     if not documents:
         return []
@@ -94,8 +107,12 @@ def rerank(documents: List[Document], query: str, max_workers: int = 3) -> List[
     meta_start = time.time()
     full_docs = []
 
+    # Get the worker count for this specific call
+    worker_count = get_current_worker_count()
+    logging.info(f"Processing with {worker_count} worker{'s' if worker_count > 1 else ''}")
+
     # Process documents in parallel using ThreadPoolExecutor
-    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+    with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
         # Submit all document processing tasks
         future_to_doc = {
             executor.submit(process_single_document, doc): doc
@@ -205,7 +222,7 @@ def RAG(llm: Any, query: str,vectorstore:PineconeVectorStore, top: int = 10, k:
     First, reason about the answer between <REASONING></REASONING> headers,
     based on the context determine if there is sufficient material for answering the exact question,
     return either <VALID>YES</VALID> or <VALID>NO</VALID>
-    then return a response between <RESPONSE></RESPONSE> headers, your response should be well formatted and an individual summary of each piece of relevant context:
+    then return a response between <RESPONSE></RESPONSE> headers:
     Here is an example
     <EXAMPLE>
     <QUERY>Are pineapples a good fuel for cars?</QUERY>
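
The net effect of the change is that successive rerank() calls alternate between 1 and 2 threads, which averages out to the "1.5 workers" mentioned in the commit message. The sketch below is a minimal, self-contained illustration of the toggling helper from this commit; the demo block at the bottom is hypothetical and not part of RAG.py.

# Minimal sketch of the alternating-worker idea; the __main__ demo is illustrative only.
from threading import Lock

_use_two_workers = False
_worker_lock = Lock()

def get_current_worker_count() -> int:
    """Thread-safe way to get and toggle the worker count between 1 and 2."""
    global _use_two_workers
    with _worker_lock:
        current_workers = 2 if _use_two_workers else 1
        _use_two_workers = not _use_two_workers  # Toggle for next time
        return current_workers

if __name__ == "__main__":
    # Six consecutive calls return 1, 2, 1, 2, 1, 2 -> 1.5 workers on average,
    # so roughly a third fewer concurrent requests than a fixed pool of 2.
    counts = [get_current_worker_count() for _ in range(6)]
    print(counts)                      # [1, 2, 1, 2, 1, 2]
    print(sum(counts) / len(counts))   # 1.5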