Update app/data_processor.py

app/data_processor.py  CHANGED  (+15 -18)
@@ -1,18 +1,14 @@
 from minsearch import Index
 from sentence_transformers import SentenceTransformer
 import numpy as np
-from elasticsearch import Elasticsearch
 import os
 import json
 import logging
 import re
+from config import Config
+from vector_store import get_vector_store
 
-
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    stream=sys.stdout
-)
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', stream=sys.stdout)
 logger = logging.getLogger(__name__)
 
 def clean_text(text):
@@ -21,30 +17,31 @@ def clean_text(text):
         return ""
     cleaned = re.sub(r'[^\w\s.,!?]', ' ', text)
     cleaned = re.sub(r'\s+', ' ', cleaned).strip()
-    logger.debug(f"Original text length: {len(text)}, Cleaned text length: {len(cleaned)}")
-    logger.debug(f"Cleaned text sample: '{cleaned[:100]}...'")
     return cleaned
 
 class DataProcessor:
     def __init__(self, text_fields=["content", "title", "description"],
-                 keyword_fields=["video_id", "author", "upload_date"],
-                 embedding_model=
+                 keyword_fields=["video_id", "author", "upload_date"],
+                 embedding_model=None):
         self.text_fields = text_fields
         self.keyword_fields = keyword_fields
         self.all_fields = text_fields + keyword_fields
         self.text_index = Index(text_fields=text_fields, keyword_fields=keyword_fields)
-
+
+        # Use appropriate model path based on environment
+        model_path = Config.get_model_path() if embedding_model is None else embedding_model
+        self.embedding_model = SentenceTransformer(model_path)
+
         self.documents = []
         self.embeddings = []
         self.index_built = False
         self.current_index_name = None
-
-        elasticsearch_host = os.getenv('ELASTICSEARCH_HOST', 'localhost')
-        elasticsearch_port = int(os.getenv('ELASTICSEARCH_PORT', 9200))
-
-        self.es = Elasticsearch([f'http://{elasticsearch_host}:{elasticsearch_port}'])
-        logger.info(f"DataProcessor initialized with Elasticsearch at {elasticsearch_host}:{elasticsearch_port}")
 
+        # Initialize vector store
+        VectorStore = get_vector_store(Config)
+        self.vector_store = VectorStore(self.embedding_model.get_sentence_embedding_dimension())
+        logger.info("Initialized FAISS vector store")
+
     def process_transcript(self, video_id, transcript_data):
         logger.info(f"Processing transcript for video {video_id}")
 
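The commit drops the Elasticsearch client from DataProcessor and replaces it with a config-driven embedding model plus a FAISS-backed vector store. Neither config.py nor vector_store.py is included in this diff, so the following is only a minimal sketch of what Config.get_model_path() might look like, assuming it switches between a model directory baked into the image and a Hub model name; every setting name below is an assumption, not the repo's actual code.

# config.py -- hypothetical sketch; the real module is not part of this commit.
import os

class Config:
    # Assumed settings; the actual model name and cache dir are not visible here.
    EMBEDDING_MODEL_NAME = os.getenv('EMBEDDING_MODEL_NAME', 'all-MiniLM-L6-v2')
    LOCAL_MODEL_DIR = os.getenv('LOCAL_MODEL_DIR', '/models')

    @classmethod
    def get_model_path(cls):
        # Prefer a pre-downloaded local copy (offline-friendly for a Space);
        # otherwise return the Hub name so SentenceTransformer downloads it.
        local_path = os.path.join(cls.LOCAL_MODEL_DIR, cls.EMBEDDING_MODEL_NAME)
        return local_path if os.path.isdir(local_path) else cls.EMBEDDING_MODEL_NAME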
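The new __init__ then calls get_vector_store(Config) and instantiates the returned class with the model's embedding dimension, and the log message says the store is FAISS-backed. Below is a sketch consistent with that call shape, assuming a flat inner-product index over L2-normalized vectors (i.e. cosine similarity); apart from get_vector_store, the names are invented for illustration.

# vector_store.py -- hypothetical sketch matching how the diff uses it.
import faiss
import numpy as np

class FaissVectorStore:
    def __init__(self, dim):
        # Inner product over L2-normalized float32 vectors == cosine similarity.
        self.index = faiss.IndexFlatIP(dim)
        self.documents = []

    def add(self, embeddings, documents):
        vectors = np.asarray(embeddings, dtype='float32')
        faiss.normalize_L2(vectors)  # in-place normalization
        self.index.add(vectors)
        self.documents.extend(documents)

    def search(self, query_embedding, k=5):
        query = np.asarray([query_embedding], dtype='float32')
        faiss.normalize_L2(query)
        scores, ids = self.index.search(query, k)
        return [(self.documents[i], float(s))
                for i, s in zip(ids[0], scores[0]) if i != -1]

def get_vector_store(config):
    # The diff passes Config in, presumably so a factory can select a backend;
    # this sketch always returns the FAISS implementation.
    return FaissVectorStore

Sizing the index from self.embedding_model.get_sentence_embedding_dimension() keeps the store in sync with whichever model Config resolves, which is why the dimension is passed in rather than hard-coded.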
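One pre-existing issue the commit carries forward: both the old and the new logging.basicConfig calls pass stream=sys.stdout, yet no import sys appears in the visible import block. Unless sys is imported somewhere below, the module raises a NameError at import time; the fix is one line at the top:

import sys  # required by stream=sys.stdout in logging.basicConfig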