ganesh3 committed on
Commit
42ef936
·
verified ·
1 Parent(s): 4565c57

Update app/data_processor.py

Browse files
Files changed (1) hide show
  1. app/data_processor.py +15 -18
app/data_processor.py CHANGED
@@ -1,18 +1,14 @@
1
  from minsearch import Index
2
  from sentence_transformers import SentenceTransformer
3
  import numpy as np
4
- from elasticsearch import Elasticsearch
5
  import os
6
  import json
7
  import logging
8
  import re
 
 
9
 
10
- # Configure logging for stdout only
11
- logging.basicConfig(
12
- level=logging.INFO,
13
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
14
- stream=sys.stdout
15
- )
16
  logger = logging.getLogger(__name__)
17
 
18
  def clean_text(text):
@@ -21,30 +17,31 @@ def clean_text(text):
21
  return ""
22
  cleaned = re.sub(r'[^\w\s.,!?]', ' ', text)
23
  cleaned = re.sub(r'\s+', ' ', cleaned).strip()
24
- logger.debug(f"Original text length: {len(text)}, Cleaned text length: {len(cleaned)}")
25
- logger.debug(f"Cleaned text sample: '{cleaned[:100]}...'")
26
  return cleaned
27
 
28
  class DataProcessor:
29
  def __init__(self, text_fields=["content", "title", "description"],
30
- keyword_fields=["video_id", "author", "upload_date"],
31
- embedding_model="multi-qa-MiniLM-L6-cos-v1"):
32
  self.text_fields = text_fields
33
  self.keyword_fields = keyword_fields
34
  self.all_fields = text_fields + keyword_fields
35
  self.text_index = Index(text_fields=text_fields, keyword_fields=keyword_fields)
36
- self.embedding_model = SentenceTransformer(embedding_model)
 
 
 
 
37
  self.documents = []
38
  self.embeddings = []
39
  self.index_built = False
40
  self.current_index_name = None
41
-
42
- elasticsearch_host = os.getenv('ELASTICSEARCH_HOST', 'localhost')
43
- elasticsearch_port = int(os.getenv('ELASTICSEARCH_PORT', 9200))
44
-
45
- self.es = Elasticsearch([f'http://{elasticsearch_host}:{elasticsearch_port}'])
46
- logger.info(f"DataProcessor initialized with Elasticsearch at {elasticsearch_host}:{elasticsearch_port}")
47
 
 
 
 
 
 
48
  def process_transcript(self, video_id, transcript_data):
49
  logger.info(f"Processing transcript for video {video_id}")
50
 
 
1
  from minsearch import Index
2
  from sentence_transformers import SentenceTransformer
3
  import numpy as np
 
4
  import os
5
  import json
6
  import logging
7
  import re
8
+ from config import Config
9
+ from vector_store import get_vector_store
10
 
11
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', stream=sys.stdout)
 
 
 
 
 
12
  logger = logging.getLogger(__name__)
13
 
14
  def clean_text(text):
 
17
  return ""
18
  cleaned = re.sub(r'[^\w\s.,!?]', ' ', text)
19
  cleaned = re.sub(r'\s+', ' ', cleaned).strip()
 
 
20
  return cleaned
21
 
22
  class DataProcessor:
23
  def __init__(self, text_fields=["content", "title", "description"],
24
+ keyword_fields=["video_id", "author", "upload_date"],
25
+ embedding_model=None):
26
  self.text_fields = text_fields
27
  self.keyword_fields = keyword_fields
28
  self.all_fields = text_fields + keyword_fields
29
  self.text_index = Index(text_fields=text_fields, keyword_fields=keyword_fields)
30
+
31
+ # Use appropriate model path based on environment
32
+ model_path = Config.get_model_path() if embedding_model is None else embedding_model
33
+ self.embedding_model = SentenceTransformer(model_path)
34
+
35
  self.documents = []
36
  self.embeddings = []
37
  self.index_built = False
38
  self.current_index_name = None
 
 
 
 
 
 
39
 
40
+ # Initialize vector store
41
+ VectorStore = get_vector_store(Config)
42
+ self.vector_store = VectorStore(self.embedding_model.get_sentence_embedding_dimension())
43
+ logger.info("Initialized FAISS vector store")
44
+
45
  def process_transcript(self, video_id, transcript_data):
46
  logger.info(f"Processing transcript for video {video_id}")
47