Update app/data_processor.py

app/data_processor.py  CHANGED  (+15 -18)
@@ -1,18 +1,14 @@
 from minsearch import Index
 from sentence_transformers import SentenceTransformer
 import numpy as np
-from elasticsearch import Elasticsearch
 import os
 import json
 import logging
 import re
+from config import Config
+from vector_store import get_vector_store
 
-
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    stream=sys.stdout
-)
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', stream=sys.stdout)
 logger = logging.getLogger(__name__)
 
 def clean_text(text):
@@ -21,30 +17,31 @@ def clean_text(text):
         return ""
     cleaned = re.sub(r'[^\w\s.,!?]', ' ', text)
     cleaned = re.sub(r'\s+', ' ', cleaned).strip()
-    logger.debug(f"Original text length: {len(text)}, Cleaned text length: {len(cleaned)}")
-    logger.debug(f"Cleaned text sample: '{cleaned[:100]}...'")
     return cleaned
 
 class DataProcessor:
     def __init__(self, text_fields=["content", "title", "description"],
-                 keyword_fields=["video_id", "author", "upload_date"],
-                 embedding_model=
+                 keyword_fields=["video_id", "author", "upload_date"],
+                 embedding_model=None):
         self.text_fields = text_fields
         self.keyword_fields = keyword_fields
         self.all_fields = text_fields + keyword_fields
         self.text_index = Index(text_fields=text_fields, keyword_fields=keyword_fields)
-
+
+        # Use appropriate model path based on environment
+        model_path = Config.get_model_path() if embedding_model is None else embedding_model
+        self.embedding_model = SentenceTransformer(model_path)
+
         self.documents = []
         self.embeddings = []
         self.index_built = False
         self.current_index_name = None
-
-        elasticsearch_host = os.getenv('ELASTICSEARCH_HOST', 'localhost')
-        elasticsearch_port = int(os.getenv('ELASTICSEARCH_PORT', 9200))
-
-        self.es = Elasticsearch([f'http://{elasticsearch_host}:{elasticsearch_port}'])
-        logger.info(f"DataProcessor initialized with Elasticsearch at {elasticsearch_host}:{elasticsearch_port}")
 
+        # Initialize vector store
+        VectorStore = get_vector_store(Config)
+        self.vector_store = VectorStore(self.embedding_model.get_sentence_embedding_dimension())
+        logger.info("Initialized FAISS vector store")
+
     def process_transcript(self, video_id, transcript_data):
         logger.info(f"Processing transcript for video {video_id}")
 
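The commit drops the Elasticsearch client from DataProcessor and replaces it with a config-driven embedding model plus a FAISS-backed vector store. Neither config.py nor vector_store.py is included in this diff, so the following is only a minimal sketch of what Config.get_model_path() might look like, assuming it switches between a model directory baked into the image and a Hub model name; every setting name below is an assumption, not the repo's actual code.

# config.py -- hypothetical sketch; the real module is not part of this commit.
import os

class Config:
    # Assumed settings; the actual model name and cache dir are not visible here.
    EMBEDDING_MODEL_NAME = os.getenv('EMBEDDING_MODEL_NAME', 'all-MiniLM-L6-v2')
    LOCAL_MODEL_DIR = os.getenv('LOCAL_MODEL_DIR', '/models')

    @classmethod
    def get_model_path(cls):
        # Prefer a pre-downloaded local copy (offline-friendly for a Space);
        # otherwise return the Hub name so SentenceTransformer downloads it.
        local_path = os.path.join(cls.LOCAL_MODEL_DIR, cls.EMBEDDING_MODEL_NAME)
        return local_path if os.path.isdir(local_path) else cls.EMBEDDING_MODEL_NAME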
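The new __init__ then calls get_vector_store(Config) and instantiates the returned class with the model's embedding dimension, and the log message says the store is FAISS-backed. Below is a sketch consistent with that call shape, assuming a flat inner-product index over L2-normalized vectors (i.e. cosine similarity); apart from get_vector_store, the names are invented for illustration.

# vector_store.py -- hypothetical sketch matching how the diff uses it.
import faiss
import numpy as np

class FaissVectorStore:
    def __init__(self, dim):
        # Inner product over L2-normalized float32 vectors == cosine similarity.
        self.index = faiss.IndexFlatIP(dim)
        self.documents = []

    def add(self, embeddings, documents):
        vectors = np.asarray(embeddings, dtype='float32')
        faiss.normalize_L2(vectors)  # in-place normalization
        self.index.add(vectors)
        self.documents.extend(documents)

    def search(self, query_embedding, k=5):
        query = np.asarray([query_embedding], dtype='float32')
        faiss.normalize_L2(query)
        scores, ids = self.index.search(query, k)
        return [(self.documents[i], float(s))
                for i, s in zip(ids[0], scores[0]) if i != -1]

def get_vector_store(config):
    # The diff passes Config in, presumably so a factory can select a backend;
    # this sketch always returns the FAISS implementation.
    return FaissVectorStore

Sizing the index from self.embedding_model.get_sentence_embedding_dimension() keeps the store in sync with whichever model Config resolves, which is why the dimension is passed in rather than hard-coded.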
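One pre-existing issue the commit carries forward: both the old and the new logging.basicConfig calls pass stream=sys.stdout, yet no import sys appears in the visible import block. Unless sys is imported somewhere below, the module raises a NameError at import time; the fix is one line at the top:

import sys  # required by stream=sys.stdout in logging.basicConfig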