# rag-youtube-assistant/app/data_processor.py
from minsearch import Index
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import re
from elasticsearch import Elasticsearch
import os
def clean_text(text):
    """Normalize transcript text: drop punctuation/special characters and
    collapse all runs of whitespace to single spaces."""
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return re.sub(r'\s+', ' ', without_punctuation).strip()
class DataProcessor:
    """Indexes YouTube transcript segments and serves text / embedding / hybrid search.

    Maintains three stores:
      - a minsearch text index over the configured text/keyword fields,
      - an in-memory embedding matrix (sentence-transformers) for local
        cosine-similarity search,
      - optionally, an Elasticsearch index mirroring the documents with a
        dense_vector field for server-side vector search.
    """

    def __init__(self, text_fields=None, keyword_fields=None,
                 embedding_model="all-MiniLM-L6-v2"):
        """Set up the text index, embedding model, and Elasticsearch client.

        Args:
            text_fields: fields treated as full-text by minsearch
                (default: content, title, description).
            keyword_fields: fields treated as exact-match keywords
                (default: video_id, start_time, author, upload_date).
            embedding_model: sentence-transformers model name.
        """
        # None sentinels instead of mutable list defaults (shared across calls).
        if text_fields is None:
            text_fields = ["content", "title", "description"]
        if keyword_fields is None:
            keyword_fields = ["video_id", "start_time", "author", "upload_date"]
        self.text_index = Index(text_fields=text_fields, keyword_fields=keyword_fields)
        self.embedding_model = SentenceTransformer(embedding_model)
        self.documents = []
        self.embeddings = []
        # Endpoint comes from the environment so the same code runs locally
        # and inside a container; scheme is explicit as the ES client requires.
        elasticsearch_host = os.getenv('ELASTICSEARCH_HOST', 'localhost')
        elasticsearch_port = int(os.getenv('ELASTICSEARCH_PORT', 9200))
        self.es = Elasticsearch([f'http://{elasticsearch_host}:{elasticsearch_port}'])

    def process_transcript(self, video_id, transcript_data):
        """Clean and store every segment of one video's transcript.

        Args:
            video_id: the YouTube video identifier.
            transcript_data: dict with 'metadata' (title, author, upload_date,
                view_count, like_count, comment_count, duration) and
                'transcript' (list of dicts with 'text', 'start', 'duration').
        """
        metadata = transcript_data['metadata']
        transcript = transcript_data['transcript']
        for i, segment in enumerate(transcript):
            cleaned_text = clean_text(segment['text'])
            doc = {
                "video_id": video_id,
                "content": cleaned_text,
                "start_time": segment['start'],
                "duration": segment['duration'],
                "segment_id": f"{video_id}_{i}",
                "title": metadata['title'],
                "author": metadata['author'],
                "upload_date": metadata['upload_date'],
                "view_count": metadata['view_count'],
                "like_count": metadata['like_count'],
                "comment_count": metadata['comment_count'],
                "video_duration": metadata['duration']
            }
            self.documents.append(doc)
            # Embed content together with the title so title terms also match.
            self.embeddings.append(
                self.embedding_model.encode(cleaned_text + " " + metadata['title'])
            )

    def build_index(self, index_name):
        """Fit the text index, freeze embeddings, and mirror docs into Elasticsearch.

        Creates the ES index (with a dense_vector mapping sized to the model's
        output) if it does not already exist, then indexes every document
        keyed by its segment_id.
        """
        self.text_index.fit(self.documents)
        self.embeddings = np.array(self.embeddings)
        if not self.es.indices.exists(index=index_name):
            self.es.indices.create(index=index_name, body={
                "mappings": {
                    "properties": {
                        "embedding": {"type": "dense_vector", "dims": self.embeddings.shape[1]},
                        "content": {"type": "text"},
                        "video_id": {"type": "keyword"},
                        "segment_id": {"type": "keyword"},
                        "start_time": {"type": "float"},
                        "duration": {"type": "float"},
                        "title": {"type": "text"},
                        "author": {"type": "keyword"},
                        "upload_date": {"type": "date"},
                        "view_count": {"type": "integer"},
                        "like_count": {"type": "integer"},
                        "comment_count": {"type": "integer"},
                        "video_duration": {"type": "text"}
                    }
                }
            })
        for doc, embedding in zip(self.documents, self.embeddings):
            # Index a copy: previously the embedding list was injected into
            # the in-memory docs, so text_search results carried large vectors.
            es_doc = {**doc, 'embedding': embedding.tolist()}
            self.es.index(index=index_name, body=es_doc, id=doc['segment_id'])

    def search(self, query, filter_dict=None, boost_dict=None, num_results=10,
               method='hybrid', index_name=None):
        """Dispatch to text, embedding, or hybrid (interleaved) search.

        Args:
            query: the search string.
            filter_dict / boost_dict: minsearch filter and boost maps
                (only used by the text path).
            num_results: max results to return.
            method: 'text', 'embedding', or anything else for hybrid.
            index_name: if given, embedding search runs against Elasticsearch.
        """
        # None sentinels instead of mutable dict defaults.
        filter_dict = {} if filter_dict is None else filter_dict
        boost_dict = {} if boost_dict is None else boost_dict
        if method == 'text':
            return self.text_search(query, filter_dict, boost_dict, num_results)
        elif method == 'embedding':
            return self.embedding_search(query, num_results, index_name)
        else:  # hybrid search
            text_results = self.text_search(query, filter_dict, boost_dict, num_results)
            embedding_results = self.embedding_search(query, num_results, index_name)
            return self.combine_results(text_results, embedding_results, num_results)

    def text_search(self, query, filter_dict=None, boost_dict=None, num_results=10):
        """Keyword search via the minsearch text index."""
        filter_dict = {} if filter_dict is None else filter_dict
        boost_dict = {} if boost_dict is None else boost_dict
        return self.text_index.search(query, filter_dict, boost_dict, num_results)

    def embedding_search(self, query, num_results=10, index_name=None):
        """Vector search: Elasticsearch script_score when index_name is given,
        otherwise in-memory cosine similarity over the local embedding matrix."""
        if index_name:
            query_vector = self.embedding_model.encode(query).tolist()
            # cosineSimilarity ranges [-1, 1]; +1.0 keeps the score non-negative,
            # which ES script_score requires.
            script_query = {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                        "params": {"query_vector": query_vector}
                    }
                }
            }
            response = self.es.search(
                index=index_name,
                body={
                    "size": num_results,
                    "query": script_query,
                    "_source": {"excludes": ["embedding"]}
                }
            )
            return [hit['_source'] for hit in response['hits']['hits']]
        else:
            # Guard: cosine_similarity raises on an empty matrix.
            if len(self.embeddings) == 0:
                return []
            query_embedding = self.embedding_model.encode(query)
            similarities = cosine_similarity([query_embedding], self.embeddings)[0]
            top_indices = np.argsort(similarities)[::-1][:num_results]
            return [self.documents[i] for i in top_indices]

    def combine_results(self, text_results, embedding_results, num_results):
        """Interleave the two result lists (text first at each rank), then
        deduplicate by segment_id preserving order, capped at num_results."""
        combined = []
        for i in range(max(len(text_results), len(embedding_results))):
            if i < len(text_results):
                combined.append(text_results[i])
            if i < len(embedding_results):
                combined.append(embedding_results[i])
        seen = set()
        deduped = []
        for doc in combined:
            if doc['segment_id'] not in seen:
                seen.add(doc['segment_id'])
                deduped.append(doc)
        return deduped[:num_results]

    def process_query(self, query):
        """Normalize a user query with the same cleaning used for documents."""
        return clean_text(query)