from minsearch import Index
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import re
from elasticsearch import Elasticsearch
import os


def clean_text(text):
    # Remove special characters and extra whitespace
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
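
# Illustrative example (not in the original file): clean_text strips
# punctuation and collapses runs of whitespace, e.g.
#   clean_text("Hello,   world! It's   me.")  ->  "Hello world Its me"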


class DataProcessor:
    def __init__(self, text_fields=None, keyword_fields=None,
                 embedding_model="all-MiniLM-L6-v2"):
        # None defaults avoid the mutable-default-argument pitfall
        text_fields = text_fields or ["content", "title", "description"]
        keyword_fields = keyword_fields or ["video_id", "start_time", "author", "upload_date"]
        self.text_index = Index(text_fields=text_fields, keyword_fields=keyword_fields)
        self.embedding_model = SentenceTransformer(embedding_model)
        self.documents = []
        self.embeddings = []
        # Use environment variables for Elasticsearch configuration
        elasticsearch_host = os.getenv('ELASTICSEARCH_HOST', 'localhost')
        elasticsearch_port = int(os.getenv('ELASTICSEARCH_PORT', 9200))
        # Initialize Elasticsearch client with explicit scheme
        self.es = Elasticsearch([f'http://{elasticsearch_host}:{elasticsearch_port}'])
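
    # The Elasticsearch connection is configured via environment variables,
    # e.g. (hypothetical values):
    #   export ELASTICSEARCH_HOST=es.internal
    #   export ELASTICSEARCH_PORT=9200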

    def process_transcript(self, video_id, transcript_data):
        metadata = transcript_data['metadata']
        transcript = transcript_data['transcript']
        for i, segment in enumerate(transcript):
            cleaned_text = clean_text(segment['text'])
            doc = {
                "video_id": video_id,
                "content": cleaned_text,
                "start_time": segment['start'],
                "duration": segment['duration'],
                "segment_id": f"{video_id}_{i}",
                "title": metadata['title'],
                "author": metadata['author'],
                "upload_date": metadata['upload_date'],
                "view_count": metadata['view_count'],
                "like_count": metadata['like_count'],
                "comment_count": metadata['comment_count'],
                "video_duration": metadata['duration']
            }
            self.documents.append(doc)
            # Embed the segment text together with the video title for context
            self.embeddings.append(self.embedding_model.encode(cleaned_text + " " + metadata['title']))
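
    # Expected input shape for process_transcript (inferred from the field
    # accesses above; all keys are required):
    #   transcript_data = {
    #       "metadata": {"title", "author", "upload_date", "view_count",
    #                    "like_count", "comment_count", "duration"},
    #       "transcript": [{"text", "start", "duration"}, ...]
    #   }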

    def build_index(self, index_name):
        self.text_index.fit(self.documents)
        self.embeddings = np.array(self.embeddings)
        # Create the Elasticsearch index if it does not already exist
        if not self.es.indices.exists(index=index_name):
            self.es.indices.create(index=index_name, body={
                "mappings": {
                    "properties": {
                        "embedding": {"type": "dense_vector", "dims": self.embeddings.shape[1]},
                        "content": {"type": "text"},
                        "video_id": {"type": "keyword"},
                        "segment_id": {"type": "keyword"},
                        "start_time": {"type": "float"},
                        "duration": {"type": "float"},
                        "title": {"type": "text"},
                        "author": {"type": "keyword"},
                        "upload_date": {"type": "date"},
                        "view_count": {"type": "long"},  # long: counts can exceed the 32-bit range
                        "like_count": {"type": "long"},
                        "comment_count": {"type": "long"},
                        "video_duration": {"type": "text"}
                    }
                }
            })
        # Index documents in Elasticsearch; copy each doc so self.documents
        # is not polluted with the embedding vectors
        for doc, embedding in zip(self.documents, self.embeddings):
            es_doc = {**doc, "embedding": embedding.tolist()}
            self.es.index(index=index_name, body=es_doc, id=doc['segment_id'])

    def search(self, query, filter_dict=None, boost_dict=None, num_results=10,
               method='hybrid', index_name=None):
        if method == 'text':
            return self.text_search(query, filter_dict, boost_dict, num_results)
        elif method == 'embedding':
            return self.embedding_search(query, num_results, index_name)
        else:  # hybrid search: merge text and embedding results
            text_results = self.text_search(query, filter_dict, boost_dict, num_results)
            embedding_results = self.embedding_search(query, num_results, index_name)
            return self.combine_results(text_results, embedding_results, num_results)

    def text_search(self, query, filter_dict=None, boost_dict=None, num_results=10):
        return self.text_index.search(query, filter_dict or {}, boost_dict or {}, num_results)

    def embedding_search(self, query, num_results=10, index_name=None):
        if index_name:
            # Use Elasticsearch for embedding search
            query_vector = self.embedding_model.encode(query).tolist()
            script_query = {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                        "params": {"query_vector": query_vector}
                    }
                }
            }
            response = self.es.search(
                index=index_name,
                body={
                    "size": num_results,
                    "query": script_query,
                    "_source": {"excludes": ["embedding"]}
                }
            )
            return [hit['_source'] for hit in response['hits']['hits']]
        else:
            # Use in-memory embedding search
            query_embedding = self.embedding_model.encode(query)
            similarities = cosine_similarity([query_embedding], self.embeddings)[0]
            top_indices = np.argsort(similarities)[::-1][:num_results]
            return [self.documents[i] for i in top_indices]

    def combine_results(self, text_results, embedding_results, num_results):
        # Interleave the two ranked lists, then deduplicate by segment_id
        # while preserving order
        combined = []
        for i in range(max(len(text_results), len(embedding_results))):
            if i < len(text_results):
                combined.append(text_results[i])
            if i < len(embedding_results):
                combined.append(embedding_results[i])
        seen = set()
        deduped = []
        for doc in combined:
            if doc['segment_id'] not in seen:
                seen.add(doc['segment_id'])
                deduped.append(doc)
        return deduped[:num_results]
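
    # Illustrative behavior (hypothetical segment ids): interleaving
    # [A, B, C] with [B, D] yields [A, B, B, D, C], which dedupes to
    # [A, B, D, C] before truncation to num_results.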

    def process_query(self, query):
        return clean_text(query)
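

# Minimal usage sketch (an illustration, not part of the original file).
# Assumes a reachable Elasticsearch instance and the transcript_data shape
# documented above process_transcript; all sample values are hypothetical.
if __name__ == "__main__":
    processor = DataProcessor()
    sample = {
        "metadata": {
            "title": "Sample Video", "author": "some_channel",
            "upload_date": "2024-01-01", "view_count": 1000,
            "like_count": 100, "comment_count": 10, "duration": "10:00",
        },
        "transcript": [
            {"text": "Hello, and welcome!", "start": 0.0, "duration": 4.2},
            {"text": "Today we look at search.", "start": 4.2, "duration": 5.1},
        ],
    }
    processor.process_transcript("abc123", sample)
    processor.build_index("video_segments")
    # Refresh so the just-indexed documents are visible to the ES search
    processor.es.indices.refresh(index="video_segments")
    results = processor.search("welcome", index_name="video_segments")
    for r in results:
        print(r["segment_id"], r["content"])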