Smart-Course-Search / create_embeddings_together
Abhaykumar04's picture
Update create_embeddings_together
2c8dc40 verified
raw
history blame
4.19 kB
import json
from pinecone import Pinecone, ServerlessSpec
import os
from dotenv import load_dotenv
import yaml
from together import Together
# Load variables from a local .env file (if one exists) into the process
# environment so the os.getenv() lookup in main() can see them.
load_dotenv()
# Input file locations, relative to the current working directory.
# They are built with os.path.join so the separator is right on every
# platform: a literal ".\\name" is a relative path only on Windows, while on
# POSIX systems the backslash becomes part of the file name itself.
API_FILE_PATH = os.path.join(os.curdir, "API.yml")
COURSES_FILE_PATH = os.path.join(os.curdir, "courses.json")
def load_api_keys(api_file_path):
    """Load API keys and related settings from a YAML file.

    Args:
        api_file_path: Path to the YAML file to read.

    Returns:
        The parsed YAML document. An empty file yields an empty dict instead
        of ``None`` so that a later key lookup fails with a ``KeyError``
        naming the missing key rather than an opaque ``TypeError``.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # default locale encoding.
    with open(api_file_path, "r", encoding="utf-8") as f:
        api_keys = yaml.safe_load(f)
    # safe_load() returns None for an empty document.
    return {} if api_keys is None else api_keys
def load_course_data(json_file_path):
    """Load course data from a JSON file.

    Args:
        json_file_path: Path to the JSON file to read.

    Returns:
        The decoded JSON document, exactly as ``json.load`` produces it.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON text is UTF-8; decode it explicitly so non-ASCII course titles and
    # descriptions are not mangled by a different platform default encoding.
    with open(json_file_path, "r", encoding="utf-8") as f:
        return json.load(f)
def prepare_for_embedding(course_data):
    """Build one embedding-ready record per course.

    Each record carries the course's position in ``course_data`` as
    ``course_id``, a ``text`` field combining title and description (missing
    fields render as empty strings), and the ``course_link``, ``image_url``
    and ``title`` values read with ``.get`` (``None`` when absent).

    Args:
        course_data: Iterable of dict-like course entries.

    Returns:
        A list of record dicts in the same order as ``course_data``.
    """
    return [
        {
            "course_id": position,
            "text": (
                f"Title: {entry.get('title', '')}, "
                f"Description: {entry.get('description', '')}"
            ),
            "course_link": entry.get("course_link"),
            "image_url": entry.get("image_url"),
            "title": entry.get("title"),
        }
        for position, entry in enumerate(course_data)
    ]
# --- Generate Embeddings using Together AI Model ---
def generate_embeddings(texts, together_api_key):
    """Embed each text with the Together AI "WhereIsAI/UAE-Large-V1" model.

    One embeddings request is issued per input text.

    Args:
        texts: Iterable of strings to embed.
        together_api_key: API key passed to the Together client.

    Returns:
        A list with one embedding per input text, in input order.
    """
    together_client = Together(api_key=together_api_key)

    def _embed(text):
        # Each response carries a single result because a single text is sent.
        result = together_client.embeddings.create(
            model="WhereIsAI/UAE-Large-V1", input=text
        )
        return result.data[0].embedding

    return [_embed(text) for text in texts]
# --- Initialize Pinecone ---
def initialize_pinecone(pinecone_api_key, pinecone_env):
    """Create a Pinecone client from an API key.

    Args:
        pinecone_api_key: API key handed to the ``Pinecone`` constructor.
        pinecone_env: Accepted for call compatibility; this function does
            not use it.

    Returns:
        The constructed ``Pinecone`` client.
    """
    return Pinecone(api_key=pinecone_api_key)
# --- Upsert Embeddings into Pinecone ---
def upsert_to_pinecone(pinecone_instance, index_name, prepared_data, embeddings,
                       *, batch_size=100):
    """Upsert course vectors and their metadata into a Pinecone index.

    Args:
        pinecone_instance: Client object exposing ``Index(name)``.
        index_name: Name of the target index.
        prepared_data: Sequence of records as produced by
            ``prepare_for_embedding`` (keys ``course_id``, ``text``,
            ``course_link``, ``image_url``, ``title``).
        embeddings: Sequence of vectors, one per record, in the same order.
        batch_size: Maximum number of vectors sent per upsert request.

    Raises:
        ValueError: If ``batch_size`` is less than 1, or if ``prepared_data``
            and ``embeddings`` differ in length.
        KeyError: If a record lacks one of the expected keys.
    """
    metadata_keys = ("course_id", "text", "course_link", "image_url", "title")

    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    # Fail loudly instead of silently dropping records or raising a bare
    # IndexError part-way through when the two inputs are out of step.
    if len(prepared_data) != len(embeddings):
        raise ValueError(
            f"Got {len(prepared_data)} records but {len(embeddings)} embeddings"
        )

    index = pinecone_instance.Index(index_name)

    vectors_to_upsert = [
        (
            str(item["course_id"]),
            vector,
            # Pinecone rejects null metadata values, so fields that are
            # absent from the course data (None) are left out entirely.
            {key: item[key] for key in metadata_keys if item[key] is not None},
        )
        for item, vector in zip(prepared_data, embeddings)
    ]

    # Send in bounded batches: a single request holding every vector can
    # exceed Pinecone's per-request size limits on larger catalogues.
    for start in range(0, len(vectors_to_upsert), batch_size):
        index.upsert(vectors=vectors_to_upsert[start:start + batch_size])
# --- Main Function ---
def main():
    """Embed every course in the courses file and store the vectors in Pinecone.

    Steps:
        1. Read API keys from ``API_FILE_PATH`` and resolve the index name
           (``PINECONE_INDEX_NAME`` env var, else ``pinecone_index_name`` in
           the YAML file).
        2. Load the courses, build the text to embed and generate embeddings.
        3. Create the index if it does not exist yet, then upsert.

    When the index has to be created, the serverless cloud and region are
    taken from ``PINECONE_CLOUD`` / ``PINECONE_REGION`` (env vars) or
    ``pinecone_cloud`` / ``pinecone_region`` (YAML), falling back to
    ``aws`` / ``us-east-1``.

    Any exception is caught and reported on stdout; nothing is re-raised.
    """
    embedding_dimension = 1024  # Dimension for UAE-Large-V1

    try:
        api_keys = load_api_keys(API_FILE_PATH)
        together_api_key = api_keys["together_ai_api_key"]
        pinecone_api_key = api_keys["pinecone_api_key"]
        # Only forwarded to initialize_pinecone(), which ignores it, so a
        # missing entry should not abort the whole run.
        pinecone_env = api_keys.get("pinecone_env")

        # Resolve the index name before any paid embedding requests are made,
        # so a configuration mistake fails fast.
        index_name = os.getenv("PINECONE_INDEX_NAME") or api_keys.get("pinecone_index_name")
        if not index_name:
            raise ValueError("Pinecone index name not found in environment variables or API.yml")

        course_data = load_course_data(COURSES_FILE_PATH)
        prepared_data = prepare_for_embedding(course_data)
        texts_for_embedding = [item["text"] for item in prepared_data]

        print("Generating embeddings...")
        embeddings = generate_embeddings(texts_for_embedding, together_api_key)

        print("Initializing Pinecone...")
        pinecone_instance = initialize_pinecone(pinecone_api_key, pinecone_env)

        if index_name not in pinecone_instance.list_indexes().names():
            # create_index() requires a deployment spec. The defaults below
            # are a fallback only - override them to match the Pinecone
            # project in use.
            cloud = os.getenv("PINECONE_CLOUD") or api_keys.get("pinecone_cloud") or "aws"
            region = os.getenv("PINECONE_REGION") or api_keys.get("pinecone_region") or "us-east-1"
            pinecone_instance.create_index(
                name=index_name,
                dimension=embedding_dimension,
                metric='cosine',
                spec=ServerlessSpec(cloud=cloud, region=region),
            )

        # Upsert embeddings into Pinecone
        print("Upserting embeddings to Pinecone...")
        upsert_to_pinecone(pinecone_instance, index_name, prepared_data, embeddings)
        print("Embeddings generated and upserted to Pinecone successfully!")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()