"""Embed course records with Together AI and store the vectors in Pinecone.

The script reads API keys from ``API.yml`` and course records from
``courses.json``, builds one text per course, requests an embedding for each
text, and upserts the resulting vectors (with course metadata) into a
Pinecone index, creating the index first when it does not exist yet.
"""

import json
import os

import yaml
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from together import Together

load_dotenv()

# Define file paths as constants
API_FILE_PATH = r"API.yml"
COURSES_FILE_PATH = r"courses.json"

# Embedding model and the vector length the index is created with.
EMBEDDING_MODEL = "WhereIsAI/UAE-Large-V1"
EMBEDDING_DIMENSION = 1024  # Dimension for UAE-Large-V1

# Serverless placement used only when a missing index has to be created.
# NOTE(review): these defaults are an assumption; override them with the
# PINECONE_CLOUD / PINECONE_REGION environment variables or the
# pinecone_cloud / pinecone_region keys in API.yml.
DEFAULT_PINECONE_CLOUD = "aws"
DEFAULT_PINECONE_REGION = "us-east-1"

# Number of vectors sent per upsert request, to keep each request small.
UPSERT_BATCH_SIZE = 100

# Fields of a prepared course record that are stored as vector metadata.
METADATA_FIELDS = ("course_id", "text", "course_link", "image_url", "title")


def load_api_keys(api_file_path):
    """Loads API keys from a YAML file.

    Args:
        api_file_path: Path of the YAML file to read.

    Returns:
        The parsed YAML content, or an empty dict when the file is empty.
    """
    with open(api_file_path, "r", encoding="utf-8") as f:
        api_keys = yaml.safe_load(f)
    # safe_load returns None for an empty document; an empty dict lets the
    # caller fail with a clear KeyError naming the missing key instead.
    return api_keys or {}


def load_course_data(json_file_path):
    """Loads course data from a JSON file.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content.
    """
    with open(json_file_path, "r", encoding="utf-8") as f:
        return json.load(f)


def prepare_for_embedding(course_data):
    """Combines relevant course fields for embedding.

    Args:
        course_data: Iterable of course mappings supporting ``.get``.

    Returns:
        A list with one dict per course holding ``course_id`` (the position
        of the course in ``course_data``), the combined ``text`` to embed,
        and the ``course_link``, ``image_url`` and ``title`` values (``None``
        when the course does not define them).
    """
    return [
        {
            "course_id": i,
            "text": (
                f"Title: {course.get('title', '')}, "
                f"Description: {course.get('description', '')}"
            ),
            "course_link": course.get("course_link"),
            "image_url": course.get("image_url"),
            "title": course.get("title"),
        }
        for i, course in enumerate(course_data)
    ]


# --- Generate Embeddings using Together AI Model ---
def generate_embeddings(texts, together_api_key, model=EMBEDDING_MODEL):
    """Generates embeddings using Together AI model directly.

    Args:
        texts: Iterable of strings to embed; one request is made per text.
        together_api_key: API key passed to the Together client.
        model: Name of the embedding model to request.

    Returns:
        A list of embeddings, in the same order as ``texts``.
    """
    client = Together(api_key=together_api_key)
    embeddings = []
    for text in texts:
        response = client.embeddings.create(model=model, input=text)
        embeddings.append(response.data[0].embedding)
    return embeddings


# --- Initialize Pinecone ---
def initialize_pinecone(pinecone_api_key, pinecone_env=None):
    """Initializes the Pinecone client with the API key.

    Args:
        pinecone_api_key: API key passed to the Pinecone client.
        pinecone_env: Accepted so existing callers keep working; it is not
            passed to the client.

    Returns:
        The Pinecone client instance.
    """
    return Pinecone(api_key=pinecone_api_key)


def _build_metadata(item):
    """Returns the metadata dict for one prepared course, without None values."""
    # Keys whose value is None are left out rather than sent as null.
    return {
        key: item[key] for key in METADATA_FIELDS if item[key] is not None
    }


# --- Upsert Embeddings into Pinecone ---
def upsert_to_pinecone(
    pinecone_instance,
    index_name,
    prepared_data,
    embeddings,
    batch_size=UPSERT_BATCH_SIZE,
):
    """Upserts vectors into a Pinecone index.

    Each vector is stored under the string form of its ``course_id`` together
    with the course metadata. Vectors are sent in batches of ``batch_size``;
    nothing is sent when ``prepared_data`` is empty.

    Args:
        pinecone_instance: Pinecone client providing ``Index(name)``.
        index_name: Name of the index to write to.
        prepared_data: Sequence of dicts as built by ``prepare_for_embedding``.
        embeddings: Sequence of vectors, aligned with ``prepared_data``.
        batch_size: Maximum number of vectors per upsert request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or the two sequences
            differ in length.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if len(prepared_data) != len(embeddings):
        raise ValueError(
            f"Got {len(prepared_data)} prepared items but "
            f"{len(embeddings)} embeddings"
        )

    index = pinecone_instance.Index(index_name)
    vectors_to_upsert = [
        (str(item["course_id"]), vector, _build_metadata(item))
        for item, vector in zip(prepared_data, embeddings)
    ]
    for start in range(0, len(vectors_to_upsert), batch_size):
        index.upsert(vectors=vectors_to_upsert[start:start + batch_size])


# --- Main Function ---
def main():
    """Runs the full pipeline: load, embed, create the index if needed, upsert.

    Any exception raised along the way is caught and reported on stdout.
    """
    try:
        api_keys = load_api_keys(API_FILE_PATH)
        together_api_key = api_keys["together_ai_api_key"]
        pinecone_api_key = api_keys["pinecone_api_key"]
        # Only forwarded to initialize_pinecone, which does not use it, so a
        # missing key is tolerated.
        pinecone_env = api_keys.get("pinecone_env")

        # Resolve the index name before any embedding request is made, so a
        # configuration mistake is reported without calling the embedding API.
        index_name = (
            os.getenv("PINECONE_INDEX_NAME")
            or api_keys.get("pinecone_index_name")
        )
        if not index_name:
            raise ValueError("Pinecone index name not found in environment variables or API.yml")

        course_data = load_course_data(COURSES_FILE_PATH)
        prepared_data = prepare_for_embedding(course_data)
        texts_for_embedding = [item["text"] for item in prepared_data]

        print("Generating embeddings...")
        embeddings = generate_embeddings(texts_for_embedding, together_api_key)

        print("Initializing Pinecone...")
        pinecone_instance = initialize_pinecone(pinecone_api_key, pinecone_env)

        if index_name not in pinecone_instance.list_indexes().names():
            cloud = (
                os.getenv("PINECONE_CLOUD")
                or api_keys.get("pinecone_cloud")
                or DEFAULT_PINECONE_CLOUD
            )
            region = (
                os.getenv("PINECONE_REGION")
                or api_keys.get("pinecone_region")
                or DEFAULT_PINECONE_REGION
            )
            pinecone_instance.create_index(
                name=index_name,
                dimension=EMBEDDING_DIMENSION,
                metric="cosine",
                spec=ServerlessSpec(cloud=cloud, region=region),
            )

        # Upsert embeddings into Pinecone
        print("Upserting embeddings to Pinecone...")
        upsert_to_pinecone(pinecone_instance, index_name, prepared_data, embeddings)

        print("Embeddings generated and upserted to Pinecone successfully!")

    except Exception as e:
        print(f"An error occurred: {str(e)}")


if __name__ == "__main__":
    main()