# Spaces: Sleeping
import json | |
from pinecone import Pinecone, ServerlessSpec | |
import os | |
from dotenv import load_dotenv | |
import yaml | |
from together import Together | |
load_dotenv() | |
# File locations, relative to the current working directory.
# Built with os.path.join so the separator is right on every platform: a
# hard-coded backslash is treated as part of the file name on POSIX systems.
API_FILE_PATH = os.path.join(".", "API.yml")
COURSES_FILE_PATH = os.path.join(".", "courses.json")
def load_api_keys(api_file_path):
    """Load API keys and related settings from a YAML file.

    Args:
        api_file_path: Path to the YAML file to read.

    Returns:
        The parsed top-level mapping of the YAML document.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
        ValueError: If the document is empty or its top level is not a
            mapping.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(api_file_path, "r", encoding="utf-8") as f:
        api_keys = yaml.safe_load(f)

    # safe_load yields None for an empty document; fail here with a clear
    # message instead of letting a later key lookup raise a TypeError.
    if not isinstance(api_keys, dict):
        raise ValueError(
            f"Expected a YAML mapping of API keys in {api_file_path!r}, "
            f"got {type(api_keys).__name__}"
        )
    return api_keys
def load_course_data(json_file_path):
    """Load course data from a JSON file.

    Args:
        json_file_path: Path to the JSON file to read.

    Returns:
        The decoded JSON document, returned exactly as parsed.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Explicit encoding so non-ASCII titles/descriptions decode the same way
    # on every platform instead of depending on the locale default.
    with open(json_file_path, "r", encoding="utf-8") as f:
        return json.load(f)
def prepare_for_embedding(course_data):
    """Build one embedding-ready record per course.

    Each record carries the course's position in ``course_data`` as
    ``course_id``, a ``text`` string combining title and description, and
    the ``course_link``, ``image_url`` and ``title`` fields copied over.
    Missing title/description become empty strings inside ``text``; the
    copied fields are ``None`` when absent.

    Args:
        course_data: Iterable of course dictionaries.

    Returns:
        A list of record dictionaries, in input order.
    """

    def _to_record(position, entry):
        # Text defaults to '' so a missing field does not render as "None".
        name = entry.get("title", "")
        summary = entry.get("description", "")
        return {
            "course_id": position,
            "text": f"Title: {name}, Description: {summary}",
            "course_link": entry.get("course_link"),
            "image_url": entry.get("image_url"),
            "title": entry.get("title"),
        }

    return [_to_record(pos, entry) for pos, entry in enumerate(course_data)]
# --- Generate Embeddings using Together AI Model --- | |
def generate_embeddings(texts, together_api_key):
    """Embed each text with the Together AI ``WhereIsAI/UAE-Large-V1`` model.

    One embeddings request is issued per text, in order, and the embedding
    of the first item in each response is kept.

    Args:
        texts: Iterable of strings to embed.
        together_api_key: API key used to construct the Together client.

    Returns:
        A list of embeddings, one per input text, in input order.
    """
    together_client = Together(api_key=together_api_key)

    def _embed(passage):
        result = together_client.embeddings.create(
            input=passage, model="WhereIsAI/UAE-Large-V1"
        )
        return result.data[0].embedding

    return [_embed(passage) for passage in texts]
# --- Initialize Pinecone --- | |
def initialize_pinecone(pinecone_api_key, pinecone_env=None):
    """Create a Pinecone client from an API key.

    Args:
        pinecone_api_key: API key passed to the ``Pinecone`` constructor.
        pinecone_env: Unused. Accepted (and now optional) only so existing
            callers that pass an environment keep working; the client is
            built from the API key alone.

    Returns:
        The constructed ``Pinecone`` client.
    """
    return Pinecone(api_key=pinecone_api_key)
# --- Upsert Embeddings into Pinecone --- | |
def upsert_to_pinecone(pinecone_instance, index_name, prepared_data, embeddings,
                       batch_size=100):
    """Upsert course vectors and their metadata into a Pinecone index.

    Each record is sent as an ``(id, vector, metadata)`` tuple whose id is
    the stringified ``course_id``. Vectors are sent in batches rather than
    in one request so large course lists stay within request-size limits.

    Args:
        pinecone_instance: Client exposing ``Index(index_name)``.
        index_name: Name of the target index.
        prepared_data: Sequence of record dicts with the keys ``course_id``,
            ``text``, ``course_link``, ``image_url`` and ``title``.
        embeddings: Sequence of vectors, aligned one-to-one with
            ``prepared_data``.
        batch_size: Maximum number of vectors per upsert request.

    Raises:
        ValueError: If ``batch_size`` is below 1 or the number of records
            and embeddings differ.
        KeyError: If a record lacks one of the expected keys.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if len(prepared_data) != len(embeddings):
        # A mismatch would otherwise pair records with the wrong vectors or
        # fail midway with an IndexError.
        raise ValueError(
            f"Got {len(prepared_data)} records but {len(embeddings)} embeddings"
        )

    index = pinecone_instance.Index(index_name)
    metadata_keys = ("course_id", "text", "course_link", "image_url", "title")
    vectors_to_upsert = [
        (
            str(item["course_id"]),
            vector,
            # Null metadata values are rejected by Pinecone; omit the key
            # instead of sending None.
            {key: item[key] for key in metadata_keys if item[key] is not None},
        )
        for item, vector in zip(prepared_data, embeddings)
    ]

    # Nothing is sent for empty input: the range below is empty.
    for start in range(0, len(vectors_to_upsert), batch_size):
        index.upsert(vectors=vectors_to_upsert[start:start + batch_size])
# --- Main Function --- | |
def main():
    """Embed the course catalogue and store the vectors in Pinecone.

    Steps: load settings from ``API_FILE_PATH``, load courses from
    ``COURSES_FILE_PATH``, embed each course text, create the Pinecone index
    if it does not exist yet, then upsert the vectors.

    Settings read from the API file: ``together_ai_api_key`` and
    ``pinecone_api_key`` (required); ``pinecone_env``,
    ``pinecone_index_name``, ``pinecone_cloud`` and ``pinecone_region``
    (optional). The environment variables ``PINECONE_INDEX_NAME``,
    ``PINECONE_CLOUD`` and ``PINECONE_REGION`` take precedence over the file.

    Any exception is caught and reported on stdout; it is not re-raised.
    """
    try:
        api_keys = load_api_keys(API_FILE_PATH)
        together_api_key = api_keys["together_ai_api_key"]
        pinecone_api_key = api_keys["pinecone_api_key"]
        # Optional: initialize_pinecone does not use it, so a missing key
        # must not abort the run.
        pinecone_env = api_keys.get("pinecone_env")

        # Validate the index name before spending embedding API calls.
        index_name = os.getenv("PINECONE_INDEX_NAME") or api_keys.get("pinecone_index_name")
        if not index_name:
            raise ValueError("Pinecone index name not found in environment variables or API.yml")

        course_data = load_course_data(COURSES_FILE_PATH)
        prepared_data = prepare_for_embedding(course_data)
        texts_for_embedding = [item["text"] for item in prepared_data]

        print("Generating embeddings...")
        embeddings = generate_embeddings(texts_for_embedding, together_api_key)

        print("Initializing Pinecone...")
        pinecone_instance = initialize_pinecone(pinecone_api_key, pinecone_env)

        if index_name not in pinecone_instance.list_indexes().names():
            # create_index requires a deployment spec; without one the call
            # fails before any index is created.
            cloud = os.getenv("PINECONE_CLOUD") or api_keys.get("pinecone_cloud") or "aws"
            region = os.getenv("PINECONE_REGION") or api_keys.get("pinecone_region") or "us-east-1"
            pinecone_instance.create_index(
                name=index_name,
                dimension=1024,  # Dimension for UAE-Large-V1
                metric="cosine",
                spec=ServerlessSpec(cloud=cloud, region=region),
            )

        # Upsert embeddings into Pinecone
        print("Upserting embeddings to Pinecone...")
        upsert_to_pinecone(pinecone_instance, index_name, prepared_data, embeddings)
        print("Embeddings generated and upserted to Pinecone successfully!")
    except Exception as e:
        # Top-level boundary of the script: report the failure to the user.
        print(f"An error occurred: {str(e)}")


if __name__ == "__main__":
    main()