Abhaykumar04 committed on
Commit
2c8dc40
·
verified ·
1 Parent(s): 576e4aa

Update create_embeddings_together

Browse files
Files changed (1) hide show
  1. create_embeddings_together +128 -128
create_embeddings_together CHANGED
@@ -1,129 +1,129 @@
1
- import json
2
- from pinecone import Pinecone, ServerlessSpec
3
- import os
4
- from dotenv import load_dotenv
5
- import yaml
6
- from together import Together
7
-
8
-
9
-
10
- load_dotenv()
11
-
12
- # Define file paths as constants
13
- API_FILE_PATH = r"C:\Users\abhay\Analytics Vidhya\API.yml"
14
- COURSES_FILE_PATH = r"C:\Users\abhay\Analytics Vidhya\courses.json"
15
-
16
- def load_api_keys(api_file_path):
17
- """Loads API keys from a YAML file."""
18
- with open(api_file_path, 'r') as f:
19
- api_keys = yaml.safe_load(f)
20
- return api_keys
21
-
22
-
23
- def load_course_data(json_file_path):
24
- """Loads course data from a JSON file."""
25
- with open(json_file_path, 'r') as f:
26
- course_data = json.load(f)
27
- return course_data
28
-
29
-
30
- def prepare_for_embedding(course_data):
31
- """Combines relevant course fields for embedding."""
32
- prepared_data = []
33
- for i, course in enumerate(course_data):
34
- combined_text = f"Title: {course.get('title', '')}, Description: {course.get('description', '')}"
35
- prepared_data.append(
36
- {
37
- "course_id": i,
38
- "text": combined_text,
39
- "course_link": course.get("course_link"),
40
- "image_url": course.get("image_url"),
41
- "title": course.get("title"),
42
- }
43
- )
44
- return prepared_data
45
-
46
- # --- Generate Embeddings using Together AI Model ---
47
- def generate_embeddings(texts, together_api_key):
48
- """Generates embeddings using Together AI model directly."""
49
- client = Together(api_key=together_api_key)
50
- embeddings = []
51
- for text in texts:
52
- response = client.embeddings.create(
53
- model="WhereIsAI/UAE-Large-V1", input=text
54
- )
55
- embeddings.append(response.data[0].embedding)
56
- return embeddings
57
-
58
- # --- Initialize Pinecone ---
59
- def initialize_pinecone(pinecone_api_key, pinecone_env):
60
- """Initializes Pinecone with API key and environment."""
61
- pc = Pinecone(api_key=pinecone_api_key)
62
- return pc
63
-
64
- # --- Upsert Embeddings into Pinecone ---
65
- def upsert_to_pinecone(pinecone_instance, index_name, prepared_data, embeddings):
66
- """Upserts vectors into a Pinecone index."""
67
- index = pinecone_instance.Index(index_name)
68
- vectors_to_upsert = []
69
- for i, item in enumerate(prepared_data):
70
- vector = embeddings[i]
71
- metadata = {
72
- "course_id": item["course_id"],
73
- "text": item["text"],
74
- "course_link": item["course_link"],
75
- "image_url": item["image_url"],
76
- "title": item["title"],
77
- }
78
- vectors_to_upsert.append((str(item["course_id"]), vector, metadata))
79
- index.upsert(vectors=vectors_to_upsert)
80
-
81
-
82
- # --- Main Function ---
83
- def main():
84
- try:
85
-
86
- api_keys = load_api_keys(API_FILE_PATH)
87
- together_api_key = api_keys["together_ai_api_key"]
88
- pinecone_api_key = api_keys["pinecone_api_key"]
89
- pinecone_env = api_keys["pinecone_env"]
90
-
91
-
92
- course_data = load_course_data(COURSES_FILE_PATH)
93
-
94
-
95
- prepared_data = prepare_for_embedding(course_data)
96
- texts_for_embedding = [item["text"] for item in prepared_data]
97
-
98
-
99
- print("Generating embeddings...")
100
- embeddings = generate_embeddings(texts_for_embedding, together_api_key)
101
-
102
-
103
- print("Initializing Pinecone...")
104
- pinecone_instance = initialize_pinecone(pinecone_api_key, pinecone_env)
105
-
106
-
107
- index_name = os.getenv("PINECONE_INDEX_NAME") or api_keys.get("pinecone_index_name")
108
- if not index_name:
109
- raise ValueError("Pinecone index name not found in environment variables or API.yml")
110
-
111
-
112
- if index_name not in pinecone_instance.list_indexes().names():
113
- pinecone_instance.create_index(
114
- name=index_name,
115
- dimension=1024, # Dimension for UAE-Large-V1
116
- metric='cosine'
117
- )
118
-
119
- # Upsert embeddings into Pinecone
120
- print("Upserting embeddings to Pinecone...")
121
- upsert_to_pinecone(pinecone_instance, index_name, prepared_data, embeddings)
122
-
123
- print("Embeddings generated and upserted to Pinecone successfully!")
124
-
125
- except Exception as e:
126
- print(f"An error occurred: {str(e)}")
127
-
128
- if __name__ == "__main__":
129
  main()
 
1
+ import json
2
+ from pinecone import Pinecone, ServerlessSpec
3
+ import os
4
+ from dotenv import load_dotenv
5
+ import yaml
6
+ from together import Together
7
+
8
+
9
+
10
+ load_dotenv()
11
+
12
# Define file paths as constants.
# Plain relative names resolve against the current working directory on every
# platform. The earlier r".\..." spelling only worked on Windows, because a
# backslash is an ordinary filename character on POSIX systems.
API_FILE_PATH = "API.yml"
COURSES_FILE_PATH = "courses.json"
15
+
16
def load_api_keys(api_file_path):
    """Load API keys from a YAML file.

    Args:
        api_file_path: Path of the YAML file that holds the keys.

    Returns:
        The mapping parsed from the file by ``yaml.safe_load``.

    Raises:
        ValueError: If the file does not contain a YAML mapping, for example
            when it is empty and ``yaml.safe_load`` returns ``None``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(api_file_path, "r", encoding="utf-8") as f:
        api_keys = yaml.safe_load(f)
    # Fail here with a clear message rather than later with an opaque
    # "'NoneType' object is not subscriptable" when a key is looked up.
    if not isinstance(api_keys, dict):
        raise ValueError(
            f"Expected a YAML mapping of API keys in {api_file_path!r}"
        )
    return api_keys
21
+
22
+
23
def load_course_data(json_file_path):
    """Load course data from a JSON file.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` parses from the file.
    """
    # JSON files are UTF-8 by convention; decoding explicitly avoids
    # mis-reading non-ASCII course text under a different platform default.
    with open(json_file_path, "r", encoding="utf-8") as f:
        return json.load(f)
28
+
29
+
30
def prepare_for_embedding(course_data):
    """Build one embedding record per course.

    Each record holds the course's position in ``course_data`` as
    ``course_id``, a ``text`` field that joins title and description (absent
    fields become empty strings there), and the ``course_link``,
    ``image_url`` and ``title`` values as returned by ``dict.get`` (``None``
    when the key is absent).
    """
    return [
        {
            "course_id": position,
            "text": (
                f"Title: {entry.get('title', '')}, "
                f"Description: {entry.get('description', '')}"
            ),
            "course_link": entry.get("course_link"),
            "image_url": entry.get("image_url"),
            "title": entry.get("title"),
        }
        for position, entry in enumerate(course_data)
    ]
45
+
46
+ # --- Generate Embeddings using Together AI Model ---
47
def generate_embeddings(texts, together_api_key):
    """Embed each text with the Together AI ``WhereIsAI/UAE-Large-V1`` model.

    One ``embeddings.create`` request is issued per text, in order. The
    returned list holds the first embedding of each response.
    """
    together_client = Together(api_key=together_api_key)

    def embed_one(text):
        reply = together_client.embeddings.create(
            model="WhereIsAI/UAE-Large-V1", input=text
        )
        return reply.data[0].embedding

    return [embed_one(text) for text in texts]
57
+
58
+ # --- Initialize Pinecone ---
59
def initialize_pinecone(pinecone_api_key, pinecone_env):
    """Create and return a Pinecone client for the given API key.

    ``pinecone_env`` is accepted but is not used by this function.
    """
    return Pinecone(api_key=pinecone_api_key)
63
+
64
+ # --- Upsert Embeddings into Pinecone ---
65
def upsert_to_pinecone(pinecone_instance, index_name, prepared_data, embeddings,
                       batch_size=100):
    """Upsert course vectors and their metadata into a Pinecone index.

    Args:
        pinecone_instance: Client object exposing ``Index(name)``.
        index_name: Name of the index to write to.
        prepared_data: Records carrying ``course_id``, ``text``,
            ``course_link``, ``image_url`` and ``title`` keys.
        embeddings: One vector per record, in the same order.
        batch_size: Maximum number of vectors sent per upsert request.

    Raises:
        ValueError: If the number of records and embeddings differ, or if
            ``batch_size`` is smaller than 1.
    """
    if len(prepared_data) != len(embeddings):
        raise ValueError(
            f"Got {len(prepared_data)} records but {len(embeddings)} embeddings"
        )
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    index = pinecone_instance.Index(index_name)
    metadata_keys = ("course_id", "text", "course_link", "image_url", "title")

    vectors_to_upsert = []
    for item, vector in zip(prepared_data, embeddings):
        # Pinecone does not accept null metadata values; omit those keys
        # instead of sending None.
        metadata = {
            key: item[key] for key in metadata_keys if item[key] is not None
        }
        vectors_to_upsert.append((str(item["course_id"]), vector, metadata))

    # Send bounded batches so one request cannot exceed the service's
    # per-request size limit. Nothing is sent when there are no vectors.
    for start in range(0, len(vectors_to_upsert), batch_size):
        index.upsert(vectors=vectors_to_upsert[start:start + batch_size])
80
+
81
+
82
+ # --- Main Function ---
83
def main():
    """Embed the course catalogue and store the vectors in Pinecone.

    Steps: load the API keys and course data, build the text for each course,
    generate embeddings, create the Pinecone index if it does not exist yet,
    then upsert the vectors. Any exception is caught and reported on stdout.

    The index name comes from the ``PINECONE_INDEX_NAME`` environment variable
    or the ``pinecone_index_name`` key of the API file. When the index has to
    be created, its cloud and region come from ``PINECONE_CLOUD`` /
    ``PINECONE_REGION`` or the ``pinecone_cloud`` / ``pinecone_region`` keys,
    defaulting to ``aws`` / ``us-east-1``.
    """
    try:
        api_keys = load_api_keys(API_FILE_PATH)
        together_api_key = api_keys["together_ai_api_key"]
        pinecone_api_key = api_keys["pinecone_api_key"]
        pinecone_env = api_keys["pinecone_env"]

        course_data = load_course_data(COURSES_FILE_PATH)

        prepared_data = prepare_for_embedding(course_data)
        texts_for_embedding = [item["text"] for item in prepared_data]

        print("Generating embeddings...")
        embeddings = generate_embeddings(texts_for_embedding, together_api_key)

        print("Initializing Pinecone...")
        pinecone_instance = initialize_pinecone(pinecone_api_key, pinecone_env)

        index_name = os.getenv("PINECONE_INDEX_NAME") or api_keys.get("pinecone_index_name")
        if not index_name:
            raise ValueError("Pinecone index name not found in environment variables or API.yml")

        if index_name not in pinecone_instance.list_indexes().names():
            # create_index requires a deployment spec in the client generation
            # that exposes the Pinecone / ServerlessSpec classes; without it
            # the call fails before any index is created.
            cloud = os.getenv("PINECONE_CLOUD") or api_keys.get("pinecone_cloud") or "aws"
            region = os.getenv("PINECONE_REGION") or api_keys.get("pinecone_region") or "us-east-1"
            pinecone_instance.create_index(
                name=index_name,
                dimension=1024,  # Dimension for UAE-Large-V1
                metric='cosine',
                spec=ServerlessSpec(cloud=cloud, region=region),
            )

        # Upsert embeddings into Pinecone
        print("Upserting embeddings to Pinecone...")
        upsert_to_pinecone(pinecone_instance, index_name, prepared_data, embeddings)

        print("Embeddings generated and upserted to Pinecone successfully!")

    except Exception as e:
        # Top-level boundary of the script: report the failure instead of
        # showing a traceback.
        print(f"An error occurred: {str(e)}")
127
+
128
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()