oceansweep committed
Commit bb1e9dd · verified · 1 Parent(s): facc9cd

Update App_Function_Libraries/RAG/Embeddings_Create.py

App_Function_Libraries/RAG/Embeddings_Create.py CHANGED
@@ -1,167 +1,167 @@
- # Embeddings_Create.py
- # Description: Functions for Creating and managing Embeddings in ChromaDB with LLama.cpp/OpenAI/Transformers
- #
- # Imports:
- import logging
- from typing import List, Dict, Any
-
- import numpy as np
- #
- # 3rd-Party Imports:
- import requests
- from transformers import AutoTokenizer, AutoModel
- import torch
- #
- # Local Imports:
- from App_Function_Libraries.LLM_API_Calls import get_openai_embeddings
- from App_Function_Libraries.Summarization_General_Lib import summarize
- from App_Function_Libraries.Utils.Utils import load_comprehensive_config
- from App_Function_Libraries.Chunk_Lib import chunk_options, improved_chunking_process, determine_chunk_position
- #
- #
- #######################################################################################################################
- #
- # Functions:
-
- # FIXME - Add all globals to summarize.py
- loaded_config = load_comprehensive_config()
- embedding_provider = loaded_config['Embeddings']['embedding_provider']
- embedding_model = loaded_config['Embeddings']['embedding_model']
- embedding_api_url = loaded_config['Embeddings']['embedding_api_url']
- embedding_api_key = loaded_config['Embeddings']['embedding_api_key']
-
- # Embedding Chunking Settings
- chunk_size = loaded_config['Embeddings']['chunk_size']
- overlap = loaded_config['Embeddings']['overlap']
-
-
- # FIXME - Add logging
-
- # FIXME - refactor/setup to use config file & perform chunking
- def create_embedding(text: str, provider: str, model: str, api_url: str = None, api_key: str = None) -> List[float]:
-     try:
-         if provider == 'openai':
-             embedding = get_openai_embeddings(text, model)
-         elif provider == 'local':
-             embedding = create_local_embedding(text, model, api_url, api_key)
-         elif provider == 'huggingface':
-             embedding = create_huggingface_embedding(text, model)
-         elif provider == 'llamacpp':
-             embedding = create_llamacpp_embedding(text, api_url)
-         else:
-             raise ValueError(f"Unsupported embedding provider: {provider}")
-
-         if isinstance(embedding, np.ndarray):
-             embedding = embedding.tolist()
-         elif isinstance(embedding, torch.Tensor):
-             embedding = embedding.detach().cpu().numpy().tolist()
-
-         return embedding
-
-     except Exception as e:
-         logging.error(f"Error creating embedding: {str(e)}")
-         raise
-
-
- def create_huggingface_embedding(text: str, model: str) -> List[float]:
-     tokenizer = AutoTokenizer.from_pretrained(model)
-     model = AutoModel.from_pretrained(model)
-
-     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
-     with torch.no_grad():
-         outputs = model(**inputs)
-
-     embeddings = outputs.last_hidden_state.mean(dim=1)
-     return embeddings[0].tolist()
-
-
- # FIXME
- def create_stella_embeddings(text: str) -> List[float]:
-     if embedding_provider == 'local':
-         # Load the model and tokenizer
-         tokenizer = AutoTokenizer.from_pretrained("dunzhang/stella_en_400M_v5")
-         model = AutoModel.from_pretrained("dunzhang/stella_en_400M_v5")
-
-         # Tokenize and encode the text
-         inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
-
-         # Generate embeddings
-         with torch.no_grad():
-             outputs = model(**inputs)
-
-         # Use the mean of the last hidden state as the sentence embedding
-         embeddings = outputs.last_hidden_state.mean(dim=1)
-
-         return embeddings[0].tolist()  # Convert to list for consistency
-     elif embedding_provider == 'openai':
-         return get_openai_embeddings(text, embedding_model)
-     else:
-         raise ValueError(f"Unsupported embedding provider: {embedding_provider}")
-
-
- def create_llamacpp_embedding(text: str, api_url: str) -> List[float]:
-     response = requests.post(
-         api_url,
-         json={"input": text}
-     )
-     response.raise_for_status()
-     return response.json()['embedding']
-
-
- def create_local_embedding(text: str, model: str, api_url: str, api_key: str) -> List[float]:
-     response = requests.post(
-         api_url,
-         json={"text": text, "model": model},
-         headers={"Authorization": f"Bearer {api_key}"}
-     )
-     response.raise_for_status()
-     return response.json().get('embedding', None)
-
-
- def chunk_for_embedding(text: str, file_name: str, api_name, custom_chunk_options: Dict[str, Any] = None) -> List[Dict[str, Any]]:
-     options = chunk_options.copy()
-     if custom_chunk_options:
-         options.update(custom_chunk_options)
-
-
-     # FIXME
-     if api_name is not None:
-         # Generate summary of the full document
-         full_summary = summarize(text, None, api_name, None, None, None)
-     else:
-         full_summary = "Full document summary not available."
-
-     chunks = improved_chunking_process(text, options)
-     total_chunks = len(chunks)
-
-     chunked_text_with_headers = []
-     for i, chunk in enumerate(chunks, 1):
-         chunk_text = chunk['text']
-         chunk_position = determine_chunk_position(chunk['metadata']['relative_position'])
-
-         chunk_header = f"""
-         Original Document: {file_name}
-         Full Document Summary: {full_summary}
-         Chunk: {i} of {total_chunks}
-         Position: {chunk_position}
-
-         --- Chunk Content ---
-         """
-
-         full_chunk_text = chunk_header + chunk_text
-         chunk['text'] = full_chunk_text
-         chunk['metadata']['file_name'] = file_name
-         chunked_text_with_headers.append(chunk)
-
-     return chunked_text_with_headers
-
-
- def create_openai_embedding(text: str, model: str) -> List[float]:
-     embedding = get_openai_embeddings(text, model)
-     return embedding
-
-
-
- #
- # End of File.
- #######################################################################################################################
 
+ # Embeddings_Create.py
+ # Description: Functions for Creating and managing Embeddings in ChromaDB with LLama.cpp/OpenAI/Transformers
+ #
+ # Imports:
+ import logging
+ from typing import List, Dict, Any
+
+ import numpy as np
+ #
+ # 3rd-Party Imports:
+ import requests
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ #
+ # Local Imports:
+ from App_Function_Libraries.LLM_API_Calls import get_openai_embeddings
+ from App_Function_Libraries.Summarization_General_Lib import summarize
+ from App_Function_Libraries.Utils.Utils import load_comprehensive_config
+ from App_Function_Libraries.Chunk_Lib import chunk_options, improved_chunking_process#, determine_chunk_position
+ #
+ #
+ #######################################################################################################################
+ #
+ # Functions:
+
+ # FIXME - Add all globals to summarize.py
+ loaded_config = load_comprehensive_config()
+ embedding_provider = loaded_config['Embeddings']['embedding_provider']
+ embedding_model = loaded_config['Embeddings']['embedding_model']
+ embedding_api_url = loaded_config['Embeddings']['embedding_api_url']
+ embedding_api_key = loaded_config['Embeddings']['embedding_api_key']
+
+ # Embedding Chunking Settings
+ chunk_size = loaded_config['Embeddings']['chunk_size']
+ overlap = loaded_config['Embeddings']['overlap']
+
+
+ # FIXME - Add logging
+
+ # FIXME - refactor/setup to use config file & perform chunking
+ def create_embedding(text: str, provider: str, model: str, api_url: str = None, api_key: str = None) -> List[float]:
+     try:
+         if provider == 'openai':
+             embedding = get_openai_embeddings(text, model)
+         elif provider == 'local':
+             embedding = create_local_embedding(text, model, api_url, api_key)
+         elif provider == 'huggingface':
+             embedding = create_huggingface_embedding(text, model)
+         elif provider == 'llamacpp':
+             embedding = create_llamacpp_embedding(text, api_url)
+         else:
+             raise ValueError(f"Unsupported embedding provider: {provider}")
+
+         if isinstance(embedding, np.ndarray):
+             embedding = embedding.tolist()
+         elif isinstance(embedding, torch.Tensor):
+             embedding = embedding.detach().cpu().numpy().tolist()
+
+         return embedding
+
+     except Exception as e:
+         logging.error(f"Error creating embedding: {str(e)}")
+         raise
+
+
+ def create_huggingface_embedding(text: str, model: str) -> List[float]:
+     tokenizer = AutoTokenizer.from_pretrained(model)
+     model = AutoModel.from_pretrained(model)
+
+     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+     with torch.no_grad():
+         outputs = model(**inputs)
+
+     embeddings = outputs.last_hidden_state.mean(dim=1)
+     return embeddings[0].tolist()
+
+
+ # FIXME
+ def create_stella_embeddings(text: str) -> List[float]:
+     if embedding_provider == 'local':
+         # Load the model and tokenizer
+         tokenizer = AutoTokenizer.from_pretrained("dunzhang/stella_en_400M_v5")
+         model = AutoModel.from_pretrained("dunzhang/stella_en_400M_v5")
+
+         # Tokenize and encode the text
+         inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+
+         # Generate embeddings
+         with torch.no_grad():
+             outputs = model(**inputs)
+
+         # Use the mean of the last hidden state as the sentence embedding
+         embeddings = outputs.last_hidden_state.mean(dim=1)
+
+         return embeddings[0].tolist()  # Convert to list for consistency
+     elif embedding_provider == 'openai':
+         return get_openai_embeddings(text, embedding_model)
+     else:
+         raise ValueError(f"Unsupported embedding provider: {embedding_provider}")
+
+
+ def create_llamacpp_embedding(text: str, api_url: str) -> List[float]:
+     response = requests.post(
+         api_url,
+         json={"input": text}
+     )
+     response.raise_for_status()
+     return response.json()['embedding']
+
+
+ def create_local_embedding(text: str, model: str, api_url: str, api_key: str) -> List[float]:
+     response = requests.post(
+         api_url,
+         json={"text": text, "model": model},
+         headers={"Authorization": f"Bearer {api_key}"}
+     )
+     response.raise_for_status()
+     return response.json().get('embedding', None)
+
+
+ def chunk_for_embedding(text: str, file_name: str, api_name, custom_chunk_options: Dict[str, Any] = None) -> List[Dict[str, Any]]:
+     options = chunk_options.copy()
+     if custom_chunk_options:
+         options.update(custom_chunk_options)
+
+
+     # FIXME
+     if api_name is not None:
+         # Generate summary of the full document
+         full_summary = summarize(text, None, api_name, None, None, None)
+     else:
+         full_summary = "Full document summary not available."
+
+     chunks = improved_chunking_process(text, options)
+     total_chunks = len(chunks)
+
+     chunked_text_with_headers = []
+     for i, chunk in enumerate(chunks, 1):
+         chunk_text = chunk['text']
+         chunk_position = 1#DIRTY HACK #determine_chunk_position(chunk['metadata']['relative_position'])
+
+         chunk_header = f"""
+         Original Document: {file_name}
+         Full Document Summary: {full_summary}
+         Chunk: {i} of {total_chunks}
+         Position: {chunk_position}
+
+         --- Chunk Content ---
+         """
+
+         full_chunk_text = chunk_header + chunk_text
+         chunk['text'] = full_chunk_text
+         chunk['metadata']['file_name'] = file_name
+         chunked_text_with_headers.append(chunk)
+
+     return chunked_text_with_headers
+
+
+ def create_openai_embedding(text: str, model: str) -> List[float]:
+     embedding = get_openai_embeddings(text, model)
+     return embedding
+
+
+
+ #
+ # End of File.
+ #######################################################################################################################
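
For orientation, a minimal usage sketch of the two entry points this file defines, chunk_for_embedding and create_embedding. The document text, file name, and model id below are illustrative assumptions, not values taken from the repository:

    # Hypothetical usage; assumes the App_Function_Libraries package is importable
    # and its comprehensive config file is present.
    from App_Function_Libraries.RAG.Embeddings_Create import (
        chunk_for_embedding,
        create_embedding,
    )

    document_text = "Some long document to be chunked and embedded..."

    # api_name=None skips the full-document summary step.
    chunks = chunk_for_embedding(document_text, "example.txt", api_name=None)

    # Embed each chunk via the Hugging Face provider path.
    vectors = [
        create_embedding(
            chunk["text"],
            provider="huggingface",
            model="sentence-transformers/all-MiniLM-L6-v2",  # assumed model id
        )
        for chunk in chunks
    ]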
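
The new revision stubs out the determine_chunk_position call with a hard-coded placeholder; the real helper lives in App_Function_Libraries.Chunk_Lib and is not shown in this diff. As a hedged sketch only, a helper with that signature plausibly maps a relative position in [0, 1] to a readable label:

    # Hypothetical reconstruction; the actual Chunk_Lib implementation may differ.
    def determine_chunk_position(relative_position: float) -> str:
        # relative_position is assumed to lie in [0.0, 1.0].
        if relative_position < 0.33:
            return "beginning of the document"
        elif relative_position < 0.66:
            return "middle of the document"
        else:
            return "end of the document"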
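
Both Transformers-based paths pool with a plain outputs.last_hidden_state.mean(dim=1), which averages padding positions along with real tokens whenever padding is applied. A common alternative, shown here as a sketch rather than as part of this commit, is an attention-mask-weighted mean (the model id is again an assumption):

    # Sketch of attention-mask-aware mean pooling; not part of this commit.
    import torch
    from transformers import AutoTokenizer, AutoModel

    model_id = "sentence-transformers/all-MiniLM-L6-v2"  # assumed model id
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModel.from_pretrained(model_id)

    inputs = tokenizer(["example text"], return_tensors="pt",
                       padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    mask = inputs["attention_mask"].unsqueeze(-1).float()   # (batch, seq_len, 1)
    summed = (outputs.last_hidden_state * mask).sum(dim=1)  # zero out padding
    counts = mask.sum(dim=1).clamp(min=1e-9)                # real-token counts
    embedding = (summed / counts)[0].tolist()               # mask-aware mean vector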