Spaces:
Sleeping
Sleeping
Upload RAG_OpenAI.py
Browse files | Code fonctionnel OpenAI (sans affichage des références bibliographiques, step 2 à terminer)
- RAG_OpenAI.py +8 -7
RAG_OpenAI.py
CHANGED
@@ -3,6 +3,7 @@ import numpy as np
|
|
3 |
import fitz # PyMuPDF pour extraction PDF
|
4 |
import faiss
|
5 |
import openai
|
|
|
6 |
from sklearn.manifold import TSNE
|
7 |
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
|
8 |
from dotenv import load_dotenv
|
@@ -12,7 +13,7 @@ load_dotenv()
|
|
12 |
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
|
13 |
|
14 |
# 📌 Initialisation du client OpenAI
|
15 |
-
|
16 |
model_embedding = "text-embedding-ada-002"
|
17 |
model_chat = "gpt-4-turbo"
|
18 |
|
@@ -33,11 +34,11 @@ def get_embeddings_in_batches(text_chunks, batch_size=5):
|
|
33 |
embeddings = []
|
34 |
for i in range(0, len(text_chunks), batch_size):
|
35 |
batch = text_chunks[i:i + batch_size]
|
36 |
-
response =
|
37 |
input=batch,
|
38 |
model=model_embedding
|
39 |
)
|
40 |
-
batch_embeddings = [data
|
41 |
embeddings.extend(batch_embeddings)
|
42 |
|
43 |
return np.array(embeddings).astype('float32')
|
@@ -55,11 +56,11 @@ index.add(embeddings)
|
|
55 |
# 📌 Récupération des chunks les plus pertinents
|
56 |
def retrieve_relevant_chunks(question, k=5):
|
57 |
"""Recherche les chunks les plus pertinents en fonction de la similarité des embeddings."""
|
58 |
-
response =
|
59 |
input=[question],
|
60 |
model=model_embedding
|
61 |
)
|
62 |
-
question_embedding = np.array(response
|
63 |
distances, indices = index.search(question_embedding, k)
|
64 |
return [chunked_docs[i] for i in indices[0]]
|
65 |
|
@@ -71,11 +72,11 @@ def generate_response(context, question):
|
|
71 |
{"role": "user", "content": question}
|
72 |
]
|
73 |
|
74 |
-
response =
|
75 |
model=model_chat,
|
76 |
messages=messages
|
77 |
)
|
78 |
-
return response
|
79 |
|
80 |
# 📌 Exécuter une requête utilisateur
|
81 |
user_question = "Quelles souches de poulet et poules se trouvent dans ce corpus de texte ?"
|
|
|
3 |
import fitz # PyMuPDF pour extraction PDF
|
4 |
import faiss
|
5 |
import openai
|
6 |
+
from openai import OpenAI
|
7 |
from sklearn.manifold import TSNE
|
8 |
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
|
9 |
from dotenv import load_dotenv
|
|
|
13 |
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
|
14 |
|
15 |
# 📌 Initialisation du client OpenAI
|
16 |
+
client = OpenAI(api_key=OPENAI_API_KEY)  # SECURITY: a live `sk-proj-…` API key was committed and published in this diff — revoke it immediately in the OpenAI dashboard and load the key from the environment (OPENAI_API_KEY is already read via os.getenv above)
|
17 |
model_embedding = "text-embedding-ada-002"
|
18 |
model_chat = "gpt-4-turbo"
|
19 |
|
|
|
34 |
embeddings = []
|
35 |
for i in range(0, len(text_chunks), batch_size):
|
36 |
batch = text_chunks[i:i + batch_size]
|
37 |
+
response = client.embeddings.create(
|
38 |
input=batch,
|
39 |
model=model_embedding
|
40 |
)
|
41 |
+
batch_embeddings = [data.embedding for data in response.data]
|
42 |
embeddings.extend(batch_embeddings)
|
43 |
|
44 |
return np.array(embeddings).astype('float32')
|
|
|
56 |
# 📌 Récupération des chunks les plus pertinents
|
57 |
def retrieve_relevant_chunks(question, k=5):
|
58 |
"""Recherche les chunks les plus pertinents en fonction de la similarité des embeddings."""
|
59 |
+
response = client.embeddings.create(
|
60 |
input=[question],
|
61 |
model=model_embedding
|
62 |
)
|
63 |
+
question_embedding = np.array(response.data[0].embedding).astype('float32').reshape(1, -1)
|
64 |
distances, indices = index.search(question_embedding, k)
|
65 |
return [chunked_docs[i] for i in indices[0]]
|
66 |
|
|
|
72 |
{"role": "user", "content": question}
|
73 |
]
|
74 |
|
75 |
+
response = client.chat.completions.create(
|
76 |
model=model_chat,
|
77 |
messages=messages
|
78 |
)
|
79 |
+
return response.choices[0].message.content
|
80 |
|
81 |
# 📌 Exécuter une requête utilisateur
|
82 |
user_question = "Quelles souches de poulet et poules se trouvent dans ce corpus de texte ?"
|