cdupland committed · Commit 2bec663 · 1 Parent(s): dc7c31e
First commit
- app.py +81 -2
- requirements.txt +8 -0
- vectore_store.py +206 -0
app.py
CHANGED
@@ -1,4 +1,83 @@
+import os
 import streamlit as st
+import time
 
-
-
+from langchain_openai import ChatOpenAI  # New import for OpenAI
+from langchain.prompts import PromptTemplate
+from pinecone import Pinecone, ServerlessSpec
+
+from vectore_store import get_retreive_answer
+
+
+PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
+index_name = os.environ.get("PINECONE_INDEX_NAME")
+
+pc = Pinecone(api_key=PINECONE_API_KEY)
+
+
+existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
+
+if index_name not in existing_indexes:
+    pc.create_index(
+        name=index_name,
+        dimension=3072,
+        metric="cosine",
+        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
+    )
+    while not pc.describe_index(index_name).status["ready"]:
+        time.sleep(1)
+
+index = pc.Index(index_name)
+
+# Create an OpenAI chat model
+llm = ChatOpenAI(model="gpt-4o-mini")
+
+# Prompt template for the model (includes the retrieved context)
+prompt = PromptTemplate(
+    input_variables=["question", "context"],
+    template="Voici le contexte : {context}\n\nVoici une question : {question}. Réponds de manière concise et claire."
+)
+
+# Chain combining the prompt and the LLM
+llm_chain = prompt | llm  # New pipe syntax replacing LLMChain
+
+# Initialize the history if it does not already exist
+if "history" not in st.session_state:
+    st.session_state["history"] = []
+
+# Streamlit interface
+st.title("Application LLM avec LangChain")
+st.write("Posez une question au modèle de langage et obtenez une réponse.")
+
+# Input field for the question
+question = st.text_input("Votre question :")
+
+# Button to submit the question
+if st.button("Envoyer"):
+    if question:
+
+        context = get_retreive_answer(index, namespace="chat", prompt=question)
+        if not context:
+            print("no context found for prompt reformatting")
+            context = []  # avoid iterating over False when retrieval fails
+
+        for chunk in context:
+            print(chunk.metadata)
+            chunk_name = chunk.metadata["filename"]
+
+        # Use the chain to get an answer
+        response = llm_chain.invoke({"question": question, "context": context})  # invoke() instead of run()
+
+        # Append the question and the answer to the history
+        st.session_state["history"].append({"question": question, "response": response.content})
+    else:
+        st.write("Veuillez poser une question.")
+
+# Display the exchange history
+if st.session_state["history"]:
+    st.write("### Historique des échanges")
+    for i, exchange in enumerate(st.session_state["history"]):
+        st.write(f"**Échange {i + 1}**")
+        st.write(f"**Question :** {exchange['question']}")
+        st.write(f"**Réponse :** {exchange['response']}")
+        st.write("---")
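For reference, a minimal standalone sketch of the prompt | llm (LCEL) composition used in app.py; the question string and the empty context are only illustrative, and OPENAI_API_KEY is assumed to be set in the environment:

from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate

prompt = PromptTemplate(
    input_variables=["question", "context"],
    template="Voici le contexte : {context}\n\nVoici une question : {question}.",
)
llm = ChatOpenAI(model="gpt-4o-mini")
chain = prompt | llm  # the prompt's output feeds straight into the model

# invoke() takes a dict with one entry per input variable
result = chain.invoke({"question": "Qu'est-ce que Pinecone ?", "context": ""})
print(result.content)  # the chat model returns a message; .content holds the text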
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+streamlit
+langchain
+langchain-core
+langchain-pinecone
+langchain-openai
+pinecone-notebooks
+pinecone-client[grpc]
+pandas
vectore_store.py
ADDED
@@ -0,0 +1,206 @@
+import os
+import json
+import pandas as pd
+from langchain_openai import OpenAIEmbeddings
+from langchain_pinecone import PineconeVectorStore
+from langchain_core.documents import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+def get_text_chunks(text):
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=500,  # the character length of the chunk
+        chunk_overlap=100,  # the character length of the overlap between chunks
+        length_function=len  # the length function - in this case, character length (aka the python len() fn.)
+    )
+    chunks = text_splitter.split_text(text)
+    return chunks
+
+
+def get_vectorstore_from_csv(csv_file, text_columns, namespace, index):
+    """
+    Function to process a CSV with multiple text columns and insert embeddings into Pinecone.
+
+    Parameters:
+    - csv_file: Path to the CSV file.
+    - text_columns: List of column names that contain text data to be embedded.
+    - namespace: Pinecone namespace to store the vectors.
+    - index: Pinecone index object.
+
+    Returns:
+    - dict with filename_id upon success or False on failure.
+    """
+    try:
+        # Load the CSV file into a DataFrame
+        df = pd.read_csv(csv_file)
+
+        # Ensure the specified text columns exist
+        for column in text_columns:
+            if column not in df.columns:
+                raise ValueError(f"'{column}' column not found in the CSV file")
+
+        # Initialize the OpenAI embedding model
+        embedding = OpenAIEmbeddings(model="text-embedding-3-large")  # Replace with your model choice
+        vector_store = PineconeVectorStore(index=index, embedding=embedding, namespace=namespace)
+
+        # Clean up filename (for unique ids)
+        filename = csv_file.split(".")[0]
+        clean_filename = filename.replace(" ", "_").replace("-", "_").replace(".", "_").replace("/", "_").replace("\\", "_").strip()
+
+        # Prepare documents and UUIDs
+        documents = []
+        uuids = []
+        combined_text = " "
+
+        # Iterate through each row in the CSV and gather the text columns
+        for _, row in df.iterrows():
+            # Concatenate text from the specified columns (with a separator between rows)
+            combined_text += " ".join(str(row[col]) for col in text_columns) + " "
+
+        text_chunks = get_text_chunks(combined_text)
+
+        print(text_chunks)
+
+        for i, chunk in enumerate(text_chunks):
+
+            # Create a Document object for each combined text chunk
+            document = Document(
+                page_content=chunk,
+                metadata={"filename": csv_file, "filename_id": clean_filename}
+            )
+
+            # Generate a unique ID for each document (chunk)
+            uuid = f"{clean_filename}_{i}"
+            uuids.append(uuid)
+            documents.append(document)
+
+        # Insert the documents into Pinecone
+        vector_store.add_documents(documents=documents, ids=uuids)
+
+        # Return the filename ID after successful insertion
+        return {"filename_id": clean_filename}
+
+    except Exception as e:
+        print(f"Error: {e}")
+        return False
+
+
+def get_vectorstore(text_chunks, filename, namespace, index):
+    try:
+        embedding = OpenAIEmbeddings(model="text-embedding-3-large")
+        vector_store = PineconeVectorStore(index=index, embedding=embedding, namespace=namespace)
+
+        clean_filename = filename.split(".")[0].replace(" ", "_").replace("-", "_").replace(".", "_").replace("/", "_").replace("\\", "_").strip()
+
+        documents = []
+        uuids = []
+
+        for i, chunk in enumerate(text_chunks):
+
+            document = Document(
+                page_content=chunk,
+                metadata={"filename": filename, "filename_id": clean_filename},
+            )
+
+            uuid = f"{clean_filename}_{i}"
+            uuids.append(uuid)
+            documents.append(document)
+
+        vector_store.add_documents(documents=documents, ids=uuids)
+
+        return {"filename_id": clean_filename}
+
+    except Exception as e:
+        print(e)
+        return False
+
+def get_vectorstore_from_json(json_file, namespace, index):
+    """
+    Function to process a JSON file and insert embeddings into Pinecone.
+
+    Parameters:
+    - json_file: Path to the JSON file.
+    - namespace: Pinecone namespace to store the vectors.
+    - index: Pinecone index object.
+
+    Returns:
+    - dict with filename_id upon success or False on failure.
+    """
+    try:
+        # Load the JSON file into a list of dictionaries
+        with open(json_file, 'r', encoding='utf-8') as file:
+            data = json.load(file)
+
+        # print(data)
+        # Initialize the OpenAI embedding model
+        embedding = OpenAIEmbeddings(model="text-embedding-3-large")  # Replace with your model choice
+        vector_store = PineconeVectorStore(index=index, embedding=embedding, namespace=namespace)
+
+        # Clean up the filename (for unique IDs)
+        filename = json_file.split(".")[0]
+        clean_filename = filename.replace(" ", "_").replace("-", "_").replace(".", "_").replace("/", "_").replace("\\", "_").strip()
+
+        # Prepare documents and UUIDs
+        documents = []
+        uuids = []
+
+        # # Iterate over each JSON entry
+        # for i, entry in enumerate(data):
+        #     # Concatenate all the properties of the entry
+        #     combined_text = " ".join(str(value) for value in entry.values())
+
+        text_chunks = get_text_chunks(json.dumps(data, indent=4))
+
+        for j, chunk in enumerate(text_chunks):
+            # Create a Document object for each text chunk
+            document = Document(
+                page_content=chunk,
+                metadata={"filename": json_file, "filename_id": clean_filename, "chunk_index": j}  # include the chunk index in the metadata
+            )
+
+            # Generate a unique ID for each document (chunk)
+            uuid = f"{clean_filename}_{j}"  # append the chunk index to the unique ID
+            uuids.append(uuid)
+            documents.append(document)
+
+        # Insert the documents into Pinecone
+        vector_store.add_documents(documents=documents, ids=uuids)
+
+        # Return the file ID after successful insertion
+        return {"filename_id": clean_filename}
+
+    except Exception as e:
+        print(f"Error: {e}")
+        return False
+
+
+def get_retreive_answer(index, namespace, prompt):
+    try:
+
+        embedding = OpenAIEmbeddings(model="text-embedding-3-large")
+        vector_store_chat = PineconeVectorStore(index=index, embedding=embedding, namespace=namespace)
+
+        retriever_chat = vector_store_chat.as_retriever(
+            search_type="similarity_score_threshold",
+            search_kwargs={"k": 3, "score_threshold": 0.6},
+        )
+
+        # NOTE: this second store queries the same index and namespace as the first
+        vector_store_tic = PineconeVectorStore(index=index, embedding=embedding, namespace=namespace)
+
+        retriever_tic = vector_store_tic.as_retriever(
+            search_type="similarity_score_threshold",
+            search_kwargs={"k": 3, "score_threshold": 0.6},
+        )
+
+        response = retriever_chat.invoke(prompt) + retriever_tic.invoke(prompt)
+
+        if response:
+            print("found enterprise context")
+        else:
+            print("no enterprise context")
+
+        return response
+
+    except Exception as e:
+        print(e)
+        return False
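For reference, a hypothetical end-to-end use of these helpers, assuming the Pinecone index already exists and a file data.csv with a text column (both names are illustrative, not part of this commit):

import os
from pinecone import Pinecone

from vectore_store import get_vectorstore_from_csv, get_retreive_answer

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index(os.environ["PINECONE_INDEX_NAME"])

# Embed the CSV rows and upsert the chunks under the "chat" namespace
result = get_vectorstore_from_csv("data.csv", ["text"], namespace="chat", index=index)
if result:
    print(f"Ingested as {result['filename_id']}")

# Retrieve the chunks most similar to a question
docs = get_retreive_answer(index, namespace="chat", prompt="Quels sont les horaires ?")
for doc in docs or []:
    print(doc.metadata["filename"], doc.page_content[:80])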