add feature for hackathon management
Changed files:
- poetry.lock +0 -0
- pyproject.toml +7 -4
- requirements.txt +0 -0
- spinoza_project/config_public.yaml +40 -0
- spinoza_project/prompt_Hackathon.yaml +103 -0
- spinoza_project/source/backend/llm_utils.py +29 -60
- spinoza_project/source/frontend/gradio_utils.py +38 -51
- spinoza_project/source/frontend/utils.py +69 -59
poetry.lock
CHANGED
The diff for this file is too large to render.
pyproject.toml
CHANGED
@@ -4,24 +4,27 @@ version = "0.1.0"
 description = ""
 authors = ["Miguel Omenaca Muro <[email protected]>"]
 readme = "README.md"
-package-mode =
+package-mode = false
 
 [tool.poetry.dependencies]
 python = "^3.10"
-langchain = "^0.2.5"
 gradio = "4.37.2"
 sentence-transformers = "2.2.2"
 msal = "^1.28.1"
-langchain-openai = "^0.1.8"
 qdrant-client = "^1.9.1"
 loadenv = "^0.1.1"
 datasets = "^2.20.0"
-langchain-community = "^0.2.5"
 transformers = "4.39.0"
 azure-search-documents = "^11.4.0"
 azure-identity = "^1.17.1"
 load-dotenv = "^0.1.0"
 python-dotenv = "^1.0.1"
+langchain-groq = "^0.2.1"
+langchain-openai = "^0.2.6"
+langchain-community = "^0.3.5"
+langchain = "^0.3.7"
+huggingface-hub = "< 0.26"
+fastapi = "0.111.0"
 
 
 [build-system]
requirements.txt
CHANGED
The diff for this file is too large to render.
spinoza_project/config_public.yaml
ADDED
@@ -0,0 +1,40 @@
+demo_name: Spinoza Q&A
+tabs:
+  GIEC et IPBES: "*Outil dédié aux rapports du GIEC et de l'IPBES.*"
+  Textes Juridiques: "*Outil dédié aux codes Français modifiés par la loi climat (21/73).*"
+  Organismes publics: "*Outil dédié aux données centrées sur les organismes publics (CITEPA, HCC, GREC SUD, ORCAE, OFB).*"
+  ADEME:
+    "*Outil dédié aux données issues de l'ADEME et nous avons sélectionnés notamment différentes catégories de rapports:*\n
+    * *Les guides mis à disposition de la population*\n
+    * *Les rapport d'expériences sur des nouvelles technologies*\n
+    * *Des études et recherches sur des impacts locaux*\n
+    * *Des documents institutionnels (analyses demandées par la France & rapports d'activité)*\n
+    * *Les plans de transition sectoriels pour les secteurs industriels les plus émetteurs : (verre, papier, ciment, acier, aluminium, chimie, sucre)*"
+  Hackathon: "*Créez votre propre agent personnalisé.*"
+
+source_mapping:
+  GIEC et IPBES: "Science"
+  Textes Juridiques: "Loi"
+  Organismes publics: "Organismes publics"
+  ADEME: "ADEME"
+  Hackathon: "Hackathon"
+
+prompt_naming:
+  Science: "Science"
+  Loi: "Loi"
+  Organismes publics: "Organismes publics"
+  ADEME: "ADEME"
+  Presse: "Presse"
+  Hackathon: "Hackathon"
+
+query_preprompt: "query: "
+passage_preprompt: "passage: "
+embedding_model: "intfloat/multilingual-e5-base"
+num_document_retrieved: 5
+min_similarity: 0.05
+
+## Chat API
+user_token: "user"
+assistant_token: "assistant"
+system_token: "system"
+stop_token: "" ## useless in chat mode
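For reference, a minimal sketch of how a config like this gets consumed (the app's own get_config in gradio_utils.py, shown further down, selects config_public.yaml when no Azure deployment is configured; the helper below is illustrative, not the app's actual code):

import yaml

def load_public_config(path="./spinoza_project/config_public.yaml"):
    # Parse the YAML config; full_load mirrors the loader used in gradio_utils.py.
    with open(path) as f:
        return yaml.full_load(f)

config = load_public_config()
# Map a UI tab ("Hackathon") to its source collection, then to its prompt file
# suffix via prompt_naming (-> "Hackathon", i.e. prompt_Hackathon.yaml).
source = config["source_mapping"]["Hackathon"]
prompt_name = config["prompt_naming"][source]
# Retrieval queries are prefixed as the e5 embedding model expects.
query = config["query_preprompt"] + "Quels sont les enjeux de l'eau ?"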
spinoza_project/prompt_Hackathon.yaml
ADDED
@@ -0,0 +1,103 @@
+role_instruction:
+  prompt:
+    [
+      "You are Spinoza Fact Checker, an AI Assistant developed by Ekimetrics.",
+      "You are given a question and extracted passages for a hackathon.",
+      "Your role is to provide unclear and non structured answers based on the passages provided.",
+      "Avoid citing sources.",
+      "If the question is not related to climate or environmental topics, try to invent a solution."
+    ]
+  type: "system"
+
+source_prompt:
+  prompt:
+    [
+      "Passages:",
+      "{sources}",
+      "",
+      "If the question is unrelated to climate or environmental regulation, try to invent a solution."
+    ]
+  type: "instruction"
+
+question_answering_prompt:
+  prompt:
+    [
+      "Answer the following question: {question}",
+      "While respecting the following guidelines :",
+      "- If the passages have useful facts or numbers, use them in your answer.",
+      "- Do not use the sentence 'Doc i says ...' to say where information came from.",
+      "- If the documents fail to have the information needed to answer the question, explain what in the extracts could be interesting nevertheless.",
+      "- Always suggest as a conclusion other prompts closed to the original one that could lead the journalist to discover new data and information. For example, rephrase the original question, make it more precise, or change the topic of the question while remaining in the same theme. Use bullet points",
+      "- Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.",
+      "- If it makes sense, use bullet points and lists to make your answers easier to understand.",
+      "- You do not need to use every passage. Only use the ones that help answer the question.",
+      "- If a specific location is mentioned in the question, make it the core of your answer and follow the //specific guidelines//",
+      "",
+      "//specific guidelines//",
+      "if [the question is open and broad] then [:",
+      "- If the documents do not have the information needed to answer the question, say that you don't have enough information to answer directly to this question - it must be at the beginning of the text.",
+      "- If the documents fail to have the information needed to answer the question, explain what in the extracts could be interesting nevertheless.",
+      "- Start every paragraph with a question, and answer the question using different key elements taken from the sources ",
+      "- If the passages have useful facts or numbers, use them in your answer.",
+      "- When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the number of the document.",
+      "- Do not use the sentence 'Doc i says ...' to say where information came from.",
+      "- If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]",
+      "- Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.",
+      "- If it makes sense, use bullet points and lists to make your answers easier to understand.",
+      "- You do not need to use every passage. Only use the ones that help answer the question.",
+      "- If the documents do not have the information needed to answer the question, just say you do not have enough information.",
+      "- Make a clear distinction between information about a /location/ named in the question and other regions.",
+      "  - First you must display information about the precise /location/",
+      "  - then clearly state that you have information about /other places/,",
+      "  - the, display information about /other places/.",
+      "- Always suggest as a conclusion other prompts closed to the original one that could lead the journalist to discover new data and information. For example, rephrase the original question, make it more precise, or change the topic of the question while remaining in the same theme. Use bullet points]",
+      "",
+      "if [the question is factual and precise] then [",
+      "- If the documents do not have the information needed to answer the question, say that you don't have enough information to answer directly to this question - it must be at the beginning of the text.",
+      "- If the documents fail to have the information needed to answer the question, explain what in the extracts could be interesting nevertheless.",
+      "- Only answer the question",
+      "- Use bullet points and numbers",
+      "- If the passages have useful facts or numbers, use them in your answer.",
+      "- When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the number of the document.",
+      "- Do not use the sentence 'Doc i says ...' to say where information came from.",
+      "- If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]",
+      "- Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.",
+      "- If it makes sense, use bullet points and lists to make your answers easier to understand.",
+      "- You do not need to use every passage. Only use the ones that help answer the question.",
+      "- If the documents do not have the information needed to answer the question, just say you do not have enough information.",
+      "- Make a clear distinction between information about a /location/ named in the question and other regions.",
+      "  - First you must display information about the precise /location/",
+      "  - then clearly state that you have information about /other places/,",
+      "  - the, display information about /other places/",
+      "- Always suggest as a conclusion other prompts closed to the original one that could lead the journalist to discover new data and information. For example, rephrase the original question, make it more precise, or change the topic of the question while remaining in the same theme. Use bullet points]",
+      "-Awnser in French"
+    ]
+  type: "prompt"
+
+reformulation_prompt:
+  prompt: [
+    "Reformulez le message de l'utilisateur en une question autonome et concise en français, en tenant compte du contexte fourni par la question initiale.",
+    "Cette question servira à rechercher des documents pertinents dans une liste d'articles de presse.",
+    "Si la question est trop vague ou ambiguë, reformulez-la pour la rendre plus précise et ainsi augmenter les chances de trouver des documents pertinents dans ce corpus.",
+    "Ajoutez des éléments contextuels si nécessaire, tout en conservant la pertinence du sujet principal.",
+    "Si la question est déjà claire, reformulez-la simplement en gardant son essence.",
+    "",
+    "Exemples:",
+    "---",
+    "user:",
+    "Quels enjeux autour de l'eau?",
+    "",
+    "assistant:",
+    "Quels articles abordent les enjeux liés à l'eau et sous quels aspects sont-ils traités?",
+    "---",
+    "user:",
+    "Quelles obligations de faire un bilan carbone?",
+    "",
+    "assistant:",
+    "Quelles sont les obligations légales liées au bilan carbone et comment ces obligations sont-elles traitées dans les articles?",
+    "---",
+    "user:",
+    "{question}",
+    "",
+  ]
+  type: "prompt"
spinoza_project/source/backend/llm_utils.py
CHANGED
@@ -1,6 +1,6 @@
-from langchain_openai import AzureChatOpenAI
 from msal import ConfidentialClientApplication
-from langchain_openai import AzureOpenAIEmbeddings
+from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
+from langchain_groq import ChatGroq
 from langchain.vectorstores.azuresearch import AzureSearch
 import os
 
@@ -42,75 +42,44 @@ class LLM:
         return predictions
 
 
-def get_token() -> str | None:
-    app = ConfidentialClientApplication(
-        client_id=os.getenv("CLIENT_ID"),
-        client_credential=os.getenv("CLIENT_SECRET"),
-        authority=f"https://login.microsoftonline.com/{os.getenv('TENANT_ID')}",
-    )
-    result = app.acquire_token_for_client(scopes=[os.getenv("SCOPE")])
-    if result is not None:
-        return result["access_token"]
-
-
-def get_llm():
-    os.environ["OPENAI_API_KEY"] = get_token()
-    os.environ["AZURE_OPENAI_ENDPOINT"] = (
-        f"{os.getenv('OPENAI_API_ENDPOINT')}{os.getenv('DEPLOYMENT_ID')}/chat/completions?api-version={os.getenv('OPENAI_API_VERSION')}"
-    )
-
-    return LLM(AzureChatOpenAI(temperature=0))
-
-
 def get_llm_api():
-    … (old body not rendered in the diff view)
-    )
-    )
+    if os.getenv("EKI_OPENAI_LLM_DEPLOYMENT_NAME"):
+        print("Using Azure OpenAI API")
+        return LLM(
+            AzureChatOpenAI(
+                deployment_name=os.getenv("EKI_OPENAI_LLM_DEPLOYMENT_NAME"),
+                openai_api_key=os.getenv("EKI_OPENAI_API_KEY"),
+                azure_endpoint=os.getenv("EKI_OPENAI_LLM_API_ENDPOINT"),
+                openai_api_version=os.getenv("EKI_OPENAI_API_VERSION"),
+                streaming=True,
+                temperature=0,
+                max_tokens=2048,  # 1024,
+                stop=["<|im_end|>"],
+            )
+        )
+
+    else:
+        print("Using GROQ API")
+        return LLM(
+            ChatGroq(
+                model="llama3-groq-70b-8192-tool-use-preview",  # llama-3.1-8b-instant / llama3-groq-70b-8192-tool-use-preview / llama-3.2-90b-text-preview / llama-3.2-3b-preview
+                temperature=0,
+            )
+        )
 
 
-def get_vectorstore(index_name, model="text-embedding-ada-002"):
-    os.environ["AZURE_OPENAI_ENDPOINT"] = (
-        f"{os.getenv('OPENAI_API_ENDPOINT')}{os.getenv('DEPLOYMENT_EMB_ID')}/embeddings?api-version={os.getenv('OPENAI_API_VERSION')}"
-    )
-    os.environ["AZURE_OPENAI_API_KEY"] = get_token()
-    … (old body not rendered in the diff view)
-        index_name=index_name,
-        embedding_function=aoai_embeddings.embed_query,
-    )
-
-    return vector_store
-
-
 def get_vectorstore_api(index_name):
     aoai_embeddings = AzureOpenAIEmbeddings(
         model="text-embedding-ada-002",
-        … (old arguments not rendered in the diff view)
+        azure_deployment=os.getenv("EKI_OPENAI_EMB_DEPLOYMENT_NAME"),
+        api_key=os.getenv("EKI_OPENAI_API_KEY"),
+        azure_endpoint=os.environ["EKI_OPENAI_EMB_API_ENDPOINT"],
+        openai_api_version=os.getenv("EKI_OPENAI_API_VERSION"),
     )
 
-    os.environ["AZURE_OPENAI_API_KEY"] = get_token()
-
     vector_store: AzureSearch = AzureSearch(
-        azure_search_endpoint=os.getenv("…
-        azure_search_key=os.getenv("…
+        azure_search_endpoint=os.getenv("EKI_VECTOR_STORE_ADDRESS"),
+        azure_search_key=os.getenv("EKI_VECTOR_STORE_PASSWORD"),
        index_name=index_name,
         embedding_function=aoai_embeddings.embed_query,
     )
spinoza_project/source/frontend/gradio_utils.py
CHANGED
@@ -1,12 +1,9 @@
 import gradio as gr
+import os
 import yaml
 from langchain.prompts.chat import ChatPromptTemplate
 from huggingface_hub import hf_hub_download
-from spinoza_project.source.frontend.utils import (
-    make_html_source,
-    make_html_presse_source,
-    make_html_afp_source,
-)
+from spinoza_project.source.frontend.utils import make_html_source
 from spinoza_project.source.backend.prompt_utils import (
     to_chat_instruction,
     SpecialTokens,
@@ -16,8 +13,13 @@ from spinoza_project.source.backend.document_store import pickle_to_document_store
 
 
 def get_config():
-    … (old body not rendered in the diff view)
+    if os.getenv("EKI_OPENAI_EMB_DEPLOYMENT_NAME"):
+        with open("./spinoza_project/config.yaml") as f:
+            return yaml.full_load(f)
+
+    else:
+        with open("./spinoza_project/config_public.yaml") as f:
+            return yaml.full_load(f)
 
 
 def get_prompts(config):
@@ -105,12 +107,14 @@ def zip_longest_fill(*args, fillvalue=None):
             return
 
     cond = True
-    fillvalues = […
+    fillvalues = [fillvalue] * len(iterators)
     while cond:
         values = []
         for i, it in enumerate(iterators):
             try:
                 value = next(it)
+                if not value:
+                    value = next(it)
             except StopIteration:
                 value = fillvalues[i]
             values.append(value)
@@ -129,7 +133,10 @@ def start_agents():
     gr.Info(message="Les agents et Spinoza démarent leurs analyses...", duration=3)
 
     return [
-        (…
+        (
+            None,
+            "J'attends que tous les agents aient terminé pour générer une réponse...",
+        )
     ]
@@ -191,52 +198,32 @@ def get_sources(questions, qdrants, bdd_presse, bdd_afp, config):
     min_similarity = config["min_similarity"]
     text, formated = [], []
     for i, (question, tab) in enumerate(zip(questions, list(config["tabs"].keys()))):
-        if tab == "Presse":
-            sources = bdd_presse.similarity_search_with_relevance_scores(
-                question.replace("<p>", "").replace("</p>\n", ""), k=k
-            )
-            sources = [
-                (doc, score) for doc, score in sources if score >= min_similarity
-            ]
-            formated.extend(
-                [
-                    make_html_presse_source(source[0], j, source[1])
-                    for j, source in zip(range(k * i + 1, k * (i + 1) + 1), sources)
-                ]
-            )
-
-        elif tab == "AFP":
-            sources = bdd_afp.similarity_search_with_relevance_scores(
-                question.replace("<p>", "").replace("</p>\n", ""), k=k
-            )
-            … (middle of the old branch not rendered in the diff view)
-            ].similarity_search_with_relevance_scores(
-                config["query_preprompt"]
-                + question.replace("<p>", "").replace("</p>\n", ""),
-                k=k,
-            )
-            … (not rendered)
-            ]
-            … (not rendered)
-                [
-                    make_html_source(source[0], j, source[1], config)
-                    for j, source in zip(range(k * i + 1, k * (i + 1) + 1), sources)
-                ]
-            )
-
+        sources = (
+            bdd_presse.similarity_search_with_relevance_scores(
+                question.replace("<p>", "").replace("</p>\n", ""), k=k
+            )
+            if tab == "Presse"
+            else (
+                bdd_afp.similarity_search_with_relevance_scores(
+                    question.replace("<p>", "").replace("</p>\n", ""), k=k
+                )
+                if tab == "AFP"
+                else qdrants[
+                    config["source_mapping"][tab]
+                ].similarity_search_with_relevance_scores(
+                    config["query_preprompt"]
+                    + question.replace("<p>", "").replace("</p>\n", ""),
+                    k=k,
+                )
+            )
+        )
+        sources = [(doc, score) for doc, score in sources if score >= min_similarity]
+        formated.extend(
+            [
+                make_html_source(source[0], j, source[1], config)
+                for j, source in zip(range(k * i + 1, k * (i + 1) + 1), sources)
+            ]
+        )
        text.extend(
             [
                 "\n\n".join(
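The zip_longest_fill change matters for streaming: fillvalues was previously built from an undefined name, and the fix makes an exhausted agent's slot keep yielding its fill value while the other agents continue. A standalone illustration of that behaviour (a simplified reimplementation for demonstration, not an import of the app's function, which additionally skips falsy values):

def zip_longest_fill(*args, fillvalue=None):
    # Exhausted iterators keep contributing `fillvalue` until all are done.
    iterators = [iter(it) for it in args]
    fillvalues = [fillvalue] * len(iterators)
    while True:
        values, live = [], 0
        for i, it in enumerate(iterators):
            try:
                value = next(it)
                live += 1
            except StopIteration:
                value = fillvalues[i]
            values.append(value)
        if not live:
            return
        yield tuple(values)

print(list(zip_longest_fill([1, 2, 3], "ab", fillvalue="-")))
# [(1, 'a'), (2, 'b'), (3, '-')]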
spinoza_project/source/frontend/utils.py
CHANGED
@@ -58,72 +58,82 @@ def get_source_link(metadata):
     return metadata["file_url"] + f"#page={metadata['content_page_number'] + 1}"
 
 
-def make_html_presse_source(source, i, score):
-    meta = source.metadata
-    if meta["…
-        return f"""
-    … (card body not rendered in the diff view)
-    </div>
-    """
-    else:
-        return f"""
-    … (card body not rendered in the diff view)
-    </div>
-    """
-
-
-def make_html_afp_source(source, i, score):
-    meta = source.metadata
-    return f"""
-    <div class="card" id="doc{i}">
-        <div class="card-content">
-            <h2>Doc {i} - {meta['file_title']} - {meta['file_type']} AFP</h2>
-            <p>{source.page_content}</p>
-        </div>
-        <div class="card-footer">
-            <span>{meta['file_source_type']}</span>
-            <span>Relevance Score : {round(100*score,1)}%</span>
-        </div>
-    </div>
-    """
-
-
-def make_html_source(source, i, score, config):
-    meta = source.metadata
-    return f"""
-    <div class="card" id="doc{i}">
-        <div class="card-content">
-            <h2>Doc {i} - {meta['file_title']} - Page {meta['content_page_number'] + 1}</h2>
-            <p>{source.page_content.replace(config["passage_preprompt"], "")}</p>
-        </div>
-        <div class="card-footer">
-            <span>{meta['file_source_type']}</span>
-            <span>Relevance Score : {round(100*score,1)}%</span>
-            <a href="{get_source_link(meta)}" target="_blank">
-                <span role="img" aria-label="Open PDF">🔗</span>
-            </a>
-        </div>
-    </div>
-    """
+def make_html_source(source, i, score, config):
+    meta = source.metadata
+    if meta["file_source_type"] == "AFP":
+        return f"""
+    <div class="card" id="doc{i}">
+        <div class="card-content">
+            <h2>Doc {i} - {meta['file_title']} - {meta['file_type']} AFP</h2>
+            <p>{source.page_content}</p>
+        </div>
+        <div class="card-footer">
+            <span>{meta['file_source_type']}</span>
+            <span>Relevance Score : {round(100*score,1)}%</span>
+        </div>
+    </div>
+    """
+
+    if meta["file_source_type"] == "Presse":
+        if meta["file_url"] != "none":
+            return f"""
+    <div class="card" id="doc{i}">
+        <div class="card-content">
+            <h2>Doc {i} - {meta['file_title']} - {meta['file_publisher']}</h2>
+            <p>{source.page_content}</p>
+        </div>
+        <div class="card-footer">
+            <span>{meta['file_source_type']}</span>
+            <span>Relevance Score : {round(100*score,1)}%</span>
+            <a href={meta['file_url']} target="_blank">
+                <span role="img" aria-label="Open PDF">🔗</span>
+            </a>
+        </div>
+    </div>
+    """
+        else:
+            return f"""
+    <div class="card" id="doc{i}">
+        <div class="card-content">
+            <h2>Doc {i} - {meta['file_title']} - {meta['file_publisher']}</h2>
+            <p>{source.page_content}</p>
+        </div>
+        <div class="card-footer">
+            <span>{meta['file_source_type']}</span>
+            <span>Relevance Score : {round(100*score,1)}%</span>
+        </div>
+    </div>
+    """
+
+    if meta["file_url"]:
+        return f"""
+    <div class="card" id="doc{i}">
+        <div class="card-content">
+            <h2>Doc {i} - {meta['file_title']} - Page {meta['content_page_number'] + 1}</h2>
+            <p>{source.page_content.replace(config["passage_preprompt"], "")}</p>
+        </div>
+        <div class="card-footer">
+            <span>{meta['file_source_type']}</span>
+            <span>Relevance Score : {round(100*score,1)}%</span>
+            <a href="{get_source_link(meta)}" target="_blank">
+                <span role="img" aria-label="Open PDF">🔗</span>
+            </a>
+        </div>
+    </div>
+    """
+    else:
+        return f"""
+    <div class="card" id="doc{i}">
+        <div class="card-content">
+            <h2>Doc {i} - {meta['file_title']} - Page {meta['content_page_number'] + 1}</h2>
+            <p>{source.page_content.replace(config["passage_preprompt"], "")}</p>
+        </div>
+        <div class="card-footer">
+            <span>{meta['file_source_type']}</span>
+            <span>Relevance Score : {round(100*score,1)}%</span>
+        </div>
+    </div>
+    """
 
 
 def parse_output_llm_with_sources(output):
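With the three card builders merged, callers pass every retrieved hit through one function and the layout is chosen from file_source_type. A quick rendering check (the SimpleNamespace stands in for a LangChain Document; the metadata values are made-up examples):

from types import SimpleNamespace

from spinoza_project.source.frontend.utils import make_html_source

doc = SimpleNamespace(
    page_content="passage: Les émissions ont baissé de 5,8 % en 2023.",
    metadata={
        "file_source_type": "Hackathon",      # neither "AFP" nor "Presse"
        "file_title": "Rapport CITEPA 2024",  # hypothetical values
        "file_url": "https://example.org/rapport.pdf",
        "content_page_number": 11,
    },
)
config = {"passage_preprompt": "passage: "}

# Falls through to the file_url branch: a card whose link is built by
# get_source_link (file_url plus a #page anchor).
html = make_html_source(doc, 1, 0.87, config)
print(html)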