[email protected] committed
Commit: b10792b
1 Parent(s): 2d6124c

feat: Multiple documents & audit
Files changed:
- app.py +1 -0
- pages/documents.py +35 -2
- rag.py +9 -5
- requirements.txt +2 -1
- utils/document.py +108 -0
app.py
CHANGED
@@ -29,6 +29,7 @@ def init_app():
     st.session_state["messages"] = []
     st.session_state["assistant"] = Rag()
     # st.session_state["data_dict"] = config['variables']
+    st.session_state["files"] = []
     st.session_state["prompt_system"] = config['prompt_system']
     st.session_state["chapters"] = config['chapters']
 
pages/documents.py
CHANGED
@@ -1,15 +1,19 @@
 import os
 import tempfile
 import streamlit as st
+from utils.document import audit_descriptif_pdf
 
 def read_and_save_file():
-    st.session_state["messages"] = []
-    st.session_state["user_input"] = ""
 
     for file in st.session_state["file_uploader"]:
         with tempfile.NamedTemporaryFile(delete=False) as tf:
             tf.write(file.getbuffer())
             file_path = tf.name
+        if not any(f["name"] == file.name for f in st.session_state["files"]):
+            st.session_state["files"].append({
+                "name": file.name,
+                "audit": audit_descriptif_pdf(file)["audit"]
+            })
 
     with st.session_state["ingestion_spinner"], st.spinner(f"Chargement {file.name}"):
         st.session_state["assistant"].ingest(file_path)
@@ -20,6 +24,9 @@ def read_and_save_file():
 def page():
     st.subheader("Charger vos documents")
 
+    if "files" not in st.session_state:
+        st.session_state["files"] = []
+
     # Custom CSS to hide default English labels
     # st.markdown(
     # """
@@ -50,6 +57,32 @@ def page():
     )
 
 
+    for file in st.session_state["files"]:
+        st.markdown(f"#### {file['name']}")
+
+        audit = file["audit"]
+        st.markdown(
+            """
+            <table>
+                <tr><td>Nombre de pages</td><td>{}</td></tr>
+                <tr><td>Nombre d'images</td><td>{}</td></tr>
+                <tr><td>Nombre de liens</td><td>{}</td></tr>
+                <tr><td>Nombre de tableaux</td><td>{}</td></tr>
+                <tr><td>Nombre de tokens</td><td>{}</td></tr>
+                <tr><td>Nombre de mots</td><td>{}</td></tr>
+            </table>
+            """.format(
+                audit['number_of_pages'],
+                audit['number_of_images'],
+                audit['number_of_links'],
+                audit['number_of_tables'],
+                audit['number_of_tokens'],
+                audit['number_of_words']
+            ),
+            unsafe_allow_html=True
+        )
+
+
     st.session_state["ingestion_spinner"] = st.empty()
 
 page()
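For reference, a minimal sketch (not part of the commit) of the entry shape that read_and_save_file() now appends per upload; "example.pdf" is a hypothetical local file standing in for the Streamlit upload object:

# Sketch only: mirrors the dict stored in st.session_state["files"].
from utils.document import audit_descriptif_pdf

with open("example.pdf", "rb") as f:  # hypothetical file
    entry = {
        "name": "example.pdf",
        "audit": audit_descriptif_pdf(f)["audit"],
    }

# entry["audit"] carries the counters rendered in the HTML table above:
# number_of_pages, number_of_images, number_of_links,
# number_of_tables, number_of_tokens, number_of_words.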
rag.py
CHANGED
@@ -10,7 +10,6 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.schema.runnable import RunnablePassthrough
 from langchain.prompts import PromptTemplate
 from langchain_community.vectorstores.utils import filter_complex_metadata
-from langchain_community.document_loaders.csv_loader import CSVLoader
 
 from util import getYamlConfig
 
@@ -24,6 +23,7 @@ class Rag:
     retriever = None
     chain = None
     readableModelName = ""
+    documents = []
 
     def __init__(self, vectore_store=None):
 
@@ -60,16 +60,18 @@ class Rag:
 
     def getDbFiles(self):
        return self.vector_store.getDocs()
-
+
     def ingest(self, pdf_file_path: str):
         docs = PyPDFLoader(file_path=pdf_file_path).load()
-
+
         chunks = self.text_splitter.split_documents(docs)
         chunks = filter_complex_metadata(chunks)
 
-        document_vector_store = FAISS.from_documents(chunks, self.embedding)
+        self.documents.extend(chunks)
+        self.document_vector_store = FAISS.from_documents(self.documents, self.embedding)
+
 
-        self.retriever = document_vector_store.as_retriever(
+        self.retriever = self.document_vector_store.as_retriever(
             search_type="similarity_score_threshold",
             search_kwargs={
                 "k": 3,
@@ -87,6 +89,8 @@ class Rag:
         documentContext = self.retriever.invoke(query)
 
 
+        print(documentContext)
+
         # Dictionnaire de base avec les variables principales
         chain_input = {
             "query": query,
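For context, a standalone sketch of the retriever configuration that ingest() now builds. The embedding backend and the score_threshold value are assumptions, since the hunk above cuts off after "k": 3:

# Sketch only, not the Space's exact setup.
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

embedding = HuggingFaceEmbeddings()  # assumed embedding backend
docs = [Document(page_content="first chunk"),
        Document(page_content="second chunk")]

store = FAISS.from_documents(docs, embedding)
retriever = store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.5},  # threshold assumed
)
print(retriever.invoke("first"))

Because ingest() extends self.documents and rebuilds the FAISS index on every call, chunks from earlier uploads stay searchable after later files are ingested.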
requirements.txt
CHANGED
@@ -20,4 +20,5 @@ llamaapi
 pyyaml
 st_copy_to_clipboard
 faiss-gpu
-faiss-cpu
+faiss-cpu
+tiktoken
utils/document.py
ADDED
@@ -0,0 +1,108 @@
+import pymupdf
+import tiktoken
+
+
+def count_tokens(input_string: str) -> int:
+    tokenizer = tiktoken.get_encoding("cl100k_base")
+    tokens = tokenizer.encode(input_string)
+    return len(tokens)
+
+
+def audit_descriptif_pdf(file, max_img_width=500) -> dict:
+    document = pymupdf.open(stream=file.read())
+
+    audit_dict_doc = {
+        "number_of_pages": len(document),
+        "number_of_images": 0,
+        "number_of_links": 0,
+        "number_of_tables": 0,
+        "number_of_tokens": 0,
+        "number_of_words": 0,
+        "key_words": []
+    }
+
+    doc_content = dict()
+
+    for page in document:
+
+        audit_dict_page = {}
+        page_content = {
+            "images": [],
+            "texte": "",
+            "liens": [],
+            "tableaux": []
+        }
+
+        # number of images
+        images = page.get_images()
+        number_images = len(images)
+        audit_dict_page["number_of_images"] = number_images
+        audit_dict_doc["number_of_images"] += number_images
+
+        # get images
+        for _, img in enumerate(images):
+            xref = img[0]
+            base_image = document.extract_image(xref)
+
+            image_bytes = base_image["image"]
+            image_width = base_image["width"]
+            image_height = base_image["height"]
+
+            # Adjust image size if it exceeds the maximum width
+            if image_width > max_img_width:
+                ratio = max_img_width / image_width
+                image_width = max_img_width
+                image_height = int(image_height * ratio)
+
+            page_content["images"].append((image_bytes, image_width, image_height))
+
+
+
+        # get links with URI
+        links = []
+        for link in page.get_links():
+            if link['kind'] == pymupdf.LINK_URI and 'uri' in link:
+                links.append({"uri": link["uri"], "page": page.number})
+
+        page_content["liens"] = links
+
+        # number of links
+        number_links = len(links)
+        audit_dict_page["number_of_links"] = number_links
+        audit_dict_doc["number_of_links"] += number_links
+
+        # number of tables
+        tables = page.find_tables().tables
+        number_tables = len(tables)
+        for tab in tables:
+            page_content["tableaux"].append(tab.to_pandas())
+        audit_dict_page["number_of_tables"] = number_tables
+        audit_dict_doc["number_of_tables"] += number_tables
+
+        # number of tokens and words
+        text = page.get_text("text")
+        number_tokens = count_tokens(text)
+        number_words = len(text.split())
+
+        audit_dict_page["number_of_tokens"] = number_tokens
+        audit_dict_page["number_of_words"] = number_words
+
+        # get text
+        page_content["texte"] = text
+
+        audit_dict_doc["number_of_tokens"] += number_tokens
+        audit_dict_doc["number_of_words"] += number_words
+
+        audit_dict_doc[f"page_{page.number}"] = audit_dict_page
+
+        doc_content[f"page_{page.number}"] = page_content
+
+
+
+    # merge the two dicts
+    global_audit = {
+        "audit": audit_dict_doc,
+        "content": doc_content
+    }
+
+    return global_audit
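A usage sketch for the new helper; "report.pdf" is illustrative, and any object exposing a read() method that returns PDF bytes works, which is why the upload object in pages/documents.py can be passed directly:

# Sketch only: audit a local PDF and inspect the returned structure.
from utils.document import audit_descriptif_pdf

with open("report.pdf", "rb") as f:  # hypothetical file
    result = audit_descriptif_pdf(f)

print(result["audit"]["number_of_pages"])             # document-level counter
print(result["audit"]["page_0"]["number_of_tokens"])  # per-page audit (0-based)
print(result["content"]["page_0"]["texte"][:100])     # extracted page text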