[email protected] committed on
Commit b10792b · 1 Parent(s): 2d6124c

feat : Multiple documents & audit

Files changed (5)
  1. app.py +1 -0
  2. pages/documents.py +35 -2
  3. rag.py +9 -5
  4. requirements.txt +2 -1
  5. utils/document.py +108 -0
app.py CHANGED
@@ -29,6 +29,7 @@ def init_app():
     st.session_state["messages"] = []
     st.session_state["assistant"] = Rag()
     # st.session_state["data_dict"] = config['variables']
+    st.session_state["files"] = []
     st.session_state["prompt_system"] = config['prompt_system']
     st.session_state["chapters"] = config['chapters']
 
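Note: `init_app()` seeds `st.session_state["files"]` unconditionally, while `pages/documents.py` below adds a guarded variant so the page also works on its own. A minimal sketch of that Streamlit idempotent-init pattern, using the same key as this commit:

import streamlit as st

# Streamlit re-executes the script on every interaction, so an
# unconditional `st.session_state["files"] = []` would wipe the list
# on each rerun; the membership test makes the init run once per session.
if "files" not in st.session_state:
    st.session_state["files"] = []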
pages/documents.py CHANGED
@@ -1,15 +1,19 @@
 import os
 import tempfile
 import streamlit as st
+from utils.document import audit_descriptif_pdf
 
 def read_and_save_file():
-    st.session_state["messages"] = []
-    st.session_state["user_input"] = ""
 
     for file in st.session_state["file_uploader"]:
         with tempfile.NamedTemporaryFile(delete=False) as tf:
             tf.write(file.getbuffer())
             file_path = tf.name
+        if not any(f["name"] == file.name for f in st.session_state["files"]):
+            st.session_state["files"].append({
+                "name": file.name,
+                "audit": audit_descriptif_pdf(file)["audit"]
+            })
 
         with st.session_state["ingestion_spinner"], st.spinner(f"Chargement {file.name}"):
             st.session_state["assistant"].ingest(file_path)
@@ -20,6 +24,9 @@ def read_and_save_file():
 def page():
     st.subheader("Charger vos documents")
 
+    if "files" not in st.session_state:
+        st.session_state["files"] = []
+
     # Custom CSS to hide default English labels
     # st.markdown(
     #     """
@@ -50,6 +57,32 @@ def page():
     )
 
 
+    for file in st.session_state["files"]:
+        st.markdown(f"#### {file['name']}")
+
+        audit = file["audit"]
+        st.markdown(
+            """
+            <table>
+                <tr><td>Nombre de pages</td><td>{}</td></tr>
+                <tr><td>Nombre d'images</td><td>{}</td></tr>
+                <tr><td>Nombre de liens</td><td>{}</td></tr>
+                <tr><td>Nombre de tableaux</td><td>{}</td></tr>
+                <tr><td>Nombre de tokens</td><td>{}</td></tr>
+                <tr><td>Nombre de mots</td><td>{}</td></tr>
+            </table>
+            """.format(
+                audit['number_of_pages'],
+                audit['number_of_images'],
+                audit['number_of_links'],
+                audit['number_of_tables'],
+                audit['number_of_tokens'],
+                audit['number_of_words']
+            ),
+            unsafe_allow_html=True
+        )
+
+
     st.session_state["ingestion_spinner"] = st.empty()
 
 page()
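Two notes on this page. First, `tempfile.NamedTemporaryFile(delete=False)` leaves the temporary copy on disk after ingestion; an `os.unlink(file_path)` once `ingest()` returns would reclaim it (`os` is already imported). Second, the audit summary is built as a raw HTML string with `unsafe_allow_html=True`; a sketch of an equivalent table via `st.table`, assuming the same audit keys (the helper name `render_audit` is illustrative, not part of the commit):

import pandas as pd
import streamlit as st

def render_audit(audit: dict) -> None:
    # Same six counters as the HTML table above, without raw HTML.
    rows = [
        ("Nombre de pages", audit["number_of_pages"]),
        ("Nombre d'images", audit["number_of_images"]),
        ("Nombre de liens", audit["number_of_links"]),
        ("Nombre de tableaux", audit["number_of_tables"]),
        ("Nombre de tokens", audit["number_of_tokens"]),
        ("Nombre de mots", audit["number_of_words"]),
    ]
    st.table(pd.DataFrame(rows, columns=["Métrique", "Valeur"]))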
rag.py CHANGED
@@ -10,7 +10,6 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.schema.runnable import RunnablePassthrough
 from langchain.prompts import PromptTemplate
 from langchain_community.vectorstores.utils import filter_complex_metadata
-from langchain_community.document_loaders.csv_loader import CSVLoader
 
 from util import getYamlConfig
 
@@ -24,6 +23,7 @@ class Rag:
     retriever = None
     chain = None
     readableModelName = ""
+    documents = []
 
     def __init__(self, vectore_store=None):
 
@@ -60,16 +60,18 @@ class Rag:
 
     def getDbFiles(self):
         return self.vector_store.getDocs()
-
+
     def ingest(self, pdf_file_path: str):
         docs = PyPDFLoader(file_path=pdf_file_path).load()
-
+
         chunks = self.text_splitter.split_documents(docs)
         chunks = filter_complex_metadata(chunks)
 
-        document_vector_store = FAISS.from_documents(chunks, self.embedding)
+        self.documents.extend(chunks)
+        self.document_vector_store = FAISS.from_documents(self.documents, self.embedding)
+
 
-        self.retriever = document_vector_store.as_retriever(
+        self.retriever = self.document_vector_store.as_retriever(
             search_type="similarity_score_threshold",
             search_kwargs={
                 "k": 3,
@@ -87,6 +89,8 @@ class Rag:
         documentContext = self.retriever.invoke(query)
 
 
+        print(documentContext)
+
         # Base dictionary with the main variables
         chain_input = {
             "query": query,
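Two things stand out in this hunk: `documents = []` is a class attribute, so every `Rag` instance shares one list, and each `ingest()` call re-embeds all accumulated chunks when rebuilding the FAISS index (the added `print(documentContext)` also reads like leftover debugging). A sketch of an incremental variant, assuming the same fields used above (`self.text_splitter`, `self.embedding`); `add_documents` is the standard LangChain vector-store method:

def ingest(self, pdf_file_path: str):
    docs = PyPDFLoader(file_path=pdf_file_path).load()
    chunks = filter_complex_metadata(self.text_splitter.split_documents(docs))

    if getattr(self, "document_vector_store", None) is None:
        # First document: build the index from scratch.
        self.document_vector_store = FAISS.from_documents(chunks, self.embedding)
    else:
        # Later documents: embed and append only the new chunks.
        self.document_vector_store.add_documents(chunks)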
requirements.txt CHANGED
@@ -20,4 +20,5 @@ llamaapi
 pyyaml
 st_copy_to_clipboard
 faiss-gpu
-faiss-cpu
+faiss-cpu
+tiktoken
utils/document.py ADDED
@@ -0,0 +1,108 @@
+import pymupdf
+import tiktoken
+
+
+def count_tokens(input_string: str) -> int:
+    tokenizer = tiktoken.get_encoding("cl100k_base")
+    tokens = tokenizer.encode(input_string)
+    return len(tokens)
+
+
+def audit_descriptif_pdf(file, max_img_width=500) -> dict:
+    document = pymupdf.open(stream=file.read())
+
+    audit_dict_doc = {
+        "number_of_pages": len(document),
+        "number_of_images": 0,
+        "number_of_links": 0,
+        "number_of_tables": 0,
+        "number_of_tokens": 0,
+        "number_of_words": 0,
+        "key_words": []
+    }
+
+    doc_content = dict()
+
+    for page in document:
+
+        audit_dict_page = {}
+        page_content = {
+            "images": [],
+            "texte": "",
+            "liens": [],
+            "tableaux": []
+        }
+
+        # number of images
+        images = page.get_images()
+        number_images = len(images)
+        audit_dict_page["number_of_images"] = number_images
+        audit_dict_doc["number_of_images"] += number_images
+
+        # get images
+        for _, img in enumerate(images):
+            xref = img[0]
+            base_image = document.extract_image(xref)
+
+            image_bytes = base_image["image"]
+            image_width = base_image["width"]
+            image_height = base_image["height"]
+
+            # Adjust image size if it exceeds the maximum width
+            if image_width > max_img_width:
+                ratio = max_img_width / image_width
+                image_width = max_img_width
+                image_height = int(image_height * ratio)
+
+            page_content["images"].append((image_bytes, image_width, image_height))
+
+
+
+        # get links with URI
+        links = []
+        for link in page.get_links():
+            if link['kind'] == pymupdf.LINK_URI and 'uri' in link:
+                links.append({"uri": link["uri"], "page": page.number})
+
+        page_content["liens"] = links
+
+        # number of links
+        number_links = len(links)
+        audit_dict_page["number_of_links"] = number_links
+        audit_dict_doc["number_of_links"] += number_links
+
+        # number of tables
+        tables = page.find_tables().tables
+        number_tables = len(tables)
+        for tab in tables:
+            page_content["tableaux"].append(tab.to_pandas())
+        audit_dict_page["number_of_tables"] = number_tables
+        audit_dict_doc["number_of_tables"] += number_tables
+
+        # number of tokens and words
+        text = page.get_text("text")
+        number_tokens = count_tokens(text)
+        number_words = len(text.split())
+
+        audit_dict_page["number_of_tokens"] = number_tokens
+        audit_dict_page["number_of_words"] = number_words
+
+        # get text
+        page_content["texte"] = text
+
+        audit_dict_doc["number_of_tokens"] += number_tokens
+        audit_dict_doc["number_of_words"] += number_words
+
+        audit_dict_doc[f"page_{page.number}"] = audit_dict_page
+
+        doc_content[f"page_{page.number}"] = page_content
+
+
+
+    # merge the two dicts
+    global_audit = {
+        "audit": audit_dict_doc,
+        "content": doc_content
+    }
+
+    return global_audit
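`audit_descriptif_pdf` accepts any object whose `.read()` returns PDF bytes, which is why the Streamlit upload in `pages/documents.py` can be passed in directly. A hypothetical usage sketch (the file name is illustrative):

from utils.document import audit_descriptif_pdf

# "report.pdf" is a placeholder path.
with open("report.pdf", "rb") as f:
    result = audit_descriptif_pdf(f)

print(result["audit"]["number_of_pages"], "pages")

# Per-page details live under "page_<n>" keys (PyMuPDF numbers pages from 0).
page0 = result["content"]["page_0"]
print(len(page0["images"]), "images on the first page")

Separately, `count_tokens` re-fetches the `cl100k_base` encoding on every call; hoisting `tiktoken.get_encoding` to module level would avoid repeating that lookup for each page.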