ionosphere committed on
Commit
f676a1d
·
1 Parent(s): 6c5383d

Update with Mistral and rag PDF

Browse files
Files changed (8) hide show
  1. .gitignore +162 -0
  2. Grand livre au 30 avril 2024.pdf +0 -0
  3. HOW_TO.md +31 -0
  4. README.md +4 -4
  5. app.py +89 -0
  6. images/agir.png +0 -0
  7. rag.py +81 -0
  8. requirements.txt +8 -0
.gitignore ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .idea/
2
+ ### Python template
3
+ # Byte-compiled / optimized / DLL files
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+
8
+ # C extensions
9
+ *.so
10
+
11
+ # Distribution / packaging
12
+ .Python
13
+ build/
14
+ develop-eggs/
15
+ dist/
16
+ downloads/
17
+ eggs/
18
+ .eggs/
19
+ lib/
20
+ lib64/
21
+ parts/
22
+ sdist/
23
+ var/
24
+ wheels/
25
+ share/python-wheels/
26
+ *.egg-info/
27
+ .installed.cfg
28
+ *.egg
29
+ MANIFEST
30
+
31
+ # PyInstaller
32
+ # Usually these files are written by a python script from a template
33
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
34
+ *.manifest
35
+ *.spec
36
+
37
+ # Installer logs
38
+ pip-log.txt
39
+ pip-delete-this-directory.txt
40
+
41
+ # Unit test / coverage reports
42
+ htmlcov/
43
+ .tox/
44
+ .nox/
45
+ .coverage
46
+ .coverage.*
47
+ .cache
48
+ nosetests.xml
49
+ coverage.xml
50
+ *.cover
51
+ *.py,cover
52
+ .hypothesis/
53
+ .pytest_cache/
54
+ cover/
55
+
56
+ # Translations
57
+ *.mo
58
+ *.pot
59
+
60
+ # Django stuff:
61
+ *.log
62
+ local_settings.py
63
+ db.sqlite3
64
+ db.sqlite3-journal
65
+
66
+ # Flask stuff:
67
+ instance/
68
+ .webassets-cache
69
+
70
+ # Scrapy stuff:
71
+ .scrapy
72
+
73
+ # Sphinx documentation
74
+ docs/_build/
75
+
76
+ # PyBuilder
77
+ .pybuilder/
78
+ target/
79
+
80
+ # Jupyter Notebook
81
+ .ipynb_checkpoints
82
+
83
+ # IPython
84
+ profile_default/
85
+ ipython_config.py
86
+
87
+ # pyenv
88
+ # For a library or package, you might want to ignore these files since the code is
89
+ # intended to run in multiple environments; otherwise, check them in:
90
+ # .python-version
91
+
92
+ # pipenv
93
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
95
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
96
+ # install all needed dependencies.
97
+ #Pipfile.lock
98
+
99
+ # poetry
100
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
102
+ # commonly ignored for libraries.
103
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104
+ #poetry.lock
105
+
106
+ # pdm
107
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108
+ #pdm.lock
109
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110
+ # in version control.
111
+ # https://pdm.fming.dev/#use-with-ide
112
+ .pdm.toml
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
Grand livre au 30 avril 2024.pdf ADDED
Binary file (348 kB). View file
 
HOW_TO.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # INSTALLING
2
+
3
+ > Clone the project in a local folder 'viti'
4
+
5
+ > Go to the project folder
6
+
7
+ `python -m venv .venv`
8
+
9
+ `source .venv/bin/activate`
10
+
11
+ `pip install -r requirements.txt`
12
+
13
+ Test the installation
14
+
15
+ `streamlit run app.py`
16
+
17
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
18
+
19
+ # REF
20
+
21
+ https://github.com/couchbase-examples/rag-demo-llama-index/blob/main/chat_with_pdf.py
22
+
23
+ https://lightning.ai/maxidiazbattan/studios/rag-streamlit-llamaindex-ollama?tab=files&layout=column&path=cloudspaces%2F01hwfjrdss66hkje94vb7enf15&y=2&x=0
24
+
25
+ https://medium.com/@sindhu.madicherla/rag-chatbot-using-chromadb-llamaindex-open-ai-and-streamlit-d3fb87df415f
26
+
27
+ https://github.com/jacttp/simpleRAG/blob/main/rag.py
28
+
29
+ https://github.com/naimkatiman/RAG-using-Llama-3.1-WebUi-on-Streamlit/blob/main/app.py
30
+
31
+
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: Gc
3
- emoji: 🏆
4
- colorFrom: indigo
5
- colorTo: pink
6
  sdk: streamlit
7
  sdk_version: 1.38.0
8
  app_file: app.py
 
1
  ---
2
+ title: GC
3
+ emoji: 💻
4
+ colorFrom: green
5
+ colorTo: red
6
  sdk: streamlit
7
  sdk_version: 1.38.0
8
  app_file: app.py
app.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import streamlit as st
4
+ from streamlit_chat import message
5
+ from rag import ChatPDF
6
+
7
# User-facing labels and static assets for the page (the UI is in French).
title = "Simulateur IA Grandes cultures"
subtitle = "Poser vos questions"
description = "Demonstrateur GC"
LOGO = "images/agir.png"
form_help = "Vous pouvez compléter les informations ci-dessous pour personnaliser votre expérience"
# Caption shown above the chat input ("me poser": infinitive after "pouvez").
placeholder = (
    "Vous pouvez me poser une question sur vos attentes, appuyer sur Entrée pour valider"
)
# Caption shown above the PDF uploader.
placeholder_doc = (
    "Vous pouvez charger un grand livre ou des écritures comptables au format PDF"
)
placeholder_url = "Récupérer les données de ce lien."

# Page configuration is applied at import time, before any other Streamlit
# element is drawn by page().
st.set_page_config(page_title=title)
23
+
24
def display_messages():
    """Render the chat history and reserve a slot for the "thinking" spinner.

    Draws the sub-header, then one chat bubble per ``(text, is_user)`` tuple
    found in ``st.session_state["messages"]``; each bubble is keyed by its
    position in the list.  Finally an empty placeholder is stored under
    ``st.session_state["thinking_spinner"]``.
    """
    st.subheader(subtitle)

    history = st.session_state["messages"]
    for position, entry in enumerate(history):
        text, from_user = entry
        message(text, is_user=from_user, key=str(position))

    # Created after the bubbles so the placeholder sits below the conversation.
    st.session_state["thinking_spinner"] = st.empty()
29
+
30
+
31
def process_input():
    """Send the text typed in the chat box to the assistant and log the exchange.

    Reads ``st.session_state["user_input"]``; when it is empty or only
    whitespace nothing happens.  Otherwise the stripped text is passed to
    ``st.session_state["assistant"].ask`` while a spinner is displayed in the
    ``thinking_spinner`` placeholder, and both the question and the answer
    are appended to ``st.session_state["messages"]`` as ``(text, is_user)``
    tuples.
    """
    # `or ""` keeps the guard safe when the widget value is falsy/None.
    user_text = (st.session_state["user_input"] or "").strip()
    if not user_text:
        return

    with st.session_state["thinking_spinner"], st.spinner("Je réfléchis"):
        agent_text = st.session_state["assistant"].ask(user_text)

    st.session_state["messages"].append((user_text, True))
    st.session_state["messages"].append((agent_text, False))
39
+
40
+
41
def read_and_save_file():
    """Reset the conversation and ingest every file currently in the uploader.

    The assistant is cleared, the message history and the chat input are
    emptied, then each entry of ``st.session_state["file_uploader"]`` is
    written to a temporary file on disk and handed to
    ``st.session_state["assistant"].ingest`` while a spinner is shown in the
    ``ingestion_spinner`` placeholder.
    """
    st.session_state["assistant"].clear()
    st.session_state["messages"] = []
    st.session_state["user_input"] = ""

    for file in st.session_state["file_uploader"]:
        # delete=False: the file must outlive the `with` block so that the
        # assistant can reopen it by path.
        with tempfile.NamedTemporaryFile(delete=False) as tf:
            tf.write(file.getbuffer())
            file_path = tf.name

        try:
            with st.session_state["ingestion_spinner"], st.spinner(f"Chargement {file.name}"):
                st.session_state["assistant"].ingest(file_path)
        finally:
            # Always remove the temporary copy, even when ingestion fails.
            os.remove(file_path)
54
+
55
+
56
def page():
    """Build the whole Streamlit page: sidebar, PDF uploader and chat area."""
    # Initialise per-session state once.  Testing for the key itself (rather
    # than for an empty session state) stays correct even if some other key
    # has already been stored in the session.
    if "assistant" not in st.session_state:
        st.session_state["messages"] = []
        st.session_state["assistant"] = ChatPDF()

    st.logo(LOGO)
    st.sidebar.markdown(form_help)
    # Three free-text fields; their values are not read anywhere yet.
    for label in ("Info 1", "Info 2", "Info 3"):
        st.sidebar.text_input(label, type="default")

    st.header(title)

    st.subheader("Charger un ou plusieurs documents")
    st.caption(placeholder_doc)
    st.file_uploader(
        "Charger un document",
        type=["pdf"],
        key="file_uploader",
        on_change=read_and_save_file,
        label_visibility="collapsed",
        accept_multiple_files=True,
    )

    # Placeholder in which read_and_save_file() displays its spinner.
    st.session_state["ingestion_spinner"] = st.empty()

    display_messages()
    st.caption(placeholder)
    st.text_input("Message", key="user_input", on_change=process_input)


if __name__ == "__main__":
    page()
images/agir.png ADDED
rag.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ # __import__('pysqlite3')
3
+ # import sys
4
+ # sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
5
+ from dotenv import load_dotenv
6
+ from langchain_community.vectorstores import FAISS
7
+ from langchain_mistralai.chat_models import ChatMistralAI
8
+ from langchain_mistralai.embeddings import MistralAIEmbeddings
9
+ from langchain.schema.output_parser import StrOutputParser
10
+ from langchain_community.document_loaders import PyPDFLoader
11
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
12
+ from langchain.schema.runnable import RunnablePassthrough
13
+ from langchain.prompts import PromptTemplate
14
+ from langchain_community.vectorstores.utils import filter_complex_metadata
15
+ #add new import
16
+ from langchain_community.document_loaders.csv_loader import CSVLoader
17
+
18
# Local development: read variables from a .env file before looking up the key.
load_dotenv()

# Chat model requested from Mistral, and the API key used for the embeddings.
llm_model = "open-mixtral-8x7b"
env_api_key = os.getenv("MISTRAL_API_KEY")
22
+
23
class ChatPDF:
    """Question answering over PDF documents with Mistral models and FAISS.

    ``ingest`` splits a PDF into chunks, embeds them with ``mistral-embed``
    and indexes them in a FAISS vector store; ``ask`` runs a
    retrieve -> prompt -> chat model -> string chain on a question;
    ``clear`` drops the index so a new set of documents can be loaded.
    """

    # Populated by ingest(); None means no document has been indexed yet.
    vector_store = None
    retriever = None
    chain = None

    def __init__(self):
        """Create the chat model, the text splitter and the prompt template."""
        # https://python.langchain.com/docs/integrations/chat/mistralai/
        self.model = ChatMistralAI(model=llm_model)
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=100)
        self.prompt = PromptTemplate.from_template(
            """
            <s> [INST] Vous échangez en français et avec précision.
            Vous êtes un assistant comptable spécialisé dans la comptabilité agricole en grandes cultures.
            Vous devez analyser les documents ci-dessous et calculer les couts de productions.
            Les documents fournis représente la comptabilité de l'exploitation agricole.
            Vous devez répondre sous forme de tableaux et de textes.
            Vous devez répondre de façon synthétique et argumentée.
            [/INST] </s>
            [INST]
            Question: {question}
            Context: {context}
            Answer: [/INST]
            """
        )

    def ingest(self, pdf_file_path: str) -> None:
        """Index one PDF so that its content can be retrieved by ``ask``.

        Successive calls accumulate: chunks of a new file are added to the
        existing vector store instead of replacing it, so every document
        ingested since the last ``clear`` stays searchable.

        Args:
            pdf_file_path: Path of the PDF file to load.
        """
        docs = PyPDFLoader(file_path=pdf_file_path).load()
        chunks = self.text_splitter.split_documents(docs)
        chunks = filter_complex_metadata(chunks)

        # Nothing to embed (e.g. a PDF with no extractable text): leave the
        # current state untouched rather than building an empty index.
        if not chunks:
            return

        if self.vector_store is None:
            embeddings = MistralAIEmbeddings(model="mistral-embed", mistral_api_key=env_api_key)
            self.vector_store = FAISS.from_documents(chunks, embeddings)
            # vector_store = Chroma.from_documents(documents=chunks, embedding=embeddings)
        else:
            self.vector_store.add_documents(chunks)

        # At most 3 chunks, and only those whose relevance score reaches 0.5.
        self.retriever = self.vector_store.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={
                "k": 3,
                "score_threshold": 0.5,
            },
        )

        # The raw question feeds both the retriever ("context") and the
        # prompt ("question").
        self.chain = ({"context": self.retriever, "question": RunnablePassthrough()}
                      | self.prompt
                      | self.model
                      | StrOutputParser())

    def ask(self, query: str) -> str:
        """Answer ``query`` from the ingested documents.

        Returns a fixed French hint when no document has been ingested yet.
        """
        if not self.chain:
            return "Ajouter un document PDF d'abord."

        return self.chain.invoke(query)

    def clear(self) -> None:
        """Forget every ingested document."""
        self.vector_store = None
        self.retriever = None
        self.chain = None
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ langchain_mistralai
3
+ langchain-community
4
+ streamlit==1.38.0
5
+ streamlit-chat
6
+ pypdf
7
+ fastembed
8
+ faiss-gpu