Spaces:
Runtime error
Runtime error
adrien.aribaut-gaudin
committed on
Commit
·
9ff01ff
1
Parent(s):
29de006
push on hugging_face
Browse files- .gitattributes +2 -0
- .gitignore +166 -0
- app.py +48 -0
- config.py +14 -0
- data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf +3 -0
- database/4a5944a6-5c35-44f8-88be-78ce2e35028c/data_level0.bin +3 -0
- database/4a5944a6-5c35-44f8-88be-78ce2e35028c/header.bin +3 -0
- database/4a5944a6-5c35-44f8-88be-78ce2e35028c/length.bin +3 -0
- database/4a5944a6-5c35-44f8-88be-78ce2e35028c/link_lists.bin +0 -0
- database/chroma.sqlite3 +3 -0
- src/__init__.py +0 -0
- src/control/__init__.py +0 -0
- src/control/control.py +69 -0
- src/model/__init__.py +0 -0
- src/model/block.py +34 -0
- src/model/container.py +150 -0
- src/model/doc.py +65 -0
- src/model/paragraph.py +20 -0
- src/tools/__init__.py +0 -0
- src/tools/llm.py +58 -0
- src/tools/pretty_print.py +25 -0
- src/tools/reader.py +102 -0
- src/tools/retriever.py +30 -0
- src/tools/test_read.py +209 -0
- src/view/view.py +112 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
37 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
config_key.py
|
2 |
+
|
3 |
+
|
4 |
+
#Test folder
|
5 |
+
data/Test/
|
6 |
+
|
7 |
+
# Byte-compiled / optimized / DLL files
|
8 |
+
__pycache__/
|
9 |
+
*.py[cod]
|
10 |
+
*$py.class
|
11 |
+
|
12 |
+
# C extensions
|
13 |
+
*.so
|
14 |
+
|
15 |
+
# Distribution / packaging
|
16 |
+
.Python
|
17 |
+
build/
|
18 |
+
develop-eggs/
|
19 |
+
dist/
|
20 |
+
downloads/
|
21 |
+
eggs/
|
22 |
+
.eggs/
|
23 |
+
lib/
|
24 |
+
lib64/
|
25 |
+
parts/
|
26 |
+
sdist/
|
27 |
+
var/
|
28 |
+
wheels/
|
29 |
+
share/python-wheels/
|
30 |
+
*.egg-info/
|
31 |
+
.installed.cfg
|
32 |
+
*.egg
|
33 |
+
MANIFEST
|
34 |
+
|
35 |
+
# PyInstaller
|
36 |
+
# Usually these files are written by a python script from a template
|
37 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
38 |
+
*.manifest
|
39 |
+
*.spec
|
40 |
+
|
41 |
+
# Installer logs
|
42 |
+
pip-log.txt
|
43 |
+
pip-delete-this-directory.txt
|
44 |
+
|
45 |
+
# Unit test / coverage reports
|
46 |
+
htmlcov/
|
47 |
+
.tox/
|
48 |
+
.nox/
|
49 |
+
.coverage
|
50 |
+
.coverage.*
|
51 |
+
.cache
|
52 |
+
nosetests.xml
|
53 |
+
coverage.xml
|
54 |
+
*.cover
|
55 |
+
*.py,cover
|
56 |
+
.hypothesis/
|
57 |
+
.pytest_cache/
|
58 |
+
cover/
|
59 |
+
|
60 |
+
# Translations
|
61 |
+
*.mo
|
62 |
+
*.pot
|
63 |
+
|
64 |
+
# Django stuff:
|
65 |
+
*.log
|
66 |
+
local_settings.py
|
67 |
+
db.sqlite3
|
68 |
+
db.sqlite3-journal
|
69 |
+
|
70 |
+
# Flask stuff:
|
71 |
+
instance/
|
72 |
+
.webassets-cache
|
73 |
+
|
74 |
+
# Scrapy stuff:
|
75 |
+
.scrapy
|
76 |
+
|
77 |
+
# Sphinx documentation
|
78 |
+
docs/_build/
|
79 |
+
|
80 |
+
# PyBuilder
|
81 |
+
.pybuilder/
|
82 |
+
target/
|
83 |
+
|
84 |
+
# Jupyter Notebook
|
85 |
+
.ipynb_checkpoints
|
86 |
+
|
87 |
+
# IPython
|
88 |
+
profile_default/
|
89 |
+
ipython_config.py
|
90 |
+
|
91 |
+
# pyenv
|
92 |
+
# For a library or package, you might want to ignore these files since the code is
|
93 |
+
# intended to run in multiple environments; otherwise, check them in:
|
94 |
+
# .python-version
|
95 |
+
|
96 |
+
# pipenv
|
97 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
98 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
99 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
100 |
+
# install all needed dependencies.
|
101 |
+
#Pipfile.lock
|
102 |
+
|
103 |
+
# poetry
|
104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
106 |
+
# commonly ignored for libraries.
|
107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
108 |
+
#poetry.lock
|
109 |
+
|
110 |
+
# pdm
|
111 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
112 |
+
#pdm.lock
|
113 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
114 |
+
# in version control.
|
115 |
+
# https://pdm.fming.dev/#use-with-ide
|
116 |
+
.pdm.toml
|
117 |
+
|
118 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
119 |
+
__pypackages__/
|
120 |
+
|
121 |
+
# Celery stuff
|
122 |
+
celerybeat-schedule
|
123 |
+
celerybeat.pid
|
124 |
+
|
125 |
+
# SageMath parsed files
|
126 |
+
*.sage.py
|
127 |
+
|
128 |
+
# Environments
|
129 |
+
.env
|
130 |
+
.venv
|
131 |
+
env/
|
132 |
+
venv/
|
133 |
+
ENV/
|
134 |
+
env.bak/
|
135 |
+
venv.bak/
|
136 |
+
|
137 |
+
# Spyder project settings
|
138 |
+
.spyderproject
|
139 |
+
.spyproject
|
140 |
+
|
141 |
+
# Rope project settings
|
142 |
+
.ropeproject
|
143 |
+
|
144 |
+
# mkdocs documentation
|
145 |
+
/site
|
146 |
+
|
147 |
+
# mypy
|
148 |
+
.mypy_cache/
|
149 |
+
.dmypy.json
|
150 |
+
dmypy.json
|
151 |
+
|
152 |
+
# Pyre type checker
|
153 |
+
.pyre/
|
154 |
+
|
155 |
+
# pytype static type analyzer
|
156 |
+
.pytype/
|
157 |
+
|
158 |
+
# Cython debug symbols
|
159 |
+
cython_debug/
|
160 |
+
|
161 |
+
# PyCharm
|
162 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
163 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
164 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
165 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
166 |
+
#.idea/
|
app.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import os
|
3 |
+
from langchain.llms import OpenAI
|
4 |
+
import chromadb
|
5 |
+
|
6 |
+
from config import *
|
7 |
+
from src.tools.reader import get_pdf_title_styles
|
8 |
+
from src.tools.llm import LlmAgent
|
9 |
+
import src.view.view as view
|
10 |
+
from src.tools.pretty_print import pretty_print_container_structure, pretty_printer_paragraphs
|
11 |
+
from src.model.container import Container
|
12 |
+
from src.control.control import Chatbot
|
13 |
+
from src.tools.retriever import Retriever
|
14 |
+
from src.model.doc import Doc
|
15 |
+
from src.tools.test_read import pdf_manager
|
16 |
+
|
17 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
18 |
+
|
19 |
+
if not "OPENAI_API_KEY" in os.environ:
|
20 |
+
from config_key import OPENAI_API_KEY
|
21 |
+
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
|
22 |
+
|
23 |
+
#check if the database is empty
|
24 |
+
# pdf_manager(pdf_path=content_en_path_real)
|
25 |
+
# pretty_printer_paragraphs(doc.container.paragraphs)
|
26 |
+
# pretty_print_container_structure(doc.container)
|
27 |
+
|
28 |
+
if not os.path.exists("Ilumio_chatbot/database/"):
|
29 |
+
os.makedirs("Ilumio_chatbot/database/")
|
30 |
+
|
31 |
+
client_db = chromadb.PersistentClient(path="Ilumio_chatbot/database/")
|
32 |
+
|
33 |
+
try:
|
34 |
+
client_db.get_collection(name="illumio_database")
|
35 |
+
retriever = Retriever(client_db, None, "illumio_database")
|
36 |
+
except:
|
37 |
+
print("Database is empty")
|
38 |
+
doc = Doc(path=content_en_path_real)
|
39 |
+
retriever = Retriever(client_db,doc.container,"illumio_database")
|
40 |
+
|
41 |
+
llm_model = OpenAI(temperature=0)
|
42 |
+
llm = LlmAgent(llm_model)
|
43 |
+
|
44 |
+
chat = Chatbot(llm_agent=llm, retriever=retriever)
|
45 |
+
|
46 |
+
ilumio_qna = view.run(ctrl=chat, config=view_config)
|
47 |
+
|
48 |
+
ilumio_qna.queue().launch()
|
config.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Language settings for content extraction and answer generation.
content_language = 'en'
plan_language = 'en'

# Source-document locations.
content_en_path_real = "Ilumio_chatbot/data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf"
content_test = "Ilumio_chatbot/data/Test/Test_children.pdf"


# Example questions displayed in the UI.
examples = {
    "Question banale?": "Pourquoi le ciel est bleu?",
}


# Configuration passed to the view layer.
view_config = {
    'title': '# Ilumio Q&A',
    'examples': examples,
}
|
data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8821bd9530837f23a99e6b5d17d1e893f74d91ac6112c861d4ecd3f830e42479
|
3 |
+
size 4115867
|
database/4a5944a6-5c35-44f8-88be-78ce2e35028c/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d3c9fd302f000d7790aa403c2d0d8fec363fe46f30b07d53020b6e33b22435a9
|
3 |
+
size 1676000
|
database/4a5944a6-5c35-44f8-88be-78ce2e35028c/header.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
|
3 |
+
size 100
|
database/4a5944a6-5c35-44f8-88be-78ce2e35028c/length.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5b8e0bc4024bd2c60f910c5ea628a6b78cab039442c2c1e1576985ca11d8ab69
|
3 |
+
size 4000
|
database/4a5944a6-5c35-44f8-88be-78ce2e35028c/link_lists.bin
ADDED
File without changes
|
database/chroma.sqlite3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c5f6d73b67b4014dc1c30e357807cf0910ce18f617f99ae46d18126d541ad3a2
|
3 |
+
size 5914624
|
src/__init__.py
ADDED
File without changes
|
src/control/__init__.py
ADDED
File without changes
|
src/control/control.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
|
3 |
+
from src.tools.retriever import Retriever
|
4 |
+
from src.tools.llm import LlmAgent
|
5 |
+
from src.model.block import Block
|
6 |
+
|
7 |
+
|
8 |
+
class Chatbot:
    """Retrieval-augmented chatbot controller.

    Combines a vector-store retriever (source selection) with an LLM agent
    (answer generation) to answer user queries using conversation history.
    """

    def __init__(self, llm_agent, retriever):
        self.retriever = retriever
        self.llm = llm_agent

    def get_response(self, query, histo):
        """Answer `query` given the conversation history `histo`.

        Returns (answer, block_sources): the cleaned answer string and the
        retrieved blocks the answer was grounded on.
        """
        histo_conversation, histo_queries = self._get_histo(histo)
        queries = histo_queries
        block_sources = self.retriever.similarity_search(query=queries)
        block_sources = self._select_best_sources(block_sources)
        sources_contents = [s.content for s in block_sources]
        context = '\n'.join(sources_contents)
        answer = self.llm.generate_paragraph(query=queries, histo=histo_conversation, context=context, language='en')
        answer = self.llm.generate_answer(answer_en=answer, query=query, histo=histo_conversation, context=context)
        answer = self._clean_answer(answer)
        return answer, block_sources

    @staticmethod
    def _select_best_sources(sources: list, delta_1_2=0.15, delta_1_n=0.3, absolute=1.2, alpha=0.9) -> list:
        """
        Select the best sources: not far from the very best, not far from the
        last selected, and not too bad per se.

        `sources` is a list of Block objects sorted by ascending distance.
        Fix: the original annotations were literal lists `[Block]`, which are
        evaluated at definition time and are not valid type hints.
        """
        best_sources = []
        for idx, s in enumerate(sources):
            # Keep a source when it is the first, close enough to both the
            # previous pick and the overall best, or good in absolute terms.
            if idx == 0 \
                    or (s.distance - sources[idx - 1].distance < delta_1_2
                        and s.distance - sources[0].distance < delta_1_n) \
                    or s.distance < absolute:
                best_sources.append(s)
                # Tighten every threshold for each additional source accepted.
                delta_1_2 *= alpha
                delta_1_n *= alpha
                absolute *= alpha
            else:
                break
        return best_sources

    @staticmethod
    def _get_histo(histo: list) -> tuple:
        """Render the last 5 (query, answer) turns as a transcript string and
        as a newline-joined list of the past queries."""
        histo_conversation = ""
        histo_queries = ""

        for (query, answer) in histo[-5:]:
            histo_conversation += f'user: {query} \n bot: {answer}\n'
            histo_queries += query + '\n'
        # Drop the trailing newline of the transcript.
        return histo_conversation[:-1], histo_queries

    @staticmethod
    def _clean_answer(answer: str) -> str:
        """Normalize an LLM answer: drop a leading 'bot:' tag, strip
        surrounding quotes/backticks/spaces, and ensure a trailing period.

        Bug fix: the original used str.strip('bot:'), which removes ANY of
        the characters {'b','o','t',':'} from BOTH ends — e.g. it would eat
        the tail of an answer ending in 'robot'. Only a literal leading
        'bot:' prefix is removed now.
        """
        if answer.startswith('bot:'):
            answer = answer[len('bot:'):]
        while answer and answer[-1] in {"'", '"', " ", "`"}:
            answer = answer[:-1]
        while answer and answer[0] in {"'", '"', " ", "`"}:
            answer = answer[1:]
        # The prefix may only become visible after the quotes are stripped.
        if answer.startswith('bot:'):
            answer = answer[len('bot:'):]
        if answer:
            if answer[-1] != ".":
                answer += "."
        return answer
|
src/model/__init__.py
ADDED
File without changes
|
src/model/block.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
class Block:
    """A retrievable chunk of a document: a title plus its attached content,
    with position metadata (index, rank, level) and a retrieval distance.

    Fix: the original annotated to_dict/from_dict with the literal `{}`,
    which is an evaluated empty-dict expression, not a type; replaced with
    `dict`.
    """

    def __init__(self, doc: str = '', title: str = '', content: str = '',
                 index: str = '', rank: int = 0, level: int = 0, distance: float = 99999):
        self.doc = doc            # source document title
        self.title = title        # section title text
        self.content = content    # concatenated paragraph text
        self.index = index        # hierarchical index (e.g. '1.2.3')
        self.rank = rank
        self.level = level        # depth in the title hierarchy
        self.distance = distance  # retrieval distance (lower is better)

    def to_dict(self) -> dict:
        """Serialize to a plain dict (inverse of from_dict)."""
        return {
            'doc': self.doc,
            'title': self.title,
            'content': self.content,
            'index': self.index,
            'rank': self.rank,
            'level': self.level,
            'distance': self.distance,
        }

    def from_dict(self, block_dict: dict):
        """Populate this block from a dict produced by to_dict; returns self."""
        for key in ('doc', 'title', 'content', 'index', 'rank', 'level', 'distance'):
            setattr(self, key, block_dict[key])
        return self

    @property
    def distance_str(self) -> str:
        """The distance rendered with two decimal places."""
        return format(self.distance, '.2f')
|
src/model/container.py
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .paragraph import Paragraph
from .block import Block

# Sentinel hierarchy level used while no title has been seen yet.
INFINITE = 99999

class Container:
    """A node of the document hierarchy: a section with an optional title
    Paragraph, its directly attached paragraphs, and child Containers for
    its sub-sections. Building a Container recursively consumes the input
    paragraph list and materializes the retrievable Blocks."""

    def __init__(self, paragraphs : [Paragraph], title : Paragraph=None, level: int = 0, index: [int] = None , father=None, id_ = 0):
        if index is None:
            index = []
        self.level = level
        self.title = title
        self.paragraphs = []
        self.children = []
        self.index = index
        self.father = father
        # Ids are namespaced by string concatenation: '1' + father's id + local id.
        # NOTE(review): this dereferences father.id_, so the default father=None
        # would raise AttributeError — every visible call site does pass a
        # father (Doc or the parent Container); confirm before relying on the
        # default.
        self.id_ = int(str(1) + str(father.id_) + str(id_))
        if paragraphs:
            self.paragraphs, self.children = self.create_children(paragraphs, level, index)
        self.blocks = self.get_blocks()


    def get_blocks(self):
        """Return one Block for this container's own text (if any) followed
        by the blocks of all descendants, depth-first."""
        block = Block(level=self.level, index=self.index)
        if self.title:
            block.title = self.title.text
        for p in self.paragraphs:
            block.content += p.text
        # A container with no directly attached text contributes no block of
        # its own, only its children's.
        blocks = [block] if block.content else []
        for child in self.children:
            blocks += child.blocks
        return blocks


    def create_children(self, paragraphs: [Paragraph], level: int, index: [int]):
        """
        Creates children containers and/or directly attached content and returns the list of attached content and the list of children containers.
        The indexes correspond to the indexes of the paragraphs in the content and also on the structure.
        :return: List of Content or Container
        """
        attached_paragraphs = []
        children = []
        in_children = False
        # NOTE(review): the `level` parameter is immediately overwritten; the
        # level of the child currently being accumulated is tracked here.
        level = INFINITE
        container_paragraphs = []
        container_title = None

        # Consumes `paragraphs` destructively (pop from the front).
        while paragraphs:
            p = paragraphs.pop(0)

            if not in_children and not p.is_structure:
                # Plain text before the first title attaches to this container.
                attached_paragraphs.append(p)
            else:
                in_children = True
                if p.is_structure and p.level <= level:  # if p is higher in hierarchy, then the child is completed
                    if container_paragraphs or container_title:
                        # Advance the hierarchical index for the completed child:
                        # either bump the counter at this depth or extend with 1s.
                        if level <= len(index):
                            index = index[:level]
                            index[-1] += 1
                        else:
                            for i in range(level-len(index)):
                                index.append(1)
                        children.append(Container(container_paragraphs, container_title, level, index.copy(), self))
                        container_paragraphs = []
                    container_title = p
                    level = p.level
                else:  # p is normal text or strictly lower in hierarchy, then the child continues to grow
                    container_paragraphs.append(p)
        # Flush the last child under construction (same index bookkeeping).
        if container_paragraphs or container_title:
            if level <= len(index):
                index = index[:level]
                index[-1] += 1
            else:
                for i in range(level - len(index)):
                    index.append(1)
            children.append(Container(container_paragraphs, container_title, level, index.copy(), self))

        return attached_paragraphs, children


    @property
    def structure(self):
        """Flat list of tree-view node dicts for this container, its
        paragraphs, and all descendants (UI-oriented representation)."""
        self_structure = {str(self.id_): {
            'index': str(self.id_),
            'canMove': True,
            'isFolder': True,
            'children': [p.id_ for p in self.paragraphs] + [child.id_ for child in self.children],
            'canRename': True,
            'data': {},
            'level': self.level,
            # NOTE(review): self.rank is never assigned anywhere in this class,
            # so reading this property raises AttributeError unless rank is set
            # externally — confirm.
            'rank': self.rank,
            'title': self.title.text if self.title else 'root'
        }}
        paragraphs_structure = [p.structure for p in self.paragraphs]
        structure = [self_structure] + paragraphs_structure
        for child in self.children:
            structure += child.structure
        return structure
|
src/model/doc.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.model.container import Container
|
2 |
+
from src.model.paragraph import Paragraph
|
3 |
+
from src.tools.reader import get_pdf_title_styles
|
4 |
+
|
5 |
+
|
6 |
+
class Doc:
    """A parsed PDF document: holds the Container hierarchy built from the
    PDF's paragraphs and the flat, deduplicated list of Blocks."""

    def __init__(self, path='', id_=None):
        # NOTE(review): the id_ parameter is accepted but ignored; object
        # identity is used instead — kept for interface compatibility.
        self.title = path.split('/')[-1]
        self.id_ = id(self)
        self.path = path
        paragraphs = get_pdf_title_styles(path)
        self.container = Container(paragraphs, father=self, level=0)
        self.blocks = self.get_blocks()

    @property
    def structure(self):
        """Tree-view structure delegated to the root container."""
        return self.container.structure

    def get_blocks(self):
        """Deduplicate the container blocks' index lists, then render each
        index as a dotted string (e.g. [1, 2, 3] -> '1.2.3') and stamp each
        block with this document's title."""

        def from_list_to_str(index_list):
            # Fix: the original started from index_list[0] and crashed with
            # IndexError on an empty index; join handles [] (yields '').
            return '.'.join(str(el) for el in index_list)

        # Fix: the original assigned self.container.blocks to `blocks` and
        # immediately overwrote it with delete_duplicate()'s result; the
        # redundant assignment is removed.
        blocks = self.delete_duplicate()
        for block in blocks:
            block.doc = self.title
            block.index = from_list_to_str(block.index)
        return blocks

    def delete_duplicate(self):
        """Pop the last component of a block's index while consecutive blocks
        share the same index, until no consecutive duplicates remain.

        NOTE(review): if two consecutive blocks both reach an empty index,
        found_duplicates stays True but nothing can be popped, which would
        loop forever — confirm real inputs cannot produce that state.
        """
        while self.found_duplicates(self.container.blocks):
            for i in range(len(self.container.blocks) - 1):
                if self.container.blocks[i].index == self.container.blocks[i + 1].index:
                    if self.container.blocks[i].index != []:
                        self.container.blocks[i].index.pop()
        return self.container.blocks

    def found_duplicates(self, blocks):
        """Return True if any two consecutive blocks share the same index."""
        for i in range(len(blocks) - 1):
            if blocks[i].index == blocks[i + 1].index:
                return True
        return False
|
src/model/paragraph.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import string
|
2 |
+
|
3 |
+
# Sentinel level assigned to non-title (plain content) paragraphs.
INFINITE = 10000

class Paragraph:
    """One text line extracted from the PDF, tagged with its font style.

    Styles 'title1'..'title5' carry a hierarchy level (the trailing digit);
    everything else is plain content at the sentinel level INFINITE.
    """

    def __init__(self, text : str, font_style : str, id_ : int, page_id : int):
        self.font_style = font_style
        # Ids are namespaced by string concatenation: '2' + page id + local id.
        self.id_ = int(f"2{page_id}{id_}")
        self.page_id = page_id
        # A 'titleN' style maps to level N; anything else is content.
        if 'title' in font_style:
            self.level = int(font_style[-1])
        else:
            self.level = INFINITE
        self.is_structure = self.level < INFINITE
        self.text = text

    @property
    def blank(self):
        """
        checks if the paragraph is blank: i.e. it brings some signal (it may otherwise be ignored)
        """
        stripped = self.text.replace('\n', '')
        return not any(ch in string.ascii_letters for ch in stripped)
|
src/tools/__init__.py
ADDED
File without changes
|
src/tools/llm.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
class LlmAgent:
    """Thin wrapper around a callable LLM exposing prompt-template helpers
    for answering, translating, and finalizing answers."""

    def __init__(self, llm):
        # `llm` is any callable mapping a prompt string to a completion string.
        self.llm = llm

    def generate_paragraph(self, query: str, context: dict, histo: list, language='fr') -> str:
        """generates the answer"""
        # NOTE: the '\\n' sequences below intentionally emit a literal
        # backslash-n into the prompt text (matching the original prompts).
        template = (f"You are a conversation bot designed to answer to the query from users delimited by "
                    f"triple backticks: "
                    f"\\n ``` {query} ```\\n"
                    f"Your answer is based on the context delimited by triple backticks: "
                    f"\\n ``` {context} ```\\n"
                    f"You are consistent and avoid redundancies with the rest of the initial conversation in French"
                    f"delimited by triple backticks: "
                    f"\\n ``` {histo} ```\\n"
                    f"Your response shall be in {language} and shall be concise"
                    f"In case the provided context is not relevant to answer to the question, just return that you "
                    f"don't know the answer ")

        p = self.llm(template)
        # Debug tracing of the prompt/completion pair.
        print("****************")
        print(template)
        print("----")
        print(p)
        return p

    def translate(self, text: str, language="en") -> str:
        """translates"""

        # Direction of translation depends on the requested target language.
        languages = "`French to English" if language == "en" else "English to French"

        template = (f" Your task consists in translating {languages}\\n"
                    f" the following text delimited by by triple backticks: ```{text}```\n"
                    )

        p = self.llm(template)
        return p

    def generate_answer(self, query: str, answer_en: str, histo: str, context: str) -> str:
        """provides the final answer in French based on the initial query and the answer in english"""

        def _cut_unfinished_sentence(s: str):
            # Drop everything after the last full stop (currently unused; see
            # the commented call below).
            return '.'.join(s.split('.')[:-1])

        template = (f"Your task consists in translating the answer in French to the query "
                    f"delimited by triple backticks: ```{query}``` \\n"
                    f"You are given the answer in english delimited by triple backticks: ```{answer_en}```"
                    f"\\n You don't add new content to the answer in English but: "
                    f"\\n 1 You can use some vocabulary from the context in English delimited by triple backticks: "
                    f"```{context}```"
                    f"\\n 2 You are consistent and avoid redundancies with the rest of the initial"
                    f" conversation in English delimited by triple backticks: ```{histo}```"
                    )

        p = self.llm(template)
        # p = _cut_unfinished_sentence(p)
        return p
|
src/tools/pretty_print.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.model.paragraph import Paragraph
|
2 |
+
from src.model.container import Container
|
3 |
+
|
4 |
+
|
5 |
+
def pretty_printer_paragraphs(paragraphs):
    """Print an indented outline of the title paragraphs (title1..title3);
    other styles are skipped."""
    prefixes = {
        "title1": "Titre 1",
        "title2": "---> Titre 2",
        "title3": "-------> Titre 3",
    }
    for paragraph in paragraphs:
        prefix = prefixes.get(paragraph.font_style)
        if prefix is not None:
            print(f"{prefix} {paragraph.text}")
|
19 |
+
def pretty_print_container_structure(container):
    """Recursively print a container tree, prefixing each line with one '-'
    per hierarchy level."""
    dashes = '-' * container.level
    if container.title:
        print(f"{dashes} {container.title.text}")
    for paragraph in container.paragraphs:
        print(f"{dashes} {paragraph.text}")
    for child in container.children:
        pretty_print_container_structure(child)
|
src/tools/reader.py
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import pdfplumber as pdfp
|
3 |
+
from src.model.paragraph import Paragraph
|
4 |
+
import asyncio
|
5 |
+
|
6 |
+
def skip_header(dictionary):
    """Return the index of the first non-header line on a page.

    Pages whose first line is not title1-sized (font size strictly between
    19 and 30) begin with a two-line running header that is skipped.
    """
    first_size = dictionary[0]["chars"][0]["size"]
    return 0 if 19 < first_size < 30 else 2
|
11 |
+
|
12 |
+
|
13 |
+
def get_style_of_line(size: float):
    """Map a character font size to a style label.

    Returns 'content', 'title1'..'title5', or 'unknown' for sizes outside
    every band (the bands intentionally leave gaps, e.g. (12.7, 12.8) and
    (18.5, 19], which classify as 'unknown').
    """
    if 9 <= size < 11.5:
        return "content"
    if 11.5 <= size <= 12.7:
        return "title5"
    if 12.8 <= size <= 13.5:
        return "title4"
    if 13.5 < size <= 15.5:
        return "title3"
    if 15.5 < size <= 18.5:
        return "title2"
    if 19 < size < 30:
        return "title1"
    return "unknown"
|
32 |
+
|
33 |
+
def get_pdf_title_styles(path):
    """Parse the PDF at `path` into a flat list of Paragraph objects.

    Consecutive lines sharing the same font size are merged into a single
    paragraph; the font size determines the paragraph's style (see
    get_style_of_line).
    """
    pdf_to_read = extract_all_lines_from_the_doc(path)
    paragraphs = []
    j = 0
    while j < len(pdf_to_read):
        dictionary = pdf_to_read[j]["content"]
        i = skip_header(dictionary)
        while i < len(dictionary):
            # Skip the running page header repeated on every page.
            if(dictionary[i]["text"].startswith("RESTAPIDeveloperGuide")):
                i+=1
                continue
            p = Paragraph(dictionary[i]["text"],font_style=get_style_of_line(dictionary[i]["chars"][0]["size"]),id_=i,page_id=pdf_to_read[j]["page_number"])
            if(i != len(dictionary)-1):
                # Merge following lines with the same font size into this paragraph.
                # NOTE(review): this loop reads dictionary[i+1] without
                # re-checking the bound, so a same-size run reaching the last
                # line of the page would raise IndexError — confirm against
                # real inputs.
                while(dictionary[i+1]["chars"][0]["size"] == dictionary[i]["chars"][0]["size"]):
                    p.text += " " + dictionary[i+1]["text"]
                    i += 1
            else:
                p.text = dictionary[i]["text"]
            i += 1
            paragraphs.append(p)
        j += 1
    return paragraphs
|
73 |
+
|
74 |
+
|
75 |
+
def test_get_font_sizes_of_a_page(page: int, path):
    """Debug helper: print index, font size and text of every line on a page.

    :param page: zero-based index of the page to inspect.
    :param path: path of the PDF file.
    """
    # PDFs are binary; the original opened the file in text mode, which fails
    # with a UnicodeDecodeError. Open in 'rb' like the other readers here.
    with open(os.path.abspath(path), 'rb') as f:
        reader = pdfp.PDF(f)
        # Do not shadow the `page` parameter with the page object.
        lines = reader.pages[page].extract_text_lines()
        for i, line in enumerate(lines):
            print(f'{i} : {line["chars"][0]["size"]} ->>>>> {line["text"]}')
|
82 |
+
|
83 |
+
|
84 |
+
def extract_all_lines_from_the_doc(path):
    """Extract the text lines of every page that follows the table of contents.

    :param path: path of the PDF file.
    :return: list of dicts with keys "page_number" (1-based, starting at 9)
             and "content" (the page's extracted text lines).
    """
    with open(path, 'rb') as f:
        reader = pdfp.PDF(f)
        # The first 8 pages are the cover and table of contents, so the first
        # extracted page carries page number 9.
        content_pages = reader.pages[8:]
        return [
            {"page_number": number, "content": page.extract_text_lines()}
            for number, page in enumerate(content_pages, start=9)
        ]
|
94 |
+
|
95 |
+
|
96 |
+
|
97 |
+
|
98 |
+
# path = "data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf"
|
99 |
+
# get_pdf_title_styles(os.path.abspath(path))
|
100 |
+
# print("--------------------------------------------------")
|
101 |
+
# print("--------------------------------------------------")
|
102 |
+
#print(test_get_font_sizes_of_a_page(8))
|
src/tools/retriever.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.model.container import Container
|
2 |
+
from src.model.block import Block
|
3 |
+
from src.model.doc import Doc
|
4 |
+
|
5 |
+
class Retriever:
    """Thin wrapper around a vector-database collection of document blocks.

    When a Doc is supplied, the collection is (re)created and populated from
    its blocks; otherwise an existing collection is reopened for querying.
    """

    def __init__(self, db_client, doc: Doc = None, collection_name: str = "illumio_database"):
        """Create or open the collection.

        :param db_client: database client exposing create_collection /
            get_collection (chromadb-style API — assumed from usage; confirm).
        :param doc: when given, its blocks are indexed into a fresh collection.
        :param collection_name: name of the collection to create or open.
        """
        if doc is not None:  # `is not None`, not `!= None` (PEP 8)
            # Indexing mode: build the collection from the document's blocks.
            self.collection = db_client.create_collection(name=collection_name)
            blocks = doc.blocks
            self.collection.add(
                documents=[block.content for block in blocks],
                ids=[block.index for block in blocks],
                metadatas=[block.to_dict() for block in blocks],
            )
        else:
            # Query mode: reuse the previously built collection.
            self.collection = db_client.get_collection(name=collection_name)

    def similarity_search(self, query: str) -> list:
        """Return the Blocks most similar to *query*, annotated with distance.

        :param query: free-text query to embed and search for.
        :return: list of Block objects, each with its `distance` set.
        """
        res = self.collection.query(query_texts=query)
        blocks = []
        # The query API returns one result list per query text; we sent one.
        for block_dict, distance in zip(res['metadatas'][0], res['distances'][0]):
            block = Block().from_dict(block_dict)
            block.distance = distance
            blocks.append(block)
        return blocks
|
30 |
+
|
src/tools/test_read.py
ADDED
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# To read the PDF
|
2 |
+
import PyPDF2
|
3 |
+
# To analyze the PDF layout and extract text
|
4 |
+
from pdfminer.high_level import extract_pages, extract_text
|
5 |
+
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
|
6 |
+
# To extract text from tables in PDF
|
7 |
+
import pdfplumber
|
8 |
+
# To extract the images from the PDFs
|
9 |
+
from PIL import Image
|
10 |
+
from pdf2image import convert_from_path
|
11 |
+
# To perform OCR to extract text from images
|
12 |
+
import pytesseract
|
13 |
+
# To remove the additional created files
|
14 |
+
import os
|
15 |
+
|
16 |
+
def text_extraction(element):
    """Extract a pdfminer text element's text and its per-line character formats.

    :param element: an LTTextContainer from the page layout.
    :return: tuple (line_text, formats) where formats is the deduplicated list
             of font names and font sizes seen among the element's characters.
    """
    line_text = element.get_text()

    # Gather the font name and size of every character in the element.
    formats = []
    for text_line in element:
        if not isinstance(text_line, LTTextContainer):
            continue
        for char in text_line:
            if isinstance(char, LTChar):
                formats.append(char.fontname)
                formats.append(char.size)

    # Deduplicate; note that set() loses ordering, as in the original.
    format_per_line = list(set(formats))

    return (line_text, format_per_line)
|
37 |
+
|
38 |
+
|
39 |
+
def crop_image(element, pageObj):
    """Crop *pageObj* down to *element*'s bounding box and save it as
    'cropped_image.pdf' for later rasterization.

    :param element: layout element (LTFigure) whose box defines the crop.
    :param pageObj: the PyPDF2 page object to crop in place.
    """
    # Bounding box of the element. NOTE(review): the original binds y0 to
    # "top" and y1 to "bottom", which looks inverted; the exact values passed
    # below are preserved byte-for-byte — confirm against PyPDF2's mediabox
    # normalization before renaming.
    [image_left, image_top, image_right, image_bottom] = [element.x0, element.y0, element.x1, element.y1]
    # Restrict the page's media box to the element's rectangle.
    pageObj.mediabox.lower_left = (image_left, image_bottom)
    pageObj.mediabox.upper_right = (image_right, image_top)
    # Persist the cropped single-page PDF.
    cropped_pdf_writer = PyPDF2.PdfWriter()
    cropped_pdf_writer.add_page(pageObj)
    with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
        cropped_pdf_writer.write(cropped_pdf_file)
|
51 |
+
|
52 |
+
# Create a function to convert the PDF to images
|
53 |
+
def convert_to_images(input_file, poppler_path=r'C:\Program Files\poppler-23.08.0\Library\bin'):
    """Rasterize the first page of *input_file* and save it as 'PDF_image.png'.

    :param input_file: path of the PDF to convert.
    :param poppler_path: location of the poppler binaries. The hard-coded
        Windows path is kept as the default for backward compatibility; pass
        None to rely on poppler being available on the system PATH.
    """
    images = convert_from_path(input_file, poppler_path=poppler_path)
    # Only the first page is needed by the callers in this module.
    image = images[0]
    output_file = "PDF_image.png"
    image.save(output_file, "PNG")
|
58 |
+
|
59 |
+
# Create a function to read text from images
|
60 |
+
def image_to_text(image_path):
    """Run OCR over the image stored at *image_path* and return the text.

    :param image_path: path of the image file to read.
    :return: the text recognized by pytesseract.
    """
    img = Image.open(image_path)
    return pytesseract.image_to_string(img)
|
66 |
+
|
67 |
+
|
68 |
+
def extract_table(pdf_path, page_num, table_num):
    """Return table *table_num* from page *page_num* of the PDF at *pdf_path*.

    :param pdf_path: path of the PDF file.
    :param page_num: zero-based page index.
    :param table_num: zero-based index of the table on that page.
    :return: the table as returned by pdfplumber (list of rows).
    """
    # Use a context manager so the file handle is closed — the original
    # leaked the pdfplumber handle on every call.
    with pdfplumber.open(pdf_path) as pdf:
        table_page = pdf.pages[page_num]
        return table_page.extract_tables()[table_num]
|
76 |
+
|
77 |
+
# Convert table into the appropriate format
|
78 |
+
def table_converter(table):
    """Render an extracted table as a pipe-delimited string, one row per line.

    None cells become the literal string 'None'; embedded line breaks inside
    a cell are flattened to spaces. No trailing newline is emitted.

    :param table: list of rows, each a list of cell strings (or None).
    :return: the formatted table string.
    """
    rendered_rows = []
    for row in table:
        cleaned = []
        for cell in row:
            if cell is None:
                # Keep the original placeholder for missing cells.
                cleaned.append('None')
            else:
                # Unwrap cells whose text was wrapped across several lines.
                cleaned.append(cell.replace('\n', ' '))
        rendered_rows.append('|' + '|'.join(cleaned) + '|')
    # Joining avoids the original's append-then-strip of the last newline.
    return '\n'.join(rendered_rows)
|
90 |
+
|
91 |
+
|
92 |
+
|
93 |
+
def pdf_manager(pdf_path):
    """Walk every page of the PDF top-to-bottom, extracting text, OCR'd image
    text, and tables into `text_per_page`, then print page 0's content.

    Per page, `text_per_page['Page_N']` holds five parallel lists:
    [page_text, line_format, text_from_images, text_from_tables, page_content].
    """
    # create a PDF file object
    pdfFileObj = open(pdf_path, 'rb')
    # create a PDF reader object (used for cropping pages around images)
    pdfReaded = PyPDF2.PdfReader(pdfFileObj)

    # Create the dictionary to collect the extracted content per page
    text_per_page = {}
    # We extract the pages from the PDF via pdfminer's layout analysis
    for pagenum, page in enumerate(extract_pages(pdf_path)):

        # Initialize the variables needed for the text extraction from the page
        pageObj = pdfReaded.pages[pagenum]
        page_text = []
        line_format = []
        text_from_images = []
        text_from_tables = []
        page_content = []
        # Initialize the number of the examined tables
        table_num = 0
        first_element= True
        table_extraction_flag= False
        # Open the pdf file with pdfplumber (table detection only)
        # NOTE(review): opened once per page and never closed — leaks handles.
        pdf = pdfplumber.open(pdf_path)
        # Find the examined page
        page_tables = pdf.pages[pagenum]
        # Find the tables on the page
        tables = page_tables.find_tables()


        # Collect all layout elements, keyed by their top y-coordinate
        page_elements = [(element.y1, element) for element in page._objs]
        # Sort so elements are visited in visual top-to-bottom order
        page_elements.sort(key=lambda a: a[0], reverse=True)

        # Walk the elements that compose the page
        for i,component in enumerate(page_elements):
            # Position of the top side of the element (unused below)
            pos= component[0]
            # The layout element itself
            element = component[1]

            # Text element: extract unless it belongs to a table being handled
            if isinstance(element, LTTextContainer):
                # Check if the text appeared in a table
                if table_extraction_flag == False:
                    # Extract the text and per-character formats of the element
                    (line_text, format_per_line) = text_extraction(element)
                    page_text.append(line_text)
                    line_format.append(format_per_line)
                    page_content.append(line_text)
                else:
                    # Omit text inside a table; it is captured via extract_table
                    pass

            # Figure element: crop it out, rasterize, then OCR the image
            if isinstance(element, LTFigure):
                # Crop the image from the PDF into cropped_image.pdf
                crop_image(element, pageObj)
                # Convert the cropped pdf to PDF_image.png
                convert_to_images('cropped_image.pdf')
                # Extract the text from the image via OCR
                image_text = image_to_text('PDF_image.png')
                text_from_images.append(image_text)
                page_content.append(image_text)
                # Add a placeholder in the text and format lists
                page_text.append('image')
                line_format.append('image')

            # Rectangle element: treated as the border of a table
            if isinstance(element, LTRect):
                # First rectangle of a new table (and a table remains)
                if first_element == True and (table_num+1) <= len(tables):
                    # Vertical bounds of the table in pdfminer coordinates
                    lower_side = page.bbox[3] - tables[table_num].bbox[3]
                    upper_side = element.y1
                    # Extract the table's cell contents
                    table = extract_table(pdf_path, pagenum, table_num)
                    # Convert the table to a pipe-delimited string
                    table_string = table_converter(table)
                    text_from_tables.append(table_string)
                    page_content.append(table_string)
                    # Suppress text extraction while inside this table
                    table_extraction_flag = True
                    first_element = False
                    # Add a placeholder in the text and format lists
                    page_text.append('table')
                    line_format.append('table')

                # Still inside the current table's vertical span: skip
                # NOTE(review): if the first LTRect on a page fails the branch
                # above, lower_side/upper_side are unbound here — confirm.
                if element.y0 >= lower_side and element.y1 <= upper_side:
                    pass
                # Left the table (next element is not a rectangle): reset state
                # NOTE(review): page_elements[i+1] raises IndexError when the
                # last element of the page is an LTRect — confirm.
                elif not isinstance(page_elements[i+1][1], LTRect):
                    table_extraction_flag = False
                    first_element = True
                    table_num+=1


        # Create the key of the dictionary for this page
        dctkey = 'Page_'+str(pagenum)
        # Store the five parallel result lists under the page key
        text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]

    # Closing the pdf file object
    pdfFileObj.close()

    # Deleting the temporary files created by the image pipeline
    # NOTE(review): raises FileNotFoundError when no LTFigure was seen.
    os.remove('cropped_image.pdf')
    os.remove('PDF_image.png')

    # Display the concatenated content of the first page
    result = ''.join(text_per_page['Page_0'][4])
    print(result)
|
src/view/view.py
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from src.control.control import Chatbot
|
3 |
+
|
4 |
+
|
5 |
+
def run(ctrl: Chatbot, config: {}):
    """Build and return the gradio Q&A interface.

    :param ctrl: Chatbot controller providing get_response(query, histo).
    :param config: dict with keys 'title' (markdown header) and 'examples'
        (mapping whose values are example questions).
    :return: the assembled gr.Blocks app.
    """
    with gr.Blocks() as qna:
        with gr.Row():
            # Left spacer column.
            with gr.Column():
                pass

            # Main column: title, chat history, input, examples, sources.
            with gr.Column(scale=10):

                gr.Markdown(config['title'])

                # Chat history; hidden until the first question is asked.
                histo_text_comp = gr.Chatbot(
                    visible=False,
                    value=[],
                )
                input_text_comp = gr.Textbox(
                    label="",
                    lines=1,
                    max_lines=3,
                    interactive=True,
                    placeholder="Posez votre question ici",
                )
                clear_btn = gr.Button("Clear")
                # Clickable example questions, hidden once a chat starts.
                input_example_comp = gr.Radio(
                    label="Examples",
                    choices=list(config['examples'].values()),
                    value="",
                )
                # Up to 4 read-only boxes showing the retrieved source blocks.
                source_text_comp = []
                for i in range(4):
                    source_text_comp.append(gr.Textbox(
                        lines=4,
                        max_lines=4,
                        interactive=False,
                        visible=False,
                    ))

            # Right spacer column.
            with gr.Column():
                pass

        def input_text_fn1(input_text_, histo_text_):
            """Step 1 on submit: echo the question into the history, hide
            examples and source boxes while the answer is computed."""
            histo_text_.append((input_text_, None))
            update_ = {
                histo_text_comp: gr.update(visible=True, value=histo_text_),
                input_example_comp: gr.update(visible=False,),
            }
            for i in range(4):
                update_[source_text_comp[i]] = gr.update(visible=False)
            return update_

        def input_text_fn2(input_text_, histo_text_):
            """Step 2: query the controller, fill in the answer and show up
            to 3 retrieved sources with their similarity scores."""
            answer, sources = ctrl.get_response(query=input_text_, histo=histo_text_)
            # Replace the pending (question, None) pair with the real answer.
            histo_text_[-1] = (input_text_, answer)
            update_ = {
                histo_text_comp: gr.update(value=histo_text_),
                input_text_comp: gr.update(value=''),
            }
            # NOTE(review): only 3 of the 4 source boxes are ever filled here.
            for i in range(min(len(sources), 3)):
                s = sources[i]
                source_label = f'{s.index} {s.title} score = {s.distance_str}'
                source_text = s.content
                update_[source_text_comp[i]] = gr.update(visible=True, value=source_text, label=source_label)
            return update_

        def input_example_fn(input_example_, histo_text_):
            """Handle a click on an example: copy it into the input box and
            history, then hide the example selector and source boxes."""
            histo_text_.append((input_example_, None))
            update_ = {
                input_text_comp: gr.update(value=input_example_),
                histo_text_comp: gr.update(visible=True, value=histo_text_),
                input_example_comp: gr.update(visible=False, value=''),
            }
            for i in range(4):
                update_[source_text_comp[i]] = gr.update(visible=False)
            return update_

        def clear_fn():
            """Reset the interface to its initial state."""
            update_ = {
                input_text_comp: gr.update(value=''),
                histo_text_comp: gr.update(value='', visible=False),
                input_example_comp: gr.update(value='', visible=True),
            }
            for i in range(4):
                # 'hello' is never shown (visible=False); kept as-is.
                update_[source_text_comp[i]] = gr.update(visible=False, value='hello')
            return update_

        # Submit: echo the question first, then compute and show the answer.
        input_text_comp \
            .submit(input_text_fn1,
                    inputs=[input_text_comp, histo_text_comp],
                    outputs=[histo_text_comp, input_example_comp,
                             source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])\
            .then(input_text_fn2,
                  inputs=[input_text_comp, histo_text_comp],
                  outputs=[input_text_comp, histo_text_comp,
                           source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
        # Example click: same two-step flow, starting from the radio value.
        input_example_comp \
            .input(input_example_fn,
                   inputs=[input_example_comp, histo_text_comp],
                   outputs=[input_text_comp, histo_text_comp, input_example_comp,
                            source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])\
            .then(input_text_fn2,
                  inputs=[input_text_comp, histo_text_comp],
                  outputs=[input_text_comp, histo_text_comp,
                           source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
        clear_btn.click(clear_fn,
                        inputs=None,
                        outputs=[input_text_comp, histo_text_comp, input_example_comp,
                                 source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])

    return qna
|