adrien.aribaut-gaudin committed
Commit 9ff01ff · 1 Parent(s): 29de006

push on hugging_face

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.sqlite3 filter=lfs diff=lfs merge=lfs -text
+ *.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,166 @@
+ config_key.py
+
+
+ #Test folder
+ data/Test/
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
app.py ADDED
@@ -0,0 +1,48 @@
+ import pandas as pd
+ import os
+ from langchain.llms import OpenAI
+ import chromadb
+
+ from config import *
+ from src.tools.reader import get_pdf_title_styles
+ from src.tools.llm import LlmAgent
+ import src.view.view as view
+ from src.tools.pretty_print import pretty_print_container_structure, pretty_printer_paragraphs
+ from src.model.container import Container
+ from src.control.control import Chatbot
+ from src.tools.retriever import Retriever
+ from src.model.doc import Doc
+ from src.tools.test_read import pdf_manager
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+ if not "OPENAI_API_KEY" in os.environ:
+     from config_key import OPENAI_API_KEY
+     os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
+
+ # check if the database is empty
+ # pdf_manager(pdf_path=content_en_path_real)
+ # pretty_printer_paragraphs(doc.container.paragraphs)
+ # pretty_print_container_structure(doc.container)
+
+ if not os.path.exists("Ilumio_chatbot/database/"):
+     os.makedirs("Ilumio_chatbot/database/")
+
+ client_db = chromadb.PersistentClient(path="Ilumio_chatbot/database/")
+
+ try:
+     client_db.get_collection(name="illumio_database")
+     retriever = Retriever(client_db, None, "illumio_database")
+ except:
+     print("Database is empty")
+     doc = Doc(path=content_en_path_real)
+     retriever = Retriever(client_db, doc.container, "illumio_database")
+
+ llm_model = OpenAI(temperature=0)
+ llm = LlmAgent(llm_model)
+
+ chat = Chatbot(llm_agent=llm, retriever=retriever)
+
+ ilumio_qna = view.run(ctrl=chat, config=view_config)
+
+ ilumio_qna.queue().launch()
config.py ADDED
@@ -0,0 +1,14 @@
+ content_language = 'en'
+ plan_language = 'en'
+ content_en_path_real = "Ilumio_chatbot/data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf"
+ content_test = "Ilumio_chatbot/data/Test/Test_children.pdf"
+
+
+ examples = {"Question banale?": "Pourquoi le ciel est bleu?",
+             }
+
+
+ view_config = {
+     'title': '# Ilumio Q&A',
+     'examples': examples,
+ }
data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8821bd9530837f23a99e6b5d17d1e893f74d91ac6112c861d4ecd3f830e42479
+ size 4115867
database/4a5944a6-5c35-44f8-88be-78ce2e35028c/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d3c9fd302f000d7790aa403c2d0d8fec363fe46f30b07d53020b6e33b22435a9
+ size 1676000
database/4a5944a6-5c35-44f8-88be-78ce2e35028c/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
+ size 100
database/4a5944a6-5c35-44f8-88be-78ce2e35028c/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5b8e0bc4024bd2c60f910c5ea628a6b78cab039442c2c1e1576985ca11d8ab69
+ size 4000
database/4a5944a6-5c35-44f8-88be-78ce2e35028c/link_lists.bin ADDED
File without changes
database/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5f6d73b67b4014dc1c30e357807cf0910ce18f617f99ae46d18126d541ad3a2
+ size 5914624
src/__init__.py ADDED
File without changes
src/control/__init__.py ADDED
File without changes
src/control/control.py ADDED
@@ -0,0 +1,69 @@
+ import pandas as pd
+
+ from src.tools.retriever import Retriever
+ from src.tools.llm import LlmAgent
+ from src.model.block import Block
+
+
+ class Chatbot:
+     def __init__(self, llm_agent, retriever):
+         self.retriever = retriever
+         self.llm = llm_agent
+
+     def get_response(self, query, histo):
+         histo_conversation, histo_queries = self._get_histo(histo)
+         queries = histo_queries
+         block_sources = self.retriever.similarity_search(query=queries)
+         block_sources = self._select_best_sources(block_sources)
+         sources_contents = [s.content for s in block_sources]
+         context = '\n'.join(sources_contents)
+         answer = self.llm.generate_paragraph(query=queries, histo=histo_conversation, context=context, language='en')
+         answer = self.llm.generate_answer(answer_en=answer, query=query, histo=histo_conversation, context=context)
+         answer = self._clean_answer(answer)
+         return answer, block_sources
+
+
+
+     @staticmethod
+     def _select_best_sources(sources: [Block], delta_1_2=0.15, delta_1_n=0.3, absolute=1.2, alpha=0.9) -> [Block]:
+         """
+         Select the best sources: not far from the very best, not far from the last selected, and not too bad per se
+         """
+         best_sources = []
+         for idx, s in enumerate(sources):
+             if idx == 0 \
+                     or (s.distance - sources[idx - 1].distance < delta_1_2
+                         and s.distance - sources[0].distance < delta_1_n) \
+                     or s.distance < absolute:
+                 best_sources.append(s)
+                 delta_1_2 *= alpha
+                 delta_1_n *= alpha
+                 absolute *= alpha
+             else:
+                 break
+         return best_sources
+
+
+     @staticmethod
+     def _get_histo(histo: [(str, str)]) -> (str, str):
+         histo_conversation = ""
+         histo_queries = ""
+
+         for (query, answer) in histo[-5:]:
+             histo_conversation += f'user: {query} \n bot: {answer}\n'
+             histo_queries += query + '\n'
+         return histo_conversation[:-1], histo_queries
+
+
+     @staticmethod
+     def _clean_answer(answer: str) -> str:
+         answer = answer.strip('bot:')
+         while answer and answer[-1] in {"'", '"', " ", "`"}:
+             answer = answer[:-1]
+         while answer and answer[0] in {"'", '"', " ", "`"}:
+             answer = answer[1:]
+         answer = answer.strip('bot:')
+         if answer:
+             if answer[-1] != ".":
+                 answer += "."
+         return answer
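
For reference, a minimal sketch (not part of the commit) of how the `_select_best_sources` thresholds behave; the block contents and distance values are hypothetical, and it assumes the project's dependencies are importable:

    from src.model.block import Block
    from src.control.control import Chatbot

    # Hypothetical retrieval results, sorted by ascending distance.
    blocks = [Block(content=c, distance=d)
              for c, d in [("intro", 0.80), ("auth", 0.90), ("appendix", 1.40)]]

    # idx 0 is always kept; thresholds then decay by alpha=0.9 after each keep.
    # "auth" is kept: gap to previous 0.10 < 0.135 and gap to best 0.10 < 0.27.
    # "appendix" is dropped: both gaps are too large and 1.40 >= 0.972, so the loop stops.
    kept = Chatbot._select_best_sources(blocks)
    print([b.content for b in kept])  # expected: ['intro', 'auth']
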
src/model/__init__.py ADDED
File without changes
src/model/block.py ADDED
@@ -0,0 +1,34 @@
+ class Block:
+     def __init__(self, doc: str = '', title: str = '', content: str = '',
+                  index: str = '', rank: int = 0, level: int = 0, distance: float = 99999):
+         self.doc = doc
+         self.title = title
+         self.content = content
+         self.index = index
+         self.rank = rank
+         self.level = level
+         self.distance = distance
+
+     def to_dict(self) -> {}:
+         block_dict = {'doc': self.doc,
+                       'title': self.title,
+                       'content': self.content,
+                       'index': self.index,
+                       'rank': self.rank,
+                       'level': self.level,
+                       'distance': self.distance}
+         return block_dict
+
+     def from_dict(self, block_dict: {}):
+         self.doc = block_dict['doc']
+         self.title = block_dict['title']
+         self.content = block_dict['content']
+         self.index = block_dict['index']
+         self.rank = block_dict['rank']
+         self.level = block_dict['level']
+         self.distance = block_dict['distance']
+         return self
+
+     @property
+     def distance_str(self) -> str:
+         return format(self.distance, '.2f')
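
A short round-trip sketch (not part of the commit) of how a `Block` is flattened to the dict stored as chromadb metadata and rebuilt from it; the field values are illustrative:

    from src.model.block import Block

    b = Block(doc="guide.pdf", title="Authentication", content="Use an API key ...",
              index="1.2", rank=0, level=2, distance=0.42)
    restored = Block().from_dict(b.to_dict())
    print(restored.index, restored.title, restored.distance_str)  # 1.2 Authentication 0.42
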
src/model/container.py ADDED
@@ -0,0 +1,150 @@
+ from .paragraph import Paragraph
+ from .block import Block
+
+ INFINITE = 99999
+
+ class Container:
+
+     def __init__(self, paragraphs: [Paragraph], title: Paragraph = None, level: int = 0, index: [int] = None, father=None, id_=0):
+         if index is None:
+             index = []
+         self.level = level
+         self.title = title
+         self.paragraphs = []
+         self.children = []
+         self.index = index
+         self.father = father
+         self.id_ = int(str(1) + str(father.id_) + str(id_))
+         if paragraphs:
+             self.paragraphs, self.children = self.create_children(paragraphs, level, index)
+         self.blocks = self.get_blocks()
+
+
+     def get_blocks(self):
+         block = Block(level=self.level, index=self.index)
+         if self.title:
+             block.title = self.title.text
+         for p in self.paragraphs:
+             block.content += p.text
+         blocks = [block] if block.content else []
+         for child in self.children:
+             blocks += child.blocks
+         return blocks
+
+
+     def create_children(self, paragraphs: [Paragraph], level: int, index: [int]):
+         """
+         Creates children containers and/or directly attached content and returns the list of attached content and the list of children containers.
+         The indexes correspond to the indexes of the paragraphs in the content and also on the structure.
+         :return: List of Content or Container
+         """
+         attached_paragraphs = []
+         children = []
+         in_children = False
+         level = INFINITE
+         container_paragraphs = []
+         container_title = None
+
+         while paragraphs:
+             p = paragraphs.pop(0)
+
+             if not in_children and not p.is_structure:
+                 attached_paragraphs.append(p)
+             else:
+                 in_children = True
+                 if p.is_structure and p.level <= level:  # if p is higher in hierarchy, then the child is completed
+                     if container_paragraphs or container_title:
+                         if level <= len(index):
+                             index = index[:level]
+                             index[-1] += 1
+                         else:
+                             for i in range(level - len(index)):
+                                 index.append(1)
+                         children.append(Container(container_paragraphs, container_title, level, index.copy(), self))
+                     container_paragraphs = []
+                     container_title = p
+                     level = p.level
+                 else:  # p is normal text or strictly lower in hierarchy, then the child continues to grow
+                     container_paragraphs.append(p)
+         if container_paragraphs or container_title:
+             if level <= len(index):
+                 index = index[:level]
+                 index[-1] += 1
+             else:
+                 for i in range(level - len(index)):
+                     index.append(1)
+             children.append(Container(container_paragraphs, container_title, level, index.copy(), self))
+
+         return attached_paragraphs, children
+
+
+
+     # def create_children(self, paragraphs: [Paragraph], level: int, index: [int]):
+     #     """
+     #     Creates children containers and/or directly attached content and returns the list of attached content and the list of children containers.
+     #     The indexes correspond to the indexes of the paragraphs in the content and also on the structure.
+     #     :return: List of Content or Container
+     #     """
+     #     attached_paragraphs = []
+     #     children = []
+     #     in_children = False
+     #     level = INFINITE
+     #     # container_paragraphs = []
+     #     # container_title = None
+
+     #     while paragraphs:
+     #         p = paragraphs.pop(0)
+
+     #         if not in_children and p.is_structure and level != INFINITE:
+     #             paragraphs.insert(0, p)
+     #             children.append(Container(paragraphs, title=p, level=p.level, children=children, index=index.copy(), father=self))
+     #         else:
+     #             in_children = True
+     #             if p.is_structure and p.level <= level:  # if p is higher in hierarchy, then the child is completed
+     #                 level = p.level
+     #                 if len(index) == level:
+     #                     index[-1] += 1
+     #                 elif len(index) < level:
+     #                     if self.children != []:
+     #                         index = self.children[-1].index.copy()
+     #                         index[-1] += 1
+     #                     else:
+     #                         index.append(1)
+     #                 else:
+     #                     index = index[:level]
+     #                     index[-1] += 1
+     #                 while paragraphs:
+     #                     p = paragraphs.pop(0)
+     #                     if p.is_structure:
+     #                         paragraphs.insert(0, p)
+     #                         break
+     #                     else:
+     #                         attached_paragraphs.append(p)
+     #                 if paragraphs and p.level > level:
+     #                     in_children = False
+     #                     children.append(Container(paragraphs, title=p, level=p.level, index=index.copy(), father=self))
+     #                 else:
+     #                     break
+     #     return attached_paragraphs, children
+
+
+     @property
+     def structure(self):
+
+         self_structure = {str(self.id_): {
+             'index': str(self.id_),
+             'canMove': True,
+             'isFolder': True,
+             'children': [p.id_ for p in self.paragraphs] + [child.id_ for child in self.children],
+             'canRename': True,
+             'data': {},
+             'level': self.level,
+             'rank': self.rank,
+             'title': self.title.text if self.title else 'root'
+         }}
+         paragraphs_structure = [p.structure for p in self.paragraphs]
+         structure = [self_structure] + paragraphs_structure
+         for child in self.children:
+             structure += child.structure
+         return structure
+
src/model/doc.py ADDED
@@ -0,0 +1,65 @@
+ from src.model.container import Container
+ from src.model.paragraph import Paragraph
+ from src.tools.reader import get_pdf_title_styles
+
+
+ class Doc:
+
+     def __init__(self, path='', id_=None):
+
+         self.title = path.split('/')[-1]
+         self.id_ = id(self)
+         self.path = path
+         paragraphs = get_pdf_title_styles(path)
+         self.container = Container(paragraphs, father=self, level=0)
+         self.blocks = self.get_blocks()
+
+     @property
+     def structure(self):
+
+         return self.container.structure
+
+     def get_blocks(self):
+
+         def from_list_to_str(index_list):
+             index_str = str(index_list[0])
+             for el in index_list[1:]:
+                 index_str += '.' + str(el)
+             return index_str
+
+         blocks = self.container.blocks
+         blocks = self.delete_duplicate()
+         for block in blocks:
+             block.doc = self.title
+             block.index = from_list_to_str(block.index)
+         return blocks
+
+
+     def delete_duplicate(self):
+         while self.found_duplicates(self.container.blocks):
+             for i in range(len(self.container.blocks) - 1):
+                 if self.container.blocks[i].index == self.container.blocks[i + 1].index:
+                     if self.container.blocks[i].index != []:
+                         self.container.blocks[i].index.pop()
+         return self.container.blocks
+
+     def found_duplicates(self, blocks):
+         for i in range(len(blocks) - 1):
+             if blocks[i].index == blocks[i + 1].index:
+                 return True
+         return False
+
+     """
+     current_level = len(current_index)
+     if 0 < block.level:
+         if block.level == current_level:
+             current_index[-1] += 1
+         elif current_level < block.level:
+             current_index.append(1)
+         elif block.level < current_level:
+             current_index = current_index[:block.level]
+             current_index[-1] += 1
+         block.index = from_list_to_str(current_index)
+     else:
+         block.index = "0"
+     """
src/model/paragraph.py ADDED
@@ -0,0 +1,20 @@
+ import string
+
+ INFINITE = 10000
+
+ class Paragraph:
+     def __init__(self, text: str, font_style: str, id_: int, page_id: int):
+         self.font_style = font_style
+         self.id_ = int(str(2) + str(page_id) + str(id_))
+         self.page_id = page_id
+         self.level = int(font_style[-1]) if 'title' in font_style else INFINITE
+         self.is_structure = self.level < INFINITE
+         self.text = text
+
+     @property
+     def blank(self):
+         """
+         checks whether the paragraph is blank, i.e. carries no signal (such paragraphs may be ignored)
+         """
+         text = self.text.replace('\n', '')
+         return set(text).isdisjoint(string.ascii_letters)
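
For reference, a small illustration (not part of the commit) of how `Paragraph` derives its hierarchy level and structure flag from the `font_style` label produced by the reader; the text, ids and page numbers are hypothetical:

    from src.model.paragraph import Paragraph

    heading = Paragraph("4.2 Security Policy", font_style="title2", id_=3, page_id=12)
    body = Paragraph("The API returns ...", font_style="content", id_=4, page_id=12)

    print(heading.level, heading.is_structure)  # 2 True   ('title2' -> level 2)
    print(body.level, body.is_structure)        # 10000 False  (INFINITE, plain content)
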
src/tools/__init__.py ADDED
File without changes
src/tools/llm.py ADDED
@@ -0,0 +1,58 @@
+ class LlmAgent:
+
+     def __init__(self, llm):
+         self.llm = llm
+
+     def generate_paragraph(self, query: str, context: {}, histo: [(str, str)], language='fr') -> str:
+         """generates the answer"""
+         template = (f"You are a conversation bot designed to answer to the query from users delimited by "
+                     f"triple backticks: "
+                     f"\\n ``` {query} ```\\n"
+                     f"Your answer is based on the context delimited by triple backticks: "
+                     f"\\n ``` {context} ```\\n"
+                     f"You are consistent and avoid redundancies with the rest of the initial conversation in French"
+                     f"delimited by triple backticks: "
+                     f"\\n ``` {histo} ```\\n"
+                     f"Your response shall be in {language} and shall be concise"
+                     f"In case the provided context is not relevant to answer to the question, just return that you "
+                     f"don't know the answer ")
+
+         p = self.llm(template)
+         print("****************")
+         print(template)
+         print("----")
+         print(p)
+         return p
+
+     def translate(self, text: str, language="en") -> str:
+         """translates"""
+
+         languages = "`French to English" if language == "en" else "English to French"
+
+         template = (f" Your task consists in translating {languages}\\n"
+                     f" the following text delimited by by triple backticks: ```{text}```\n"
+                     )
+
+         p = self.llm(template)
+         return p
+
+     def generate_answer(self, query: str, answer_en: str, histo: str, context: str) -> str:
+         """provides the final answer in French based on the initial query and the answer in english"""
+
+         def _cut_unfinished_sentence(s: str):
+             return '.'.join(s.split('.')[:-1])
+
+         template = (f"Your task consists in translating the answer in French to the query "
+                     f"delimited by triple backticks: ```{query}``` \\n"
+                     f"You are given the answer in english delimited by triple backticks: ```{answer_en}```"
+                     f"\\n You don't add new content to the answer in English but: "
+                     f"\\n 1 You can use some vocabulary from the context in English delimited by triple backticks: "
+                     f"```{context}```"
+                     f"\\n 2 You are consistent and avoid redundancies with the rest of the initial"
+                     f" conversation in English delimited by triple backticks: ```{histo}```"
+                     )
+
+         p = self.llm(template)
+         # p = _cut_unfinished_sentence(p)
+         return p
+
src/tools/pretty_print.py ADDED
@@ -0,0 +1,25 @@
+ from src.model.paragraph import Paragraph
+ from src.model.container import Container
+
+
+ # function that pretty prints the paragraphs
+ def pretty_printer_paragraphs(paragraphs):
+     for p in paragraphs:
+         if (p.font_style == "title1"):
+             print(f"Titre 1 {p.text}")
+         elif (p.font_style == "title2"):
+             print(f"---> Titre 2 {p.text}")
+         elif (p.font_style == "title3"):
+             print(f"-------> Titre 3 {p.text}")
+         # elif (p.font_style == "title4"):
+         #     print(f"-----------> Titre 4 {p.text}")
+         # elif (p.font_style == "content"):
+         #     print(f"---------------> {p.text}")
+
+ def pretty_print_container_structure(container):
+     if container.title:
+         print(f"{'-'*container.level} {container.title.text}")
+     for p in container.paragraphs:
+         print(f"{'-'*container.level} {p.text}")
+     for c in container.children:
+         pretty_print_container_structure(c)
src/tools/reader.py ADDED
@@ -0,0 +1,102 @@
+ import os
+ import pdfplumber as pdfp
+ from src.model.paragraph import Paragraph
+ import asyncio
+
+ def skip_header(dictionary):
+     i = 0
+     if not (dictionary[i]["chars"][0]["size"] > 19 and dictionary[i]["chars"][0]["size"] < 30):
+         i += 2
+     return i
+
+
+ def get_style_of_line(size: float):
+     if size >= 9 and size < 11.5:
+         return "content"
+     elif size >= 11.5 and size <= 12.7:
+         return "title5"
+     elif size >= 12.8 and size <= 13.5:
+         return "title4"
+     elif size > 13.5 and size <= 15.5:
+         return "title3"
+     elif size > 15.5 and size <= 18.5:
+         return "title2"
+     elif size > 19 and size < 30:
+         return "title1"
+     # elif size >= 12 and size <= 14.5:
+     #     return "title2"
+     # elif size > 14.5 and size <= 16.5:
+     #     return "title1"
+     else:
+         return "unknown"
+
+ def get_pdf_title_styles(path):
+     pdf_to_read = extract_all_lines_from_the_doc(path)
+     paragraphs = []
+     j = 0
+     while j < len(pdf_to_read):
+         dictionary = pdf_to_read[j]["content"]
+         i = skip_header(dictionary)
+         while i < len(dictionary):
+             # print(f"{dictionary[i]['chars'][0]} : {dictionary[i]['text']}")
+             if dictionary[i]["text"].startswith("RESTAPIDeveloperGuide"):
+                 i += 1
+                 continue
+             p = Paragraph(dictionary[i]["text"], font_style=get_style_of_line(dictionary[i]["chars"][0]["size"]), id_=i, page_id=pdf_to_read[j]["page_number"])
+             if i != len(dictionary) - 1:
+                 while dictionary[i + 1]["chars"][0]["size"] == dictionary[i]["chars"][0]["size"]:
+                     p.text += " " + dictionary[i + 1]["text"]
+                     i += 1
+                     # if(i == len(dictionary)-1):
+                     #     print("PIDOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO")
+                     #     if(j == len(pdf_to_read)-1):
+                     #         print("JUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU")
+                     #         break
+                     #     else:
+                     #         if(dictionary[i]["chars"][0]["size"] == pdf_to_read[j+1]["content"][0]["chars"][0]["size"]):
+                     #             print("MAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA")
+                     #             j += 1
+                     #             p.text += " " + pdf_to_read[j]["content"][0]["text"]
+                     #             dictionary = pdf_to_read[j]["content"]
+                     #             i = 0
+                     #         else:
+                     #             print("RRIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIZ")
+                     #             break
+             else:
+                 p.text = dictionary[i]["text"]
+                 # print(f"{dictionary[i]['chars'][0]} : {dictionary[i]['text']}")
+             i += 1
+             # print(f'{p.page_id} : {p.font_style} ->>>>> {p.text}')
+             paragraphs.append(p)
+         j += 1
+     return paragraphs
+
+
+ def test_get_font_sizes_of_a_page(page: int, path):
+     with open(os.path.abspath(path)) as f:
+         reader = pdfp.PDF(f)
+         page = reader.pages[page]
+         dictionary = page.extract_text_lines()
+         for i in range(len(dictionary)):
+             print(f'{i} : {dictionary[i]["chars"][0]["size"]} ->>>>> {dictionary[i]["text"]}')
+
+
+ def extract_all_lines_from_the_doc(path):
+     lines_of_doc = []
+     with open(path, 'rb') as f:
+         reader = pdfp.PDF(f)
+         skip_table_of_contents = reader.pages[8:]
+         j = 0
+         while j < len(skip_table_of_contents):
+             lines_of_doc.append({"page_number": j + 9, "content": skip_table_of_contents[j].extract_text_lines()})
+             j += 1
+     return lines_of_doc
+
+
+
+
+ # path = "data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf"
+ # get_pdf_title_styles(os.path.abspath(path))
+ # print("--------------------------------------------------")
+ # print("--------------------------------------------------")
+ # print(test_get_font_sizes_of_a_page(8))
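
For reference, a quick check (not part of the commit) of the font-size-to-style mapping used to classify extracted lines; the sizes below are illustrative:

    from src.tools.reader import get_style_of_line

    for size in (10.0, 12.0, 13.0, 14.0, 16.0, 24.0, 35.0):
        print(size, get_style_of_line(size))
    # expected: content, title5, title4, title3, title2, title1, unknown
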
src/tools/retriever.py ADDED
@@ -0,0 +1,30 @@
+ from src.model.container import Container
+ from src.model.block import Block
+ from src.model.doc import Doc
+
+ class Retriever:
+     def __init__(self, db_client, doc: Doc = None, collection_name: str = "illumio_database"):
+         if doc != None:
+             self.collection = db_client.create_collection(name=collection_name)
+             blocks_good_format: [Block] = doc.blocks
+             self.collection.add(
+                 documents=[block.content for block in blocks_good_format],
+                 ids=[block.index for block in blocks_good_format],
+                 metadatas=[block.to_dict() for block in blocks_good_format]
+             )
+         else:
+             self.collection = db_client.get_collection(name=collection_name)
+
+
+
+     def similarity_search(self, query: str) -> {}:
+         res = self.collection.query(query_texts=query)
+         block_dict_sources = res['metadatas'][0]
+         distances = res['distances'][0]
+         blocks = []
+         for bd, d in zip(block_dict_sources, distances):
+             b = Block().from_dict(bd)
+             b.distance = d
+             blocks.append(b)
+         return blocks
+
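
A hedged usage sketch (not part of the commit) of the two `Retriever` code paths against a persistent chromadb client; the collection name and paths mirror app.py and config.py, and the query string is hypothetical:

    import chromadb
    from src.model.doc import Doc
    from src.tools.retriever import Retriever

    client_db = chromadb.PersistentClient(path="Ilumio_chatbot/database/")

    # First run: the collection does not exist yet, so parse the PDF and index its blocks.
    doc = Doc(path="Ilumio_chatbot/data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf")
    retriever = Retriever(client_db, doc, "illumio_database")

    # Later runs: reuse the stored collection without re-parsing the PDF.
    retriever = Retriever(client_db, None, "illumio_database")

    for block in retriever.similarity_search("How do I authenticate to the REST API?"):
        print(block.index, block.distance_str, block.title)
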
src/tools/test_read.py ADDED
@@ -0,0 +1,209 @@
+ # To read the PDF
+ import PyPDF2
+ # To analyze the PDF layout and extract text
+ from pdfminer.high_level import extract_pages, extract_text
+ from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
+ # To extract text from tables in PDF
+ import pdfplumber
+ # To extract the images from the PDFs
+ from PIL import Image
+ from pdf2image import convert_from_path
+ # To perform OCR to extract text from images
+ import pytesseract
+ # To remove the additional created files
+ import os
+
+ def text_extraction(element):
+     # Extracting the text from the in-line text element
+     line_text = element.get_text()
+
+     # Find the formats of the text
+     # Initialize the list with all the formats that appeared in the line of text
+     line_formats = []
+     for text_line in element:
+         if isinstance(text_line, LTTextContainer):
+             # Iterating through each character in the line of text
+             for character in text_line:
+                 if isinstance(character, LTChar):
+                     # Append the font name of the character
+                     line_formats.append(character.fontname)
+                     # Append the font size of the character
+                     line_formats.append(character.size)
+     # Find the unique font sizes and names in the line
+     format_per_line = list(set(line_formats))
+
+     # Return a tuple with the text in each line along with its format
+     return (line_text, format_per_line)
+
+
+ def crop_image(element, pageObj):
+     # Get the coordinates to crop the image from the PDF
+     [image_left, image_top, image_right, image_bottom] = [element.x0, element.y0, element.x1, element.y1]
+     # Crop the page using coordinates (left, bottom, right, top)
+     pageObj.mediabox.lower_left = (image_left, image_bottom)
+     pageObj.mediabox.upper_right = (image_right, image_top)
+     # Save the cropped page to a new PDF
+     cropped_pdf_writer = PyPDF2.PdfWriter()
+     cropped_pdf_writer.add_page(pageObj)
+     # Save the cropped PDF to a new file
+     with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
+         cropped_pdf_writer.write(cropped_pdf_file)
+
+ # Create a function to convert the PDF to images
+ def convert_to_images(input_file,):
+     images = convert_from_path(input_file, poppler_path=r'C:\Program Files\poppler-23.08.0\Library\bin')
+     image = images[0]
+     output_file = "PDF_image.png"
+     image.save(output_file, "PNG")
+
+ # Create a function to read text from images
+ def image_to_text(image_path):
+     # Read the image
+     img = Image.open(image_path)
+     # Extract the text from the image
+     text = pytesseract.image_to_string(img)
+     return text
+
+
+ def extract_table(pdf_path, page_num, table_num):
+     # Open the pdf file
+     pdf = pdfplumber.open(pdf_path)
+     # Find the examined page
+     table_page = pdf.pages[page_num]
+     # Extract the appropriate table
+     table = table_page.extract_tables()[table_num]
+     return table
+
+ # Convert table into the appropriate format
+ def table_converter(table):
+     table_string = ''
+     # Iterate through each row of the table
+     for row_num in range(len(table)):
+         row = table[row_num]
+         # Remove the line breaker from the wrapped texts
+         cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
+         # Convert the table into a string
+         table_string += ('|' + '|'.join(cleaned_row) + '|' + '\n')
+     # Removing the last line break
+     table_string = table_string[:-1]
+     return table_string
+
+
+
+ def pdf_manager(pdf_path):
+     # create a PDF file object
+     pdfFileObj = open(pdf_path, 'rb')
+     # create a PDF reader object
+     pdfReaded = PyPDF2.PdfReader(pdfFileObj)
+
+     # Create the dictionary to extract text from each image
+     text_per_page = {}
+     # We extract the pages from the PDF
+     for pagenum, page in enumerate(extract_pages(pdf_path)):
+
+         # Initialize the variables needed for the text extraction from the page
+         pageObj = pdfReaded.pages[pagenum]
+         page_text = []
+         line_format = []
+         text_from_images = []
+         text_from_tables = []
+         page_content = []
+         # Initialize the number of the examined tables
+         table_num = 0
+         first_element = True
+         table_extraction_flag = False
+         # Open the pdf file
+         pdf = pdfplumber.open(pdf_path)
+         # Find the examined page
+         page_tables = pdf.pages[pagenum]
+         # Find the number of tables on the page
+         tables = page_tables.find_tables()
+
+
+         # Find all the elements
+         page_elements = [(element.y1, element) for element in page._objs]
+         # Sort all the elements as they appear in the page
+         page_elements.sort(key=lambda a: a[0], reverse=True)
+
+         # Find the elements that composed a page
+         for i, component in enumerate(page_elements):
+             # Extract the position of the top side of the element in the PDF
+             pos = component[0]
+             # Extract the element of the page layout
+             element = component[1]
+
+             # Check if the element is a text element
+             if isinstance(element, LTTextContainer):
+                 # Check if the text appeared in a table
+                 if table_extraction_flag == False:
+                     # Use the function to extract the text and format for each text element
+                     (line_text, format_per_line) = text_extraction(element)
+                     # Append the text of each line to the page text
+                     page_text.append(line_text)
+                     # Append the format for each line containing text
+                     line_format.append(format_per_line)
+                     page_content.append(line_text)
+                 else:
+                     # Omit the text that appeared in a table
+                     pass
+
+             # Check the elements for images
+             if isinstance(element, LTFigure):
+                 # Crop the image from the PDF
+                 crop_image(element, pageObj)
+                 # Convert the cropped pdf to an image
+                 convert_to_images('cropped_image.pdf')
+                 # Extract the text from the image
+                 image_text = image_to_text('PDF_image.png')
+                 text_from_images.append(image_text)
+                 page_content.append(image_text)
+                 # Add a placeholder in the text and format lists
+                 page_text.append('image')
+                 line_format.append('image')
+
+             # Check the elements for tables
+             if isinstance(element, LTRect):
+                 # If the first rectangular element
+                 if first_element == True and (table_num + 1) <= len(tables):
+                     # Find the bounding box of the table
+                     lower_side = page.bbox[3] - tables[table_num].bbox[3]
+                     upper_side = element.y1
+                     # Extract the information from the table
+                     table = extract_table(pdf_path, pagenum, table_num)
+                     # Convert the table information in structured string format
+                     table_string = table_converter(table)
+                     # Append the table string into a list
+                     text_from_tables.append(table_string)
+                     page_content.append(table_string)
+                     # Set the flag as True to avoid the content again
+                     table_extraction_flag = True
+                     # Make it another element
+                     first_element = False
+                     # Add a placeholder in the text and format lists
+                     page_text.append('table')
+                     line_format.append('table')
+
+                 # Check if we already extracted the tables from the page
+                 if element.y0 >= lower_side and element.y1 <= upper_side:
+                     pass
+                 elif not isinstance(page_elements[i + 1][1], LTRect):
+                     table_extraction_flag = False
+                     first_element = True
+                     table_num += 1
+
+
+         # Create the key of the dictionary
+         dctkey = 'Page_' + str(pagenum)
+         # Add the list of list as the value of the page key
+         text_per_page[dctkey] = [page_text, line_format, text_from_images, text_from_tables, page_content]
+
+     # Closing the pdf file object
+     pdfFileObj.close()
+
+     # Deleting the additional files created
+     os.remove('cropped_image.pdf')
+     os.remove('PDF_image.png')
+
+     # Display the content of the page
+     result = ''.join(text_per_page['Page_0'][4])
+     print(result)
src/view/view.py ADDED
@@ -0,0 +1,112 @@
+ import gradio as gr
+ from src.control.control import Chatbot
+
+
+ def run(ctrl: Chatbot, config: {}):
+     with gr.Blocks() as qna:
+         with gr.Row():
+             with gr.Column():
+                 pass
+
+             with gr.Column(scale=10):
+
+                 gr.Markdown(config['title'])
+
+                 histo_text_comp = gr.Chatbot(
+                     visible=False,
+                     value=[],
+                 )
+                 input_text_comp = gr.Textbox(
+                     label="",
+                     lines=1,
+                     max_lines=3,
+                     interactive=True,
+                     placeholder="Posez votre question ici",
+                 )
+                 clear_btn = gr.Button("Clear")
+                 input_example_comp = gr.Radio(
+                     label="Examples",
+                     choices=list(config['examples'].values()),
+                     value="",
+                 )
+                 source_text_comp = []
+                 for i in range(4):
+                     source_text_comp.append(gr.Textbox(
+                         lines=4,
+                         max_lines=4,
+                         interactive=False,
+                         visible=False,
+                     ))
+
+             with gr.Column():
+                 pass
+
+         def input_text_fn1(input_text_, histo_text_):
+             histo_text_.append((input_text_, None))
+             update_ = {
+                 histo_text_comp: gr.update(visible=True, value=histo_text_),
+                 input_example_comp: gr.update(visible=False,),
+             }
+             for i in range(4):
+                 update_[source_text_comp[i]] = gr.update(visible=False)
+             return update_
+
+         def input_text_fn2(input_text_, histo_text_):
+             answer, sources = ctrl.get_response(query=input_text_, histo=histo_text_)
+             histo_text_[-1] = (input_text_, answer)
+             update_ = {
+                 histo_text_comp: gr.update(value=histo_text_),
+                 input_text_comp: gr.update(value=''),
+             }
+             for i in range(min(len(sources), 3)):
+                 s = sources[i]
+                 source_label = f'{s.index} {s.title} score = {s.distance_str}'
+                 source_text = s.content
+                 update_[source_text_comp[i]] = gr.update(visible=True, value=source_text, label=source_label)
+             return update_
+
+         def input_example_fn(input_example_, histo_text_):
+             histo_text_.append((input_example_, None))
+             update_ = {
+                 input_text_comp: gr.update(value=input_example_),
+                 histo_text_comp: gr.update(visible=True, value=histo_text_),
+                 input_example_comp: gr.update(visible=False, value=''),
+             }
+             for i in range(4):
+                 update_[source_text_comp[i]] = gr.update(visible=False)
+             return update_
+
+         def clear_fn():
+             update_ = {
+                 input_text_comp: gr.update(value=''),
+                 histo_text_comp: gr.update(value='', visible=False),
+                 input_example_comp: gr.update(value='', visible=True),
+             }
+             for i in range(4):
+                 update_[source_text_comp[i]] = gr.update(visible=False, value='hello')
+             return update_
+
+         input_text_comp \
+             .submit(input_text_fn1,
+                     inputs=[input_text_comp, histo_text_comp],
+                     outputs=[histo_text_comp, input_example_comp,
+                              source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])\
+             .then(input_text_fn2,
+                   inputs=[input_text_comp, histo_text_comp],
+                   outputs=[input_text_comp, histo_text_comp,
+                            source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
+         input_example_comp \
+             .input(input_example_fn,
+                    inputs=[input_example_comp, histo_text_comp],
+                    outputs=[input_text_comp, histo_text_comp, input_example_comp,
+                             source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])\
+             .then(input_text_fn2,
+                   inputs=[input_text_comp, histo_text_comp],
+                   outputs=[input_text_comp, histo_text_comp,
+                            source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
+         clear_btn.click(clear_fn,
+                         inputs=None,
+                         outputs=[input_text_comp, histo_text_comp, input_example_comp,
+                                  source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
+
+     return qna