Spaces:
Runtime error
Runtime error
adrien.aribaut-gaudin
committed on
Commit
·
9ff01ff
1
Parent(s):
29de006
push on hugging_face
Browse files- .gitattributes +2 -0
- .gitignore +166 -0
- app.py +48 -0
- config.py +14 -0
- data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf +3 -0
- database/4a5944a6-5c35-44f8-88be-78ce2e35028c/data_level0.bin +3 -0
- database/4a5944a6-5c35-44f8-88be-78ce2e35028c/header.bin +3 -0
- database/4a5944a6-5c35-44f8-88be-78ce2e35028c/length.bin +3 -0
- database/4a5944a6-5c35-44f8-88be-78ce2e35028c/link_lists.bin +0 -0
- database/chroma.sqlite3 +3 -0
- src/__init__.py +0 -0
- src/control/__init__.py +0 -0
- src/control/control.py +69 -0
- src/model/__init__.py +0 -0
- src/model/block.py +34 -0
- src/model/container.py +150 -0
- src/model/doc.py +65 -0
- src/model/paragraph.py +20 -0
- src/tools/__init__.py +0 -0
- src/tools/llm.py +58 -0
- src/tools/pretty_print.py +25 -0
- src/tools/reader.py +102 -0
- src/tools/retriever.py +30 -0
- src/tools/test_read.py +209 -0
- src/view/view.py +112 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
37 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
config_key.py
|
2 |
+
|
3 |
+
|
4 |
+
#Test folder
|
5 |
+
data/Test/
|
6 |
+
|
7 |
+
# Byte-compiled / optimized / DLL files
|
8 |
+
__pycache__/
|
9 |
+
*.py[cod]
|
10 |
+
*$py.class
|
11 |
+
|
12 |
+
# C extensions
|
13 |
+
*.so
|
14 |
+
|
15 |
+
# Distribution / packaging
|
16 |
+
.Python
|
17 |
+
build/
|
18 |
+
develop-eggs/
|
19 |
+
dist/
|
20 |
+
downloads/
|
21 |
+
eggs/
|
22 |
+
.eggs/
|
23 |
+
lib/
|
24 |
+
lib64/
|
25 |
+
parts/
|
26 |
+
sdist/
|
27 |
+
var/
|
28 |
+
wheels/
|
29 |
+
share/python-wheels/
|
30 |
+
*.egg-info/
|
31 |
+
.installed.cfg
|
32 |
+
*.egg
|
33 |
+
MANIFEST
|
34 |
+
|
35 |
+
# PyInstaller
|
36 |
+
# Usually these files are written by a python script from a template
|
37 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
38 |
+
*.manifest
|
39 |
+
*.spec
|
40 |
+
|
41 |
+
# Installer logs
|
42 |
+
pip-log.txt
|
43 |
+
pip-delete-this-directory.txt
|
44 |
+
|
45 |
+
# Unit test / coverage reports
|
46 |
+
htmlcov/
|
47 |
+
.tox/
|
48 |
+
.nox/
|
49 |
+
.coverage
|
50 |
+
.coverage.*
|
51 |
+
.cache
|
52 |
+
nosetests.xml
|
53 |
+
coverage.xml
|
54 |
+
*.cover
|
55 |
+
*.py,cover
|
56 |
+
.hypothesis/
|
57 |
+
.pytest_cache/
|
58 |
+
cover/
|
59 |
+
|
60 |
+
# Translations
|
61 |
+
*.mo
|
62 |
+
*.pot
|
63 |
+
|
64 |
+
# Django stuff:
|
65 |
+
*.log
|
66 |
+
local_settings.py
|
67 |
+
db.sqlite3
|
68 |
+
db.sqlite3-journal
|
69 |
+
|
70 |
+
# Flask stuff:
|
71 |
+
instance/
|
72 |
+
.webassets-cache
|
73 |
+
|
74 |
+
# Scrapy stuff:
|
75 |
+
.scrapy
|
76 |
+
|
77 |
+
# Sphinx documentation
|
78 |
+
docs/_build/
|
79 |
+
|
80 |
+
# PyBuilder
|
81 |
+
.pybuilder/
|
82 |
+
target/
|
83 |
+
|
84 |
+
# Jupyter Notebook
|
85 |
+
.ipynb_checkpoints
|
86 |
+
|
87 |
+
# IPython
|
88 |
+
profile_default/
|
89 |
+
ipython_config.py
|
90 |
+
|
91 |
+
# pyenv
|
92 |
+
# For a library or package, you might want to ignore these files since the code is
|
93 |
+
# intended to run in multiple environments; otherwise, check them in:
|
94 |
+
# .python-version
|
95 |
+
|
96 |
+
# pipenv
|
97 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
98 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
99 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
100 |
+
# install all needed dependencies.
|
101 |
+
#Pipfile.lock
|
102 |
+
|
103 |
+
# poetry
|
104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
106 |
+
# commonly ignored for libraries.
|
107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
108 |
+
#poetry.lock
|
109 |
+
|
110 |
+
# pdm
|
111 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
112 |
+
#pdm.lock
|
113 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
114 |
+
# in version control.
|
115 |
+
# https://pdm.fming.dev/#use-with-ide
|
116 |
+
.pdm.toml
|
117 |
+
|
118 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
119 |
+
__pypackages__/
|
120 |
+
|
121 |
+
# Celery stuff
|
122 |
+
celerybeat-schedule
|
123 |
+
celerybeat.pid
|
124 |
+
|
125 |
+
# SageMath parsed files
|
126 |
+
*.sage.py
|
127 |
+
|
128 |
+
# Environments
|
129 |
+
.env
|
130 |
+
.venv
|
131 |
+
env/
|
132 |
+
venv/
|
133 |
+
ENV/
|
134 |
+
env.bak/
|
135 |
+
venv.bak/
|
136 |
+
|
137 |
+
# Spyder project settings
|
138 |
+
.spyderproject
|
139 |
+
.spyproject
|
140 |
+
|
141 |
+
# Rope project settings
|
142 |
+
.ropeproject
|
143 |
+
|
144 |
+
# mkdocs documentation
|
145 |
+
/site
|
146 |
+
|
147 |
+
# mypy
|
148 |
+
.mypy_cache/
|
149 |
+
.dmypy.json
|
150 |
+
dmypy.json
|
151 |
+
|
152 |
+
# Pyre type checker
|
153 |
+
.pyre/
|
154 |
+
|
155 |
+
# pytype static type analyzer
|
156 |
+
.pytype/
|
157 |
+
|
158 |
+
# Cython debug symbols
|
159 |
+
cython_debug/
|
160 |
+
|
161 |
+
# PyCharm
|
162 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
163 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
164 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
165 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
166 |
+
#.idea/
|
app.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import os
|
3 |
+
from langchain.llms import OpenAI
|
4 |
+
import chromadb
|
5 |
+
|
6 |
+
from config import *
|
7 |
+
from src.tools.reader import get_pdf_title_styles
|
8 |
+
from src.tools.llm import LlmAgent
|
9 |
+
import src.view.view as view
|
10 |
+
from src.tools.pretty_print import pretty_print_container_structure, pretty_printer_paragraphs
|
11 |
+
from src.model.container import Container
|
12 |
+
from src.control.control import Chatbot
|
13 |
+
from src.tools.retriever import Retriever
|
14 |
+
from src.model.doc import Doc
|
15 |
+
from src.tools.test_read import pdf_manager
|
16 |
+
|
17 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
18 |
+
|
19 |
+
if not "OPENAI_API_KEY" in os.environ:
|
20 |
+
from config_key import OPENAI_API_KEY
|
21 |
+
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
|
22 |
+
|
23 |
+
#check if the database is empty
|
24 |
+
# pdf_manager(pdf_path=content_en_path_real)
|
25 |
+
# pretty_printer_paragraphs(doc.container.paragraphs)
|
26 |
+
# pretty_print_container_structure(doc.container)
|
27 |
+
|
28 |
+
if not os.path.exists("Ilumio_chatbot/database/"):
|
29 |
+
os.makedirs("Ilumio_chatbot/database/")
|
30 |
+
|
31 |
+
client_db = chromadb.PersistentClient(path="Ilumio_chatbot/database/")
|
32 |
+
|
33 |
+
try:
|
34 |
+
client_db.get_collection(name="illumio_database")
|
35 |
+
retriever = Retriever(client_db, None, "illumio_database")
|
36 |
+
except:
|
37 |
+
print("Database is empty")
|
38 |
+
doc = Doc(path=content_en_path_real)
|
39 |
+
retriever = Retriever(client_db,doc.container,"illumio_database")
|
40 |
+
|
41 |
+
llm_model = OpenAI(temperature=0)
|
42 |
+
llm = LlmAgent(llm_model)
|
43 |
+
|
44 |
+
chat = Chatbot(llm_agent=llm, retriever=retriever)
|
45 |
+
|
46 |
+
ilumio_qna = view.run(ctrl=chat, config=view_config)
|
47 |
+
|
48 |
+
ilumio_qna.queue().launch()
|
config.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Language settings for content extraction and answer generation.
content_language = 'en'
plan_language = 'en'

# Source-document locations.
content_en_path_real = "Ilumio_chatbot/data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf"
content_test = "Ilumio_chatbot/data/Test/Test_children.pdf"


# Example questions displayed in the UI.
examples = {
    "Question banale?": "Pourquoi le ciel est bleu?",
}


# Configuration passed to the view layer.
view_config = {
    'title': '# Ilumio Q&A',
    'examples': examples,
}
|
data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8821bd9530837f23a99e6b5d17d1e893f74d91ac6112c861d4ecd3f830e42479
|
3 |
+
size 4115867
|
database/4a5944a6-5c35-44f8-88be-78ce2e35028c/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d3c9fd302f000d7790aa403c2d0d8fec363fe46f30b07d53020b6e33b22435a9
|
3 |
+
size 1676000
|
database/4a5944a6-5c35-44f8-88be-78ce2e35028c/header.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
|
3 |
+
size 100
|
database/4a5944a6-5c35-44f8-88be-78ce2e35028c/length.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5b8e0bc4024bd2c60f910c5ea628a6b78cab039442c2c1e1576985ca11d8ab69
|
3 |
+
size 4000
|
database/4a5944a6-5c35-44f8-88be-78ce2e35028c/link_lists.bin
ADDED
File without changes
|
database/chroma.sqlite3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c5f6d73b67b4014dc1c30e357807cf0910ce18f617f99ae46d18126d541ad3a2
|
3 |
+
size 5914624
|
src/__init__.py
ADDED
File without changes
|
src/control/__init__.py
ADDED
File without changes
|
src/control/control.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
|
3 |
+
from src.tools.retriever import Retriever
|
4 |
+
from src.tools.llm import LlmAgent
|
5 |
+
from src.model.block import Block
|
6 |
+
|
7 |
+
|
8 |
+
class Chatbot:
    """Retrieval-augmented chatbot controller.

    Combines a vector-store retriever (source selection) with an LLM agent
    (answer generation) to answer user queries using conversation history.
    """

    def __init__(self, llm_agent, retriever):
        self.retriever = retriever
        self.llm = llm_agent

    def get_response(self, query, histo):
        """Answer `query` given the conversation history `histo`.

        Returns (answer, block_sources): the cleaned answer string and the
        retrieved blocks the answer was grounded on.
        """
        histo_conversation, histo_queries = self._get_histo(histo)
        queries = histo_queries
        block_sources = self.retriever.similarity_search(query=queries)
        block_sources = self._select_best_sources(block_sources)
        sources_contents = [s.content for s in block_sources]
        context = '\n'.join(sources_contents)
        answer = self.llm.generate_paragraph(query=queries, histo=histo_conversation, context=context, language='en')
        answer = self.llm.generate_answer(answer_en=answer, query=query, histo=histo_conversation, context=context)
        answer = self._clean_answer(answer)
        return answer, block_sources

    @staticmethod
    def _select_best_sources(sources: list, delta_1_2=0.15, delta_1_n=0.3, absolute=1.2, alpha=0.9) -> list:
        """
        Select the best sources: not far from the very best, not far from the
        last selected, and not too bad per se.

        `sources` is a list of Block objects sorted by ascending distance.
        Fix: the original annotations were literal lists `[Block]`, which are
        evaluated at definition time and are not valid type hints.
        """
        best_sources = []
        for idx, s in enumerate(sources):
            # Keep a source when it is the first, close enough to both the
            # previous pick and the overall best, or good in absolute terms.
            if idx == 0 \
                    or (s.distance - sources[idx - 1].distance < delta_1_2
                        and s.distance - sources[0].distance < delta_1_n) \
                    or s.distance < absolute:
                best_sources.append(s)
                # Tighten every threshold for each additional source accepted.
                delta_1_2 *= alpha
                delta_1_n *= alpha
                absolute *= alpha
            else:
                break
        return best_sources

    @staticmethod
    def _get_histo(histo: list) -> tuple:
        """Render the last 5 (query, answer) turns as a transcript string and
        as a newline-joined list of the past queries."""
        histo_conversation = ""
        histo_queries = ""

        for (query, answer) in histo[-5:]:
            histo_conversation += f'user: {query} \n bot: {answer}\n'
            histo_queries += query + '\n'
        # Drop the trailing newline of the transcript.
        return histo_conversation[:-1], histo_queries

    @staticmethod
    def _clean_answer(answer: str) -> str:
        """Normalize an LLM answer: drop a leading 'bot:' tag, strip
        surrounding quotes/backticks/spaces, and ensure a trailing period.

        Bug fix: the original used str.strip('bot:'), which removes ANY of
        the characters {'b','o','t',':'} from BOTH ends — e.g. it would eat
        the tail of an answer ending in 'robot'. Only a literal leading
        'bot:' prefix is removed now.
        """
        if answer.startswith('bot:'):
            answer = answer[len('bot:'):]
        while answer and answer[-1] in {"'", '"', " ", "`"}:
            answer = answer[:-1]
        while answer and answer[0] in {"'", '"', " ", "`"}:
            answer = answer[1:]
        # The prefix may only become visible after the quotes are stripped.
        if answer.startswith('bot:'):
            answer = answer[len('bot:'):]
        if answer:
            if answer[-1] != ".":
                answer += "."
        return answer
|
src/model/__init__.py
ADDED
File without changes
|
src/model/block.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
class Block:
    """A retrievable chunk of a document: a title plus its attached content,
    with position metadata (index, rank, level) and a retrieval distance.

    Fix: the original annotated to_dict/from_dict with the literal `{}`,
    which is an evaluated empty-dict expression, not a type; replaced with
    `dict`.
    """

    def __init__(self, doc: str = '', title: str = '', content: str = '',
                 index: str = '', rank: int = 0, level: int = 0, distance: float = 99999):
        self.doc = doc            # source document title
        self.title = title        # section title text
        self.content = content    # concatenated paragraph text
        self.index = index        # hierarchical index (e.g. '1.2.3')
        self.rank = rank
        self.level = level        # depth in the title hierarchy
        self.distance = distance  # retrieval distance (lower is better)

    def to_dict(self) -> dict:
        """Serialize to a plain dict (inverse of from_dict)."""
        return {
            'doc': self.doc,
            'title': self.title,
            'content': self.content,
            'index': self.index,
            'rank': self.rank,
            'level': self.level,
            'distance': self.distance,
        }

    def from_dict(self, block_dict: dict):
        """Populate this block from a dict produced by to_dict; returns self."""
        for key in ('doc', 'title', 'content', 'index', 'rank', 'level', 'distance'):
            setattr(self, key, block_dict[key])
        return self

    @property
    def distance_str(self) -> str:
        """The distance rendered with two decimal places."""
        return format(self.distance, '.2f')
|
src/model/container.py
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .paragraph import Paragraph
from .block import Block

# Sentinel hierarchy level used while no title has been seen yet.
INFINITE = 99999

class Container:
    """A node of the document hierarchy: a section with an optional title
    Paragraph, its directly attached paragraphs, and child Containers for
    its sub-sections. Building a Container recursively consumes the input
    paragraph list and materializes the retrievable Blocks."""

    def __init__(self, paragraphs : [Paragraph], title : Paragraph=None, level: int = 0, index: [int] = None , father=None, id_ = 0):
        if index is None:
            index = []
        self.level = level
        self.title = title
        self.paragraphs = []
        self.children = []
        self.index = index
        self.father = father
        # Ids are namespaced by string concatenation: '1' + father's id + local id.
        # NOTE(review): this dereferences father.id_, so the default father=None
        # would raise AttributeError — every visible call site does pass a
        # father (Doc or the parent Container); confirm before relying on the
        # default.
        self.id_ = int(str(1) + str(father.id_) + str(id_))
        if paragraphs:
            self.paragraphs, self.children = self.create_children(paragraphs, level, index)
        self.blocks = self.get_blocks()


    def get_blocks(self):
        """Return one Block for this container's own text (if any) followed
        by the blocks of all descendants, depth-first."""
        block = Block(level=self.level, index=self.index)
        if self.title:
            block.title = self.title.text
        for p in self.paragraphs:
            block.content += p.text
        # A container with no directly attached text contributes no block of
        # its own, only its children's.
        blocks = [block] if block.content else []
        for child in self.children:
            blocks += child.blocks
        return blocks


    def create_children(self, paragraphs: [Paragraph], level: int, index: [int]):
        """
        Creates children containers and/or directly attached content and returns the list of attached content and the list of children containers.
        The indexes correspond to the indexes of the paragraphs in the content and also on the structure.
        :return: List of Content or Container
        """
        attached_paragraphs = []
        children = []
        in_children = False
        # NOTE(review): the `level` parameter is immediately overwritten; the
        # level of the child currently being accumulated is tracked here.
        level = INFINITE
        container_paragraphs = []
        container_title = None

        # Consumes `paragraphs` destructively (pop from the front).
        while paragraphs:
            p = paragraphs.pop(0)

            if not in_children and not p.is_structure:
                # Plain text before the first title attaches to this container.
                attached_paragraphs.append(p)
            else:
                in_children = True
                if p.is_structure and p.level <= level:  # if p is higher in hierarchy, then the child is completed
                    if container_paragraphs or container_title:
                        # Advance the hierarchical index for the completed child:
                        # either bump the counter at this depth or extend with 1s.
                        if level <= len(index):
                            index = index[:level]
                            index[-1] += 1
                        else:
                            for i in range(level-len(index)):
                                index.append(1)
                        children.append(Container(container_paragraphs, container_title, level, index.copy(), self))
                        container_paragraphs = []
                    container_title = p
                    level = p.level
                else:  # p is normal text or strictly lower in hierarchy, then the child continues to grow
                    container_paragraphs.append(p)
        # Flush the last child under construction (same index bookkeeping).
        if container_paragraphs or container_title:
            if level <= len(index):
                index = index[:level]
                index[-1] += 1
            else:
                for i in range(level - len(index)):
                    index.append(1)
            children.append(Container(container_paragraphs, container_title, level, index.copy(), self))

        return attached_paragraphs, children


    @property
    def structure(self):
        """Flat list of tree-view node dicts for this container, its
        paragraphs, and all descendants (UI-oriented representation)."""
        self_structure = {str(self.id_): {
            'index': str(self.id_),
            'canMove': True,
            'isFolder': True,
            'children': [p.id_ for p in self.paragraphs] + [child.id_ for child in self.children],
            'canRename': True,
            'data': {},
            'level': self.level,
            # NOTE(review): self.rank is never assigned anywhere in this class,
            # so reading this property raises AttributeError unless rank is set
            # externally — confirm.
            'rank': self.rank,
            'title': self.title.text if self.title else 'root'
        }}
        paragraphs_structure = [p.structure for p in self.paragraphs]
        structure = [self_structure] + paragraphs_structure
        for child in self.children:
            structure += child.structure
        return structure
|
src/model/doc.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.model.container import Container
|
2 |
+
from src.model.paragraph import Paragraph
|
3 |
+
from src.tools.reader import get_pdf_title_styles
|
4 |
+
|
5 |
+
|
6 |
+
class Doc:
    """A parsed PDF document: holds the Container hierarchy built from the
    PDF's paragraphs and the flat, deduplicated list of Blocks."""

    def __init__(self, path='', id_=None):
        # NOTE(review): the id_ parameter is accepted but ignored; object
        # identity is used instead — kept for interface compatibility.
        self.title = path.split('/')[-1]
        self.id_ = id(self)
        self.path = path
        paragraphs = get_pdf_title_styles(path)
        self.container = Container(paragraphs, father=self, level=0)
        self.blocks = self.get_blocks()

    @property
    def structure(self):
        """Tree-view structure delegated to the root container."""
        return self.container.structure

    def get_blocks(self):
        """Deduplicate the container blocks' index lists, then render each
        index as a dotted string (e.g. [1, 2, 3] -> '1.2.3') and stamp each
        block with this document's title."""

        def from_list_to_str(index_list):
            # Fix: the original started from index_list[0] and crashed with
            # IndexError on an empty index; join handles [] (yields '').
            return '.'.join(str(el) for el in index_list)

        # Fix: the original assigned self.container.blocks to `blocks` and
        # immediately overwrote it with delete_duplicate()'s result; the
        # redundant assignment is removed.
        blocks = self.delete_duplicate()
        for block in blocks:
            block.doc = self.title
            block.index = from_list_to_str(block.index)
        return blocks

    def delete_duplicate(self):
        """Pop the last component of a block's index while consecutive blocks
        share the same index, until no consecutive duplicates remain.

        NOTE(review): if two consecutive blocks both reach an empty index,
        found_duplicates stays True but nothing can be popped, which would
        loop forever — confirm real inputs cannot produce that state.
        """
        while self.found_duplicates(self.container.blocks):
            for i in range(len(self.container.blocks) - 1):
                if self.container.blocks[i].index == self.container.blocks[i + 1].index:
                    if self.container.blocks[i].index != []:
                        self.container.blocks[i].index.pop()
        return self.container.blocks

    def found_duplicates(self, blocks):
        """Return True if any two consecutive blocks share the same index."""
        for i in range(len(blocks) - 1):
            if blocks[i].index == blocks[i + 1].index:
                return True
        return False
|
src/model/paragraph.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import string
|
2 |
+
|
3 |
+
# Sentinel level assigned to non-title (plain content) paragraphs.
INFINITE = 10000

class Paragraph:
    """One text line extracted from the PDF, tagged with its font style.

    Styles 'title1'..'title5' carry a hierarchy level (the trailing digit);
    everything else is plain content at the sentinel level INFINITE.
    """

    def __init__(self, text : str, font_style : str, id_ : int, page_id : int):
        self.font_style = font_style
        # Ids are namespaced by string concatenation: '2' + page id + local id.
        self.id_ = int(f"2{page_id}{id_}")
        self.page_id = page_id
        # A 'titleN' style maps to level N; anything else is content.
        if 'title' in font_style:
            self.level = int(font_style[-1])
        else:
            self.level = INFINITE
        self.is_structure = self.level < INFINITE
        self.text = text

    @property
    def blank(self):
        """
        checks if the paragraph is blank: i.e. it brings some signal (it may otherwise be ignored)
        """
        stripped = self.text.replace('\n', '')
        return not any(ch in string.ascii_letters for ch in stripped)
|
src/tools/__init__.py
ADDED
File without changes
|
src/tools/llm.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
class LlmAgent:
    """Thin wrapper around a callable LLM exposing prompt-template helpers
    for answering, translating, and finalizing answers."""

    def __init__(self, llm):
        # `llm` is any callable mapping a prompt string to a completion string.
        self.llm = llm

    def generate_paragraph(self, query: str, context: dict, histo: list, language='fr') -> str:
        """generates the answer"""
        # NOTE: the '\\n' sequences below intentionally emit a literal
        # backslash-n into the prompt text (matching the original prompts).
        template = (f"You are a conversation bot designed to answer to the query from users delimited by "
                    f"triple backticks: "
                    f"\\n ``` {query} ```\\n"
                    f"Your answer is based on the context delimited by triple backticks: "
                    f"\\n ``` {context} ```\\n"
                    f"You are consistent and avoid redundancies with the rest of the initial conversation in French"
                    f"delimited by triple backticks: "
                    f"\\n ``` {histo} ```\\n"
                    f"Your response shall be in {language} and shall be concise"
                    f"In case the provided context is not relevant to answer to the question, just return that you "
                    f"don't know the answer ")

        p = self.llm(template)
        # Debug tracing of the prompt/completion pair.
        print("****************")
        print(template)
        print("----")
        print(p)
        return p

    def translate(self, text: str, language="en") -> str:
        """translates"""

        # Direction of translation depends on the requested target language.
        languages = "`French to English" if language == "en" else "English to French"

        template = (f" Your task consists in translating {languages}\\n"
                    f" the following text delimited by by triple backticks: ```{text}```\n"
                    )

        p = self.llm(template)
        return p

    def generate_answer(self, query: str, answer_en: str, histo: str, context: str) -> str:
        """provides the final answer in French based on the initial query and the answer in english"""

        def _cut_unfinished_sentence(s: str):
            # Drop everything after the last full stop (currently unused; see
            # the commented call below).
            return '.'.join(s.split('.')[:-1])

        template = (f"Your task consists in translating the answer in French to the query "
                    f"delimited by triple backticks: ```{query}``` \\n"
                    f"You are given the answer in english delimited by triple backticks: ```{answer_en}```"
                    f"\\n You don't add new content to the answer in English but: "
                    f"\\n 1 You can use some vocabulary from the context in English delimited by triple backticks: "
                    f"```{context}```"
                    f"\\n 2 You are consistent and avoid redundancies with the rest of the initial"
                    f" conversation in English delimited by triple backticks: ```{histo}```"
                    )

        p = self.llm(template)
        # p = _cut_unfinished_sentence(p)
        return p
|
src/tools/pretty_print.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.model.paragraph import Paragraph
|
2 |
+
from src.model.container import Container
|
3 |
+
|
4 |
+
|
5 |
+
def pretty_printer_paragraphs(paragraphs):
    """Print an indented outline of the title paragraphs (title1..title3);
    other styles are skipped."""
    prefixes = {
        "title1": "Titre 1",
        "title2": "---> Titre 2",
        "title3": "-------> Titre 3",
    }
    for paragraph in paragraphs:
        prefix = prefixes.get(paragraph.font_style)
        if prefix is not None:
            print(f"{prefix} {paragraph.text}")
|
19 |
+
def pretty_print_container_structure(container):
    """Recursively print a container tree, prefixing each line with one '-'
    per hierarchy level."""
    dashes = '-' * container.level
    if container.title:
        print(f"{dashes} {container.title.text}")
    for paragraph in container.paragraphs:
        print(f"{dashes} {paragraph.text}")
    for child in container.children:
        pretty_print_container_structure(child)
|
src/tools/reader.py
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import pdfplumber as pdfp
|
3 |
+
from src.model.paragraph import Paragraph
|
4 |
+
import asyncio
|
5 |
+
|
6 |
+
def skip_header(dictionary):
    """Return the index of the first non-header line on a page.

    Pages whose first line is not title1-sized (font size strictly between
    19 and 30) begin with a two-line running header that is skipped.
    """
    first_size = dictionary[0]["chars"][0]["size"]
    return 0 if 19 < first_size < 30 else 2
|
11 |
+
|
12 |
+
|
13 |
+
def get_style_of_line(size: float):
    """Map a character font size to a style label.

    Returns 'content', 'title1'..'title5', or 'unknown' for sizes outside
    every band (the bands intentionally leave gaps, e.g. (12.7, 12.8) and
    (18.5, 19], which classify as 'unknown').
    """
    if 9 <= size < 11.5:
        return "content"
    if 11.5 <= size <= 12.7:
        return "title5"
    if 12.8 <= size <= 13.5:
        return "title4"
    if 13.5 < size <= 15.5:
        return "title3"
    if 15.5 < size <= 18.5:
        return "title2"
    if 19 < size < 30:
        return "title1"
    return "unknown"
|
32 |
+
|
33 |
+
def get_pdf_title_styles(path):
    """Parse the PDF at `path` into a flat list of Paragraph objects.

    Consecutive lines sharing the same font size are merged into a single
    paragraph; the font size determines the paragraph's style (see
    get_style_of_line).
    """
    pdf_to_read = extract_all_lines_from_the_doc(path)
    paragraphs = []
    j = 0
    while j < len(pdf_to_read):
        dictionary = pdf_to_read[j]["content"]
        i = skip_header(dictionary)
        while i < len(dictionary):
            # Skip the running page header repeated on every page.
            if(dictionary[i]["text"].startswith("RESTAPIDeveloperGuide")):
                i+=1
                continue
            p = Paragraph(dictionary[i]["text"],font_style=get_style_of_line(dictionary[i]["chars"][0]["size"]),id_=i,page_id=pdf_to_read[j]["page_number"])
            if(i != len(dictionary)-1):
                # Merge following lines with the same font size into this paragraph.
                # NOTE(review): this loop reads dictionary[i+1] without
                # re-checking the bound, so a same-size run reaching the last
                # line of the page would raise IndexError — confirm against
                # real inputs.
                while(dictionary[i+1]["chars"][0]["size"] == dictionary[i]["chars"][0]["size"]):
                    p.text += " " + dictionary[i+1]["text"]
                    i += 1
            else:
                p.text = dictionary[i]["text"]
            i += 1
            paragraphs.append(p)
        j += 1
    return paragraphs
|
73 |
+
|
74 |
+
|
75 |
+
def test_get_font_sizes_of_a_page(page: int, path):
    """Debug helper: print index, font size and text of every line on a page.

    :param page: zero-based index of the page to inspect.
    :param path: path of the PDF file.
    """
    # PDFs are binary; the original opened the file in text mode, which fails
    # with a UnicodeDecodeError. Open in 'rb' like the other readers here.
    with open(os.path.abspath(path), 'rb') as f:
        reader = pdfp.PDF(f)
        # Do not shadow the `page` parameter with the page object.
        lines = reader.pages[page].extract_text_lines()
        for i, line in enumerate(lines):
            print(f'{i} : {line["chars"][0]["size"]} ->>>>> {line["text"]}')
|
82 |
+
|
83 |
+
|
84 |
+
def extract_all_lines_from_the_doc(path):
    """Extract the text lines of every page that follows the table of contents.

    :param path: path of the PDF file.
    :return: list of dicts with keys "page_number" (1-based, starting at 9)
             and "content" (the page's extracted text lines).
    """
    with open(path, 'rb') as f:
        reader = pdfp.PDF(f)
        # The first 8 pages are the cover and table of contents, so the first
        # extracted page carries page number 9.
        content_pages = reader.pages[8:]
        return [
            {"page_number": number, "content": page.extract_text_lines()}
            for number, page in enumerate(content_pages, start=9)
        ]
|
94 |
+
|
95 |
+
|
96 |
+
|
97 |
+
|
98 |
+
# path = "data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf"
|
99 |
+
# get_pdf_title_styles(os.path.abspath(path))
|
100 |
+
# print("--------------------------------------------------")
|
101 |
+
# print("--------------------------------------------------")
|
102 |
+
#print(test_get_font_sizes_of_a_page(8))
|
src/tools/retriever.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.model.container import Container
|
2 |
+
from src.model.block import Block
|
3 |
+
from src.model.doc import Doc
|
4 |
+
|
5 |
+
class Retriever:
    """Thin wrapper around a vector-database collection of document blocks.

    When a Doc is supplied, the collection is (re)created and populated from
    its blocks; otherwise an existing collection is reopened for querying.
    """

    def __init__(self, db_client, doc: Doc = None, collection_name: str = "illumio_database"):
        """Create or open the collection.

        :param db_client: database client exposing create_collection /
            get_collection (chromadb-style API — assumed from usage; confirm).
        :param doc: when given, its blocks are indexed into a fresh collection.
        :param collection_name: name of the collection to create or open.
        """
        if doc is not None:  # `is not None`, not `!= None` (PEP 8)
            # Indexing mode: build the collection from the document's blocks.
            self.collection = db_client.create_collection(name=collection_name)
            blocks = doc.blocks
            self.collection.add(
                documents=[block.content for block in blocks],
                ids=[block.index for block in blocks],
                metadatas=[block.to_dict() for block in blocks],
            )
        else:
            # Query mode: reuse the previously built collection.
            self.collection = db_client.get_collection(name=collection_name)

    def similarity_search(self, query: str) -> list:
        """Return the Blocks most similar to *query*, annotated with distance.

        :param query: free-text query to embed and search for.
        :return: list of Block objects, each with its `distance` set.
        """
        res = self.collection.query(query_texts=query)
        blocks = []
        # The query API returns one result list per query text; we sent one.
        for block_dict, distance in zip(res['metadatas'][0], res['distances'][0]):
            block = Block().from_dict(block_dict)
            block.distance = distance
            blocks.append(block)
        return blocks
|
30 |
+
|
src/tools/test_read.py
ADDED
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# To read the PDF
|
2 |
+
import PyPDF2
|
3 |
+
# To analyze the PDF layout and extract text
|
4 |
+
from pdfminer.high_level import extract_pages, extract_text
|
5 |
+
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
|
6 |
+
# To extract text from tables in PDF
|
7 |
+
import pdfplumber
|
8 |
+
# To extract the images from the PDFs
|
9 |
+
from PIL import Image
|
10 |
+
from pdf2image import convert_from_path
|
11 |
+
# To perform OCR to extract text from images
|
12 |
+
import pytesseract
|
13 |
+
# To remove the additional created files
|
14 |
+
import os
|
15 |
+
|
16 |
+
def text_extraction(element):
    """Extract a pdfminer text element's text and its per-line character formats.

    :param element: an LTTextContainer from the page layout.
    :return: tuple (line_text, formats) where formats is the deduplicated list
             of font names and font sizes seen among the element's characters.
    """
    line_text = element.get_text()

    # Gather the font name and size of every character in the element.
    formats = []
    for text_line in element:
        if not isinstance(text_line, LTTextContainer):
            continue
        for char in text_line:
            if isinstance(char, LTChar):
                formats.append(char.fontname)
                formats.append(char.size)

    # Deduplicate; note that set() loses ordering, as in the original.
    format_per_line = list(set(formats))

    return (line_text, format_per_line)
|
37 |
+
|
38 |
+
|
39 |
+
def crop_image(element, pageObj):
    """Crop *pageObj* down to *element*'s bounding box and save it as
    'cropped_image.pdf' for later rasterization.

    :param element: layout element (LTFigure) whose box defines the crop.
    :param pageObj: the PyPDF2 page object to crop in place.
    """
    # Bounding box of the element. NOTE(review): the original binds y0 to
    # "top" and y1 to "bottom", which looks inverted; the exact values passed
    # below are preserved byte-for-byte — confirm against PyPDF2's mediabox
    # normalization before renaming.
    [image_left, image_top, image_right, image_bottom] = [element.x0, element.y0, element.x1, element.y1]
    # Restrict the page's media box to the element's rectangle.
    pageObj.mediabox.lower_left = (image_left, image_bottom)
    pageObj.mediabox.upper_right = (image_right, image_top)
    # Persist the cropped single-page PDF.
    cropped_pdf_writer = PyPDF2.PdfWriter()
    cropped_pdf_writer.add_page(pageObj)
    with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
        cropped_pdf_writer.write(cropped_pdf_file)
|
51 |
+
|
52 |
+
# Create a function to convert the PDF to images
|
53 |
+
def convert_to_images(input_file, poppler_path=r'C:\Program Files\poppler-23.08.0\Library\bin'):
    """Rasterize the first page of *input_file* and save it as 'PDF_image.png'.

    :param input_file: path of the PDF to convert.
    :param poppler_path: location of the poppler binaries. The hard-coded
        Windows path is kept as the default for backward compatibility; pass
        None to rely on poppler being available on the system PATH.
    """
    images = convert_from_path(input_file, poppler_path=poppler_path)
    # Only the first page is needed by the callers in this module.
    image = images[0]
    output_file = "PDF_image.png"
    image.save(output_file, "PNG")
|
58 |
+
|
59 |
+
# Create a function to read text from images
|
60 |
+
def image_to_text(image_path):
    """Run OCR over the image stored at *image_path* and return the text.

    :param image_path: path of the image file to read.
    :return: the text recognized by pytesseract.
    """
    img = Image.open(image_path)
    return pytesseract.image_to_string(img)
|
66 |
+
|
67 |
+
|
68 |
+
def extract_table(pdf_path, page_num, table_num):
    """Return table *table_num* from page *page_num* of the PDF at *pdf_path*.

    :param pdf_path: path of the PDF file.
    :param page_num: zero-based page index.
    :param table_num: zero-based index of the table on that page.
    :return: the table as returned by pdfplumber (list of rows).
    """
    # Use a context manager so the file handle is closed — the original
    # leaked the pdfplumber handle on every call.
    with pdfplumber.open(pdf_path) as pdf:
        table_page = pdf.pages[page_num]
        return table_page.extract_tables()[table_num]
|
76 |
+
|
77 |
+
# Convert table into the appropriate format
|
78 |
+
def table_converter(table):
    """Render an extracted table as a pipe-delimited string, one row per line.

    None cells become the literal string 'None'; embedded line breaks inside
    a cell are flattened to spaces. No trailing newline is emitted.

    :param table: list of rows, each a list of cell strings (or None).
    :return: the formatted table string.
    """
    rendered_rows = []
    for row in table:
        cleaned = []
        for cell in row:
            if cell is None:
                # Keep the original placeholder for missing cells.
                cleaned.append('None')
            else:
                # Unwrap cells whose text was wrapped across several lines.
                cleaned.append(cell.replace('\n', ' '))
        rendered_rows.append('|' + '|'.join(cleaned) + '|')
    # Joining avoids the original's append-then-strip of the last newline.
    return '\n'.join(rendered_rows)
|
90 |
+
|
91 |
+
|
92 |
+
|
93 |
+
def pdf_manager(pdf_path):
    """Walk every page of the PDF top-to-bottom, extracting text, OCR'd image
    text, and tables into `text_per_page`, then print page 0's content.

    Per page, `text_per_page['Page_N']` holds five parallel lists:
    [page_text, line_format, text_from_images, text_from_tables, page_content].
    """
    # create a PDF file object
    pdfFileObj = open(pdf_path, 'rb')
    # create a PDF reader object (used for cropping pages around images)
    pdfReaded = PyPDF2.PdfReader(pdfFileObj)

    # Create the dictionary to collect the extracted content per page
    text_per_page = {}
    # We extract the pages from the PDF via pdfminer's layout analysis
    for pagenum, page in enumerate(extract_pages(pdf_path)):

        # Initialize the variables needed for the text extraction from the page
        pageObj = pdfReaded.pages[pagenum]
        page_text = []
        line_format = []
        text_from_images = []
        text_from_tables = []
        page_content = []
        # Initialize the number of the examined tables
        table_num = 0
        first_element= True
        table_extraction_flag= False
        # Open the pdf file with pdfplumber (table detection only)
        # NOTE(review): opened once per page and never closed — leaks handles.
        pdf = pdfplumber.open(pdf_path)
        # Find the examined page
        page_tables = pdf.pages[pagenum]
        # Find the tables on the page
        tables = page_tables.find_tables()


        # Collect all layout elements, keyed by their top y-coordinate
        page_elements = [(element.y1, element) for element in page._objs]
        # Sort so elements are visited in visual top-to-bottom order
        page_elements.sort(key=lambda a: a[0], reverse=True)

        # Walk the elements that compose the page
        for i,component in enumerate(page_elements):
            # Position of the top side of the element (unused below)
            pos= component[0]
            # The layout element itself
            element = component[1]

            # Text element: extract unless it belongs to a table being handled
            if isinstance(element, LTTextContainer):
                # Check if the text appeared in a table
                if table_extraction_flag == False:
                    # Extract the text and per-character formats of the element
                    (line_text, format_per_line) = text_extraction(element)
                    page_text.append(line_text)
                    line_format.append(format_per_line)
                    page_content.append(line_text)
                else:
                    # Omit text inside a table; it is captured via extract_table
                    pass

            # Figure element: crop it out, rasterize, then OCR the image
            if isinstance(element, LTFigure):
                # Crop the image from the PDF into cropped_image.pdf
                crop_image(element, pageObj)
                # Convert the cropped pdf to PDF_image.png
                convert_to_images('cropped_image.pdf')
                # Extract the text from the image via OCR
                image_text = image_to_text('PDF_image.png')
                text_from_images.append(image_text)
                page_content.append(image_text)
                # Add a placeholder in the text and format lists
                page_text.append('image')
                line_format.append('image')

            # Rectangle element: treated as the border of a table
            if isinstance(element, LTRect):
                # First rectangle of a new table (and a table remains)
                if first_element == True and (table_num+1) <= len(tables):
                    # Vertical bounds of the table in pdfminer coordinates
                    lower_side = page.bbox[3] - tables[table_num].bbox[3]
                    upper_side = element.y1
                    # Extract the table's cell contents
                    table = extract_table(pdf_path, pagenum, table_num)
                    # Convert the table to a pipe-delimited string
                    table_string = table_converter(table)
                    text_from_tables.append(table_string)
                    page_content.append(table_string)
                    # Suppress text extraction while inside this table
                    table_extraction_flag = True
                    first_element = False
                    # Add a placeholder in the text and format lists
                    page_text.append('table')
                    line_format.append('table')

                # Still inside the current table's vertical span: skip
                # NOTE(review): if the first LTRect on a page fails the branch
                # above, lower_side/upper_side are unbound here — confirm.
                if element.y0 >= lower_side and element.y1 <= upper_side:
                    pass
                # Left the table (next element is not a rectangle): reset state
                # NOTE(review): page_elements[i+1] raises IndexError when the
                # last element of the page is an LTRect — confirm.
                elif not isinstance(page_elements[i+1][1], LTRect):
                    table_extraction_flag = False
                    first_element = True
                    table_num+=1


        # Create the key of the dictionary for this page
        dctkey = 'Page_'+str(pagenum)
        # Store the five parallel result lists under the page key
        text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]

    # Closing the pdf file object
    pdfFileObj.close()

    # Deleting the temporary files created by the image pipeline
    # NOTE(review): raises FileNotFoundError when no LTFigure was seen.
    os.remove('cropped_image.pdf')
    os.remove('PDF_image.png')

    # Display the concatenated content of the first page
    result = ''.join(text_per_page['Page_0'][4])
    print(result)
|
src/view/view.py
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from src.control.control import Chatbot
|
3 |
+
|
4 |
+
|
5 |
+
def run(ctrl: Chatbot, config: {}):
    """Build and return the gradio Q&A interface.

    :param ctrl: Chatbot controller providing get_response(query, histo).
    :param config: dict with keys 'title' (markdown header) and 'examples'
        (mapping whose values are example questions).
    :return: the assembled gr.Blocks app.
    """
    with gr.Blocks() as qna:
        with gr.Row():
            # Left spacer column.
            with gr.Column():
                pass

            # Main column: title, chat history, input, examples, sources.
            with gr.Column(scale=10):

                gr.Markdown(config['title'])

                # Chat history; hidden until the first question is asked.
                histo_text_comp = gr.Chatbot(
                    visible=False,
                    value=[],
                )
                input_text_comp = gr.Textbox(
                    label="",
                    lines=1,
                    max_lines=3,
                    interactive=True,
                    placeholder="Posez votre question ici",
                )
                clear_btn = gr.Button("Clear")
                # Clickable example questions, hidden once a chat starts.
                input_example_comp = gr.Radio(
                    label="Examples",
                    choices=list(config['examples'].values()),
                    value="",
                )
                # Up to 4 read-only boxes showing the retrieved source blocks.
                source_text_comp = []
                for i in range(4):
                    source_text_comp.append(gr.Textbox(
                        lines=4,
                        max_lines=4,
                        interactive=False,
                        visible=False,
                    ))

            # Right spacer column.
            with gr.Column():
                pass

        def input_text_fn1(input_text_, histo_text_):
            """Step 1 on submit: echo the question into the history, hide
            examples and source boxes while the answer is computed."""
            histo_text_.append((input_text_, None))
            update_ = {
                histo_text_comp: gr.update(visible=True, value=histo_text_),
                input_example_comp: gr.update(visible=False,),
            }
            for i in range(4):
                update_[source_text_comp[i]] = gr.update(visible=False)
            return update_

        def input_text_fn2(input_text_, histo_text_):
            """Step 2: query the controller, fill in the answer and show up
            to 3 retrieved sources with their similarity scores."""
            answer, sources = ctrl.get_response(query=input_text_, histo=histo_text_)
            # Replace the pending (question, None) pair with the real answer.
            histo_text_[-1] = (input_text_, answer)
            update_ = {
                histo_text_comp: gr.update(value=histo_text_),
                input_text_comp: gr.update(value=''),
            }
            # NOTE(review): only 3 of the 4 source boxes are ever filled here.
            for i in range(min(len(sources), 3)):
                s = sources[i]
                source_label = f'{s.index} {s.title} score = {s.distance_str}'
                source_text = s.content
                update_[source_text_comp[i]] = gr.update(visible=True, value=source_text, label=source_label)
            return update_

        def input_example_fn(input_example_, histo_text_):
            """Handle a click on an example: copy it into the input box and
            history, then hide the example selector and source boxes."""
            histo_text_.append((input_example_, None))
            update_ = {
                input_text_comp: gr.update(value=input_example_),
                histo_text_comp: gr.update(visible=True, value=histo_text_),
                input_example_comp: gr.update(visible=False, value=''),
            }
            for i in range(4):
                update_[source_text_comp[i]] = gr.update(visible=False)
            return update_

        def clear_fn():
            """Reset the interface to its initial state."""
            update_ = {
                input_text_comp: gr.update(value=''),
                histo_text_comp: gr.update(value='', visible=False),
                input_example_comp: gr.update(value='', visible=True),
            }
            for i in range(4):
                # 'hello' is never shown (visible=False); kept as-is.
                update_[source_text_comp[i]] = gr.update(visible=False, value='hello')
            return update_

        # Submit: echo the question first, then compute and show the answer.
        input_text_comp \
            .submit(input_text_fn1,
                    inputs=[input_text_comp, histo_text_comp],
                    outputs=[histo_text_comp, input_example_comp,
                             source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])\
            .then(input_text_fn2,
                  inputs=[input_text_comp, histo_text_comp],
                  outputs=[input_text_comp, histo_text_comp,
                           source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
        # Example click: same two-step flow, starting from the radio value.
        input_example_comp \
            .input(input_example_fn,
                   inputs=[input_example_comp, histo_text_comp],
                   outputs=[input_text_comp, histo_text_comp, input_example_comp,
                            source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])\
            .then(input_text_fn2,
                  inputs=[input_text_comp, histo_text_comp],
                  outputs=[input_text_comp, histo_text_comp,
                           source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
        clear_btn.click(clear_fn,
                        inputs=None,
                        outputs=[input_text_comp, histo_text_comp, input_example_comp,
                                 source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])

    return qna
|