updated version with extracts from documents
Browse files
- app.py +5 -19
- src/model/block.py +5 -2
- src/model/container.py +25 -32
- src/model/doc.py +18 -14
- src/tools/retriever.py +1 -0
- src/view/view.py +22 -5
app.py
CHANGED
@@ -4,31 +4,17 @@ from langchain.llms import OpenAI
|
|
4 |
import chromadb
|
5 |
|
6 |
from config import *
|
7 |
-
# from config_key import *
|
8 |
from src.control.control import Controller
|
9 |
from src.tools.retriever import Retriever
|
10 |
from src.tools.llm import LlmAgent
|
11 |
from src.model.doc import Doc
|
12 |
import src.view.view as view
|
13 |
|
14 |
-
|
15 |
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
16 |
-
cle = os.environ['CLE']
|
17 |
-
open_key = os.environ['OPEN_KEY']
|
18 |
-
os.environ["OPENAI_API_KEY"] = open_key
|
19 |
-
|
20 |
-
|
21 |
-
OPENAI_API_KEY = "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"
|
22 |
-
|
23 |
-
print('***')
|
24 |
-
print(cle)
|
25 |
-
print(open_key == OPENAI_API_KEY)
|
26 |
-
print(f'open_key: {open_key}')
|
27 |
-
print(f'OPENAI_API_KEY: {OPENAI_API_KEY}')
|
28 |
-
print(f'os.environ["OPENAI_API_KEY"]: {os.environ["OPENAI_API_KEY"]}')
|
29 |
-
print('***')
|
30 |
-
|
31 |
|
|
|
|
|
|
|
32 |
|
33 |
doc_content = Doc(content_en_path)
|
34 |
doc_plan = Doc(plan_path)
|
@@ -37,12 +23,12 @@ doc_content_fr = Doc(content_fr_path)
|
|
37 |
client_db = chromadb.Client()
|
38 |
retriever = Retriever(client_db, doc_plan, doc_content, doc_content_fr, collection_name)
|
39 |
|
40 |
-
llm_model = OpenAI(temperature=0
|
41 |
llm = LlmAgent(llm_model)
|
42 |
|
43 |
specials['remote_rate_df'] = pd.read_csv(specials['remote_rate_path'])
|
44 |
specials['accommodation_meal_df'] = pd.read_csv(specials['accommodation_meal_path'])
|
45 |
-
controller = Controller(retriever=retriever, llm=llm,
|
46 |
specials=specials)
|
47 |
|
48 |
qna = view.run(ctrl=controller, config=view_config)
|
|
|
4 |
import chromadb
|
5 |
|
6 |
from config import *
|
|
|
7 |
from src.control.control import Controller
|
8 |
from src.tools.retriever import Retriever
|
9 |
from src.tools.llm import LlmAgent
|
10 |
from src.model.doc import Doc
|
11 |
import src.view.view as view
|
12 |
|
|
|
13 |
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
+
if not "OPENAI_API_KEY" in os.environ:
|
16 |
+
from config_key import OPENAI_API_KEY
|
17 |
+
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
|
18 |
|
19 |
doc_content = Doc(content_en_path)
|
20 |
doc_plan = Doc(plan_path)
|
|
|
23 |
client_db = chromadb.Client()
|
24 |
retriever = Retriever(client_db, doc_plan, doc_content, doc_content_fr, collection_name)
|
25 |
|
26 |
+
llm_model = OpenAI(temperature=0)
|
27 |
llm = LlmAgent(llm_model)
|
28 |
|
29 |
specials['remote_rate_df'] = pd.read_csv(specials['remote_rate_path'])
|
30 |
specials['accommodation_meal_df'] = pd.read_csv(specials['accommodation_meal_path'])
|
31 |
+
controller = Controller(retriever=retriever, llm=llm, content_language=content_language, plan_language=plan_language,
|
32 |
specials=specials)
|
33 |
|
34 |
qna = view.run(ctrl=controller, config=view_config)
|
src/model/block.py
CHANGED
@@ -3,6 +3,7 @@ class Block:
|
|
3 |
index: str = '', rank: int = 0, level: int = 0, distance: float = 99999):
|
4 |
self.doc = doc
|
5 |
self.title = title
|
|
|
6 |
self.content = content
|
7 |
self.content_fr = content_fr
|
8 |
self.specials = []
|
@@ -12,8 +13,9 @@ class Block:
|
|
12 |
self.distance = distance
|
13 |
|
14 |
def to_dict(self) -> {}:
|
15 |
-
block_dict = {'doc': self.doc, 'title': self.title, '
|
16 |
-
'
|
|
|
17 |
for i, s in enumerate(self.specials):
|
18 |
special_key = 'special_'+str(i)
|
19 |
block_dict[special_key] = s
|
@@ -23,6 +25,7 @@ class Block:
|
|
23 |
def from_dict(self, block_dict: {}):
|
24 |
self.doc = block_dict['doc']
|
25 |
self.title = block_dict['title']
|
|
|
26 |
self.content = block_dict['content']
|
27 |
self.content_fr = block_dict['content_fr']
|
28 |
self.index = block_dict['index']
|
|
|
3 |
index: str = '', rank: int = 0, level: int = 0, distance: float = 99999):
|
4 |
self.doc = doc
|
5 |
self.title = title
|
6 |
+
self.title_fr = ""
|
7 |
self.content = content
|
8 |
self.content_fr = content_fr
|
9 |
self.specials = []
|
|
|
13 |
self.distance = distance
|
14 |
|
15 |
def to_dict(self) -> {}:
|
16 |
+
block_dict = {'doc': self.doc, 'title': self.title, 'title_fr': self.title_fr, 'content': self.content,
|
17 |
+
'content_fr': self.content_fr, 'index': self.index, 'rank': self.rank, 'level': self.level,
|
18 |
+
'distance': self.distance}
|
19 |
for i, s in enumerate(self.specials):
|
20 |
special_key = 'special_'+str(i)
|
21 |
block_dict[special_key] = s
|
|
|
25 |
def from_dict(self, block_dict: {}):
|
26 |
self.doc = block_dict['doc']
|
27 |
self.title = block_dict['title']
|
28 |
+
self.title_fr = block_dict['title_fr']
|
29 |
self.content = block_dict['content']
|
30 |
self.content_fr = block_dict['content_fr']
|
31 |
self.index = block_dict['index']
|
src/model/container.py
CHANGED
@@ -6,17 +6,19 @@ INFINITE = 99999
|
|
6 |
|
7 |
class Container:
|
8 |
|
9 |
-
def __init__(self, paragraphs: [Paragraph], title: Paragraph = None, level: int = 0,
|
10 |
-
id_=0):
|
|
|
|
|
11 |
self.level = level
|
12 |
self.title = title
|
13 |
self.paragraphs = []
|
14 |
self.children = []
|
15 |
-
self.
|
16 |
self.father = father # if not father, then the container is at the top of the hierarchy
|
17 |
self.id_ = int(str(1) + str(father.id_) + str(id_))
|
18 |
if paragraphs:
|
19 |
-
self.paragraphs, self.children = self.create_children(paragraphs, level,
|
20 |
self.blocks = self.get_blocks()
|
21 |
|
22 |
@property
|
@@ -47,7 +49,7 @@ class Container:
|
|
47 |
return text_chunks
|
48 |
|
49 |
def get_blocks(self):
|
50 |
-
block = Block(
|
51 |
if self.title:
|
52 |
block.title = self.title.text
|
53 |
for p in self.paragraphs:
|
@@ -62,28 +64,7 @@ class Container:
|
|
62 |
blocks += child.blocks
|
63 |
return blocks
|
64 |
|
65 |
-
|
66 |
-
def table_of_contents(self):
|
67 |
-
toc = []
|
68 |
-
if self.title:
|
69 |
-
toc += [{str(self.level): self.title.text}]
|
70 |
-
if self.children:
|
71 |
-
for child in self.children:
|
72 |
-
toc += child.table_of_contents
|
73 |
-
return toc
|
74 |
-
|
75 |
-
def move(self, position: int, new_father=None):
|
76 |
-
current_father = self.father # should be added in the model
|
77 |
-
current_father.children.remove(self)
|
78 |
-
|
79 |
-
self.rank = new_father.rank + 1 if new_father else 0
|
80 |
-
self.father = new_father
|
81 |
-
if position < len(new_father.children):
|
82 |
-
new_father.children.insert(position, self)
|
83 |
-
else:
|
84 |
-
new_father.children.append(self)
|
85 |
-
|
86 |
-
def create_children(self, paragraphs, level, rank) -> ([], []):
|
87 |
"""
|
88 |
creates children containers or directly attached content
|
89 |
and returns the list of containers and contents of level+1
|
@@ -95,8 +76,8 @@ class Container:
|
|
95 |
container_title = None
|
96 |
children = []
|
97 |
in_children = False
|
98 |
-
level = INFINITE
|
99 |
child_id = 0
|
|
|
100 |
|
101 |
while paragraphs:
|
102 |
p = paragraphs.pop(0)
|
@@ -104,19 +85,31 @@ class Container:
|
|
104 |
attached_paragraphs.append(p)
|
105 |
else:
|
106 |
in_children = True
|
107 |
-
if p.is_structure and p.level <= level: # if p is higher
|
108 |
if container_paragraphs or container_title:
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
child_id += 1
|
111 |
container_paragraphs = []
|
112 |
container_title = p
|
113 |
level = p.level
|
114 |
|
115 |
-
else: # p is strictly lower in hierarchy
|
116 |
container_paragraphs.append(p)
|
117 |
|
118 |
if container_paragraphs or container_title:
|
119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
child_id += 1
|
121 |
|
122 |
return attached_paragraphs, children
|
|
|
6 |
|
7 |
class Container:
|
8 |
|
9 |
+
def __init__(self, paragraphs: [Paragraph], title: Paragraph = None, level: int = 0, index: [int] = None,
|
10 |
+
father=None, id_=0):
|
11 |
+
if index is None:
|
12 |
+
index = []
|
13 |
self.level = level
|
14 |
self.title = title
|
15 |
self.paragraphs = []
|
16 |
self.children = []
|
17 |
+
self.index = index
|
18 |
self.father = father # if not father, then the container is at the top of the hierarchy
|
19 |
self.id_ = int(str(1) + str(father.id_) + str(id_))
|
20 |
if paragraphs:
|
21 |
+
self.paragraphs, self.children = self.create_children(paragraphs, level, index)
|
22 |
self.blocks = self.get_blocks()
|
23 |
|
24 |
@property
|
|
|
49 |
return text_chunks
|
50 |
|
51 |
def get_blocks(self):
|
52 |
+
block = Block(level=self.level, index=self.index)
|
53 |
if self.title:
|
54 |
block.title = self.title.text
|
55 |
for p in self.paragraphs:
|
|
|
64 |
blocks += child.blocks
|
65 |
return blocks
|
66 |
|
67 |
+
def create_children(self, paragraphs: Paragraph, level: int, index: [int]) -> ([Paragraph], []):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
"""
|
69 |
creates children containers or directly attached content
|
70 |
and returns the list of containers and contents of level+1
|
|
|
76 |
container_title = None
|
77 |
children = []
|
78 |
in_children = False
|
|
|
79 |
child_id = 0
|
80 |
+
level = INFINITE
|
81 |
|
82 |
while paragraphs:
|
83 |
p = paragraphs.pop(0)
|
|
|
85 |
attached_paragraphs.append(p)
|
86 |
else:
|
87 |
in_children = True
|
88 |
+
if p.is_structure and p.level <= level: # if p is higher in hierarchy, then the child is completed
|
89 |
if container_paragraphs or container_title:
|
90 |
+
if level <= len(index):
|
91 |
+
index = index[:level]
|
92 |
+
index[-1] += 1
|
93 |
+
else:
|
94 |
+
for i in range(level-len(index)):
|
95 |
+
index.append(1)
|
96 |
+
children.append(Container(container_paragraphs, container_title, level, index, self, child_id))
|
97 |
child_id += 1
|
98 |
container_paragraphs = []
|
99 |
container_title = p
|
100 |
level = p.level
|
101 |
|
102 |
+
else: # p is normal text or strictly lower in hierarchy, then the child continues to grow
|
103 |
container_paragraphs.append(p)
|
104 |
|
105 |
if container_paragraphs or container_title:
|
106 |
+
if level <= len(index):
|
107 |
+
index = index[:level]
|
108 |
+
index[-1] += 1
|
109 |
+
else:
|
110 |
+
for i in range(level - len(index)):
|
111 |
+
index.append(1)
|
112 |
+
children.append(Container(container_paragraphs, container_title, level, index, self, child_id))
|
113 |
child_id += 1
|
114 |
|
115 |
return attached_paragraphs, children
|
src/model/doc.py
CHANGED
@@ -13,7 +13,7 @@ class Doc:
|
|
13 |
self.id_ = id(self)
|
14 |
self.path = path
|
15 |
paragraphs = [Paragraph(xp, self.id_, i) for (i, xp) in enumerate(self.xdoc.paragraphs)]
|
16 |
-
self.container = Container(paragraphs, father=self)
|
17 |
self.blocks = self.get_blocks()
|
18 |
|
19 |
@property
|
@@ -29,21 +29,25 @@ class Doc:
|
|
29 |
index_str += '.' + str(el)
|
30 |
return index_str
|
31 |
|
32 |
-
current_index = []
|
33 |
blocks = self.container.blocks
|
34 |
for block in blocks:
|
35 |
block.doc = self.title
|
36 |
-
|
37 |
-
if 0 < block.level:
|
38 |
-
if block.level == current_level:
|
39 |
-
current_index[-1] += 1
|
40 |
-
elif current_level < block.level:
|
41 |
-
current_index.append(1)
|
42 |
-
elif block.level < current_level:
|
43 |
-
current_index = current_index[:block.level]
|
44 |
-
current_index[-1] += 1
|
45 |
-
block.index = from_list_to_str(current_index)
|
46 |
-
else:
|
47 |
-
block.index = "0"
|
48 |
blocks.remove(block)
|
|
|
49 |
return blocks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
self.id_ = id(self)
|
14 |
self.path = path
|
15 |
paragraphs = [Paragraph(xp, self.id_, i) for (i, xp) in enumerate(self.xdoc.paragraphs)]
|
16 |
+
self.container = Container(paragraphs, father=self, level=0)
|
17 |
self.blocks = self.get_blocks()
|
18 |
|
19 |
@property
|
|
|
29 |
index_str += '.' + str(el)
|
30 |
return index_str
|
31 |
|
|
|
32 |
blocks = self.container.blocks
|
33 |
for block in blocks:
|
34 |
block.doc = self.title
|
35 |
+
if block.level == 0:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
blocks.remove(block)
|
37 |
+
block.index = from_list_to_str(block.index)
|
38 |
return blocks
|
39 |
+
"""
|
40 |
+
current_level = len(current_index)
|
41 |
+
if 0 < block.level:
|
42 |
+
if block.level == current_level:
|
43 |
+
current_index[-1] += 1
|
44 |
+
elif current_level < block.level:
|
45 |
+
current_index.append(1)
|
46 |
+
elif block.level < current_level:
|
47 |
+
current_index = current_index[:block.level]
|
48 |
+
current_index[-1] += 1
|
49 |
+
block.index = from_list_to_str(current_index)
|
50 |
+
else:
|
51 |
+
block.index = "0"
|
52 |
+
"""
|
53 |
+
|
src/tools/retriever.py
CHANGED
@@ -12,6 +12,7 @@ class Retriever:
|
|
12 |
cb.specials = pb.specials
|
13 |
for cb, cb_fr in zip(content_blocks, content_fr_blocks):
|
14 |
cb.content_fr = cb_fr.content
|
|
|
15 |
self.collection = db_client.create_collection(name=collection_name)
|
16 |
self.collection.add(
|
17 |
documents=[block.content for block in plan_blocks],
|
|
|
12 |
cb.specials = pb.specials
|
13 |
for cb, cb_fr in zip(content_blocks, content_fr_blocks):
|
14 |
cb.content_fr = cb_fr.content
|
15 |
+
cb.title_fr = cb_fr.title
|
16 |
self.collection = db_client.create_collection(name=collection_name)
|
17 |
self.collection.add(
|
18 |
documents=[block.content for block in plan_blocks],
|
src/view/view.py
CHANGED
@@ -30,11 +30,19 @@ def run(ctrl: Controller, config: {}):
|
|
30 |
interactive=False,
|
31 |
visible=False,
|
32 |
)
|
33 |
-
|
34 |
label="Documents sources",
|
35 |
visible=False,
|
36 |
interactive=False,
|
37 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
with gr.Column():
|
40 |
pass
|
@@ -47,11 +55,17 @@ def run(ctrl: Controller, config: {}):
|
|
47 |
|
48 |
def input_text_fn2(input_text_):
|
49 |
answer, sources = ctrl.get_response(query_fr=input_text_)
|
50 |
-
source_labels = [s.distance_str
|
|
|
51 |
update_ = {
|
52 |
output_text_comp: gr.update(value=answer),
|
53 |
-
|
54 |
}
|
|
|
|
|
|
|
|
|
|
|
55 |
return update_
|
56 |
|
57 |
def input_example_fn(input_example_):
|
@@ -63,9 +77,12 @@ def run(ctrl: Controller, config: {}):
|
|
63 |
|
64 |
input_text_comp \
|
65 |
.submit(input_text_fn1, inputs=[], outputs=[output_text_comp]) \
|
66 |
-
.then(input_text_fn2, inputs=[input_text_comp], outputs=[output_text_comp,
|
67 |
input_example_comp \
|
68 |
.change(input_example_fn, inputs=[input_example_comp], outputs=[input_text_comp, output_text_comp]) \
|
69 |
-
.then(input_text_fn2,
|
|
|
|
|
|
|
70 |
|
71 |
return qna
|
|
|
30 |
interactive=False,
|
31 |
visible=False,
|
32 |
)
|
33 |
+
sources_title_comp = gr.CheckboxGroup(
|
34 |
label="Documents sources",
|
35 |
visible=False,
|
36 |
interactive=False,
|
37 |
)
|
38 |
+
source_text_comp = []
|
39 |
+
for i in range(4):
|
40 |
+
source_text_comp.append(gr.Textbox(
|
41 |
+
lines=4,
|
42 |
+
max_lines=4,
|
43 |
+
interactive=False,
|
44 |
+
visible=False,
|
45 |
+
))
|
46 |
|
47 |
with gr.Column():
|
48 |
pass
|
|
|
55 |
|
56 |
def input_text_fn2(input_text_):
|
57 |
answer, sources = ctrl.get_response(query_fr=input_text_)
|
58 |
+
source_labels = [f'{s.distance_str} {s.index} {s.title} from {s.doc}' for s in sources]
|
59 |
+
|
60 |
update_ = {
|
61 |
output_text_comp: gr.update(value=answer),
|
62 |
+
sources_title_comp: gr.update(visible=False, choices=source_labels, value=source_labels),
|
63 |
}
|
64 |
+
for i in range(min(len(sources), 4)):
|
65 |
+
s = sources[i]
|
66 |
+
source_label = f'{s.index} {s.title_fr} score = {s.distance_str}'
|
67 |
+
source_text = s.content_fr
|
68 |
+
update_[source_text_comp[i]] = gr.update(visible=True, value=source_text, label=source_label)
|
69 |
return update_
|
70 |
|
71 |
def input_example_fn(input_example_):
|
|
|
77 |
|
78 |
input_text_comp \
|
79 |
.submit(input_text_fn1, inputs=[], outputs=[output_text_comp]) \
|
80 |
+
.then(input_text_fn2, inputs=[input_text_comp], outputs=[output_text_comp, sources_title_comp])
|
81 |
input_example_comp \
|
82 |
.change(input_example_fn, inputs=[input_example_comp], outputs=[input_text_comp, output_text_comp]) \
|
83 |
+
.then(input_text_fn2,
|
84 |
+
inputs=[input_text_comp],
|
85 |
+
outputs=[output_text_comp, sources_title_comp,
|
86 |
+
source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
|
87 |
|
88 |
return qna
|