YvesP committed on
Commit
988c713
·
1 Parent(s): 00f7c25

updated version with extracts from documents

Browse files
app.py CHANGED
@@ -4,31 +4,17 @@ from langchain.llms import OpenAI
4
  import chromadb
5
 
6
  from config import *
7
- # from config_key import *
8
  from src.control.control import Controller
9
  from src.tools.retriever import Retriever
10
  from src.tools.llm import LlmAgent
11
  from src.model.doc import Doc
12
  import src.view.view as view
13
 
14
-
15
  os.environ["TOKENIZERS_PARALLELISM"] = "true"
16
- cle = os.environ['CLE']
17
- open_key = os.environ['OPEN_KEY']
18
- os.environ["OPENAI_API_KEY"] = open_key
19
-
20
-
21
- OPENAI_API_KEY = "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"
22
-
23
- print('***')
24
- print(cle)
25
- print(open_key == OPENAI_API_KEY)
26
- print(f'open_key: {open_key}')
27
- print(f'OPENAI_API_KEY: {OPENAI_API_KEY}')
28
- print(f'os.environ["OPENAI_API_KEY"]: {os.environ["OPENAI_API_KEY"]}')
29
- print('***')
30
-
31
 
 
 
 
32
 
33
  doc_content = Doc(content_en_path)
34
  doc_plan = Doc(plan_path)
@@ -37,12 +23,12 @@ doc_content_fr = Doc(content_fr_path)
37
  client_db = chromadb.Client()
38
  retriever = Retriever(client_db, doc_plan, doc_content, doc_content_fr, collection_name)
39
 
40
- llm_model = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
41
  llm = LlmAgent(llm_model)
42
 
43
  specials['remote_rate_df'] = pd.read_csv(specials['remote_rate_path'])
44
  specials['accommodation_meal_df'] = pd.read_csv(specials['accommodation_meal_path'])
45
- controller = Controller(retriever=retriever, llm=llm, content_language=content_language, plan_language=plan_language,
46
  specials=specials)
47
 
48
  qna = view.run(ctrl=controller, config=view_config)
 
4
  import chromadb
5
 
6
  from config import *
 
7
  from src.control.control import Controller
8
  from src.tools.retriever import Retriever
9
  from src.tools.llm import LlmAgent
10
  from src.model.doc import Doc
11
  import src.view.view as view
12
 
 
13
  os.environ["TOKENIZERS_PARALLELISM"] = "true"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
+ if not "OPENAI_API_KEY" in os.environ:
16
+ from config_key import OPENAI_API_KEY
17
+ os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
18
 
19
  doc_content = Doc(content_en_path)
20
  doc_plan = Doc(plan_path)
 
23
  client_db = chromadb.Client()
24
  retriever = Retriever(client_db, doc_plan, doc_content, doc_content_fr, collection_name)
25
 
26
+ llm_model = OpenAI(temperature=0)
27
  llm = LlmAgent(llm_model)
28
 
29
  specials['remote_rate_df'] = pd.read_csv(specials['remote_rate_path'])
30
  specials['accommodation_meal_df'] = pd.read_csv(specials['accommodation_meal_path'])
31
+ controller = Controller(retriever=retriever, llm=llm, content_language=content_language, plan_language=plan_language,
32
  specials=specials)
33
 
34
  qna = view.run(ctrl=controller, config=view_config)
src/model/block.py CHANGED
@@ -3,6 +3,7 @@ class Block:
3
  index: str = '', rank: int = 0, level: int = 0, distance: float = 99999):
4
  self.doc = doc
5
  self.title = title
 
6
  self.content = content
7
  self.content_fr = content_fr
8
  self.specials = []
@@ -12,8 +13,9 @@ class Block:
12
  self.distance = distance
13
 
14
  def to_dict(self) -> {}:
15
- block_dict = {'doc': self.doc, 'title': self.title, 'content': self.content, 'content_fr': self.content_fr,
16
- 'index': self.index, 'rank': self.rank, 'level': self.level, 'distance': self.distance}
 
17
  for i, s in enumerate(self.specials):
18
  special_key = 'special_'+str(i)
19
  block_dict[special_key] = s
@@ -23,6 +25,7 @@ class Block:
23
  def from_dict(self, block_dict: {}):
24
  self.doc = block_dict['doc']
25
  self.title = block_dict['title']
 
26
  self.content = block_dict['content']
27
  self.content_fr = block_dict['content_fr']
28
  self.index = block_dict['index']
 
3
  index: str = '', rank: int = 0, level: int = 0, distance: float = 99999):
4
  self.doc = doc
5
  self.title = title
6
+ self.title_fr = ""
7
  self.content = content
8
  self.content_fr = content_fr
9
  self.specials = []
 
13
  self.distance = distance
14
 
15
  def to_dict(self) -> {}:
16
+ block_dict = {'doc': self.doc, 'title': self.title, 'title_fr': self.title_fr, 'content': self.content,
17
+ 'content_fr': self.content_fr, 'index': self.index, 'rank': self.rank, 'level': self.level,
18
+ 'distance': self.distance}
19
  for i, s in enumerate(self.specials):
20
  special_key = 'special_'+str(i)
21
  block_dict[special_key] = s
 
25
  def from_dict(self, block_dict: {}):
26
  self.doc = block_dict['doc']
27
  self.title = block_dict['title']
28
+ self.title_fr = block_dict['title_fr']
29
  self.content = block_dict['content']
30
  self.content_fr = block_dict['content_fr']
31
  self.index = block_dict['index']
src/model/container.py CHANGED
@@ -6,17 +6,19 @@ INFINITE = 99999
6
 
7
  class Container:
8
 
9
- def __init__(self, paragraphs: [Paragraph], title: Paragraph = None, level: int = 0, rank: int = 0, father=None,
10
- id_=0):
 
 
11
  self.level = level
12
  self.title = title
13
  self.paragraphs = []
14
  self.children = []
15
- self.rank = rank
16
  self.father = father # if not father, then the container is at the top of the hierarchy
17
  self.id_ = int(str(1) + str(father.id_) + str(id_))
18
  if paragraphs:
19
- self.paragraphs, self.children = self.create_children(paragraphs, level, rank + 1)
20
  self.blocks = self.get_blocks()
21
 
22
  @property
@@ -47,7 +49,7 @@ class Container:
47
  return text_chunks
48
 
49
  def get_blocks(self):
50
- block = Block(rank=self.rank, level=self.level)
51
  if self.title:
52
  block.title = self.title.text
53
  for p in self.paragraphs:
@@ -62,28 +64,7 @@ class Container:
62
  blocks += child.blocks
63
  return blocks
64
 
65
- @property
66
- def table_of_contents(self):
67
- toc = []
68
- if self.title:
69
- toc += [{str(self.level): self.title.text}]
70
- if self.children:
71
- for child in self.children:
72
- toc += child.table_of_contents
73
- return toc
74
-
75
- def move(self, position: int, new_father=None):
76
- current_father = self.father # should be added in the model
77
- current_father.children.remove(self)
78
-
79
- self.rank = new_father.rank + 1 if new_father else 0
80
- self.father = new_father
81
- if position < len(new_father.children):
82
- new_father.children.insert(position, self)
83
- else:
84
- new_father.children.append(self)
85
-
86
- def create_children(self, paragraphs, level, rank) -> ([], []):
87
  """
88
  creates children containers or directly attached content
89
  and returns the list of containers and contents of level+1
@@ -95,8 +76,8 @@ class Container:
95
  container_title = None
96
  children = []
97
  in_children = False
98
- level = INFINITE
99
  child_id = 0
 
100
 
101
  while paragraphs:
102
  p = paragraphs.pop(0)
@@ -104,19 +85,31 @@ class Container:
104
  attached_paragraphs.append(p)
105
  else:
106
  in_children = True
107
- if p.is_structure and p.level <= level: # if p is higher or equal in hierarchy
108
  if container_paragraphs or container_title:
109
- children.append(Container(container_paragraphs, container_title, level, rank, self, child_id))
 
 
 
 
 
 
110
  child_id += 1
111
  container_paragraphs = []
112
  container_title = p
113
  level = p.level
114
 
115
- else: # p is strictly lower in hierarchy
116
  container_paragraphs.append(p)
117
 
118
  if container_paragraphs or container_title:
119
- children.append(Container(container_paragraphs, container_title, level, rank, self, child_id))
 
 
 
 
 
 
120
  child_id += 1
121
 
122
  return attached_paragraphs, children
 
6
 
7
  class Container:
8
 
9
+ def __init__(self, paragraphs: [Paragraph], title: Paragraph = None, level: int = 0, index: [int] = None,
10
+ father=None, id_=0):
11
+ if index is None:
12
+ index = []
13
  self.level = level
14
  self.title = title
15
  self.paragraphs = []
16
  self.children = []
17
+ self.index = index
18
  self.father = father # if not father, then the container is at the top of the hierarchy
19
  self.id_ = int(str(1) + str(father.id_) + str(id_))
20
  if paragraphs:
21
+ self.paragraphs, self.children = self.create_children(paragraphs, level, index)
22
  self.blocks = self.get_blocks()
23
 
24
  @property
 
49
  return text_chunks
50
 
51
  def get_blocks(self):
52
+ block = Block(level=self.level, index=self.index)
53
  if self.title:
54
  block.title = self.title.text
55
  for p in self.paragraphs:
 
64
  blocks += child.blocks
65
  return blocks
66
 
67
+ def create_children(self, paragraphs: Paragraph, level: int, index: [int]) -> ([Paragraph], []):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  """
69
  creates children containers or directly attached content
70
  and returns the list of containers and contents of level+1
 
76
  container_title = None
77
  children = []
78
  in_children = False
 
79
  child_id = 0
80
+ level = INFINITE
81
 
82
  while paragraphs:
83
  p = paragraphs.pop(0)
 
85
  attached_paragraphs.append(p)
86
  else:
87
  in_children = True
88
+ if p.is_structure and p.level <= level: # if p is higher in hierarchy, then the child is completed
89
  if container_paragraphs or container_title:
90
+ if level <= len(index):
91
+ index = index[:level]
92
+ index[-1] += 1
93
+ else:
94
+ for i in range(level-len(index)):
95
+ index.append(1)
96
+ children.append(Container(container_paragraphs, container_title, level, index, self, child_id))
97
  child_id += 1
98
  container_paragraphs = []
99
  container_title = p
100
  level = p.level
101
 
102
+ else: # p is normal text or strictly lower in hierarchy, then the child continues to grow
103
  container_paragraphs.append(p)
104
 
105
  if container_paragraphs or container_title:
106
+ if level <= len(index):
107
+ index = index[:level]
108
+ index[-1] += 1
109
+ else:
110
+ for i in range(level - len(index)):
111
+ index.append(1)
112
+ children.append(Container(container_paragraphs, container_title, level, index, self, child_id))
113
  child_id += 1
114
 
115
  return attached_paragraphs, children
src/model/doc.py CHANGED
@@ -13,7 +13,7 @@ class Doc:
13
  self.id_ = id(self)
14
  self.path = path
15
  paragraphs = [Paragraph(xp, self.id_, i) for (i, xp) in enumerate(self.xdoc.paragraphs)]
16
- self.container = Container(paragraphs, father=self)
17
  self.blocks = self.get_blocks()
18
 
19
  @property
@@ -29,21 +29,25 @@ class Doc:
29
  index_str += '.' + str(el)
30
  return index_str
31
 
32
- current_index = []
33
  blocks = self.container.blocks
34
  for block in blocks:
35
  block.doc = self.title
36
- current_level = len(current_index)
37
- if 0 < block.level:
38
- if block.level == current_level:
39
- current_index[-1] += 1
40
- elif current_level < block.level:
41
- current_index.append(1)
42
- elif block.level < current_level:
43
- current_index = current_index[:block.level]
44
- current_index[-1] += 1
45
- block.index = from_list_to_str(current_index)
46
- else:
47
- block.index = "0"
48
  blocks.remove(block)
 
49
  return blocks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  self.id_ = id(self)
14
  self.path = path
15
  paragraphs = [Paragraph(xp, self.id_, i) for (i, xp) in enumerate(self.xdoc.paragraphs)]
16
+ self.container = Container(paragraphs, father=self, level=0)
17
  self.blocks = self.get_blocks()
18
 
19
  @property
 
29
  index_str += '.' + str(el)
30
  return index_str
31
 
 
32
  blocks = self.container.blocks
33
  for block in blocks:
34
  block.doc = self.title
35
+ if block.level == 0:
 
 
 
 
 
 
 
 
 
 
 
36
  blocks.remove(block)
37
+ block.index = from_list_to_str(block.index)
38
  return blocks
39
+ """
40
+ current_level = len(current_index)
41
+ if 0 < block.level:
42
+ if block.level == current_level:
43
+ current_index[-1] += 1
44
+ elif current_level < block.level:
45
+ current_index.append(1)
46
+ elif block.level < current_level:
47
+ current_index = current_index[:block.level]
48
+ current_index[-1] += 1
49
+ block.index = from_list_to_str(current_index)
50
+ else:
51
+ block.index = "0"
52
+ """
53
+
src/tools/retriever.py CHANGED
@@ -12,6 +12,7 @@ class Retriever:
12
  cb.specials = pb.specials
13
  for cb, cb_fr in zip(content_blocks, content_fr_blocks):
14
  cb.content_fr = cb_fr.content
 
15
  self.collection = db_client.create_collection(name=collection_name)
16
  self.collection.add(
17
  documents=[block.content for block in plan_blocks],
 
12
  cb.specials = pb.specials
13
  for cb, cb_fr in zip(content_blocks, content_fr_blocks):
14
  cb.content_fr = cb_fr.content
15
+ cb.title_fr = cb_fr.title
16
  self.collection = db_client.create_collection(name=collection_name)
17
  self.collection.add(
18
  documents=[block.content for block in plan_blocks],
src/view/view.py CHANGED
@@ -30,11 +30,19 @@ def run(ctrl: Controller, config: {}):
30
  interactive=False,
31
  visible=False,
32
  )
33
- sources_comp = gr.CheckboxGroup(
34
  label="Documents sources",
35
  visible=False,
36
  interactive=False,
37
  )
 
 
 
 
 
 
 
 
38
 
39
  with gr.Column():
40
  pass
@@ -47,11 +55,17 @@ def run(ctrl: Controller, config: {}):
47
 
48
  def input_text_fn2(input_text_):
49
  answer, sources = ctrl.get_response(query_fr=input_text_)
50
- source_labels = [s.distance_str + ' ' + s.index + ' ' + s.title + ' from ' + s.doc for s in sources]
 
51
  update_ = {
52
  output_text_comp: gr.update(value=answer),
53
- sources_comp: gr.update(visible=True, choices=source_labels, value=source_labels)
54
  }
 
 
 
 
 
55
  return update_
56
 
57
  def input_example_fn(input_example_):
@@ -63,9 +77,12 @@ def run(ctrl: Controller, config: {}):
63
 
64
  input_text_comp \
65
  .submit(input_text_fn1, inputs=[], outputs=[output_text_comp]) \
66
- .then(input_text_fn2, inputs=[input_text_comp], outputs=[output_text_comp, sources_comp])
67
  input_example_comp \
68
  .change(input_example_fn, inputs=[input_example_comp], outputs=[input_text_comp, output_text_comp]) \
69
- .then(input_text_fn2, inputs=[input_text_comp], outputs=[output_text_comp, sources_comp])
 
 
 
70
 
71
  return qna
 
30
  interactive=False,
31
  visible=False,
32
  )
33
+ sources_title_comp = gr.CheckboxGroup(
34
  label="Documents sources",
35
  visible=False,
36
  interactive=False,
37
  )
38
+ source_text_comp = []
39
+ for i in range(4):
40
+ source_text_comp.append(gr.Textbox(
41
+ lines=4,
42
+ max_lines=4,
43
+ interactive=False,
44
+ visible=False,
45
+ ))
46
 
47
  with gr.Column():
48
  pass
 
55
 
56
  def input_text_fn2(input_text_):
57
  answer, sources = ctrl.get_response(query_fr=input_text_)
58
+ source_labels = [f'{s.distance_str} {s.index} {s.title} from {s.doc}' for s in sources]
59
+
60
  update_ = {
61
  output_text_comp: gr.update(value=answer),
62
+ sources_title_comp: gr.update(visible=False, choices=source_labels, value=source_labels),
63
  }
64
+ for i in range(min(len(sources), 4)):
65
+ s = sources[i]
66
+ source_label = f'{s.index} {s.title_fr} score = {s.distance_str}'
67
+ source_text = s.content_fr
68
+ update_[source_text_comp[i]] = gr.update(visible=True, value=source_text, label=source_label)
69
  return update_
70
 
71
  def input_example_fn(input_example_):
 
77
 
78
  input_text_comp \
79
  .submit(input_text_fn1, inputs=[], outputs=[output_text_comp]) \
80
+ .then(input_text_fn2, inputs=[input_text_comp], outputs=[output_text_comp, sources_title_comp])
81
  input_example_comp \
82
  .change(input_example_fn, inputs=[input_example_comp], outputs=[input_text_comp, output_text_comp]) \
83
+ .then(input_text_fn2,
84
+ inputs=[input_text_comp],
85
+ outputs=[output_text_comp, sources_title_comp,
86
+ source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
87
 
88
  return qna