Carlos Rosas committed
Commit ca3da3d · verified · 1 parent: 201543f

Update app.py

Files changed (1): app.py (+27 -45)
app.py CHANGED
@@ -32,7 +32,7 @@ tokenizer.pad_token = tokenizer.eos_token
 tokenizer.pad_token_id = 1
 
 # Define variables
-temperature = 0
+temperature = 0.0
 max_new_tokens = 1200
 top_p = 0.95
 repetition_penalty = 1.0
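Typing temperature as 0.0 makes the greedy-decoding intent explicit. A minimal sketch of how a value of 0.0 is typically honored at generation time; the generate() call itself is outside this diff, so model, input_ids, and attention_mask are assumed from the surrounding app.py:

# Sketch only: recent transformers versions reject temperature == 0 when
# do_sample=True, so a zero temperature is usually mapped to greedy decoding.
if temperature > 0.0:
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )
else:
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=False,  # greedy decoding: sampling knobs are simply not passed
        repetition_penalty=repetition_penalty,
    )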
@@ -62,20 +62,13 @@ def hybrid_search(text):
         title = row['section']
         content = row['text']
 
-        source_text = f"<|source_start|><|source_id_start|>{hash_id}<|source_id_end|>{title}\n{content}<|source_end|>"
-        document.append(source_text)
+        document.append(f"<|source_start|><|source_id_start|>{hash_id}<|source_id_end|>{title}\n{content}<|source_end|>")
         document_html.append(f'<div class="source" id="{hash_id}"><p><b>{hash_id}</b> : {title}<br>{content}</div>')
-
-        # Add debug print
-        print(f"Source added: {hash_id}")
 
     document = "\n".join(document)
     document_html = '<div id="source_listing">' + "".join(document_html) + "</div>"
-
-    # Add debug print
-    print(f"Total sources: {len(seen_hashes)}")
     return document, document_html
-
+
 class pleiasBot:
     def __init__(self, system_prompt="Tu es Appli, un asistant de recherche qui donne des responses sourcées"):
        self.system_prompt = system_prompt
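The refactor above inlines source_text into the append call and drops the per-source debug prints. For reference, each appended block uses the model's source markup; a hypothetical example with invented hash and section values:

hash_id, title, content = "a1b2c3", "Section 12", "Texte de la fiche..."
print(f"<|source_start|><|source_id_start|>{hash_id}<|source_id_end|>{title}\n{content}<|source_end|>")
# <|source_start|><|source_id_start|>a1b2c3<|source_id_end|>Section 12
# Texte de la fiche...<|source_end|>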
@@ -84,13 +77,9 @@ class pleiasBot:
         fiches, fiches_html = hybrid_search(user_message)
 
         detailed_prompt = f"""<|query_start|>{user_message}<|query_end|>\n{fiches}\n<|source_analysis_start|>"""
-
-        # Add debug print
-        print("Model input length:", len(detailed_prompt))
-
+
         # Convert inputs to tensor
         input_ids = tokenizer.encode(detailed_prompt, return_tensors="pt").to(device)
-        print("Token count:", len(input_ids[0]))
         attention_mask = torch.ones_like(input_ids)
 
         try:
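The token-count print goes away with the other debug output. If that number is ever needed again, it can be read from the tensor shape rather than logged:

token_count = input_ids.shape[1]  # same value as len(input_ids[0]), without the print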
@@ -132,40 +121,33 @@ class pleiasBot:
         traceback.print_exc()
         return None, None, None
 
-def hybrid_search(text):
-    results = table.search(text, query_type="hybrid").limit(5).to_pandas()
-
-    # Use a list to maintain order
-    seen_hashes = []
-
-    document = []
-    document_html = []
-    for _, row in results.iterrows():
-        hash_id = str(row['hash'])
-
-        # Skip if we've already seen this hash
-        if hash_id in seen_hashes:
-            continue
-
-        seen_hashes.append(hash_id)  # append instead of add to maintain order
-        title = row['section']
-        content = row['text']
-
-        source_text = f"<|source_start|><|source_id_start|>{hash_id}<|source_id_end|>{title}\n{content}<|source_end|>"
-        document.append(source_text)
-        document_html.append(f'<div class="source" id="{hash_id}"><p><b>{hash_id}</b> : {title}<br>{content}</div>')
-
-        # Print for debugging
-        print(f"Added source {hash_id}")
-        print(f"Length of source text: {len(source_text)}")
-
-    document = "\n".join(document)
-    document_html = '<div id="source_listing">' + "".join(document_html) + "</div>"
-
-    # Print total length for debugging
-    print(f"Total length of document: {len(document)}")
-
-    return document, document_html
+def format_references(text):
+    ref_pattern = r'<ref name="([^"]+)">"([^"]+)"</ref>\.\s*'  # Modified pattern to include the period and whitespace after ref
+
+    parts = []
+    current_pos = 0
+    ref_number = 1
+
+    for match in re.finditer(ref_pattern, text):
+        # Add text before the reference
+        text_before = text[current_pos:match.start()].rstrip()
+        parts.append(text_before)
+
+        # Extract reference components
+        ref_id = match.group(1)
+        ref_text = match.group(2).strip()
+
+        # Add the reference, keeping the existing structure but adding <br> where whitespace was
+        tooltip_html = f'<span class="tooltip"><strong>[{ref_number}]</strong><span class="tooltiptext"><strong>{ref_id}</strong>: {ref_text}</span></span>.<br>'
+        parts.append(tooltip_html)
+
+        current_pos = match.end()
+        ref_number += 1
+
+    # Add any remaining text
+    parts.append(text[current_pos:])
+
+    return ''.join(parts)
 
 # Initialize the pleiasBot
 pleias_bot = pleiasBot()
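The stray second definition of hybrid_search, which shadowed the first one and reintroduced the debug prints, is replaced by the new format_references helper. A quick sanity check on a hypothetical input, assuming re is imported at the top of app.py:

sample = 'Les communes le peuvent<ref name="a1b2c3">"Texte de la source"</ref>. La suite.'
print(format_references(sample))
# Les communes le peuvent<span class="tooltip"><strong>[1]</strong><span class="tooltiptext"><strong>a1b2c3</strong>: Texte de la source</span></span>.<br>La suite.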
 