Carlos Rosas committed
Commit ca3da3d · verified · 1 parent: 201543f

Update app.py

Files changed (1): app.py (+27 -45)
app.py CHANGED
@@ -32,7 +32,7 @@ tokenizer.pad_token = tokenizer.eos_token
 tokenizer.pad_token_id = 1
 
 # Define variables
-temperature = 0
+temperature = 0.0
 max_new_tokens = 1200
 top_p = 0.95
 repetition_penalty = 1.0
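Typing temperature as 0.0 makes the greedy-decoding intent explicit. A minimal sketch of how a value of 0.0 is typically honored at generation time; the generate() call itself is outside this diff, so model, input_ids, and attention_mask are assumed from the surrounding app.py:

# Sketch only: recent transformers versions reject temperature == 0 when
# do_sample=True, so a zero temperature is usually mapped to greedy decoding.
if temperature > 0.0:
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )
else:
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=False,  # greedy decoding: sampling knobs are simply not passed
        repetition_penalty=repetition_penalty,
    )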
@@ -62,20 +62,13 @@ def hybrid_search(text):
         title = row['section']
         content = row['text']
 
-        source_text = f"<|source_start|><|source_id_start|>{hash_id}<|source_id_end|>{title}\n{content}<|source_end|>"
-        document.append(source_text)
+        document.append(f"<|source_start|><|source_id_start|>{hash_id}<|source_id_end|>{title}\n{content}<|source_end|>")
         document_html.append(f'<div class="source" id="{hash_id}"><p><b>{hash_id}</b> : {title}<br>{content}</div>')
-
-        # Add debug print
-        print(f"Source added: {hash_id}")
 
     document = "\n".join(document)
     document_html = '<div id="source_listing">' + "".join(document_html) + "</div>"
-
-    # Add debug print
-    print(f"Total sources: {len(seen_hashes)}")
     return document, document_html
-
+
 class pleiasBot:
     def __init__(self, system_prompt="Tu es Appli, un asistant de recherche qui donne des responses sourcées"):
        self.system_prompt = system_prompt
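The refactor above inlines source_text into the append call and drops the per-source debug prints. For reference, each appended block uses the model's source markup; a hypothetical example with invented hash and section values:

hash_id, title, content = "a1b2c3", "Section 12", "Texte de la fiche..."
print(f"<|source_start|><|source_id_start|>{hash_id}<|source_id_end|>{title}\n{content}<|source_end|>")
# <|source_start|><|source_id_start|>a1b2c3<|source_id_end|>Section 12
# Texte de la fiche...<|source_end|>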
@@ -84,13 +77,9 @@ class pleiasBot:
         fiches, fiches_html = hybrid_search(user_message)
 
         detailed_prompt = f"""<|query_start|>{user_message}<|query_end|>\n{fiches}\n<|source_analysis_start|>"""
-
-        # Add debug print
-        print("Model input length:", len(detailed_prompt))
-
+
         # Convert inputs to tensor
         input_ids = tokenizer.encode(detailed_prompt, return_tensors="pt").to(device)
-        print("Token count:", len(input_ids[0]))
         attention_mask = torch.ones_like(input_ids)
 
         try:
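The token-count print goes away with the other debug output. If that number is ever needed again, it can be read from the tensor shape rather than logged:

token_count = input_ids.shape[1]  # same value as len(input_ids[0]), without the print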
@@ -132,40 +121,33 @@ class pleiasBot:
         traceback.print_exc()
         return None, None, None
 
-def hybrid_search(text):
-    results = table.search(text, query_type="hybrid").limit(5).to_pandas()
-
-    # Use a list to maintain order
-    seen_hashes = []
-
-    document = []
-    document_html = []
-    for _, row in results.iterrows():
-        hash_id = str(row['hash'])
-
-        # Skip if we've already seen this hash
-        if hash_id in seen_hashes:
-            continue
-
-        seen_hashes.append(hash_id)  # append instead of add to maintain order
-        title = row['section']
-        content = row['text']
-
-        source_text = f"<|source_start|><|source_id_start|>{hash_id}<|source_id_end|>{title}\n{content}<|source_end|>"
-        document.append(source_text)
-        document_html.append(f'<div class="source" id="{hash_id}"><p><b>{hash_id}</b> : {title}<br>{content}</div>')
-
-        # Print for debugging
-        print(f"Added source {hash_id}")
-        print(f"Length of source text: {len(source_text)}")
-
-    document = "\n".join(document)
-    document_html = '<div id="source_listing">' + "".join(document_html) + "</div>"
-
-    # Print total length for debugging
-    print(f"Total length of document: {len(document)}")
-
-    return document, document_html
+def format_references(text):
+    ref_pattern = r'<ref name="([^"]+)">"([^"]+)"</ref>\.\s*'  # Modified pattern to include the period and whitespace after ref
+
+    parts = []
+    current_pos = 0
+    ref_number = 1
+
+    for match in re.finditer(ref_pattern, text):
+        # Add text before the reference
+        text_before = text[current_pos:match.start()].rstrip()
+        parts.append(text_before)
+
+        # Extract reference components
+        ref_id = match.group(1)
+        ref_text = match.group(2).strip()
+
+        # Add the reference, keeping the existing structure but adding <br> where whitespace was
+        tooltip_html = f'<span class="tooltip"><strong>[{ref_number}]</strong><span class="tooltiptext"><strong>{ref_id}</strong>: {ref_text}</span></span>.<br>'
+        parts.append(tooltip_html)
+
+        current_pos = match.end()
+        ref_number += 1
+
+    # Add any remaining text
+    parts.append(text[current_pos:])
+
+    return ''.join(parts)
 
 # Initialize the pleiasBot
 pleias_bot = pleiasBot()
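The stray second definition of hybrid_search, which shadowed the first one and reintroduced the debug prints, is replaced by the new format_references helper. A quick sanity check on a hypothetical input, assuming re is imported at the top of app.py:

sample = 'Les communes le peuvent<ref name="a1b2c3">"Texte de la source"</ref>. La suite.'
print(format_references(sample))
# Les communes le peuvent<span class="tooltip"><strong>[1]</strong><span class="tooltiptext"><strong>a1b2c3</strong>: Texte de la source</span></span>.<br>La suite.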
 