gutgut

Paused

App Files Files Community

Carlos Rosas committed on Nov 28, 2024

Commit

201543f

verified ·

1 Parent(s): cbf42f2

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -11

app.py CHANGED Viewed

@@ -46,8 +46,8 @@ table = db.open_table("edunat19")
 def hybrid_search(text):
     results = table.search(text, query_type="hybrid").limit(5).to_pandas()
-    # Use a list to maintain order
-    seen_hashes = []
     document = []
     document_html = []
@@ -58,24 +58,22 @@ def hybrid_search(text):
         if hash_id in seen_hashes:
             continue
-        seen_hashes.append(hash_id)  # append instead of add to maintain order
         title = row['section']
         content = row['text']
         source_text = f"<|source_start|><|source_id_start|>{hash_id}<|source_id_end|>{title}\n{content}<|source_end|>"
         document.append(source_text)
         document_html.append(f'<div class="source" id="{hash_id}"><p><b>{hash_id}</b> : {title}<br>{content}</div>')
-        # Print for debugging
-        print(f"Added source {hash_id}")
-        print(f"Length of source text: {len(source_text)}")
     document = "\n".join(document)
     document_html = '<div id="source_listing">' + "".join(document_html) + "</div>"
-    # Print total length for debugging
-    print(f"Total length of document: {len(document)}")
     return document, document_html
 class pleiasBot:
@@ -86,9 +84,13 @@ class pleiasBot:
         fiches, fiches_html = hybrid_search(user_message)
         detailed_prompt = f"""<|query_start|>{user_message}<|query_end|>\n{fiches}\n<|source_analysis_start|>"""
         # Convert inputs to tensor
         input_ids = tokenizer.encode(detailed_prompt, return_tensors="pt").to(device)
         attention_mask = torch.ones_like(input_ids)
         try:

 def hybrid_search(text):
     results = table.search(text, query_type="hybrid").limit(5).to_pandas()
+    # Add a check for duplicate hashes
+    seen_hashes = set()
     document = []
     document_html = []
         if hash_id in seen_hashes:
             continue
+        seen_hashes.add(hash_id)
         title = row['section']
         content = row['text']
         source_text = f"<|source_start|><|source_id_start|>{hash_id}<|source_id_end|>{title}\n{content}<|source_end|>"
         document.append(source_text)
         document_html.append(f'<div class="source" id="{hash_id}"><p><b>{hash_id}</b> : {title}<br>{content}</div>')
+        # Add debug print
+        print(f"Source added: {hash_id}")
     document = "\n".join(document)
     document_html = '<div id="source_listing">' + "".join(document_html) + "</div>"
+    # Add debug print
+    print(f"Total sources: {len(seen_hashes)}")
     return document, document_html
 class pleiasBot:
         fiches, fiches_html = hybrid_search(user_message)
         detailed_prompt = f"""<|query_start|>{user_message}<|query_end|>\n{fiches}\n<|source_analysis_start|>"""
+        # Add debug print
+        print("Model input length:", len(detailed_prompt))
         # Convert inputs to tensor
         input_ids = tokenizer.encode(detailed_prompt, return_tensors="pt").to(device)
+        print("Token count:", len(input_ids[0]))
         attention_mask = torch.ones_like(input_ids)
         try: