Spaces:

milwright
/

chatui-helper

Running

App Files Files Community

milwright committed 25 days ago

Commit

65ea2b0

1 Parent(s): 217dd65

Implement URL prioritization: primary sources (first 2 URLs, 5000 chars each) and secondary sources (URLs 3+, 2500 chars each)

Browse files

Files changed (3) hide show

.gitignore +4 -1
app.py +26 -9
space_template.py +35 -17

.gitignore CHANGED Viewed

@@ -52,4 +52,7 @@ CLAUDE.md
 *_guide.md
 # Testing folder
-testing/

 *_guide.md
 # Testing folder
+testing/
+# Test files
+test_complete_system.py

app.py CHANGED Viewed

@@ -232,7 +232,7 @@ class SpaceGenerator:
                 # Create initial URL inputs
                 for i in range(10):
                     url_input = gr.Textbox(
-                        label=f"URL {i+1}" + (" (Primary)" if i < 2 else " (Secondary)"),
                         placeholder="https://...",
                         visible=(i < 2)
                     )
@@ -541,14 +541,31 @@ class SpaceGenerator:
                             grounding_context = ""
                             urls = config.get('grounding_urls', [])
                             if urls and len(urls) > 0:
-                                grounding_context = "\n📚 **Reference Context:**\n"
-                                for i, url in enumerate(urls[:2], 1):  # Primary URLs only
-                                    try:
-                                        content = fetch_url_content(url)
-                                        if not content.startswith("❌") and not content.startswith("⏱️"):
-                                            grounding_context += f"\n**Source {i}:** {content}\n"
-                                    except:
-                                        pass
                             # Build messages for API with grounding context in system prompt
                             system_content = config.get('system_prompt', 'You are a helpful AI assistant.')

                 # Create initial URL inputs
                 for i in range(10):
                     url_input = gr.Textbox(
+                        label=f"URL {i+1}" + (" (Primary - 5000 chars)" if i < 2 else " (Secondary - 2500 chars)"),
                         placeholder="https://...",
                         visible=(i < 2)
                     )
                             grounding_context = ""
                             urls = config.get('grounding_urls', [])
                             if urls and len(urls) > 0:
+                                grounding_context = ""
+                                # Process primary sources (first 2 URLs)
+                                primary_urls = urls[:2]
+                                if primary_urls:
+                                    grounding_context += "\n📚 **PRIMARY SOURCES:**\n"
+                                    for i, url in enumerate(primary_urls, 1):
+                                        try:
+                                            content = fetch_url_content(url, max_chars=5000)
+                                            if not content.startswith("❌") and not content.startswith("⏱️"):
+                                                grounding_context += f"\n**Primary Source {i}:** {content}\n"
+                                        except:
+                                            pass
+                                # Process secondary sources (URLs 3+)
+                                secondary_urls = urls[2:]
+                                if secondary_urls:
+                                    grounding_context += "\n\n📎 **SECONDARY SOURCES:**\n"
+                                    for i, url in enumerate(secondary_urls, 1):
+                                        try:
+                                            content = fetch_url_content(url, max_chars=2500)
+                                            if not content.startswith("❌") and not content.startswith("⏱️"):
+                                                grounding_context += f"\n**Secondary Source {i}:** {content}\n"
+                                        except:
+                                            pass
                             # Build messages for API with grounding context in system prompt
                             system_content = config.get('system_prompt', 'You are a helpful AI assistant.')

space_template.py CHANGED Viewed

@@ -181,7 +181,7 @@ def validate_url_domain(url: str) -> bool:
         return False
-def fetch_url_content(url: str) -> str:
     """Fetch and convert URL content to text"""
     try:
         if not validate_url_domain(url):
@@ -210,15 +210,15 @@ def fetch_url_content(url: str) -> str:
             text = ' '.join(text.split())
             # Limit content length
-            if len(text) > 3000:
-                text = text[:3000] + "... [truncated]"
             return f"📄 Content from {{url}}:\\n{{text}}\\n"
         elif any(ct in content_type for ct in ['text/plain', 'application/json']):
             text = response.text
-            if len(text) > 3000:
-                text = text[:3000] + "... [truncated]"
             return f"📄 Content from {{url}}:\\n{{text}}\\n"
         else:
@@ -299,19 +299,37 @@ def get_grounding_context() -> str:
     if not urls:
         return ""
-    context_parts = ["📚 **Reference Context:**\\n"]
-    for i, url in enumerate(urls[:2], 1):  # Primary URLs only
-        if url in _url_content_cache:
-            content = _url_content_cache[url]
-        else:
-            content = fetch_url_content(url)
-            _url_content_cache[url] = content
-        if not content.startswith("❌") and not content.startswith("⏱️"):
-            context_parts.append(f"\\n**Source {{i}}:** {{content}}")
-    if len(context_parts) > 1:
         return "\\n".join(context_parts)
     return ""
@@ -825,7 +843,7 @@ def create_interface():
                             placeholder="https://example.com/docs\\nhttps://example.com/api",
                             value='\\n'.join(config.get('grounding_urls', [])),
                             lines=5,
-                            info="Add URLs to provide context. First 2 URLs are primary sources."
                         )
                         with gr.Row():

         return False
+def fetch_url_content(url: str, max_length: int = 3000) -> str:
     """Fetch and convert URL content to text"""
     try:
         if not validate_url_domain(url):
             text = ' '.join(text.split())
             # Limit content length
+            if len(text) > max_length:
+                text = text[:max_length] + "... [truncated]"
             return f"📄 Content from {{url}}:\\n{{text}}\\n"
         elif any(ct in content_type for ct in ['text/plain', 'application/json']):
             text = response.text
+            if len(text) > max_length:
+                text = text[:max_length] + "... [truncated]"
             return f"📄 Content from {{url}}:\\n{{text}}\\n"
         else:
     if not urls:
         return ""
+    context_parts = []
+    # Process primary sources (first 2 URLs with 5000 char limit)
+    primary_urls = urls[:2]
+    if primary_urls:
+        context_parts.append("📚 **PRIMARY SOURCES:**\\n")
+        for i, url in enumerate(primary_urls, 1):
+            if url in _url_content_cache:
+                content = _url_content_cache[url]
+            else:
+                content = fetch_url_content(url, max_length=5000)
+                _url_content_cache[url] = content
+            if not content.startswith("❌") and not content.startswith("⏱️"):
+                context_parts.append(f"\\n**Primary Source {{i}}:** {{content}}")
+    # Process secondary sources (URLs 3+ with 2500 char limit)
+    secondary_urls = urls[2:]
+    if secondary_urls:
+        context_parts.append("\\n\\n📎 **SECONDARY SOURCES:**\\n")
+        for i, url in enumerate(secondary_urls, 1):
+            if url in _url_content_cache:
+                content = _url_content_cache[url]
+            else:
+                content = fetch_url_content(url, max_length=2500)
+                _url_content_cache[url] = content
+            if not content.startswith("❌") and not content.startswith("⏱️"):
+                context_parts.append(f"\\n**Secondary Source {{i}}:** {{content}}")
+    if len(context_parts) > 0:
         return "\\n".join(context_parts)
     return ""
                             placeholder="https://example.com/docs\\nhttps://example.com/api",
                             value='\\n'.join(config.get('grounding_urls', [])),
                             lines=5,
+                            info="First 2 URLs: Primary sources (5000 chars). URLs 3+: Secondary sources (2500 chars)."
                         )
                         with gr.Row():