milwright committed on
Commit
65ea2b0
·
1 Parent(s): 217dd65

implement url prioritization with primary (5000 chars) and secondary (2500 chars) sources

Browse files
Files changed (3) hide show
  1. .gitignore +4 -1
  2. app.py +26 -9
  3. space_template.py +35 -17
.gitignore CHANGED
@@ -52,4 +52,7 @@ CLAUDE.md
52
  *_guide.md
53
 
54
  # Testing folder
55
- testing/
 
 
 
 
52
  *_guide.md
53
 
54
  # Testing folder
55
+ testing/
56
+
57
+ # Test files
58
+ test_complete_system.py
app.py CHANGED
@@ -232,7 +232,7 @@ class SpaceGenerator:
232
  # Create initial URL inputs
233
  for i in range(10):
234
  url_input = gr.Textbox(
235
- label=f"URL {i+1}" + (" (Primary)" if i < 2 else " (Secondary)"),
236
  placeholder="https://...",
237
  visible=(i < 2)
238
  )
@@ -541,14 +541,31 @@ class SpaceGenerator:
541
  grounding_context = ""
542
  urls = config.get('grounding_urls', [])
543
  if urls and len(urls) > 0:
544
- grounding_context = "\n📚 **Reference Context:**\n"
545
- for i, url in enumerate(urls[:2], 1): # Primary URLs only
546
- try:
547
- content = fetch_url_content(url)
548
- if not content.startswith("❌") and not content.startswith("⏱️"):
549
- grounding_context += f"\n**Source {i}:** {content}\n"
550
- except:
551
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
552
 
553
  # Build messages for API with grounding context in system prompt
554
  system_content = config.get('system_prompt', 'You are a helpful AI assistant.')
 
232
  # Create initial URL inputs
233
  for i in range(10):
234
  url_input = gr.Textbox(
235
+ label=f"URL {i+1}" + (" (Primary - 5000 chars)" if i < 2 else " (Secondary - 2500 chars)"),
236
  placeholder="https://...",
237
  visible=(i < 2)
238
  )
 
541
  grounding_context = ""
542
  urls = config.get('grounding_urls', [])
543
  if urls and len(urls) > 0:
544
+ grounding_context = ""
545
+
546
+ # Process primary sources (first 2 URLs)
547
+ primary_urls = urls[:2]
548
+ if primary_urls:
549
+ grounding_context += "\n📚 **PRIMARY SOURCES:**\n"
550
+ for i, url in enumerate(primary_urls, 1):
551
+ try:
552
+ content = fetch_url_content(url, max_chars=5000)
553
+ if not content.startswith("❌") and not content.startswith("⏱️"):
554
+ grounding_context += f"\n**Primary Source {i}:** {content}\n"
555
+ except:
556
+ pass
557
+
558
+ # Process secondary sources (URLs 3+)
559
+ secondary_urls = urls[2:]
560
+ if secondary_urls:
561
+ grounding_context += "\n\n📎 **SECONDARY SOURCES:**\n"
562
+ for i, url in enumerate(secondary_urls, 1):
563
+ try:
564
+ content = fetch_url_content(url, max_chars=2500)
565
+ if not content.startswith("❌") and not content.startswith("⏱️"):
566
+ grounding_context += f"\n**Secondary Source {i}:** {content}\n"
567
+ except:
568
+ pass
569
 
570
  # Build messages for API with grounding context in system prompt
571
  system_content = config.get('system_prompt', 'You are a helpful AI assistant.')
space_template.py CHANGED
@@ -181,7 +181,7 @@ def validate_url_domain(url: str) -> bool:
181
  return False
182
 
183
 
184
- def fetch_url_content(url: str) -> str:
185
  """Fetch and convert URL content to text"""
186
  try:
187
  if not validate_url_domain(url):
@@ -210,15 +210,15 @@ def fetch_url_content(url: str) -> str:
210
  text = ' '.join(text.split())
211
 
212
  # Limit content length
213
- if len(text) > 3000:
214
- text = text[:3000] + "... [truncated]"
215
 
216
  return f"📄 Content from {{url}}:\\n{{text}}\\n"
217
 
218
  elif any(ct in content_type for ct in ['text/plain', 'application/json']):
219
  text = response.text
220
- if len(text) > 3000:
221
- text = text[:3000] + "... [truncated]"
222
  return f"📄 Content from {{url}}:\\n{{text}}\\n"
223
 
224
  else:
@@ -299,19 +299,37 @@ def get_grounding_context() -> str:
299
  if not urls:
300
  return ""
301
 
302
- context_parts = ["📚 **Reference Context:**\\n"]
303
 
304
- for i, url in enumerate(urls[:2], 1): # Primary URLs only
305
- if url in _url_content_cache:
306
- content = _url_content_cache[url]
307
- else:
308
- content = fetch_url_content(url)
309
- _url_content_cache[url] = content
310
-
311
- if not content.startswith("❌") and not content.startswith("⏱️"):
312
- context_parts.append(f"\\n**Source {{i}}:** {{content}}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
 
314
- if len(context_parts) > 1:
315
  return "\\n".join(context_parts)
316
  return ""
317
 
@@ -825,7 +843,7 @@ def create_interface():
825
  placeholder="https://example.com/docs\\nhttps://example.com/api",
826
  value='\\n'.join(config.get('grounding_urls', [])),
827
  lines=5,
828
- info="Add URLs to provide context. First 2 URLs are primary sources."
829
  )
830
 
831
  with gr.Row():
 
181
  return False
182
 
183
 
184
+ def fetch_url_content(url: str, max_length: int = 3000) -> str:
185
  """Fetch and convert URL content to text"""
186
  try:
187
  if not validate_url_domain(url):
 
210
  text = ' '.join(text.split())
211
 
212
  # Limit content length
213
+ if len(text) > max_length:
214
+ text = text[:max_length] + "... [truncated]"
215
 
216
  return f"📄 Content from {{url}}:\\n{{text}}\\n"
217
 
218
  elif any(ct in content_type for ct in ['text/plain', 'application/json']):
219
  text = response.text
220
+ if len(text) > max_length:
221
+ text = text[:max_length] + "... [truncated]"
222
  return f"📄 Content from {{url}}:\\n{{text}}\\n"
223
 
224
  else:
 
299
  if not urls:
300
  return ""
301
 
302
+ context_parts = []
303
 
304
+ # Process primary sources (first 2 URLs with 5000 char limit)
305
+ primary_urls = urls[:2]
306
+ if primary_urls:
307
+ context_parts.append("📚 **PRIMARY SOURCES:**\\n")
308
+ for i, url in enumerate(primary_urls, 1):
309
+ if url in _url_content_cache:
310
+ content = _url_content_cache[url]
311
+ else:
312
+ content = fetch_url_content(url, max_length=5000)
313
+ _url_content_cache[url] = content
314
+
315
+ if not content.startswith("❌") and not content.startswith("⏱️"):
316
+ context_parts.append(f"\\n**Primary Source {{i}}:** {{content}}")
317
+
318
+ # Process secondary sources (URLs 3+ with 2500 char limit)
319
+ secondary_urls = urls[2:]
320
+ if secondary_urls:
321
+ context_parts.append("\\n\\n📎 **SECONDARY SOURCES:**\\n")
322
+ for i, url in enumerate(secondary_urls, 1):
323
+ if url in _url_content_cache:
324
+ content = _url_content_cache[url]
325
+ else:
326
+ content = fetch_url_content(url, max_length=2500)
327
+ _url_content_cache[url] = content
328
+
329
+ if not content.startswith("❌") and not content.startswith("⏱️"):
330
+ context_parts.append(f"\\n**Secondary Source {{i}}:** {{content}}")
331
 
332
+ if len(context_parts) > 0:
333
  return "\\n".join(context_parts)
334
  return ""
335
 
 
843
  placeholder="https://example.com/docs\\nhttps://example.com/api",
844
  value='\\n'.join(config.get('grounding_urls', [])),
845
  lines=5,
846
+ info="First 2 URLs: Primary sources (5000 chars). URLs 3+: Secondary sources (2500 chars)."
847
  )
848
 
849
  with gr.Row():