dball committed
Commit 9caf3d5 · verified · 1 Parent(s): 0e1c853

Add tool to only extract parts of a website


Add tool `extract_website_content_parts` that does not return the whole website but only the parts matching a regexp.

This is necessary to avoid exceeding the LLM's context limit. Just telling the model to produce the equivalent code itself (i.e. what is implemented in `extract_website_content_parts`) does not work: the LLM produces code that always prints the whole contents first, i.e. pushes the whole contents into the LLM context.
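For illustration, a minimal sketch of how agent-generated code might call the new tool; the URL and the regular expression below are hypothetical examples, not part of the commit:

    # Hypothetical call: fetch only the fragments of the page that match the
    # pattern, so the full HTML never enters the LLM's context.
    paper_links = extract_website_content_parts(
        url="https://huggingface.co/papers",           # hypothetical URL
        extraction_pattern=r'href="(/papers/[^"]+)"',  # hypothetical regex
    )
    for link in paper_links:
        print(link)  # each match is a short string, not the whole page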

Files changed (1)
  1. app.py +21 -1
app.py CHANGED
@@ -1,6 +1,8 @@
 import datetime
-import requests
 import pytz
+import re
+import requests
+from typing import List
 import yaml
 
 from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel, tool
@@ -24,6 +26,24 @@ def get_website_content(url: str) -> str:
     except requests.RequestException as e:
         return f"Error fetching website content: {str(e)}"
 
+@tool
+def extract_website_content_parts(url: str, extraction_pattern: str) -> List[str]:
+    """
+    This tool extracts content parts matching the regular expression string `extraction_pattern` of a website given its `url`.
+    Args:
+        url: The URL of the website from which content parts should be extracted
+        extraction_pattern: The regular expression string of the content parts to extract from the website
+    Returns:
+        List[str]: The content parts matching extraction_pattern of the website `url`
+    """
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        matches: List[str] = re.findall(extraction_pattern, response.text)
+        return matches
+    except requests.RequestException as e:
+        return [f"Error fetching website content: {str(e)}"]
+
 @tool
 def get_papers_url_for_date(year:int, month:int, day:int)-> str:
     """A tool that constructs a URL where machine learning papers for a specific date (YYYY-MM-DD) are listed.