Add tool to extract only parts of a website
Add tool `extract_website_content_parts` that does not return the whole website but only the parts matching a regular expression. This is necessary to avoid exceeding the LLM's context limit. Simply telling the model to produce such extraction code itself (i.e. what `extract_website_content_parts` implements) does not work: the LLM writes code that first prints the whole contents, thereby pushing the whole page into the LLM context anyway.
app.py
CHANGED
@@ -1,6 +1,8 @@
 import datetime
-import requests
 import pytz
+import re
+import requests
+from typing import List
 import yaml
 
 from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel, tool
@@ -24,6 +26,24 @@ def get_website_content(url: str) -> str:
     except requests.RequestException as e:
         return f"Error fetching website content: {str(e)}"
 
+@tool
+def extract_website_content_parts(url: str, extraction_pattern: str) -> List[str]:
+    """
+    This tool extracts content parts matching the regular expression string `extraction_pattern` of a website given its `url`.
+    Args:
+        url: The URL of the website from which content parts should be extracted
+        extraction_pattern: The regular expression string of the content parts to extract from the website
+    Returns:
+        List[str]: The content parts matching extraction_pattern of the website `url`
+    """
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        matches: List[str] = re.findall(extraction_pattern, response.text)
+        return matches
+    except requests.RequestException as e:
+        return [f"Error fetching website content: {str(e)}"]
+
 @tool
 def get_papers_url_for_date(year:int, month:int, day:int)-> str:
     """A tool that constructs a URL where machine learning papers for a specific date (YYYY-MM-DD) are listed.
|