watanabe3tipapa committed
Commit • 6b7ef92
1 Parent(s): 9bd5920
Upload 4 files
tools/__pycache__/fetch_page.cpython-311.pyc
ADDED
Binary file (4.41 kB)
tools/__pycache__/search_ddg.cpython-311.pyc
ADDED
Binary file (3.14 kB)
tools/fetch_page.py
ADDED
@@ -0,0 +1,97 @@
+import requests
+import html2text
+from readability import Document
+from langchain.agents import Tool
+from urllib.parse import urlparse, parse_qs, urlunparse
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+def fetch_page(url, model_name='gpt-3.5-turbo', timeout_sec=10):
+    """Tool to fetch the content of a web page from a given URL.
+
+    - This returns `title`, `content`, and a `has_next` indicator. `content` is returned in markdown format.
+    - By default, only up to 2,000 tokens of content are retrieved.
+    - If there is more content available on the page, the `has_next` value will be True.
+    - To read the continuation, increment the `page` parameter of the same URL and input it again.
+
+    Returns
+    -------
+    Dict[str, Any]:
+        - status: str
+        - page_content
+            - title: str
+            - content: str
+            - has_next: bool
+    """
+    # Extract the 1-based `page` query parameter and strip the query string from the URL.
+    parsed_url = urlparse(url)
+    parsed_qs = parse_qs(parsed_url.query)
+    page = int(parsed_qs.get("page", [1])[0]) - 1
+    url = urlunparse(
+        (parsed_url.scheme, parsed_url.netloc, parsed_url.path, "", "", "")
+    )
+
+    try:
+        response = requests.get(url, timeout=timeout_sec)
+        response.encoding = 'utf-8'
+    except requests.exceptions.Timeout:
+        return {
+            "status": 500,
+            "page_content": {'error_message': 'Could not download page due to Timeout Error. Please try to fetch other pages.'}
+        }
+
+    if response.status_code != 200:
+        return {
+            "status": response.status_code,
+            "page_content": {'error_message': 'Could not download page. Please try to fetch other pages.'}
+        }
+
+    try:
+        # Extract the main article with readability and convert it to markdown.
+        doc = Document(response.text)
+        title = doc.title()
+        html_content = doc.summary()
+        content = html2text.html2text(html_content)
+    except Exception:
+        return {
+            "status": 500,
+            "page_content": {'error_message': 'Could not parse page. Please try to fetch other pages.'}
+        }
+
+    # Split the markdown into token-bounded chunks and return only the requested page.
+    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+        model_name=model_name,
+        chunk_size=1000,
+        chunk_overlap=0,
+    )
+    chunks = text_splitter.split_text(content)
+    if page >= len(chunks):
+        return {
+            "status": 500,
+            "page_content": {'error_message': 'page parameter looks invalid. Please try to fetch other pages.'}
+        }
+    else:
+        return {
+            "status": 200,
+            "page_content": {
+                "title": title,
+                "content": chunks[page],
+                "has_next": page < len(chunks) - 1
+            }
+        }
+
+
+def get_fetch_page_tool():
+    fetch_page_tool_description = """
+    Tool to fetch the content of a web page from a given URL.
+
+    This returns `status` and `page_content` (`title`, `content`, and a `has_next` indicator).
+    If the status is not 200, there was an error fetching the page. Please don't give up; make sure to check other pages instead.
+
+    By default, only up to 2,000 tokens of content are retrieved. If there is more content available on the page, the `has_next` value will be True.
+    To read the continuation, increment the `page` parameter of the same URL and input it again. (Paging starts at 1, so the next page is 2.)
+    e.g. https://www.obamalibrary.gov/obamas/president-barack-obama?page=2
+    """
+    return Tool(
+        name='fetch_page',
+        func=fetch_page,
+        description=fetch_page_tool_description
+    )
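For reference, a minimal usage sketch of the paging behaviour described in the docstring above; the direct call, the printed fields, and the reuse of the docstring's example URL are illustrative assumptions, not part of the committed code:

# Hypothetical direct call, outside of any agent.
result = fetch_page("https://www.obamalibrary.gov/obamas/president-barack-obama")
if result["status"] == 200:
    print(result["page_content"]["title"])
    print(result["page_content"]["content"][:200])
    # If the page holds more than one chunk, request the continuation
    # by re-sending the same URL with an incremented `page` parameter.
    if result["page_content"]["has_next"]:
        more = fetch_page("https://www.obamalibrary.gov/obamas/president-barack-obama?page=2")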
tools/search_ddg.py
ADDED
@@ -0,0 +1,55 @@
+from itertools import islice
+from langchain.agents import Tool
+from duckduckgo_search import DDGS
+
+
+def search_ddg(query, max_result_num=5):
+    """
+    Tool for performing DuckDuckGo searches.
+    - Enter the keyword you want to search for.
+    - The title, snippet (description), and URL of each page in the search results will be returned.
+
+    Sample response of the duckduckgo_search python library
+    --------------------------------------------------------
+    [
+        {
+            'title': '日程・結果|Fifa 女子ワールドカップ オーストラリア&ニュージーランド 2023|なでしこジャパン|日本代表|Jfa|日本サッカー協会',
+            'href': 'https://www.jfa.jp/nadeshikojapan/womensworldcup2023/schedule_result/',
+            'body': '日程・結果|FIFA 女子ワールドカップ オーストラリア&ニュージーランド 2023|なでしこジャパン|日本代表|JFA|日本サッカー協会. FIFA 女子ワールドカップ. オーストラリア&ニュージーランド 2023.'
+        }, ...
+    ]
+
+    Returns
+    -------
+    List[Dict[str, str]]:
+        - title
+        - snippet
+        - url
+    """
+    # Query DuckDuckGo and keep only the first `max_result_num` results.
+    res = DDGS().text(query, region='wt-wt', safesearch='off', backend="lite")
+    return [
+        {
+            "title": r.get('title', ""),
+            "snippet": r.get('body', ""),
+            "url": r.get('href', "")
+        }
+        for r in islice(res, max_result_num)
+    ]
+
+
+def get_search_ddg_tool():
+    search_tool_description = """
+    Tool for performing DuckDuckGo searches.
+    Enter the keyword you want to search for.
+    The title, snippet (description), and URL of each page in the search results will be returned.
+    The information available through this tool is QUITE CONDENSED and sometimes outdated.
+
+    If you can't find the information you're looking for, make sure to use the `fetch_page` tool to read the content of each page.
+    Feel free to use the most appropriate language for the context (not necessarily the same as the user's language).
+    For example, for programming-related questions, it's best to search in English.
+    """
+    return Tool(
+        name='search_ddg',
+        func=search_ddg,
+        description=search_tool_description
+    )
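A minimal sketch of how these two tool factories might be registered with a LangChain agent. The model choice, the agent type, and the `initialize_agent` wiring shown here are assumptions for illustration and are not part of this commit:

from langchain.agents import AgentType, initialize_agent
from langchain.chat_models import ChatOpenAI

from tools.fetch_page import get_fetch_page_tool
from tools.search_ddg import get_search_ddg_tool

# Assumed model and agent type; the actual app may configure these differently.
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
tools = [get_search_ddg_tool(), get_fetch_page_tool()]
agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)
agent.run("Who won the FIFA Women's World Cup 2023?")

The agent is expected to call `search_ddg` first, then follow up with `fetch_page` (incrementing the `page` parameter when `has_next` is True) to read the pages behind the condensed search results.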