watanabe3tipapa committed
Commit • 6b7ef92
1 Parent(s): 9bd5920
Upload 4 files
tools/__pycache__/fetch_page.cpython-311.pyc
ADDED
Binary file (4.41 kB)
tools/__pycache__/search_ddg.cpython-311.pyc
ADDED
Binary file (3.14 kB)
tools/fetch_page.py
ADDED
@@ -0,0 +1,97 @@
+import requests
+import html2text
+from readability import Document
+from langchain.agents import Tool
+from urllib.parse import urlparse, parse_qs, urlunparse
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+def fetch_page(url, model_name='gpt-3.5-turbo', timeout_sec=10):
+    """Tool to fetch the content of a web page from a given URL.
+
+    - This returns `title`, `content`, and a `has_next` indicator. `content` is returned in markdown format.
+    - By default, only up to 2,000 tokens of content are retrieved.
+    - If there is more content available on the page, the `has_next` value will be True.
+    - To read the continuation, increment the `page` parameter of the same URL and input it again.
+
+    Returns
+    -------
+    Dict[str, Any]:
+        - status: str
+        - page_content
+            - title: str
+            - content: str
+            - has_next: bool
+    """
+    # Extract the 1-based `page` query parameter and strip the query string from the URL.
+    parsed_url = urlparse(url)
+    parsed_qs = parse_qs(parsed_url.query)
+    page = int(parsed_qs.get("page", [1])[0]) - 1
+    url = urlunparse(
+        (parsed_url.scheme, parsed_url.netloc, parsed_url.path, "", "", "")
+    )
+
+    try:
+        response = requests.get(url, timeout=timeout_sec)
+        response.encoding = 'utf-8'
+    except requests.exceptions.Timeout:
+        return {
+            "status": 500,
+            "page_content": {'error_message': 'Could not download page due to Timeout Error. Please try to fetch other pages.'}
+        }
+
+    if response.status_code != 200:
+        return {
+            "status": response.status_code,
+            "page_content": {'error_message': 'Could not download page. Please try to fetch other pages.'}
+        }
+
+    try:
+        # Extract the main article with readability and convert it to markdown.
+        doc = Document(response.text)
+        title = doc.title()
+        html_content = doc.summary()
+        content = html2text.html2text(html_content)
+    except Exception:
+        return {
+            "status": 500,
+            "page_content": {'error_message': 'Could not parse page. Please try to fetch other pages.'}
+        }
+
+    # Split the markdown into token-bounded chunks and return only the requested page.
+    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+        model_name=model_name,
+        chunk_size=1000,
+        chunk_overlap=0,
+    )
+    chunks = text_splitter.split_text(content)
+    if page >= len(chunks):
+        return {
+            "status": 500,
+            "page_content": {'error_message': 'page parameter looks invalid. Please try to fetch other pages.'}
+        }
+    else:
+        return {
+            "status": 200,
+            "page_content": {
+                "title": title,
+                "content": chunks[page],
+                "has_next": page < len(chunks) - 1
+            }
+        }
+
+
+def get_fetch_page_tool():
+    fetch_page_tool_description = """
+    Tool to fetch the content of a web page from a given URL.
+
+    This returns `status` and `page_content` (`title`, `content`, and a `has_next` indicator).
+    If the status is not 200, there was an error fetching the page. Please don't give up; make sure to check other pages instead.
+
+    By default, only up to 2,000 tokens of content are retrieved. If there is more content available on the page, the `has_next` value will be True.
+    To read the continuation, increment the `page` parameter of the same URL and input it again. (Paging starts at 1, so the next page is 2.)
+    e.g. https://www.obamalibrary.gov/obamas/president-barack-obama?page=2
+    """
+    return Tool(
+        name='fetch_page',
+        func=fetch_page,
+        description=fetch_page_tool_description
+    )
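For reference, a minimal usage sketch of the paging behaviour described in the docstring above; the direct call, the printed fields, and the reuse of the docstring's example URL are illustrative assumptions, not part of the committed code:

# Hypothetical direct call, outside of any agent.
result = fetch_page("https://www.obamalibrary.gov/obamas/president-barack-obama")
if result["status"] == 200:
    print(result["page_content"]["title"])
    print(result["page_content"]["content"][:200])
    # If the page holds more than one chunk, request the continuation
    # by re-sending the same URL with an incremented `page` parameter.
    if result["page_content"]["has_next"]:
        more = fetch_page("https://www.obamalibrary.gov/obamas/president-barack-obama?page=2")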
tools/search_ddg.py
ADDED
@@ -0,0 +1,55 @@
+from itertools import islice
+from langchain.agents import Tool
+from duckduckgo_search import DDGS
+
+
+def search_ddg(query, max_result_num=5):
+    """
+    Tool for performing DuckDuckGo searches.
+    - Enter the keyword you want to search for.
+    - The title, snippet (description), and URL of each page in the search results will be returned.
+
+    Sample response of the duckduckgo_search python library
+    --------------------------------------------------------
+    [
+        {
+            'title': '日程・結果|Fifa 女子ワールドカップ オーストラリア&ニュージーランド 2023|なでしこジャパン|日本代表|Jfa|日本サッカー協会',
+            'href': 'https://www.jfa.jp/nadeshikojapan/womensworldcup2023/schedule_result/',
+            'body': '日程・結果|FIFA 女子ワールドカップ オーストラリア&ニュージーランド 2023|なでしこジャパン|日本代表|JFA|日本サッカー協会. FIFA 女子ワールドカップ. オーストラリア&ニュージーランド 2023.'
+        }, ...
+    ]
+
+    Returns
+    -------
+    List[Dict[str, str]]:
+        - title
+        - snippet
+        - url
+    """
+    # Query DuckDuckGo and keep only the first `max_result_num` results.
+    res = DDGS().text(query, region='wt-wt', safesearch='off', backend="lite")
+    return [
+        {
+            "title": r.get('title', ""),
+            "snippet": r.get('body', ""),
+            "url": r.get('href', "")
+        }
+        for r in islice(res, max_result_num)
+    ]
+
+
+def get_search_ddg_tool():
+    search_tool_description = """
+    Tool for performing DuckDuckGo searches.
+    Enter the keyword you want to search for.
+    The title, snippet (description), and URL of each page in the search results will be returned.
+    The information available through this tool is QUITE CONDENSED and sometimes outdated.
+
+    If you can't find the information you're looking for, make sure to use the `fetch_page` tool to read the content of each page.
+    Feel free to use the most appropriate language for the context (not necessarily the same as the user's language).
+    For example, for programming-related questions, it's best to search in English.
+    """
+    return Tool(
+        name='search_ddg',
+        func=search_ddg,
+        description=search_tool_description
+    )
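A minimal sketch of how these two tool factories might be registered with a LangChain agent. The model choice, the agent type, and the `initialize_agent` wiring shown here are assumptions for illustration and are not part of this commit:

from langchain.agents import AgentType, initialize_agent
from langchain.chat_models import ChatOpenAI

from tools.fetch_page import get_fetch_page_tool
from tools.search_ddg import get_search_ddg_tool

# Assumed model and agent type; the actual app may configure these differently.
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
tools = [get_search_ddg_tool(), get_fetch_page_tool()]
agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)
agent.run("Who won the FIFA Women's World Cup 2023?")

The agent is expected to call `search_ddg` first, then follow up with `fetch_page` (incrementing the `page` parameter when `has_next` is True) to read the pages behind the condensed search results.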