Niansuh committed on
Commit
ffc3034
·
verified ·
1 Parent(s): b024a65

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -44
app.py CHANGED
@@ -1,11 +1,14 @@
1
- from fastapi import FastAPI, HTTPException, Query # Make sure Query is imported
2
  from fastapi.responses import JSONResponse
3
  from webscout import WEBS, transcriber, LLM
4
- from typing import Optional, List, Dict, Union # Import List, Dict, Union
5
  from fastapi.encoders import jsonable_encoder
6
  from bs4 import BeautifulSoup
7
  import requests
8
  import urllib.parse
 
 
 
9
 
10
  app = FastAPI()
11
 
@@ -152,6 +155,21 @@ def extract_text_from_webpage(html_content):
152
  visible_text = soup.get_text(strip=True)
153
  return visible_text
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  @app.get("/api/web_extract")
156
  async def web_extract(
157
  url: str,
@@ -159,12 +177,8 @@ async def web_extract(
159
  ):
160
  """Extracts text from a given URL."""
161
  try:
162
- response = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
163
- response.raise_for_status()
164
- visible_text = extract_text_from_webpage(response.text)
165
- if len(visible_text) > max_chars:
166
- visible_text = visible_text[:max_chars] + "..."
167
- return {"url": url, "text": visible_text}
168
  except requests.exceptions.RequestException as e:
169
  raise HTTPException(status_code=500, detail=f"Error fetching or processing URL: {e}")
170
 
@@ -175,9 +189,9 @@ async def web_search_and_extract(
175
  timelimit: Optional[str] = None,
176
  safesearch: str = "moderate",
177
  region: str = "wt-wt",
178
- backend: str = "api",
179
  max_chars: int = 6000,
180
- extract_only: bool = False
181
  ):
182
  """
183
  Searches using WEBS, extracts text from the top results, and returns both.
@@ -188,25 +202,12 @@ async def web_search_and_extract(
188
  search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
189
  timelimit=timelimit, backend=backend, max_results=max_results)
190
 
191
- # Extract text from each result's link
192
- extracted_results = []
193
- for result in search_results:
194
- if 'href' in result:
195
- link = result['href']
196
- try:
197
- response = requests.get(link, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
198
- response.raise_for_status()
199
- visible_text = extract_text_from_webpage(response.text)
200
- if len(visible_text) > max_chars:
201
- visible_text = visible_text[:max_chars] + "..."
202
- extracted_results.append({"link": link, "text": visible_text})
203
- except requests.exceptions.RequestException as e:
204
- print(f"Error fetching or processing {link}: {e}")
205
- extracted_results.append({"link": link, "text": None})
206
- else:
207
- extracted_results.append({"link": None, "text": None})
208
  if extract_only:
209
- return JSONResponse(content=jsonable_encoder({extracted_results}))
210
  else:
211
  return JSONResponse(content=jsonable_encoder({"search_results": search_results, "extracted_results": extracted_results}))
212
  except Exception as e:
@@ -220,7 +221,7 @@ async def adv_web_search(
220
  timelimit: Optional[str] = None,
221
  safesearch: str = "moderate",
222
  region: str = "wt-wt",
223
- backend: str = "api",
224
  max_chars: int = 6000,
225
  system_prompt: str = "You are Most Advanced and Powerful Ai chatbot, User ask you questions and you have to answer that, You are also provided with Google Search Results, To increase your accuracy and providing real time data. Your task is to answer in best way to user."
226
  ):
@@ -235,22 +236,13 @@ async def adv_web_search(
235
  timelimit=timelimit, backend=backend,
236
  max_results=max_results)
237
 
238
- # 2. Extract text from top search result URLs
239
  extracted_text = ""
240
- for result in search_results:
241
- if 'href' in result:
242
- link = result['href']
243
- try:
244
- response = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
245
- response.raise_for_status()
246
- visible_text = extract_text_from_webpage(response.text)
247
- if len(visible_text) > max_chars:
248
- visible_text = visible_text[:max_chars] + "..."
249
- extracted_text += f"## Content from: {link}\n\n{visible_text}\n\n"
250
- except requests.exceptions.RequestException as e:
251
- print(f"Error fetching or processing {link}: {e}")
252
- else:
253
- pass
254
 
255
  # 3. Construct the prompt for the LLM
256
  llm_prompt = f"Query by user: {q} , Answer the query asked by user in detail. Now, You are provided with Google Search Results, To increase your accuracy and providing real time data. SEarch Result: {extracted_text}"
 
1
+ from fastapi import FastAPI, HTTPException, Query
2
  from fastapi.responses import JSONResponse
3
  from webscout import WEBS, transcriber, LLM
4
+ from typing import Optional, List, Dict, Union
5
  from fastapi.encoders import jsonable_encoder
6
  from bs4 import BeautifulSoup
7
  import requests
8
  import urllib.parse
9
+ import asyncio
10
+ import aiohttp
11
+ from typing import List
12
 
13
  app = FastAPI()
14
 
 
155
  visible_text = soup.get_text(strip=True)
156
  return visible_text
157
 
158
+ async def fetch_and_extract(url, max_chars):
159
+ """Fetches a URL and extracts text asynchronously."""
160
+ async with aiohttp.ClientSession() as session:
161
+ try:
162
+ async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}) as response:
163
+ response.raise_for_status()
164
+ html_content = await response.text()
165
+ visible_text = extract_text_from_webpage(html_content)
166
+ if len(visible_text) > max_chars:
167
+ visible_text = visible_text[:max_chars] + "..."
168
+ return {"link": url, "text": visible_text}
169
+ except (aiohttp.ClientError, requests.exceptions.RequestException) as e:
170
+ print(f"Error fetching or processing {url}: {e}")
171
+ return {"link": url, "text": None}
172
+
173
  @app.get("/api/web_extract")
174
  async def web_extract(
175
  url: str,
 
177
  ):
178
  """Extracts text from a given URL."""
179
  try:
180
+ result = await fetch_and_extract(url, max_chars)
181
+ return {"url": url, "text": result["text"]}
 
 
 
 
182
  except requests.exceptions.RequestException as e:
183
  raise HTTPException(status_code=500, detail=f"Error fetching or processing URL: {e}")
184
 
 
189
  timelimit: Optional[str] = None,
190
  safesearch: str = "moderate",
191
  region: str = "wt-wt",
192
+ backend: str = "html",
193
  max_chars: int = 6000,
194
+ extract_only: bool = True
195
  ):
196
  """
197
  Searches using WEBS, extracts text from the top results, and returns both.
 
202
  search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
203
  timelimit=timelimit, backend=backend, max_results=max_results)
204
 
205
+ # Extract text from each result's link asynchronously
206
+ tasks = [fetch_and_extract(result['href'], max_chars) for result in search_results if 'href' in result]
207
+ extracted_results = await asyncio.gather(*tasks)
208
+
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  if extract_only:
210
+ return JSONResponse(content=jsonable_encoder(extracted_results))
211
  else:
212
  return JSONResponse(content=jsonable_encoder({"search_results": search_results, "extracted_results": extracted_results}))
213
  except Exception as e:
 
221
  timelimit: Optional[str] = None,
222
  safesearch: str = "moderate",
223
  region: str = "wt-wt",
224
+ backend: str = "html",
225
  max_chars: int = 6000,
226
  system_prompt: str = "You are Most Advanced and Powerful Ai chatbot, User ask you questions and you have to answer that, You are also provided with Google Search Results, To increase your accuracy and providing real time data. Your task is to answer in best way to user."
227
  ):
 
236
  timelimit=timelimit, backend=backend,
237
  max_results=max_results)
238
 
239
+ # 2. Extract text from top search result URLs asynchronously
240
  extracted_text = ""
241
+ tasks = [fetch_and_extract(result['href'], max_chars) for result in search_results if 'href' in result]
242
+ extracted_results = await asyncio.gather(*tasks)
243
+ for result in extracted_results:
244
+ if result['text']:
245
+ extracted_text += f"## Content from: {result['link']}\n\n{result['text']}\n\n"
 
 
 
 
 
 
 
 
 
246
 
247
  # 3. Construct the prompt for the LLM
248
  llm_prompt = f"Query by user: {q} , Answer the query asked by user in detail. Now, You are provided with Google Search Results, To increase your accuracy and providing real time data. SEarch Result: {extracted_text}"