Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,11 +1,14 @@
|
|
1 |
-
from fastapi import FastAPI, HTTPException, Query
|
2 |
from fastapi.responses import JSONResponse
|
3 |
from webscout import WEBS, transcriber, LLM
|
4 |
-
from typing import Optional, List, Dict, Union
|
5 |
from fastapi.encoders import jsonable_encoder
|
6 |
from bs4 import BeautifulSoup
|
7 |
import requests
|
8 |
import urllib.parse
|
|
|
|
|
|
|
9 |
|
10 |
app = FastAPI()
|
11 |
|
@@ -152,6 +155,21 @@ def extract_text_from_webpage(html_content):
|
|
152 |
visible_text = soup.get_text(strip=True)
|
153 |
return visible_text
|
154 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
@app.get("/api/web_extract")
|
156 |
async def web_extract(
|
157 |
url: str,
|
@@ -159,12 +177,8 @@ async def web_extract(
|
|
159 |
):
|
160 |
"""Extracts text from a given URL."""
|
161 |
try:
|
162 |
-
|
163 |
-
|
164 |
-
visible_text = extract_text_from_webpage(response.text)
|
165 |
-
if len(visible_text) > max_chars:
|
166 |
-
visible_text = visible_text[:max_chars] + "..."
|
167 |
-
return {"url": url, "text": visible_text}
|
168 |
except requests.exceptions.RequestException as e:
|
169 |
raise HTTPException(status_code=500, detail=f"Error fetching or processing URL: {e}")
|
170 |
|
@@ -175,9 +189,9 @@ async def web_search_and_extract(
|
|
175 |
timelimit: Optional[str] = None,
|
176 |
safesearch: str = "moderate",
|
177 |
region: str = "wt-wt",
|
178 |
-
backend: str = "
|
179 |
max_chars: int = 6000,
|
180 |
-
extract_only: bool =
|
181 |
):
|
182 |
"""
|
183 |
Searches using WEBS, extracts text from the top results, and returns both.
|
@@ -188,25 +202,12 @@ async def web_search_and_extract(
|
|
188 |
search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
|
189 |
timelimit=timelimit, backend=backend, max_results=max_results)
|
190 |
|
191 |
-
# Extract text from each result's link
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
link = result['href']
|
196 |
-
try:
|
197 |
-
response = requests.get(link, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
|
198 |
-
response.raise_for_status()
|
199 |
-
visible_text = extract_text_from_webpage(response.text)
|
200 |
-
if len(visible_text) > max_chars:
|
201 |
-
visible_text = visible_text[:max_chars] + "..."
|
202 |
-
extracted_results.append({"link": link, "text": visible_text})
|
203 |
-
except requests.exceptions.RequestException as e:
|
204 |
-
print(f"Error fetching or processing {link}: {e}")
|
205 |
-
extracted_results.append({"link": link, "text": None})
|
206 |
-
else:
|
207 |
-
extracted_results.append({"link": None, "text": None})
|
208 |
if extract_only:
|
209 |
-
return JSONResponse(content=jsonable_encoder(
|
210 |
else:
|
211 |
return JSONResponse(content=jsonable_encoder({"search_results": search_results, "extracted_results": extracted_results}))
|
212 |
except Exception as e:
|
@@ -220,7 +221,7 @@ async def adv_web_search(
|
|
220 |
timelimit: Optional[str] = None,
|
221 |
safesearch: str = "moderate",
|
222 |
region: str = "wt-wt",
|
223 |
-
backend: str = "
|
224 |
max_chars: int = 6000,
|
225 |
system_prompt: str = "You are Most Advanced and Powerful Ai chatbot, User ask you questions and you have to answer that, You are also provided with Google Search Results, To increase your accuracy and providing real time data. Your task is to answer in best way to user."
|
226 |
):
|
@@ -235,22 +236,13 @@ async def adv_web_search(
|
|
235 |
timelimit=timelimit, backend=backend,
|
236 |
max_results=max_results)
|
237 |
|
238 |
-
# 2. Extract text from top search result URLs
|
239 |
extracted_text = ""
|
240 |
-
for result in search_results
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
response.raise_for_status()
|
246 |
-
visible_text = extract_text_from_webpage(response.text)
|
247 |
-
if len(visible_text) > max_chars:
|
248 |
-
visible_text = visible_text[:max_chars] + "..."
|
249 |
-
extracted_text += f"## Content from: {link}\n\n{visible_text}\n\n"
|
250 |
-
except requests.exceptions.RequestException as e:
|
251 |
-
print(f"Error fetching or processing {link}: {e}")
|
252 |
-
else:
|
253 |
-
pass
|
254 |
|
255 |
# 3. Construct the prompt for the LLM
|
256 |
llm_prompt = f"Query by user: {q} , Answer the query asked by user in detail. Now, You are provided with Google Search Results, To increase your accuracy and providing real time data. SEarch Result: {extracted_text}"
|
|
|
1 |
+
from fastapi import FastAPI, HTTPException, Query
|
2 |
from fastapi.responses import JSONResponse
|
3 |
from webscout import WEBS, transcriber, LLM
|
4 |
+
from typing import Optional, List, Dict, Union
|
5 |
from fastapi.encoders import jsonable_encoder
|
6 |
from bs4 import BeautifulSoup
|
7 |
import requests
|
8 |
import urllib.parse
|
9 |
+
import asyncio
|
10 |
+
import aiohttp
|
11 |
+
from typing import List
|
12 |
|
13 |
app = FastAPI()
|
14 |
|
|
|
155 |
visible_text = soup.get_text(strip=True)
|
156 |
return visible_text
|
157 |
|
158 |
+
async def fetch_and_extract(url: str, max_chars: int) -> Dict[str, Optional[str]]:
    """Fetch ``url`` asynchronously and return its visible text.

    Args:
        url: Address of the page to download.
        max_chars: Maximum number of characters of extracted text to keep;
            longer text is cut at this length and suffixed with "...".

    Returns:
        ``{"link": url, "text": <str>}`` on success, or
        ``{"link": url, "text": None}`` when the page cannot be fetched or
        decoded. Fetch failures are reported through the return value
        instead of being raised, so a single bad URL does not abort an
        ``asyncio.gather`` running over many URLs.
    """
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}) as response:
                response.raise_for_status()
                html_content = await response.text()
    except (aiohttp.ClientError, asyncio.TimeoutError, UnicodeDecodeError) as e:
        # aiohttp reports timeouts as asyncio.TimeoutError, which is not an
        # aiohttp.ClientError, and response.text() raises UnicodeDecodeError
        # on a body that does not match its declared charset. Both must be
        # handled here to honour the "text is None on failure" contract.
        print(f"Error fetching or processing {url}: {e}")
        return {"link": url, "text": None}

    # Parsing happens after the connection is released; errors from the
    # extractor are deliberately left to propagate to the caller.
    visible_text = extract_text_from_webpage(html_content)
    if len(visible_text) > max_chars:
        visible_text = visible_text[:max_chars] + "..."
    return {"link": url, "text": visible_text}
|
172 |
+
|
173 |
@app.get("/api/web_extract")
|
174 |
async def web_extract(
|
175 |
url: str,
|
|
|
177 |
):
|
178 |
"""Extracts text from a given URL."""
|
179 |
try:
|
180 |
+
result = await fetch_and_extract(url, max_chars)
|
181 |
+
return {"url": url, "text": result["text"]}
|
|
|
|
|
|
|
|
|
182 |
except requests.exceptions.RequestException as e:
|
183 |
raise HTTPException(status_code=500, detail=f"Error fetching or processing URL: {e}")
|
184 |
|
|
|
189 |
timelimit: Optional[str] = None,
|
190 |
safesearch: str = "moderate",
|
191 |
region: str = "wt-wt",
|
192 |
+
backend: str = "html",
|
193 |
max_chars: int = 6000,
|
194 |
+
extract_only: bool = True
|
195 |
):
|
196 |
"""
|
197 |
Searches using WEBS, extracts text from the top results, and returns both.
|
|
|
202 |
search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
|
203 |
timelimit=timelimit, backend=backend, max_results=max_results)
|
204 |
|
205 |
+
# Extract text from each result's link asynchronously
|
206 |
+
tasks = [fetch_and_extract(result['href'], max_chars) for result in search_results if 'href' in result]
|
207 |
+
extracted_results = await asyncio.gather(*tasks)
|
208 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
if extract_only:
|
210 |
+
return JSONResponse(content=jsonable_encoder(extracted_results))
|
211 |
else:
|
212 |
return JSONResponse(content=jsonable_encoder({"search_results": search_results, "extracted_results": extracted_results}))
|
213 |
except Exception as e:
|
|
|
221 |
timelimit: Optional[str] = None,
|
222 |
safesearch: str = "moderate",
|
223 |
region: str = "wt-wt",
|
224 |
+
backend: str = "html",
|
225 |
max_chars: int = 6000,
|
226 |
system_prompt: str = "You are Most Advanced and Powerful Ai chatbot, User ask you questions and you have to answer that, You are also provided with Google Search Results, To increase your accuracy and providing real time data. Your task is to answer in best way to user."
|
227 |
):
|
|
|
236 |
timelimit=timelimit, backend=backend,
|
237 |
max_results=max_results)
|
238 |
|
239 |
+
# 2. Extract text from top search result URLs asynchronously
|
240 |
extracted_text = ""
|
241 |
+
tasks = [fetch_and_extract(result['href'], max_chars) for result in search_results if 'href' in result]
|
242 |
+
extracted_results = await asyncio.gather(*tasks)
|
243 |
+
for result in extracted_results:
|
244 |
+
if result['text']:
|
245 |
+
extracted_text += f"## Content from: {result['link']}\n\n{result['text']}\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
246 |
|
247 |
# 3. Construct the prompt for the LLM
|
248 |
llm_prompt = f"Query by user: {q} , Answer the query asked by user in detail. Now, You are provided with Google Search Results, To increase your accuracy and providing real time data. SEarch Result: {extracted_text}"
|