Update app.py
app.py CHANGED
@@ -1,11 +1,14 @@
-from fastapi import FastAPI, HTTPException, Query
+from fastapi import FastAPI, HTTPException, Query
 from fastapi.responses import JSONResponse
 from webscout import WEBS, transcriber, LLM
-from typing import Optional, List, Dict, Union
+from typing import Optional, List, Dict, Union
 from fastapi.encoders import jsonable_encoder
 from bs4 import BeautifulSoup
 import requests
 import urllib.parse
+import asyncio
+import aiohttp
+from typing import List
 
 app = FastAPI()
 
@@ -152,6 +155,21 @@ def extract_text_from_webpage(html_content):
     visible_text = soup.get_text(strip=True)
     return visible_text
 
+async def fetch_and_extract(url, max_chars):
+    """Fetches a URL and extracts text asynchronously."""
+    async with aiohttp.ClientSession() as session:
+        try:
+            async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}) as response:
+                response.raise_for_status()
+                html_content = await response.text()
+                visible_text = extract_text_from_webpage(html_content)
+                if len(visible_text) > max_chars:
+                    visible_text = visible_text[:max_chars] + "..."
+                return {"link": url, "text": visible_text}
+        except (aiohttp.ClientError, requests.exceptions.RequestException) as e:
+            print(f"Error fetching or processing {url}: {e}")
+            return {"link": url, "text": None}
+
 @app.get("/api/web_extract")
 async def web_extract(
     url: str,
@@ -159,12 +177,8 @@ async def web_extract(
 ):
     """Extracts text from a given URL."""
     try:
-        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
-        response.raise_for_status()
-        visible_text = extract_text_from_webpage(response.text)
-        if len(visible_text) > max_chars:
-            visible_text = visible_text[:max_chars] + "..."
-        return {"url": url, "text": visible_text}
+        result = await fetch_and_extract(url, max_chars)
+        return {"url": url, "text": result["text"]}
     except requests.exceptions.RequestException as e:
         raise HTTPException(status_code=500, detail=f"Error fetching or processing URL: {e}")
 
@@ -188,23 +202,10 @@ async def web_search_and_extract(
     search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
                                timelimit=timelimit, backend=backend, max_results=max_results)
 
-    # Extract text from each result's link
-    extracted_results = []
-    for result in search_results:
-        if 'href' in result:
-            link = result['href']
-            try:
-                response = requests.get(link, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
-                response.raise_for_status()
-                visible_text = extract_text_from_webpage(response.text)
-                if len(visible_text) > max_chars:
-                    visible_text = visible_text[:max_chars] + "..."
-                extracted_results.append({"link": link, "text": visible_text})
-            except requests.exceptions.RequestException as e:
-                print(f"Error fetching or processing {link}: {e}")
-                extracted_results.append({"link": link, "text": None})
-        else:
-            extracted_results.append({"link": None, "text": None})
+    # Extract text from each result's link asynchronously
+    tasks = [fetch_and_extract(result['href'], max_chars) for result in search_results if 'href' in result]
+    extracted_results = await asyncio.gather(*tasks)
+
     if extract_only:
         return JSONResponse(content=jsonable_encoder({extracted_results}))
     else:
@@ -235,22 +236,13 @@ async def adv_web_search(
                                timelimit=timelimit, backend=backend,
                                max_results=max_results)
 
-    # 2. Extract text from top search result URLs
+    # 2. Extract text from top search result URLs asynchronously
     extracted_text = ""
-    for result in search_results:
-        if 'href' in result:
-            link = result['href']
-            try:
-                response = requests.get(link, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
-                response.raise_for_status()
-                visible_text = extract_text_from_webpage(response.text)
-                if len(visible_text) > max_chars:
-                    visible_text = visible_text[:max_chars] + "..."
-                extracted_text += f"## Content from: {link}\n\n{visible_text}\n\n"
-            except requests.exceptions.RequestException as e:
-                print(f"Error fetching or processing {link}: {e}")
-        else:
-            pass
+    tasks = [fetch_and_extract(result['href'], max_chars) for result in search_results if 'href' in result]
+    extracted_results = await asyncio.gather(*tasks)
+    for result in extracted_results:
+        if result['text']:
+            extracted_text += f"## Content from: {result['link']}\n\n{result['text']}\n\n"
 
     # 3. Construct the prompt for the LLM
     llm_prompt = f"Query by user: {q} , Answer the query asked by user in detail. Now, You are provided with Google Search Results, To increase your accuracy and providing real time data. SEarch Result: {extracted_text}"
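The endpoints now fan their fetches out with asyncio.gather over the new fetch_and_extract coroutine. A minimal sketch of exercising that helper outside FastAPI, assuming this file is importable as app and webscout, bs4, requests, and aiohttp are installed; the URLs are placeholders:

import asyncio
from app import fetch_and_extract  # helper added in this change; assumes the module is app.py

async def main():
    urls = ["https://example.com", "https://example.org"]  # placeholder URLs
    # Fan the fetches out concurrently, mirroring what the endpoints now do.
    results = await asyncio.gather(*(fetch_and_extract(u, 500) for u in urls))
    for item in results:
        print(item["link"], "ok" if item["text"] else "failed")

asyncio.run(main())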
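And a hedged example of calling the /api/web_extract route once the app is running, assuming a local uvicorn app:app instance on port 8000 and that max_chars is exposed as a query parameter, as its use in the handler suggests:

import requests

BASE = "http://127.0.0.1:8000"  # assumed local `uvicorn app:app` instance

# /api/web_extract is the route shown in the diff; max_chars is assumed to be a query parameter.
resp = requests.get(f"{BASE}/api/web_extract",
                    params={"url": "https://example.com", "max_chars": 500})
resp.raise_for_status()
print(resp.json())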