Niansuh committed on
Commit
ffc3034
·
verified ·
1 Parent(s): b024a65

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -44
app.py CHANGED
@@ -1,11 +1,14 @@
1
- from fastapi import FastAPI, HTTPException, Query # Make sure Query is imported
2
  from fastapi.responses import JSONResponse
3
  from webscout import WEBS, transcriber, LLM
4
- from typing import Optional, List, Dict, Union # Import List, Dict, Union
5
  from fastapi.encoders import jsonable_encoder
6
  from bs4 import BeautifulSoup
7
  import requests
8
  import urllib.parse
 
 
 
9
 
10
  app = FastAPI()
11
 
@@ -152,6 +155,21 @@ def extract_text_from_webpage(html_content):
152
  visible_text = soup.get_text(strip=True)
153
  return visible_text
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  @app.get("/api/web_extract")
156
  async def web_extract(
157
  url: str,
@@ -159,12 +177,8 @@ async def web_extract(
159
  ):
160
  """Extracts text from a given URL."""
161
  try:
162
- response = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
163
- response.raise_for_status()
164
- visible_text = extract_text_from_webpage(response.text)
165
- if len(visible_text) > max_chars:
166
- visible_text = visible_text[:max_chars] + "..."
167
- return {"url": url, "text": visible_text}
168
  except requests.exceptions.RequestException as e:
169
  raise HTTPException(status_code=500, detail=f"Error fetching or processing URL: {e}")
170
 
@@ -175,9 +189,9 @@ async def web_search_and_extract(
175
  timelimit: Optional[str] = None,
176
  safesearch: str = "moderate",
177
  region: str = "wt-wt",
178
- backend: str = "api",
179
  max_chars: int = 6000,
180
- extract_only: bool = False
181
  ):
182
  """
183
  Searches using WEBS, extracts text from the top results, and returns both.
@@ -188,25 +202,12 @@ async def web_search_and_extract(
188
  search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
189
  timelimit=timelimit, backend=backend, max_results=max_results)
190
 
191
- # Extract text from each result's link
192
- extracted_results = []
193
- for result in search_results:
194
- if 'href' in result:
195
- link = result['href']
196
- try:
197
- response = requests.get(link, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
198
- response.raise_for_status()
199
- visible_text = extract_text_from_webpage(response.text)
200
- if len(visible_text) > max_chars:
201
- visible_text = visible_text[:max_chars] + "..."
202
- extracted_results.append({"link": link, "text": visible_text})
203
- except requests.exceptions.RequestException as e:
204
- print(f"Error fetching or processing {link}: {e}")
205
- extracted_results.append({"link": link, "text": None})
206
- else:
207
- extracted_results.append({"link": None, "text": None})
208
  if extract_only:
209
- return JSONResponse(content=jsonable_encoder({extracted_results}))
210
  else:
211
  return JSONResponse(content=jsonable_encoder({"search_results": search_results, "extracted_results": extracted_results}))
212
  except Exception as e:
@@ -220,7 +221,7 @@ async def adv_web_search(
220
  timelimit: Optional[str] = None,
221
  safesearch: str = "moderate",
222
  region: str = "wt-wt",
223
- backend: str = "api",
224
  max_chars: int = 6000,
225
  system_prompt: str = "You are Most Advanced and Powerful Ai chatbot, User ask you questions and you have to answer that, You are also provided with Google Search Results, To increase your accuracy and providing real time data. Your task is to answer in best way to user."
226
  ):
@@ -235,22 +236,13 @@ async def adv_web_search(
235
  timelimit=timelimit, backend=backend,
236
  max_results=max_results)
237
 
238
- # 2. Extract text from top search result URLs
239
  extracted_text = ""
240
- for result in search_results:
241
- if 'href' in result:
242
- link = result['href']
243
- try:
244
- response = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
245
- response.raise_for_status()
246
- visible_text = extract_text_from_webpage(response.text)
247
- if len(visible_text) > max_chars:
248
- visible_text = visible_text[:max_chars] + "..."
249
- extracted_text += f"## Content from: {link}\n\n{visible_text}\n\n"
250
- except requests.exceptions.RequestException as e:
251
- print(f"Error fetching or processing {link}: {e}")
252
- else:
253
- pass
254
 
255
  # 3. Construct the prompt for the LLM
256
  llm_prompt = f"Query by user: {q} , Answer the query asked by user in detail. Now, You are provided with Google Search Results, To increase your accuracy and providing real time data. SEarch Result: {extracted_text}"
 
1
+ from fastapi import FastAPI, HTTPException, Query
2
  from fastapi.responses import JSONResponse
3
  from webscout import WEBS, transcriber, LLM
4
+ from typing import Optional, List, Dict, Union
5
  from fastapi.encoders import jsonable_encoder
6
  from bs4 import BeautifulSoup
7
  import requests
8
  import urllib.parse
9
+ import asyncio
10
+ import aiohttp
11
+ from typing import List
12
 
13
  app = FastAPI()
14
 
 
155
  visible_text = soup.get_text(strip=True)
156
  return visible_text
157
 
158
+ async def fetch_and_extract(url, max_chars):
159
+ """Fetches a URL and extracts text asynchronously."""
160
+ async with aiohttp.ClientSession() as session:
161
+ try:
162
+ async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}) as response:
163
+ response.raise_for_status()
164
+ html_content = await response.text()
165
+ visible_text = extract_text_from_webpage(html_content)
166
+ if len(visible_text) > max_chars:
167
+ visible_text = visible_text[:max_chars] + "..."
168
+ return {"link": url, "text": visible_text}
169
+ except (aiohttp.ClientError, requests.exceptions.RequestException) as e:
170
+ print(f"Error fetching or processing {url}: {e}")
171
+ return {"link": url, "text": None}
172
+
173
  @app.get("/api/web_extract")
174
  async def web_extract(
175
  url: str,
 
177
  ):
178
  """Extracts text from a given URL."""
179
  try:
180
+ result = await fetch_and_extract(url, max_chars)
181
+ return {"url": url, "text": result["text"]}
 
 
 
 
182
  except requests.exceptions.RequestException as e:
183
  raise HTTPException(status_code=500, detail=f"Error fetching or processing URL: {e}")
184
 
 
189
  timelimit: Optional[str] = None,
190
  safesearch: str = "moderate",
191
  region: str = "wt-wt",
192
+ backend: str = "html",
193
  max_chars: int = 6000,
194
+ extract_only: bool = True
195
  ):
196
  """
197
  Searches using WEBS, extracts text from the top results, and returns both.
 
202
  search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
203
  timelimit=timelimit, backend=backend, max_results=max_results)
204
 
205
+ # Extract text from each result's link asynchronously
206
+ tasks = [fetch_and_extract(result['href'], max_chars) for result in search_results if 'href' in result]
207
+ extracted_results = await asyncio.gather(*tasks)
208
+
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  if extract_only:
210
+ return JSONResponse(content=jsonable_encoder(extracted_results))
211
  else:
212
  return JSONResponse(content=jsonable_encoder({"search_results": search_results, "extracted_results": extracted_results}))
213
  except Exception as e:
 
221
  timelimit: Optional[str] = None,
222
  safesearch: str = "moderate",
223
  region: str = "wt-wt",
224
+ backend: str = "html",
225
  max_chars: int = 6000,
226
  system_prompt: str = "You are Most Advanced and Powerful Ai chatbot, User ask you questions and you have to answer that, You are also provided with Google Search Results, To increase your accuracy and providing real time data. Your task is to answer in best way to user."
227
  ):
 
236
  timelimit=timelimit, backend=backend,
237
  max_results=max_results)
238
 
239
+ # 2. Extract text from top search result URLs asynchronously
240
  extracted_text = ""
241
+ tasks = [fetch_and_extract(result['href'], max_chars) for result in search_results if 'href' in result]
242
+ extracted_results = await asyncio.gather(*tasks)
243
+ for result in extracted_results:
244
+ if result['text']:
245
+ extracted_text += f"## Content from: {result['link']}\n\n{result['text']}\n\n"
 
 
 
 
 
 
 
 
 
246
 
247
  # 3. Construct the prompt for the LLM
248
  llm_prompt = f"Query by user: {q} , Answer the query asked by user in detail. Now, You are provided with Google Search Results, To increase your accuracy and providing real time data. SEarch Result: {extracted_text}"