Abhaykoul committed on
Commit
f85d786
·
verified ·
1 Parent(s): e533971

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +997 -0
app.py CHANGED
@@ -1,3 +1,1000 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import g4f
2
  from webscout import DDGS
3
  from time import time as t
 
1
+ import asyncio
2
+ import logging
3
+ import warnings
4
+ from typing import Dict, Generator, Optional
5
+ import nest_asyncio
6
+ import json
7
+ import logging
8
+ import sys
9
+ from collections import deque
10
+ from datetime import datetime, timezone
11
+ from decimal import Decimal
12
+ from itertools import cycle
13
+ from typing import AsyncGenerator, Deque, Dict, Optional, Set, Tuple
14
+ from curl_cffi import requests
15
+ from docstring_inheritance import GoogleDocstringInheritanceMeta
16
+ from lxml import html
17
+ import json
18
+ import re
19
+ from html import unescape
20
+ from typing import Optional
21
+ from urllib.parse import unquote
22
+ from dataclasses import dataclass
23
+ from typing import Dict, Optional
24
+ from random import randint
25
class DuckDuckGoSearchException(Exception):
    """Root exception type raised by the webscout search helpers.

    It carries no extra state; the message passed to the constructor is the
    only payload.
    """
27
+
28
+
29
+
30
@dataclass
class MapsResult:
    """One place returned by the maps search.

    Every field defaults to ``None`` so that an instance can be created empty
    and populated attribute by attribute. The field order is part of the
    positional constructor signature and must not be rearranged.
    """

    # Name and postal address of the place.
    title: Optional[str] = None
    address: Optional[str] = None
    country_code: Optional[str] = None
    # Geographic coordinates of the place.
    latitude: Optional[str] = None
    longitude: Optional[str] = None
    # Descriptive details and contact data.
    url: Optional[str] = None
    desc: Optional[str] = None
    phone: Optional[str] = None
    image: Optional[str] = None
    source: Optional[str] = None
    hours: Optional[Dict[str, str]] = None
    category: Optional[str] = None
    # Links to social-network profiles.
    facebook: Optional[str] = None
    instagram: Optional[str] = None
    twitter: Optional[str] = None
49
+
50
+
51
+
52
# Three digits, a dash, two digits and ".js" (for example "506-00.js").
REGEX_500_IN_URL = re.compile(r"(?:\d{3}-\d{2}\.js)")
# Shortest possible run of characters enclosed in "<" and ">".
REGEX_STRIP_TAGS = re.compile("<.*?>")
# Bytes pattern: captures what follows "vqd=" (optionally quoted) up to the
# next "&" or quote character.
REGEX_VQD = re.compile(rb"""vqd=['"]?([^&"']+)""")
55
+
56
+
57
def _extract_vqd(html_bytes: bytes, keywords: str) -> Optional[str]:
    """Extract the vqd value from a response body using REGEX_VQD.

    Args:
        html_bytes: raw response body to search.
        keywords: the search query; only used in the error message.

    Returns:
        The decoded vqd value.

    Raises:
        DuckDuckGoSearchException: if no vqd value is found, or the body
            cannot be searched or decoded. In the latter case the underlying
            error is attached as the cause.
    """
    cause = None
    try:
        match = REGEX_VQD.search(html_bytes)
        if match:
            return match.group(1).decode()
    except (TypeError, UnicodeDecodeError) as ex:
        # Non-bytes input or an undecodable value: keep the same exception
        # type and message as the "not found" case, but remember the reason.
        cause = ex
    raise DuckDuckGoSearchException(f"_extract_vqd() {keywords=} Could not extract vqd.") from cause
66
+
67
+
68
+ def _text_extract_json(html_bytes: bytes, keywords: str) -> Optional[str]:
69
+ """text(backend="api") -> extract json from html."""
70
+ try:
71
+ start = html_bytes.index(b"DDG.pageLayout.load('d',") + 24
72
+ end = html_bytes.index(b");DDG.duckbar.load(", start)
73
+ data = html_bytes[start:end]
74
+ return json.loads(data)
75
+ except Exception as ex:
76
+ raise DuckDuckGoSearchException(f"_text_extract_json() {keywords=} {type(ex).__name__}: {ex}") from ex
77
+
78
+
79
def _is_500_in_url(url: str) -> bool:
    """Tell whether *url* contains a fragment such as '506-00.js'."""
    found = REGEX_500_IN_URL.search(url)
    return found is not None
82
+
83
+
84
def _normalize(raw_html: str) -> str:
    """Remove HTML tags from *raw_html* and unescape HTML entities.

    A falsy input (empty string or None) yields an empty string.
    """
    if not raw_html:
        return ""
    without_tags = REGEX_STRIP_TAGS.sub("", raw_html)
    return unescape(without_tags)
87
+
88
+
89
+ def _normalize_url(url: str) -> str:
90
+ """Unquote URL and replace spaces with '+'."""
91
+ return unquote(url.replace(" ", "+")) if url else ""
92
+
93
logger = logging.getLogger("duckduckgo_search.AsyncDDGS")

# On Windows the selector event loop policy is installed explicitly: the
# curl-cffi FAQ (https://curl-cffi.readthedocs.io/en/latest/faq/) reports a
# NotImplementedError otherwise.
if sys.platform.lower()[:3] == "win":
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
97
+
98
+
99
+ class AsyncDDGS(metaclass=GoogleDocstringInheritanceMeta):
100
+ """webscout_search async class to get search results from duckduckgo.com."""
101
+
102
def __init__(self, headers=None, proxies=None, timeout=10) -> None:
    """Initialize the AsyncDDGS object.

    Args:
        headers (dict, optional): Extra headers for the HTTP client. They are
            merged over a default header set that contains a randomly
            generated numeric User-Agent. Defaults to None.
        proxies (Union[dict, str], optional): Proxies for the HTTP client. A
            non-empty dict is used as given; any other value is applied to
            both the "http" and "https" schemes. Defaults to None.
        timeout (int, optional): Timeout value for the HTTP client. Defaults to 10.
    """
    # Start from a random numeric User-Agent and let caller-supplied headers
    # extend or override it, so the ``headers`` argument is actually honoured.
    session_headers = {"User-Agent": f"{randint(0, 1000000)}"}
    if headers:
        session_headers.update(headers)
    self.proxies = proxies if proxies and isinstance(proxies, dict) else {"http": proxies, "https": proxies}
    self._asession = requests.AsyncSession(
        headers=session_headers, proxies=self.proxies, timeout=timeout, impersonate="chrome"
    )
    self._asession.headers["Referer"] = "https://duckduckgo.com/"
117
+
118
async def __aenter__(self) -> "AsyncDDGS":
    """Enter the ``async with`` block; the object itself is the managed value."""
    return self
121
+
122
async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
    """Close the HTTP session when leaving the ``async with`` block.

    Always returns None, so an exception raised inside the block is never
    suppressed.
    """
    import inspect

    closing = self._asession.close()
    # Depending on the client version close() may be a coroutine function.
    # Returning its (truthy) coroutine object from __aexit__ would both leave
    # it un-awaited and silently swallow any exception from the block.
    if inspect.isawaitable(closing):
        await closing
125
+
126
async def _aget_url(self, method: str, url: str, **kwargs) -> Optional[bytes]:
    """Send one HTTP request through the session and return the response body.

    Args:
        method: HTTP method name handed to the session.
        url: target URL.
        **kwargs: forwarded to the session's ``request`` call. A ``headers``
            entry is merged with the per-request User-Agent.

    Returns:
        The response body when the status code is 200, otherwise None.

    Raises:
        DuckDuckGoSearchException: on any failure while performing the
            request, including the "Ratelimit" condition detected below.
    """
    try:
        # Every request gets a fresh random numeric User-Agent. Caller headers
        # are merged in rather than passed twice, which would raise TypeError.
        headers = dict(kwargs.pop("headers", None) or {})
        headers["User-Agent"] = f"{randint(0, 1000000)}"
        resp = await self._asession.request(method, url, stream=True, **kwargs, headers=headers)
        resp.raise_for_status()
        resp_content = await resp.acontent()
        logger.debug(f"_aget_url() {url} {resp.status_code} {resp.http_version} {resp.elapsed} {len(resp_content)}")
        # A final URL containing something like '506-00.js', or status 202,
        # is treated as rate limiting.
        if _is_500_in_url(str(resp.url)) or resp.status_code == 202:
            raise DuckDuckGoSearchException("Ratelimit")
        if resp.status_code == 200:
            return resp_content
    except Exception as ex:
        raise DuckDuckGoSearchException(f"_aget_url() {url} {type(ex).__name__}: {ex}") from ex
    return None
140
+
141
async def _aget_vqd(self, keywords: str) -> Optional[str]:
    """Fetch the vqd value that belongs to a search query.

    Posts the query to https://duckduckgo.com and extracts the value from the
    response body. Returns None when the body is empty or missing.
    """
    page = await self._aget_url("POST", "https://duckduckgo.com", data={"q": keywords})
    if not page:
        return None
    return _extract_vqd(page, keywords)
146
+
147
async def text(
    self,
    keywords: str,
    region: str = "wt-wt",
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: str = "api",
    max_results: Optional[int] = None,
) -> AsyncGenerator[Dict[str, Optional[str]], None]:
    """DuckDuckGo text search generator. Query params: https://duckduckgo.com/params.

    Args:
        keywords: keywords for query.
        region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
        safesearch: on, moderate, off. Defaults to "moderate".
        timelimit: d, w, m, y. Defaults to None.
        backend: api, html, lite. Defaults to api.
            api - collect data from https://duckduckgo.com,
            html - collect data from https://html.duckduckgo.com,
            lite - collect data from https://lite.duckduckgo.com.
        max_results: max number of results. If None, returns results only from the first response. Defaults to None.

    Yields:
        dict with search results.

    Raises:
        ValueError: if ``backend`` is not one of "api", "html" or "lite".
    """
    if backend == "api":
        results = self._text_api(keywords, region, safesearch, timelimit, max_results)
    elif backend == "html":
        results = self._text_html(keywords, region, safesearch, timelimit, max_results)
    elif backend == "lite":
        # The lite backend takes no safesearch argument.
        results = self._text_lite(keywords, region, timelimit, max_results)
    else:
        # Without this branch ``results`` would be unbound and the loop below
        # would fail with an unhelpful UnboundLocalError.
        raise ValueError(f"text() unknown backend {backend!r}; expected 'api', 'html' or 'lite'")

    async for result in results:
        yield result
182
+
183
async def _text_api(
    self,
    keywords: str,
    region: str = "wt-wt",
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    max_results: Optional[int] = None,
) -> AsyncGenerator[Dict[str, Optional[str]], None]:
    """Text search through https://links.duckduckgo.com/d.js. Query params: https://duckduckgo.com/params.

    Args:
        keywords: keywords for query.
        region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
        safesearch: on, moderate, off. Defaults to "moderate".
        timelimit: d, w, m, y. Defaults to None.
        max_results: max number of results. If None, returns results only from the first response. Defaults to None.

    Yields:
        dict with the keys "title", "href" and "body".

    """
    assert keywords, "keywords is mandatory"

    vqd = await self._aget_vqd(keywords)

    payload = {
        "q": keywords,
        "kl": region,
        "l": region,
        "bing_market": region,
        "s": "0",
        "df": timelimit,
        "vqd": vqd,
        # "o": "json",
        "sp": "0",
    }
    # Each safesearch level maps to one extra query parameter; an unknown
    # level adds nothing.
    safesearch_param = {
        "moderate": ("ex", "-1"),
        "off": ("ex", "-2"),
        "on": ("p", "1"),  # strict
    }.get(safesearch.lower())
    if safesearch_param is not None:
        payload[safesearch_param[0]] = safesearch_param[1]

    seen = set()
    google_href = f"http://www.google.com/search?q={keywords}"
    # At most 11 pages are requested.
    for _ in range(11):
        resp_content = await self._aget_url("GET", "https://links.duckduckgo.com/d.js", params=payload)
        if resp_content is None:
            return

        page_data = _text_extract_json(resp_content, keywords)
        if page_data is None:
            return

        result_exists = False
        next_page_url = None
        for row in page_data:
            href = row.get("u", None)
            if not href or href in seen or href == google_href:
                # Rows that are not fresh results may carry the next-page link.
                next_page_url = row.get("n", None)
                continue
            seen.add(href)
            body = _normalize(row["a"])
            if not body:
                continue
            result_exists = True
            yield {
                "title": _normalize(row["t"]),
                "href": _normalize_url(href),
                "body": body,
            }
            if max_results and len(seen) >= max_results:
                return
        if max_results is None or result_exists is False or next_page_url is None:
            return
        # Continue from the offset announced in the next-page link.
        payload["s"] = next_page_url.split("s=")[1].split("&")[0]
257
+
258
async def _text_html(
    self,
    keywords: str,
    region: str = "wt-wt",
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    max_results: Optional[int] = None,
) -> AsyncGenerator[Dict[str, Optional[str]], None]:
    """Text search through https://html.duckduckgo.com/html. Query params: https://duckduckgo.com/params.

    Args:
        keywords: keywords for query.
        region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
        safesearch: on, moderate, off. Defaults to "moderate".
        timelimit: d, w, m, y. Defaults to None.
        max_results: max number of results. If None, returns results only from the first response. Defaults to None.

    Yields:
        dict with the keys "title", "href" and "body".

    """
    assert keywords, "keywords is mandatory"

    self._asession.headers["Referer"] = "https://html.duckduckgo.com/"
    safesearch_base = {"on": 1, "moderate": -1, "off": -2}
    payload = {
        "q": keywords,
        "s": "0",
        "kl": region,
        "p": safesearch_base[safesearch.lower()],
        "df": timelimit,
    }
    seen: Set[str] = set()
    google_href = f"http://www.google.com/search?q={keywords}"
    # At most 11 pages are requested.
    for _ in range(11):
        resp_content = await self._aget_url("POST", "https://html.duckduckgo.com/html", data=payload)
        if resp_content is None:
            return

        tree = html.fromstring(resp_content)
        if tree.xpath('//div[@class="no-results"]/text()'):
            return

        result_exists = False
        for block in tree.xpath('//div[contains(@class, "results_links")]'):
            links = block.xpath('.//a[contains(@class, "result__a")]/@href')
            href = links[0] if links else None
            # Skip blocks without a link, duplicates, the Google fallback
            # link and ad redirects.
            if (
                not href
                or href in seen
                or href == google_href
                or href.startswith("https://duckduckgo.com/y.js?ad_domain")
            ):
                continue
            seen.add(href)
            title = block.xpath('.//a[contains(@class, "result__a")]/text()')
            body = block.xpath('.//a[contains(@class, "result__snippet")]//text()')
            result_exists = True
            yield {
                "title": _normalize(title[0]) if title else None,
                "href": _normalize_url(href),
                "body": _normalize("".join(body)) if body else None,
            }
            if max_results and len(seen) >= max_results:
                return
        if max_results is None or result_exists is False:
            return
        nav_links = tree.xpath('.//div[@class="nav-link"]')
        next_page = nav_links[-1] if nav_links else None
        if next_page is None:
            return

        # The hidden inputs of the last nav-link block form the next payload.
        names = next_page.xpath('.//input[@type="hidden"]/@name')
        values = next_page.xpath('.//input[@type="hidden"]/@value')
        payload = dict(zip(names, values))
331
+
332
async def _text_lite(
    self,
    keywords: str,
    region: str = "wt-wt",
    timelimit: Optional[str] = None,
    max_results: Optional[int] = None,
) -> AsyncGenerator[Dict[str, Optional[str]], None]:
    """Text search through https://lite.duckduckgo.com/lite/. Query params: https://duckduckgo.com/params.

    Args:
        keywords: keywords for query.
        region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
        timelimit: d, w, m, y. Defaults to None.
        max_results: max number of results. If None, returns results only from the first response. Defaults to None.

    Yields:
        dict with the keys "title", "href" and "body".

    """
    assert keywords, "keywords is mandatory"

    self._asession.headers["Referer"] = "https://lite.duckduckgo.com/"
    payload = {
        "q": keywords,
        "s": "0",
        "o": "json",
        "api": "d.js",
        "kl": region,
        "df": timelimit,
    }
    seen: Set[str] = set()
    google_href = f"http://www.google.com/search?q={keywords}"
    # At most 11 pages are requested.
    for _ in range(11):
        resp_content = await self._aget_url("POST", "https://lite.duckduckgo.com/lite/", data=payload)
        if resp_content is None:
            return

        if b"No more results." in resp_content:
            return

        tree = html.fromstring(resp_content)

        result_exists = False
        # The rows of the last table are processed in groups of four:
        # position 1 holds the link, position 2 the snippet, and the result
        # is emitted at position 3.
        rows = zip(cycle(range(1, 5)), tree.xpath("//table[last()]//tr"))
        for position, tr in rows:
            if position == 1:
                links = tr.xpath(".//a//@href")
                href = links[0] if links else None
                unwanted = (
                    href is None
                    or href in seen
                    or href == google_href
                    or href.startswith("https://duckduckgo.com/y.js?ad_domain")
                )
                if unwanted:
                    # Drop the remaining three rows of this group.
                    for _skipped in range(3):
                        next(rows, None)
                else:
                    seen.add(href)
                    title = tr.xpath(".//a//text()")[0]
            elif position == 2:
                snippet = tr.xpath(".//td[@class='result-snippet']//text()")
                body = "".join(snippet).strip()
            elif position == 3:
                result_exists = True
                yield {
                    "title": _normalize(title),
                    "href": _normalize_url(href),
                    "body": _normalize(body),
                }
                if max_results and len(seen) >= max_results:
                    return
        if max_results is None or result_exists is False:
            return
        next_page_s = tree.xpath("//form[./input[contains(@value, 'ext')]]/input[@name='s']/@value")
        if not next_page_s:
            return
        payload["s"] = next_page_s[0]
        payload["vqd"] = _extract_vqd(resp_content, keywords)
408
+
409
async def images(
    self,
    keywords: str,
    region: str = "wt-wt",
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    size: Optional[str] = None,
    color: Optional[str] = None,
    type_image: Optional[str] = None,
    layout: Optional[str] = None,
    license_image: Optional[str] = None,
    max_results: Optional[int] = None,
) -> AsyncGenerator[Dict[str, Optional[str]], None]:
    """Image search through https://duckduckgo.com/i.js. Query params: https://duckduckgo.com/params.

    Args:
        keywords: keywords for query.
        region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
        safesearch: on, moderate, off. Defaults to "moderate".
        timelimit: Day, Week, Month, Year. Defaults to None.
        size: Small, Medium, Large, Wallpaper. Defaults to None.
        color: color, Monochrome, Red, Orange, Yellow, Green, Blue,
            Purple, Pink, Brown, Black, Gray, Teal, White. Defaults to None.
        type_image: photo, clipart, gif, transparent, line.
            Defaults to None.
        layout: Square, Tall, Wide. Defaults to None.
        license_image: any (All Creative Commons), Public (PublicDomain),
            Share (Free to Share and Use), ShareCommercially (Free to Share and Use Commercially),
            Modify (Free to Modify, Share, and Use), ModifyCommercially (Free to Modify, Share, and
            Use Commercially). Defaults to None.
        max_results: max number of results. If None, returns results only from the first response. Defaults to None.

    Yields:
        dict with the keys "title", "image", "thumbnail", "url", "height",
        "width" and "source".

    """
    assert keywords, "keywords is mandatory"

    vqd = await self._aget_vqd(keywords)

    safesearch_base = {"on": 1, "moderate": 1, "off": -1}
    # The "f" parameter is a comma-separated list with one slot per filter;
    # an unset filter leaves its slot empty.
    filters = ",".join(
        (
            f"time:{timelimit}" if timelimit else "",
            f"size:{size}" if size else "",
            f"color:{color}" if color else "",
            f"type:{type_image}" if type_image else "",
            f"layout:{layout}" if layout else "",
            f"license:{license_image}" if license_image else "",
        )
    )
    payload = {
        "l": region,
        "o": "json",
        "q": keywords,
        "vqd": vqd,
        "f": filters,
        "p": safesearch_base[safesearch.lower()],
    }

    seen = set()
    # At most 10 pages are requested.
    for _ in range(10):
        resp_content = await self._aget_url("GET", "https://duckduckgo.com/i.js", params=payload)
        if resp_content is None:
            return
        try:
            resp_json = json.loads(resp_content)
        except Exception:
            return
        page_data = resp_json.get("results", None)
        if page_data is None:
            return

        result_exists = False
        for row in page_data:
            image_url = row.get("image", None)
            if not image_url or image_url in seen:
                continue
            seen.add(image_url)
            result_exists = True
            yield {
                "title": row["title"],
                "image": _normalize_url(image_url),
                "thumbnail": _normalize_url(row["thumbnail"]),
                "url": _normalize_url(row["url"]),
                "height": row["height"],
                "width": row["width"],
                "source": row["source"],
            }
            if max_results and len(seen) >= max_results:
                return
        if max_results is None or result_exists is False:
            return
        next_url = resp_json.get("next", None)
        if next_url is None:
            return
        # Continue from the offset announced in the "next" link.
        payload["s"] = next_url.split("s=")[-1].split("&")[0]
501
+
502
async def videos(
    self,
    keywords: str,
    region: str = "wt-wt",
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    resolution: Optional[str] = None,
    duration: Optional[str] = None,
    license_videos: Optional[str] = None,
    max_results: Optional[int] = None,
) -> AsyncGenerator[Dict[str, Optional[str]], None]:
    """Video search through https://duckduckgo.com/v.js. Query params: https://duckduckgo.com/params.

    Args:
        keywords: keywords for query.
        region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
        safesearch: on, moderate, off. Defaults to "moderate".
        timelimit: d, w, m. Defaults to None.
        resolution: high, standart. Defaults to None.
        duration: short, medium, long. Defaults to None.
        license_videos: creativeCommon, youtube. Defaults to None.
        max_results: max number of results. If None, returns results only from the first response. Defaults to None.

    Yields:
        the result rows of the response, unmodified.

    """
    assert keywords, "keywords is mandatory"

    vqd = await self._aget_vqd(keywords)

    safesearch_base = {"on": 1, "moderate": -1, "off": -2}
    # The "f" parameter is a comma-separated list with one slot per filter;
    # an unset filter leaves its slot empty.
    filters = ",".join(
        (
            f"publishedAfter:{timelimit}" if timelimit else "",
            f"videoDefinition:{resolution}" if resolution else "",
            f"videoDuration:{duration}" if duration else "",
            f"videoLicense:{license_videos}" if license_videos else "",
        )
    )
    payload = {
        "l": region,
        "o": "json",
        "s": 0,
        "q": keywords,
        "vqd": vqd,
        "f": filters,
        "p": safesearch_base[safesearch.lower()],
    }

    seen = set()
    # At most 10 pages are requested.
    for _ in range(10):
        resp_content = await self._aget_url("GET", "https://duckduckgo.com/v.js", params=payload)
        if resp_content is None:
            return
        try:
            resp_json = json.loads(resp_content)
        except Exception:
            return
        page_data = resp_json.get("results", None)
        if page_data is None:
            return

        result_exists = False
        for row in page_data:
            # The "content" value identifies a video for de-duplication.
            if row["content"] in seen:
                continue
            seen.add(row["content"])
            result_exists = True
            yield row
            if max_results and len(seen) >= max_results:
                return
        if max_results is None or result_exists is False:
            return
        next_url = resp_json.get("next", None)
        if next_url is None:
            return
        # Continue from the offset announced in the "next" link.
        payload["s"] = next_url.split("s=")[-1].split("&")[0]
575
+
576
async def news(
    self,
    keywords: str,
    region: str = "wt-wt",
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    max_results: Optional[int] = None,
) -> AsyncGenerator[Dict[str, Optional[str]], None]:
    """News search through https://duckduckgo.com/news.js. Query params: https://duckduckgo.com/params.

    Args:
        keywords: keywords for query.
        region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
        safesearch: on, moderate, off. Defaults to "moderate".
        timelimit: d, w, m. Defaults to None.
        max_results: max number of results. If None, returns results only from the first response. Defaults to None.

    Yields:
        dict with the keys "date", "title", "body", "url", "image" and "source".

    """
    assert keywords, "keywords is mandatory"

    vqd = await self._aget_vqd(keywords)

    safesearch_base = {"on": 1, "moderate": -1, "off": -2}
    payload = {
        "l": region,
        "o": "json",
        "noamp": "1",
        "q": keywords,
        "vqd": vqd,
        "p": safesearch_base[safesearch.lower()],
        "df": timelimit,
        "s": 0,
    }

    seen = set()
    # At most 10 pages are requested.
    for _ in range(10):
        resp_content = await self._aget_url("GET", "https://duckduckgo.com/news.js", params=payload)
        if resp_content is None:
            return
        try:
            resp_json = json.loads(resp_content)
        except Exception:
            return
        page_data = resp_json.get("results", None)
        if page_data is None:
            return

        result_exists = False
        for row in page_data:
            if row["url"] in seen:
                continue
            seen.add(row["url"])
            image_url = row.get("image", None)
            result_exists = True
            # The "date" value is passed to datetime.fromtimestamp and
            # rendered as an ISO 8601 string in UTC.
            published = datetime.fromtimestamp(row["date"], timezone.utc)
            yield {
                "date": published.isoformat(),
                "title": row["title"],
                "body": _normalize(row["excerpt"]),
                "url": _normalize_url(row["url"]),
                "image": _normalize_url(image_url) if image_url else None,
                "source": row["source"],
            }
            if max_results and len(seen) >= max_results:
                return
        if max_results is None or result_exists is False:
            return
        next_url = resp_json.get("next", None)
        if next_url is None:
            return
        # Continue from the offset announced in the "next" link.
        payload["s"] = next_url.split("s=")[-1].split("&")[0]
648
+
649
async def answers(self, keywords: str) -> AsyncGenerator[Dict[str, Optional[str]], None]:
    """webscout instant answers. Query params: https://duckduckgo.com/params.

    Two requests are sent to https://api.duckduckgo.com/: one for
    "what is <keywords>", whose abstract (if any) is yielded first, and one
    for the plain keywords, whose related topics are yielded afterwards. A
    request that produces no usable body is skipped.

    Args:
        keywords: keywords for query.

    Yields:
        dict with the keys "icon", "text", "topic" and "url".

    """
    assert keywords, "keywords is mandatory"

    # Abstract for "what is <keywords>".
    payload = {
        "q": f"what is {keywords}",
        "format": "json",
    }
    resp_content = await self._aget_url("GET", "https://api.duckduckgo.com/", params=payload)
    page_data = None
    # A missing body is skipped instead of yielding None into the stream of
    # dicts and then relying on json.loads(None) failing.
    if resp_content is not None:
        try:
            page_data = json.loads(resp_content)
        except (TypeError, ValueError):
            page_data = None

    if page_data:
        answer = page_data.get("AbstractText", None)
        url = page_data.get("AbstractURL", None)
        if answer:
            yield {
                "icon": None,
                "text": answer,
                "topic": None,
                "url": url,
            }

    # Related topics for the plain keywords.
    payload = {
        "q": f"{keywords}",
        "format": "json",
    }
    resp_content = await self._aget_url("GET", "https://api.duckduckgo.com/", params=payload)
    page_data = None
    if resp_content is not None:
        try:
            page_data = json.loads(resp_content).get("RelatedTopics", None)
        except (AttributeError, TypeError, ValueError):
            page_data = None

    if page_data:
        for row in page_data:
            topic = row.get("Name", None)
            if not topic:
                # A row without "Name" is a single related topic.
                icon = row["Icon"].get("URL", None)
                yield {
                    "icon": f"https://duckduckgo.com{icon}" if icon else None,
                    "text": row["Text"],
                    "topic": None,
                    "url": row["FirstURL"],
                }
            else:
                # A named row groups several topics under "Topics".
                for subrow in row["Topics"]:
                    icon = subrow["Icon"].get("URL", None)
                    yield {
                        "icon": f"https://duckduckgo.com{icon}" if icon else None,
                        "text": subrow["Text"],
                        "topic": topic,
                        "url": subrow["FirstURL"],
                    }
718
+
719
async def suggestions(self, keywords: str, region: str = "wt-wt") -> AsyncGenerator[Dict[str, Optional[str]], None]:
    """webscout suggestions. Query params: https://duckduckgo.com/params.

    Args:
        keywords: keywords for query.
        region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".

    Yields:
        the items of the JSON document returned by https://duckduckgo.com/ac,
        unmodified. Nothing is yielded when the response has no body or
        cannot be decoded.
    """
    assert keywords, "keywords is mandatory"

    payload = {
        "q": keywords,
        "kl": region,
    }
    resp_content = await self._aget_url("GET", "https://duckduckgo.com/ac", params=payload)
    if resp_content is None:
        return
    try:
        # Only decoding is guarded; the yield stays outside the try block so
        # that errors thrown into the generator by its consumer are not
        # swallowed.
        rows = iter(json.loads(resp_content))
    except (TypeError, ValueError):
        return
    for r in rows:
        yield r
744
+
745
async def maps(
    self,
    keywords: str,
    place: Optional[str] = None,
    street: Optional[str] = None,
    city: Optional[str] = None,
    county: Optional[str] = None,
    state: Optional[str] = None,
    country: Optional[str] = None,
    postalcode: Optional[str] = None,
    latitude: Optional[str] = None,
    longitude: Optional[str] = None,
    radius: int = 0,
    max_results: Optional[int] = None,
) -> AsyncGenerator[Dict[str, Optional[str]], None]:
    """webscout maps search. Query params: https://duckduckgo.com/params.

    Args:
        keywords: keywords for query
        place: if set, the other parameters are not used. Defaults to None.
        street: house number/street. Defaults to None.
        city: city of search. Defaults to None.
        county: county of search. Defaults to None.
        state: state of search. Defaults to None.
        country: country of search. Defaults to None.
        postalcode: postalcode of search. Defaults to None.
        latitude: geographic coordinate (north-south position). Defaults to None.
        longitude: geographic coordinate (east-west position); if latitude and
            longitude are set, the other parameters are not used. Defaults to None.
        radius: expand the search square by the distance in kilometers. Defaults to 0.
        max_results: max number of results. If None, returns results only from the first response. Defaults to None.

    Yields:
        dict with the fields of a MapsResult. Nothing is yielded when the
        bounding box cannot be determined.
    """
    assert keywords, "keywords is mandatory"

    vqd = await self._aget_vqd(keywords)

    # if longitude and latitude are specified, skip the request about bbox to the nominatim api
    if latitude and longitude:
        lat_t = lat_b = Decimal(latitude.replace(",", "."))
        lon_l = lon_r = Decimal(longitude.replace(",", "."))
        if radius == 0:
            # A single point needs a non-zero radius to form a search square.
            radius = 1
    # otherwise request about bbox to nominatim api
    else:
        if place:
            params: Dict[str, Optional[str]] = {
                "q": place,
                "polygon_geojson": "0",
                "format": "jsonv2",
            }
        else:
            params = {
                "street": street,
                "city": city,
                "county": county,
                "state": state,
                "country": country,
                "postalcode": postalcode,
                "polygon_geojson": "0",
                "format": "jsonv2",
            }
        try:
            resp_content = await self._aget_url(
                "GET",
                "https://nominatim.openstreetmap.org/search.php",
                params=params,
            )
            if resp_content is None:
                # No body, no bounding box: stop without emitting a None item.
                return

            coordinates = json.loads(resp_content)[0]["boundingbox"]
            lat_t, lon_l = Decimal(coordinates[1]), Decimal(coordinates[2])
            lat_b, lon_r = Decimal(coordinates[0]), Decimal(coordinates[3])
        except Exception as ex:
            logger.debug(f"ddg_maps() keywords={keywords} {type(ex).__name__} {ex}")
            return

    # if a radius is specified, expand the search square.
    # The factor is built from a string so that it is exactly 0.008983;
    # Decimal(0.008983) would carry the float's binary representation error
    # into the bbox values sent to the server.
    # NOTE(review): presumably the size of one kilometre in degrees — confirm.
    margin = Decimal(radius) * Decimal("0.008983")
    lat_t += margin
    lat_b -= margin
    lon_l -= margin
    lon_r += margin
    logger.debug(f"bbox coordinates\n{lat_t} {lon_l}\n{lat_b} {lon_r}")

    # create a queue of search squares (bboxes)
    work_bboxes: Deque[Tuple[Decimal, Decimal, Decimal, Decimal]] = deque()
    work_bboxes.append((lat_t, lon_l, lat_b, lon_r))

    # bbox iterate
    cache = set()
    while work_bboxes:
        lat_t, lon_l, lat_b, lon_r = work_bboxes.pop()
        params = {
            "q": keywords,
            "vqd": vqd,
            "tg": "maps_places",
            "rt": "D",
            "mkexp": "b",
            "wiki_info": "1",
            "is_requery": "1",
            "bbox_tl": f"{lat_t},{lon_l}",
            "bbox_br": f"{lat_b},{lon_r}",
            "strict_bbox": "1",
        }
        resp_content = await self._aget_url("GET", "https://duckduckgo.com/local.js", params=params)
        if resp_content is None:
            return
        try:
            page_data = json.loads(resp_content).get("results", [])
        except Exception:
            return
        if page_data is None:
            return

        for res in page_data:
            result = MapsResult()
            result.title = res["name"]
            result.address = res["address"]
            # Title plus address identifies a place for de-duplication.
            place_key = f"{result.title} {result.address}"
            if place_key in cache:
                continue
            cache.add(place_key)
            result.country_code = res["country_code"]
            result.url = _normalize_url(res["website"])
            result.phone = res["phone"]
            result.latitude = res["coordinates"]["latitude"]
            result.longitude = res["coordinates"]["longitude"]
            result.source = _normalize_url(res["url"])
            if res["embed"]:
                result.image = res["embed"].get("image", "")
                result.desc = res["embed"].get("description", "")
            result.hours = res["hours"]
            result.category = res["ddg_category"]
            result.facebook = f"www.facebook.com/profile.php?id={x}" if (x := res["facebook_id"]) else None
            result.instagram = f"https://www.instagram.com/{x}" if (x := res["instagram_id"]) else None
            result.twitter = f"https://twitter.com/{x}" if (x := res["twitter_id"]) else None
            yield result.__dict__
            if max_results and len(cache) >= max_results:
                return
        if max_results is None:
            return
        # divide the square into 4 parts and add to the queue
        if len(page_data) >= 15:
            lat_middle = (lat_t + lat_b) / 2
            lon_middle = (lon_l + lon_r) / 2
            bbox1 = (lat_t, lon_l, lat_middle, lon_middle)
            bbox2 = (lat_t, lon_middle, lat_middle, lon_r)
            bbox3 = (lat_middle, lon_l, lat_b, lon_middle)
            bbox4 = (lat_middle, lon_middle, lat_b, lon_r)
            work_bboxes.extendleft([bbox1, bbox2, bbox3, bbox4])
900
+
901
async def translate(
    self, keywords: str, from_: Optional[str] = None, to: str = "en"
) -> Optional[Dict[str, Optional[str]]]:
    """Translate ``keywords`` via the DuckDuckGo translation endpoint.

    Args:
        keywords: text to translate. It is sent as the POST body after
            ``str.encode()``, so it must be a non-empty ``str``.
        from_: language to translate from. When falsy, the ``from`` query
            parameter is omitted from the request. Defaults to None.
        to: language to translate to. Defaults to "en".

    Returns:
        The decoded JSON response with an added ``"original"`` key holding
        ``keywords``, or None when ``_aget_url`` returns None or the response
        cannot be decoded into a mapping.

    Raises:
        AssertionError: if ``keywords`` is empty.
    """
    # Explicit raise rather than `assert`: the exception type callers see is
    # unchanged, but the check is no longer stripped under `python -O`.
    if not keywords:
        raise AssertionError("keywords is mandatory")

    vqd = await self._aget_vqd("translate")

    payload = {
        "vqd": vqd,
        "query": "translate",
        "to": to,
    }
    if from_:
        payload["from"] = from_

    resp_content = await self._aget_url(
        "POST",
        "https://duckduckgo.com/translation.js",
        params=payload,
        data=keywords.encode(),
    )
    if resp_content is None:
        return None

    try:
        page_data = json.loads(resp_content)
        # Fails with TypeError when the decoded JSON is not a mapping.
        page_data["original"] = keywords
    except Exception as ex:
        # Best-effort: still return None, but leave a trace in the debug log
        # the same way the maps search does, instead of failing silently.
        logger.debug(f"translate() keywords={keywords} {type(ex).__name__} {ex}")
        return None
    return page_data
940
+
941
# Module-level logger, registered under the literal name "duckduckgo_search.DDGS".
+ logger = logging.getLogger("duckduckgo_search.DDGS")
942
# Executed once, at import time (module-level side effect).
# NOTE(review): presumably this is what lets the DDGS class below call
# loop.run_until_complete() even when an event loop is already running (its
# __init__ only warns in that case) — confirm against the nest_asyncio docs.
+ nest_asyncio.apply()
943
+
944
+
945
class DDGS(AsyncDDGS):
    """Synchronous facade over ``AsyncDDGS``.

    Each public method calls the parent's async implementation and drives it
    to completion on the event loop captured in ``__init__``, so callers can
    use plain ``for`` loops and ``with`` blocks.
    """

    def __init__(self, headers=None, proxies=None, timeout=10):
        """Capture the current event loop and initialise the async parent.

        Args:
            headers: forwarded unchanged to ``AsyncDDGS.__init__``.
            proxies: forwarded unchanged to ``AsyncDDGS.__init__``.
            timeout: forwarded unchanged to ``AsyncDDGS.__init__``. Defaults to 10.
        """
        # Fetch the loop once and reuse it for both the running check and
        # every later run_until_complete() call.
        loop = asyncio.get_event_loop()
        if loop.is_running():
            warnings.warn("DDGS running in an async loop. This may cause errors. Use AsyncDDGS instead.", stacklevel=2)
        super().__init__(headers, proxies, timeout)
        self._loop = loop

    def __enter__(self) -> "DDGS":
        """Return the instance itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Run the parent's async ``__aexit__`` to completion."""
        # create_task() would only queue the cleanup coroutine; in synchronous
        # use the loop is not running when the `with` block ends, so the
        # coroutine might never execute. Drive it explicitly, the same way
        # every other method of this class drives its coroutine.
        self._loop.run_until_complete(self.__aexit__(exc_type, exc_val, exc_tb))

    def _iter_over_async(self, async_gen):
        """Yield the items of ``async_gen`` from synchronous code.

        Each item is obtained by running ``async_gen.__anext__()`` on the
        stored loop; iteration ends when it raises ``StopAsyncIteration``.
        """
        while True:
            try:
                yield self._loop.run_until_complete(async_gen.__anext__())
            except StopAsyncIteration:
                break

    def text(self, *args, **kwargs) -> Generator[Dict[str, Optional[str]], None, None]:
        """Synchronous wrapper around ``AsyncDDGS.text``; same arguments."""
        async_gen = super().text(*args, **kwargs)
        return self._iter_over_async(async_gen)

    def images(self, *args, **kwargs) -> Generator[Dict[str, Optional[str]], None, None]:
        """Synchronous wrapper around ``AsyncDDGS.images``; same arguments."""
        async_gen = super().images(*args, **kwargs)
        return self._iter_over_async(async_gen)

    def videos(self, *args, **kwargs) -> Generator[Dict[str, Optional[str]], None, None]:
        """Synchronous wrapper around ``AsyncDDGS.videos``; same arguments."""
        async_gen = super().videos(*args, **kwargs)
        return self._iter_over_async(async_gen)

    def news(self, *args, **kwargs) -> Generator[Dict[str, Optional[str]], None, None]:
        """Synchronous wrapper around ``AsyncDDGS.news``; same arguments."""
        async_gen = super().news(*args, **kwargs)
        return self._iter_over_async(async_gen)

    def answers(self, *args, **kwargs) -> Generator[Dict[str, Optional[str]], None, None]:
        """Synchronous wrapper around ``AsyncDDGS.answers``; same arguments."""
        async_gen = super().answers(*args, **kwargs)
        return self._iter_over_async(async_gen)

    def suggestions(self, *args, **kwargs) -> Generator[Dict[str, Optional[str]], None, None]:
        """Synchronous wrapper around ``AsyncDDGS.suggestions``; same arguments."""
        async_gen = super().suggestions(*args, **kwargs)
        return self._iter_over_async(async_gen)

    def maps(self, *args, **kwargs) -> Generator[Dict[str, Optional[str]], None, None]:
        """Synchronous wrapper around ``AsyncDDGS.maps``; same arguments."""
        async_gen = super().maps(*args, **kwargs)
        return self._iter_over_async(async_gen)

    def translate(self, *args, **kwargs) -> Optional[Dict[str, Optional[str]]]:
        """Synchronous wrapper around ``AsyncDDGS.translate``; returns its result."""
        async_coro = super().translate(*args, **kwargs)
        return self._loop.run_until_complete(async_coro)
997
+
998
  import g4f
999
  from webscout import DDGS
1000
  from time import time as t