Niansuh committed on
Commit
297ac09
1 Parent(s): 4e83b0e

Create webscout.py

Files changed (1)
  1. webscout.py +1691 -0
webscout.py ADDED
@@ -0,0 +1,1691 @@

import base64
import json
import logging
import re
import warnings
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timezone
from decimal import Decimal
from functools import cached_property
from html import unescape
from itertools import cycle, islice
from math import atan2, cos, radians, sin, sqrt
from threading import Event
from types import TracebackType
from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast
from urllib.parse import unquote

import orjson
import pyreqwest_impersonate as pri  # type: ignore
import requests

try:
    from lxml.etree import _Element
    from lxml.html import HTMLParser as LHTMLParser
    from lxml.html import document_fromstring

    LXML_AVAILABLE = True
except ImportError:
    LXML_AVAILABLE = False

REGEX_STRIP_TAGS = re.compile("<.*?>")


def json_dumps(obj: Any) -> str:
    try:
        return orjson.dumps(obj).decode("utf-8")
    except Exception as ex:
        raise WebscoutE(f"{type(ex).__name__}: {ex}") from ex


def json_loads(obj: Union[str, bytes]) -> Any:
    try:
        return orjson.loads(obj)
    except Exception as ex:
        raise WebscoutE(f"{type(ex).__name__}: {ex}") from ex
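
# Round-trip sketch (hypothetical values; orjson serializes to bytes, hence
# the decode in json_dumps above):
#   json_loads(json_dumps({"q": "python"}))  # -> {'q': 'python'}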


def _extract_vqd(html_bytes: bytes, keywords: str) -> str:
    """Extract vqd from html bytes."""
    for c1, c1_len, c2 in (
        (b'vqd="', 5, b'"'),
        (b"vqd=", 4, b"&"),
        (b"vqd='", 5, b"'"),
    ):
        try:
            start = html_bytes.index(c1) + c1_len
            end = html_bytes.index(c2, start)
            return html_bytes[start:end].decode()
        except ValueError:
            pass
    raise WebscoutE(f"_extract_vqd() {keywords=} Could not extract vqd.")


def _text_extract_json(html_bytes: bytes, keywords: str) -> List[Dict[str, str]]:
    """text(backend="api") -> extract json from html."""
    try:
        start = html_bytes.index(b"DDG.pageLayout.load('d',") + 24
        end = html_bytes.index(b");DDG.duckbar.load(", start)
        data = html_bytes[start:end]
        result: List[Dict[str, str]] = json_loads(data)
        return result
    except Exception as ex:
        raise WebscoutE(f"_text_extract_json() {keywords=} {type(ex).__name__}: {ex}") from ex


def _normalize(raw_html: str) -> str:
    """Strip HTML tags from the raw_html string."""
    return unescape(REGEX_STRIP_TAGS.sub("", raw_html)) if raw_html else ""


def _normalize_url(url: str) -> str:
    """Unquote URL and replace spaces with '+'."""
    return unquote(url.replace(" ", "+")) if url else ""


def _calculate_distance(lat1: Decimal, lon1: Decimal, lat2: Decimal, lon2: Decimal) -> float:
    """Calculate distance between two points in km. Haversine formula."""
    R = 6371.0087714  # Earth's radius in km
    rlat1, rlon1, rlat2, rlon2 = map(radians, [float(lat1), float(lon1), float(lat2), float(lon2)])
    dlon, dlat = rlon2 - rlon1, rlat2 - rlat1
    a = sin(dlat / 2) ** 2 + cos(rlat1) * cos(rlat2) * sin(dlon / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c
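
# Sanity-check sketch (coordinates are illustrative): London to Paris is
# roughly 344 km great-circle distance.
#   _calculate_distance(Decimal("51.5074"), Decimal("-0.1278"),
#                       Decimal("48.8566"), Decimal("2.3522"))  # ~343.6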

logger = logging.getLogger("webscout.WEBS")


class WebscoutE(Exception):
    """Base exception class for search."""


class RatelimitE(WebscoutE):
    """Raised for rate limit exceeded errors during API requests."""


class TimeoutE(WebscoutE):
    """Raised for timeout errors during API requests."""


class FailedToGenerateResponseError(Exception):
    """Provider failed to fetch response."""


class AllProvidersFailure(Exception):
    """None of the providers generated a response successfully."""


class WEBS:
    """webscout class to get search results from duckduckgo.com."""

    _executor: ThreadPoolExecutor = ThreadPoolExecutor()

    def __init__(
        self,
        headers: Optional[Dict[str, str]] = None,
        proxy: Optional[str] = None,
        proxies: Union[Dict[str, str], str, None] = None,  # deprecated
        timeout: Optional[int] = 10,
    ) -> None:
        """Initialize the WEBS object.

        Args:
            headers (dict, optional): Dictionary of headers for the HTTP client. Defaults to None.
            proxy (str, optional): proxy for the HTTP client, supports http/https/socks5 protocols.
                example: "http://user:pass@example.com:3128". Defaults to None.
            timeout (int, optional): Timeout value for the HTTP client. Defaults to 10.
        """
        self.proxy: Optional[str] = proxy
        assert self.proxy is None or isinstance(self.proxy, str), "proxy must be a str"
        if not proxy and proxies:
            warnings.warn("'proxies' is deprecated, use 'proxy' instead.", stacklevel=1)
            self.proxy = proxies.get("http") or proxies.get("https") if isinstance(proxies, dict) else proxies
        self.headers = headers if headers else {}
        self.headers["Referer"] = "https://duckduckgo.com/"
        self.client = pri.Client(
            headers=self.headers,
            proxy=self.proxy,
            timeout=timeout,
            cookie_store=True,
            referer=True,
            impersonate="chrome_124",
            follow_redirects=False,
            verify=False,
        )
        self._exception_event = Event()
        self._chat_messages: List[Dict[str, str]] = []
        self._chat_vqd: str = ""

    def __enter__(self) -> "WEBS":
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]] = None,
        exc_val: Optional[BaseException] = None,
        exc_tb: Optional[TracebackType] = None,
    ) -> None:
        pass
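
    # Context-manager usage sketch (the client needs no explicit teardown;
    # __exit__ is a no-op):
    #   with WEBS() as webs:
    #       results = webs.text("python", max_results=5)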

    @cached_property
    def parser(self) -> "LHTMLParser":
        """Get HTML parser."""
        return LHTMLParser(remove_blank_text=True, remove_comments=True, remove_pis=True, collect_ids=False)

    def _get_url(
        self,
        method: str,
        url: str,
        params: Optional[Dict[str, str]] = None,
        content: Optional[bytes] = None,
        data: Optional[Union[Dict[str, str], bytes]] = None,
    ) -> bytes:
        if self._exception_event.is_set():
            raise WebscoutE("Exception occurred in previous call.")
        try:
            resp = self.client.request(method, url, params=params, content=content, data=data)
        except Exception as ex:
            self._exception_event.set()
            if "time" in str(ex).lower():
                raise TimeoutE(f"{url} {type(ex).__name__}: {ex}") from ex
            raise WebscoutE(f"{url} {type(ex).__name__}: {ex}") from ex
        logger.debug(f"_get_url() {resp.url} {resp.status_code} {len(resp.content)}")
        if resp.status_code == 200:
            return cast(bytes, resp.content)
        self._exception_event.set()
        if resp.status_code in (202, 301, 403):
            raise RatelimitE(f"{resp.url} {resp.status_code} Ratelimit")
        raise WebscoutE(f"{resp.url} return None. {params=} {content=} {data=}")

    def _get_vqd(self, keywords: str) -> str:
        """Get vqd value for a search query."""
        resp_content = self._get_url("POST", "https://duckduckgo.com", data={"q": keywords})
        return _extract_vqd(resp_content, keywords)

    def chat(self, keywords: str, model: str = "gpt-3.5") -> str:
        """Initiates a chat session with DuckDuckGo AI.

        Args:
            keywords (str): The initial message or question to send to the AI.
            model (str): The model to use: "gpt-3.5", "claude-3-haiku", "llama-3-70b", "mixtral-8x7b".
                Defaults to "gpt-3.5".

        Returns:
            str: The response from the AI.
        """
        models = {
            "claude-3-haiku": "claude-3-haiku-20240307",
            "gpt-3.5": "gpt-3.5-turbo-0125",
            "llama-3-70b": "meta-llama/Llama-3-70b-chat-hf",
            "mixtral-8x7b": "mistralai/Mixtral-8x7B-Instruct-v0.1",
        }
        # vqd
        if not self._chat_vqd:
            resp = self.client.get("https://duckduckgo.com/duckchat/v1/status", headers={"x-vqd-accept": "1"})
            self._chat_vqd = resp.headers.get("x-vqd-4", "")

        self._chat_messages.append({"role": "user", "content": keywords})

        json_data = {
            "model": models[model],
            "messages": self._chat_messages,
        }
        resp = self.client.post(
            "https://duckduckgo.com/duckchat/v1/chat", headers={"x-vqd-4": self._chat_vqd}, json=json_data
        )
        self._chat_vqd = resp.headers.get("x-vqd-4", "")

        messages = []
        for line in resp.text.replace("data: ", "").replace("[DONE]", "").split("\n\n"):
            x = line.strip()
            if x:
                j = json_loads(x)
                message = j.get("message", "")
                messages.append(message)
        result = "".join(messages)
        self._chat_messages.append({"role": "assistant", "content": result})
        return result
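
    # Multi-turn usage sketch: the vqd session token and the message history
    # live on the WEBS instance, so repeated calls continue one conversation.
    #   webs = WEBS()
    #   webs.chat("What is Python?")
    #   webs.chat("Summarize that in one sentence.", model="gpt-3.5")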

    def text(
        self,
        keywords: str,
        region: str = "wt-wt",
        safesearch: str = "moderate",
        timelimit: Optional[str] = None,
        backend: str = "api",
        max_results: Optional[int] = None,
    ) -> List[Dict[str, str]]:
        """DuckDuckGo text search. Query params: https://duckduckgo.com/params.

        Args:
            keywords: keywords for query.
            region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
            safesearch: on, moderate, off. Defaults to "moderate".
            timelimit: d, w, m, y. Defaults to None.
            backend: api, html, lite. Defaults to api.
                api - collect data from https://duckduckgo.com,
                html - collect data from https://html.duckduckgo.com,
                lite - collect data from https://lite.duckduckgo.com.
            max_results: max number of results. If None, returns results only from the first response. Defaults to None.

        Returns:
            List of dictionaries with search results.

        Raises:
            WebscoutE: Base exception for webscout errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        if LXML_AVAILABLE is False and backend != "api":
            backend = "api"
            warnings.warn("lxml is not installed. Using backend='api'.", stacklevel=2)

        if backend == "api":
            results = self._text_api(keywords, region, safesearch, timelimit, max_results)
        elif backend == "html":
            results = self._text_html(keywords, region, timelimit, max_results)
        elif backend == "lite":
            results = self._text_lite(keywords, region, timelimit, max_results)
        return results
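
    # Usage sketch: each result dict carries "title", "href" and "body" keys
    # (see _text_api below).
    #   for r in WEBS().text("python web scraping", region="us-en", max_results=10):
    #       print(r["title"], r["href"])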

    def _text_api(
        self,
        keywords: str,
        region: str = "wt-wt",
        safesearch: str = "moderate",
        timelimit: Optional[str] = None,
        max_results: Optional[int] = None,
    ) -> List[Dict[str, str]]:
        """DuckDuckGo text search. Query params: https://duckduckgo.com/params.

        Args:
            keywords: keywords for query.
            region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
            safesearch: on, moderate, off. Defaults to "moderate".
            timelimit: d, w, m, y. Defaults to None.
            max_results: max number of results. If None, returns results only from the first response. Defaults to None.

        Returns:
            List of dictionaries with search results.

        Raises:
            WebscoutE: Base exception for webscout errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)

        payload = {
            "q": keywords,
            "kl": region,
            "l": region,
            "p": "",
            "s": "0",
            "df": "",
            "vqd": vqd,
            "bing_market": f"{region[3:]}-{region[:2].upper()}",
            "ex": "",
        }
        safesearch = safesearch.lower()
        if safesearch == "moderate":
            payload["ex"] = "-1"
        elif safesearch == "off":
            payload["ex"] = "-2"
        elif safesearch == "on":  # strict
            payload["p"] = "1"
        if timelimit:
            payload["df"] = timelimit

        cache = set()
        results: List[Dict[str, str]] = []

        def _text_api_page(s: int) -> List[Dict[str, str]]:
            payload["s"] = f"{s}"
            resp_content = self._get_url("GET", "https://links.duckduckgo.com/d.js", params=payload)
            page_data = _text_extract_json(resp_content, keywords)
            page_results = []
            for row in page_data:
                href = row.get("u", None)
                if href and href not in cache and href != f"http://www.google.com/search?q={keywords}":
                    cache.add(href)
                    body = _normalize(row["a"])
                    if body:
                        result = {
                            "title": _normalize(row["t"]),
                            "href": _normalize_url(href),
                            "body": body,
                        }
                        page_results.append(result)
            return page_results

        slist = [0]
        if max_results:
            max_results = min(max_results, 2023)
            slist.extend(range(23, max_results, 50))
        try:
            for r in self._executor.map(_text_api_page, slist):
                results.extend(r)
        except Exception as e:
            raise e

        return list(islice(results, max_results))
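
    # Pagination sketch: the "s" offsets mirror DuckDuckGo's page sizes (23
    # results on the first page, 50 on the following ones), and the pages are
    # fetched concurrently on the shared executor. For max_results=100:
    #   slist == [0, 23, 73]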

    def _text_html(
        self,
        keywords: str,
        region: str = "wt-wt",
        timelimit: Optional[str] = None,
        max_results: Optional[int] = None,
    ) -> List[Dict[str, str]]:
        """DuckDuckGo text search. Query params: https://duckduckgo.com/params.

        Args:
            keywords: keywords for query.
            region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
            timelimit: d, w, m, y. Defaults to None.
            max_results: max number of results. If None, returns results only from the first response. Defaults to None.

        Returns:
            List of dictionaries with search results.

        Raises:
            WebscoutE: Base exception for webscout errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        payload = {
            "q": keywords,
            "s": "0",
            "o": "json",
            "api": "d.js",
            "vqd": "",
            "kl": region,
            "bing_market": region,
        }
        if timelimit:
            payload["df"] = timelimit
        if max_results and max_results > 20:
            vqd = self._get_vqd(keywords)
            payload["vqd"] = vqd

        cache = set()
        results: List[Dict[str, str]] = []

        def _text_html_page(s: int) -> List[Dict[str, str]]:
            payload["s"] = f"{s}"
            resp_content = self._get_url("POST", "https://html.duckduckgo.com/html", data=payload)
            if b"No results." in resp_content:
                return []

            page_results = []
            tree = document_fromstring(resp_content, self.parser)
            elements = tree.xpath("//div[h2]")
            if not isinstance(elements, List):
                return []
            for e in elements:
                if isinstance(e, _Element):
                    hrefxpath = e.xpath("./a/@href")
                    href = str(hrefxpath[0]) if isinstance(hrefxpath, List) else None
                    if (
                        href
                        and href not in cache
                        and not href.startswith(
                            ("http://www.google.com/search?q=", "https://duckduckgo.com/y.js?ad_domain")
                        )
                    ):
                        cache.add(href)
                        titlexpath = e.xpath("./h2/a/text()")
                        title = str(titlexpath[0]) if isinstance(titlexpath, List) else ""
                        bodyxpath = e.xpath("./a//text()")
                        body = "".join(str(x) for x in bodyxpath) if isinstance(bodyxpath, List) else ""
                        result = {
                            "title": _normalize(title),
                            "href": _normalize_url(href),
                            "body": _normalize(body),
                        }
                        page_results.append(result)
            return page_results

        slist = [0]
        if max_results:
            max_results = min(max_results, 2023)
            slist.extend(range(23, max_results, 50))
        try:
            for r in self._executor.map(_text_html_page, slist):
                results.extend(r)
        except Exception as e:
            raise e

        return list(islice(results, max_results))

    def _text_lite(
        self,
        keywords: str,
        region: str = "wt-wt",
        timelimit: Optional[str] = None,
        max_results: Optional[int] = None,
    ) -> List[Dict[str, str]]:
        """DuckDuckGo text search. Query params: https://duckduckgo.com/params.

        Args:
            keywords: keywords for query.
            region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
            timelimit: d, w, m, y. Defaults to None.
            max_results: max number of results. If None, returns results only from the first response. Defaults to None.

        Returns:
            List of dictionaries with search results.

        Raises:
            WebscoutE: Base exception for webscout errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        payload = {
            "q": keywords,
            "s": "0",
            "o": "json",
            "api": "d.js",
            "vqd": "",
            "kl": region,
            "bing_market": region,
        }
        if timelimit:
            payload["df"] = timelimit

        cache = set()
        results: List[Dict[str, str]] = []

        def _text_lite_page(s: int) -> List[Dict[str, str]]:
            payload["s"] = f"{s}"
            resp_content = self._get_url("POST", "https://lite.duckduckgo.com/lite/", data=payload)
            if b"No more results." in resp_content:
                return []

            page_results = []
            tree = document_fromstring(resp_content, self.parser)
            elements = tree.xpath("//table[last()]//tr")
            if not isinstance(elements, List):
                return []

            data = zip(cycle(range(1, 5)), elements)
            for i, e in data:
                if isinstance(e, _Element):
                    if i == 1:
                        hrefxpath = e.xpath(".//a//@href")
                        href = str(hrefxpath[0]) if hrefxpath and isinstance(hrefxpath, List) else None
                        if (
                            href is None
                            or href in cache
                            or href.startswith(
                                ("http://www.google.com/search?q=", "https://duckduckgo.com/y.js?ad_domain")
                            )
                        ):
                            [next(data, None) for _ in range(3)]  # skip block(i=1,2,3,4)
                        else:
                            cache.add(href)
                            titlexpath = e.xpath(".//a//text()")
                            title = str(titlexpath[0]) if isinstance(titlexpath, List) else ""
                    elif i == 2:
                        bodyxpath = e.xpath(".//td[@class='result-snippet']//text()")
                        body = "".join(str(x) for x in bodyxpath) if isinstance(bodyxpath, List) else ""
                        if href:
                            result = {
                                "title": _normalize(title),
                                "href": _normalize_url(href),
                                "body": _normalize(body),
                            }
                            page_results.append(result)
            return page_results

        slist = [0]
        if max_results:
            max_results = min(max_results, 2023)
            slist.extend(range(23, max_results, 50))
        try:
            for r in self._executor.map(_text_lite_page, slist):
                results.extend(r)
        except Exception as e:
            raise e

        return list(islice(results, max_results))

    def images(
        self,
        keywords: str,
        region: str = "wt-wt",
        safesearch: str = "moderate",
        timelimit: Optional[str] = None,
        size: Optional[str] = None,
        color: Optional[str] = None,
        type_image: Optional[str] = None,
        layout: Optional[str] = None,
        license_image: Optional[str] = None,
        max_results: Optional[int] = None,
    ) -> List[Dict[str, str]]:
        """DuckDuckGo images search. Query params: https://duckduckgo.com/params.

        Args:
            keywords: keywords for query.
            region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
            safesearch: on, moderate, off. Defaults to "moderate".
            timelimit: Day, Week, Month, Year. Defaults to None.
            size: Small, Medium, Large, Wallpaper. Defaults to None.
            color: color, Monochrome, Red, Orange, Yellow, Green, Blue,
                Purple, Pink, Brown, Black, Gray, Teal, White. Defaults to None.
            type_image: photo, clipart, gif, transparent, line.
                Defaults to None.
            layout: Square, Tall, Wide. Defaults to None.
            license_image: any (All Creative Commons), Public (PublicDomain),
                Share (Free to Share and Use), ShareCommercially (Free to Share and Use Commercially),
                Modify (Free to Modify, Share, and Use), ModifyCommercially (Free to Modify, Share, and
                Use Commercially). Defaults to None.
            max_results: max number of results. If None, returns results only from the first response. Defaults to None.

        Returns:
            List of dictionaries with images search results.

        Raises:
            WebscoutE: Base exception for webscout errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)

        safesearch_base = {"on": "1", "moderate": "1", "off": "-1"}
        timelimit = f"time:{timelimit}" if timelimit else ""
        size = f"size:{size}" if size else ""
        color = f"color:{color}" if color else ""
        type_image = f"type:{type_image}" if type_image else ""
        layout = f"layout:{layout}" if layout else ""
        license_image = f"license:{license_image}" if license_image else ""
        payload = {
            "l": region,
            "o": "json",
            "q": keywords,
            "vqd": vqd,
            "f": f"{timelimit},{size},{color},{type_image},{layout},{license_image}",
            "p": safesearch_base[safesearch.lower()],
        }

        cache = set()
        results: List[Dict[str, str]] = []

        def _images_page(s: int) -> List[Dict[str, str]]:
            payload["s"] = f"{s}"
            resp_content = self._get_url("GET", "https://duckduckgo.com/i.js", params=payload)
            resp_json = json_loads(resp_content)

            page_data = resp_json.get("results", [])
            page_results = []
            for row in page_data:
                image_url = row.get("image")
                if image_url and image_url not in cache:
                    cache.add(image_url)
                    result = {
                        "title": row["title"],
                        "image": _normalize_url(image_url),
                        "thumbnail": _normalize_url(row["thumbnail"]),
                        "url": _normalize_url(row["url"]),
                        "height": row["height"],
                        "width": row["width"],
                        "source": row["source"],
                    }
                    page_results.append(result)
            return page_results

        slist = [0]
        if max_results:
            max_results = min(max_results, 500)
            slist.extend(range(100, max_results, 100))
        try:
            for r in self._executor.map(_images_page, slist):
                results.extend(r)
        except Exception as e:
            raise e

        return list(islice(results, max_results))
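
    # Usage sketch: the filters are plain strings joined into the "f"
    # parameter above, so they can be combined freely.
    #   WEBS().images("sunset", size="Large", color="Orange", max_results=20)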

    def videos(
        self,
        keywords: str,
        region: str = "wt-wt",
        safesearch: str = "moderate",
        timelimit: Optional[str] = None,
        resolution: Optional[str] = None,
        duration: Optional[str] = None,
        license_videos: Optional[str] = None,
        max_results: Optional[int] = None,
    ) -> List[Dict[str, str]]:
        """DuckDuckGo videos search. Query params: https://duckduckgo.com/params.

        Args:
            keywords: keywords for query.
            region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
            safesearch: on, moderate, off. Defaults to "moderate".
            timelimit: d, w, m. Defaults to None.
            resolution: high, standard. Defaults to None.
            duration: short, medium, long. Defaults to None.
            license_videos: creativeCommon, youtube. Defaults to None.
            max_results: max number of results. If None, returns results only from the first response. Defaults to None.

        Returns:
            List of dictionaries with videos search results.

        Raises:
            WebscoutE: Base exception for webscout errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)

        safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
        timelimit = f"publishedAfter:{timelimit}" if timelimit else ""
        resolution = f"videoDefinition:{resolution}" if resolution else ""
        duration = f"videoDuration:{duration}" if duration else ""
        license_videos = f"videoLicense:{license_videos}" if license_videos else ""
        payload = {
            "l": region,
            "o": "json",
            "q": keywords,
            "vqd": vqd,
            "f": f"{timelimit},{resolution},{duration},{license_videos}",
            "p": safesearch_base[safesearch.lower()],
        }

        cache = set()
        results: List[Dict[str, str]] = []

        def _videos_page(s: int) -> List[Dict[str, str]]:
            payload["s"] = f"{s}"
            resp_content = self._get_url("GET", "https://duckduckgo.com/v.js", params=payload)
            resp_json = json_loads(resp_content)

            page_data = resp_json.get("results", [])
            page_results = []
            for row in page_data:
                if row["content"] not in cache:
                    cache.add(row["content"])
                    page_results.append(row)
            return page_results

        slist = [0]
        if max_results:
            max_results = min(max_results, 400)
            slist.extend(range(60, max_results, 60))
        try:
            for r in self._executor.map(_videos_page, slist):
                results.extend(r)
        except Exception as e:
            raise e

        return list(islice(results, max_results))

    def news(
        self,
        keywords: str,
        region: str = "wt-wt",
        safesearch: str = "moderate",
        timelimit: Optional[str] = None,
        max_results: Optional[int] = None,
    ) -> List[Dict[str, str]]:
        """DuckDuckGo news search. Query params: https://duckduckgo.com/params.

        Args:
            keywords: keywords for query.
            region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
            safesearch: on, moderate, off. Defaults to "moderate".
            timelimit: d, w, m. Defaults to None.
            max_results: max number of results. If None, returns results only from the first response. Defaults to None.

        Returns:
            List of dictionaries with news search results.

        Raises:
            WebscoutE: Base exception for webscout errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)

        safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
        payload = {
            "l": region,
            "o": "json",
            "noamp": "1",
            "q": keywords,
            "vqd": vqd,
            "p": safesearch_base[safesearch.lower()],
        }
        if timelimit:
            payload["df"] = timelimit

        cache = set()
        results: List[Dict[str, str]] = []

        def _news_page(s: int) -> List[Dict[str, str]]:
            payload["s"] = f"{s}"
            resp_content = self._get_url("GET", "https://duckduckgo.com/news.js", params=payload)
            resp_json = json_loads(resp_content)
            page_data = resp_json.get("results", [])
            page_results = []
            for row in page_data:
                if row["url"] not in cache:
                    cache.add(row["url"])
                    image_url = row.get("image", None)
                    result = {
                        "date": datetime.fromtimestamp(row["date"], timezone.utc).isoformat(),
                        "title": row["title"],
                        "body": _normalize(row["excerpt"]),
                        "url": _normalize_url(row["url"]),
                        "image": _normalize_url(image_url),
                        "source": row["source"],
                    }
                    page_results.append(result)
            return page_results

        slist = [0]
        if max_results:
            max_results = min(max_results, 120)
            slist.extend(range(30, max_results, 30))
        try:
            for r in self._executor.map(_news_page, slist):
                results.extend(r)
        except Exception as e:
            raise e

        return list(islice(results, max_results))
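
    # Usage sketch: "date" is returned as an ISO-8601 UTC timestamp derived
    # from the raw epoch value above.
    #   WEBS().news("artificial intelligence", timelimit="w", max_results=30)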

    def answers(self, keywords: str) -> List[Dict[str, str]]:
        """DuckDuckGo instant answers. Query params: https://duckduckgo.com/params.

        Args:
            keywords: keywords for query.

        Returns:
            List of dictionaries with instant answers results.

        Raises:
            WebscoutE: Base exception for webscout errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        payload = {
            "q": f"what is {keywords}",
            "format": "json",
        }
        resp_content = self._get_url("GET", "https://api.duckduckgo.com/", params=payload)
        page_data = json_loads(resp_content)

        results = []
        answer = page_data.get("AbstractText")
        url = page_data.get("AbstractURL")
        if answer:
            results.append(
                {
                    "icon": None,
                    "text": answer,
                    "topic": None,
                    "url": url,
                }
            )

        # related
        payload = {
            "q": f"{keywords}",
            "format": "json",
        }
        resp_content = self._get_url("GET", "https://api.duckduckgo.com/", params=payload)
        resp_json = json_loads(resp_content)
        page_data = resp_json.get("RelatedTopics", [])

        for row in page_data:
            topic = row.get("Name")
            if not topic:
                icon = row["Icon"].get("URL")
                results.append(
                    {
                        "icon": f"https://duckduckgo.com{icon}" if icon else "",
                        "text": row["Text"],
                        "topic": None,
                        "url": row["FirstURL"],
                    }
                )
            else:
                for subrow in row["Topics"]:
                    icon = subrow["Icon"].get("URL")
                    results.append(
                        {
                            "icon": f"https://duckduckgo.com{icon}" if icon else "",
                            "text": subrow["Text"],
                            "topic": topic,
                            "url": subrow["FirstURL"],
                        }
                    )

        return results

    def suggestions(self, keywords: str, region: str = "wt-wt") -> List[Dict[str, str]]:
        """DuckDuckGo suggestions. Query params: https://duckduckgo.com/params.

        Args:
            keywords: keywords for query.
            region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".

        Returns:
            List of dictionaries with suggestions results.

        Raises:
            WebscoutE: Base exception for webscout errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        payload = {
            "q": keywords,
            "kl": region,
        }
        resp_content = self._get_url("GET", "https://duckduckgo.com/ac/", params=payload)
        page_data = json_loads(resp_content)
        return [r for r in page_data]
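
    # Usage sketch (response shape is an assumption about the /ac/ endpoint,
    # which typically returns a list of {"phrase": ...} dicts):
    #   WEBS().suggestions("pyth")  # -> [{'phrase': 'python'}, ...]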

    def maps(
        self,
        keywords: str,
        place: Optional[str] = None,
        street: Optional[str] = None,
        city: Optional[str] = None,
        county: Optional[str] = None,
        state: Optional[str] = None,
        country: Optional[str] = None,
        postalcode: Optional[str] = None,
        latitude: Optional[str] = None,
        longitude: Optional[str] = None,
        radius: int = 0,
        max_results: Optional[int] = None,
    ) -> List[Dict[str, str]]:
        """DuckDuckGo maps search. Query params: https://duckduckgo.com/params.

        Args:
            keywords: keywords for query
            place: if set, the other parameters are not used. Defaults to None.
            street: house number/street. Defaults to None.
            city: city of search. Defaults to None.
            county: county of search. Defaults to None.
            state: state of search. Defaults to None.
            country: country of search. Defaults to None.
            postalcode: postalcode of search. Defaults to None.
            latitude: geographic coordinate (north-south position). Defaults to None.
            longitude: geographic coordinate (east-west position); if latitude and
                longitude are set, the other parameters are not used. Defaults to None.
            radius: expand the search square by the distance in kilometers. Defaults to 0.
            max_results: max number of results. If None, returns results only from the first response. Defaults to None.

        Returns:
            List of dictionaries with maps search results.

        Raises:
            WebscoutE: Base exception for webscout errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)

        # if longitude and latitude are specified, skip the bbox request to the nominatim api
        if latitude and longitude:
            lat_t = Decimal(latitude.replace(",", "."))
            lat_b = Decimal(latitude.replace(",", "."))
            lon_l = Decimal(longitude.replace(",", "."))
            lon_r = Decimal(longitude.replace(",", "."))
            if radius == 0:
                radius = 1
        # otherwise request the bbox from the nominatim api
        else:
            if place:
                params = {
                    "q": place,
                    "polygon_geojson": "0",
                    "format": "jsonv2",
                }
            else:
                params = {
                    "polygon_geojson": "0",
                    "format": "jsonv2",
                }
                if street:
                    params["street"] = street
                if city:
                    params["city"] = city
                if county:
                    params["county"] = county
                if state:
                    params["state"] = state
                if country:
                    params["country"] = country
                if postalcode:
                    params["postalcode"] = postalcode
            # request nominatim api to get coordinates box
            resp_content = self._get_url(
                "GET",
                "https://nominatim.openstreetmap.org/search.php",
                params=params,
            )
            if resp_content == b"[]":
                raise WebscoutE("maps() Coordinates are not found, check function parameters.")
            resp_json = json_loads(resp_content)
            coordinates = resp_json[0]["boundingbox"]
            lat_t, lon_l = Decimal(coordinates[1]), Decimal(coordinates[2])
            lat_b, lon_r = Decimal(coordinates[0]), Decimal(coordinates[3])

        # if a radius is specified, expand the search square
        lat_t += Decimal(radius) * Decimal(0.008983)
        lat_b -= Decimal(radius) * Decimal(0.008983)
        lon_l -= Decimal(radius) * Decimal(0.008983)
        lon_r += Decimal(radius) * Decimal(0.008983)
        logger.debug(f"bbox coordinates\n{lat_t} {lon_l}\n{lat_b} {lon_r}")

        cache = set()
        results: List[Dict[str, str]] = []

        def _maps_page(
            bbox: Tuple[Decimal, Decimal, Decimal, Decimal],
        ) -> Optional[List[Dict[str, str]]]:
            if max_results and len(results) >= max_results:
                return None
            lat_t, lon_l, lat_b, lon_r = bbox
            params = {
                "q": keywords,
                "vqd": vqd,
                "tg": "maps_places",
                "rt": "D",
                "mkexp": "b",
                "wiki_info": "1",
                "is_requery": "1",
                "bbox_tl": f"{lat_t},{lon_l}",
                "bbox_br": f"{lat_b},{lon_r}",
                "strict_bbox": "1",
            }
            resp_content = self._get_url("GET", "https://duckduckgo.com/local.js", params=params)
            resp_json = json_loads(resp_content)
            page_data = resp_json.get("results", [])

            page_results = []
            for res in page_data:
                r_name = f'{res["name"]} {res["address"]}'
                if r_name in cache:
                    continue
                else:
                    cache.add(r_name)
                    result = {
                        "title": res["name"],
                        "address": res["address"],
                        "country_code": res["country_code"],
                        "url": _normalize_url(res["website"]),
                        "phone": res["phone"] or "",
                        "latitude": res["coordinates"]["latitude"],
                        "longitude": res["coordinates"]["longitude"],
                        "source": _normalize_url(res["url"]),
                        "image": x.get("image", "") if (x := res["embed"]) else "",
                        "desc": x.get("description", "") if (x := res["embed"]) else "",
                        "hours": res["hours"] or "",
                        "category": res["ddg_category"] or "",
                        "facebook": f"www.facebook.com/profile.php?id={x}" if (x := res["facebook_id"]) else "",
                        "instagram": f"https://www.instagram.com/{x}" if (x := res["instagram_id"]) else "",
                        "twitter": f"https://twitter.com/{x}" if (x := res["twitter_id"]) else "",
                    }
                    page_results.append(result)
            return page_results

        # search squares (bboxes)
        start_bbox = (lat_t, lon_l, lat_b, lon_r)
        work_bboxes = [start_bbox]
        while work_bboxes:
            queue_bboxes = []  # for the next iteration; at the end of the iteration work_bboxes = queue_bboxes
            tasks = []
            for bbox in work_bboxes:
                tasks.append(bbox)
                # if the distance between the coordinates is > 1 km, divide the square into 4 parts and save them in queue_bboxes
                if _calculate_distance(lat_t, lon_l, lat_b, lon_r) > 1:
                    lat_t, lon_l, lat_b, lon_r = bbox
                    lat_middle = (lat_t + lat_b) / 2
                    lon_middle = (lon_l + lon_r) / 2
                    bbox1 = (lat_t, lon_l, lat_middle, lon_middle)
                    bbox2 = (lat_t, lon_middle, lat_middle, lon_r)
                    bbox3 = (lat_middle, lon_l, lat_b, lon_middle)
                    bbox4 = (lat_middle, lon_middle, lat_b, lon_r)
                    queue_bboxes.extend([bbox1, bbox2, bbox3, bbox4])

            # run the per-bbox requests concurrently on the shared thread pool
            work_bboxes_results = []
            try:
                for r in self._executor.map(_maps_page, tasks):
                    if r:
                        work_bboxes_results.extend(r)
            except Exception as e:
                raise e

            for x in work_bboxes_results:
                if isinstance(x, list):
                    results.extend(x)
                elif isinstance(x, dict):
                    results.append(x)

            work_bboxes = queue_bboxes
            if not max_results or len(results) >= max_results or len(work_bboxes_results) == 0:
                break

        return list(islice(results, max_results))
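
    # Usage sketch: seed the search with either a place/address query or an
    # explicit coordinate plus radius.
    #   WEBS().maps("coffee shop", place="Berlin", max_results=20)
    #   WEBS().maps("pharmacy", latitude="52.52", longitude="13.40", radius=2)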

    def translate(
        self, keywords: Union[List[str], str], from_: Optional[str] = None, to: str = "en"
    ) -> List[Dict[str, str]]:
        """DuckDuckGo translate.

        Args:
            keywords: string or list of strings to translate.
            from_: language to translate from (detected automatically). Defaults to None.
            to: language to translate to. Defaults to "en".

        Returns:
            List of dictionaries with translated keywords.

        Raises:
            WebscoutE: Base exception for webscout errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd("translate")

        payload = {
            "vqd": vqd,
            "query": "translate",
            "to": to,
        }
        if from_:
            payload["from"] = from_

        def _translate_keyword(keyword: str) -> Dict[str, str]:
            resp_content = self._get_url(
                "POST",
                "https://duckduckgo.com/translation.js",
                params=payload,
                content=keyword.encode(),
            )
            page_data: Dict[str, str] = json_loads(resp_content)
            page_data["original"] = keyword
            return page_data

        if isinstance(keywords, str):
            keywords = [keywords]

        results = []
        try:
            for r in self._executor.map(_translate_keyword, keywords):
                results.append(r)
        except Exception as e:
            raise e

        return results
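
    # Usage sketch: accepts a single string or a list; each result dict gains
    # an "original" key alongside the fields returned by the service.
    #   WEBS().translate(["hello", "world"], to="de")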

# --- YouTube transcript retrieval ---

import http.cookiejar as cookiejar
import sys
from xml.etree import ElementTree

from requests import HTTPError
import html


def unescape(string):
    return html.unescape(string)


WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'


class TranscriptRetrievalError(Exception):
    """
    Base class for exceptions raised when a transcript cannot be retrieved.
    """
    ERROR_MESSAGE = '\nCould not retrieve a transcript for the video {video_url}!'
    CAUSE_MESSAGE_INTRO = ' This is most likely caused by:\n\n{cause}'
    CAUSE_MESSAGE = ''
    GITHUB_REFERRAL = (
        '\n\nIf you are sure that the described cause is not responsible for this error '
        'and that a transcript should be retrievable, please create an issue at '
        'https://github.com/OE-LUCIFER/Webscout/issues. '
        'Please add which version of webscout you are using '
        'and provide the information needed to replicate the error. '
    )

    def __init__(self, video_id):
        self.video_id = video_id
        super(TranscriptRetrievalError, self).__init__(self._build_error_message())

    def _build_error_message(self):
        cause = self.cause
        error_message = self.ERROR_MESSAGE.format(video_url=WATCH_URL.format(video_id=self.video_id))

        if cause:
            error_message += self.CAUSE_MESSAGE_INTRO.format(cause=cause) + self.GITHUB_REFERRAL

        return error_message

    @property
    def cause(self):
        return self.CAUSE_MESSAGE


class YouTubeRequestFailedError(TranscriptRetrievalError):
    CAUSE_MESSAGE = 'Request to YouTube failed: {reason}'

    def __init__(self, video_id, http_error):
        self.reason = str(http_error)
        super(YouTubeRequestFailedError, self).__init__(video_id)

    @property
    def cause(self):
        return self.CAUSE_MESSAGE.format(reason=self.reason)


class VideoUnavailableError(TranscriptRetrievalError):
    CAUSE_MESSAGE = 'The video is no longer available'


class InvalidVideoIdError(TranscriptRetrievalError):
    CAUSE_MESSAGE = (
        'You provided an invalid video id. Make sure you are using the video id and NOT the url!\n\n'
        'Do NOT run: `YouTubeTranscriptApi.get_transcript("https://www.youtube.com/watch?v=1234")`\n'
        'Instead run: `YouTubeTranscriptApi.get_transcript("1234")`'
    )


class TooManyRequestsError(TranscriptRetrievalError):
    CAUSE_MESSAGE = (
        'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. '
        'One of the following things can be done to work around this:\n'
        '- Manually solve the captcha in a browser and export the cookie. '
        'Read here how to use that cookie with '
        'youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n'
        '- Use a different IP address\n'
        '- Wait until the ban on your IP has been lifted'
    )


class TranscriptsDisabledError(TranscriptRetrievalError):
    CAUSE_MESSAGE = 'Subtitles are disabled for this video'


class NoTranscriptAvailableError(TranscriptRetrievalError):
    CAUSE_MESSAGE = 'No transcripts are available for this video'


class NotTranslatableError(TranscriptRetrievalError):
    CAUSE_MESSAGE = 'The requested language is not translatable'


class TranslationLanguageNotAvailableError(TranscriptRetrievalError):
    CAUSE_MESSAGE = 'The requested translation language is not available'


class CookiePathInvalidError(TranscriptRetrievalError):
    CAUSE_MESSAGE = 'The provided cookie file was unable to be loaded'


class CookiesInvalidError(TranscriptRetrievalError):
    CAUSE_MESSAGE = 'The cookies provided are not valid (may have expired)'


class FailedToCreateConsentCookieError(TranscriptRetrievalError):
    CAUSE_MESSAGE = 'Failed to automatically give consent to saving cookies'


class NoTranscriptFoundError(TranscriptRetrievalError):
    CAUSE_MESSAGE = (
        'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
        '{transcript_data}'
    )

    def __init__(self, video_id, requested_language_codes, transcript_data):
        self._requested_language_codes = requested_language_codes
        self._transcript_data = transcript_data
        super(NoTranscriptFoundError, self).__init__(video_id)

    @property
    def cause(self):
        return self.CAUSE_MESSAGE.format(
            requested_language_codes=self._requested_language_codes,
            transcript_data=str(self._transcript_data),
        )


def _raise_http_errors(response, video_id):
    try:
        response.raise_for_status()
        return response
    except HTTPError as error:
        raise YouTubeRequestFailedError(video_id, error)


class TranscriptListFetcher(object):
    def __init__(self, http_client):
        self._http_client = http_client

    def fetch(self, video_id):
        return TranscriptList.build(
            self._http_client,
            video_id,
            self._extract_captions_json(self._fetch_video_html(video_id), video_id),
        )

    def _extract_captions_json(self, html, video_id):
        splitted_html = html.split('"captions":')

        if len(splitted_html) <= 1:
            if video_id.startswith('http://') or video_id.startswith('https://'):
                raise InvalidVideoIdError(video_id)
            if 'class="g-recaptcha"' in html:
                raise TooManyRequestsError(video_id)
            if '"playabilityStatus":' not in html:
                raise VideoUnavailableError(video_id)

            raise TranscriptsDisabledError(video_id)

        captions_json = json.loads(
            splitted_html[1].split(',"videoDetails')[0].replace('\n', '')
        ).get('playerCaptionsTracklistRenderer')
        if captions_json is None:
            raise TranscriptsDisabledError(video_id)

        if 'captionTracks' not in captions_json:
            raise TranscriptsDisabledError(video_id)

        return captions_json

    def _create_consent_cookie(self, html, video_id):
        match = re.search('name="v" value="(.*?)"', html)
        if match is None:
            raise FailedToCreateConsentCookieError(video_id)
        self._http_client.cookies.set('CONSENT', 'YES+' + match.group(1), domain='.youtube.com')

    def _fetch_video_html(self, video_id):
        html = self._fetch_html(video_id)
        if 'action="https://consent.youtube.com/s"' in html:
            self._create_consent_cookie(html, video_id)
            html = self._fetch_html(video_id)
            if 'action="https://consent.youtube.com/s"' in html:
                raise FailedToCreateConsentCookieError(video_id)
        return html

    def _fetch_html(self, video_id):
        response = self._http_client.get(WATCH_URL.format(video_id=video_id), headers={'Accept-Language': 'en-US'})
        return unescape(_raise_http_errors(response, video_id).text)
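
# Usage sketch ("dQw4w9WgXcQ" is an arbitrary example id): the fetcher wraps a
# requests.Session and returns the TranscriptList defined below.
#   session = requests.Session()
#   transcript_list = TranscriptListFetcher(session).fetch("dQw4w9WgXcQ")
#   transcript = transcript_list.find_transcript(["en"])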


class TranscriptList(object):
    """
    This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
    for a given YouTube video. It also provides functionality to search for a transcript in a given language.
    """

    def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
        """
        The constructor is only for internal use. Use the static build method instead.

        :param video_id: the id of the video this TranscriptList is for
        :type video_id: str
        :param manually_created_transcripts: dict mapping language codes to the manually created transcripts
        :type manually_created_transcripts: dict[str, Transcript]
        :param generated_transcripts: dict mapping language codes to the generated transcripts
        :type generated_transcripts: dict[str, Transcript]
        :param translation_languages: list of languages to which the transcripts can be translated
        :type translation_languages: list[dict[str, str]]
        """
        self.video_id = video_id
        self._manually_created_transcripts = manually_created_transcripts
        self._generated_transcripts = generated_transcripts
        self._translation_languages = translation_languages

    @staticmethod
    def build(http_client, video_id, captions_json):
        """
        Factory method for TranscriptList.

        :param http_client: http client which is used to make the transcript retrieving http calls
        :type http_client: requests.Session
        :param video_id: the id of the video this TranscriptList is for
        :type video_id: str
        :param captions_json: the JSON parsed from the YouTube page's static HTML
        :type captions_json: dict
        :return: the created TranscriptList
        :rtype TranscriptList:
        """
        translation_languages = [
            {
                'language': translation_language['languageName']['simpleText'],
                'language_code': translation_language['languageCode'],
            } for translation_language in captions_json.get('translationLanguages', [])
        ]

        manually_created_transcripts = {}
        generated_transcripts = {}

        for caption in captions_json['captionTracks']:
            if caption.get('kind', '') == 'asr':
                transcript_dict = generated_transcripts
            else:
                transcript_dict = manually_created_transcripts

            transcript_dict[caption['languageCode']] = Transcript(
                http_client,
                video_id,
                caption['baseUrl'],
                caption['name']['simpleText'],
                caption['languageCode'],
                caption.get('kind', '') == 'asr',
                translation_languages if caption.get('isTranslatable', False) else [],
            )

        return TranscriptList(
            video_id,
            manually_created_transcripts,
            generated_transcripts,
            translation_languages,
        )

    def __iter__(self):
        return iter(list(self._manually_created_transcripts.values()) + list(self._generated_transcripts.values()))

    def find_transcript(self, language_codes):
        """
        Finds a transcript for a given language code. Manually created transcripts are returned first and only if none
        are found, generated transcripts are used. If you only want generated transcripts use
        `find_generated_transcript` instead.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
        ['de', 'en'] it will first try to fetch the German transcript (de) and then fetch the English transcript (en) if
        it fails to do so.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype Transcript:
        :raises: NoTranscriptFoundError
        """
        return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts])

    def find_generated_transcript(self, language_codes):
        """
        Finds an automatically generated transcript for a given language code.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
        ['de', 'en'] it will first try to fetch the German transcript (de) and then fetch the English transcript (en) if
        it fails to do so.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype Transcript:
        :raises: NoTranscriptFoundError
        """
        return self._find_transcript(language_codes, [self._generated_transcripts])

    def find_manually_created_transcript(self, language_codes):
        """
        Finds a manually created transcript for a given language code.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
        ['de', 'en'] it will first try to fetch the German transcript (de) and then fetch the English transcript (en) if
        it fails to do so.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype Transcript:
        :raises: NoTranscriptFoundError
        """
        return self._find_transcript(language_codes, [self._manually_created_transcripts])

    def _find_transcript(self, language_codes, transcript_dicts):
        for language_code in language_codes:
            for transcript_dict in transcript_dicts:
                if language_code in transcript_dict:
                    return transcript_dict[language_code]

        raise NoTranscriptFoundError(
            self.video_id,
            language_codes,
            self
        )

    def __str__(self):
        return (
            'For this video ({video_id}) transcripts are available in the following languages:\n\n'
            '(MANUALLY CREATED)\n'
            '{available_manually_created_transcript_languages}\n\n'
            '(GENERATED)\n'
            '{available_generated_transcripts}\n\n'
            '(TRANSLATION LANGUAGES)\n'
            '{available_translation_languages}'
        ).format(
            video_id=self.video_id,
            available_manually_created_transcript_languages=self._get_language_description(
                str(transcript) for transcript in self._manually_created_transcripts.values()
            ),
            available_generated_transcripts=self._get_language_description(
                str(transcript) for transcript in self._generated_transcripts.values()
            ),
            available_translation_languages=self._get_language_description(
                '{language_code} ("{language}")'.format(
                    language=translation_language['language'],
                    language_code=translation_language['language_code'],
                ) for translation_language in self._translation_languages
            )
        )

    def _get_language_description(self, transcript_strings):
        description = '\n'.join(' - {transcript}'.format(transcript=transcript) for transcript in transcript_strings)
        return description if description else 'None'
1496
+
1497
+
+class Transcript(object):
+    def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
+        """
+        You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
+        TranscriptList.
+
+        :param http_client: http client which is used to make the transcript retrieving http calls
+        :type http_client: requests.Session
+        :param video_id: the id of the video this transcript is for
+        :type video_id: str
+        :param url: the url which needs to be called to fetch the transcript
+        :param language: the name of the language this transcript uses
+        :param language_code: the language code of this transcript (e.g. 'en')
+        :param is_generated: whether this transcript was automatically generated by YouTube
+        :param translation_languages: the languages this transcript can be translated to
+        """
+        self._http_client = http_client
+        self.video_id = video_id
+        self._url = url
+        self.language = language
+        self.language_code = language_code
+        self.is_generated = is_generated
+        self.translation_languages = translation_languages
+        self._translation_languages_dict = {
+            translation_language['language_code']: translation_language['language']
+            for translation_language in translation_languages
+        }
+
+    def fetch(self, preserve_formatting=False):
+        """
+        Loads the actual transcript data.
+
+        :param preserve_formatting: whether to keep select HTML text formatting
+        :type preserve_formatting: bool
+        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
+        :rtype: [{'text': str, 'start': float, 'duration': float}]
+        """
+        response = self._http_client.get(self._url, headers={'Accept-Language': 'en-US'})
+        return _TranscriptParser(preserve_formatting=preserve_formatting).parse(
+            _raise_http_errors(response, self.video_id).text,
+        )
+
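+    # Illustrative shape of the value returned by fetch() (the snippet values are made up):
+    #
+    #   [{'text': 'Hey there', 'start': 7.58, 'duration': 6.13}, ...]
+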
+    def __str__(self):
+        return '{language_code} ("{language}"){translation_description}'.format(
+            language=self.language,
+            language_code=self.language_code,
+            translation_description='[TRANSLATABLE]' if self.is_translatable else ''
+        )
+
+    @property
+    def is_translatable(self):
+        return len(self.translation_languages) > 0
+
+    def translate(self, language_code):
+        if not self.is_translatable:
+            raise NotTranslatableError(self.video_id)
+
+        if language_code not in self._translation_languages_dict:
+            raise TranslationLanguageNotAvailableError(self.video_id)
+
+        return Transcript(
+            self._http_client,
+            self.video_id,
+            '{url}&tlang={language_code}'.format(url=self._url, language_code=language_code),
+            self._translation_languages_dict[language_code],
+            language_code,
+            True,
+            [],
+        )
+
+
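+# Illustrative sketch of the translation flow above; the video id and target language
+# are hypothetical. translate() returns a new Transcript whose URL carries '&tlang=<code>':
+#
+#   transcript = transcriber.list_transcripts('VIDEO_ID').find_transcript(['en'])
+#   if transcript.is_translatable:
+#       german_snippets = transcript.translate('de').fetch()
+
+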
+class _TranscriptParser(object):
+    # Inline formatting tags that may be kept when preserve_formatting is enabled.
+    _FORMATTING_TAGS = [
+        'strong',  # important
+        'em',  # emphasized
+        'b',  # bold
+        'i',  # italic
+        'mark',  # marked
+        'small',  # smaller
+        'del',  # deleted
+        'ins',  # inserted
+        'sub',  # subscript
+        'sup',  # superscript
+    ]
+
+    def __init__(self, preserve_formatting=False):
+        self._html_regex = self._get_html_regex(preserve_formatting)
+
+    def _get_html_regex(self, preserve_formatting):
+        if preserve_formatting:
+            # Match any tag except the whitelisted formatting tags above.
+            formats_regex = '|'.join(self._FORMATTING_TAGS)
+            formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>'
+            html_regex = re.compile(formats_regex, re.IGNORECASE)
+        else:
+            # Match (and later strip) every HTML tag.
+            html_regex = re.compile(r'<[^>]*>', re.IGNORECASE)
+        return html_regex
+
+    def parse(self, plain_data):
+        return [
+            {
+                'text': re.sub(self._html_regex, '', unescape(xml_element.text)),
+                'start': float(xml_element.attrib['start']),
+                'duration': float(xml_element.attrib.get('dur', '0.0')),
+            }
+            for xml_element in ElementTree.fromstring(plain_data)
+            if xml_element.text is not None
+        ]
+
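+# Illustrative behaviour of the two regexes above (the snippet is made up): applied to
+# '<b>hi</b> <font>there</font>', the preserve_formatting=False regex strips every tag
+# ('hi there'), while the preserve_formatting=True regex keeps the whitelisted <b> tag
+# ('<b>hi</b> there').
+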
+WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
+
+
+class transcriber(object):
+    @classmethod
+    def list_transcripts(cls, video_id, proxies=None, cookies=None):
+        with requests.Session() as http_client:
+            if cookies:
+                http_client.cookies = cls._load_cookies(cookies, video_id)
+            http_client.proxies = proxies if proxies else {}
+            return TranscriptListFetcher(http_client).fetch(video_id)
+
+    @classmethod
+    def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None,
+                        cookies=None, preserve_formatting=False):
+        assert isinstance(video_ids, list), "`video_ids` must be a list of strings"
+
+        data = {}
+        unretrievable_videos = []
+
+        for video_id in video_ids:
+            try:
+                data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies, preserve_formatting)
+            except Exception:
+                if not continue_after_error:
+                    raise  # re-raise with the original traceback intact
+
+                unretrievable_videos.append(video_id)
+
+        return data, unretrievable_videos
+
+    @classmethod
+    def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None, preserve_formatting=False):
+        assert isinstance(video_id, str), "`video_id` must be a string"
+        return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch(preserve_formatting=preserve_formatting)
+
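+    # Minimal usage sketch for the two public entry points above (video ids are hypothetical):
+    #
+    #   snippets = transcriber.get_transcript('VIDEO_ID', languages=('de', 'en'))
+    #   data, failed = transcriber.get_transcripts(['VIDEO_ID'], continue_after_error=True)
+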
+    @classmethod
+    def _load_cookies(cls, cookies, video_id):
+        try:
+            cookie_jar = cookiejar.MozillaCookieJar()
+            cookie_jar.load(cookies)
+            if not cookie_jar:
+                raise CookiesInvalidError(video_id)
+            return cookie_jar
+        except (cookiejar.LoadError, OSError):
+            # A bare `except:` here would also swallow the CookiesInvalidError raised above;
+            # only file and parse errors should map to CookiePathInvalidError.
+            raise CookiePathInvalidError(video_id)
+
+class LLM:
+    def __init__(self, model: str, system_message: str = "You are a Helpful AI."):
+        self.model = model
+        self.conversation_history = [{"role": "system", "content": system_message}]
+
+    def chat(self, messages: List[Dict[str, str]]) -> Union[str, None]:
+        url = "https://api.deepinfra.com/v1/openai/chat/completions"
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
+            'Accept-Language': 'en,fr-FR;q=0.9,fr;q=0.8,es-ES;q=0.7,es;q=0.6,en-US;q=0.5,am;q=0.4,de;q=0.3',
+            'Cache-Control': 'no-cache',
+            'Connection': 'keep-alive',
+            'Content-Type': 'application/json',
+            'Origin': 'https://deepinfra.com',
+            'Pragma': 'no-cache',
+            'Referer': 'https://deepinfra.com/',
+            'Sec-Fetch-Dest': 'empty',
+            'Sec-Fetch-Mode': 'cors',
+            'Sec-Fetch-Site': 'same-site',
+            'X-Deepinfra-Source': 'web-embed',
+            'accept': 'text/event-stream',
+            'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
+            'sec-ch-ua-mobile': '?0',
+            'sec-ch-ua-platform': '"macOS"'
+        }
+        data = json.dumps(
+            {
+                'model': self.model,
+                'messages': messages,
+                'temperature': 0.7,
+                'max_tokens': 16000,
+                'stop': [],
+                'stream': False  # the response is parsed as a single JSON body, so streaming must stay off
+            }, separators=(',', ':')
+        )
+        try:
+            result = requests.post(url=url, data=data, headers=headers)
+            return result.json()['choices'][0]['message']['content']
+        except (requests.RequestException, KeyError, IndexError, ValueError):
+            # network failures and unexpected response shapes both yield None
+            return None
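+
+
+# Minimal usage sketch for LLM; the model id is an assumption, any chat model hosted on
+# the DeepInfra OpenAI-compatible endpoint should work:
+#
+#   llm = LLM(model='meta-llama/Meta-Llama-3-70B-Instruct')
+#   reply = llm.chat([{"role": "user", "content": "Hello!"}])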