renwei2024 commited on
Commit
5067f71
·
1 Parent(s): acb85aa

Add tool text_web_browser and update requirements.txt

Browse files
Files changed (3) hide show
  1. requirements.txt +2 -0
  2. tools/cookies.py +715 -0
  3. tools/text_web_browser.py +567 -0
requirements.txt CHANGED
@@ -11,3 +11,5 @@ pydub
11
  SpeechRecognition
12
  beautifulsoup4
13
  youtube-transcript-api
 
 
 
11
  SpeechRecognition
12
  beautifulsoup4
13
  youtube-transcript-api
14
+ pathvalidate
15
+ serpapi
tools/cookies.py ADDED
@@ -0,0 +1,715 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from requests.cookies import RequestsCookieJar
2
+
3
+
4
+ COOKIES_LIST = [
5
+ {
6
+ "domain": ".youtube.com",
7
+ "expirationDate": 1718884961,
8
+ "hostOnly": False,
9
+ "httpOnly": False,
10
+ "name": "ST-xuwub9",
11
+ "path": "/",
12
+ "sameSite": None,
13
+ "secure": False,
14
+ "session": False,
15
+ "storeId": None,
16
+ "value": "session_logininfo=AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0%3AQUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
17
+ },
18
+ {
19
+ "domain": ".youtube.com",
20
+ "expirationDate": 1753004444.745411,
21
+ "hostOnly": False,
22
+ "httpOnly": True,
23
+ "name": "__Secure-YEC",
24
+ "path": "/",
25
+ "sameSite": "lax",
26
+ "secure": True,
27
+ "session": False,
28
+ "storeId": None,
29
+ "value": "CgtRVnI5LW1zRHlQVSjbtNCzBjIhCgJGUhIbEhcSFRMLFBUWFwwYGRobHB0eHw4PIBAREiAk",
30
+ },
31
+ {
32
+ "domain": ".youtube.com",
33
+ "expirationDate": 1753434620.050824,
34
+ "hostOnly": False,
35
+ "httpOnly": True,
36
+ "name": "__Secure-3PSID",
37
+ "path": "/",
38
+ "sameSite": "no_restriction",
39
+ "secure": True,
40
+ "session": False,
41
+ "storeId": None,
42
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB4ezJ_bdWu46a7YwObVn44wACgYKAakSARQSFQHGX2MicJcTzecTKH6bHzqU6TMbTxoVAUF8yKqQYK-MoI6Ql3vI2oYTB3E-0076",
43
+ },
44
+ {
45
+ "domain": ".youtube.com",
46
+ "expirationDate": 1750420959.974642,
47
+ "hostOnly": False,
48
+ "httpOnly": False,
49
+ "name": "SIDCC",
50
+ "path": "/",
51
+ "sameSite": None,
52
+ "secure": False,
53
+ "session": False,
54
+ "storeId": None,
55
+ "value": "AKEyXzWQZauHKOo8t87zoEcjaVNIYUX54ohoWXT-tX4aAhEuZzIIptxZAcNkHuG2oDXYL6t-lw",
56
+ },
57
+ {
58
+ "domain": ".youtube.com",
59
+ "expirationDate": 1753434620.050652,
60
+ "hostOnly": False,
61
+ "httpOnly": False,
62
+ "name": "SID",
63
+ "path": "/",
64
+ "sameSite": None,
65
+ "secure": False,
66
+ "session": False,
67
+ "storeId": None,
68
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB6VHrZcC3gBAsFPbCQ0gF5AACgYKAYkSARQSFQHGX2Mi9kt0gHg5CxCYSkLQGHWaeBoVAUF8yKre_V6r3jZVak6JV4o2Q0FL0076",
69
+ },
70
+ {
71
+ "domain": ".youtube.com",
72
+ "expirationDate": 1750420958.397534,
73
+ "hostOnly": False,
74
+ "httpOnly": True,
75
+ "name": "__Secure-1PSIDTS",
76
+ "path": "/",
77
+ "sameSite": None,
78
+ "secure": True,
79
+ "session": False,
80
+ "storeId": None,
81
+ "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
82
+ },
83
+ {
84
+ "domain": ".youtube.com",
85
+ "expirationDate": 1753433494.44729,
86
+ "hostOnly": False,
87
+ "httpOnly": False,
88
+ "name": "_ga_M0180HEFCY",
89
+ "path": "/",
90
+ "sameSite": None,
91
+ "secure": False,
92
+ "session": False,
93
+ "storeId": None,
94
+ "value": "GS1.1.1718871908.1.0.1718873494.0.0.0",
95
+ },
96
+ {
97
+ "domain": ".youtube.com",
98
+ "expirationDate": 1753434620.050933,
99
+ "hostOnly": False,
100
+ "httpOnly": False,
101
+ "name": "SAPISID",
102
+ "path": "/",
103
+ "sameSite": None,
104
+ "secure": True,
105
+ "session": False,
106
+ "storeId": None,
107
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
108
+ },
109
+ {
110
+ "domain": ".youtube.com",
111
+ "expirationDate": 1750420959.974764,
112
+ "hostOnly": False,
113
+ "httpOnly": True,
114
+ "name": "__Secure-1PSIDCC",
115
+ "path": "/",
116
+ "sameSite": None,
117
+ "secure": True,
118
+ "session": False,
119
+ "storeId": None,
120
+ "value": "AKEyXzWHDSoXGCZpZhPxRrnC7B1s8zGIUjeMVyvgtQfsm1fs92lXPtFEI_td9LBUyqVUe0xK",
121
+ },
122
+ {
123
+ "domain": ".youtube.com",
124
+ "expirationDate": 1753434620.050881,
125
+ "hostOnly": False,
126
+ "httpOnly": True,
127
+ "name": "SSID",
128
+ "path": "/",
129
+ "sameSite": None,
130
+ "secure": True,
131
+ "session": False,
132
+ "storeId": None,
133
+ "value": "AmlwXHnQvOQ10LVd-",
134
+ },
135
+ {
136
+ "domain": ".youtube.com",
137
+ "expirationDate": 1753434620.050959,
138
+ "hostOnly": False,
139
+ "httpOnly": False,
140
+ "name": "__Secure-1PAPISID",
141
+ "path": "/",
142
+ "sameSite": None,
143
+ "secure": True,
144
+ "session": False,
145
+ "storeId": None,
146
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
147
+ },
148
+ {
149
+ "domain": ".youtube.com",
150
+ "expirationDate": 1753434620.050795,
151
+ "hostOnly": False,
152
+ "httpOnly": True,
153
+ "name": "__Secure-1PSID",
154
+ "path": "/",
155
+ "sameSite": None,
156
+ "secure": True,
157
+ "session": False,
158
+ "storeId": None,
159
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBBrlk7lRpKQGywAHEon7WGQAACgYKAQsSARQSFQHGX2MirAmnSRdZl6GPG6KLd4hOihoVAUF8yKoV17Tcj1a_OenIOkf2wBjO0076",
160
+ },
161
+ {
162
+ "domain": ".youtube.com",
163
+ "expirationDate": 1753434620.050993,
164
+ "hostOnly": False,
165
+ "httpOnly": False,
166
+ "name": "__Secure-3PAPISID",
167
+ "path": "/",
168
+ "sameSite": "no_restriction",
169
+ "secure": True,
170
+ "session": False,
171
+ "storeId": None,
172
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
173
+ },
174
+ {
175
+ "domain": ".youtube.com",
176
+ "expirationDate": 1750420959.974815,
177
+ "hostOnly": False,
178
+ "httpOnly": True,
179
+ "name": "__Secure-3PSIDCC",
180
+ "path": "/",
181
+ "sameSite": "no_restriction",
182
+ "secure": True,
183
+ "session": False,
184
+ "storeId": None,
185
+ "value": "AKEyXzXM5UjKUEXwSHVmRAIo6hGHA4G63adj3EE1VdNriD0f38jZQbsUKiD4LQbA3BValmTFDg",
186
+ },
187
+ {
188
+ "domain": ".youtube.com",
189
+ "expirationDate": 1750420958.397647,
190
+ "hostOnly": False,
191
+ "httpOnly": True,
192
+ "name": "__Secure-3PSIDTS",
193
+ "path": "/",
194
+ "sameSite": "no_restriction",
195
+ "secure": True,
196
+ "session": False,
197
+ "storeId": None,
198
+ "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
199
+ },
200
+ {
201
+ "domain": ".youtube.com",
202
+ "expirationDate": 1753434620.050908,
203
+ "hostOnly": False,
204
+ "httpOnly": False,
205
+ "name": "APISID",
206
+ "path": "/",
207
+ "sameSite": None,
208
+ "secure": False,
209
+ "session": False,
210
+ "storeId": None,
211
+ "value": "IlQWLPjdNqziwCrV/ANG7Z4x5FF-IBxbZk",
212
+ },
213
+ {
214
+ "domain": ".youtube.com",
215
+ "expirationDate": 1753434620.050855,
216
+ "hostOnly": False,
217
+ "httpOnly": True,
218
+ "name": "HSID",
219
+ "path": "/",
220
+ "sameSite": None,
221
+ "secure": False,
222
+ "session": False,
223
+ "storeId": None,
224
+ "value": "AasA7hmRuTFv7vjoq",
225
+ },
226
+ {
227
+ "domain": ".youtube.com",
228
+ "expirationDate": 1753435873.577793,
229
+ "hostOnly": False,
230
+ "httpOnly": True,
231
+ "name": "LOGIN_INFO",
232
+ "path": "/",
233
+ "sameSite": "no_restriction",
234
+ "secure": True,
235
+ "session": False,
236
+ "storeId": None,
237
+ "value": "AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0:QUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
238
+ },
239
+ {
240
+ "domain": ".youtube.com",
241
+ "expirationDate": 1753444956.555608,
242
+ "hostOnly": False,
243
+ "httpOnly": False,
244
+ "name": "PREF",
245
+ "path": "/",
246
+ "sameSite": None,
247
+ "secure": True,
248
+ "session": False,
249
+ "storeId": None,
250
+ "value": "f4=4000000&f6=40000000&tz=Europe.Paris&f5=30000&f7=100",
251
+ },
252
+ ]
253
+
254
+ COOKIES_LIST += [
255
+ {
256
+ "domain": ".www.researchgate.net",
257
+ "hostOnly": False,
258
+ "httpOnly": True,
259
+ "name": "isInstIp",
260
+ "path": "/",
261
+ "sameSite": None,
262
+ "secure": True,
263
+ "session": True,
264
+ "storeId": None,
265
+ "value": "False",
266
+ },
267
+ {
268
+ "domain": ".researchgate.net",
269
+ "expirationDate": 1734423981,
270
+ "hostOnly": False,
271
+ "httpOnly": False,
272
+ "name": "__eoi",
273
+ "path": "/",
274
+ "sameSite": None,
275
+ "secure": False,
276
+ "session": False,
277
+ "storeId": None,
278
+ "value": "ID=c26f752377373146:T=1718871981:RT=1718884914:S=AA-AfjZw-T_OOX2kW2LLaFzXImgc",
279
+ },
280
+ {
281
+ "domain": ".www.researchgate.net",
282
+ "expirationDate": 1753444909.646103,
283
+ "hostOnly": False,
284
+ "httpOnly": True,
285
+ "name": "ptc",
286
+ "path": "/",
287
+ "sameSite": None,
288
+ "secure": True,
289
+ "session": False,
290
+ "storeId": None,
291
+ "value": "RG1.8947708639250500550.1718872043",
292
+ },
293
+ {
294
+ "domain": ".researchgate.net",
295
+ "expirationDate": 1750507578,
296
+ "hostOnly": False,
297
+ "httpOnly": False,
298
+ "name": "euconsent-v2-didomi",
299
+ "path": "/",
300
+ "sameSite": "lax",
301
+ "secure": True,
302
+ "session": False,
303
+ "storeId": None,
304
+ "value": "CQAgmoAQAgmoAAHABBENA5EsAP_gAEPgAAYgJ2pB5G5UTWlBIG53YMskIAUFhFBoQEAgAACAAwIBSBIAIIwEAGAAIAgAICACAAIAIBIAIABAGAAAAAAAYIAAIAAIAAAQIAAKIAAAAAAAAgBQAAgIAgggEAAAgEBEABAAgAAAEIIAQNgACgAAACCAAAAAAAABAAAAAAAAQAAAAAAAYCQAAAJIAAAAACAIABAIAAAAAAAAAAAAAAAABBAAIJ2wPIAFAAXABQAFQALgAcAA8ACAAEgALwAZAA0ACIAEcAJgAUgAqgBcADEAGgAPQAfgBEACOAE4AMMAZYA0QBsgDkAHOAO4AfsBBwEIAItARwBHQC6gHUAO2Ae0A_4CHQEXgJ2AUOAo8BT4CpQFqALYAXmAwQBkgDLAGXANjAhCBG8CbAE3gJ1gTtAA.f_wACHwAAAAA",
305
+ },
306
+ {
307
+ "domain": ".researchgate.net",
308
+ "expirationDate": 1718885236,
309
+ "hostOnly": False,
310
+ "httpOnly": False,
311
+ "name": "_gat",
312
+ "path": "/",
313
+ "sameSite": None,
314
+ "secure": False,
315
+ "session": False,
316
+ "storeId": None,
317
+ "value": "1",
318
+ },
319
+ {
320
+ "domain": "www.researchgate.net",
321
+ "expirationDate": 1721477183,
322
+ "hostOnly": True,
323
+ "httpOnly": False,
324
+ "name": "_pbjs_userid_consent_data",
325
+ "path": "/",
326
+ "sameSite": "lax",
327
+ "secure": False,
328
+ "session": False,
329
+ "storeId": None,
330
+ "value": "3524755945110770",
331
+ },
332
+ {
333
+ "domain": ".researchgate.net",
334
+ "expirationDate": 1752567981,
335
+ "hostOnly": False,
336
+ "httpOnly": False,
337
+ "name": "__gads",
338
+ "path": "/",
339
+ "sameSite": None,
340
+ "secure": False,
341
+ "session": False,
342
+ "storeId": None,
343
+ "value": "ID=eca2adb88969c830:T=1718871981:RT=1718884914:S=ALNI_MY2qZchynrhWX6hWMlaI87Pcj9riQ",
344
+ },
345
+ {
346
+ "domain": ".researchgate.net",
347
+ "expirationDate": 1718886709.646173,
348
+ "hostOnly": False,
349
+ "httpOnly": True,
350
+ "name": "__cf_bm",
351
+ "path": "/",
352
+ "sameSite": "no_restriction",
353
+ "secure": True,
354
+ "session": False,
355
+ "storeId": None,
356
+ "value": "IkQ_J4ciBzKQduRvjqsfSmQu8UygDWbHeROO5JVccfo-1718884909-1.0.1.1-qvNGEdbfI0HfhFP6kwe7R7mkTqODNhFuKhs72lLly6K2BOPMG3kbahpQFGvPK0U8FUfkznkq65gngd1sWj7sDA",
357
+ },
358
+ {
359
+ "domain": ".researchgate.net",
360
+ "expirationDate": 1752567981,
361
+ "hostOnly": False,
362
+ "httpOnly": False,
363
+ "name": "__gpi",
364
+ "path": "/",
365
+ "sameSite": None,
366
+ "secure": False,
367
+ "session": False,
368
+ "storeId": None,
369
+ "value": "UID=00000e4e9aa2e6f2:T=1718871981:RT=1718884914:S=ALNI_MYFNrgzkKn7K6Bd2y8hC6GJCvDiSg",
370
+ },
371
+ {
372
+ "domain": ".researchgate.net",
373
+ "hostOnly": False,
374
+ "httpOnly": True,
375
+ "name": "_cfuvid",
376
+ "path": "/",
377
+ "sameSite": "no_restriction",
378
+ "secure": True,
379
+ "session": True,
380
+ "storeId": None,
381
+ "value": "_GPmGZkBymiH3UiqTqzakEpi98br3nfFUWC2_u_wqkc-1718884909785-0.0.1.1-604800000",
382
+ },
383
+ {
384
+ "domain": ".researchgate.net",
385
+ "expirationDate": 1753445177.271667,
386
+ "hostOnly": False,
387
+ "httpOnly": False,
388
+ "name": "_ga",
389
+ "path": "/",
390
+ "sameSite": None,
391
+ "secure": False,
392
+ "session": False,
393
+ "storeId": None,
394
+ "value": "GA1.1.1525244793.1718885177",
395
+ },
396
+ {
397
+ "domain": ".researchgate.net",
398
+ "expirationDate": 1753445177.271482,
399
+ "hostOnly": False,
400
+ "httpOnly": False,
401
+ "name": "_ga_4P31SJ70EJ",
402
+ "path": "/",
403
+ "sameSite": None,
404
+ "secure": False,
405
+ "session": False,
406
+ "storeId": None,
407
+ "value": "GS1.1.1718885177.1.0.1718885177.0.0.0",
408
+ },
409
+ {
410
+ "domain": ".researchgate.net",
411
+ "expirationDate": 1718971576,
412
+ "hostOnly": False,
413
+ "httpOnly": False,
414
+ "name": "_gid",
415
+ "path": "/",
416
+ "sameSite": None,
417
+ "secure": False,
418
+ "session": False,
419
+ "storeId": None,
420
+ "value": "GA1.2.854907463.1718885177",
421
+ },
422
+ {
423
+ "domain": ".www.researchgate.net",
424
+ "expirationDate": 1750407982.506505,
425
+ "hostOnly": False,
426
+ "httpOnly": True,
427
+ "name": "did",
428
+ "path": "/",
429
+ "sameSite": None,
430
+ "secure": True,
431
+ "session": False,
432
+ "storeId": None,
433
+ "value": "1dWLO3C6am8l667Q4VUlBo0O1LI49Qi2Vw21SJEXHavBDYT56DI9007W5rYGVFVH",
434
+ },
435
+ {
436
+ "domain": ".researchgate.net",
437
+ "expirationDate": 1750507578,
438
+ "hostOnly": False,
439
+ "httpOnly": False,
440
+ "name": "didomi_token",
441
+ "path": "/",
442
+ "sameSite": "lax",
443
+ "secure": True,
444
+ "session": False,
445
+ "storeId": None,
446
+ "value": "eyJ1c2VyX2lkIjoiMTkwMzU4YTUtNWU2My02Y2UzLWJlNzAtZGFjNzVmYjdiY2ExIiwiY3JlYXRlZCI6IjIwMjQtMDYtMjBUMTI6MDY6MTYuODA2WiIsInVwZGF0ZWQiOiIyMDI0LTA2LTIwVDEyOjA2OjE4Ljc4MVoiLCJ2ZW5kb3JzIjp7ImVuYWJsZWQiOlsidHdpdHRlciIsImdvb2dsZSIsImM6bGlua2VkaW4tbWFya2V0aW5nLXNvbHV0aW9ucyIsImM6b3duZXJpcSIsImM6b21uaXR1cmUtYWRvYmUtYW5hbHl0aWNzIiwiYzp0ZWNobm9yYXRpLW1lZGlhIiwiYzppbnRlcmNvbSIsImM6aW50ZW50LWlxIiwiYzppcHJvbSIsImM6bGlua2VkaW4iLCJjOmFtYXpvbmFkdi16Y1hGTEI2WCIsImM6bWVkaWFuZXQtY1V3YUtFNnoiLCJjOmluZGV4ZXhjaC1OWkNRTTY4UCIsImM6emVvdGFwZ21iLWQ3YndtdGp3IiwiYzp0cmlwbGVsaWYtZGRKSDM0clkiLCJjOnJ0YmhvdXNlLWI4Y2RIOHRNIiwiYzptZHByaW1pcy1lYU4yOVdjUCIsImM6bG9vcG1lbGktVGRhWXRCUHEiLCJjOm1hZ25pdGVpbi05d1RZTHFSRCIsImM6Ymlkc3dpdGNoLWQ2N0V3N1c5IiwiYzpvcmFjbGVhZHYtcUhlREptQUwiLCJjOmdvb2dsZWFuYS00VFhuSmlnUiIsImM6bG90YW1lc29sLURIaTdMUmpNIiwiYzpuZXh0bWlsbGUtR0pyZlg4VWMiLCJjOm5yaWNodGVjLXFVVlEyUlFxIiwiYzpicml0ZXBvb2wtQldWeVdHeVUiLCJjOnRhcGFkaW5jLXFxY2tVN1BXIiwiYzppZDV0ZWNobi16Tk1KNGR3ZiIsImM6bWljcm9zb2Z0IiwiYzpwZXJtdXRpdmUtSjdpaHJlTWsiLCJjOm9wZXJhc29mdC1CY1hjRFZKTSIsImM6cG9zdGhvZy1Cakp4RmRGOSJdfSwicHVycG9zZXMiOnsiZW5hYmxlZCI6WyJnZW9sb2NhdGlvbl9kYXRhIiwiZGV2aWNlX2NoYXJhY3RlcmlzdGljcyJdfSwidmVuZG9yc19saSI6eyJlbmFibGVkIjpbImdvb2dsZSIsImM6b3BlcmFzb2Z0LUJjWGNEVkpNIl19LCJ2ZXJzaW9uIjoyLCJhYyI6IkRIU0FvQUZrQWNnQTVnSHFnUUhBeGdCNndEMTRJR0FRTkFqMEJJd0NTY0VyQUtCd1YtZ3MxQmgwREc0R09nQUEuREhTQW9BRmtBY2dBNWdIcWdRSEF4Z0I2d0QxNElHQVFOQWowQkl3Q1NjRXJBS0J3Vi1nczFCaDBERzRHT2dBQSJ9",
447
+ },
448
+ {
449
+ "domain": ".www.researchgate.net",
450
+ "hostOnly": False,
451
+ "httpOnly": True,
452
+ "name": "hasPdpNext",
453
+ "path": "/",
454
+ "sameSite": None,
455
+ "secure": True,
456
+ "session": True,
457
+ "storeId": None,
458
+ "value": "False",
459
+ },
460
+ {
461
+ "domain": ".researchgate.net",
462
+ "expirationDate": 1750421183,
463
+ "hostOnly": False,
464
+ "httpOnly": False,
465
+ "name": "ph_phc_ma1XTQyee96N1GML6qUTgLQRiDifnRcE9STiHTZ0CfZ_posthog",
466
+ "path": "/",
467
+ "sameSite": "lax",
468
+ "secure": True,
469
+ "session": False,
470
+ "storeId": None,
471
+ "value": "%7B%22distinct_id%22%3A%220190358a-56a1-7313-83b0-d13dddeac787%22%2C%22%24sesid%22%3A%5B1718885183223%2C%220190358a-56a1-7313-83b0-d13b2b87778d%22%2C1718885176993%5D%2C%22%24session_is_sampled%22%3Atrue%7D",
472
+ },
473
+ {
474
+ "domain": ".www.researchgate.net",
475
+ "hostOnly": False,
476
+ "httpOnly": True,
477
+ "name": "sid",
478
+ "path": "/",
479
+ "sameSite": None,
480
+ "secure": True,
481
+ "session": True,
482
+ "storeId": None,
483
+ "value": "qmH5Lc4f0CUJ3zeaxORcV0S8I8V1MuCFZtcIQqPYtv1XPejrbSLAQRbT50PL40TqeKQ1XsQDWt9gtYVzuL80bRmPjw6jn3cQ0ikNqW40maHcQ3JL2Vfa8ZZf0j7p35eJ",
484
+ },
485
+ ]
486
+
487
+ COOKIES_LIST += [
488
+ {
489
+ "domain": "github.com",
490
+ "hostOnly": True,
491
+ "httpOnly": True,
492
+ "name": "_gh_sess",
493
+ "path": "/",
494
+ "sameSite": "lax",
495
+ "secure": True,
496
+ "session": True,
497
+ "storeId": None,
498
+ "value": "P%2Fmof1avuqwHaUQUIJR%2FZYn7jqbT7lgGuTGjp1BGAFIG5UpNDusEE3b8dRjz0eATE5xPdPjLYFqMs%2FI9AOalKX4YuYfSEEnxCMawU01099b4o9Xzzcv%2BmecrmO0Q8q%2Bdq1h8SIv6nvPP7HzlFesl8ysafb9b%2F0q6dTArKdSOurasza8UgLSYD08ofA50Pcm0IG7CTzF8ZCizrGgGTMi%2F%2B7L3E17jav5PM1Sf2vQKg15Gbg1QIOppJJHzlufgQoZigqFv%2BWznaws0Tt7Y2lSFCw%3D%3D--CJRhqMXJnwOaJgk4--DhUErlL4GdROikEjKD4O9g%3D%3D",
499
+ },
500
+ {
501
+ "domain": ".github.com",
502
+ "expirationDate": 1750408875.763785,
503
+ "hostOnly": False,
504
+ "httpOnly": False,
505
+ "name": "_octo",
506
+ "path": "/",
507
+ "sameSite": "lax",
508
+ "secure": True,
509
+ "session": False,
510
+ "storeId": None,
511
+ "value": "GH1.1.728652011.1718872875",
512
+ },
513
+ {
514
+ "domain": ".github.com",
515
+ "expirationDate": 1750408875.763926,
516
+ "hostOnly": False,
517
+ "httpOnly": True,
518
+ "name": "logged_in",
519
+ "path": "/",
520
+ "sameSite": "lax",
521
+ "secure": True,
522
+ "session": False,
523
+ "storeId": None,
524
+ "value": "no",
525
+ },
526
+ {
527
+ "domain": ".github.com",
528
+ "hostOnly": False,
529
+ "httpOnly": False,
530
+ "name": "preferred_color_mode",
531
+ "path": "/",
532
+ "sameSite": "lax",
533
+ "secure": True,
534
+ "session": True,
535
+ "storeId": None,
536
+ "value": "dark",
537
+ },
538
+ {
539
+ "domain": ".github.com",
540
+ "hostOnly": False,
541
+ "httpOnly": False,
542
+ "name": "tz",
543
+ "path": "/",
544
+ "sameSite": "lax",
545
+ "secure": True,
546
+ "session": True,
547
+ "storeId": None,
548
+ "value": "Europe%2FParis",
549
+ },
550
+ ]
551
+
552
+ COOKIES_LIST += [
553
+ {
554
+ "domain": ".web.archive.org",
555
+ "expirationDate": 1718886430,
556
+ "hostOnly": False,
557
+ "httpOnly": False,
558
+ "name": "_gat",
559
+ "path": "/web/20201123221659/http://orcid.org/",
560
+ "sameSite": None,
561
+ "secure": False,
562
+ "session": False,
563
+ "storeId": None,
564
+ "value": "1",
565
+ },
566
+ {
567
+ "domain": ".web.archive.org",
568
+ "expirationDate": 1718972770,
569
+ "hostOnly": False,
570
+ "httpOnly": False,
571
+ "name": "_gid",
572
+ "path": "/web/20201123221659/http://orcid.org/",
573
+ "sameSite": None,
574
+ "secure": False,
575
+ "session": False,
576
+ "storeId": None,
577
+ "value": "GA1.2.402246368.1606169825",
578
+ },
579
+ {
580
+ "domain": ".web.archive.org",
581
+ "expirationDate": 1753446370.315621,
582
+ "hostOnly": False,
583
+ "httpOnly": False,
584
+ "name": "_ga",
585
+ "path": "/web/20201123221659/http://orcid.org/",
586
+ "sameSite": None,
587
+ "secure": False,
588
+ "session": False,
589
+ "storeId": None,
590
+ "value": "GA1.2.1301409987.1606169825",
591
+ },
592
+ {
593
+ "domain": ".web.archive.org",
594
+ "expirationDate": 1750422367,
595
+ "hostOnly": False,
596
+ "httpOnly": False,
597
+ "name": "_hjid",
598
+ "path": "/web/20201123221659/http://orcid.org/",
599
+ "sameSite": "lax",
600
+ "secure": False,
601
+ "session": False,
602
+ "storeId": None,
603
+ "value": "07f80263-a631-4bf4-8ffd-8fc8912085e2",
604
+ },
605
+ {
606
+ "domain": ".web.archive.org",
607
+ "expirationDate": 1718888167,
608
+ "hostOnly": False,
609
+ "httpOnly": False,
610
+ "name": "_hjFirstSeen",
611
+ "path": "/web/20201123221659/http://orcid.org/",
612
+ "sameSite": "lax",
613
+ "secure": False,
614
+ "session": False,
615
+ "storeId": None,
616
+ "value": "1",
617
+ },
618
+ ]
619
+ COOKIES_LIST += [
620
+ {
621
+ "domain": "orcid.org",
622
+ "hostOnly": True,
623
+ "httpOnly": False,
624
+ "name": "AWSELBCORS",
625
+ "path": "/",
626
+ "sameSite": "no_restriction",
627
+ "secure": True,
628
+ "session": True,
629
+ "storeId": None,
630
+ "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
631
+ },
632
+ {
633
+ "domain": ".orcid.org",
634
+ "expirationDate": 1753452454.637671,
635
+ "hostOnly": False,
636
+ "httpOnly": False,
637
+ "name": "_ga_9R61FWK9H5",
638
+ "path": "/",
639
+ "sameSite": None,
640
+ "secure": False,
641
+ "session": False,
642
+ "storeId": None,
643
+ "value": "GS1.1.1718892454.1.0.1718892454.0.0.0",
644
+ },
645
+ {
646
+ "domain": ".orcid.org",
647
+ "expirationDate": 1753452454.63421,
648
+ "hostOnly": False,
649
+ "httpOnly": False,
650
+ "name": "_ga",
651
+ "path": "/",
652
+ "sameSite": None,
653
+ "secure": False,
654
+ "session": False,
655
+ "storeId": None,
656
+ "value": "GA1.1.2021310691.1718892455",
657
+ },
658
+ {
659
+ "domain": "orcid.org",
660
+ "hostOnly": True,
661
+ "httpOnly": False,
662
+ "name": "AWSELB",
663
+ "path": "/",
664
+ "sameSite": None,
665
+ "secure": False,
666
+ "session": True,
667
+ "storeId": None,
668
+ "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
669
+ },
670
+ {
671
+ "domain": ".orcid.org",
672
+ "expirationDate": 1750428454,
673
+ "hostOnly": False,
674
+ "httpOnly": False,
675
+ "name": "OptanonAlertBoxClosed",
676
+ "path": "/",
677
+ "sameSite": "lax",
678
+ "secure": False,
679
+ "session": False,
680
+ "storeId": None,
681
+ "value": "2024-06-20T14:07:34.583Z",
682
+ },
683
+ {
684
+ "domain": ".orcid.org",
685
+ "expirationDate": 1750428454,
686
+ "hostOnly": False,
687
+ "httpOnly": False,
688
+ "name": "OptanonConsent",
689
+ "path": "/",
690
+ "sameSite": "lax",
691
+ "secure": False,
692
+ "session": False,
693
+ "storeId": None,
694
+ "value": "isGpcEnabled=0&datestamp=Thu+Jun+20+2024+16%3A07%3A34+GMT%2B0200+(heure+d%E2%80%99%C3%A9t%C3%A9+d%E2%80%99Europe+centrale)&version=202310.2.0&browserGpcFlag=0&isIABGlobal=False&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0004%3A1",
695
+ },
696
+ {
697
+ "domain": "orcid.org",
698
+ "hostOnly": True,
699
+ "httpOnly": False,
700
+ "name": "XSRF-TOKEN",
701
+ "path": "/",
702
+ "sameSite": None,
703
+ "secure": True,
704
+ "session": True,
705
+ "storeId": None,
706
+ "value": "6957be7a-bcb4-4d59-a522-ea9b6b210ed9",
707
+ },
708
+ ]
709
+
710
+ # Create a RequestsCookieJar instance
711
+ COOKIES = RequestsCookieJar()
712
+
713
+ # Add cookies to the jar
714
+ for cookie in COOKIES_LIST:
715
+ COOKIES.set(cookie["name"], cookie["value"], domain=cookie["domain"], path=cookie["path"])
tools/text_web_browser.py ADDED
@@ -0,0 +1,567 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource!
2
+ # https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
3
+ import mimetypes
4
+ import os
5
+ import pathlib
6
+ import re
7
+ import time
8
+ import uuid
9
+ from typing import Any
10
+ from urllib.parse import unquote, urljoin, urlparse
11
+
12
+ import pathvalidate
13
+ import requests
14
+ from serpapi import GoogleSearch
15
+
16
+ from smolagents import Tool
17
+
18
+ from .cookies import COOKIES
19
+ from .mdconvert import FileConversionException, MarkdownConverter, UnsupportedFormatException
20
+
21
+
22
+ class SimpleTextBrowser:
23
+ """(In preview) An extremely simple text-based web browser comparable to Lynx. Suitable for Agentic use."""
24
+
25
+ def __init__(
26
+ self,
27
+ start_page: str | None = None,
28
+ viewport_size: int | None = 1024 * 8,
29
+ downloads_folder: str | None | None = None,
30
+ serpapi_key: str | None | None = None,
31
+ request_kwargs: dict[str, Any] | None | None = None,
32
+ ):
33
+ self.start_page: str = start_page if start_page else "about:blank"
34
+ self.viewport_size = viewport_size # Applies only to the standard uri types
35
+ self.downloads_folder = downloads_folder
36
+ self.history: list[tuple[str, float]] = list()
37
+ self.page_title: str | None = None
38
+ self.viewport_current_page = 0
39
+ self.viewport_pages: list[tuple[int, int]] = list()
40
+ self.set_address(self.start_page)
41
+ self.serpapi_key = serpapi_key
42
+ self.request_kwargs = request_kwargs
43
+ self.request_kwargs["cookies"] = COOKIES
44
+ self._mdconvert = MarkdownConverter()
45
+ self._page_content: str = ""
46
+
47
+ self._find_on_page_query: str | None = None
48
+ self._find_on_page_last_result: int | None = None # Location of the last result
49
+
50
+ @property
51
+ def address(self) -> str:
52
+ """Return the address of the current page."""
53
+ return self.history[-1][0]
54
+
55
+ def set_address(self, uri_or_path: str, filter_year: int | None = None) -> None:
56
+ # TODO: Handle anchors
57
+ self.history.append((uri_or_path, time.time()))
58
+
59
+ # Handle special URIs
60
+ if uri_or_path == "about:blank":
61
+ self._set_page_content("")
62
+ elif uri_or_path.startswith("google:"):
63
+ self._serpapi_search(uri_or_path[len("google:") :].strip(), filter_year=filter_year)
64
+ else:
65
+ if (
66
+ not uri_or_path.startswith("http:")
67
+ and not uri_or_path.startswith("https:")
68
+ and not uri_or_path.startswith("file:")
69
+ ):
70
+ if len(self.history) > 1:
71
+ prior_address = self.history[-2][0]
72
+ uri_or_path = urljoin(prior_address, uri_or_path)
73
+ # Update the address with the fully-qualified path
74
+ self.history[-1] = (uri_or_path, self.history[-1][1])
75
+ self._fetch_page(uri_or_path)
76
+
77
+ self.viewport_current_page = 0
78
+ self.find_on_page_query = None
79
+ self.find_on_page_viewport = None
80
+
81
+ @property
82
+ def viewport(self) -> str:
83
+ """Return the content of the current viewport."""
84
+ bounds = self.viewport_pages[self.viewport_current_page]
85
+ return self.page_content[bounds[0] : bounds[1]]
86
+
87
+ @property
88
+ def page_content(self) -> str:
89
+ """Return the full contents of the current page."""
90
+ return self._page_content
91
+
92
+ def _set_page_content(self, content: str) -> None:
93
+ """Sets the text content of the current page."""
94
+ self._page_content = content
95
+ self._split_pages()
96
+ if self.viewport_current_page >= len(self.viewport_pages):
97
+ self.viewport_current_page = len(self.viewport_pages) - 1
98
+
99
+ def page_down(self) -> None:
100
+ self.viewport_current_page = min(self.viewport_current_page + 1, len(self.viewport_pages) - 1)
101
+
102
+ def page_up(self) -> None:
103
+ self.viewport_current_page = max(self.viewport_current_page - 1, 0)
104
+
105
+ def find_on_page(self, query: str) -> str | None:
106
+ """Searches for the query from the current viewport forward, looping back to the start if necessary."""
107
+
108
+ # Did we get here via a previous find_on_page search with the same query?
109
+ # If so, map to find_next
110
+ if query == self._find_on_page_query and self.viewport_current_page == self._find_on_page_last_result:
111
+ return self.find_next()
112
+
113
+ # Ok it's a new search start from the current viewport
114
+ self._find_on_page_query = query
115
+ viewport_match = self._find_next_viewport(query, self.viewport_current_page)
116
+ if viewport_match is None:
117
+ self._find_on_page_last_result = None
118
+ return None
119
+ else:
120
+ self.viewport_current_page = viewport_match
121
+ self._find_on_page_last_result = viewport_match
122
+ return self.viewport
123
+
124
+ def find_next(self) -> str | None:
125
+ """Scroll to the next viewport that matches the query"""
126
+
127
+ if self._find_on_page_query is None:
128
+ return None
129
+
130
+ starting_viewport = self._find_on_page_last_result
131
+ if starting_viewport is None:
132
+ starting_viewport = 0
133
+ else:
134
+ starting_viewport += 1
135
+ if starting_viewport >= len(self.viewport_pages):
136
+ starting_viewport = 0
137
+
138
+ viewport_match = self._find_next_viewport(self._find_on_page_query, starting_viewport)
139
+ if viewport_match is None:
140
+ self._find_on_page_last_result = None
141
+ return None
142
+ else:
143
+ self.viewport_current_page = viewport_match
144
+ self._find_on_page_last_result = viewport_match
145
+ return self.viewport
146
+
147
+ def _find_next_viewport(self, query: str, starting_viewport: int) -> int | None:
148
+ """Search for matches between the starting viewport looping when reaching the end."""
149
+
150
+ if query is None:
151
+ return None
152
+
153
+ # Normalize the query, and convert to a regular expression
154
+ nquery = re.sub(r"\*", "__STAR__", query)
155
+ nquery = " " + (" ".join(re.split(r"\W+", nquery))).strip() + " "
156
+ nquery = nquery.replace(" __STAR__ ", "__STAR__ ") # Merge isolated stars with prior word
157
+ nquery = nquery.replace("__STAR__", ".*").lower()
158
+
159
+ if nquery.strip() == "":
160
+ return None
161
+
162
+ idxs = list()
163
+ idxs.extend(range(starting_viewport, len(self.viewport_pages)))
164
+ idxs.extend(range(0, starting_viewport))
165
+
166
+ for i in idxs:
167
+ bounds = self.viewport_pages[i]
168
+ content = self.page_content[bounds[0] : bounds[1]]
169
+
170
+ # TODO: Remove markdown links and images
171
+ ncontent = " " + (" ".join(re.split(r"\W+", content))).strip().lower() + " "
172
+ if re.search(nquery, ncontent):
173
+ return i
174
+
175
+ return None
176
+
177
+ def visit_page(self, path_or_uri: str, filter_year: int | None = None) -> str:
178
+ """Update the address, visit the page, and return the content of the viewport."""
179
+ self.set_address(path_or_uri, filter_year=filter_year)
180
+ return self.viewport
181
+
182
+ def _split_pages(self) -> None:
183
+ # Do not split search results
184
+ if self.address.startswith("google:"):
185
+ self.viewport_pages = [(0, len(self._page_content))]
186
+ return
187
+
188
+ # Handle empty pages
189
+ if len(self._page_content) == 0:
190
+ self.viewport_pages = [(0, 0)]
191
+ return
192
+
193
+ # Break the viewport into pages
194
+ self.viewport_pages = []
195
+ start_idx = 0
196
+ while start_idx < len(self._page_content):
197
+ end_idx = min(start_idx + self.viewport_size, len(self._page_content)) # type: ignore[operator]
198
+ # Adjust to end on a space
199
+ while end_idx < len(self._page_content) and self._page_content[end_idx - 1] not in [" ", "\t", "\r", "\n"]:
200
+ end_idx += 1
201
+ self.viewport_pages.append((start_idx, end_idx))
202
+ start_idx = end_idx
203
+
204
+ def _serpapi_search(self, query: str, filter_year: int | None = None) -> None:
205
+ if self.serpapi_key is None:
206
+ raise ValueError("Missing SerpAPI key.")
207
+
208
+ params = {
209
+ "engine": "google",
210
+ "q": query,
211
+ "api_key": self.serpapi_key,
212
+ }
213
+ if filter_year is not None:
214
+ params["tbs"] = f"cdr:1,cd_min:01/01/{filter_year},cd_max:12/31/{filter_year}"
215
+
216
+ search = GoogleSearch(params)
217
+ results = search.get_dict()
218
+ self.page_title = f"{query} - Search"
219
+ if "organic_results" not in results.keys():
220
+ raise Exception(f"No results found for query: '{query}'. Use a less specific query.")
221
+ if len(results["organic_results"]) == 0:
222
+ year_filter_message = f" with filter year={filter_year}" if filter_year is not None else ""
223
+ self._set_page_content(
224
+ f"No results found for '{query}'{year_filter_message}. Try with a more general query, or remove the year filter."
225
+ )
226
+ return
227
+
228
+ def _prev_visit(url):
229
+ for i in range(len(self.history) - 1, -1, -1):
230
+ if self.history[i][0] == url:
231
+ return f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
232
+ return ""
233
+
234
+ web_snippets: list[str] = list()
235
+ idx = 0
236
+ if "organic_results" in results:
237
+ for page in results["organic_results"]:
238
+ idx += 1
239
+ date_published = ""
240
+ if "date" in page:
241
+ date_published = "\nDate published: " + page["date"]
242
+
243
+ source = ""
244
+ if "source" in page:
245
+ source = "\nSource: " + page["source"]
246
+
247
+ snippet = ""
248
+ if "snippet" in page:
249
+ snippet = "\n" + page["snippet"]
250
+
251
+ redacted_version = f"{idx}. [{page['title']}]({page['link']}){date_published}{source}\n{_prev_visit(page['link'])}{snippet}"
252
+
253
+ redacted_version = redacted_version.replace("Your browser can't play this video.", "")
254
+ web_snippets.append(redacted_version)
255
+
256
+ content = (
257
+ f"A Google search for '{query}' found {len(web_snippets)} results:\n\n## Web Results\n"
258
+ + "\n\n".join(web_snippets)
259
+ )
260
+
261
+ self._set_page_content(content)
262
+
263
+ def _fetch_page(self, url: str) -> None:
264
+ download_path = ""
265
+ try:
266
+ if url.startswith("file://"):
267
+ download_path = os.path.normcase(os.path.normpath(unquote(url[7:])))
268
+ res = self._mdconvert.convert_local(download_path)
269
+ self.page_title = res.title
270
+ self._set_page_content(res.text_content)
271
+ else:
272
+ # Prepare the request parameters
273
+ request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {}
274
+ request_kwargs["stream"] = True
275
+
276
+ # Send a HTTP request to the URL
277
+ response = requests.get(url, **request_kwargs)
278
+ response.raise_for_status()
279
+
280
+ # If the HTTP request was successful
281
+ content_type = response.headers.get("content-type", "")
282
+
283
+ # Text or HTML
284
+ if "text/" in content_type.lower():
285
+ res = self._mdconvert.convert_response(response)
286
+ self.page_title = res.title
287
+ self._set_page_content(res.text_content)
288
+ # A download
289
+ else:
290
+ # Try producing a safe filename
291
+ fname = None
292
+ download_path = None
293
+ try:
294
+ fname = pathvalidate.sanitize_filename(os.path.basename(urlparse(url).path)).strip()
295
+ download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))
296
+
297
+ suffix = 0
298
+ while os.path.exists(download_path) and suffix < 1000:
299
+ suffix += 1
300
+ base, ext = os.path.splitext(fname)
301
+ new_fname = f"{base}__{suffix}{ext}"
302
+ download_path = os.path.abspath(os.path.join(self.downloads_folder, new_fname))
303
+
304
+ except NameError:
305
+ pass
306
+
307
+ # No suitable name, so make one
308
+ if fname is None:
309
+ extension = mimetypes.guess_extension(content_type)
310
+ if extension is None:
311
+ extension = ".download"
312
+ fname = str(uuid.uuid4()) + extension
313
+ download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))
314
+
315
+ # Open a file for writing
316
+ with open(download_path, "wb") as fh:
317
+ for chunk in response.iter_content(chunk_size=512):
318
+ fh.write(chunk)
319
+
320
+ # Render it
321
+ local_uri = pathlib.Path(download_path).as_uri()
322
+ self.set_address(local_uri)
323
+
324
+ except UnsupportedFormatException as e:
325
+ print(e)
326
+ self.page_title = ("Download complete.",)
327
+ self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
328
+ except FileConversionException as e:
329
+ print(e)
330
+ self.page_title = ("Download complete.",)
331
+ self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
332
+ except FileNotFoundError:
333
+ self.page_title = "Error 404"
334
+ self._set_page_content(f"## Error 404\n\nFile not found: {download_path}")
335
+ except requests.exceptions.RequestException as request_exception:
336
+ try:
337
+ self.page_title = f"Error {response.status_code}"
338
+
339
+ # If the error was rendered in HTML we might as well render it
340
+ content_type = response.headers.get("content-type", "")
341
+ if content_type is not None and "text/html" in content_type.lower():
342
+ res = self._mdconvert.convert(response)
343
+ self.page_title = f"Error {response.status_code}"
344
+ self._set_page_content(f"## Error {response.status_code}\n\n{res.text_content}")
345
+ else:
346
+ text = ""
347
+ for chunk in response.iter_content(chunk_size=512, decode_unicode=True):
348
+ text += chunk
349
+ self.page_title = f"Error {response.status_code}"
350
+ self._set_page_content(f"## Error {response.status_code}\n\n{text}")
351
+ except NameError:
352
+ self.page_title = "Error"
353
+ self._set_page_content(f"## Error\n\n{str(request_exception)}")
354
+
355
+ def _state(self) -> tuple[str, str]:
356
+ header = f"Address: {self.address}\n"
357
+ if self.page_title is not None:
358
+ header += f"Title: {self.page_title}\n"
359
+
360
+ current_page = self.viewport_current_page
361
+ total_pages = len(self.viewport_pages)
362
+
363
+ address = self.address
364
+ for i in range(len(self.history) - 2, -1, -1): # Start from the second last
365
+ if self.history[i][0] == address:
366
+ header += f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
367
+ break
368
+
369
+ header += f"Viewport position: Showing page {current_page + 1} of {total_pages}.\n"
370
+ return (header, self.viewport)
371
+
372
+
373
+ class SearchInformationTool(Tool):
374
+ name = "web_search"
375
+ description = "Perform a web search query (think a google search) and returns the search results."
376
+ inputs = {"query": {"type": "string", "description": "The web search query to perform."}}
377
+ inputs["filter_year"] = {
378
+ "type": "string",
379
+ "description": "[Optional parameter]: filter the search results to only include pages from a specific year. For example, '2020' will only include pages from 2020. Make sure to use this parameter if you're trying to search for articles from a specific date!",
380
+ "nullable": True,
381
+ }
382
+ output_type = "string"
383
+
384
+ def __init__(self, browser):
385
+ super().__init__()
386
+ self.browser = browser
387
+
388
+ def forward(self, query: str, filter_year: int | None = None) -> str:
389
+ self.browser.visit_page(f"google: {query}", filter_year=filter_year)
390
+ header, content = self.browser._state()
391
+ return header.strip() + "\n=======================\n" + content
392
+
393
+
394
+ class VisitTool(Tool):
395
+ name = "visit_page"
396
+ description = "Visit a webpage at a given URL and return its text. Given a url to a YouTube video, this returns the transcript."
397
+ inputs = {"url": {"type": "string", "description": "The relative or absolute url of the webpage to visit."}}
398
+ output_type = "string"
399
+
400
+ def __init__(self, browser=None):
401
+ super().__init__()
402
+ self.browser = browser
403
+
404
+ def forward(self, url: str) -> str:
405
+ self.browser.visit_page(url)
406
+ header, content = self.browser._state()
407
+ return header.strip() + "\n=======================\n" + content
408
+
409
+
410
+ class DownloadTool(Tool):
411
+ name = "download_file"
412
+ description = """
413
+ Download a file at a given URL. The file should be of this format: [".xlsx", ".pptx", ".wav", ".mp3", ".m4a", ".png", ".docx"]
414
+ After using this tool, for further inspection of this page you should return the download path to your manager via final_answer, and they will be able to inspect it.
415
+ DO NOT use this tool for .pdf or .txt or .htm files: for these types of files use visit_page with the file url instead."""
416
+ inputs = {"url": {"type": "string", "description": "The relative or absolute url of the file to be downloaded."}}
417
+ output_type = "string"
418
+
419
+ def __init__(self, browser):
420
+ super().__init__()
421
+ self.browser = browser
422
+
423
+ def forward(self, url: str) -> str:
424
+ import requests
425
+
426
+ if "arxiv" in url:
427
+ url = url.replace("abs", "pdf")
428
+ response = requests.get(url)
429
+ content_type = response.headers.get("content-type", "")
430
+ extension = mimetypes.guess_extension(content_type)
431
+ if extension and isinstance(extension, str):
432
+ new_path = f"./downloads/file{extension}"
433
+ else:
434
+ new_path = "./downloads/file.object"
435
+
436
+ with open(new_path, "wb") as f:
437
+ f.write(response.content)
438
+
439
+ if "pdf" in extension or "txt" in extension or "htm" in extension:
440
+ raise Exception("Do not use this tool for pdf or txt or html files: use visit_page instead.")
441
+
442
+ return f"File was downloaded and saved under path {new_path}."
443
+
444
+
445
+ class ArchiveSearchTool(Tool):
446
+ name = "find_archived_url"
447
+ description = "Given a url, searches the Wayback Machine and returns the archived version of the url that's closest in time to the desired date."
448
+ inputs = {
449
+ "url": {"type": "string", "description": "The url you need the archive for."},
450
+ "date": {
451
+ "type": "string",
452
+ "description": "The date that you want to find the archive for. Give this date in the format 'YYYYMMDD', for instance '27 June 2008' is written as '20080627'.",
453
+ },
454
+ }
455
+ output_type = "string"
456
+
457
+ def __init__(self, browser=None):
458
+ super().__init__()
459
+ self.browser = browser
460
+
461
+ def forward(self, url, date) -> str:
462
+ import requests
463
+
464
+ no_timestamp_url = f"https://archive.org/wayback/available?url={url}"
465
+ archive_url = no_timestamp_url + f"&timestamp={date}"
466
+ response = requests.get(archive_url).json()
467
+ response_notimestamp = requests.get(no_timestamp_url).json()
468
+ if "archived_snapshots" in response and "closest" in response["archived_snapshots"]:
469
+ closest = response["archived_snapshots"]["closest"]
470
+ print("Archive found!", closest)
471
+
472
+ elif "archived_snapshots" in response_notimestamp and "closest" in response_notimestamp["archived_snapshots"]:
473
+ closest = response_notimestamp["archived_snapshots"]["closest"]
474
+ print("Archive found!", closest)
475
+ else:
476
+ raise Exception(f"Your {url=} was not archived on Wayback Machine, try a different url.")
477
+ target_url = closest["url"]
478
+ self.browser.visit_page(target_url)
479
+ header, content = self.browser._state()
480
+ return (
481
+ f"Web archive for url {url}, snapshot taken at date {closest['timestamp'][:8]}:\n"
482
+ + header.strip()
483
+ + "\n=======================\n"
484
+ + content
485
+ )
486
+
487
+
488
+ class PageUpTool(Tool):
489
+ name = "page_up"
490
+ description = "Scroll the viewport UP one page-length in the current webpage and return the new viewport content."
491
+ inputs = {}
492
+ output_type = "string"
493
+
494
+ def __init__(self, browser=None):
495
+ super().__init__()
496
+ self.browser = browser
497
+
498
+ def forward(self) -> str:
499
+ self.browser.page_up()
500
+ header, content = self.browser._state()
501
+ return header.strip() + "\n=======================\n" + content
502
+
503
+
504
+ class PageDownTool(Tool):
505
+ name = "page_down"
506
+ description = (
507
+ "Scroll the viewport DOWN one page-length in the current webpage and return the new viewport content."
508
+ )
509
+ inputs = {}
510
+ output_type = "string"
511
+
512
+ def __init__(self, browser=None):
513
+ super().__init__()
514
+ self.browser = browser
515
+
516
+ def forward(self) -> str:
517
+ self.browser.page_down()
518
+ header, content = self.browser._state()
519
+ return header.strip() + "\n=======================\n" + content
520
+
521
+
522
+ class FinderTool(Tool):
523
+ name = "find_on_page_ctrl_f"
524
+ description = "Scroll the viewport to the first occurrence of the search string. This is equivalent to Ctrl+F."
525
+ inputs = {
526
+ "search_string": {
527
+ "type": "string",
528
+ "description": "The string to search for on the page. This search string supports wildcards like '*'",
529
+ }
530
+ }
531
+ output_type = "string"
532
+
533
+ def __init__(self, browser=None):
534
+ super().__init__()
535
+ self.browser = browser
536
+
537
+ def forward(self, search_string: str) -> str:
538
+ find_result = self.browser.find_on_page(search_string)
539
+ header, content = self.browser._state()
540
+
541
+ if find_result is None:
542
+ return (
543
+ header.strip()
544
+ + f"\n=======================\nThe search string '{search_string}' was not found on this page."
545
+ )
546
+ else:
547
+ return header.strip() + "\n=======================\n" + content
548
+
549
+
550
+ class FindNextTool(Tool):
551
+ name = "find_next"
552
+ description = "Scroll the viewport to next occurrence of the search string. This is equivalent to finding the next match in a Ctrl+F search."
553
+ inputs = {}
554
+ output_type = "string"
555
+
556
+ def __init__(self, browser=None):
557
+ super().__init__()
558
+ self.browser = browser
559
+
560
+ def forward(self) -> str:
561
+ find_result = self.browser.find_next()
562
+ header, content = self.browser._state()
563
+
564
+ if find_result is None:
565
+ return header.strip() + "\n=======================\nThe search string was not found on this page."
566
+ else:
567
+ return header.strip() + "\n=======================\n" + content