Commit
·
5067f71
1
Parent(s):
acb85aa
Add tool text_web_browser and update requirements.txt
Browse files- requirements.txt +2 -0
- tools/cookies.py +715 -0
- tools/text_web_browser.py +567 -0
requirements.txt
CHANGED
@@ -11,3 +11,5 @@ pydub
|
|
11 |
SpeechRecognition
|
12 |
beautifulsoup4
|
13 |
youtube-transcript-api
|
|
|
|
|
|
11 |
SpeechRecognition
|
12 |
beautifulsoup4
|
13 |
youtube-transcript-api
|
14 |
+
pathvalidate
|
15 |
+
serpapi
|
tools/cookies.py
ADDED
@@ -0,0 +1,715 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from requests.cookies import RequestsCookieJar
|
2 |
+
|
3 |
+
|
4 |
+
COOKIES_LIST = [
|
5 |
+
{
|
6 |
+
"domain": ".youtube.com",
|
7 |
+
"expirationDate": 1718884961,
|
8 |
+
"hostOnly": False,
|
9 |
+
"httpOnly": False,
|
10 |
+
"name": "ST-xuwub9",
|
11 |
+
"path": "/",
|
12 |
+
"sameSite": None,
|
13 |
+
"secure": False,
|
14 |
+
"session": False,
|
15 |
+
"storeId": None,
|
16 |
+
"value": "session_logininfo=AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0%3AQUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"domain": ".youtube.com",
|
20 |
+
"expirationDate": 1753004444.745411,
|
21 |
+
"hostOnly": False,
|
22 |
+
"httpOnly": True,
|
23 |
+
"name": "__Secure-YEC",
|
24 |
+
"path": "/",
|
25 |
+
"sameSite": "lax",
|
26 |
+
"secure": True,
|
27 |
+
"session": False,
|
28 |
+
"storeId": None,
|
29 |
+
"value": "CgtRVnI5LW1zRHlQVSjbtNCzBjIhCgJGUhIbEhcSFRMLFBUWFwwYGRobHB0eHw4PIBAREiAk",
|
30 |
+
},
|
31 |
+
{
|
32 |
+
"domain": ".youtube.com",
|
33 |
+
"expirationDate": 1753434620.050824,
|
34 |
+
"hostOnly": False,
|
35 |
+
"httpOnly": True,
|
36 |
+
"name": "__Secure-3PSID",
|
37 |
+
"path": "/",
|
38 |
+
"sameSite": "no_restriction",
|
39 |
+
"secure": True,
|
40 |
+
"session": False,
|
41 |
+
"storeId": None,
|
42 |
+
"value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB4ezJ_bdWu46a7YwObVn44wACgYKAakSARQSFQHGX2MicJcTzecTKH6bHzqU6TMbTxoVAUF8yKqQYK-MoI6Ql3vI2oYTB3E-0076",
|
43 |
+
},
|
44 |
+
{
|
45 |
+
"domain": ".youtube.com",
|
46 |
+
"expirationDate": 1750420959.974642,
|
47 |
+
"hostOnly": False,
|
48 |
+
"httpOnly": False,
|
49 |
+
"name": "SIDCC",
|
50 |
+
"path": "/",
|
51 |
+
"sameSite": None,
|
52 |
+
"secure": False,
|
53 |
+
"session": False,
|
54 |
+
"storeId": None,
|
55 |
+
"value": "AKEyXzWQZauHKOo8t87zoEcjaVNIYUX54ohoWXT-tX4aAhEuZzIIptxZAcNkHuG2oDXYL6t-lw",
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"domain": ".youtube.com",
|
59 |
+
"expirationDate": 1753434620.050652,
|
60 |
+
"hostOnly": False,
|
61 |
+
"httpOnly": False,
|
62 |
+
"name": "SID",
|
63 |
+
"path": "/",
|
64 |
+
"sameSite": None,
|
65 |
+
"secure": False,
|
66 |
+
"session": False,
|
67 |
+
"storeId": None,
|
68 |
+
"value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB6VHrZcC3gBAsFPbCQ0gF5AACgYKAYkSARQSFQHGX2Mi9kt0gHg5CxCYSkLQGHWaeBoVAUF8yKre_V6r3jZVak6JV4o2Q0FL0076",
|
69 |
+
},
|
70 |
+
{
|
71 |
+
"domain": ".youtube.com",
|
72 |
+
"expirationDate": 1750420958.397534,
|
73 |
+
"hostOnly": False,
|
74 |
+
"httpOnly": True,
|
75 |
+
"name": "__Secure-1PSIDTS",
|
76 |
+
"path": "/",
|
77 |
+
"sameSite": None,
|
78 |
+
"secure": True,
|
79 |
+
"session": False,
|
80 |
+
"storeId": None,
|
81 |
+
"value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"domain": ".youtube.com",
|
85 |
+
"expirationDate": 1753433494.44729,
|
86 |
+
"hostOnly": False,
|
87 |
+
"httpOnly": False,
|
88 |
+
"name": "_ga_M0180HEFCY",
|
89 |
+
"path": "/",
|
90 |
+
"sameSite": None,
|
91 |
+
"secure": False,
|
92 |
+
"session": False,
|
93 |
+
"storeId": None,
|
94 |
+
"value": "GS1.1.1718871908.1.0.1718873494.0.0.0",
|
95 |
+
},
|
96 |
+
{
|
97 |
+
"domain": ".youtube.com",
|
98 |
+
"expirationDate": 1753434620.050933,
|
99 |
+
"hostOnly": False,
|
100 |
+
"httpOnly": False,
|
101 |
+
"name": "SAPISID",
|
102 |
+
"path": "/",
|
103 |
+
"sameSite": None,
|
104 |
+
"secure": True,
|
105 |
+
"session": False,
|
106 |
+
"storeId": None,
|
107 |
+
"value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"domain": ".youtube.com",
|
111 |
+
"expirationDate": 1750420959.974764,
|
112 |
+
"hostOnly": False,
|
113 |
+
"httpOnly": True,
|
114 |
+
"name": "__Secure-1PSIDCC",
|
115 |
+
"path": "/",
|
116 |
+
"sameSite": None,
|
117 |
+
"secure": True,
|
118 |
+
"session": False,
|
119 |
+
"storeId": None,
|
120 |
+
"value": "AKEyXzWHDSoXGCZpZhPxRrnC7B1s8zGIUjeMVyvgtQfsm1fs92lXPtFEI_td9LBUyqVUe0xK",
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"domain": ".youtube.com",
|
124 |
+
"expirationDate": 1753434620.050881,
|
125 |
+
"hostOnly": False,
|
126 |
+
"httpOnly": True,
|
127 |
+
"name": "SSID",
|
128 |
+
"path": "/",
|
129 |
+
"sameSite": None,
|
130 |
+
"secure": True,
|
131 |
+
"session": False,
|
132 |
+
"storeId": None,
|
133 |
+
"value": "AmlwXHnQvOQ10LVd-",
|
134 |
+
},
|
135 |
+
{
|
136 |
+
"domain": ".youtube.com",
|
137 |
+
"expirationDate": 1753434620.050959,
|
138 |
+
"hostOnly": False,
|
139 |
+
"httpOnly": False,
|
140 |
+
"name": "__Secure-1PAPISID",
|
141 |
+
"path": "/",
|
142 |
+
"sameSite": None,
|
143 |
+
"secure": True,
|
144 |
+
"session": False,
|
145 |
+
"storeId": None,
|
146 |
+
"value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"domain": ".youtube.com",
|
150 |
+
"expirationDate": 1753434620.050795,
|
151 |
+
"hostOnly": False,
|
152 |
+
"httpOnly": True,
|
153 |
+
"name": "__Secure-1PSID",
|
154 |
+
"path": "/",
|
155 |
+
"sameSite": None,
|
156 |
+
"secure": True,
|
157 |
+
"session": False,
|
158 |
+
"storeId": None,
|
159 |
+
"value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBBrlk7lRpKQGywAHEon7WGQAACgYKAQsSARQSFQHGX2MirAmnSRdZl6GPG6KLd4hOihoVAUF8yKoV17Tcj1a_OenIOkf2wBjO0076",
|
160 |
+
},
|
161 |
+
{
|
162 |
+
"domain": ".youtube.com",
|
163 |
+
"expirationDate": 1753434620.050993,
|
164 |
+
"hostOnly": False,
|
165 |
+
"httpOnly": False,
|
166 |
+
"name": "__Secure-3PAPISID",
|
167 |
+
"path": "/",
|
168 |
+
"sameSite": "no_restriction",
|
169 |
+
"secure": True,
|
170 |
+
"session": False,
|
171 |
+
"storeId": None,
|
172 |
+
"value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
|
173 |
+
},
|
174 |
+
{
|
175 |
+
"domain": ".youtube.com",
|
176 |
+
"expirationDate": 1750420959.974815,
|
177 |
+
"hostOnly": False,
|
178 |
+
"httpOnly": True,
|
179 |
+
"name": "__Secure-3PSIDCC",
|
180 |
+
"path": "/",
|
181 |
+
"sameSite": "no_restriction",
|
182 |
+
"secure": True,
|
183 |
+
"session": False,
|
184 |
+
"storeId": None,
|
185 |
+
"value": "AKEyXzXM5UjKUEXwSHVmRAIo6hGHA4G63adj3EE1VdNriD0f38jZQbsUKiD4LQbA3BValmTFDg",
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"domain": ".youtube.com",
|
189 |
+
"expirationDate": 1750420958.397647,
|
190 |
+
"hostOnly": False,
|
191 |
+
"httpOnly": True,
|
192 |
+
"name": "__Secure-3PSIDTS",
|
193 |
+
"path": "/",
|
194 |
+
"sameSite": "no_restriction",
|
195 |
+
"secure": True,
|
196 |
+
"session": False,
|
197 |
+
"storeId": None,
|
198 |
+
"value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"domain": ".youtube.com",
|
202 |
+
"expirationDate": 1753434620.050908,
|
203 |
+
"hostOnly": False,
|
204 |
+
"httpOnly": False,
|
205 |
+
"name": "APISID",
|
206 |
+
"path": "/",
|
207 |
+
"sameSite": None,
|
208 |
+
"secure": False,
|
209 |
+
"session": False,
|
210 |
+
"storeId": None,
|
211 |
+
"value": "IlQWLPjdNqziwCrV/ANG7Z4x5FF-IBxbZk",
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"domain": ".youtube.com",
|
215 |
+
"expirationDate": 1753434620.050855,
|
216 |
+
"hostOnly": False,
|
217 |
+
"httpOnly": True,
|
218 |
+
"name": "HSID",
|
219 |
+
"path": "/",
|
220 |
+
"sameSite": None,
|
221 |
+
"secure": False,
|
222 |
+
"session": False,
|
223 |
+
"storeId": None,
|
224 |
+
"value": "AasA7hmRuTFv7vjoq",
|
225 |
+
},
|
226 |
+
{
|
227 |
+
"domain": ".youtube.com",
|
228 |
+
"expirationDate": 1753435873.577793,
|
229 |
+
"hostOnly": False,
|
230 |
+
"httpOnly": True,
|
231 |
+
"name": "LOGIN_INFO",
|
232 |
+
"path": "/",
|
233 |
+
"sameSite": "no_restriction",
|
234 |
+
"secure": True,
|
235 |
+
"session": False,
|
236 |
+
"storeId": None,
|
237 |
+
"value": "AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0:QUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
|
238 |
+
},
|
239 |
+
{
|
240 |
+
"domain": ".youtube.com",
|
241 |
+
"expirationDate": 1753444956.555608,
|
242 |
+
"hostOnly": False,
|
243 |
+
"httpOnly": False,
|
244 |
+
"name": "PREF",
|
245 |
+
"path": "/",
|
246 |
+
"sameSite": None,
|
247 |
+
"secure": True,
|
248 |
+
"session": False,
|
249 |
+
"storeId": None,
|
250 |
+
"value": "f4=4000000&f6=40000000&tz=Europe.Paris&f5=30000&f7=100",
|
251 |
+
},
|
252 |
+
]
|
253 |
+
|
254 |
+
COOKIES_LIST += [
|
255 |
+
{
|
256 |
+
"domain": ".www.researchgate.net",
|
257 |
+
"hostOnly": False,
|
258 |
+
"httpOnly": True,
|
259 |
+
"name": "isInstIp",
|
260 |
+
"path": "/",
|
261 |
+
"sameSite": None,
|
262 |
+
"secure": True,
|
263 |
+
"session": True,
|
264 |
+
"storeId": None,
|
265 |
+
"value": "False",
|
266 |
+
},
|
267 |
+
{
|
268 |
+
"domain": ".researchgate.net",
|
269 |
+
"expirationDate": 1734423981,
|
270 |
+
"hostOnly": False,
|
271 |
+
"httpOnly": False,
|
272 |
+
"name": "__eoi",
|
273 |
+
"path": "/",
|
274 |
+
"sameSite": None,
|
275 |
+
"secure": False,
|
276 |
+
"session": False,
|
277 |
+
"storeId": None,
|
278 |
+
"value": "ID=c26f752377373146:T=1718871981:RT=1718884914:S=AA-AfjZw-T_OOX2kW2LLaFzXImgc",
|
279 |
+
},
|
280 |
+
{
|
281 |
+
"domain": ".www.researchgate.net",
|
282 |
+
"expirationDate": 1753444909.646103,
|
283 |
+
"hostOnly": False,
|
284 |
+
"httpOnly": True,
|
285 |
+
"name": "ptc",
|
286 |
+
"path": "/",
|
287 |
+
"sameSite": None,
|
288 |
+
"secure": True,
|
289 |
+
"session": False,
|
290 |
+
"storeId": None,
|
291 |
+
"value": "RG1.8947708639250500550.1718872043",
|
292 |
+
},
|
293 |
+
{
|
294 |
+
"domain": ".researchgate.net",
|
295 |
+
"expirationDate": 1750507578,
|
296 |
+
"hostOnly": False,
|
297 |
+
"httpOnly": False,
|
298 |
+
"name": "euconsent-v2-didomi",
|
299 |
+
"path": "/",
|
300 |
+
"sameSite": "lax",
|
301 |
+
"secure": True,
|
302 |
+
"session": False,
|
303 |
+
"storeId": None,
|
304 |
+
"value": "CQAgmoAQAgmoAAHABBENA5EsAP_gAEPgAAYgJ2pB5G5UTWlBIG53YMskIAUFhFBoQEAgAACAAwIBSBIAIIwEAGAAIAgAICACAAIAIBIAIABAGAAAAAAAYIAAIAAIAAAQIAAKIAAAAAAAAgBQAAgIAgggEAAAgEBEABAAgAAAEIIAQNgACgAAACCAAAAAAAABAAAAAAAAQAAAAAAAYCQAAAJIAAAAACAIABAIAAAAAAAAAAAAAAAABBAAIJ2wPIAFAAXABQAFQALgAcAA8ACAAEgALwAZAA0ACIAEcAJgAUgAqgBcADEAGgAPQAfgBEACOAE4AMMAZYA0QBsgDkAHOAO4AfsBBwEIAItARwBHQC6gHUAO2Ae0A_4CHQEXgJ2AUOAo8BT4CpQFqALYAXmAwQBkgDLAGXANjAhCBG8CbAE3gJ1gTtAA.f_wACHwAAAAA",
|
305 |
+
},
|
306 |
+
{
|
307 |
+
"domain": ".researchgate.net",
|
308 |
+
"expirationDate": 1718885236,
|
309 |
+
"hostOnly": False,
|
310 |
+
"httpOnly": False,
|
311 |
+
"name": "_gat",
|
312 |
+
"path": "/",
|
313 |
+
"sameSite": None,
|
314 |
+
"secure": False,
|
315 |
+
"session": False,
|
316 |
+
"storeId": None,
|
317 |
+
"value": "1",
|
318 |
+
},
|
319 |
+
{
|
320 |
+
"domain": "www.researchgate.net",
|
321 |
+
"expirationDate": 1721477183,
|
322 |
+
"hostOnly": True,
|
323 |
+
"httpOnly": False,
|
324 |
+
"name": "_pbjs_userid_consent_data",
|
325 |
+
"path": "/",
|
326 |
+
"sameSite": "lax",
|
327 |
+
"secure": False,
|
328 |
+
"session": False,
|
329 |
+
"storeId": None,
|
330 |
+
"value": "3524755945110770",
|
331 |
+
},
|
332 |
+
{
|
333 |
+
"domain": ".researchgate.net",
|
334 |
+
"expirationDate": 1752567981,
|
335 |
+
"hostOnly": False,
|
336 |
+
"httpOnly": False,
|
337 |
+
"name": "__gads",
|
338 |
+
"path": "/",
|
339 |
+
"sameSite": None,
|
340 |
+
"secure": False,
|
341 |
+
"session": False,
|
342 |
+
"storeId": None,
|
343 |
+
"value": "ID=eca2adb88969c830:T=1718871981:RT=1718884914:S=ALNI_MY2qZchynrhWX6hWMlaI87Pcj9riQ",
|
344 |
+
},
|
345 |
+
{
|
346 |
+
"domain": ".researchgate.net",
|
347 |
+
"expirationDate": 1718886709.646173,
|
348 |
+
"hostOnly": False,
|
349 |
+
"httpOnly": True,
|
350 |
+
"name": "__cf_bm",
|
351 |
+
"path": "/",
|
352 |
+
"sameSite": "no_restriction",
|
353 |
+
"secure": True,
|
354 |
+
"session": False,
|
355 |
+
"storeId": None,
|
356 |
+
"value": "IkQ_J4ciBzKQduRvjqsfSmQu8UygDWbHeROO5JVccfo-1718884909-1.0.1.1-qvNGEdbfI0HfhFP6kwe7R7mkTqODNhFuKhs72lLly6K2BOPMG3kbahpQFGvPK0U8FUfkznkq65gngd1sWj7sDA",
|
357 |
+
},
|
358 |
+
{
|
359 |
+
"domain": ".researchgate.net",
|
360 |
+
"expirationDate": 1752567981,
|
361 |
+
"hostOnly": False,
|
362 |
+
"httpOnly": False,
|
363 |
+
"name": "__gpi",
|
364 |
+
"path": "/",
|
365 |
+
"sameSite": None,
|
366 |
+
"secure": False,
|
367 |
+
"session": False,
|
368 |
+
"storeId": None,
|
369 |
+
"value": "UID=00000e4e9aa2e6f2:T=1718871981:RT=1718884914:S=ALNI_MYFNrgzkKn7K6Bd2y8hC6GJCvDiSg",
|
370 |
+
},
|
371 |
+
{
|
372 |
+
"domain": ".researchgate.net",
|
373 |
+
"hostOnly": False,
|
374 |
+
"httpOnly": True,
|
375 |
+
"name": "_cfuvid",
|
376 |
+
"path": "/",
|
377 |
+
"sameSite": "no_restriction",
|
378 |
+
"secure": True,
|
379 |
+
"session": True,
|
380 |
+
"storeId": None,
|
381 |
+
"value": "_GPmGZkBymiH3UiqTqzakEpi98br3nfFUWC2_u_wqkc-1718884909785-0.0.1.1-604800000",
|
382 |
+
},
|
383 |
+
{
|
384 |
+
"domain": ".researchgate.net",
|
385 |
+
"expirationDate": 1753445177.271667,
|
386 |
+
"hostOnly": False,
|
387 |
+
"httpOnly": False,
|
388 |
+
"name": "_ga",
|
389 |
+
"path": "/",
|
390 |
+
"sameSite": None,
|
391 |
+
"secure": False,
|
392 |
+
"session": False,
|
393 |
+
"storeId": None,
|
394 |
+
"value": "GA1.1.1525244793.1718885177",
|
395 |
+
},
|
396 |
+
{
|
397 |
+
"domain": ".researchgate.net",
|
398 |
+
"expirationDate": 1753445177.271482,
|
399 |
+
"hostOnly": False,
|
400 |
+
"httpOnly": False,
|
401 |
+
"name": "_ga_4P31SJ70EJ",
|
402 |
+
"path": "/",
|
403 |
+
"sameSite": None,
|
404 |
+
"secure": False,
|
405 |
+
"session": False,
|
406 |
+
"storeId": None,
|
407 |
+
"value": "GS1.1.1718885177.1.0.1718885177.0.0.0",
|
408 |
+
},
|
409 |
+
{
|
410 |
+
"domain": ".researchgate.net",
|
411 |
+
"expirationDate": 1718971576,
|
412 |
+
"hostOnly": False,
|
413 |
+
"httpOnly": False,
|
414 |
+
"name": "_gid",
|
415 |
+
"path": "/",
|
416 |
+
"sameSite": None,
|
417 |
+
"secure": False,
|
418 |
+
"session": False,
|
419 |
+
"storeId": None,
|
420 |
+
"value": "GA1.2.854907463.1718885177",
|
421 |
+
},
|
422 |
+
{
|
423 |
+
"domain": ".www.researchgate.net",
|
424 |
+
"expirationDate": 1750407982.506505,
|
425 |
+
"hostOnly": False,
|
426 |
+
"httpOnly": True,
|
427 |
+
"name": "did",
|
428 |
+
"path": "/",
|
429 |
+
"sameSite": None,
|
430 |
+
"secure": True,
|
431 |
+
"session": False,
|
432 |
+
"storeId": None,
|
433 |
+
"value": "1dWLO3C6am8l667Q4VUlBo0O1LI49Qi2Vw21SJEXHavBDYT56DI9007W5rYGVFVH",
|
434 |
+
},
|
435 |
+
{
|
436 |
+
"domain": ".researchgate.net",
|
437 |
+
"expirationDate": 1750507578,
|
438 |
+
"hostOnly": False,
|
439 |
+
"httpOnly": False,
|
440 |
+
"name": "didomi_token",
|
441 |
+
"path": "/",
|
442 |
+
"sameSite": "lax",
|
443 |
+
"secure": True,
|
444 |
+
"session": False,
|
445 |
+
"storeId": None,
|
446 |
+
"value": "eyJ1c2VyX2lkIjoiMTkwMzU4YTUtNWU2My02Y2UzLWJlNzAtZGFjNzVmYjdiY2ExIiwiY3JlYXRlZCI6IjIwMjQtMDYtMjBUMTI6MDY6MTYuODA2WiIsInVwZGF0ZWQiOiIyMDI0LTA2LTIwVDEyOjA2OjE4Ljc4MVoiLCJ2ZW5kb3JzIjp7ImVuYWJsZWQiOlsidHdpdHRlciIsImdvb2dsZSIsImM6bGlua2VkaW4tbWFya2V0aW5nLXNvbHV0aW9ucyIsImM6b3duZXJpcSIsImM6b21uaXR1cmUtYWRvYmUtYW5hbHl0aWNzIiwiYzp0ZWNobm9yYXRpLW1lZGlhIiwiYzppbnRlcmNvbSIsImM6aW50ZW50LWlxIiwiYzppcHJvbSIsImM6bGlua2VkaW4iLCJjOmFtYXpvbmFkdi16Y1hGTEI2WCIsImM6bWVkaWFuZXQtY1V3YUtFNnoiLCJjOmluZGV4ZXhjaC1OWkNRTTY4UCIsImM6emVvdGFwZ21iLWQ3YndtdGp3IiwiYzp0cmlwbGVsaWYtZGRKSDM0clkiLCJjOnJ0YmhvdXNlLWI4Y2RIOHRNIiwiYzptZHByaW1pcy1lYU4yOVdjUCIsImM6bG9vcG1lbGktVGRhWXRCUHEiLCJjOm1hZ25pdGVpbi05d1RZTHFSRCIsImM6Ymlkc3dpdGNoLWQ2N0V3N1c5IiwiYzpvcmFjbGVhZHYtcUhlREptQUwiLCJjOmdvb2dsZWFuYS00VFhuSmlnUiIsImM6bG90YW1lc29sLURIaTdMUmpNIiwiYzpuZXh0bWlsbGUtR0pyZlg4VWMiLCJjOm5yaWNodGVjLXFVVlEyUlFxIiwiYzpicml0ZXBvb2wtQldWeVdHeVUiLCJjOnRhcGFkaW5jLXFxY2tVN1BXIiwiYzppZDV0ZWNobi16Tk1KNGR3ZiIsImM6bWljcm9zb2Z0IiwiYzpwZXJtdXRpdmUtSjdpaHJlTWsiLCJjOm9wZXJhc29mdC1CY1hjRFZKTSIsImM6cG9zdGhvZy1Cakp4RmRGOSJdfSwicHVycG9zZXMiOnsiZW5hYmxlZCI6WyJnZW9sb2NhdGlvbl9kYXRhIiwiZGV2aWNlX2NoYXJhY3RlcmlzdGljcyJdfSwidmVuZG9yc19saSI6eyJlbmFibGVkIjpbImdvb2dsZSIsImM6b3BlcmFzb2Z0LUJjWGNEVkpNIl19LCJ2ZXJzaW9uIjoyLCJhYyI6IkRIU0FvQUZrQWNnQTVnSHFnUUhBeGdCNndEMTRJR0FRTkFqMEJJd0NTY0VyQUtCd1YtZ3MxQmgwREc0R09nQUEuREhTQW9BRmtBY2dBNWdIcWdRSEF4Z0I2d0QxNElHQVFOQWowQkl3Q1NjRXJBS0J3Vi1nczFCaDBERzRHT2dBQSJ9",
|
447 |
+
},
|
448 |
+
{
|
449 |
+
"domain": ".www.researchgate.net",
|
450 |
+
"hostOnly": False,
|
451 |
+
"httpOnly": True,
|
452 |
+
"name": "hasPdpNext",
|
453 |
+
"path": "/",
|
454 |
+
"sameSite": None,
|
455 |
+
"secure": True,
|
456 |
+
"session": True,
|
457 |
+
"storeId": None,
|
458 |
+
"value": "False",
|
459 |
+
},
|
460 |
+
{
|
461 |
+
"domain": ".researchgate.net",
|
462 |
+
"expirationDate": 1750421183,
|
463 |
+
"hostOnly": False,
|
464 |
+
"httpOnly": False,
|
465 |
+
"name": "ph_phc_ma1XTQyee96N1GML6qUTgLQRiDifnRcE9STiHTZ0CfZ_posthog",
|
466 |
+
"path": "/",
|
467 |
+
"sameSite": "lax",
|
468 |
+
"secure": True,
|
469 |
+
"session": False,
|
470 |
+
"storeId": None,
|
471 |
+
"value": "%7B%22distinct_id%22%3A%220190358a-56a1-7313-83b0-d13dddeac787%22%2C%22%24sesid%22%3A%5B1718885183223%2C%220190358a-56a1-7313-83b0-d13b2b87778d%22%2C1718885176993%5D%2C%22%24session_is_sampled%22%3Atrue%7D",
|
472 |
+
},
|
473 |
+
{
|
474 |
+
"domain": ".www.researchgate.net",
|
475 |
+
"hostOnly": False,
|
476 |
+
"httpOnly": True,
|
477 |
+
"name": "sid",
|
478 |
+
"path": "/",
|
479 |
+
"sameSite": None,
|
480 |
+
"secure": True,
|
481 |
+
"session": True,
|
482 |
+
"storeId": None,
|
483 |
+
"value": "qmH5Lc4f0CUJ3zeaxORcV0S8I8V1MuCFZtcIQqPYtv1XPejrbSLAQRbT50PL40TqeKQ1XsQDWt9gtYVzuL80bRmPjw6jn3cQ0ikNqW40maHcQ3JL2Vfa8ZZf0j7p35eJ",
|
484 |
+
},
|
485 |
+
]
|
486 |
+
|
487 |
+
COOKIES_LIST += [
|
488 |
+
{
|
489 |
+
"domain": "github.com",
|
490 |
+
"hostOnly": True,
|
491 |
+
"httpOnly": True,
|
492 |
+
"name": "_gh_sess",
|
493 |
+
"path": "/",
|
494 |
+
"sameSite": "lax",
|
495 |
+
"secure": True,
|
496 |
+
"session": True,
|
497 |
+
"storeId": None,
|
498 |
+
"value": "P%2Fmof1avuqwHaUQUIJR%2FZYn7jqbT7lgGuTGjp1BGAFIG5UpNDusEE3b8dRjz0eATE5xPdPjLYFqMs%2FI9AOalKX4YuYfSEEnxCMawU01099b4o9Xzzcv%2BmecrmO0Q8q%2Bdq1h8SIv6nvPP7HzlFesl8ysafb9b%2F0q6dTArKdSOurasza8UgLSYD08ofA50Pcm0IG7CTzF8ZCizrGgGTMi%2F%2B7L3E17jav5PM1Sf2vQKg15Gbg1QIOppJJHzlufgQoZigqFv%2BWznaws0Tt7Y2lSFCw%3D%3D--CJRhqMXJnwOaJgk4--DhUErlL4GdROikEjKD4O9g%3D%3D",
|
499 |
+
},
|
500 |
+
{
|
501 |
+
"domain": ".github.com",
|
502 |
+
"expirationDate": 1750408875.763785,
|
503 |
+
"hostOnly": False,
|
504 |
+
"httpOnly": False,
|
505 |
+
"name": "_octo",
|
506 |
+
"path": "/",
|
507 |
+
"sameSite": "lax",
|
508 |
+
"secure": True,
|
509 |
+
"session": False,
|
510 |
+
"storeId": None,
|
511 |
+
"value": "GH1.1.728652011.1718872875",
|
512 |
+
},
|
513 |
+
{
|
514 |
+
"domain": ".github.com",
|
515 |
+
"expirationDate": 1750408875.763926,
|
516 |
+
"hostOnly": False,
|
517 |
+
"httpOnly": True,
|
518 |
+
"name": "logged_in",
|
519 |
+
"path": "/",
|
520 |
+
"sameSite": "lax",
|
521 |
+
"secure": True,
|
522 |
+
"session": False,
|
523 |
+
"storeId": None,
|
524 |
+
"value": "no",
|
525 |
+
},
|
526 |
+
{
|
527 |
+
"domain": ".github.com",
|
528 |
+
"hostOnly": False,
|
529 |
+
"httpOnly": False,
|
530 |
+
"name": "preferred_color_mode",
|
531 |
+
"path": "/",
|
532 |
+
"sameSite": "lax",
|
533 |
+
"secure": True,
|
534 |
+
"session": True,
|
535 |
+
"storeId": None,
|
536 |
+
"value": "dark",
|
537 |
+
},
|
538 |
+
{
|
539 |
+
"domain": ".github.com",
|
540 |
+
"hostOnly": False,
|
541 |
+
"httpOnly": False,
|
542 |
+
"name": "tz",
|
543 |
+
"path": "/",
|
544 |
+
"sameSite": "lax",
|
545 |
+
"secure": True,
|
546 |
+
"session": True,
|
547 |
+
"storeId": None,
|
548 |
+
"value": "Europe%2FParis",
|
549 |
+
},
|
550 |
+
]
|
551 |
+
|
552 |
+
COOKIES_LIST += [
|
553 |
+
{
|
554 |
+
"domain": ".web.archive.org",
|
555 |
+
"expirationDate": 1718886430,
|
556 |
+
"hostOnly": False,
|
557 |
+
"httpOnly": False,
|
558 |
+
"name": "_gat",
|
559 |
+
"path": "/web/20201123221659/http://orcid.org/",
|
560 |
+
"sameSite": None,
|
561 |
+
"secure": False,
|
562 |
+
"session": False,
|
563 |
+
"storeId": None,
|
564 |
+
"value": "1",
|
565 |
+
},
|
566 |
+
{
|
567 |
+
"domain": ".web.archive.org",
|
568 |
+
"expirationDate": 1718972770,
|
569 |
+
"hostOnly": False,
|
570 |
+
"httpOnly": False,
|
571 |
+
"name": "_gid",
|
572 |
+
"path": "/web/20201123221659/http://orcid.org/",
|
573 |
+
"sameSite": None,
|
574 |
+
"secure": False,
|
575 |
+
"session": False,
|
576 |
+
"storeId": None,
|
577 |
+
"value": "GA1.2.402246368.1606169825",
|
578 |
+
},
|
579 |
+
{
|
580 |
+
"domain": ".web.archive.org",
|
581 |
+
"expirationDate": 1753446370.315621,
|
582 |
+
"hostOnly": False,
|
583 |
+
"httpOnly": False,
|
584 |
+
"name": "_ga",
|
585 |
+
"path": "/web/20201123221659/http://orcid.org/",
|
586 |
+
"sameSite": None,
|
587 |
+
"secure": False,
|
588 |
+
"session": False,
|
589 |
+
"storeId": None,
|
590 |
+
"value": "GA1.2.1301409987.1606169825",
|
591 |
+
},
|
592 |
+
{
|
593 |
+
"domain": ".web.archive.org",
|
594 |
+
"expirationDate": 1750422367,
|
595 |
+
"hostOnly": False,
|
596 |
+
"httpOnly": False,
|
597 |
+
"name": "_hjid",
|
598 |
+
"path": "/web/20201123221659/http://orcid.org/",
|
599 |
+
"sameSite": "lax",
|
600 |
+
"secure": False,
|
601 |
+
"session": False,
|
602 |
+
"storeId": None,
|
603 |
+
"value": "07f80263-a631-4bf4-8ffd-8fc8912085e2",
|
604 |
+
},
|
605 |
+
{
|
606 |
+
"domain": ".web.archive.org",
|
607 |
+
"expirationDate": 1718888167,
|
608 |
+
"hostOnly": False,
|
609 |
+
"httpOnly": False,
|
610 |
+
"name": "_hjFirstSeen",
|
611 |
+
"path": "/web/20201123221659/http://orcid.org/",
|
612 |
+
"sameSite": "lax",
|
613 |
+
"secure": False,
|
614 |
+
"session": False,
|
615 |
+
"storeId": None,
|
616 |
+
"value": "1",
|
617 |
+
},
|
618 |
+
]
|
619 |
+
COOKIES_LIST += [
|
620 |
+
{
|
621 |
+
"domain": "orcid.org",
|
622 |
+
"hostOnly": True,
|
623 |
+
"httpOnly": False,
|
624 |
+
"name": "AWSELBCORS",
|
625 |
+
"path": "/",
|
626 |
+
"sameSite": "no_restriction",
|
627 |
+
"secure": True,
|
628 |
+
"session": True,
|
629 |
+
"storeId": None,
|
630 |
+
"value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
|
631 |
+
},
|
632 |
+
{
|
633 |
+
"domain": ".orcid.org",
|
634 |
+
"expirationDate": 1753452454.637671,
|
635 |
+
"hostOnly": False,
|
636 |
+
"httpOnly": False,
|
637 |
+
"name": "_ga_9R61FWK9H5",
|
638 |
+
"path": "/",
|
639 |
+
"sameSite": None,
|
640 |
+
"secure": False,
|
641 |
+
"session": False,
|
642 |
+
"storeId": None,
|
643 |
+
"value": "GS1.1.1718892454.1.0.1718892454.0.0.0",
|
644 |
+
},
|
645 |
+
{
|
646 |
+
"domain": ".orcid.org",
|
647 |
+
"expirationDate": 1753452454.63421,
|
648 |
+
"hostOnly": False,
|
649 |
+
"httpOnly": False,
|
650 |
+
"name": "_ga",
|
651 |
+
"path": "/",
|
652 |
+
"sameSite": None,
|
653 |
+
"secure": False,
|
654 |
+
"session": False,
|
655 |
+
"storeId": None,
|
656 |
+
"value": "GA1.1.2021310691.1718892455",
|
657 |
+
},
|
658 |
+
{
|
659 |
+
"domain": "orcid.org",
|
660 |
+
"hostOnly": True,
|
661 |
+
"httpOnly": False,
|
662 |
+
"name": "AWSELB",
|
663 |
+
"path": "/",
|
664 |
+
"sameSite": None,
|
665 |
+
"secure": False,
|
666 |
+
"session": True,
|
667 |
+
"storeId": None,
|
668 |
+
"value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
|
669 |
+
},
|
670 |
+
{
|
671 |
+
"domain": ".orcid.org",
|
672 |
+
"expirationDate": 1750428454,
|
673 |
+
"hostOnly": False,
|
674 |
+
"httpOnly": False,
|
675 |
+
"name": "OptanonAlertBoxClosed",
|
676 |
+
"path": "/",
|
677 |
+
"sameSite": "lax",
|
678 |
+
"secure": False,
|
679 |
+
"session": False,
|
680 |
+
"storeId": None,
|
681 |
+
"value": "2024-06-20T14:07:34.583Z",
|
682 |
+
},
|
683 |
+
{
|
684 |
+
"domain": ".orcid.org",
|
685 |
+
"expirationDate": 1750428454,
|
686 |
+
"hostOnly": False,
|
687 |
+
"httpOnly": False,
|
688 |
+
"name": "OptanonConsent",
|
689 |
+
"path": "/",
|
690 |
+
"sameSite": "lax",
|
691 |
+
"secure": False,
|
692 |
+
"session": False,
|
693 |
+
"storeId": None,
|
694 |
+
"value": "isGpcEnabled=0&datestamp=Thu+Jun+20+2024+16%3A07%3A34+GMT%2B0200+(heure+d%E2%80%99%C3%A9t%C3%A9+d%E2%80%99Europe+centrale)&version=202310.2.0&browserGpcFlag=0&isIABGlobal=False&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0004%3A1",
|
695 |
+
},
|
696 |
+
{
|
697 |
+
"domain": "orcid.org",
|
698 |
+
"hostOnly": True,
|
699 |
+
"httpOnly": False,
|
700 |
+
"name": "XSRF-TOKEN",
|
701 |
+
"path": "/",
|
702 |
+
"sameSite": None,
|
703 |
+
"secure": True,
|
704 |
+
"session": True,
|
705 |
+
"storeId": None,
|
706 |
+
"value": "6957be7a-bcb4-4d59-a522-ea9b6b210ed9",
|
707 |
+
},
|
708 |
+
]
|
709 |
+
|
710 |
+
# Shared cookie jar exported by this module; it starts empty and is filled
# from COOKIES_LIST below.
COOKIES = RequestsCookieJar()

# Only the name, value, domain and path of each exported entry are copied
# into the jar; the remaining keys of an entry (expirationDate, secure,
# httpOnly, sameSite, ...) are not forwarded.
for cookie in COOKIES_LIST:
    COOKIES.set(
        cookie["name"],
        cookie["value"],
        path=cookie["path"],
        domain=cookie["domain"],
    )
|
tools/text_web_browser.py
ADDED
@@ -0,0 +1,567 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource!
|
2 |
+
# https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
|
3 |
+
import mimetypes
|
4 |
+
import os
|
5 |
+
import pathlib
|
6 |
+
import re
|
7 |
+
import time
|
8 |
+
import uuid
|
9 |
+
from typing import Any
|
10 |
+
from urllib.parse import unquote, urljoin, urlparse
|
11 |
+
|
12 |
+
import pathvalidate
|
13 |
+
import requests
|
14 |
+
from serpapi import GoogleSearch
|
15 |
+
|
16 |
+
from smolagents import Tool
|
17 |
+
|
18 |
+
from .cookies import COOKIES
|
19 |
+
from .mdconvert import FileConversionException, MarkdownConverter, UnsupportedFormatException
|
20 |
+
|
21 |
+
|
22 |
+
class SimpleTextBrowser:
|
23 |
+
"""(In preview) An extremely simple text-based web browser comparable to Lynx. Suitable for Agentic use."""
|
24 |
+
|
25 |
+
def __init__(
|
26 |
+
self,
|
27 |
+
start_page: str | None = None,
|
28 |
+
viewport_size: int | None = 1024 * 8,
|
29 |
+
downloads_folder: str | None | None = None,
|
30 |
+
serpapi_key: str | None | None = None,
|
31 |
+
request_kwargs: dict[str, Any] | None | None = None,
|
32 |
+
):
|
33 |
+
self.start_page: str = start_page if start_page else "about:blank"
|
34 |
+
self.viewport_size = viewport_size # Applies only to the standard uri types
|
35 |
+
self.downloads_folder = downloads_folder
|
36 |
+
self.history: list[tuple[str, float]] = list()
|
37 |
+
self.page_title: str | None = None
|
38 |
+
self.viewport_current_page = 0
|
39 |
+
self.viewport_pages: list[tuple[int, int]] = list()
|
40 |
+
self.set_address(self.start_page)
|
41 |
+
self.serpapi_key = serpapi_key
|
42 |
+
self.request_kwargs = request_kwargs
|
43 |
+
self.request_kwargs["cookies"] = COOKIES
|
44 |
+
self._mdconvert = MarkdownConverter()
|
45 |
+
self._page_content: str = ""
|
46 |
+
|
47 |
+
self._find_on_page_query: str | None = None
|
48 |
+
self._find_on_page_last_result: int | None = None # Location of the last result
|
49 |
+
|
50 |
+
@property
|
51 |
+
def address(self) -> str:
|
52 |
+
"""Return the address of the current page."""
|
53 |
+
return self.history[-1][0]
|
54 |
+
|
55 |
+
def set_address(self, uri_or_path: str, filter_year: int | None = None) -> None:
|
56 |
+
# TODO: Handle anchors
|
57 |
+
self.history.append((uri_or_path, time.time()))
|
58 |
+
|
59 |
+
# Handle special URIs
|
60 |
+
if uri_or_path == "about:blank":
|
61 |
+
self._set_page_content("")
|
62 |
+
elif uri_or_path.startswith("google:"):
|
63 |
+
self._serpapi_search(uri_or_path[len("google:") :].strip(), filter_year=filter_year)
|
64 |
+
else:
|
65 |
+
if (
|
66 |
+
not uri_or_path.startswith("http:")
|
67 |
+
and not uri_or_path.startswith("https:")
|
68 |
+
and not uri_or_path.startswith("file:")
|
69 |
+
):
|
70 |
+
if len(self.history) > 1:
|
71 |
+
prior_address = self.history[-2][0]
|
72 |
+
uri_or_path = urljoin(prior_address, uri_or_path)
|
73 |
+
# Update the address with the fully-qualified path
|
74 |
+
self.history[-1] = (uri_or_path, self.history[-1][1])
|
75 |
+
self._fetch_page(uri_or_path)
|
76 |
+
|
77 |
+
self.viewport_current_page = 0
|
78 |
+
self.find_on_page_query = None
|
79 |
+
self.find_on_page_viewport = None
|
80 |
+
|
81 |
+
@property
|
82 |
+
def viewport(self) -> str:
|
83 |
+
"""Return the content of the current viewport."""
|
84 |
+
bounds = self.viewport_pages[self.viewport_current_page]
|
85 |
+
return self.page_content[bounds[0] : bounds[1]]
|
86 |
+
|
87 |
+
@property
|
88 |
+
def page_content(self) -> str:
|
89 |
+
"""Return the full contents of the current page."""
|
90 |
+
return self._page_content
|
91 |
+
|
92 |
+
def _set_page_content(self, content: str) -> None:
|
93 |
+
"""Sets the text content of the current page."""
|
94 |
+
self._page_content = content
|
95 |
+
self._split_pages()
|
96 |
+
if self.viewport_current_page >= len(self.viewport_pages):
|
97 |
+
self.viewport_current_page = len(self.viewport_pages) - 1
|
98 |
+
|
99 |
+
def page_down(self) -> None:
|
100 |
+
self.viewport_current_page = min(self.viewport_current_page + 1, len(self.viewport_pages) - 1)
|
101 |
+
|
102 |
+
def page_up(self) -> None:
|
103 |
+
self.viewport_current_page = max(self.viewport_current_page - 1, 0)
|
104 |
+
|
105 |
+
def find_on_page(self, query: str) -> str | None:
|
106 |
+
"""Searches for the query from the current viewport forward, looping back to the start if necessary."""
|
107 |
+
|
108 |
+
# Did we get here via a previous find_on_page search with the same query?
|
109 |
+
# If so, map to find_next
|
110 |
+
if query == self._find_on_page_query and self.viewport_current_page == self._find_on_page_last_result:
|
111 |
+
return self.find_next()
|
112 |
+
|
113 |
+
# Ok it's a new search start from the current viewport
|
114 |
+
self._find_on_page_query = query
|
115 |
+
viewport_match = self._find_next_viewport(query, self.viewport_current_page)
|
116 |
+
if viewport_match is None:
|
117 |
+
self._find_on_page_last_result = None
|
118 |
+
return None
|
119 |
+
else:
|
120 |
+
self.viewport_current_page = viewport_match
|
121 |
+
self._find_on_page_last_result = viewport_match
|
122 |
+
return self.viewport
|
123 |
+
|
124 |
+
def find_next(self) -> str | None:
|
125 |
+
"""Scroll to the next viewport that matches the query"""
|
126 |
+
|
127 |
+
if self._find_on_page_query is None:
|
128 |
+
return None
|
129 |
+
|
130 |
+
starting_viewport = self._find_on_page_last_result
|
131 |
+
if starting_viewport is None:
|
132 |
+
starting_viewport = 0
|
133 |
+
else:
|
134 |
+
starting_viewport += 1
|
135 |
+
if starting_viewport >= len(self.viewport_pages):
|
136 |
+
starting_viewport = 0
|
137 |
+
|
138 |
+
viewport_match = self._find_next_viewport(self._find_on_page_query, starting_viewport)
|
139 |
+
if viewport_match is None:
|
140 |
+
self._find_on_page_last_result = None
|
141 |
+
return None
|
142 |
+
else:
|
143 |
+
self.viewport_current_page = viewport_match
|
144 |
+
self._find_on_page_last_result = viewport_match
|
145 |
+
return self.viewport
|
146 |
+
|
147 |
+
def _find_next_viewport(self, query: str, starting_viewport: int) -> int | None:
|
148 |
+
"""Search for matches between the starting viewport looping when reaching the end."""
|
149 |
+
|
150 |
+
if query is None:
|
151 |
+
return None
|
152 |
+
|
153 |
+
# Normalize the query, and convert to a regular expression
|
154 |
+
nquery = re.sub(r"\*", "__STAR__", query)
|
155 |
+
nquery = " " + (" ".join(re.split(r"\W+", nquery))).strip() + " "
|
156 |
+
nquery = nquery.replace(" __STAR__ ", "__STAR__ ") # Merge isolated stars with prior word
|
157 |
+
nquery = nquery.replace("__STAR__", ".*").lower()
|
158 |
+
|
159 |
+
if nquery.strip() == "":
|
160 |
+
return None
|
161 |
+
|
162 |
+
idxs = list()
|
163 |
+
idxs.extend(range(starting_viewport, len(self.viewport_pages)))
|
164 |
+
idxs.extend(range(0, starting_viewport))
|
165 |
+
|
166 |
+
for i in idxs:
|
167 |
+
bounds = self.viewport_pages[i]
|
168 |
+
content = self.page_content[bounds[0] : bounds[1]]
|
169 |
+
|
170 |
+
# TODO: Remove markdown links and images
|
171 |
+
ncontent = " " + (" ".join(re.split(r"\W+", content))).strip().lower() + " "
|
172 |
+
if re.search(nquery, ncontent):
|
173 |
+
return i
|
174 |
+
|
175 |
+
return None
|
176 |
+
|
177 |
+
def visit_page(self, path_or_uri: str, filter_year: int | None = None) -> str:
|
178 |
+
"""Update the address, visit the page, and return the content of the viewport."""
|
179 |
+
self.set_address(path_or_uri, filter_year=filter_year)
|
180 |
+
return self.viewport
|
181 |
+
|
182 |
+
def _split_pages(self) -> None:
|
183 |
+
# Do not split search results
|
184 |
+
if self.address.startswith("google:"):
|
185 |
+
self.viewport_pages = [(0, len(self._page_content))]
|
186 |
+
return
|
187 |
+
|
188 |
+
# Handle empty pages
|
189 |
+
if len(self._page_content) == 0:
|
190 |
+
self.viewport_pages = [(0, 0)]
|
191 |
+
return
|
192 |
+
|
193 |
+
# Break the viewport into pages
|
194 |
+
self.viewport_pages = []
|
195 |
+
start_idx = 0
|
196 |
+
while start_idx < len(self._page_content):
|
197 |
+
end_idx = min(start_idx + self.viewport_size, len(self._page_content)) # type: ignore[operator]
|
198 |
+
# Adjust to end on a space
|
199 |
+
while end_idx < len(self._page_content) and self._page_content[end_idx - 1] not in [" ", "\t", "\r", "\n"]:
|
200 |
+
end_idx += 1
|
201 |
+
self.viewport_pages.append((start_idx, end_idx))
|
202 |
+
start_idx = end_idx
|
203 |
+
|
204 |
+
def _serpapi_search(self, query: str, filter_year: int | None = None) -> None:
|
205 |
+
if self.serpapi_key is None:
|
206 |
+
raise ValueError("Missing SerpAPI key.")
|
207 |
+
|
208 |
+
params = {
|
209 |
+
"engine": "google",
|
210 |
+
"q": query,
|
211 |
+
"api_key": self.serpapi_key,
|
212 |
+
}
|
213 |
+
if filter_year is not None:
|
214 |
+
params["tbs"] = f"cdr:1,cd_min:01/01/{filter_year},cd_max:12/31/{filter_year}"
|
215 |
+
|
216 |
+
search = GoogleSearch(params)
|
217 |
+
results = search.get_dict()
|
218 |
+
self.page_title = f"{query} - Search"
|
219 |
+
if "organic_results" not in results.keys():
|
220 |
+
raise Exception(f"No results found for query: '{query}'. Use a less specific query.")
|
221 |
+
if len(results["organic_results"]) == 0:
|
222 |
+
year_filter_message = f" with filter year={filter_year}" if filter_year is not None else ""
|
223 |
+
self._set_page_content(
|
224 |
+
f"No results found for '{query}'{year_filter_message}. Try with a more general query, or remove the year filter."
|
225 |
+
)
|
226 |
+
return
|
227 |
+
|
228 |
+
def _prev_visit(url):
|
229 |
+
for i in range(len(self.history) - 1, -1, -1):
|
230 |
+
if self.history[i][0] == url:
|
231 |
+
return f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
|
232 |
+
return ""
|
233 |
+
|
234 |
+
web_snippets: list[str] = list()
|
235 |
+
idx = 0
|
236 |
+
if "organic_results" in results:
|
237 |
+
for page in results["organic_results"]:
|
238 |
+
idx += 1
|
239 |
+
date_published = ""
|
240 |
+
if "date" in page:
|
241 |
+
date_published = "\nDate published: " + page["date"]
|
242 |
+
|
243 |
+
source = ""
|
244 |
+
if "source" in page:
|
245 |
+
source = "\nSource: " + page["source"]
|
246 |
+
|
247 |
+
snippet = ""
|
248 |
+
if "snippet" in page:
|
249 |
+
snippet = "\n" + page["snippet"]
|
250 |
+
|
251 |
+
redacted_version = f"{idx}. [{page['title']}]({page['link']}){date_published}{source}\n{_prev_visit(page['link'])}{snippet}"
|
252 |
+
|
253 |
+
redacted_version = redacted_version.replace("Your browser can't play this video.", "")
|
254 |
+
web_snippets.append(redacted_version)
|
255 |
+
|
256 |
+
content = (
|
257 |
+
f"A Google search for '{query}' found {len(web_snippets)} results:\n\n## Web Results\n"
|
258 |
+
+ "\n\n".join(web_snippets)
|
259 |
+
)
|
260 |
+
|
261 |
+
self._set_page_content(content)
|
262 |
+
|
263 |
+
def _fetch_page(self, url: str) -> None:
|
264 |
+
download_path = ""
|
265 |
+
try:
|
266 |
+
if url.startswith("file://"):
|
267 |
+
download_path = os.path.normcase(os.path.normpath(unquote(url[7:])))
|
268 |
+
res = self._mdconvert.convert_local(download_path)
|
269 |
+
self.page_title = res.title
|
270 |
+
self._set_page_content(res.text_content)
|
271 |
+
else:
|
272 |
+
# Prepare the request parameters
|
273 |
+
request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {}
|
274 |
+
request_kwargs["stream"] = True
|
275 |
+
|
276 |
+
# Send a HTTP request to the URL
|
277 |
+
response = requests.get(url, **request_kwargs)
|
278 |
+
response.raise_for_status()
|
279 |
+
|
280 |
+
# If the HTTP request was successful
|
281 |
+
content_type = response.headers.get("content-type", "")
|
282 |
+
|
283 |
+
# Text or HTML
|
284 |
+
if "text/" in content_type.lower():
|
285 |
+
res = self._mdconvert.convert_response(response)
|
286 |
+
self.page_title = res.title
|
287 |
+
self._set_page_content(res.text_content)
|
288 |
+
# A download
|
289 |
+
else:
|
290 |
+
# Try producing a safe filename
|
291 |
+
fname = None
|
292 |
+
download_path = None
|
293 |
+
try:
|
294 |
+
fname = pathvalidate.sanitize_filename(os.path.basename(urlparse(url).path)).strip()
|
295 |
+
download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))
|
296 |
+
|
297 |
+
suffix = 0
|
298 |
+
while os.path.exists(download_path) and suffix < 1000:
|
299 |
+
suffix += 1
|
300 |
+
base, ext = os.path.splitext(fname)
|
301 |
+
new_fname = f"{base}__{suffix}{ext}"
|
302 |
+
download_path = os.path.abspath(os.path.join(self.downloads_folder, new_fname))
|
303 |
+
|
304 |
+
except NameError:
|
305 |
+
pass
|
306 |
+
|
307 |
+
# No suitable name, so make one
|
308 |
+
if fname is None:
|
309 |
+
extension = mimetypes.guess_extension(content_type)
|
310 |
+
if extension is None:
|
311 |
+
extension = ".download"
|
312 |
+
fname = str(uuid.uuid4()) + extension
|
313 |
+
download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))
|
314 |
+
|
315 |
+
# Open a file for writing
|
316 |
+
with open(download_path, "wb") as fh:
|
317 |
+
for chunk in response.iter_content(chunk_size=512):
|
318 |
+
fh.write(chunk)
|
319 |
+
|
320 |
+
# Render it
|
321 |
+
local_uri = pathlib.Path(download_path).as_uri()
|
322 |
+
self.set_address(local_uri)
|
323 |
+
|
324 |
+
except UnsupportedFormatException as e:
|
325 |
+
print(e)
|
326 |
+
self.page_title = ("Download complete.",)
|
327 |
+
self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
|
328 |
+
except FileConversionException as e:
|
329 |
+
print(e)
|
330 |
+
self.page_title = ("Download complete.",)
|
331 |
+
self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
|
332 |
+
except FileNotFoundError:
|
333 |
+
self.page_title = "Error 404"
|
334 |
+
self._set_page_content(f"## Error 404\n\nFile not found: {download_path}")
|
335 |
+
except requests.exceptions.RequestException as request_exception:
|
336 |
+
try:
|
337 |
+
self.page_title = f"Error {response.status_code}"
|
338 |
+
|
339 |
+
# If the error was rendered in HTML we might as well render it
|
340 |
+
content_type = response.headers.get("content-type", "")
|
341 |
+
if content_type is not None and "text/html" in content_type.lower():
|
342 |
+
res = self._mdconvert.convert(response)
|
343 |
+
self.page_title = f"Error {response.status_code}"
|
344 |
+
self._set_page_content(f"## Error {response.status_code}\n\n{res.text_content}")
|
345 |
+
else:
|
346 |
+
text = ""
|
347 |
+
for chunk in response.iter_content(chunk_size=512, decode_unicode=True):
|
348 |
+
text += chunk
|
349 |
+
self.page_title = f"Error {response.status_code}"
|
350 |
+
self._set_page_content(f"## Error {response.status_code}\n\n{text}")
|
351 |
+
except NameError:
|
352 |
+
self.page_title = "Error"
|
353 |
+
self._set_page_content(f"## Error\n\n{str(request_exception)}")
|
354 |
+
|
355 |
+
def _state(self) -> tuple[str, str]:
|
356 |
+
header = f"Address: {self.address}\n"
|
357 |
+
if self.page_title is not None:
|
358 |
+
header += f"Title: {self.page_title}\n"
|
359 |
+
|
360 |
+
current_page = self.viewport_current_page
|
361 |
+
total_pages = len(self.viewport_pages)
|
362 |
+
|
363 |
+
address = self.address
|
364 |
+
for i in range(len(self.history) - 2, -1, -1): # Start from the second last
|
365 |
+
if self.history[i][0] == address:
|
366 |
+
header += f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
|
367 |
+
break
|
368 |
+
|
369 |
+
header += f"Viewport position: Showing page {current_page + 1} of {total_pages}.\n"
|
370 |
+
return (header, self.viewport)
|
371 |
+
|
372 |
+
|
373 |
+
class SearchInformationTool(Tool):
    """Tool that runs a web search in the shared browser and returns the rendered results."""

    name = "web_search"
    description = "Perform a web search query (think a google search) and returns the search results."
    inputs = {
        "query": {"type": "string", "description": "The web search query to perform."},
        "filter_year": {
            "type": "string",
            "description": "[Optional parameter]: filter the search results to only include pages from a specific year. For example, '2020' will only include pages from 2020. Make sure to use this parameter if you're trying to search for articles from a specific date!",
            "nullable": True,
        },
    }
    output_type = "string"

    def __init__(self, browser):
        super().__init__()
        self.browser = browser

    def forward(self, query: str, filter_year: int | None = None) -> str:
        """Visit the ``google:`` address for ``query`` and return header plus viewport text."""
        search_address = f"google: {query}"
        self.browser.visit_page(search_address, filter_year=filter_year)
        state_header, state_body = self.browser._state()
        return "\n=======================\n".join((state_header.strip(), state_body))
|
392 |
+
|
393 |
+
|
394 |
+
class VisitTool(Tool):
    """Tool that opens a URL in the shared browser and returns the first viewport."""

    name = "visit_page"
    description = "Visit a webpage at a given URL and return its text. Given a url to a YouTube video, this returns the transcript."
    inputs = {
        "url": {
            "type": "string",
            "description": "The relative or absolute url of the webpage to visit.",
        }
    }
    output_type = "string"

    def __init__(self, browser=None):
        super().__init__()
        self.browser = browser

    def forward(self, url: str) -> str:
        """Navigate to ``url`` and return the browser header followed by the viewport text."""
        self.browser.visit_page(url)
        state_header, state_body = self.browser._state()
        return "\n=======================\n".join((state_header.strip(), state_body))
|
408 |
+
|
409 |
+
|
410 |
+
class DownloadTool(Tool):
    """Tool that downloads a binary file to ``./downloads`` and reports where it was saved."""

    name = "download_file"
    description = """
Download a file at a given URL. The file should be of this format: [".xlsx", ".pptx", ".wav", ".mp3", ".m4a", ".png", ".docx"]
After using this tool, for further inspection of this page you should return the download path to your manager via final_answer, and they will be able to inspect it.
DO NOT use this tool for .pdf or .txt or .htm files: for these types of files use visit_page with the file url instead."""
    inputs = {"url": {"type": "string", "description": "The relative or absolute url of the file to be downloaded."}}
    output_type = "string"

    def __init__(self, browser):
        super().__init__()
        self.browser = browser

    def forward(self, url: str) -> str:
        """Download ``url`` and return a message containing the saved path.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            Exception: If the response looks like a pdf, txt or html document,
                which should be opened with ``visit_page`` instead.
        """
        import requests

        if "arxiv" in url:
            # Point abstract pages at the PDF; only touch the path segment so
            # an "abs" substring elsewhere in the URL is left alone.
            url = url.replace("/abs/", "/pdf/")
        response = requests.get(url)
        # Do not save an HTTP error body as if it were the requested file.
        response.raise_for_status()

        content_type = response.headers.get("content-type", "")
        # guess_extension wants the bare type, without "; charset=..." parameters.
        extension = mimetypes.guess_extension(content_type.split(";")[0].strip())

        # Reject unsupported document types before anything is written to disk.
        if extension and any(kind in extension for kind in ("pdf", "txt", "htm")):
            raise Exception("Do not use this tool for pdf or txt or html files: use visit_page instead.")

        new_path = f"./downloads/file{extension}" if extension else "./downloads/file.object"

        # The target directory may not exist yet on a fresh checkout.
        os.makedirs(os.path.dirname(new_path), exist_ok=True)
        with open(new_path, "wb") as f:
            f.write(response.content)

        return f"File was downloaded and saved under path {new_path}."
|
443 |
+
|
444 |
+
|
445 |
+
class ArchiveSearchTool(Tool):
    """Tool that looks up a Wayback Machine snapshot of a URL and opens it in the browser."""

    name = "find_archived_url"
    description = "Given a url, searches the Wayback Machine and returns the archived version of the url that's closest in time to the desired date."
    inputs = {
        "url": {"type": "string", "description": "The url you need the archive for."},
        "date": {
            "type": "string",
            "description": "The date that you want to find the archive for. Give this date in the format 'YYYYMMDD', for instance '27 June 2008' is written as '20080627'.",
        },
    }
    output_type = "string"

    def __init__(self, browser=None):
        super().__init__()
        self.browser = browser

    @staticmethod
    def _closest_snapshot(payload):
        """Return the ``closest`` snapshot dict from an availability response, or ``None``."""
        snapshots = payload.get("archived_snapshots") if isinstance(payload, dict) else None
        if isinstance(snapshots, dict) and "closest" in snapshots:
            return snapshots["closest"]
        return None

    def forward(self, url, date) -> str:
        """Open the archived snapshot of ``url`` closest to ``date`` and return its text.

        The availability endpoint is queried with the timestamp first, and
        again without it only when that lookup returns no snapshot.

        Raises:
            Exception: If neither lookup returns a snapshot.
        """
        import requests

        availability_api = "https://archive.org/wayback/available"

        # Let requests build the query string so the target url is escaped and
        # the timestamp is sent as a real "timestamp" parameter.
        closest = self._closest_snapshot(
            requests.get(availability_api, params={"url": url, "timestamp": date}).json()
        )
        if closest is None:
            closest = self._closest_snapshot(requests.get(availability_api, params={"url": url}).json())
        if closest is None:
            raise Exception(f"Your {url=} was not archived on Wayback Machine, try a different url.")
        print("Archive found!", closest)

        target_url = closest["url"]
        self.browser.visit_page(target_url)
        header, content = self.browser._state()
        return (
            f"Web archive for url {url}, snapshot taken at date {closest['timestamp'][:8]}:\n"
            + header.strip()
            + "\n=======================\n"
            + content
        )
|
486 |
+
|
487 |
+
|
488 |
+
class PageUpTool(Tool):
    """Tool that scrolls the shared browser one viewport page up."""

    name = "page_up"
    description = "Scroll the viewport UP one page-length in the current webpage and return the new viewport content."
    inputs = {}
    output_type = "string"

    def __init__(self, browser=None):
        super().__init__()
        self.browser = browser

    def forward(self) -> str:
        """Scroll up and return the browser header followed by the new viewport text."""
        self.browser.page_up()
        state_header, state_body = self.browser._state()
        return "\n=======================\n".join((state_header.strip(), state_body))
|
502 |
+
|
503 |
+
|
504 |
+
class PageDownTool(Tool):
    """Tool that scrolls the shared browser one viewport page down."""

    name = "page_down"
    description = "Scroll the viewport DOWN one page-length in the current webpage and return the new viewport content."
    inputs = {}
    output_type = "string"

    def __init__(self, browser=None):
        super().__init__()
        self.browser = browser

    def forward(self) -> str:
        """Scroll down and return the browser header followed by the new viewport text."""
        self.browser.page_down()
        state_header, state_body = self.browser._state()
        return "\n=======================\n".join((state_header.strip(), state_body))
|
520 |
+
|
521 |
+
|
522 |
+
class FinderTool(Tool):
    """Tool that jumps the shared browser to a viewport matching a search string."""

    name = "find_on_page_ctrl_f"
    description = "Scroll the viewport to the first occurrence of the search string. This is equivalent to Ctrl+F."
    inputs = {
        "search_string": {
            "type": "string",
            "description": "The string to search for on the page. This search string supports wildcards like '*'",
        }
    }
    output_type = "string"

    def __init__(self, browser=None):
        super().__init__()
        self.browser = browser

    def forward(self, search_string: str) -> str:
        """Search the page and return the header plus either the match or a not-found note."""
        match = self.browser.find_on_page(search_string)
        state_header, state_body = self.browser._state()
        banner = state_header.strip()

        if match is not None:
            return banner + "\n=======================\n" + state_body

        return banner + f"\n=======================\nThe search string '{search_string}' was not found on this page."
|
548 |
+
|
549 |
+
|
550 |
+
class FindNextTool(Tool):
    """Tool that jumps the shared browser to the next viewport matching the active search."""

    name = "find_next"
    description = "Scroll the viewport to next occurrence of the search string. This is equivalent to finding the next match in a Ctrl+F search."
    inputs = {}
    output_type = "string"

    def __init__(self, browser=None):
        super().__init__()
        self.browser = browser

    def forward(self) -> str:
        """Advance to the next match and return the header plus the match or a not-found note."""
        match = self.browser.find_next()
        state_header, state_body = self.browser._state()
        banner = state_header.strip()

        if match is not None:
            return banner + "\n=======================\n" + state_body

        return banner + "\n=======================\nThe search string was not found on this page."
|