Spaces:
Running
Running
Merge branch 'rolling' into rolling
Browse files
- Cargo.lock +56 -4
- Cargo.toml +1 -1
- src/engines/brave.rs +3 -3
- src/engines/duckduckgo.rs +3 -2
- src/engines/searx.rs +3 -2
- src/models/engine_models.rs +5 -5
- src/results/aggregator.rs +15 -1
Cargo.lock
CHANGED
@@ -284,6 +284,21 @@ dependencies = [
|
|
284 |
"memchr",
|
285 |
]
|
286 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
287 |
[[package]]
|
288 |
name = "anes"
|
289 |
version = "0.1.6"
|
@@ -326,6 +341,20 @@ version = "0.10.3"
|
|
326 |
source = "registry+https://github.com/rust-lang/crates.io-index"
|
327 |
checksum = "619743e34b5ba4e9703bba34deac3427c72507c7159f5fd030aea8cac0cfe341"
|
328 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
329 |
[[package]]
|
330 |
name = "async-once-cell"
|
331 |
version = "0.5.3"
|
@@ -437,6 +466,27 @@ dependencies = [
|
|
437 |
"generic-array",
|
438 |
]
|
439 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
440 |
[[package]]
|
441 |
name = "bstr"
|
442 |
version = "1.7.0"
|
@@ -502,9 +552,9 @@ dependencies = [
|
|
502 |
|
503 |
[[package]]
|
504 |
name = "cargo-platform"
|
505 |
-
version = "0.1.
|
506 |
source = "registry+https://github.com/rust-lang/crates.io-index"
|
507 |
-
checksum = "
|
508 |
dependencies = [
|
509 |
"serde",
|
510 |
]
|
@@ -2840,6 +2890,7 @@ version = "0.11.22"
|
|
2840 |
source = "registry+https://github.com/rust-lang/crates.io-index"
|
2841 |
checksum = "046cd98826c46c2ac8ddecae268eb5c2e58628688a5fc7a2643704a73faba95b"
|
2842 |
dependencies = [
|
|
|
2843 |
"base64 0.21.5",
|
2844 |
"bytes 1.5.0",
|
2845 |
"encoding_rs",
|
@@ -2865,6 +2916,7 @@ dependencies = [
|
|
2865 |
"system-configuration",
|
2866 |
"tokio 1.33.0",
|
2867 |
"tokio-rustls",
|
|
|
2868 |
"tower-service",
|
2869 |
"url 2.4.1",
|
2870 |
"wasm-bindgen",
|
@@ -2920,9 +2972,9 @@ dependencies = [
|
|
2920 |
|
2921 |
[[package]]
|
2922 |
name = "rustix"
|
2923 |
-
version = "0.38.
|
2924 |
source = "registry+https://github.com/rust-lang/crates.io-index"
|
2925 |
-
checksum = "
|
2926 |
dependencies = [
|
2927 |
"bitflags 2.4.1",
|
2928 |
"errno",
|
|
|
284 |
"memchr",
|
285 |
]
|
286 |
|
287 |
+
[[package]]
|
288 |
+
name = "alloc-no-stdlib"
|
289 |
+
version = "2.0.4"
|
290 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
291 |
+
checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3"
|
292 |
+
|
293 |
+
[[package]]
|
294 |
+
name = "alloc-stdlib"
|
295 |
+
version = "0.2.2"
|
296 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
297 |
+
checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece"
|
298 |
+
dependencies = [
|
299 |
+
"alloc-no-stdlib",
|
300 |
+
]
|
301 |
+
|
302 |
[[package]]
|
303 |
name = "anes"
|
304 |
version = "0.1.6"
|
|
|
341 |
source = "registry+https://github.com/rust-lang/crates.io-index"
|
342 |
checksum = "619743e34b5ba4e9703bba34deac3427c72507c7159f5fd030aea8cac0cfe341"
|
343 |
|
344 |
+
[[package]]
|
345 |
+
name = "async-compression"
|
346 |
+
version = "0.4.5"
|
347 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
348 |
+
checksum = "bc2d0cfb2a7388d34f590e76686704c494ed7aaceed62ee1ba35cbf363abc2a5"
|
349 |
+
dependencies = [
|
350 |
+
"brotli",
|
351 |
+
"flate2",
|
352 |
+
"futures-core",
|
353 |
+
"memchr",
|
354 |
+
"pin-project-lite",
|
355 |
+
"tokio 1.34.0",
|
356 |
+
]
|
357 |
+
|
358 |
[[package]]
|
359 |
name = "async-once-cell"
|
360 |
version = "0.5.3"
|
|
|
466 |
"generic-array",
|
467 |
]
|
468 |
|
469 |
+
[[package]]
|
470 |
+
name = "brotli"
|
471 |
+
version = "3.4.0"
|
472 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
473 |
+
checksum = "516074a47ef4bce09577a3b379392300159ce5b1ba2e501ff1c819950066100f"
|
474 |
+
dependencies = [
|
475 |
+
"alloc-no-stdlib",
|
476 |
+
"alloc-stdlib",
|
477 |
+
"brotli-decompressor",
|
478 |
+
]
|
479 |
+
|
480 |
+
[[package]]
|
481 |
+
name = "brotli-decompressor"
|
482 |
+
version = "2.5.1"
|
483 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
484 |
+
checksum = "4e2e4afe60d7dd600fdd3de8d0f08c2b7ec039712e3b6137ff98b7004e82de4f"
|
485 |
+
dependencies = [
|
486 |
+
"alloc-no-stdlib",
|
487 |
+
"alloc-stdlib",
|
488 |
+
]
|
489 |
+
|
490 |
[[package]]
|
491 |
name = "bstr"
|
492 |
version = "1.7.0"
|
|
|
552 |
|
553 |
[[package]]
|
554 |
name = "cargo-platform"
|
555 |
+
version = "0.1.5"
|
556 |
source = "registry+https://github.com/rust-lang/crates.io-index"
|
557 |
+
checksum = "e34637b3140142bdf929fb439e8aa4ebad7651ebf7b1080b3930aa16ac1459ff"
|
558 |
dependencies = [
|
559 |
"serde",
|
560 |
]
|
|
|
2890 |
source = "registry+https://github.com/rust-lang/crates.io-index"
|
2891 |
checksum = "046cd98826c46c2ac8ddecae268eb5c2e58628688a5fc7a2643704a73faba95b"
|
2892 |
dependencies = [
|
2893 |
+
"async-compression",
|
2894 |
"base64 0.21.5",
|
2895 |
"bytes 1.5.0",
|
2896 |
"encoding_rs",
|
|
|
2916 |
"system-configuration",
|
2917 |
"tokio 1.33.0",
|
2918 |
"tokio-rustls",
|
2919 |
+
"tokio-util",
|
2920 |
"tower-service",
|
2921 |
"url 2.4.1",
|
2922 |
"wasm-bindgen",
|
|
|
2972 |
|
2973 |
[[package]]
|
2974 |
name = "rustix"
|
2975 |
+
version = "0.38.25"
|
2976 |
source = "registry+https://github.com/rust-lang/crates.io-index"
|
2977 |
+
checksum = "dc99bc2d4f1fed22595588a013687477aedf3cdcfb26558c559edb67b4d9b22e"
|
2978 |
dependencies = [
|
2979 |
"bitflags 2.4.1",
|
2980 |
"errno",
|
Cargo.toml
CHANGED
@@ -13,7 +13,7 @@ bench = false
|
|
13 |
path = "src/bin/websurfx.rs"
|
14 |
|
15 |
[dependencies]
|
16 |
-
reqwest = {version="0.11.22", default-features=false, features=["rustls-tls"]}
|
17 |
tokio = {version="1.32.0",features=["rt-multi-thread","macros"], default-features = false}
|
18 |
serde = {version="1.0.190", default-features=false, features=["derive"]}
|
19 |
serde_json = {version="1.0.108", default-features=false}
|
|
|
13 |
path = "src/bin/websurfx.rs"
|
14 |
|
15 |
[dependencies]
|
16 |
+
reqwest = {version="0.11.22", default-features=false, features=["rustls-tls","brotli", "gzip"]}
|
17 |
tokio = {version="1.32.0",features=["rt-multi-thread","macros"], default-features = false}
|
18 |
serde = {version="1.0.190", default-features=false, features=["derive"]}
|
19 |
serde_json = {version="1.0.108", default-features=false}
|
src/engines/brave.rs
CHANGED
@@ -4,7 +4,7 @@
|
|
4 |
|
5 |
use std::collections::HashMap;
|
6 |
|
7 |
-
use reqwest::header::HeaderMap;
|
8 |
use scraper::Html;
|
9 |
|
10 |
use crate::models::aggregation_models::SearchResult;
|
@@ -42,7 +42,7 @@ impl SearchEngine for Brave {
|
|
42 |
query: &str,
|
43 |
page: u32,
|
44 |
user_agent: &str,
|
45 |
-
|
46 |
safe_search: u8,
|
47 |
) -> Result<HashMap<String, SearchResult>, EngineError> {
|
48 |
let url = format!("https://search.brave.com/search?q={query}&offset={page}");
|
@@ -68,7 +68,7 @@ impl SearchEngine for Brave {
|
|
68 |
.change_context(EngineError::UnexpectedError)?;
|
69 |
|
70 |
let document: Html = Html::parse_document(
|
71 |
-
&Brave::fetch_html_from_upstream(self, &url, header_map,
|
72 |
);
|
73 |
|
74 |
if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(0) {
|
|
|
4 |
|
5 |
use std::collections::HashMap;
|
6 |
|
7 |
+
use reqwest::{header::HeaderMap, Client};
|
8 |
use scraper::Html;
|
9 |
|
10 |
use crate::models::aggregation_models::SearchResult;
|
|
|
42 |
query: &str,
|
43 |
page: u32,
|
44 |
user_agent: &str,
|
45 |
+
client: &Client,
|
46 |
safe_search: u8,
|
47 |
) -> Result<HashMap<String, SearchResult>, EngineError> {
|
48 |
let url = format!("https://search.brave.com/search?q={query}&offset={page}");
|
|
|
68 |
.change_context(EngineError::UnexpectedError)?;
|
69 |
|
70 |
let document: Html = Html::parse_document(
|
71 |
+
&Brave::fetch_html_from_upstream(self, &url, header_map, client).await?,
|
72 |
);
|
73 |
|
74 |
if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(0) {
|
src/engines/duckduckgo.rs
CHANGED
@@ -5,6 +5,7 @@
|
|
5 |
use std::collections::HashMap;
|
6 |
|
7 |
use reqwest::header::HeaderMap;
|
|
|
8 |
use scraper::Html;
|
9 |
|
10 |
use crate::models::aggregation_models::SearchResult;
|
@@ -44,7 +45,7 @@ impl SearchEngine for DuckDuckGo {
|
|
44 |
query: &str,
|
45 |
page: u32,
|
46 |
user_agent: &str,
|
47 |
-
|
48 |
_safe_search: u8,
|
49 |
) -> Result<HashMap<String, SearchResult>, EngineError> {
|
50 |
// Page number can be missing or empty string and so appropriate handling is required
|
@@ -76,7 +77,7 @@ impl SearchEngine for DuckDuckGo {
|
|
76 |
.change_context(EngineError::UnexpectedError)?;
|
77 |
|
78 |
let document: Html = Html::parse_document(
|
79 |
-
&DuckDuckGo::fetch_html_from_upstream(self, &url, header_map,
|
80 |
);
|
81 |
|
82 |
if self.parser.parse_for_no_results(&document).next().is_some() {
|
|
|
5 |
use std::collections::HashMap;
|
6 |
|
7 |
use reqwest::header::HeaderMap;
|
8 |
+
use reqwest::Client;
|
9 |
use scraper::Html;
|
10 |
|
11 |
use crate::models::aggregation_models::SearchResult;
|
|
|
45 |
query: &str,
|
46 |
page: u32,
|
47 |
user_agent: &str,
|
48 |
+
client: &Client,
|
49 |
_safe_search: u8,
|
50 |
) -> Result<HashMap<String, SearchResult>, EngineError> {
|
51 |
// Page number can be missing or empty string and so appropriate handling is required
|
|
|
77 |
.change_context(EngineError::UnexpectedError)?;
|
78 |
|
79 |
let document: Html = Html::parse_document(
|
80 |
+
&DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, client).await?,
|
81 |
);
|
82 |
|
83 |
if self.parser.parse_for_no_results(&document).next().is_some() {
|
src/engines/searx.rs
CHANGED
@@ -3,6 +3,7 @@
|
|
3 |
//! number if provided.
|
4 |
|
5 |
use reqwest::header::HeaderMap;
|
|
|
6 |
use scraper::Html;
|
7 |
use std::collections::HashMap;
|
8 |
|
@@ -40,7 +41,7 @@ impl SearchEngine for Searx {
|
|
40 |
query: &str,
|
41 |
page: u32,
|
42 |
user_agent: &str,
|
43 |
-
|
44 |
mut safe_search: u8,
|
45 |
) -> Result<HashMap<String, SearchResult>, EngineError> {
|
46 |
// Page number can be missing or empty string and so appropriate handling is required
|
@@ -68,7 +69,7 @@ impl SearchEngine for Searx {
|
|
68 |
.change_context(EngineError::UnexpectedError)?;
|
69 |
|
70 |
let document: Html = Html::parse_document(
|
71 |
-
&Searx::fetch_html_from_upstream(self, &url, header_map,
|
72 |
);
|
73 |
|
74 |
if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(1) {
|
|
|
3 |
//! number if provided.
|
4 |
|
5 |
use reqwest::header::HeaderMap;
|
6 |
+
use reqwest::Client;
|
7 |
use scraper::Html;
|
8 |
use std::collections::HashMap;
|
9 |
|
|
|
41 |
query: &str,
|
42 |
page: u32,
|
43 |
user_agent: &str,
|
44 |
+
client: &Client,
|
45 |
mut safe_search: u8,
|
46 |
) -> Result<HashMap<String, SearchResult>, EngineError> {
|
47 |
// Page number can be missing or empty string and so appropriate handling is required
|
|
|
69 |
.change_context(EngineError::UnexpectedError)?;
|
70 |
|
71 |
let document: Html = Html::parse_document(
|
72 |
+
&Searx::fetch_html_from_upstream(self, &url, header_map, client).await?,
|
73 |
);
|
74 |
|
75 |
if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(1) {
|
src/models/engine_models.rs
CHANGED
@@ -3,7 +3,8 @@
|
|
3 |
|
4 |
use super::aggregation_models::SearchResult;
|
5 |
use error_stack::{Report, Result, ResultExt};
|
6 |
-
use
|
|
|
7 |
|
8 |
/// A custom error type used for handling engine-associated errors.
|
9 |
#[derive(Debug)]
|
@@ -71,12 +72,11 @@ pub trait SearchEngine: Sync + Send {
|
|
71 |
&self,
|
72 |
url: &str,
|
73 |
header_map: reqwest::header::HeaderMap,
|
74 |
-
|
75 |
) -> Result<String, EngineError> {
|
76 |
// fetch the html from upstream search engine
|
77 |
-
Ok(
|
78 |
.get(url)
|
79 |
-
.timeout(Duration::from_secs(request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
|
80 |
.headers(header_map) // add spoofed headers to emulate human behavior
|
81 |
.send()
|
82 |
.await
|
@@ -109,7 +109,7 @@ pub trait SearchEngine: Sync + Send {
|
|
109 |
query: &str,
|
110 |
page: u32,
|
111 |
user_agent: &str,
|
112 |
-
|
113 |
safe_search: u8,
|
114 |
) -> Result<HashMap<String, SearchResult>, EngineError>;
|
115 |
}
|
|
|
3 |
|
4 |
use super::aggregation_models::SearchResult;
|
5 |
use error_stack::{Report, Result, ResultExt};
|
6 |
+
use reqwest::Client;
|
7 |
+
use std::{collections::HashMap, fmt};
|
8 |
|
9 |
/// A custom error type used for handling engine-associated errors.
|
10 |
#[derive(Debug)]
|
|
|
72 |
&self,
|
73 |
url: &str,
|
74 |
header_map: reqwest::header::HeaderMap,
|
75 |
+
client: &Client,
|
76 |
) -> Result<String, EngineError> {
|
77 |
// fetch the html from upstream search engine
|
78 |
+
Ok(client
|
79 |
.get(url)
|
|
|
80 |
.headers(header_map) // add spoofed headers to emulate human behavior
|
81 |
.send()
|
82 |
.await
|
|
|
109 |
query: &str,
|
110 |
page: u32,
|
111 |
user_agent: &str,
|
112 |
+
client: &Client,
|
113 |
safe_search: u8,
|
114 |
) -> Result<HashMap<String, SearchResult>, EngineError>;
|
115 |
}
|
src/results/aggregator.rs
CHANGED
@@ -9,6 +9,7 @@ use crate::models::{
|
|
9 |
};
|
10 |
use error_stack::Report;
|
11 |
use regex::Regex;
|
|
|
12 |
use std::time::{SystemTime, UNIX_EPOCH};
|
13 |
use std::{
|
14 |
collections::HashMap,
|
@@ -18,6 +19,9 @@ use std::{
|
|
18 |
use std::{fs::File, io::BufRead};
|
19 |
use tokio::task::JoinHandle;
|
20 |
|
|
|
|
|
|
|
21 |
/// Aliases for long type annotations
|
22 |
type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
|
23 |
|
@@ -68,6 +72,16 @@ pub async fn aggregate(
|
|
68 |
request_timeout: u8,
|
69 |
safe_search: u8,
|
70 |
) -> Result<SearchResults, Box<dyn std::error::Error>> {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
let user_agent: &str = random_user_agent();
|
72 |
|
73 |
// Add a random delay before making the request.
|
@@ -88,7 +102,7 @@ pub async fn aggregate(
|
|
88 |
let query: String = query.to_owned();
|
89 |
tasks.push(tokio::spawn(async move {
|
90 |
search_engine
|
91 |
-
.results(&query, page, user_agent,
|
92 |
.await
|
93 |
}));
|
94 |
}
|
|
|
9 |
};
|
10 |
use error_stack::Report;
|
11 |
use regex::Regex;
|
12 |
+
use reqwest::{Client, ClientBuilder};
|
13 |
use std::time::{SystemTime, UNIX_EPOCH};
|
14 |
use std::{
|
15 |
collections::HashMap,
|
|
|
19 |
use std::{fs::File, io::BufRead};
|
20 |
use tokio::task::JoinHandle;
|
21 |
|
22 |
+
/// A static holding the prebuilt `Client`, initialized once and shared globally in the app.
|
23 |
+
static CLIENT: std::sync::OnceLock<Client> = std::sync::OnceLock::new();
|
24 |
+
|
25 |
/// Aliases for long type annotations
|
26 |
type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
|
27 |
|
|
|
72 |
request_timeout: u8,
|
73 |
safe_search: u8,
|
74 |
) -> Result<SearchResults, Box<dyn std::error::Error>> {
|
75 |
+
let client = CLIENT.get_or_init(|| {
|
76 |
+
ClientBuilder::new()
|
77 |
+
.timeout(Duration::from_secs(request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
|
78 |
+
.https_only(true)
|
79 |
+
.gzip(true)
|
80 |
+
.brotli(true)
|
81 |
+
.build()
|
82 |
+
.unwrap()
|
83 |
+
});
|
84 |
+
|
85 |
let user_agent: &str = random_user_agent();
|
86 |
|
87 |
// Add a random delay before making the request.
|
|
|
102 |
let query: String = query.to_owned();
|
103 |
tasks.push(tokio::spawn(async move {
|
104 |
search_engine
|
105 |
+
.results(&query, page, user_agent, client, safe_search)
|
106 |
.await
|
107 |
}));
|
108 |
}
|