Spaces:
Running
Running
File size: 11,513 Bytes
fc69ace 141ae26 c170de8 bb50e8b fc69ace 2a4dd07 5aca5c0 15fc415 049b1c1 15fc415 049b1c1 15fc415 049b1c1 15fc415 049b1c1 b1bcf41 bb50e8b 15fc415 f94ac50 fc69ace 15dfda6 2a4dd07 fc69ace 2885f23 f94ac50 2885f23 bb50e8b 2885f23 f94ac50 bb50e8b fc69ace 2885f23 f94ac50 28fee6b 4a505fb 2885f23 28fee6b f94ac50 049b1c1 2885f23 3aeb3b3 049b1c1 3aeb3b3 049b1c1 3aeb3b3 049b1c1 c4935f2 3aeb3b3 049b1c1 2a4dd07 049b1c1 2885f23 3aeb3b3 f56002d 2885f23 3aeb3b3 2885f23 c4935f2 f56002d 2885f23 c4935f2 3aeb3b3 cecffe4 c170de8 fc69ace 519ebe0 15fc415 049b1c1 b1bcf41 049b1c1 b1bcf41 1a22221 fb231de 410257c 1a22221 fb231de 410257c b428ced 15fc415 f94ac50 fc69ace 2a4dd07 fc69ace 2a4dd07 1a22221 2a4dd07 b1bcf41 2885f23 f94ac50 b1bcf41 410257c b428ced f94ac50 137c62e 410257c 991f3f5 410257c b1bcf41 410257c b1bcf41 410257c 137c62e b428ced f94ac50 bb50e8b b1bcf41 bb50e8b b1bcf41 bb50e8b b1bcf41 bb50e8b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 |
//! This module provides public models for handling, storing and serializing the search result
//! data scraped from the upstream search engines.
use super::engine_models::EngineError;
use serde::{Deserialize, Serialize};
#[cfg(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
))]
use thesaurus::synonyms;
/// A named struct to store the raw search results scraped from the upstream search
/// engines before aggregating them. It derives the `Clone` trait, which is needed
/// to write idiomatic rust using `Iterators`.
///
/// The struct is (de)serialized with serde using camelCase field names.
#[derive(Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
/// The title of the search result.
pub title: String,
/// The url which is accessed when clicked on it
/// (href url in html in simple words).
pub url: String,
/// The description of the search result.
pub description: String,
/// The names of the upstream engines from which this result was provided.
pub engine: Vec<String>,
/// The tf-idf score of the result with regard to the title, url and description and the
/// user's query.
pub relevance_score: f32,
}
impl SearchResult {
    /// Constructs a new `SearchResult` with the given arguments needed for the struct.
    ///
    /// The relevance score is initialised to `0.0`; use `calculate_relevance` to compute it.
    ///
    /// # Arguments
    ///
    /// * `title` - The title of the search result.
    /// * `url` - The url which is accessed when clicked on it
    ///   (href url in html in simple words).
    /// * `description` - The description of the search result.
    /// * `engine` - The names of the upstream engines from which this result was provided.
    pub fn new(title: &str, url: &str, description: &str, engine: &[&str]) -> Self {
        SearchResult {
            title: title.to_owned(),
            url: url.to_owned(),
            description: description.to_owned(),
            relevance_score: 0.0,
            engine: engine.iter().map(|&name| name.to_owned()).collect(),
        }
    }

    /// Calculates and updates the relevance score of the current search result.
    ///
    /// The title, url and description are used as the documents the query is scored against.
    ///
    /// # Arguments
    ///
    /// * `query` - The query string used to obtain the results.
    pub fn calculate_relevance(&mut self, query: &str) {
        use stop_words::{get, LANGUAGE};

        // NOTE: English is hard-coded here. The language could become any of the ones
        // supported by the stop-words crate: https://docs.rs/crate/stop-words/0.8.0
        let stop_words = get(LANGUAGE::English);

        let documents = [
            self.title.clone(),
            self.url.clone(),
            self.description.clone(),
        ];

        // Punctuation symbols passed to the tf-idf helper alongside the stop words.
        let punctuation = [
            ".", ",", ":", ";", "!", "?", "(", ")", "[", "]", "{", "}", "\"", "'", "<", ">",
        ]
        .map(str::to_owned);

        self.relevance_score = calculate_tf_idf(query, &documents, &stop_words, &punctuation);
    }

    /// Appends the provided engine name to the list of engines of this result.
    ///
    /// # Arguments
    ///
    /// * `engine` - Takes an engine name provided as a string slice.
    pub fn add_engines(&mut self, engine: &str) {
        self.engine.push(engine.to_owned())
    }

    /// Moves the first engine name out of the struct and returns it.
    ///
    /// The first entry of the engine list is left as an empty string afterwards.
    ///
    /// # Returns
    ///
    /// The first engine name stored in the struct, or an empty string when the struct holds
    /// no engine names at all (instead of panicking on an out-of-bounds index).
    pub fn engine(&mut self) -> String {
        self.engine
            .first_mut()
            .map(std::mem::take)
            .unwrap_or_default()
    }
}
/// A named struct that stores the error info related to the upstream search engines.
#[derive(Serialize, Deserialize, Clone)]
pub struct EngineErrorInfo {
/// It stores the error type which occurred while fetching the result from a particular search
/// engine.
pub error: String,
/// It stores the name of the engine that failed to provide the requested search results.
pub engine: String,
/// It stores the name of the color that indicates how severe the particular error is (in
/// other words, it indicates the severity of the error/issue).
pub severity_color: String,
}
impl EngineErrorInfo {
    /// Constructs a new `EngineErrorInfo` describing the failure of an upstream search engine.
    ///
    /// # Arguments
    ///
    /// * `error` - It takes the error type which occurred while fetching the result from a
    ///   particular search engine.
    /// * `engine` - It takes the name of the engine that failed to provide the requested
    ///   search results.
    pub fn new(error: &EngineError, engine: &str) -> Self {
        // Resolve the error label and its severity color in one place so the two
        // always stay in sync for every error variant.
        let (label, color) = match error {
            EngineError::NoSuchEngineFound(_) => ("EngineNotFound", "red"),
            EngineError::RequestError => ("RequestError", "green"),
            EngineError::EmptyResultSet => ("EmptyResultSet", "blue"),
            EngineError::UnexpectedError => ("UnexpectedError", "red"),
        };

        Self {
            error: label.to_owned(),
            engine: engine.to_owned(),
            severity_color: color.to_owned(),
        }
    }
}
/// A named struct to store, serialize and deserialize all the search results scraped and
/// aggregated from the upstream search engines.
#[derive(Serialize, Deserialize, Default, Clone)]
#[serde(rename_all = "camelCase")]
pub struct SearchResults {
/// Stores the individual serializable `SearchResult` structs in a boxed slice of
/// `SearchResult` structs.
pub results: Box<[SearchResult]>,
/// Stores the information on which engines failed with their engine name
/// and the type of error that caused it.
pub engine_errors_info: Box<[EngineErrorInfo]>,
/// Stores the flag option which holds the check value that the following
/// search query was disallowed when the safe search level set to 4 and it
/// was present in the `Blocklist` file.
pub disallowed: bool,
/// Stores the flag option which holds the check value that the following
/// search query was filtered when the safe search level set to 3 and it
/// was present in the `Blocklist` file.
pub filtered: bool,
/// Stores the safe search level `safesearch` provided in the search url.
pub safe_search_level: u8,
/// Stores the flag option which holds the check value of whether any search engines were
/// selected or not.
pub no_engines_selected: bool,
}
impl SearchResults {
    /// Constructs a new `SearchResults` from the aggregated results and the engine errors.
    ///
    /// All flags start out as `false` and the safe search level starts at `0`.
    ///
    /// # Arguments
    ///
    /// * `results` - Takes the individual serializable `SearchResult` structs as a boxed
    ///   slice and stores them.
    /// * `engine_errors_info` - Takes an array of structs which contains information regarding
    ///   which engines failed with their names, reason and their severity color name.
    pub fn new(results: Box<[SearchResult]>, engine_errors_info: Box<[EngineErrorInfo]>) -> Self {
        Self {
            results,
            engine_errors_info,
            disallowed: false,
            filtered: false,
            safe_search_level: 0,
            no_engines_selected: false,
        }
    }

    /// A setter function that sets `disallowed` to true.
    pub fn set_disallowed(&mut self) {
        self.disallowed = true;
    }

    /// A setter function that sets `filtered` to the provided value.
    pub fn set_filtered(&mut self, filtered: bool) {
        self.filtered = filtered;
    }

    /// Moves the value of `engine_errors_info` out of the struct and returns it,
    /// leaving an empty slice stored in its place.
    pub fn engine_errors_info(&mut self) -> Box<[EngineErrorInfo]> {
        std::mem::take(&mut self.engine_errors_info)
    }

    /// A getter function that returns a clone of `results`; the stored results are kept.
    pub fn results(&mut self) -> Box<[SearchResult]> {
        self.results.clone()
    }

    /// A setter function to set the current page safe search level.
    pub fn set_safe_search_level(&mut self, safe_search_level: u8) {
        self.safe_search_level = safe_search_level;
    }

    /// A getter function that gets the value of `no_engines_selected`.
    pub fn no_engines_selected(&self) -> bool {
        self.no_engines_selected
    }

    /// A setter function to set the `no_engines_selected` to true.
    pub fn set_no_engines_selected(&mut self) {
        self.no_engines_selected = true;
    }
}
/// Helper function to calculate the tf-idf for the search query.
/// <br> The approach is described [`here`](https://en.wikipedia.org/wiki/Tf%E2%80%93idf).
/// <br> Find a sample article about TF-IDF [`here`](https://medium.com/analytics-vidhya/tf-idf-term-frequency-technique-easiest-explanation-for-text-classification-in-nlp-with-code-8ca3912e58c3)
/// ### Arguments
/// * `query` - a user's search query
/// * `documents` - a list of text used for comparison (url, title, description)
/// * `stop_words` - A list of language specific stop words.
/// * `punctuation` - list of punctuation symbols.
/// ### Returns
/// * `score` - The average tf-idf score of the word tokens (and synonyms) in the query.
///   `0.0` is returned when the average is not a number, e.g. when the query yields no
///   tokens and the division becomes `0.0 / 0.0`.
fn calculate_tf_idf(
    query: &str,
    documents: &[String],
    stop_words: &[String],
    punctuation: &[String],
) -> f32 {
    use keyword_extraction::{
        tf_idf::{TfIdf, TfIdfParams},
        tokenizer::Tokenizer,
    };

    let params = TfIdfParams::UnprocessedDocuments(documents, stop_words, Some(punctuation));
    let tf_idf = TfIdf::new(params);
    let tokener = Tokenizer::new(query, stop_words, Some(punctuation));
    let query_tokens = tokener.split_into_words();

    #[cfg(any(
        feature = "use-synonyms-search",
        feature = "use-non-static-synonyms-search"
    ))]
    let mut extra_tokens = vec![];

    // Sum of the scores of the query tokens; synonyms are collected on the way when enabled.
    let total_score: f32 = query_tokens
        .iter()
        .map(|token| {
            #[cfg(any(
                feature = "use-synonyms-search",
                feature = "use-non-static-synonyms-search"
            ))]
            {
                // find some synonyms and add them to the search (from wordnet or moby if feature is enabled)
                extra_tokens.extend(synonyms(token))
            }
            tf_idf.get_score(token)
        })
        .sum();

    #[cfg(not(any(
        feature = "use-synonyms-search",
        feature = "use-non-static-synonyms-search"
    )))]
    let result = total_score / (query_tokens.len() as f32);

    #[cfg(any(
        feature = "use-synonyms-search",
        feature = "use-non-static-synonyms-search"
    ))]
    let extra_total_score: f32 = extra_tokens
        .iter()
        .map(|token| tf_idf.get_score(token))
        .sum();

    #[cfg(any(
        feature = "use-synonyms-search",
        feature = "use-non-static-synonyms-search"
    ))]
    let result =
        (extra_total_score + total_score) / ((query_tokens.len() + extra_tokens.len()) as f32);

    // Multiplying NaN by zero still yields NaN, so the NaN case has to be replaced
    // explicitly rather than masked with a `0.0` factor.
    if result.is_nan() {
        0.0
    } else {
        result
    }
}
|