-
Notifications
You must be signed in to change notification settings - Fork 98
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #419 from neon-mmd/FEAT/314_startpage-for-the-search-engine
✨ `Startpage` for the search engine
- Loading branch information
Showing
6 changed files
with
116 additions
and
14 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,3 +7,4 @@ pub mod brave; | |
pub mod duckduckgo; | ||
pub mod search_result_parser; | ||
pub mod searx; | ||
pub mod startpage; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
//! The `startpage` module handles the scraping of results from the startpage search engine | ||
//! by querying the upstream startpage search engine with user provided query and with a page | ||
//! number if provided. | ||
use std::collections::HashMap; | ||
|
||
use reqwest::header::HeaderMap; | ||
use reqwest::Client; | ||
use scraper::Html; | ||
|
||
use crate::models::aggregation_models::SearchResult; | ||
|
||
use crate::models::engine_models::{EngineError, SearchEngine}; | ||
|
||
use error_stack::{Report, Result, ResultExt}; | ||
|
||
use super::search_result_parser::SearchResultParser; | ||
|
||
/// A new Startpage engine type defined in-order to implement the `SearchEngine` trait which allows to | ||
/// reduce code duplication as well as allows to create vector of different search engines easily. | ||
pub struct Startpage { | ||
/// The parser, used to interpret the search result. | ||
parser: SearchResultParser, | ||
} | ||
|
||
impl Startpage { | ||
/// Creates the Startpage parser. | ||
pub fn new() -> Result<Self, EngineError> { | ||
Ok(Self { | ||
parser: SearchResultParser::new( | ||
".no-results", | ||
".w-gl__result__main", | ||
".w-gl__result-second-line-container>.w-gl__result-title>h3", | ||
".w-gl__result-url", | ||
".w-gl__description", | ||
)?, | ||
}) | ||
} | ||
} | ||
|
||
#[async_trait::async_trait] | ||
impl SearchEngine for Startpage { | ||
async fn results( | ||
&self, | ||
query: &str, | ||
page: u32, | ||
user_agent: &str, | ||
client: &Client, | ||
_safe_search: u8, | ||
) -> Result<HashMap<String, SearchResult>, EngineError> { | ||
// Page number can be missing or empty string and so appropriate handling is required | ||
// so that upstream server recieves valid page number. | ||
let url: String = match page { | ||
1 | 0 => { | ||
format!("https://startpage.com/do/dsearch?q={query}&num=10&start=0") | ||
} | ||
_ => { | ||
format!( | ||
"https://startpage.com/do/dsearch?q={query}&num=10&start={}", | ||
page * 10, | ||
) | ||
} | ||
}; | ||
|
||
// initializing HeaderMap and adding appropriate headers. | ||
let header_map = HeaderMap::try_from(&HashMap::from([ | ||
("USER_AGENT".to_string(), user_agent.to_string()), | ||
("REFERER".to_string(), "https://google.com/".to_string()), | ||
( | ||
"CONTENT_TYPE".to_string(), | ||
"application/x-www-form-urlencoded".to_string(), | ||
), | ||
("COOKIE".to_string(), "preferences=connect_to_serverEEE0N1Ndate_timeEEEworldN1Ndisable_family_filterEEE0N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE1N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fnight%2FenN1NlanguageEEEenglishN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE10N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius".to_string()), | ||
])) | ||
.change_context(EngineError::UnexpectedError)?; | ||
|
||
let document: Html = Html::parse_document( | ||
&Startpage::fetch_html_from_upstream(self, &url, header_map, client).await?, | ||
); | ||
|
||
if self.parser.parse_for_no_results(&document).next().is_some() { | ||
return Err(Report::new(EngineError::EmptyResultSet)); | ||
} | ||
|
||
// scrape all the results from the html | ||
self.parser | ||
.parse_for_results(&document, |title, url, desc| { | ||
Some(SearchResult::new( | ||
title.inner_html().trim(), | ||
&format!("{}", url.inner_html().trim()), | ||
desc.inner_html().trim(), | ||
&["startpage"], | ||
)) | ||
}) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters