diff --git a/src/search/engine.rs b/src/search/engine.rs index 9e88606..fdbcb85 100644 --- a/src/search/engine.rs +++ b/src/search/engine.rs @@ -13,6 +13,32 @@ pub fn normalize_string(input_string: &str) -> String { string_without_double_spaces.to_lowercase() } +/// SearchEngine represents a search engine that indexes and searches documents based on the BM25 ranking algorithm. +/// +/// The search engine maintains an index of words and their frequencies in each document, as well as the actual document content. +/// It provides methods to index documents, perform searches, and calculate relevance scores using the BM25 algorithm. +/// +/// # Examples +/// +/// ``` +/// use std::collections::HashMap; +/// use rustysearch::search::engine::SearchEngine; +/// +/// // Create a new search engine with k1 = 1.2 and b = 0.75 +/// let mut engine = SearchEngine::new(1.2, 0.75); +/// +/// // Index a document +/// engine.index("https://example.com/doc1", "This is the content of document 1"); +/// +/// // Perform a search +/// let results = engine.search("content"); +/// +/// // Print the search results +/// for (url, score) in results { +/// println!("{} - Relevance Score: {}", url, score); +/// } +/// ``` + #[derive(Default, Debug, Clone)] pub struct SearchEngine { index: HashMap>, @@ -22,6 +48,16 @@ pub struct SearchEngine { } impl SearchEngine { + /// Creates a new instance of SearchEngine with the given parameters. + /// + /// **Arguments** + /// + /// * `k1` - The k1 parameter of the BM25 algorithm. + /// * `b` - The b parameter of the BM25 algorithm. + /// + /// **Returns** + /// + /// A new instance of SearchEngine. pub fn new(k1: f64, b: f64) -> SearchEngine { SearchEngine { index: HashMap::new(), @@ -31,25 +67,58 @@ impl SearchEngine { } } + /// Returns a vector of all the document URLs stored in the search engine's document collection. + /// + /// **Returns** + /// + /// A vector of document URLs. 
pub fn posts(&self) -> Vec { self.documents.keys().cloned().collect() } + /// Returns the number of documents in the search engine's document collection. + /// + /// **Returns** + /// + /// The number of documents. pub fn number_of_documents(&self) -> usize { self.documents.len() } + /// Returns the average document length, measured in bytes of the stored content (not in words). + /// + /// **Returns** + /// + /// The average document length; NaN when no documents have been indexed. pub fn avdl(&self) -> f64 { let total_length: usize = self.documents.values().map(|d| d.len()).sum(); total_length as f64 / self.documents.len() as f64 } + /// Calculates the inverse document frequency (IDF) score for a given keyword. + /// + /// **Arguments** + /// + /// * `kw` - The keyword for which to calculate the IDF score. + /// + /// **Returns** + /// + /// The IDF score. pub fn idf(&self, kw: &str) -> f64 { let n = self.number_of_documents() as f64; let n_kw = self.get_urls(kw).len() as f64; ((n - n_kw + 0.5) / (n_kw + 0.5) + 1.0).ln() } + /// Calculates the BM25 relevance scores for a given keyword. + /// + /// **Arguments** + /// + /// * `kw` - The keyword for which to calculate the relevance scores. + /// + /// **Returns** + /// + /// A HashMap containing the document URLs as keys and their relevance scores as values. pub fn bm25(&self, kw: &str) -> HashMap { let mut result = HashMap::new(); let idf_score = self.idf(kw); @@ -62,6 +131,15 @@ impl SearchEngine { result } + /// Performs a search for the given query and returns the relevance scores for the matching documents. + /// + /// **Arguments** + /// + /// * `query` - The search query. + /// + /// **Returns** + /// + /// A HashMap containing the document URLs as keys and their relevance scores as values. pub fn search(&mut self, query: &str) -> HashMap { let keywords = normalize_string(query).split_whitespace().map(|s| s.to_string()).collect::>(); let mut url_scores: HashMap = HashMap::new(); @@ -72,6 +150,12 @@ impl SearchEngine { url_scores } + /// Indexes a document with the given URL and content. 
+ /// + /// **Arguments** + /// + /// * `url` - The URL of the document. + /// * `content` - The content of the document. pub fn index(&mut self, url: &str, content: &str) { self.documents.insert(url.to_string(), content.to_string()); let words = normalize_string(content).split_whitespace().map(|s| s.to_string()).collect::>(); @@ -80,17 +164,32 @@ impl SearchEngine { } } + /// Bulk indexes multiple documents. + /// + /// **Arguments** + /// + /// * `documents` - A vector of tuples containing the URL and content of each document. pub fn bulk_index(&mut self, documents: Vec<(&str, &str)>) { for (url, content) in documents { self.index(url, content); } } + /// Returns the URLs and frequencies of a given keyword in the search engine's index. + /// + /// **Arguments** + /// + /// * `keyword` - The keyword to search for; it is normalized before the lookup. + /// + /// **Returns** + /// + /// A HashMap containing the document URLs as keys and their frequencies as values; empty if the keyword is not indexed. pub fn get_urls(&self, keyword: &str) -> HashMap { let keyword = normalize_string(keyword); self.index.get(&keyword).cloned().unwrap_or(HashMap::new()) } + /// Logs the current state of the search engine's index and document collection at debug level (via `log::debug!`) for debugging purposes. pub fn debug_index(&self) { log::debug!("Index: {:?}", self.index); log::debug!("Documents: {:?}", self.documents);