feat: write index to disk

Alex Wellnitz 2024-12-16 17:02:31 +01:00
parent 1966acf92d
commit 95299d10b7
3 changed files with 135 additions and 50 deletions

Cargo.lock (generated)

@@ -313,6 +313,15 @@ version = "0.22.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
 
+[[package]]
+name = "bincode"
+version = "1.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "bitflags"
 version = "2.6.0"

@@ -1169,6 +1178,7 @@ name = "rustysearch"
 version = "0.1.0"
 dependencies = [
  "actix-web",
+ "bincode",
  "clap",
  "env_logger",
  "log",

Cargo.toml

@@ -10,6 +10,7 @@ license = "MIT"
 [dependencies]
 actix-web = "4"
+bincode = "1.3.3"
 clap = { version = "4.5.1", features = ["derive"] }
 env_logger = "0.10.0"
 log = "0.4.19"

src/search/engine.rs

@@ -1,38 +1,56 @@
-use std::collections::HashMap;
+use bincode::{deserialize_from, serialize_into};
+use serde::{Deserialize, Serialize};
+use std::collections::{BTreeMap, HashMap};
 use std::f64;
+use std::fs::File;
+use std::io::{BufReader, BufWriter};
 
+#[derive(Serialize, Deserialize)]
+struct SavedIndex {
+    index_btree_map: BTreeMap<String, HashMap<String, i32>>,
+    documents_btree_map: BTreeMap<String, String>,
+}
+
 fn update_url_scores(old: &mut HashMap<String, f64>, new: &HashMap<String, f64>) {
     for (url, score) in new {
-        old.entry(url.to_string()).and_modify(|e| *e += score).or_insert(*score);
+        old.entry(url.to_string())
+            .and_modify(|e| *e += score)
+            .or_insert(*score);
     }
 }
 
 fn normalize_string(input_string: &str) -> String {
-    let string_without_punc: String = input_string.chars().filter(|&c| !c.is_ascii_punctuation()).collect();
-    let string_without_double_spaces: String = string_without_punc.split_whitespace().collect::<Vec<&str>>().join(" ");
+    let string_without_punc: String = input_string
+        .chars()
+        .filter(|&c| !c.is_ascii_punctuation())
+        .collect();
+    let string_without_double_spaces: String = string_without_punc
+        .split_whitespace()
+        .collect::<Vec<&str>>()
+        .join(" ");
     string_without_double_spaces.to_lowercase()
 }
 
 /// SearchEngine represents a search engine that indexes and searches documents based on the BM25 ranking algorithm.
 ///
 /// The search engine maintains an index of words and their frequencies in each document, as well as the actual document content.
 /// It provides methods to index documents, perform searches, and calculate relevance scores using the BM25 algorithm.
 ///
 /// # Examples
 ///
 /// ```
 /// use std::collections::HashMap;
 /// use rustysearch::search::engine::SearchEngine;
 ///
 /// // Create a new search engine with k1 = 1.2 and b = 0.75
 /// let mut engine = SearchEngine::new(1.2, 0.75);
 ///
 /// // Index a document
 /// engine.index("https://example.com/doc1", "This is the content of document 1");
 ///
 /// // Perform a search
 /// let results = engine.search("content");
 ///
 /// // Print the search results
 /// for (url, score) in results {
 ///     println!("{} - Relevance Score: {}", url, score);
@@ -40,54 +58,64 @@ fn normalize_string(input_string: &str) -> String {
 /// ```
 #[derive(Default, Debug, Clone)]
 pub struct SearchEngine {
-    index: HashMap<String, HashMap<String, i32>>,
-    documents: HashMap<String, String>,
+    index: BTreeMap<String, HashMap<String, i32>>,
+    documents: BTreeMap<String, String>,
     k1: f64,
     b: f64,
 }
 
 impl SearchEngine {
     /// Creates a new instance of SearchEngine with the given parameters.
     ///
     /// # Arguments
     ///
     /// * `k1` - The k1 parameter of the BM25 algorithm.
     /// * `b` - The b parameter of the BM25 algorithm.
     ///
     /// **Returns**
     ///
     /// A new instance of SearchEngine.
     pub fn new(k1: f64, b: f64) -> SearchEngine {
+        // try to get Index from disk
+        let mut index_btreemap = BTreeMap::new();
+        let mut documents_btreemap = BTreeMap::new();
+
+        let saved_index = get_index_from_disk();
+        if !saved_index.index_btree_map.is_empty() && !saved_index.documents_btree_map.is_empty() {
+            index_btreemap = saved_index.index_btree_map;
+            documents_btreemap = saved_index.documents_btree_map;
+        }
+
         SearchEngine {
-            index: HashMap::new(),
-            documents: HashMap::new(),
+            index: index_btreemap,
+            documents: documents_btreemap,
             k1,
             b,
         }
     }
 
     /// Returns a vector of all the document URLs in the search engine's index.
     ///
     /// **Returns**
     ///
     /// A vector of document URLs.
     pub fn posts(&self) -> Vec<String> {
         self.documents.keys().cloned().collect()
     }
 
     /// Returns the number of documents in the search engine's index.
     ///
     /// **Returns**
     ///
     /// The number of documents.
     pub fn number_of_documents(&self) -> usize {
         self.documents.len()
     }
 
     /// Returns the average document length in terms of number of words.
     ///
     /// **Returns**
     ///
     /// The average document length.
     pub fn avdl(&self) -> f64 {
         let total_length: usize = self.documents.values().map(|d| d.len()).sum();
@@ -95,13 +123,13 @@ impl SearchEngine {
     }
 
     /// Calculates the inverse document frequency (IDF) score for a given keyword.
     ///
     /// **Arguments**
     ///
     /// * `kw` - The keyword for which to calculate the IDF score.
     ///
     /// **Returns**
     ///
     /// The IDF score.
     pub fn idf(&self, kw: &str) -> f64 {
         let n = self.number_of_documents() as f64;
@@ -110,13 +138,13 @@ impl SearchEngine {
     }
 
     /// Calculates the BM25 relevance scores for a given keyword.
     ///
     /// **Arguments**
     ///
     /// * `kw` - The keyword for which to calculate the relevance scores.
     ///
     /// **Returns**
     ///
     /// A HashMap containing the document URLs as keys and their relevance scores as values.
     pub fn bm25(&self, kw: &str) -> HashMap<String, f64> {
         let mut result = HashMap::new();
@@ -124,23 +152,29 @@ impl SearchEngine {
         let avdl = self.avdl();
         for (url, freq) in self.get_urls(kw) {
             let numerator = freq as f64 * (self.k1 + 1.0);
-            let denominator = freq as f64 + self.k1 * (1.0 - self.b + self.b * self.documents.get(&url).unwrap().len() as f64 / avdl);
+            let denominator = freq as f64
+                + self.k1
+                    * (1.0 - self.b
+                        + self.b * self.documents.get(&url).unwrap().len() as f64 / avdl);
             result.insert(url.to_string(), idf_score * numerator / denominator);
         }
         result
     }
 
     /// Performs a search for the given query and returns the relevance scores for the matching documents.
     ///
     /// **Arguments**
     ///
     /// * `query` - The search query.
     ///
     /// **Returns**
     ///
     /// A HashMap containing the document URLs as keys and their relevance scores as values.
     pub fn search(&mut self, query: &str) -> HashMap<String, f64> {
-        let keywords = normalize_string(query).split_whitespace().map(|s| s.to_string()).collect::<Vec<String>>();
+        let keywords = normalize_string(query)
+            .split_whitespace()
+            .map(|s| s.to_string())
+            .collect::<Vec<String>>();
         let mut url_scores: HashMap<String, f64> = HashMap::new();
         for kw in keywords {
             let kw_urls_score = self.bm25(&kw);
@@ -150,23 +184,34 @@ impl SearchEngine {
     }
 
     /// Indexes a document with the given URL and content.
     ///
     /// **Arguments**
     ///
     /// * `url` - The URL of the document.
     /// * `content` - The content of the document.
     pub fn index(&mut self, url: &str, content: &str) {
         self.documents.insert(url.to_string(), content.to_string());
-        let words = normalize_string(content).split_whitespace().map(|s| s.to_string()).collect::<Vec<String>>();
+        let words = normalize_string(content)
+            .split_whitespace()
+            .map(|s| s.to_string())
+            .collect::<Vec<String>>();
         for word in words {
-            *self.index.entry(word).or_insert(HashMap::new()).entry(url.to_string()).or_insert(0) += 1;
+            *self
+                .index
+                .entry(word)
+                .or_default()
+                .entry(url.to_string())
+                .or_insert(0) += 1;
         }
+
+        // TODO: After updating the index
+        self.write_index_to_disk();
     }
 
     /// Bulk indexes multiple documents.
     ///
     /// **Arguments**
     ///
     /// * `documents` - A vector of tuples containing the URL and content of each document.
     pub fn bulk_index(&mut self, documents: Vec<(&str, &str)>) {
         for (url, content) in documents {
@@ -175,22 +220,51 @@ impl SearchEngine {
     }
 
     /// Returns the URLs and frequencies of a given keyword in the search engine's index.
     ///
     /// **Arguments**
     ///
     /// * `keyword` - The keyword to search for.
     ///
     /// **Returns**
    ///
     /// A HashMap containing the document URLs as keys and their frequencies as values.
     pub fn get_urls(&self, keyword: &str) -> HashMap<String, i32> {
         let keyword = normalize_string(keyword);
         self.index.get(&keyword).cloned().unwrap_or(HashMap::new())
     }
 
+    // Write the current index to disk as binary to the configured location
+    fn write_index_to_disk(&self) {
+        let index_hash_map = self.index.clone();
+        let documents_hash_map = self.documents.clone();
+
+        let btree_index: BTreeMap<_, _> = index_hash_map.into_iter().collect();
+        let btree_documents: BTreeMap<_, _> = documents_hash_map.into_iter().collect();
+
+        let data = SavedIndex {
+            index_btree_map: btree_index,
+            documents_btree_map: btree_documents,
+        };
+
+        let mut file = BufWriter::new(File::create("/tmp/search.db").unwrap());
+        serialize_into(&mut file, &data).unwrap();
+    }
+
     /// Prints the current state of the search engine's index and document collection for debugging purposes.
     pub fn debug_index(&self) {
         log::debug!("Index: {:?}", self.index);
         log::debug!("Documents: {:?}", self.documents);
     }
 }
+
+fn get_index_from_disk() -> SavedIndex {
+    let file = File::open("/tmp/search.db");
+    let mut data = SavedIndex{ documents_btree_map: BTreeMap::new(), index_btree_map: BTreeMap::new()};
+
+    if file.is_ok() {
+        let reader = BufReader::new(file.unwrap());
+        data = deserialize_from(reader).unwrap();
+    }
+
+    data
+}
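
For orientation, the per-keyword score assembled in bm25 above is the usual BM25 term (my reading of the code, not text from the commit), with f(kw, d) the keyword's frequency in the document, |d| the stored content length in characters (the code calls .len() on the document string), and avdl the average of those lengths:

\[ \mathrm{score}(d, kw) = \mathrm{idf}(kw) \cdot \frac{f(kw, d)\,(k_1 + 1)}{f(kw, d) + k_1\left(1 - b + b \cdot \frac{|d|}{\mathrm{avdl}}\right)} \]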
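
With this change applied, an index built in one run is picked up by the next: SearchEngine::index writes /tmp/search.db after every update, and SearchEngine::new reloads the file when it exists and holds a non-empty index. A rough usage sketch, following the doc-comment example above (not part of the commit):

use rustysearch::search::engine::SearchEngine;

fn main() {
    // First run: indexing also persists the index to /tmp/search.db.
    let mut engine = SearchEngine::new(1.2, 0.75);
    engine.index("https://example.com/doc1", "This is the content of document 1");

    // A later run (or a fresh instance): new() reloads the saved index,
    // so the document is searchable without re-indexing.
    let mut restored = SearchEngine::new(1.2, 0.75);
    for (url, score) in restored.search("content") {
        println!("{} - Relevance Score: {}", url, score);
    }
}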