feat: write index to disk

parent 1966acf92d
commit 95299d10b7
Cargo.lock (generated): 10 changed lines
Cargo.lock

@@ -313,6 +313,15 @@ version = "0.22.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
 
+[[package]]
+name = "bincode"
+version = "1.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "bitflags"
 version = "2.6.0"
@@ -1169,6 +1178,7 @@ name = "rustysearch"
 version = "0.1.0"
 dependencies = [
  "actix-web",
+ "bincode",
  "clap",
  "env_logger",
  "log",
Cargo.toml

@@ -10,6 +10,7 @@ license = "MIT"
 
 [dependencies]
 actix-web = "4"
+bincode = "1.3.3"
 clap = { version = "4.5.1", features = ["derive"] }
 env_logger = "0.10.0"
 log = "0.4.19"
@@ -1,15 +1,33 @@
-use std::collections::HashMap;
+use bincode::{deserialize_from, serialize_into};
+use serde::{Deserialize, Serialize};
+use std::collections::{BTreeMap, HashMap};
 use std::f64;
+use std::fs::File;
+use std::io::{BufReader, BufWriter};
+
+#[derive(Serialize, Deserialize)]
+struct SavedIndex {
+    index_btree_map: BTreeMap<String, HashMap<String, i32>>,
+    documents_btree_map: BTreeMap<String, String>,
+}
 
 fn update_url_scores(old: &mut HashMap<String, f64>, new: &HashMap<String, f64>) {
     for (url, score) in new {
-        old.entry(url.to_string()).and_modify(|e| *e += score).or_insert(*score);
+        old.entry(url.to_string())
+            .and_modify(|e| *e += score)
+            .or_insert(*score);
     }
 }
 
 fn normalize_string(input_string: &str) -> String {
-    let string_without_punc: String = input_string.chars().filter(|&c| !c.is_ascii_punctuation()).collect();
-    let string_without_double_spaces: String = string_without_punc.split_whitespace().collect::<Vec<&str>>().join(" ");
+    let string_without_punc: String = input_string
+        .chars()
+        .filter(|&c| !c.is_ascii_punctuation())
+        .collect();
+    let string_without_double_spaces: String = string_without_punc
+        .split_whitespace()
+        .collect::<Vec<&str>>()
+        .join(" ");
     string_without_double_spaces.to_lowercase()
 }
 
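For context, bincode 1.3.3 (added to the dependencies above) round-trips any type deriving Serialize/Deserialize through the serialize_into / deserialize_from pair used later in this diff. A minimal sketch of the SavedIndex round-trip, using an in-memory buffer rather than the /tmp/search.db file this commit writes:

    use bincode::{deserialize_from, serialize_into};
    use serde::{Deserialize, Serialize};
    use std::collections::{BTreeMap, HashMap};

    #[derive(Serialize, Deserialize)]
    struct SavedIndex {
        index_btree_map: BTreeMap<String, HashMap<String, i32>>,
        documents_btree_map: BTreeMap<String, String>,
    }

    fn main() {
        let mut index = BTreeMap::new();
        index.insert(
            "rusty".to_string(),
            HashMap::from([("https://example.com".to_string(), 2)]),
        );
        let data = SavedIndex {
            index_btree_map: index,
            documents_btree_map: BTreeMap::new(),
        };

        // Any io::Write works as the sink; the commit uses BufWriter<File>.
        let mut buf: Vec<u8> = Vec::new();
        serialize_into(&mut buf, &data).unwrap();

        // Any io::Read works as the source; the commit uses BufReader<File>.
        let restored: SavedIndex = deserialize_from(&buf[..]).unwrap();
        assert_eq!(restored.index_btree_map["rusty"]["https://example.com"], 2);
    }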
@@ -40,8 +58,8 @@ fn normalize_string(input_string: &str) -> String {
 /// ```
 #[derive(Default, Debug, Clone)]
 pub struct SearchEngine {
-    index: HashMap<String, HashMap<String, i32>>,
-    documents: HashMap<String, String>,
+    index: BTreeMap<String, HashMap<String, i32>>,
+    documents: BTreeMap<String, String>,
     k1: f64,
     b: f64,
 }
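A likely motivation for moving index and documents from HashMap to BTreeMap (my reading; the commit message doesn't say): HashMap iterates in an unspecified, per-process randomized order, so serializing it yields different bytes run to run, while BTreeMap always iterates in key order, keeping the on-disk index stable. A small illustration:

    use std::collections::{BTreeMap, HashMap};

    fn main() {
        let pairs = [("b", 2), ("a", 1), ("c", 3)];

        // HashMap: iteration order is unspecified and randomized per process.
        let hashed: HashMap<&str, i32> = pairs.into_iter().collect();

        // BTreeMap: iteration is always in sorted key order, so serializing
        // identical contents yields byte-identical output.
        let ordered: BTreeMap<&str, i32> = pairs.into_iter().collect();

        println!("HashMap keys:  {:?}", hashed.keys().collect::<Vec<_>>());
        println!("BTreeMap keys: {:?}", ordered.keys().collect::<Vec<_>>()); // ["a", "b", "c"]
    }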
@@ -58,9 +76,19 @@ impl SearchEngine {
     ///
     /// A new instance of SearchEngine.
     pub fn new(k1: f64, b: f64) -> SearchEngine {
+        // try to get Index from disk
+        let mut index_btreemap = BTreeMap::new();
+        let mut documents_btreemap = BTreeMap::new();
+        let saved_index = get_index_from_disk();
+
+        if !saved_index.index_btree_map.is_empty() && !saved_index.documents_btree_map.is_empty() {
+            index_btreemap = saved_index.index_btree_map;
+            documents_btreemap = saved_index.documents_btree_map;
+        }
+
         SearchEngine {
-            index: HashMap::new(),
-            documents: HashMap::new(),
+            index: index_btreemap,
+            documents: documents_btreemap,
             k1,
             b,
         }
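With this change, a newly constructed SearchEngine silently picks up whatever was last written to /tmp/search.db. A hypothetical usage sketch (k1 = 1.5 and b = 0.75 are the usual BM25 defaults, not values taken from this repo):

    // First run: index() both updates the in-memory maps and persists
    // them to /tmp/search.db.
    let mut engine = SearchEngine::new(1.5, 0.75);
    engine.index("https://example.com", "Hello, rusty search!");

    // A later run: new() finds /tmp/search.db and starts pre-populated,
    // so the document is searchable without re-indexing.
    let mut restored = SearchEngine::new(1.5, 0.75);
    let scores = restored.search("rusty");
    assert!(scores.contains_key("https://example.com"));

Note the guard in new(): the saved maps are adopted only when both are non-empty, so a file written before any document was indexed is ignored.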
@@ -124,7 +152,10 @@ impl SearchEngine {
         let avdl = self.avdl();
         for (url, freq) in self.get_urls(kw) {
             let numerator = freq as f64 * (self.k1 + 1.0);
-            let denominator = freq as f64 + self.k1 * (1.0 - self.b + self.b * self.documents.get(&url).unwrap().len() as f64 / avdl);
+            let denominator = freq as f64
+                + self.k1
+                    * (1.0 - self.b
+                        + self.b * self.documents.get(&url).unwrap().len() as f64 / avdl);
             result.insert(url.to_string(), idf_score * numerator / denominator);
         }
         result
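As a cross-check on the reflowed arithmetic: this is the standard BM25 per-keyword score, where f(q_i, d) is freq, |d| is approximated by the stored content's byte length (String::len), and avgdl comes from self.avdl():

    \mathrm{score}(q_i, d) = \mathrm{IDF}(q_i) \cdot \frac{f(q_i, d)\,(k_1 + 1)}{f(q_i, d) + k_1 \left(1 - b + b\,\frac{|d|}{\mathrm{avgdl}}\right)}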
@@ -140,7 +171,10 @@ impl SearchEngine {
     ///
     /// A HashMap containing the document URLs as keys and their relevance scores as values.
     pub fn search(&mut self, query: &str) -> HashMap<String, f64> {
-        let keywords = normalize_string(query).split_whitespace().map(|s| s.to_string()).collect::<Vec<String>>();
+        let keywords = normalize_string(query)
+            .split_whitespace()
+            .map(|s| s.to_string())
+            .collect::<Vec<String>>();
         let mut url_scores: HashMap<String, f64> = HashMap::new();
         for kw in keywords {
             let kw_urls_score = self.bm25(&kw);
@@ -157,10 +191,21 @@ impl SearchEngine {
     /// * `content` - The content of the document.
     pub fn index(&mut self, url: &str, content: &str) {
         self.documents.insert(url.to_string(), content.to_string());
-        let words = normalize_string(content).split_whitespace().map(|s| s.to_string()).collect::<Vec<String>>();
+        let words = normalize_string(content)
+            .split_whitespace()
+            .map(|s| s.to_string())
+            .collect::<Vec<String>>();
         for word in words {
-            *self.index.entry(word).or_insert(HashMap::new()).entry(url.to_string()).or_insert(0) += 1;
+            *self
+                .index
+                .entry(word)
+                .or_default()
+                .entry(url.to_string())
+                .or_insert(0) += 1;
         }
+
+        // TODO: After updating the index
+        self.write_index_to_disk();
     }
 
     /// Bulk indexes multiple documents.
@@ -188,9 +233,38 @@ impl SearchEngine {
         self.index.get(&keyword).cloned().unwrap_or(HashMap::new())
     }
 
+    // Write the current index to disk as binary to the configured location
+    fn write_index_to_disk(&self) {
+        let index_hash_map = self.index.clone();
+        let documents_hash_map = self.documents.clone();
+
+        let btree_index: BTreeMap<_, _> = index_hash_map.into_iter().collect();
+        let btree_documents: BTreeMap<_, _> = documents_hash_map.into_iter().collect();
+
+        let data = SavedIndex {
+            index_btree_map: btree_index,
+            documents_btree_map: btree_documents,
+        };
+
+        let mut file = BufWriter::new(File::create("/tmp/search.db").unwrap());
+        serialize_into(&mut file, &data).unwrap();
+    }
+
     /// Prints the current state of the search engine's index and document collection for debugging purposes.
     pub fn debug_index(&self) {
         log::debug!("Index: {:?}", self.index);
         log::debug!("Documents: {:?}", self.documents);
     }
 }
+
+fn get_index_from_disk() -> SavedIndex {
+    let file = File::open("/tmp/search.db");
+    let mut data = SavedIndex { documents_btree_map: BTreeMap::new(), index_btree_map: BTreeMap::new() };
+
+    if file.is_ok() {
+        let reader = BufReader::new(file.unwrap());
+        data = deserialize_from(reader).unwrap();
+    }
+
+    data
+}
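As committed, both helpers unwrap: a corrupt or truncated /tmp/search.db makes get_index_from_disk panic on deserialize, and an unwritable /tmp makes write_index_to_disk panic. A possible hardening sketch (hypothetical code, not part of this commit) that treats any read failure like a missing file:

    fn get_index_from_disk() -> SavedIndex {
        let empty = SavedIndex {
            index_btree_map: BTreeMap::new(),
            documents_btree_map: BTreeMap::new(),
        };

        match File::open("/tmp/search.db") {
            // Fall back to an empty index on a corrupt or unreadable file
            // instead of panicking.
            Ok(file) => deserialize_from(BufReader::new(file)).unwrap_or(empty),
            Err(_) => empty,
        }
    }

Separately, note that write_index_to_disk clones and rewrites both full maps on every index() call (the TODO above hints at this), so persistence cost grows with total index size per document added.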