From 1589fdb6b1089d5ac44b46f5854f8b69f834fcc5 Mon Sep 17 00:00:00 2001
From: Alex Wellnitz
Date: Wed, 25 Oct 2023 22:20:43 +0200
Subject: [PATCH] feat: Add hash term function

---
 Cargo.lock           |  7 +++++
 Cargo.toml           |  3 +-
 src/search.rs        | 68 +++++++++++++++++++++++++++++++++++++-------
 tests/rustysearch.rs | 18 ++++++++++++
 4 files changed, 84 insertions(+), 12 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 734302b..27bb045 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -17,6 +17,12 @@ version = "1.0.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38"
 
+[[package]]
+name = "md5"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
+
 [[package]]
 name = "memchr"
 version = "2.6.4"
@@ -74,6 +80,7 @@ checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
 name = "rustysearch"
 version = "0.1.0"
 dependencies = [
+ "md5",
  "regex",
  "serde",
  "serde_json",
diff --git a/Cargo.toml b/Cargo.toml
index 3022c40..0ae5d84 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,4 +12,5 @@ license = "MIT"
 serde = { version = "1.0.189", features = ["derive"] }
 serde_json = "1.0.107"
 unicode-segmentation = "^1"
-regex = "^1"
\ No newline at end of file
+regex = "^1"
+md5 = "0.7.0"
diff --git a/src/search.rs b/src/search.rs
index 94552c4..113259e 100644
--- a/src/search.rs
+++ b/src/search.rs
@@ -1,6 +1,6 @@
-use std::{fs, path::Path, collections::HashMap, cmp::min};
+use std::{cmp::min, collections::HashMap, fs, path::Path};
 
-use crate::{types::Stats, analyze::tokenizer::Tokenizer};
+use crate::{analyze::tokenizer::Tokenizer, types::Stats};
 
 pub struct Rustysearch {
     base_directory: String,
@@ -76,7 +76,7 @@ impl Rustysearch {
     /// 'version': '1.0.0',
     /// 'total_docs': 25,
     /// }
-    /// 
+    ///
     pub fn write_stats(&self, new_stats: Stats) -> std::io::Result<()> {
         // Write new_stats as json to stats_path
        let new_stats_json = serde_json::to_string(&new_stats).unwrap();
@@ -88,7 +88,7 @@
     ///
     /// This is important for scoring reasons & is typically called as part
     /// of the indexing process.
-    /// 
+    ///
     pub fn increment_total_docs(&self) {
         let mut current_stats = self.read_stats().unwrap();
         current_stats.total_docs += 1;
         self.write_stats(current_stats).unwrap();
     }
 
     /// Returns the total number of documents the index is aware of
-    /// 
+    ///
     pub fn get_total_docs(&self) -> i32 {
         let stats = self.read_stats().unwrap();
         return stats.total_docs;
     }
 
     /// Given a string (``blob``) of text, this will return a Vector of tokens.
-    /// 
+    ///
     pub fn make_tokens(&self, blob: &str) -> Vec<String> {
         let tokenizer = Tokenizer::new(blob, vec![], None);
         let tokens = tokenizer.split_into_words();
         return tokens;
     }
 
     /// **Converts an iterable of ``tokens`` into n-grams**
-    /// 
+    ///
     /// This assumes front grams (all grams made starting from the left side
     /// of the token).
     ///
     /// Optionally accepts a ``min_gram`` parameter, which takes an integer &
     /// controls the minimum gram length. Default is ``3``.
     ///
     /// Optionally accepts a ``max_gram`` parameter, which takes an integer &
     /// controls the maximum gram length. Default is ``6``.
     ///
-    pub fn make_ngrams(&self, tokens: Vec<String>, min_gram: usize, max_gram: usize) -> HashMap<String, Vec<usize>> {
+    pub fn make_ngrams(
+        &self,
+        tokens: Vec<String>,
+        min_gram: usize,
+        max_gram: usize,
+    ) -> HashMap<String, Vec<usize>> {
         let mut terms: HashMap<String, Vec<usize>> = HashMap::new();
-
+
         for (position, token) in tokens.iter().enumerate() {
             for window_length in min_gram..min(max_gram + 1, token.len() + 1) {
                 // Assuming "front" grams.
                 let gram = &token[..window_length];
-                terms.entry(gram.to_string()).or_insert(Vec::new()).push(position);
+                terms
+                    .entry(gram.to_string())
+                    .or_insert(Vec::new())
+                    .push(position);
             }
         }
-
+
         return terms;
     }
+
+    /// Given a ``term``, hashes it & returns a string of the first N letters
+    ///
+    /// Optionally accepts a ``length`` parameter, which takes an integer &
+    /// controls how much of the hash is returned. Default is ``6``.
+    ///
+    /// This is useful when writing files to the file system, as it helps
+    /// us keep from putting too many files in a given directory (~32K max
+    /// with the default).
+    ///
+    pub fn hash_name(&self, term: &str, length: usize) -> String {
+        // Make sure it's ASCII.
+        let term = term.to_ascii_lowercase();
+
+        // We hash & slice the term to get a small-ish number of fields
+        // and good distribution between them.
+        let hash = md5::compute(&term);
+        let hashed = format!("{:x}", hash);
+
+        // Cut string after length characters
+        let hashed = &hashed[..length];
+
+        return hashed.to_string();
+    }
+
+    /// Given a ``term``, creates a segment filename based on the hash of the term.
+    ///
+    /// Returns the full path to the segment.
+    ///
+    pub fn make_segment_name(&self, term: &str) -> String {
+        let term = &self.hash_name(term, 6);
+
+        let index_file_name = format!("{}.index", term);
+        let segment_path = Path::new(&self.index_path).join(index_file_name);
+        let segment_path = segment_path.to_str().unwrap().to_string();
+
+        return segment_path;
+    }
 }
diff --git a/tests/rustysearch.rs b/tests/rustysearch.rs
index bd5b13f..f323503 100644
--- a/tests/rustysearch.rs
+++ b/tests/rustysearch.rs
@@ -78,6 +78,24 @@ mod tests {
         assert_eq!(terms["hel"].len(), 1);
     }
 
+    #[test]
+    fn test_hash_name(){
+        let search = Rustysearch::new("/tmp/rustysearch_hashname");
+        search.setup();
+
+        let hash = search.hash_name("hello", 6);
+        assert_eq!(hash, "5d4140");
+    }
+
+    #[test]
+    fn test_make_segment_name(){
+        let search = Rustysearch::new("/tmp/rustysearch_makesegmentname");
+        search.setup();
+
+        let segment_name = search.make_segment_name("hello");
+        assert_eq!(segment_name, "/tmp/rustysearch_makesegmentname/index/5d4140.index");
+    }
+
     // Helper function to clean up the stats file
     fn clean_stats(tmp_path: &str){
         let search = Rustysearch::new(tmp_path);
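
Usage note (reviewer sketch, not part of the patch): the snippet below shows how
the new helpers compose with the existing tokenizer API during indexing. The
module path ``rustysearch::search::Rustysearch``, the scratch directory, and the
sample input are illustrative assumptions; every method called appears in this
diff or its tests.

    use rustysearch::search::Rustysearch;

    fn main() {
        // Hypothetical scratch directory for a throwaway demo index.
        let search = Rustysearch::new("/tmp/rustysearch_demo");
        search.setup();

        // Tokenize a document, then expand each token into front n-grams
        // (min_gram = 3, max_gram = 6, the documented defaults).
        let tokens = search.make_tokens("hello world");
        let terms = search.make_ngrams(tokens, 3, 6);

        // Each term maps to a segment file named after the first six hex
        // characters of its MD5 digest, via make_segment_name() above.
        for term in terms.keys() {
            println!("{} -> {}", term, search.make_segment_name(term));
        }
    }

The MD5 here is a bucketing device rather than a security measure: truncating
the hex digest just spreads terms evenly across segment files so that no single
directory fills up with index files.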