From 1589fdb6b1089d5ac44b46f5854f8b69f834fcc5 Mon Sep 17 00:00:00 2001
From: Alex Wellnitz
Date: Wed, 25 Oct 2023 22:20:43 +0200
Subject: [PATCH] feat: Add hash term function

---
 Cargo.lock           |  7 +++++
 Cargo.toml           |  3 +-
 src/search.rs        | 68 +++++++++++++++++++++++++++++++++++++-------
 tests/rustysearch.rs | 18 ++++++++++++
 4 files changed, 84 insertions(+), 12 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 734302b..27bb045 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -17,6 +17,12 @@ version = "1.0.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38"
 
+[[package]]
+name = "md5"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
+
 [[package]]
 name = "memchr"
 version = "2.6.4"
@@ -74,6 +80,7 @@ checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
 name = "rustysearch"
 version = "0.1.0"
 dependencies = [
+ "md5",
  "regex",
  "serde",
  "serde_json",
diff --git a/Cargo.toml b/Cargo.toml
index 3022c40..0ae5d84 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,4 +12,5 @@ license = "MIT"
 serde = { version = "1.0.189", features = ["derive"] }
 serde_json = "1.0.107"
 unicode-segmentation = "^1"
-regex = "^1"
\ No newline at end of file
+regex = "^1"
+md5 = "0.7.0"
diff --git a/src/search.rs b/src/search.rs
index 94552c4..113259e 100644
--- a/src/search.rs
+++ b/src/search.rs
@@ -1,6 +1,6 @@
-use std::{fs, path::Path, collections::HashMap, cmp::min};
+use std::{cmp::min, collections::HashMap, fs, path::Path};
 
-use crate::{types::Stats, analyze::tokenizer::Tokenizer};
+use crate::{analyze::tokenizer::Tokenizer, types::Stats};
 
 pub struct Rustysearch {
     base_directory: String,
@@ -76,7 +76,7 @@ impl Rustysearch {
     /// 'version': '1.0.0',
     /// 'total_docs': 25,
     /// }
-    /// 
+    ///
     pub fn write_stats(&self, new_stats: Stats) -> std::io::Result<()> {
         // Write new_stats as json to stats_path
        let new_stats_json = serde_json::to_string(&new_stats).unwrap();
@@ -88,7 +88,7 @@
     ///
     /// This is important for scoring reasons & is typically called as part
     /// of the indexing process.
-    /// 
+    ///
     pub fn increment_total_docs(&self) {
         let mut current_stats = self.read_stats().unwrap();
         current_stats.total_docs += 1;
         self.write_stats(current_stats).unwrap();
     }
 
     /// Returns the total number of documents the index is aware of
-    /// 
+    ///
     pub fn get_total_docs(&self) -> i32 {
         let stats = self.read_stats().unwrap();
         return stats.total_docs;
     }
 
     /// Given a string (``blob``) of text, this will return a Vector of tokens.
-    /// 
+    ///
     pub fn make_tokens(&self, blob: &str) -> Vec<String> {
         let tokenizer = Tokenizer::new(blob, vec![], None);
         let tokens = tokenizer.split_into_words();
         return tokens;
     }
 
     /// **Converts an iterable of ``tokens`` into n-grams**
-    /// 
+    ///
     /// This assumes front grams (all grams made starting from the left side
     /// of the token).
     ///
     /// Optionally accepts a ``min_gram`` parameter, which takes an integer &
     /// controls the minimum gram length. Default is ``3``.
     ///
     /// Optionally accepts a ``max_gram`` parameter, which takes an integer &
     /// controls the maximum gram length. Default is ``6``.
     ///
-    pub fn make_ngrams(&self, tokens: Vec<String>, min_gram: usize, max_gram: usize) -> HashMap<String, Vec<usize>> {
+    pub fn make_ngrams(
+        &self,
+        tokens: Vec<String>,
+        min_gram: usize,
+        max_gram: usize,
+    ) -> HashMap<String, Vec<usize>> {
         let mut terms: HashMap<String, Vec<usize>> = HashMap::new();
-
+
         for (position, token) in tokens.iter().enumerate() {
             for window_length in min_gram..min(max_gram + 1, token.len() + 1) {
                 // Assuming "front" grams.
                 let gram = &token[..window_length];
-                terms.entry(gram.to_string()).or_insert(Vec::new()).push(position);
+                terms
+                    .entry(gram.to_string())
+                    .or_insert(Vec::new())
+                    .push(position);
             }
         }
-
+
         return terms;
     }
+
+    /// Given a ``term``, hashes it & returns a string of the first N letters
+    ///
+    /// Optionally accepts a ``length`` parameter, which takes an integer &
+    /// controls how much of the hash is returned. Default is ``6``.
+    ///
+    /// This is useful when writing files to the file system, as it helps
+    /// us keep from putting too many files in a given directory (~32K max
+    /// with the default).
+    ///
+    pub fn hash_name(&self, term: &str, length: usize) -> String {
+        // Make sure it's ASCII.
+        let term = term.to_ascii_lowercase();
+
+        // We hash & slice the term to get a small-ish number of fields
+        // and good distribution between them.
+        let hash = md5::compute(&term);
+        let hashed = format!("{:x}", hash);
+
+        // Cut string after length characters
+        let hashed = &hashed[..length];
+
+        return hashed.to_string();
+    }
+
+    /// Given a ``term``, creates a segment filename based on the hash of the term.
+    ///
+    /// Returns the full path to the segment.
+    ///
+    pub fn make_segment_name(&self, term: &str) -> String {
+        let term = &self.hash_name(term, 6);
+
+        let index_file_name = format!("{}.index", term);
+        let segment_path = Path::new(&self.index_path).join(index_file_name);
+        let segment_path = segment_path.to_str().unwrap().to_string();
+
+        return segment_path;
+    }
 }
diff --git a/tests/rustysearch.rs b/tests/rustysearch.rs
index bd5b13f..f323503 100644
--- a/tests/rustysearch.rs
+++ b/tests/rustysearch.rs
@@ -78,6 +78,24 @@ mod tests {
         assert_eq!(terms["hel"].len(), 1);
     }
 
+    #[test]
+    fn test_hash_name(){
+        let search = Rustysearch::new("/tmp/rustysearch_hashname");
+        search.setup();
+
+        let hash = search.hash_name("hello", 6);
+        assert_eq!(hash, "5d4140");
+    }
+
+    #[test]
+    fn test_make_segment_name(){
+        let search = Rustysearch::new("/tmp/rustysearch_makesegmentname");
+        search.setup();
+
+        let segment_name = search.make_segment_name("hello");
+        assert_eq!(segment_name, "/tmp/rustysearch_makesegmentname/index/5d4140.index");
+    }
+
     // Helper function to clean up the stats file
     fn clean_stats(tmp_path: &str){
         let search = Rustysearch::new(tmp_path);
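
Usage note (reviewer sketch, not part of the patch): the snippet below shows how
the new helpers compose with the existing tokenizer API during indexing. The
module path ``rustysearch::search::Rustysearch``, the scratch directory, and the
sample input are illustrative assumptions; every method called appears in this
diff or its tests.

    use rustysearch::search::Rustysearch;

    fn main() {
        // Hypothetical scratch directory for a throwaway demo index.
        let search = Rustysearch::new("/tmp/rustysearch_demo");
        search.setup();

        // Tokenize a document, then expand each token into front n-grams
        // (min_gram = 3, max_gram = 6, the documented defaults).
        let tokens = search.make_tokens("hello world");
        let terms = search.make_ngrams(tokens, 3, 6);

        // Each term maps to a segment file named after the first six hex
        // characters of its MD5 digest, via make_segment_name() above.
        for term in terms.keys() {
            println!("{} -> {}", term, search.make_segment_name(term));
        }
    }

The MD5 here is a bucketing device rather than a security measure: truncating
the hex digest just spreads terms evenly across segment files so that no single
directory fills up with index files.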