feat: Add make N-grams function
parent c3d4eccae6
commit 3321fea4d0
@@ -1,4 +1,4 @@
-use std::{fs, path::Path};
+use std::{fs, path::Path, collections::HashMap, cmp::min};
 
 use crate::{types::Stats, analyze::tokenizer::Tokenizer};
@@ -109,4 +109,29 @@ impl Rustysearch {
         let tokens = tokenizer.split_into_words();
         return tokens;
     }
+
+    /// **Converts an iterable of ``tokens`` into n-grams**
+    ///
+    /// This assumes front grams (all grams made starting from the left side
+    /// of the token).
+    ///
+    /// Optionally accepts a ``min_gram`` parameter, which takes an integer &
+    /// controls the minimum gram length. Default is ``3``.
+    ///
+    /// Optionally accepts a ``max_gram`` parameter, which takes an integer &
+    /// controls the maximum gram length. Default is ``6``.
+    ///
+    pub fn make_ngrams(&self, tokens: Vec<String>, min_gram: usize, max_gram: usize) -> HashMap<String, Vec<usize>> {
+        let mut terms: HashMap<String, Vec<usize>> = HashMap::new();
+
+        for (position, token) in tokens.iter().enumerate() {
+            for window_length in min_gram..min(max_gram + 1, token.len() + 1) {
+                // Assuming "front" grams.
+                let gram = &token[..window_length];
+                terms.entry(gram.to_string()).or_insert(Vec::new()).push(position);
+            }
+        }
+
+        return terms;
+    }
 }
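The hunk above builds front n-grams by taking prefixes of each token whose lengths run from `min_gram` up to the smaller of `max_gram` and the token length, and records the token's position under each prefix. Below is a minimal, self-contained sketch of the same windowing logic; the free-function name `make_front_ngrams` and the `main` driver are illustrative only and not part of the commit.

```rust
use std::cmp::min;
use std::collections::HashMap;

// Illustrative free-function version of the committed method.
fn make_front_ngrams(tokens: &[String], min_gram: usize, max_gram: usize) -> HashMap<String, Vec<usize>> {
    let mut terms: HashMap<String, Vec<usize>> = HashMap::new();

    for (position, token) in tokens.iter().enumerate() {
        // Window lengths run from `min_gram` up to the shorter of
        // `max_gram` and the token length, inclusive.
        for window_length in min_gram..min(max_gram + 1, token.len() + 1) {
            // Like the committed code, this slices by byte index, so it
            // assumes the token is ASCII (a multi-byte UTF-8 boundary
            // would panic).
            let gram = &token[..window_length];
            terms.entry(gram.to_string()).or_default().push(position);
        }
    }

    terms
}

fn main() {
    let tokens = vec!["hello".to_string(), "world".to_string()];
    let terms = make_front_ngrams(&tokens, 3, 6);

    // "hello" at position 0 yields "hel", "hell", "hello";
    // "world" at position 1 yields "wor", "worl", "world".
    assert_eq!(terms["hel"], vec![0]);
    assert_eq!(terms["world"], vec![1]);
    println!("{:?}", terms);
}
```

The only deliberate difference from the committed code is `or_default()` in place of `or_insert(Vec::new())`; both insert an empty `Vec` on first sight of a gram.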
@@ -63,6 +63,18 @@ mod tests {
         assert_eq!(total_docs, 1);
     }
+
+    #[test]
+    fn test_make_ngrams(){
+        let search = Rustysearch::new("/tmp/rustysearch");
+        search.setup();
+
+        let tokens = vec!["hello".to_string(), "world".to_string()];
+        let terms = search.make_ngrams(tokens, 3, 6);
+
+        assert_eq!(terms["hel"].len(), 1);
+    }
+
     // Helper function to clean up the stats file
     fn clean_stats(){
         let search = Rustysearch::new("/tmp/rustysearch");
         search.setup();
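The committed test only checks that the gram "hel" maps to a single position. A fuller (hypothetical) variant, which would sit alongside it inside the same `mod tests` block, could pin down the complete output for the two five-letter tokens: six keys in total, with the "hello" prefixes pointing at position 0 and the "world" prefixes at position 1.

```rust
#[test]
fn test_make_ngrams_full_output() {
    // Hypothetical, more exhaustive variant of the committed test.
    let search = Rustysearch::new("/tmp/rustysearch");
    search.setup();

    let tokens = vec!["hello".to_string(), "world".to_string()];
    let terms = search.make_ngrams(tokens, 3, 6);

    // Each 5-letter token yields grams of length 3, 4 and 5.
    assert_eq!(terms.len(), 6);
    for gram in ["hel", "hell", "hello"] {
        assert_eq!(terms[gram], vec![0]);
    }
    for gram in ["wor", "worl", "world"] {
        assert_eq!(terms[gram], vec![1]);
    }
}
```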