diff --git a/src/search.rs b/src/search.rs
index f6cbf25..94552c4 100644
--- a/src/search.rs
+++ b/src/search.rs
@@ -1,4 +1,4 @@
-use std::{fs, path::Path};
+use std::{fs, path::Path, collections::HashMap, cmp::min};
 
 use crate::{types::Stats, analyze::tokenizer::Tokenizer};
 
@@ -109,4 +109,29 @@ impl Rustysearch {
         let tokens = tokenizer.split_into_words();
         return tokens;
     }
+
+    /// **Converts an iterable of ``tokens`` into n-grams**
+    ///
+    /// This assumes front grams (all grams made starting from the left side
+    /// of the token).
+    ///
+    /// Optionally accepts a ``min_gram`` parameter, which takes an integer &
+    /// controls the minimum gram length. Default is ``3``.
+    ///
+    /// Optionally accepts a ``max_gram`` parameter, which takes an integer &
+    /// controls the maximum gram length. Default is ``6``.
+    ///
+    pub fn make_ngrams(&self, tokens: Vec<String>, min_gram: usize, max_gram: usize) -> HashMap<String, Vec<usize>> {
+        let mut terms: HashMap<String, Vec<usize>> = HashMap::new();
+
+        for (position, token) in tokens.iter().enumerate() {
+            for window_length in min_gram..min(max_gram + 1, token.len() + 1) {
+                // Assuming "front" grams.
+                let gram = &token[..window_length];
+                terms.entry(gram.to_string()).or_insert(Vec::new()).push(position);
+            }
+        }
+
+        return terms;
+    }
 }
diff --git a/tests/rustysearch.rs b/tests/rustysearch.rs
index 90e9a93..11ce3ec 100644
--- a/tests/rustysearch.rs
+++ b/tests/rustysearch.rs
@@ -63,6 +63,18 @@ mod tests {
         assert_eq!(total_docs, 1);
     }
 
+    #[test]
+    fn test_make_ngrams(){
+        let search = Rustysearch::new("/tmp/rustysearch");
+        search.setup();
+
+        let tokens = vec!["hello".to_string(), "world".to_string()];
+        let terms = search.make_ngrams(tokens, 3, 6);
+
+        assert_eq!(terms["hel"].len(), 1);
+    }
+
+    // Helper function to clean up the stats file
     fn clean_stats(){
         let search = Rustysearch::new("/tmp/rustysearch");
         search.setup();
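Note on the new `make_ngrams` method: the sketch below restates the added logic as a self-contained program so the expected shape of the output can be checked without the rest of the crate. The free function and `main` here are illustrative only and are not part of the patch; they mirror `Rustysearch::make_ngrams` as added above.

```rust
use std::cmp::min;
use std::collections::HashMap;

// Standalone rendering of the patched method, for illustration.
fn make_ngrams(tokens: Vec<String>, min_gram: usize, max_gram: usize) -> HashMap<String, Vec<usize>> {
    let mut terms: HashMap<String, Vec<usize>> = HashMap::new();

    for (position, token) in tokens.iter().enumerate() {
        // Front grams only: every slice starts at the beginning of the token,
        // and the window is capped by both max_gram and the token length.
        for window_length in min_gram..min(max_gram + 1, token.len() + 1) {
            let gram = &token[..window_length];
            terms.entry(gram.to_string()).or_insert(Vec::new()).push(position);
        }
    }

    terms
}

fn main() {
    let tokens = vec!["hello".to_string(), "world".to_string()];
    let terms = make_ngrams(tokens, 3, 6);

    // "hello" (position 0) yields "hel", "hell", "hello";
    // "world" (position 1) yields "wor", "worl", "world".
    assert_eq!(terms["hel"], vec![0]);
    assert_eq!(terms["world"], vec![1]);
    assert_eq!(terms.len(), 6);
}
```

One caveat worth flagging in review: `&token[..window_length]` slices by byte offset, so a token containing multi-byte UTF-8 can panic when a window boundary lands inside a character. If the tokenizer can emit non-ASCII tokens, iterating over `char_indices` to pick slice boundaries would avoid this.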