feat: Add make N-grams function

This commit is contained in:
Alex Wellnitz 2023-10-25 11:07:45 +02:00
parent c3d4eccae6
commit 3321fea4d0
2 changed files with 38 additions and 1 deletions

View File

@ -1,4 +1,4 @@
use std::{fs, path::Path}; use std::{fs, path::Path, collections::HashMap, cmp::min};
use crate::{types::Stats, analyze::tokenizer::Tokenizer}; use crate::{types::Stats, analyze::tokenizer::Tokenizer};
@ -109,4 +109,29 @@ impl Rustysearch {
let tokens = tokenizer.split_into_words(); let tokens = tokenizer.split_into_words();
return tokens; return tokens;
} }
/// **Converts an iterable of ``tokens`` into n-grams**
///
/// This assumes front grams (all grams made starting from the left side
/// of the token). Gram lengths are measured in Unicode characters, so
/// tokens containing multi-byte UTF-8 characters are handled safely.
///
/// Optionally accepts a ``min_gram`` parameter, which takes an integer &
/// controls the minimum gram length. Default is ``3``.
///
/// Optionally accepts a ``max_gram`` parameter, which takes an integer &
/// controls the maximum gram length. Default is ``6``.
///
/// Returns a map from each gram to the positions (indices into
/// ``tokens``) of the tokens that produced it.
///
pub fn make_ngrams(&self, tokens: Vec<String>, min_gram: usize, max_gram: usize) -> HashMap<String, Vec<usize>> {
    let mut terms: HashMap<String, Vec<usize>> = HashMap::new();
    for (position, token) in tokens.iter().enumerate() {
        // Count characters, not bytes: byte-slicing a &str at a
        // non-character boundary panics on multi-byte UTF-8 input.
        let char_count = token.chars().count();
        for window_length in min_gram..min(max_gram + 1, char_count + 1) {
            // Assuming "front" grams: take the first `window_length` chars.
            let gram: String = token.chars().take(window_length).collect();
            terms.entry(gram).or_default().push(position);
        }
    }
    terms
}
} }

View File

@ -63,6 +63,18 @@ mod tests {
assert_eq!(total_docs, 1); assert_eq!(total_docs, 1);
} }
#[test]
fn test_make_ngrams(){
    let search = Rustysearch::new("/tmp/rustysearch");
    search.setup();
    let tokens = vec!["hello".to_string(), "world".to_string()];
    let terms = search.make_ngrams(tokens, 3, 6);
    // "hello" and "world" each yield grams of length 3..=5,
    // so six distinct grams in total.
    assert_eq!(terms.len(), 6);
    // Each gram records the position of the token it came from.
    assert_eq!(terms["hel"], vec![0]);
    assert_eq!(terms["hell"], vec![0]);
    assert_eq!(terms["hello"], vec![0]);
    assert_eq!(terms["wor"], vec![1]);
    assert_eq!(terms["worl"], vec![1]);
    assert_eq!(terms["world"], vec![1]);
}
// Helper function to clean up the stats file
fn clean_stats(){ fn clean_stats(){
let search = Rustysearch::new("/tmp/rustysearch"); let search = Rustysearch::new("/tmp/rustysearch");
search.setup(); search.setup();