feat: Add make N-grams function
This commit is contained in:
parent
c3d4eccae6
commit
3321fea4d0
@ -1,4 +1,4 @@
|
|||||||
use std::{fs, path::Path};
|
use std::{fs, path::Path, collections::HashMap, cmp::min};
|
||||||
|
|
||||||
use crate::{types::Stats, analyze::tokenizer::Tokenizer};
|
use crate::{types::Stats, analyze::tokenizer::Tokenizer};
|
||||||
|
|
||||||
@ -109,4 +109,29 @@ impl Rustysearch {
|
|||||||
let tokens = tokenizer.split_into_words();
|
let tokens = tokenizer.split_into_words();
|
||||||
return tokens;
|
return tokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// **Converts a iterable of ``tokens`` into n-grams**
|
||||||
|
///
|
||||||
|
/// This assumes front grams (all grams made starting from the left side
|
||||||
|
/// of the token).
|
||||||
|
///
|
||||||
|
/// Optionally accepts a ``min_gram`` parameter, which takes an integer &
|
||||||
|
/// controls the minimum gram length. Default is ``3``.
|
||||||
|
///
|
||||||
|
/// Optionally accepts a ``max_gram`` parameter, which takes an integer &
|
||||||
|
/// controls the maximum gram length. Default is ``6``.
|
||||||
|
///
|
||||||
|
pub fn make_ngrams(&self, tokens: Vec<String>, min_gram: usize, max_gram: usize) -> HashMap<String, Vec<usize>> {
|
||||||
|
let mut terms: HashMap<String, Vec<usize>> = HashMap::new();
|
||||||
|
|
||||||
|
for (position, token) in tokens.iter().enumerate() {
|
||||||
|
for window_length in min_gram..min(max_gram + 1, token.len() + 1) {
|
||||||
|
// Assuming "front" grams.
|
||||||
|
let gram = &token[..window_length];
|
||||||
|
terms.entry(gram.to_string()).or_insert(Vec::new()).push(position);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return terms;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -63,6 +63,18 @@ mod tests {
|
|||||||
assert_eq!(total_docs, 1);
|
assert_eq!(total_docs, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The front gram "hel" is produced only by "hello", so it must map to
/// exactly one token position.
#[test]
fn test_make_ngrams() {
    let engine = Rustysearch::new("/tmp/rustysearch");
    engine.setup();

    let words: Vec<String> = ["hello", "world"]
        .iter()
        .map(|word| word.to_string())
        .collect();
    let ngrams = engine.make_ngrams(words, 3, 6);

    let hel_positions = &ngrams["hel"];
    assert_eq!(hel_positions.len(), 1);
}
|
||||||
|
|
||||||
|
// Helper function to clean up the stats file
|
||||||
fn clean_stats(){
|
fn clean_stats(){
|
||||||
let search = Rustysearch::new("/tmp/rustysearch");
|
let search = Rustysearch::new("/tmp/rustysearch");
|
||||||
search.setup();
|
search.setup();
|
||||||
|
Loading…
x
Reference in New Issue
Block a user