feat: Add hash term function

parent ad2af90692
commit 1589fdb6b1

Cargo.lock (generated)  +7
@@ -17,6 +17,12 @@ version = "1.0.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38"
+
+[[package]]
+name = "md5"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
 
 [[package]]
 name = "memchr"
 version = "2.6.4"
@@ -74,6 +80,7 @@ checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
 name = "rustysearch"
 version = "0.1.0"
 dependencies = [
+ "md5",
  "regex",
  "serde",
  "serde_json",

Cargo.toml

@@ -13,3 +13,4 @@ serde = { version = "1.0.189", features = ["derive"] }
 serde_json = "1.0.107"
 unicode-segmentation = "^1"
 regex = "^1"
+md5 = "0.7.0"
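
For context, the md5 crate added above has a tiny API: md5::compute returns
a Digest that formats as lowercase hex. A minimal sketch (the asserted value
is the well-known MD5 of "hello"):

    fn main() {
        // md5::compute accepts anything AsRef<[u8]> and returns a Digest.
        let digest = md5::compute(b"hello");
        let hex = format!("{:x}", digest);
        assert_eq!(hex, "5d41402abc4b2a76b9719d911017c592");
        // hash_name below keeps only the first `length` characters:
        assert_eq!(&hex[..6], "5d4140");
    }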

@@ -1,6 +1,6 @@
-use std::{fs, path::Path, collections::HashMap, cmp::min};
+use std::{cmp::min, collections::HashMap, fs, path::Path};
 
-use crate::{types::Stats, analyze::tokenizer::Tokenizer};
+use crate::{analyze::tokenizer::Tokenizer, types::Stats};
 
 pub struct Rustysearch {
     base_directory: String,
@@ -121,17 +121,63 @@ impl Rustysearch {
     /// Optionally accepts a ``max_gram`` parameter, which takes an integer &
     /// controls the maximum gram length. Default is ``6``.
     ///
-    pub fn make_ngrams(&self, tokens: Vec<String>, min_gram: usize, max_gram: usize) -> HashMap<String, Vec<usize>> {
+    pub fn make_ngrams(
+        &self,
+        tokens: Vec<String>,
+        min_gram: usize,
+        max_gram: usize,
+    ) -> HashMap<String, Vec<usize>> {
         let mut terms: HashMap<String, Vec<usize>> = HashMap::new();
 
         for (position, token) in tokens.iter().enumerate() {
             for window_length in min_gram..min(max_gram + 1, token.len() + 1) {
                 // Assuming "front" grams.
                 let gram = &token[..window_length];
-                terms.entry(gram.to_string()).or_insert(Vec::new()).push(position);
+                terms
+                    .entry(gram.to_string())
+                    .or_insert(Vec::new())
+                    .push(position);
             }
         }
 
         return terms;
     }
+
+    /// Given a ``term``, hashes it & returns a string of the first N letters.
+    ///
+    /// Optionally accepts a ``length`` parameter, which takes an integer &
+    /// controls how much of the hash is returned. Default is ``6``.
+    ///
+    /// This is useful when writing files to the file system, as it helps
+    /// us keep from putting too many files in a given directory (~32K max
+    /// with the default).
+    ///
+    pub fn hash_name(&self, term: &str, length: usize) -> String {
+        // Make sure it's ASCII.
+        let term = term.to_ascii_lowercase();
+
+        // We hash & slice the term to get a small-ish number of fields
+        // and good distribution between them.
+        let hash = md5::compute(&term);
+        let hashed = format!("{:x}", hash);
+
+        // Cut the string after ``length`` characters.
+        let hashed = &hashed[..length];
+
+        return hashed.to_string();
+    }
+
+    /// Given a ``term``, creates a segment filename based on the hash of the term.
+    ///
+    /// Returns the full path to the segment.
+    ///
+    pub fn make_segment_name(&self, term: &str) -> String {
+        let term = &self.hash_name(term, 6);
+
+        let index_file_name = format!("{}.index", term);
+        let segment_path = Path::new(&self.index_path).join(index_file_name);
+        let segment_path = segment_path.to_str().unwrap().to_string();
+
+        return segment_path;
+    }
 }
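
To make the n-gram loop concrete, a hedged usage sketch (the /tmp path and
token values are illustrative, not from the commit): with min_gram = 3 and
max_gram = 6, the 5-byte token "hello" yields the "front" grams "hel",
"hell", and "hello", each recorded at the token's position in the input.

    let search = Rustysearch::new("/tmp/rustysearch_ngrams"); // example path
    search.setup();

    let terms = search.make_ngrams(vec!["hello".to_string()], 3, 6);
    assert_eq!(terms["hel"], vec![0]);
    assert_eq!(terms["hell"], vec![0]);
    assert_eq!(terms["hello"], vec![0]);

One caveat: the gram slice is byte-indexed, so a token containing multi-byte
UTF-8 characters could panic at a non-character boundary.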

@@ -78,6 +78,24 @@ mod tests {
         assert_eq!(terms["hel"].len(), 1);
     }
 
+    #[test]
+    fn test_hash_name() {
+        let search = Rustysearch::new("/tmp/rustysearch_hashname");
+        search.setup();
+
+        let hash = search.hash_name("hello", 6);
+        assert_eq!(hash, "5d4140");
+    }
+
+    #[test]
+    fn test_make_segment_name() {
+        let search = Rustysearch::new("/tmp/rustysearch_makesegmentname");
+        search.setup();
+
+        let segment_name = search.make_segment_name("hello");
+        assert_eq!(segment_name, "/tmp/rustysearch_makesegmentname/index/5d4140.index");
+    }
+
     // Helper function to clean up the stats file
     fn clean_stats(tmp_path: &str){
         let search = Rustysearch::new(tmp_path);
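
One robustness note on the new make_segment_name: Path::to_str returns None
when a path is not valid UTF-8, so the unwrap can panic in that edge case.
A hedged alternative sketch (not what this commit does; the helper name is
invented for illustration) is to_string_lossy, which never panics:

    use std::path::Path;

    // Hypothetical helper: build the segment path without unwrap.
    // to_string_lossy substitutes U+FFFD for any non-UTF-8 bytes
    // instead of returning an Option.
    fn segment_path_lossy(index_path: &str, hashed_term: &str) -> String {
        let file_name = format!("{}.index", hashed_term);
        Path::new(index_path)
            .join(file_name)
            .to_string_lossy()
            .into_owned()
    }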