feat: Add hash term function

Alex Wellnitz 2023-10-25 22:20:43 +02:00
parent ad2af90692
commit 1589fdb6b1
4 changed files with 84 additions and 12 deletions

Cargo.lock (generated)

@@ -17,6 +17,12 @@ version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38"
+[[package]]
+name = "md5"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
[[package]]
name = "memchr"
version = "2.6.4"
@@ -74,6 +80,7 @@ checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
name = "rustysearch"
version = "0.1.0"
dependencies = [
"md5",
"regex",
"serde",
"serde_json",

Cargo.toml

@@ -12,4 +12,5 @@ license = "MIT"
serde = { version = "1.0.189", features = ["derive"] }
serde_json = "1.0.107"
unicode-segmentation = "^1"
regex = "^1"
regex = "^1"
md5 = "0.7.0"

(Rustysearch implementation)

@@ -1,6 +1,6 @@
-use std::{fs, path::Path, collections::HashMap, cmp::min};
+use std::{cmp::min, collections::HashMap, fs, path::Path};
-use crate::{types::Stats, analyze::tokenizer::Tokenizer};
+use crate::{analyze::tokenizer::Tokenizer, types::Stats};
pub struct Rustysearch {
base_directory: String,
@@ -76,7 +76,7 @@ impl Rustysearch {
/// 'version': '1.0.0',
/// 'total_docs': 25,
/// }
///
pub fn write_stats(&self, new_stats: Stats) -> std::io::Result<()> {
// Write new_stats as json to stats_path
let new_stats_json = serde_json::to_string(&new_stats).unwrap();
@@ -88,7 +88,7 @@ impl Rustysearch {
///
/// This is important for scoring reasons & is typically called as part
/// of the indexing process.
///
pub fn increment_total_docs(&self) {
let mut current_stats = self.read_stats().unwrap();
current_stats.total_docs += 1;
@@ -96,14 +96,14 @@ impl Rustysearch {
}
/// Returns the total number of documents the index is aware of
///
pub fn get_total_docs(&self) -> i32 {
let stats = self.read_stats().unwrap();
return stats.total_docs;
}
/// Given a string (``blob``) of text, this will return a Vector of tokens.
///
pub fn make_tokens(&self, blob: &str) -> Vec<String> {
let tokenizer = Tokenizer::new(blob, vec![], None);
let tokens = tokenizer.split_into_words();
@@ -111,7 +111,7 @@ impl Rustysearch {
}
/// **Converts an iterable of ``tokens`` into n-grams**
///
/// This assumes front grams (all grams made starting from the left side
/// of the token).
///
@@ -121,17 +121,63 @@ impl Rustysearch {
/// Optionally accepts a ``max_gram`` parameter, which takes an integer &
/// controls the maximum gram length. Default is ``6``.
///
-pub fn make_ngrams(&self, tokens: Vec<String>, min_gram: usize, max_gram: usize) -> HashMap<String, Vec<usize>> {
+pub fn make_ngrams(
+&self,
+tokens: Vec<String>,
+min_gram: usize,
+max_gram: usize,
+) -> HashMap<String, Vec<usize>> {
let mut terms: HashMap<String, Vec<usize>> = HashMap::new();
for (position, token) in tokens.iter().enumerate() {
for window_length in min_gram..min(max_gram + 1, token.len() + 1) {
// Assuming "front" grams.
let gram = &token[..window_length];
-terms.entry(gram.to_string()).or_insert(Vec::new()).push(position);
+terms
+.entry(gram.to_string())
+.or_insert(Vec::new())
+.push(position);
}
}
return terms;
}
+/// Given a ``term``, hashes it & returns a string of the first N letters
+///
+/// Optionally accepts a ``length`` parameter, which takes an integer &
+/// controls how much of the hash is returned. Default is ``6``.
+///
+/// This is useful when writing files to the file system, as it helps
+/// us keep from putting too many files in a given directory (~32K max
+/// with the default).
+///
+pub fn hash_name(&self, term: &str, length: usize) -> String {
+// Make sure it's ASCII.
+let term = term.to_ascii_lowercase();
+// We hash & slice the term to get a small-ish number of fields
+// and good distribution between them.
+let hash = md5::compute(&term);
+let hashed = format!("{:x}", hash);
+// Cut string after length characters
+let hashed = &hashed[..length];
+return hashed.to_string();
+}
+/// Given a ``term``, creates a segment filename based on the hash of the term.
+///
+/// Returns the full path to the segment.
+///
+pub fn make_segment_name(&self, term: &str) -> String {
+let term = &self.hash_name(term, 6);
+let index_file_name = format!("{}.index", term);
+let segment_path = Path::new(&self.index_path).join(index_file_name);
+let segment_path = segment_path.to_str().unwrap().to_string();
+return segment_path;
+}
}
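A quick illustration of make_ngrams after the reformat (its behavior is unchanged by this commit): every front gram between min_gram and max_gram characters is recorded against the token's position. Sketch, assuming a set-up Rustysearch instance named search:

// Front grams for a single token at position 0.
let terms = search.make_ngrams(vec!["hello".to_string()], 3, 6);
assert_eq!(terms["hel"], vec![0]);
assert_eq!(terms["hell"], vec![0]);
assert_eq!(terms["hello"], vec![0]);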

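The two new functions compose: hash_name buckets a term by the first `length` hex characters of its md5 digest, and make_segment_name maps that bucket to a file under the index directory. Sketch matching the tests below; the /tmp/rustysearch base directory is only an example:

let search = Rustysearch::new("/tmp/rustysearch");
search.setup();
// md5("hello") = 5d41402abc4b2a76b9719d911017c592, truncated to 6 chars
assert_eq!(search.hash_name("hello", 6), "5d4140");
// index_path is "<base>/index", so the segment file lands there
assert_eq!(
    search.make_segment_name("hello"),
    "/tmp/rustysearch/index/5d4140.index"
);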
(Rustysearch tests)

@@ -78,6 +78,24 @@ mod tests {
assert_eq!(terms["hel"].len(), 1);
}
+#[test]
+fn test_hash_name(){
+let search = Rustysearch::new("/tmp/rustysearch_hashname");
+search.setup();
+let hash = search.hash_name("hello", 6);
+assert_eq!(hash, "5d4140");
+}
+#[test]
+fn test_make_segment_name(){
+let search = Rustysearch::new("/tmp/rustysearch_makesegmentname");
+search.setup();
+let segment_name = search.make_segment_name("hello");
+assert_eq!(segment_name, "/tmp/rustysearch_makesegmentname/index/5d4140.index");
+}
// Helper function to clean up the stats file
fn clean_stats(tmp_path: &str){
let search = Rustysearch::new(tmp_path);