feat: Add Update and Save Segment functions

This commit is contained in:
Alex Wellnitz 2023-10-26 19:20:59 +02:00
parent 718e57355b
commit f070b45fac
4 changed files with 300 additions and 4 deletions

148
Cargo.lock generated
View File

@ -11,12 +11,58 @@ dependencies = [
"memchr",
]
[[package]]
name = "bitflags"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bitflags"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "errno"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860"
dependencies = [
"libc",
"windows-sys",
]
[[package]]
name = "fastrand"
version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5"
[[package]]
name = "itoa"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38"
[[package]]
name = "libc"
version = "0.2.149"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
[[package]]
name = "linux-raw-sys"
version = "0.4.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f"
[[package]]
name = "md5"
version = "0.7.0"
@ -47,6 +93,15 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "redox_syscall"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29"
dependencies = [
"bitflags 1.3.2",
]
[[package]]
name = "regex"
version = "1.10.2"
@ -76,6 +131,19 @@ version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
[[package]]
name = "rustix"
version = "0.38.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67ce50cb2e16c2903e30d1cbccfd8387a74b9d4c938b6a4c5ec6cc7556f7a8a0"
dependencies = [
"bitflags 2.4.1",
"errno",
"libc",
"linux-raw-sys",
"windows-sys",
]
[[package]]
name = "rustysearch"
version = "0.1.0"
@ -84,6 +152,7 @@ dependencies = [
"regex",
"serde",
"serde_json",
"tempfile",
"unicode-segmentation",
]
@ -135,6 +204,19 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "tempfile"
version = "3.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef"
dependencies = [
"cfg-if",
"fastrand",
"redox_syscall",
"rustix",
"windows-sys",
]
[[package]]
name = "unicode-ident"
version = "1.0.12"
@ -146,3 +228,69 @@ name = "unicode-segmentation"
version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
[[package]]
name = "windows-sys"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
[[package]]
name = "windows_aarch64_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
[[package]]
name = "windows_i686_gnu"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
[[package]]
name = "windows_i686_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
[[package]]
name = "windows_x86_64_gnu"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
[[package]]
name = "windows_x86_64_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"

View File

@ -14,3 +14,4 @@ serde_json = "1.0.107"
unicode-segmentation = "^1"
regex = "^1"
md5 = "0.7.0"
tempfile = "3.8.0"

View File

@ -1,6 +1,13 @@
use std::{cmp::min, collections::HashMap, fs, path::Path};
use std::{
cmp::min,
collections::{HashMap, HashSet},
fs,
io::{Read, Write},
path::Path,
};
use tempfile::NamedTempFile;
use serde_json::{Value, json};
use serde_json::{json, Value};
use crate::{analyze::tokenizer::Tokenizer, types::Stats};
@ -204,4 +211,100 @@ impl Rustysearch {
/// Serializes one index entry as a single segment line.
///
/// The returned string has the form `<term>\t<term_info>\n`, where
/// `term_info` is rendered as compact JSON.
pub fn make_record(&self, term: &str, term_info: &Value) -> String {
    // `Value` implements `Display` as compact JSON, so it can be formatted
    // directly; there is no need to deep-copy it through `json!` and then
    // build an intermediate `String`.
    format!("{}\t{}\n", term, term_info)
}
/// Merges `new_info` into `orig_info` and returns a copy of the result.
///
/// Both arguments are JSON objects that map a document id to an array of
/// positions. Used for updating term info within the segments.
///
/// * A document id that only exists in `new_info` is copied over as-is.
/// * A document id present in both gets the union of both position
///   arrays: the existing positions keep their order, previously unseen
///   positions from `new_info` are appended in the order given, and
///   duplicates are dropped. The result is therefore deterministic.
///
/// `orig_info` is updated in place; the returned value is a clone of it.
///
/// # Panics
///
/// Panics if `orig_info` or `new_info` is not a JSON object, or if the
/// positions of a document id present in both are not JSON arrays.
pub fn update_term_info(&self, orig_info: &mut Value, new_info: &Value) -> Value {
    let incoming_docs = new_info
        .as_object()
        .expect("new_info must be a JSON object");
    let known_docs = orig_info
        .as_object_mut()
        .expect("orig_info must be a JSON object");

    for (doc_id, positions) in incoming_docs {
        match known_docs.get_mut(doc_id) {
            None => {
                known_docs.insert(doc_id.clone(), positions.clone());
            }
            Some(existing) => {
                let incoming = positions
                    .as_array()
                    .expect("positions in new_info must be a JSON array");
                let current = existing
                    .as_array()
                    .expect("positions in orig_info must be a JSON array");

                // Track what has been emitted by its JSON text so that any
                // kind of JSON value can be de-duplicated, while `merged`
                // preserves a stable first-seen order.
                let mut seen: HashSet<String> = HashSet::new();
                let mut merged: Vec<Value> =
                    Vec::with_capacity(current.len() + incoming.len());
                for position in current.iter().chain(incoming.iter()) {
                    if seen.insert(position.to_string()) {
                        merged.push(position.clone());
                    }
                }
                *existing = Value::Array(merged);
            }
        }
    }

    orig_info.clone()
}
/// Writes out new index data to disk.
///
/// Takes a ``term`` string & ``term_info`` dict. It will
/// rewrite the segment in alphabetical order, adding in the data
/// where appropriate.
///
/// Optionally takes an ``update`` parameter, which is a boolean &
/// determines whether the provided ``term_info`` should overwrite or
/// update the data in the segment. Default is ``False`` (overwrite).
///
pub fn save_segment(&self, term: &str, term_info: &Value, update: bool) -> bool {
let seg_name = &self.make_segment_name(term);
let mut new_seg_file = NamedTempFile::new().unwrap();
let mut written = false;
if !Path::new(&seg_name).exists() {
fs::write(&seg_name, "").unwrap();
}
let mut seg_file = fs::OpenOptions::new().read(true).open(&seg_name).unwrap();
let mut buf = String::new();
seg_file.read_to_string(&mut buf).unwrap();
for line in buf.lines() {
let (seg_term, seg_term_info) = self.parse_record(line);
if !written && seg_term > term.to_string() {
let new_line = self.make_record(term, term_info);
new_seg_file.write_all(new_line.as_bytes()).unwrap();
written = true;
} else if seg_term == term {
if update {
let new_info = self.update_term_info(&mut json!(seg_term_info), term_info);
let new_line = self.make_record(term, &new_info);
new_seg_file.write_all(new_line.as_bytes()).unwrap();
} else {
let line = self.make_record(term, term_info);
new_seg_file.write_all(line.as_bytes()).unwrap();
}
written = true;
}
new_seg_file.write_all(line.as_bytes()).unwrap();
}
if !written {
let line = self.make_record(term, term_info);
new_seg_file.write_all(line.as_bytes()).unwrap();
}
fs::rename(&new_seg_file.path(), &seg_name).unwrap();
new_seg_file.flush().unwrap();
// new_seg_file.close().unwrap();
return true;
}
}

View File

@ -132,6 +132,50 @@ mod tests {
assert_eq!(record, "hello world\t{\"frequency\":100,\"idf\":1.5}\n");
}
/// `update_term_info` must add unknown document ids, merge the positions of
/// known ones without duplicates, and update `orig_info` in place.
#[test]
fn test_update_term_info() {
    let search = Rustysearch::new("/tmp/rustysearch");

    // A document id that is missing from the original info is added as-is.
    let mut orig_info = json!({
        "doc1": ["1", "2"],
        "doc2": ["3", "4"]
    });
    let new_info = json!({
        "doc3": ["1", "2"]
    });
    let expected_result = json!({
        "doc1": ["1", "2"],
        "doc2": ["3", "4"],
        "doc3": ["1", "2"]
    });
    let result = search.update_term_info(&mut orig_info, &new_info);
    assert_eq!(result, expected_result);
    // The original value is modified in place and matches the return value.
    assert_eq!(orig_info, result);

    // A document id present on both sides gets the union of its positions.
    let mut orig_info = json!({
        "doc1": ["1", "2"]
    });
    let new_info = json!({
        "doc1": ["2", "3"]
    });
    let result = search.update_term_info(&mut orig_info, &new_info);
    // Compare sorted so the check does not depend on the merge order.
    let mut positions: Vec<String> = result["doc1"]
        .as_array()
        .unwrap()
        .iter()
        .map(|v| v.as_str().unwrap().to_string())
        .collect();
    positions.sort();
    assert_eq!(positions, vec!["1", "2", "3"]);
    assert_eq!(result.as_object().unwrap().len(), 1);
}
/// Smoke test for `save_segment`: each write mode must report success.
#[test]
fn test_save_segment() {
    let engine = Rustysearch::new("/tmp/rustysearch_save_segment");
    engine.setup();

    let term = "rust";
    let initial_info = json!({"doc1": ["1", "5"], "doc2": ["2", "6"]});
    let extra_info = json!({"doc1": ["1", "5", "10"], "doc3": ["3", "7"]});

    // First write of the term, with update = false.
    let saved = engine.save_segment(term, &initial_info, false);
    assert!(saved);

    // Same term again, this time with update = true.
    let updated = engine.save_segment(term, &extra_info, true);
    assert!(updated);

    // Back to the initial info with update = false.
    let overwritten = engine.save_segment(term, &initial_info, false);
    assert!(overwritten);
}
// Helper function to clean up the stats file
fn clean_stats(tmp_path: &str) {
let search = Rustysearch::new(tmp_path);