From f070b45fac7e869ecfaf1c5bf28f5fd22b9c287d Mon Sep 17 00:00:00 2001 From: Alex Wellnitz Date: Thu, 26 Oct 2023 19:20:59 +0200 Subject: [PATCH] feat: Add Update and Save Segment functions --- Cargo.lock | 148 +++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + src/search.rs | 111 ++++++++++++++++++++++++++++++-- tests/rustysearch.rs | 44 +++++++++++++ 4 files changed, 300 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 27bb045..e51b354 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,12 +11,58 @@ dependencies = [ "memchr", ] +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "errno" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" + [[package]] name = "itoa" version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" +[[package]] +name = "libc" +version = "0.2.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" + +[[package]] +name = "linux-raw-sys" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" + [[package]] name = "md5" version = "0.7.0" @@ -47,6 +93,15 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "redox_syscall" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "regex" version = "1.10.2" @@ -76,6 +131,19 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +[[package]] +name = "rustix" +version = "0.38.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67ce50cb2e16c2903e30d1cbccfd8387a74b9d4c938b6a4c5ec6cc7556f7a8a0" +dependencies = [ + "bitflags 2.4.1", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + [[package]] name = "rustysearch" version = "0.1.0" @@ -84,6 +152,7 @@ dependencies = [ "regex", "serde", "serde_json", + "tempfile", "unicode-segmentation", ] @@ -135,6 +204,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tempfile" +version = "3.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef" +dependencies = [ + "cfg-if", + "fastrand", + "redox_syscall", + "rustix", + "windows-sys", +] + [[package]] name = "unicode-ident" version = "1.0.12" @@ -146,3 +228,69 @@ name = "unicode-segmentation" version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" diff --git a/Cargo.toml b/Cargo.toml index 0ae5d84..996059a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,3 +14,4 @@ serde_json = "1.0.107" unicode-segmentation = "^1" regex = "^1" md5 = "0.7.0" +tempfile = "3.8.0" diff --git a/src/search.rs b/src/search.rs index caaad88..23092cf 100644 --- a/src/search.rs +++ b/src/search.rs @@ -1,6 +1,13 @@ -use std::{cmp::min, collections::HashMap, fs, path::Path}; +use std::{ + cmp::min, + collections::{HashMap, HashSet}, + fs, + io::{Read, Write}, + path::Path, +}; +use tempfile::NamedTempFile; -use serde_json::{Value, json}; +use serde_json::{json, Value}; use crate::{analyze::tokenizer::Tokenizer, types::Stats}; @@ -198,10 +205,106 @@ impl Rustysearch { (term, info) } - /// Given a ``term`` and a dict of ``term_info``, creates a line for + /// Given a ``term`` and a dict of ``term_info``, creates a line for /// writing to the segment file. - /// + /// pub fn make_record(&self, term: &str, term_info: &Value) -> String { format!("{}\t{}\n", term, json!(term_info).to_string()) } + + /// Takes existing ``orig_info`` & ``new_info`` dicts & combines them + /// intelligently. + /// + /// Used for updating term_info within the segments. + /// + pub fn update_term_info(&self, orig_info: &mut Value, new_info: &Value) -> Value { + for (doc_id, positions) in new_info.as_object().unwrap().iter() { + if !orig_info.as_object().unwrap().contains_key(doc_id) { + orig_info[doc_id] = positions.clone(); + } else { + let mut orig_positions: HashSet<_> = orig_info[doc_id] + .as_array() + .unwrap() + .iter() + .map(|v| v.as_str().unwrap().to_string()) + .collect(); + let new_positions: HashSet<_> = positions + .as_array() + .unwrap() + .iter() + .map(|v| v.as_str().unwrap().to_string()) + .collect(); + + orig_positions.extend(new_positions); + + orig_info[doc_id] = Value::Array( + orig_positions + .iter() + .map(|v| Value::String(v.clone())) + .collect(), + ); + } + } + + return orig_info.to_owned(); + } + + /// Writes out new index data to disk. + /// + /// Takes a ``term`` string & ``term_info`` dict. It will + /// rewrite the segment in alphabetical order, adding in the data + /// where appropriate. + /// + /// Optionally takes an ``update`` parameter, which is a boolean & + /// determines whether the provided ``term_info`` should overwrite or + /// update the data in the segment. Default is ``False`` (overwrite). + /// + pub fn save_segment(&self, term: &str, term_info: &Value, update: bool) -> bool { + let seg_name = &self.make_segment_name(term); + let mut new_seg_file = NamedTempFile::new().unwrap(); + let mut written = false; + + if !Path::new(&seg_name).exists() { + fs::write(&seg_name, "").unwrap(); + } + + let mut seg_file = fs::OpenOptions::new().read(true).open(&seg_name).unwrap(); + let mut buf = String::new(); + seg_file.read_to_string(&mut buf).unwrap(); + + for line in buf.lines() { + let (seg_term, seg_term_info) = self.parse_record(line); + + if !written && seg_term > term.to_string() { + let new_line = self.make_record(term, term_info); + new_seg_file.write_all(new_line.as_bytes()).unwrap(); + written = true; + } else if seg_term == term { + if update { + let new_info = self.update_term_info(&mut json!(seg_term_info), term_info); + let new_line = self.make_record(term, &new_info); + new_seg_file.write_all(new_line.as_bytes()).unwrap(); + } else { + let line = self.make_record(term, term_info); + new_seg_file.write_all(line.as_bytes()).unwrap(); + } + + written = true; + } + + new_seg_file.write_all(line.as_bytes()).unwrap(); + } + + if !written { + let line = self.make_record(term, term_info); + new_seg_file.write_all(line.as_bytes()).unwrap(); + } + + fs::rename(&new_seg_file.path(), &seg_name).unwrap(); + + new_seg_file.flush().unwrap(); + // new_seg_file.close().unwrap(); + + return true; + } } diff --git a/tests/rustysearch.rs b/tests/rustysearch.rs index bef568b..6b97370 100644 --- a/tests/rustysearch.rs +++ b/tests/rustysearch.rs @@ -132,6 +132,50 @@ mod tests { assert_eq!(record, "hello world\t{\"frequency\":100,\"idf\":1.5}\n"); } + #[test] + fn test_update_term_info() { + let mut orig_info = json!({ + "doc1": ["1", "2"], + "doc2": ["3", "4"] + }); + + let new_info = json!({ + "doc3": ["1", "2"] + }); + + let expected_result = json!({ + "doc1": ["1", "2"], + "doc2": ["3", "4"], + "doc3": ["1", "2"] + }); + let search = Rustysearch::new("/tmp/rustysearch"); + let result = search.update_term_info(&mut orig_info, &new_info); + + assert_eq!(result, expected_result); + } + + #[test] + fn test_save_segment() { + let search = Rustysearch::new("/tmp/rustysearch_save_segment"); + search.setup(); + + let term = "rust"; + let term_info = json!({"doc1": ["1", "5"], "doc2": ["2", "6"]}); + + // Test saving a new segment + let result = search.save_segment(term, &term_info, false); + assert_eq!(result, true); + + // Test updating an existing segment + let new_term_info = json!({"doc1": ["1", "5", "10"], "doc3": ["3", "7"]}); + let result = search.save_segment(term, &new_term_info, true); + assert_eq!(result, true); + + // Test overwriting an existing segment + let result = search.save_segment(term, &term_info, false); + assert_eq!(result, true); + } + // Helper function to clean up the stats file fn clean_stats(tmp_path: &str) { let search = Rustysearch::new(tmp_path);