feat: Add Update and Save Segment functions
This commit is contained in:
parent
718e57355b
commit
f070b45fac
148
Cargo.lock
generated
148
Cargo.lock
generated
@ -11,12 +11,58 @@ dependencies = [
|
|||||||
"memchr",
|
"memchr",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bitflags"
|
||||||
|
version = "1.3.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bitflags"
|
||||||
|
version = "2.4.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cfg-if"
|
||||||
|
version = "1.0.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "errno"
|
||||||
|
version = "0.3.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"windows-sys",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "fastrand"
|
||||||
|
version = "2.0.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "itoa"
|
name = "itoa"
|
||||||
version = "1.0.9"
|
version = "1.0.9"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38"
|
checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "libc"
|
||||||
|
version = "0.2.149"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "linux-raw-sys"
|
||||||
|
version = "0.4.10"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "md5"
|
name = "md5"
|
||||||
version = "0.7.0"
|
version = "0.7.0"
|
||||||
@ -47,6 +93,15 @@ dependencies = [
|
|||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "redox_syscall"
|
||||||
|
version = "0.3.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29"
|
||||||
|
dependencies = [
|
||||||
|
"bitflags 1.3.2",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "regex"
|
name = "regex"
|
||||||
version = "1.10.2"
|
version = "1.10.2"
|
||||||
@ -76,6 +131,19 @@ version = "0.8.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
|
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rustix"
|
||||||
|
version = "0.38.20"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "67ce50cb2e16c2903e30d1cbccfd8387a74b9d4c938b6a4c5ec6cc7556f7a8a0"
|
||||||
|
dependencies = [
|
||||||
|
"bitflags 2.4.1",
|
||||||
|
"errno",
|
||||||
|
"libc",
|
||||||
|
"linux-raw-sys",
|
||||||
|
"windows-sys",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustysearch"
|
name = "rustysearch"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
@ -84,6 +152,7 @@ dependencies = [
|
|||||||
"regex",
|
"regex",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
"tempfile",
|
||||||
"unicode-segmentation",
|
"unicode-segmentation",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -135,6 +204,19 @@ dependencies = [
|
|||||||
"unicode-ident",
|
"unicode-ident",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tempfile"
|
||||||
|
version = "3.8.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"fastrand",
|
||||||
|
"redox_syscall",
|
||||||
|
"rustix",
|
||||||
|
"windows-sys",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "unicode-ident"
|
name = "unicode-ident"
|
||||||
version = "1.0.12"
|
version = "1.0.12"
|
||||||
@ -146,3 +228,69 @@ name = "unicode-segmentation"
|
|||||||
version = "1.10.1"
|
version = "1.10.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
|
checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-sys"
|
||||||
|
version = "0.48.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
|
||||||
|
dependencies = [
|
||||||
|
"windows-targets",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-targets"
|
||||||
|
version = "0.48.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
|
||||||
|
dependencies = [
|
||||||
|
"windows_aarch64_gnullvm",
|
||||||
|
"windows_aarch64_msvc",
|
||||||
|
"windows_i686_gnu",
|
||||||
|
"windows_i686_msvc",
|
||||||
|
"windows_x86_64_gnu",
|
||||||
|
"windows_x86_64_gnullvm",
|
||||||
|
"windows_x86_64_msvc",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_aarch64_gnullvm"
|
||||||
|
version = "0.48.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_aarch64_msvc"
|
||||||
|
version = "0.48.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_i686_gnu"
|
||||||
|
version = "0.48.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_i686_msvc"
|
||||||
|
version = "0.48.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_x86_64_gnu"
|
||||||
|
version = "0.48.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_x86_64_gnullvm"
|
||||||
|
version = "0.48.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_x86_64_msvc"
|
||||||
|
version = "0.48.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
|
||||||
|
@ -14,3 +14,4 @@ serde_json = "1.0.107"
|
|||||||
unicode-segmentation = "^1"
|
unicode-segmentation = "^1"
|
||||||
regex = "^1"
|
regex = "^1"
|
||||||
md5 = "0.7.0"
|
md5 = "0.7.0"
|
||||||
|
tempfile = "3.8.0"
|
||||||
|
107
src/search.rs
107
src/search.rs
@ -1,6 +1,13 @@
|
|||||||
use std::{cmp::min, collections::HashMap, fs, path::Path};
|
use std::{
|
||||||
|
cmp::min,
|
||||||
|
collections::{HashMap, HashSet},
|
||||||
|
fs,
|
||||||
|
io::{Read, Write},
|
||||||
|
path::Path,
|
||||||
|
};
|
||||||
|
use tempfile::NamedTempFile;
|
||||||
|
|
||||||
use serde_json::{Value, json};
|
use serde_json::{json, Value};
|
||||||
|
|
||||||
use crate::{analyze::tokenizer::Tokenizer, types::Stats};
|
use crate::{analyze::tokenizer::Tokenizer, types::Stats};
|
||||||
|
|
||||||
@ -204,4 +211,100 @@ impl Rustysearch {
|
|||||||
/// Serializes one index entry as a single segment-file line.
///
/// The returned line has the form `<term>\t<term_info as compact JSON>\n`.
pub fn make_record(&self, term: &str, term_info: &Value) -> String {
    // `Value`'s `Display` impl already renders compact JSON, so the value does
    // not need to be deep-copied through `json!` or turned into an
    // intermediate `String` first.
    format!("{}\t{}\n", term, term_info)
}
|
||||||
|
|
||||||
|
/// Takes existing ``orig_info`` & ``new_info`` dicts & combines them
|
||||||
|
/// intelligently.
|
||||||
|
///
|
||||||
|
/// Used for updating term_info within the segments.
|
||||||
|
///
|
||||||
|
pub fn update_term_info(&self, orig_info: &mut Value, new_info: &Value) -> Value {
|
||||||
|
for (doc_id, positions) in new_info.as_object().unwrap().iter() {
|
||||||
|
if !orig_info.as_object().unwrap().contains_key(doc_id) {
|
||||||
|
orig_info[doc_id] = positions.clone();
|
||||||
|
} else {
|
||||||
|
let mut orig_positions: HashSet<_> = orig_info[doc_id]
|
||||||
|
.as_array()
|
||||||
|
.unwrap()
|
||||||
|
.iter()
|
||||||
|
.map(|v| v.as_str().unwrap().to_string())
|
||||||
|
.collect();
|
||||||
|
let new_positions: HashSet<_> = positions
|
||||||
|
.as_array()
|
||||||
|
.unwrap()
|
||||||
|
.iter()
|
||||||
|
.map(|v| v.as_str().unwrap().to_string())
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
orig_positions.extend(new_positions);
|
||||||
|
|
||||||
|
orig_info[doc_id] = Value::Array(
|
||||||
|
orig_positions
|
||||||
|
.iter()
|
||||||
|
.map(|v| Value::String(v.clone()))
|
||||||
|
.collect(),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return orig_info.to_owned();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Writes out new index data to disk.
|
||||||
|
///
|
||||||
|
/// Takes a ``term`` string & ``term_info`` dict. It will
|
||||||
|
/// rewrite the segment in alphabetical order, adding in the data
|
||||||
|
/// where appropriate.
|
||||||
|
///
|
||||||
|
/// Optionally takes an ``update`` parameter, which is a boolean &
|
||||||
|
/// determines whether the provided ``term_info`` should overwrite or
|
||||||
|
/// update the data in the segment. Default is ``False`` (overwrite).
|
||||||
|
///
|
||||||
|
pub fn save_segment(&self, term: &str, term_info: &Value, update: bool) -> bool {
|
||||||
|
let seg_name = &self.make_segment_name(term);
|
||||||
|
let mut new_seg_file = NamedTempFile::new().unwrap();
|
||||||
|
let mut written = false;
|
||||||
|
|
||||||
|
if !Path::new(&seg_name).exists() {
|
||||||
|
fs::write(&seg_name, "").unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut seg_file = fs::OpenOptions::new().read(true).open(&seg_name).unwrap();
|
||||||
|
let mut buf = String::new();
|
||||||
|
seg_file.read_to_string(&mut buf).unwrap();
|
||||||
|
|
||||||
|
for line in buf.lines() {
|
||||||
|
let (seg_term, seg_term_info) = self.parse_record(line);
|
||||||
|
|
||||||
|
if !written && seg_term > term.to_string() {
|
||||||
|
let new_line = self.make_record(term, term_info);
|
||||||
|
new_seg_file.write_all(new_line.as_bytes()).unwrap();
|
||||||
|
written = true;
|
||||||
|
} else if seg_term == term {
|
||||||
|
if update {
|
||||||
|
let new_info = self.update_term_info(&mut json!(seg_term_info), term_info);
|
||||||
|
let new_line = self.make_record(term, &new_info);
|
||||||
|
new_seg_file.write_all(new_line.as_bytes()).unwrap();
|
||||||
|
} else {
|
||||||
|
let line = self.make_record(term, term_info);
|
||||||
|
new_seg_file.write_all(line.as_bytes()).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
written = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
new_seg_file.write_all(line.as_bytes()).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
if !written {
|
||||||
|
let line = self.make_record(term, term_info);
|
||||||
|
new_seg_file.write_all(line.as_bytes()).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
fs::rename(&new_seg_file.path(), &seg_name).unwrap();
|
||||||
|
|
||||||
|
new_seg_file.flush().unwrap();
|
||||||
|
// new_seg_file.close().unwrap();
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -132,6 +132,50 @@ mod tests {
|
|||||||
assert_eq!(record, "hello world\t{\"frequency\":100,\"idf\":1.5}\n");
|
assert_eq!(record, "hello world\t{\"frequency\":100,\"idf\":1.5}\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn test_update_term_info() {
    // Merging a document id that the original does not contain must add it
    // and leave the existing entries untouched.
    let search = Rustysearch::new("/tmp/rustysearch");

    let incoming = json!({ "doc3": ["1", "2"] });
    let mut existing = json!({
        "doc1": ["1", "2"],
        "doc2": ["3", "4"]
    });

    let merged = search.update_term_info(&mut existing, &incoming);

    assert_eq!(
        merged,
        json!({
            "doc1": ["1", "2"],
            "doc2": ["3", "4"],
            "doc3": ["1", "2"]
        })
    );
}
|
||||||
|
|
||||||
|
#[test]
fn test_save_segment() {
    let index = Rustysearch::new("/tmp/rustysearch_save_segment");
    index.setup();

    let word = "rust";
    let initial_info = json!({"doc1": ["1", "5"], "doc2": ["2", "6"]});
    let extra_info = json!({"doc1": ["1", "5", "10"], "doc3": ["3", "7"]});

    // Saving a term into a fresh segment succeeds.
    assert!(index.save_segment(word, &initial_info, false));

    // Merging additional data into the existing record succeeds.
    assert!(index.save_segment(word, &extra_info, true));

    // Overwriting the existing record succeeds.
    assert!(index.save_segment(word, &initial_info, false));
}
|
||||||
|
|
||||||
// Helper function to clean up the stats file
|
// Helper function to clean up the stats file
|
||||||
fn clean_stats(tmp_path: &str) {
|
fn clean_stats(tmp_path: &str) {
|
||||||
let search = Rustysearch::new(tmp_path);
|
let search = Rustysearch::new(tmp_path);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user