diff --git a/src/analyze/mod.rs b/src/analyze/mod.rs
deleted file mode 100644
index 5768976..0000000
--- a/src/analyze/mod.rs
+++ /dev/null
@@ -1 +0,0 @@
-pub mod tokenizer;
\ No newline at end of file
diff --git a/src/analyze/tokenizer.rs b/src/analyze/tokenizer.rs
deleted file mode 100644
index a4aa7a6..0000000
--- a/src/analyze/tokenizer.rs
+++ /dev/null
@@ -1,127 +0,0 @@
-use std::collections::HashSet;
-
-use regex::Regex;
-use unicode_segmentation::UnicodeSegmentation;
-
-pub struct Tokenizer {
-    text: String,
-    stopwords: HashSet<String>,
-    punctuation: HashSet<String>,
-}
-
-impl Tokenizer {
-    pub fn new(text: &str, stopwords: Vec<String>, punctuation: Option<Vec<String>>) -> Self {
-        Self {
-            text: text.to_owned(),
-            stopwords: stopwords
-                .iter()
-                .map(|s| s.to_owned())
-                .collect::<HashSet<String>>(),
-            punctuation: punctuation
-                .unwrap_or(
-                    vec![
-                        "!", "\"", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", ";", ".", "/",
-                        ":", ",", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|",
-                        "}", "~", "-",
-                    ]
-                    .iter()
-                    .map(|s| s.to_string())
-                    .collect::<Vec<String>>(),
-                )
-                .iter()
-                .map(|s| s.to_string())
-                .collect::<HashSet<String>>(),
-        }
-    }
-
-    // Split text into words
-    pub fn split_into_words(&self) -> Vec<String> {
-        self.text
-            .split_word_bounds()
-            .filter_map(|w| {
-                process_word(
-                    w,
-                    &get_special_char_regex(),
-                    &self.stopwords,
-                    &self.punctuation,
-                )
-            })
-            .collect::<Vec<String>>()
-    }
-
-    pub fn split_into_sentences(&self) -> Vec<String> {
-        let special_char_regex = get_special_char_regex();
-        get_sentence_space_regex()
-            .replace_all(&self.text, ".")
-            .unicode_sentences()
-            .map(|s| {
-                s.split_word_bounds()
-                    .filter_map(|w| {
-                        process_word(w, &special_char_regex, &self.stopwords, &self.punctuation)
-                    })
-                    .collect::<Vec<String>>()
-                    .join(" ")
-            })
-            .collect::<Vec<String>>()
-    }
-
-    pub fn split_into_paragraphs(&self) -> Vec<String> {
-        get_newline_regex()
-            .split(&self.text)
-            .filter_map(|s| {
-                if s.trim().is_empty() {
-                    return None;
-                }
-
-                Some(
-                    s.unicode_sentences()
-                        .map(|s| {
-                            s.split_word_bounds()
-                                .filter_map(|w| {
-                                    process_word(
-                                        w,
-                                        &get_special_char_regex(),
-                                        &self.stopwords,
-                                        &self.punctuation,
-                                    )
-                                })
-                                .collect::<Vec<String>>()
-                                .join(" ")
-                        })
-                        .collect::<Vec<String>>()
-                        .join(" "),
-                )
-            })
-            .collect::<Vec<String>>()
-    }
-}
-
-fn process_word(
-    w: &str,
-    special_char_regex: &Regex,
-    stopwords: &HashSet<String>,
-    punctuation: &HashSet<String>,
-) -> Option<String> {
-    let word = special_char_regex.replace_all(w.trim(), "").to_lowercase();
-
-    if word.is_empty()
-        || (word.graphemes(true).count() == 1) && punctuation.contains(&word)
-        || stopwords.contains(&word)
-    {
-        return None;
-    }
-
-    Some(word)
-}
-
-fn get_special_char_regex() -> Regex {
-    Regex::new(r"('s|,|\.)").unwrap()
-}
-
-fn get_sentence_space_regex() -> Regex {
-    Regex::new(r"^([\.!?])[\n\t\r]").unwrap()
-}
-
-fn get_newline_regex() -> Regex {
-    Regex::new(r"(\r|\n|\r\n)").unwrap()
-}
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
index 9fd09e8..e092c24 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1 @@
-pub mod types;
-pub mod search;
-pub mod analyze;
\ No newline at end of file
+pub mod search;
\ No newline at end of file
diff --git a/src/main.rs b/src/main.rs
index 2e1b391..f3dfbc0 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,7 +1,13 @@
-use rustysearch::search::Rustysearch;
+use rustysearch::search::engine::SearchEngine;
 
 fn main() {
-    println!("Hello, world!");
-    let search = Rustysearch::new("/tmp/rustysearch");
-    search.setup();
+    let mut engine = SearchEngine::new(1.5, 0.75);
+    engine.index("https://www.rust-lang.org/", "Rust is a systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.");
+    engine.index("https://en.wikipedia.org/wiki/Rust_(programming_language)", "Rust is a multi-paradigm system programming language focused on safety, especially safe concurrency.");
+
+    let query = "Rust programming language threads";
+    let results = engine.search(query);
+    for (url, score) in results {
+        println!("{}: {}", url, score);
+    }
 }
\ No newline at end of file
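Note on the new `main.rs`: as `src/search/engine.rs` (added below) defines it, `SearchEngine::search` returns a `HashMap<String, f64>`, so the loop above prints results in arbitrary order. A minimal sketch of how a caller could rank them best-first instead; `print_ranked` is a hypothetical helper built only on the API this patch adds, not part of the patch itself:

```rust
use rustysearch::search::engine::SearchEngine;

// Hypothetical helper: collect the scores returned by `search` and print them best-first.
fn print_ranked(engine: &mut SearchEngine, query: &str) {
    let mut ranked: Vec<(String, f64)> = engine.search(query).into_iter().collect();
    // Sort by score, descending; BM25 scores are finite, so the fallback ordering never triggers.
    ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
    for (url, score) in ranked {
        println!("{}: {:.3}", url, score);
    }
}
```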
diff --git a/src/search.rs b/src/search.rs
deleted file mode 100644
index 23092cf..0000000
--- a/src/search.rs
+++ /dev/null
@@ -1,310 +0,0 @@
-use std::{
-    cmp::min,
-    collections::{HashMap, HashSet},
-    fs,
-    io::{Read, Write},
-    path::Path,
-};
-use tempfile::NamedTempFile;
-
-use serde_json::{json, Value};
-
-use crate::{analyze::tokenizer::Tokenizer, types::Stats};
-
-pub struct Rustysearch {
-    base_directory: String,
-    index_path: String,
-    docs_path: String,
-    stats_path: String,
-}
-
-impl Rustysearch {
-    /// **Sets up the object & the data directory**
-    ///
-    /// Requires a ``base_directory`` parameter, which specifies the parent
-    /// directory the index/document/stats data will be kept in.
-    ///
-    pub fn new(path: &str) -> Self {
-        Self {
-            base_directory: path.to_string(),
-            index_path: format!("{}/index", path),
-            docs_path: format!("{}/docs", path),
-            stats_path: format!("{}/stats.json", path),
-        }
-    }
-
-    /// **Handles the creation of the various data directories**
-    ///
-    /// If the paths do not exist, it will create them. As a side effect, you
-    /// must have read/write access to the location you're trying to create
-    /// the data at.
-    ///
-    pub fn setup(&self) {
-        // Create the base directory
-        if !Path::new(&self.base_directory).exists() {
-            fs::create_dir(&self.base_directory).expect("Unable to create base directory");
-        }
-        // Create the index directory
-        if !Path::new(&self.index_path).exists() {
-            fs::create_dir(&self.index_path).expect("Unable to create index directory");
-        }
-        // Create the docs directory
-        if !Path::new(&self.docs_path).exists() {
-            fs::create_dir(&self.docs_path).expect("Unable to create docs directory");
-        }
-    }
-
-    /// **Reads the index-wide stats**
-    ///
-    /// If the stats do not exist, it makes returns data with the current
-    /// version of ``rustysearch`` & zero docs (used in scoring).
-    ///
-    pub fn read_stats(&self) -> std::io::Result<Stats> {
-        let stats: Stats;
-
-        if !Path::new(&self.stats_path).exists() {
-            stats = Stats {
-                version: String::from("0.1.0"),
-                total_docs: 0,
-            };
-        } else {
-            // Read the stats file
-            let stats_json = fs::read_to_string(&self.stats_path).expect("Unable to read stats");
-            stats = serde_json::from_str(&stats_json).unwrap();
-        }
-
-        Ok(stats)
-    }
-
-    /// **Writes the index-wide stats**
-    ///
-    /// Takes a ``new_stats`` parameter, which should be a dictionary of
-    /// stat data. Example stat data::
-    ///
-    ///     {
-    ///         'version': '1.0.0',
-    ///         'total_docs': 25,
-    ///     }
-    ///
-    pub fn write_stats(&self, new_stats: Stats) -> std::io::Result<()> {
-        // Write new_stats as json to stats_path
-        let new_stats_json = serde_json::to_string(&new_stats).unwrap();
-        fs::write(&self.stats_path, new_stats_json)?;
-        Ok(())
-    }
-
-    /// **Increments the total number of documents the index is aware of**
-    ///
-    /// This is important for scoring reasons & is typically called as part
-    /// of the indexing process.
-    ///
-    pub fn increment_total_docs(&self) {
-        let mut current_stats = self.read_stats().unwrap();
-        current_stats.total_docs += 1;
-        self.write_stats(current_stats).unwrap();
-    }
-
-    /// Returns the total number of documents the index is aware of
-    ///
-    pub fn get_total_docs(&self) -> i32 {
-        let stats = self.read_stats().unwrap();
-        return stats.total_docs;
-    }
-
-    /// Given a string (``blob``) of text, this will return a Vector of tokens.
-    ///
-    pub fn make_tokens(&self, blob: &str) -> Vec<String> {
-        let tokenizer = Tokenizer::new(blob, vec![], None);
-        let tokens = tokenizer.split_into_words();
-        return tokens;
-    }
-
-    /// **Converts a iterable of ``tokens`` into n-grams**
-    ///
-    /// This assumes front grams (all grams made starting from the left side
-    /// of the token).
-    ///
-    /// Optionally accepts a ``min_gram`` parameter, which takes an integer &
-    /// controls the minimum gram length. Default is ``3``.
-    ///
-    /// Optionally accepts a ``max_gram`` parameter, which takes an integer &
-    /// controls the maximum gram length. Default is ``6``.
-    ///
-    pub fn make_ngrams(
-        &self,
-        tokens: Vec<String>,
-        min_gram: usize,
-        max_gram: usize,
-    ) -> HashMap<String, Vec<usize>> {
-        let mut terms: HashMap<String, Vec<usize>> = HashMap::new();
-
-        for (position, token) in tokens.iter().enumerate() {
-            for window_length in min_gram..min(max_gram + 1, token.len() + 1) {
-                // Assuming "front" grams.
-                let gram = &token[..window_length];
-                terms
-                    .entry(gram.to_string())
-                    .or_insert(Vec::new())
-                    .push(position);
-            }
-        }
-
-        return terms;
-    }
-
-    /// Given a ``term``, hashes it & returns a string of the first N letters
-    ///
-    /// Optionally accepts a ``length`` parameter, which takes an integer &
-    /// controls how much of the hash is returned. Default is ``6``.
-    ///
-    /// This is usefully when writing files to the file system, as it helps
-    /// us keep from putting too many files in a given directory (~32K max
-    /// with the default).
-    ///
-    pub fn hash_name(&self, term: &str, length: usize) -> String {
-        // Make sure it's ASCII.
-        let term = term.to_ascii_lowercase();
-
-        // We hash & slice the term to get a small-ish number of fields
-        // and good distribution between them.
-        let hash = md5::compute(&term);
-        let hashed = format!("{:x}", hash);
-
-        // Cut string after length characters
-        let hashed = &hashed[..length];
-
-        return hashed.to_string();
-    }
-
-    /// Given a ``term``, creates a segment filename based on the hash of the term.
-    ///
-    /// Returns the full path to the segment.
-    ///
-    pub fn make_segment_name(&self, term: &str) -> String {
-        let term = &self.hash_name(term, 6);
-
-        let index_file_name = format!("{}.index", term);
-        let segment_path = Path::new(&self.index_path).join(index_file_name);
-        let segment_path = segment_path.to_str().unwrap().to_string();
-
-        fs::write(&segment_path, "").expect("Unable to create segment file");
-
-        return segment_path;
-    }
-
-    /// Given a ``line`` from the segment file, this returns the term & its info.
-    ///
-    /// The term info is stored as serialized JSON. The default separator
-    /// between the term & info is the ``\t`` character, which would never
-    /// appear in a term due to the way tokenization is done.
-    ///
-    pub fn parse_record(&self, line: &str) -> (String, String) {
-        let mut parts = line.trim().split("\t");
-        let term = parts.next().unwrap().to_string();
-        let info = parts.next().unwrap().to_string();
-        (term, info)
-    }
-
-    /// Given a ``term`` and a dict of ``term_info``, creates a line for
-    /// writing to the segment file.
-    ///
-    pub fn make_record(&self, term: &str, term_info: &Value) -> String {
-        format!("{}\t{}\n", term, json!(term_info).to_string())
-    }
-
-    /// Takes existing ``orig_info`` & ``new_info`` dicts & combines them
-    /// intelligently.
-    ///
-    /// Used for updating term_info within the segments.
-    ///
-    pub fn update_term_info(&self, orig_info: &mut Value, new_info: &Value) -> Value {
-        for (doc_id, positions) in new_info.as_object().unwrap().iter() {
-            if !orig_info.as_object().unwrap().contains_key(doc_id) {
-                orig_info[doc_id] = positions.clone();
-            } else {
-                let mut orig_positions: HashSet<_> = orig_info[doc_id]
-                    .as_array()
-                    .unwrap()
-                    .iter()
-                    .map(|v| v.as_str().unwrap().to_string())
-                    .collect();
-                let new_positions: HashSet<_> = positions
-                    .as_array()
-                    .unwrap()
-                    .iter()
-                    .map(|v| v.as_str().unwrap().to_string())
-                    .collect();
-
-                orig_positions.extend(new_positions);
-
-                orig_info[doc_id] = Value::Array(
-                    orig_positions
-                        .iter()
-                        .map(|v| Value::String(v.clone()))
-                        .collect(),
-                );
-            }
-        }
-
-        return orig_info.to_owned();
-    }
-
-    /// Writes out new index data to disk.
-    ///
-    /// Takes a ``term`` string & ``term_info`` dict. It will
-    /// rewrite the segment in alphabetical order, adding in the data
-    /// where appropriate.
-    ///
-    /// Optionally takes an ``update`` parameter, which is a boolean &
-    /// determines whether the provided ``term_info`` should overwrite or
-    /// update the data in the segment. Default is ``False`` (overwrite).
-    ///
-    pub fn save_segment(&self, term: &str, term_info: &Value, update: bool) -> bool {
-        let seg_name = &self.make_segment_name(term);
-        let mut new_seg_file = NamedTempFile::new().unwrap();
-        let mut written = false;
-
-        if !Path::new(&seg_name).exists() {
-            fs::write(&seg_name, "").unwrap();
-        }
-
-        let mut seg_file = fs::OpenOptions::new().read(true).open(&seg_name).unwrap();
-        let mut buf = String::new();
-        seg_file.read_to_string(&mut buf).unwrap();
-
-        for line in buf.lines() {
-            let (seg_term, seg_term_info) = self.parse_record(line);
-
-            if !written && seg_term > term.to_string() {
-                let new_line = self.make_record(term, term_info);
-                new_seg_file.write_all(new_line.as_bytes()).unwrap();
-                written = true;
-            } else if seg_term == term {
-                if update {
-                    let new_info = self.update_term_info(&mut json!(seg_term_info), term_info);
-                    let new_line = self.make_record(term, &new_info);
-                    new_seg_file.write_all(new_line.as_bytes()).unwrap();
-                } else {
-                    let line = self.make_record(term, term_info);
-                    new_seg_file.write_all(line.as_bytes()).unwrap();
-                }
-
-                written = true;
-            }
-
-            new_seg_file.write_all(line.as_bytes()).unwrap();
-        }
-
-        if !written {
-            let line = self.make_record(term, term_info);
-            new_seg_file.write_all(line.as_bytes()).unwrap();
-        }
-
-        fs::rename(&new_seg_file.path(), &seg_name).unwrap();
-
-        new_seg_file.flush().unwrap();
-        // new_seg_file.close().unwrap();
-
-        return true;
-    }
-}
diff --git a/src/search/engine.rs b/src/search/engine.rs
new file mode 100644
index 0000000..097e3b3
--- /dev/null
+++ b/src/search/engine.rs
@@ -0,0 +1,86 @@
+use std::collections::HashMap;
+use std::f64;
+
+pub fn update_url_scores(old: &mut HashMap<String, f64>, new: &HashMap<String, f64>) {
+    for (url, score) in new {
+        old.entry(url.to_string()).and_modify(|e| *e += score).or_insert(*score);
+    }
+}
+
+pub fn normalize_string(input_string: &str) -> String {
+    let string_without_punc: String = input_string.chars().filter(|&c| !c.is_ascii_punctuation()).collect();
+    let string_without_double_spaces: String = string_without_punc.split_whitespace().collect::<Vec<&str>>().join(" ");
+    string_without_double_spaces.to_lowercase()
+}
+
+pub struct SearchEngine {
+    index: HashMap<String, HashMap<String, usize>>,
+    documents: HashMap<String, String>,
+    k1: f64,
+    b: f64,
+}
+
+impl SearchEngine {
+    pub fn new(k1: f64, b: f64) -> SearchEngine {
+        SearchEngine {
+            index: HashMap::new(),
+            documents: HashMap::new(),
+            k1,
+            b,
+        }
+    }
+
+    pub fn posts(&self) -> Vec<String> {
+        self.documents.keys().cloned().collect()
+    }
+
+    pub fn number_of_documents(&self) -> usize {
+        self.documents.len()
+    }
+
+    pub fn avdl(&self) -> f64 {
+        let total_length: usize = self.documents.values().map(|d| d.len()).sum();
+        total_length as f64 / self.documents.len() as f64
+    }
+
+    pub fn idf(&self, kw: &str) -> f64 {
+        let n = self.number_of_documents() as f64;
+        let n_kw = self.get_urls(kw).len() as f64;
+        ((n - n_kw + 0.5) / (n_kw + 0.5) + 1.0).ln()
+    }
+
+    pub fn bm25(&self, kw: &str) -> HashMap<String, f64> {
+        let mut result = HashMap::new();
+        let idf_score = self.idf(kw);
+        let avdl = self.avdl();
+        for (url, freq) in self.get_urls(kw) {
+            let numerator = freq as f64 * (self.k1 + 1.0);
+            let denominator = freq as f64 + self.k1 * (1.0 - self.b + self.b * self.documents.get(&url).unwrap().len() as f64 / avdl);
+            result.insert(url.to_string(), idf_score * numerator / denominator);
+        }
+        result
+    }
+
+    pub fn search(&mut self, query: &str) -> HashMap<String, f64> {
+        let keywords = normalize_string(query).split_whitespace().map(|s| s.to_string()).collect::<Vec<String>>();
+        let mut url_scores: HashMap<String, f64> = HashMap::new();
+        for kw in keywords {
+            let kw_urls_score = self.bm25(&kw);
+            update_url_scores(&mut url_scores, &kw_urls_score);
+        }
+        url_scores
+    }
+
+    pub fn index(&mut self, url: &str, content: &str) {
+        self.documents.insert(url.to_string(), content.to_string());
+        let words = normalize_string(content).split_whitespace().map(|s| s.to_string()).collect::<Vec<String>>();
+        for word in words {
+            *self.index.entry(word).or_insert(HashMap::new()).entry(url.to_string()).or_insert(0) += 1;
+        }
+    }
+
+    pub fn get_urls(&self, keyword: &str) -> HashMap<String, usize> {
+        let keyword = normalize_string(keyword);
+        self.index.get(&keyword).cloned().unwrap_or(HashMap::new())
+    }
+}
\ No newline at end of file
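For reference, `idf` and `bm25` above compute the standard Okapi BM25 weighting: score(q, d) = sum over query terms t of IDF(t) * f(t, d) * (k1 + 1) / (f(t, d) + k1 * (1 - b + b * |d| / avgdl)), where f(t, d) is the per-document term count kept in `index`, |d| is the stored document length in bytes (`String::len`), and avgdl comes from `avdl`; k1 (1.5 in `main.rs`) controls term-frequency saturation and b (0.75) controls length normalization. A quick hand-check of the IDF factor as implemented, included purely as an illustration and not part of the patch:

```rust
fn main() {
    // Two indexed documents (n = 2) and a keyword that occurs in exactly one of them (n_kw = 1):
    // idf = ln((n - n_kw + 0.5) / (n_kw + 0.5) + 1) = ln(1.5 / 1.5 + 1) = ln 2 ≈ 0.693.
    let (n, n_kw): (f64, f64) = (2.0, 1.0);
    let idf = ((n - n_kw + 0.5) / (n_kw + 0.5) + 1.0).ln();
    assert!((idf - 2.0_f64.ln()).abs() < 1e-12);
    println!("idf for a term found in 1 of 2 documents: {:.4}", idf);
}
```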
diff --git a/src/search/mod.rs b/src/search/mod.rs
new file mode 100644
index 0000000..0d9bc4a
--- /dev/null
+++ b/src/search/mod.rs
@@ -0,0 +1 @@
+pub mod engine;
\ No newline at end of file
diff --git a/src/types.rs b/src/types.rs
deleted file mode 100644
index 610ac5d..0000000
--- a/src/types.rs
+++ /dev/null
@@ -1,7 +0,0 @@
-use serde::{Deserialize, Serialize};
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct Stats{
-    pub version: String,
-    pub total_docs: i32,
-}
\ No newline at end of file
diff --git a/tests/rustysearch.rs b/tests/rustysearch.rs
deleted file mode 100644
index 6b97370..0000000
--- a/tests/rustysearch.rs
+++ /dev/null
@@ -1,190 +0,0 @@
-#[cfg(test)]
-mod tests {
-    use rustysearch::{search::Rustysearch, types::Stats};
-    use serde_json::json;
-
-    #[test]
-    fn test_write_new_stats() {
-        let stats = Stats {
-            version: String::from("0.1.0"),
-            total_docs: 0,
-        };
-
-        assert_eq!(stats.version, "0.1.0");
-        assert_eq!(stats.total_docs, 0);
-
-        let tmp_path = "/tmp/rustysearch_writenewstats";
-        let search = Rustysearch::new(&tmp_path);
-        search.setup();
-
-        search.write_stats(stats).unwrap();
-    }
-
-    #[test]
-    fn test_read_stats() {
-        let tmp_path = "/tmp/rustysearch_readstats";
-        let search = Rustysearch::new(&tmp_path);
-        search.setup();
-
-        clean_stats(tmp_path);
-
-        let stats = search.read_stats().unwrap();
-        assert_eq!(stats.version, "0.1.0");
-        assert_eq!(stats.total_docs, 0);
-    }
-
-    #[test]
-    fn test_increment_total_docs() {
-        let tmp_path = "/tmp/rustysearch_incrementtotaldocs";
-        let search = Rustysearch::new(&tmp_path);
-        search.setup();
-
-        clean_stats(tmp_path);
-
-        let stats = search.read_stats().unwrap();
-        assert_eq!(stats.total_docs, 0);
-
-        search.increment_total_docs();
-        let stats = search.read_stats().unwrap();
-        assert_eq!(stats.total_docs, 1);
-    }
-
-    #[test]
-    fn test_get_total_docs() {
-        let tmp_path = "/tmp/rustysearch_gettotaldocs";
-        let search = Rustysearch::new(&tmp_path);
-        search.setup();
-
-        clean_stats(tmp_path);
-
-        let stats = search.read_stats().unwrap();
-        assert_eq!(stats.total_docs, 0);
-
-        search.increment_total_docs();
-        let stats = search.read_stats().unwrap();
-        assert_eq!(stats.total_docs, 1);
-
-        let total_docs = search.get_total_docs();
-        assert_eq!(total_docs, 1);
-    }
-
-    #[test]
-    fn test_make_ngrams() {
-        let search = Rustysearch::new("/tmp/rustysearch_makengrams");
-        search.setup();
-
-        let tokens = vec!["hello".to_string(), "world".to_string()];
-        let terms = search.make_ngrams(tokens, 3, 6);
-
-        assert_eq!(terms["hel"].len(), 1);
-    }
-
-    #[test]
-    fn test_hash_name() {
-        let search = Rustysearch::new("/tmp/rustysearch_hashname");
-        search.setup();
-
-        let hash = search.hash_name("hello", 6);
-        assert_eq!(hash, "5d4140");
-    }
-
-    #[test]
-    fn test_make_segment_name() {
-        let search = Rustysearch::new("/tmp/rustysearch_makesegmentname");
-        search.setup();
-
-        let segment_name = search.make_segment_name("hello");
-        assert_eq!(
-            segment_name,
-            "/tmp/rustysearch_makesegmentname/index/5d4140.index"
-        );
-    }
-
-    #[test]
-    fn test_parse_record() {
-        let search = Rustysearch::new("/tmp/rustysearch_parserecord");
-        search.setup();
-
-        let line = "my_term\t{\"frequency\": 100}";
-        let (term, info) = search.parse_record(line);
-
-        assert_eq!(term, "my_term");
-        assert_eq!(info, "{\"frequency\": 100}");
-    }
-
-    #[test]
-    fn test_make_tokens() {
-        let search = Rustysearch::new("/tmp/rustysearch");
-        let tokens = search.make_tokens("Hello, world!");
-        assert_eq!(tokens, vec!["hello", "world"]);
-    }
-
-    #[test]
-    fn test_make_record() {
-        let search = Rustysearch::new("/tmp/rustysearch");
-        let term = "hello world";
-        let term_info = json!({
-            "frequency": 100,
-            "idf": 1.5,
-        });
-
-        let record = search.make_record(term, &term_info);
-        assert_eq!(record, "hello world\t{\"frequency\":100,\"idf\":1.5}\n");
-    }
-
-    #[test]
-    fn test_update_term_info() {
-        let mut orig_info = json!({
-            "doc1": ["1", "2"],
-            "doc2": ["3", "4"]
-        });
-
-        let new_info = json!({
-            "doc3": ["1", "2"]
-        });
-
-        let expected_result = json!({
-            "doc1": ["1", "2"],
-            "doc2": ["3", "4"],
-            "doc3": ["1", "2"]
-        });
-        let search = Rustysearch::new("/tmp/rustysearch");
-        let result = search.update_term_info(&mut orig_info, &new_info);
-
-        assert_eq!(result, expected_result);
-    }
-
-    #[test]
-    fn test_save_segment() {
-        let search = Rustysearch::new("/tmp/rustysearch_save_segment");
-        search.setup();
-
-        let term = "rust";
-        let term_info = json!({"doc1": ["1", "5"], "doc2": ["2", "6"]});
-
-        // Test saving a new segment
-        let result = search.save_segment(term, &term_info, false);
-        assert_eq!(result, true);
-
-        // Test updating an existing segment
-        let new_term_info = json!({"doc1": ["1", "5", "10"], "doc3": ["3", "7"]});
-        let result = search.save_segment(term, &new_term_info, true);
-        assert_eq!(result, true);
-
-        // Test overwriting an existing segment
-        let result = search.save_segment(term, &term_info, false);
-        assert_eq!(result, true);
-    }
-
-    // Helper function to clean up the stats file
-    fn clean_stats(tmp_path: &str) {
-        let search = Rustysearch::new(tmp_path);
-        search.setup();
-
-        let new_stats = Stats {
-            version: String::from("0.1.0"),
-            total_docs: 0,
-        };
-        search.write_stats(new_stats).unwrap();
-    }
-}
diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs
deleted file mode 100644
index af0053e..0000000
--- a/tests/tokenizer.rs
+++ /dev/null
@@ -1,53 +0,0 @@
-#[cfg(test)]
-mod tests {
-    use rustysearch::analyze::tokenizer::Tokenizer;
-
-    #[test]
-    fn test_split_into_words() {
-        let text = "The quick brown fox jumps over the lazy dog.";
-        let stopwords = vec!["the".to_string(), "over".to_string()];
-        let tokenizer = Tokenizer::new(text, stopwords, None);
-        let words = tokenizer.split_into_words();
-        assert_eq!(
-            words,
-            vec![
-                "quick".to_string(),
-                "brown".to_string(),
-                "fox".to_string(),
-                "jumps".to_string(),
-                "lazy".to_string(),
-                "dog".to_string(),
-            ]
-        );
-    }
-
-    #[test]
-    fn test_split_into_sentences() {
-        let text = "The quick brown fox jumps over the lazy dog. The end.";
-        let stopwords = vec!["the".to_string(), "over".to_string()];
-        let tokenizer = Tokenizer::new(text, stopwords, None);
-        let sentences = tokenizer.split_into_sentences();
-        assert_eq!(
-            sentences,
-            vec![
-                "quick brown fox jumps lazy dog".to_string(),
-                "end".to_string(),
-            ]
-        );
-    }
-
-    #[test]
-    fn test_split_into_paragraphs() {
-        let text = "The quick brown fox jumps over the lazy dog.\n\nThe end.";
-        let stopwords = vec!["the".to_string(), "over".to_string()];
-        let tokenizer = Tokenizer::new(text, stopwords, None);
-        let paragraphs = tokenizer.split_into_paragraphs();
-        assert_eq!(
-            paragraphs,
-            vec![
-                "quick brown fox jumps lazy dog".to_string(),
-                "end".to_string(),
-            ]
-        );
-    }
-}
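The patch removes `tests/rustysearch.rs` and `tests/tokenizer.rs` without adding coverage for the new engine. A minimal sketch of what replacement coverage could look like, assuming the crate is still named `rustysearch` and using only the public items added in `src/search/engine.rs`; the file name `tests/engine.rs` and the example documents are suggestions, not part of the patch:

```rust
// tests/engine.rs (suggested location)
use rustysearch::search::engine::{normalize_string, SearchEngine};

#[test]
fn normalize_string_strips_punctuation_and_lowercases() {
    assert_eq!(normalize_string("Hello, World!"), "hello world");
}

#[test]
fn indexed_document_is_scored_for_its_keywords() {
    let mut engine = SearchEngine::new(1.5, 0.75);
    engine.index("https://doc.rust-lang.org/book/", "The Rust Programming Language book");
    engine.index("https://www.python.org/", "Python is a programming language");

    let scores = engine.search("Rust");
    // Only the first document contains "rust", so it alone receives a positive BM25 score.
    assert!(scores["https://doc.rust-lang.org/book/"] > 0.0);
    assert!(!scores.contains_key("https://www.python.org/"));
}
```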