From bcc8fb34611a17aa4cf1f4116d19046d37d7364b Mon Sep 17 00:00:00 2001
From: Alex Wellnitz
Date: Tue, 24 Oct 2023 23:17:09 +0200
Subject: [PATCH] feat: Add Tokenizer, split Text into Vec

---
 Cargo.lock               |  52 ++++++++++++++++
 Cargo.toml               |   2 +
 src/analyze/mod.rs       |   1 +
 src/analyze/tokenizer.rs | 127 +++++++++++++++++++++++++++++++++++++++
 src/lib.rs               |   3 +-
 src/search.rs            |  12 +++-
 6 files changed, 194 insertions(+), 3 deletions(-)
 create mode 100644 src/analyze/mod.rs
 create mode 100644 src/analyze/tokenizer.rs

diff --git a/Cargo.lock b/Cargo.lock
index abb348b..734302b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,12 +2,27 @@
 # It is not intended for manual editing.
 version = 3
 
+[[package]]
+name = "aho-corasick"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "itoa"
 version = "1.0.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38"
 
+[[package]]
+name = "memchr"
+version = "2.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
+
 [[package]]
 name = "proc-macro2"
 version = "1.0.69"
@@ -26,12 +41,43 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "regex"
+version = "1.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
+
 [[package]]
 name = "rustysearch"
 version = "0.1.0"
 dependencies = [
+ "regex",
  "serde",
  "serde_json",
+ "unicode-segmentation",
 ]
 
 [[package]]
@@ -87,3 +133,9 @@ name = "unicode-ident"
 version = "1.0.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
+
+[[package]]
+name = "unicode-segmentation"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
diff --git a/Cargo.toml b/Cargo.toml
index 09895e8..3022c40 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,3 +11,5 @@ license = "MIT"
 [dependencies]
 serde = { version = "1.0.189", features = ["derive"] }
 serde_json = "1.0.107"
+unicode-segmentation = "^1"
+regex = "^1"
\ No newline at end of file
diff --git a/src/analyze/mod.rs b/src/analyze/mod.rs
new file mode 100644
index 0000000..5768976
--- /dev/null
+++ b/src/analyze/mod.rs
@@ -0,0 +1 @@
+pub mod tokenizer;
\ No newline at end of file
diff --git a/src/analyze/tokenizer.rs b/src/analyze/tokenizer.rs
new file mode 100644
index 0000000..a4aa7a6
--- /dev/null
+++ b/src/analyze/tokenizer.rs
@@ -0,0 +1,127 @@
+use std::collections::HashSet;
+
+use regex::Regex;
+use unicode_segmentation::UnicodeSegmentation;
+
+pub struct Tokenizer {
+    text: String,
+    stopwords: HashSet<String>,
+    punctuation: HashSet<String>,
+}
+
+impl Tokenizer {
+    pub fn new(text: &str, stopwords: Vec<String>, punctuation: Option<Vec<String>>) -> Self {
+        Self {
+            text: text.to_owned(),
+            stopwords: stopwords
+                .iter()
+                .map(|s| s.to_owned())
+                .collect::<HashSet<String>>(),
+            punctuation: punctuation
+                .unwrap_or(
+                    vec![
+                        "!", "\"", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", ";", ".", "/",
+                        ":", ",", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|",
+                        "}", "~", "-",
+                    ]
+                    .iter()
+                    .map(|s| s.to_string())
+                    .collect::<Vec<String>>(),
+                )
+                .iter()
+                .map(|s| s.to_string())
+                .collect::<HashSet<String>>(),
+        }
+    }
+
+    // Split text into words
+    pub fn split_into_words(&self) -> Vec<String> {
+        self.text
+            .split_word_bounds()
+            .filter_map(|w| {
+                process_word(
+                    w,
+                    &get_special_char_regex(),
+                    &self.stopwords,
+                    &self.punctuation,
+                )
+            })
+            .collect::<Vec<String>>()
+    }
+
+    pub fn split_into_sentences(&self) -> Vec<String> {
+        let special_char_regex = get_special_char_regex();
+        get_sentence_space_regex()
+            .replace_all(&self.text, ".")
+            .unicode_sentences()
+            .map(|s| {
+                s.split_word_bounds()
+                    .filter_map(|w| {
+                        process_word(w, &special_char_regex, &self.stopwords, &self.punctuation)
+                    })
+                    .collect::<Vec<String>>()
+                    .join(" ")
+            })
+            .collect::<Vec<String>>()
+    }
+
+    pub fn split_into_paragraphs(&self) -> Vec<String> {
+        get_newline_regex()
+            .split(&self.text)
+            .filter_map(|s| {
+                if s.trim().is_empty() {
+                    return None;
+                }
+
+                Some(
+                    s.unicode_sentences()
+                        .map(|s| {
+                            s.split_word_bounds()
+                                .filter_map(|w| {
+                                    process_word(
+                                        w,
+                                        &get_special_char_regex(),
+                                        &self.stopwords,
+                                        &self.punctuation,
+                                    )
+                                })
+                                .collect::<Vec<String>>()
+                                .join(" ")
+                        })
+                        .collect::<Vec<String>>()
+                        .join(" "),
+                )
+            })
+            .collect::<Vec<String>>()
+    }
+}
+
+fn process_word(
+    w: &str,
+    special_char_regex: &Regex,
+    stopwords: &HashSet<String>,
+    punctuation: &HashSet<String>,
+) -> Option<String> {
+    let word = special_char_regex.replace_all(w.trim(), "").to_lowercase();
+
+    if word.is_empty()
+        || (word.graphemes(true).count() == 1) && punctuation.contains(&word)
+        || stopwords.contains(&word)
+    {
+        return None;
+    }
+
+    Some(word)
+}
+
+fn get_special_char_regex() -> Regex {
+    Regex::new(r"('s|,|\.)").unwrap()
+}
+
+fn get_sentence_space_regex() -> Regex {
+    Regex::new(r"^([\.!?])[\n\t\r]").unwrap()
+}
+
+fn get_newline_regex() -> Regex {
+    Regex::new(r"(\r|\n|\r\n)").unwrap()
+}
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
index d761767..9fd09e8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,2 +1,3 @@
 pub mod types;
-pub mod search;
\ No newline at end of file
+pub mod search;
+pub mod analyze;
\ No newline at end of file
diff --git a/src/search.rs b/src/search.rs
index afd35d2..f05d61d 100644
--- a/src/search.rs
+++ b/src/search.rs
@@ -1,6 +1,6 @@
 use std::{fs, path::Path};
 
-use crate::types::Stats;
+use crate::{types::Stats, analyze::tokenizer::Tokenizer};
 
 pub struct Rustysearch {
     base_directory: String,
@@ -92,10 +92,18 @@ impl Rustysearch {
         self.write_stats(current_stats);
     }
 
-    /// **Returns the total number of documents the index is aware of**
+    /// Returns the total number of documents the index is aware of
     ///
     pub fn get_total_docs(&self) -> i32 {
         let stats = self.read_stats();
        return stats.total_docs;
     }
+
+    /// Given a string (``blob``) of text, this will return a Vector of tokens.
+    ///
+    pub fn make_tokens(&self, blob: &str) -> Vec<String> {
+        let tokenizer = Tokenizer::new(blob, vec![], None);
+        let tokens = tokenizer.split_into_words();
+        return tokens;
+    }
 }
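
Note (not part of the patch): a minimal usage sketch of the tokenizer API added above, assuming the crate is consumed as `rustysearch` with the dependencies introduced in this change. The stopword list and sample text are illustrative only; `Rustysearch::make_tokens` wraps the same `split_into_words` call with an empty stopword list.

use rustysearch::analyze::tokenizer::Tokenizer;

fn main() {
    // Illustrative stopword list; pass an empty Vec to keep every word.
    let stopwords = vec!["the".to_string(), "a".to_string()];

    // `None` falls back to the tokenizer's built-in punctuation set.
    let tokenizer = Tokenizer::new(
        "The quick brown fox, jumps. Over the lazy dog!",
        stopwords,
        None,
    );

    // Word tokens come back lowercased, with punctuation and stopwords dropped.
    let words = tokenizer.split_into_words();
    println!("{:?}", words); // e.g. ["quick", "brown", "fox", "jumps", "over", "lazy", "dog"]

    // Sentence-level split: each entry is the filtered words of one sentence joined with spaces.
    let sentences = tokenizer.split_into_sentences();
    println!("{:?}", sentences);
}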