From bcc8fb34611a17aa4cf1f4116d19046d37d7364b Mon Sep 17 00:00:00 2001
From: Alex Wellnitz
Date: Tue, 24 Oct 2023 23:17:09 +0200
Subject: [PATCH] feat: Add Tokenizer, split Text into Vec

---
 Cargo.lock               |  52 ++++++++++++++++
 Cargo.toml               |   2 +
 src/analyze/mod.rs       |   1 +
 src/analyze/tokenizer.rs | 127 +++++++++++++++++++++++++++++++++++++++
 src/lib.rs               |   3 +-
 src/search.rs            |  12 +++-
 6 files changed, 194 insertions(+), 3 deletions(-)
 create mode 100644 src/analyze/mod.rs
 create mode 100644 src/analyze/tokenizer.rs

diff --git a/Cargo.lock b/Cargo.lock
index abb348b..734302b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,12 +2,27 @@
 # It is not intended for manual editing.
 version = 3
 
+[[package]]
+name = "aho-corasick"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "itoa"
 version = "1.0.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38"
 
+[[package]]
+name = "memchr"
+version = "2.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
+
 [[package]]
 name = "proc-macro2"
 version = "1.0.69"
@@ -26,12 +41,43 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "regex"
+version = "1.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
+
 [[package]]
 name = "rustysearch"
 version = "0.1.0"
 dependencies = [
+ "regex",
  "serde",
  "serde_json",
+ "unicode-segmentation",
 ]
 
 [[package]]
@@ -87,3 +133,9 @@ name = "unicode-ident"
 version = "1.0.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
+
+[[package]]
+name = "unicode-segmentation"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
diff --git a/Cargo.toml b/Cargo.toml
index 09895e8..3022c40 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,3 +11,5 @@ license = "MIT"
 [dependencies]
 serde = { version = "1.0.189", features = ["derive"] }
 serde_json = "1.0.107"
+unicode-segmentation = "^1"
+regex = "^1"
\ No newline at end of file
diff --git a/src/analyze/mod.rs b/src/analyze/mod.rs
new file mode 100644
index 0000000..5768976
--- /dev/null
+++ b/src/analyze/mod.rs
@@ -0,0 +1 @@
+pub mod tokenizer;
\ No newline at end of file
diff --git a/src/analyze/tokenizer.rs b/src/analyze/tokenizer.rs
new file mode 100644
index 0000000..a4aa7a6
--- /dev/null
+++ b/src/analyze/tokenizer.rs
@@ -0,0 +1,127 @@
+use std::collections::HashSet;
+
+use regex::Regex;
+use unicode_segmentation::UnicodeSegmentation;
+
+pub struct Tokenizer {
+    text: String,
+    stopwords: HashSet<String>,
+    punctuation: HashSet<String>,
+}
+
+impl Tokenizer {
+    pub fn new(text: &str, stopwords: Vec<String>, punctuation: Option<Vec<String>>) -> Self {
+        Self {
+            text: text.to_owned(),
+            stopwords: stopwords
+                .iter()
+                .map(|s| s.to_owned())
+                .collect::<HashSet<String>>(),
+            punctuation: punctuation
+                .unwrap_or(
+                    vec![
+                        "!", "\"", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", ";", ".", "/",
+                        ":", ",", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|",
+                        "}", "~", "-",
+                    ]
+                    .iter()
+                    .map(|s| s.to_string())
+                    .collect::<Vec<String>>(),
+                )
+                .iter()
+                .map(|s| s.to_string())
+                .collect::<HashSet<String>>(),
+        }
+    }
+
+    // Split text into words
+    pub fn split_into_words(&self) -> Vec<String> {
+        self.text
+            .split_word_bounds()
+            .filter_map(|w| {
+                process_word(
+                    w,
+                    &get_special_char_regex(),
+                    &self.stopwords,
+                    &self.punctuation,
+                )
+            })
+            .collect::<Vec<String>>()
+    }
+
+    pub fn split_into_sentences(&self) -> Vec<String> {
+        let special_char_regex = get_special_char_regex();
+        get_sentence_space_regex()
+            .replace_all(&self.text, ".")
+            .unicode_sentences()
+            .map(|s| {
+                s.split_word_bounds()
+                    .filter_map(|w| {
+                        process_word(w, &special_char_regex, &self.stopwords, &self.punctuation)
+                    })
+                    .collect::<Vec<String>>()
+                    .join(" ")
+            })
+            .collect::<Vec<String>>()
+    }
+
+    pub fn split_into_paragraphs(&self) -> Vec<String> {
+        get_newline_regex()
+            .split(&self.text)
+            .filter_map(|s| {
+                if s.trim().is_empty() {
+                    return None;
+                }
+
+                Some(
+                    s.unicode_sentences()
+                        .map(|s| {
+                            s.split_word_bounds()
+                                .filter_map(|w| {
+                                    process_word(
+                                        w,
+                                        &get_special_char_regex(),
+                                        &self.stopwords,
+                                        &self.punctuation,
+                                    )
+                                })
+                                .collect::<Vec<String>>()
+                                .join(" ")
+                        })
+                        .collect::<Vec<String>>()
+                        .join(" "),
+                )
+            })
+            .collect::<Vec<String>>()
+    }
+}
+
+fn process_word(
+    w: &str,
+    special_char_regex: &Regex,
+    stopwords: &HashSet<String>,
+    punctuation: &HashSet<String>,
+) -> Option<String> {
+    let word = special_char_regex.replace_all(w.trim(), "").to_lowercase();
+
+    if word.is_empty()
+        || (word.graphemes(true).count() == 1) && punctuation.contains(&word)
+        || stopwords.contains(&word)
+    {
+        return None;
+    }
+
+    Some(word)
+}
+
+fn get_special_char_regex() -> Regex {
+    Regex::new(r"('s|,|\.)").unwrap()
+}
+
+fn get_sentence_space_regex() -> Regex {
+    Regex::new(r"^([\.!?])[\n\t\r]").unwrap()
+}
+
+fn get_newline_regex() -> Regex {
+    Regex::new(r"(\r|\n|\r\n)").unwrap()
+}
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
index d761767..9fd09e8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,2 +1,3 @@
 pub mod types;
-pub mod search;
\ No newline at end of file
+pub mod search;
+pub mod analyze;
\ No newline at end of file
diff --git a/src/search.rs b/src/search.rs
index afd35d2..f05d61d 100644
--- a/src/search.rs
+++ b/src/search.rs
@@ -1,6 +1,6 @@
 use std::{fs, path::Path};
 
-use crate::types::Stats;
+use crate::{types::Stats, analyze::tokenizer::Tokenizer};
 
 pub struct Rustysearch {
     base_directory: String,
@@ -92,10 +92,18 @@ impl Rustysearch {
         self.write_stats(current_stats);
     }
 
-    /// **Returns the total number of documents the index is aware of**
+    /// Returns the total number of documents the index is aware of
     ///
     pub fn get_total_docs(&self) -> i32 {
         let stats = self.read_stats();
        return stats.total_docs;
     }
+
+    /// Given a string (``blob``) of text, this will return a Vector of tokens.
+    ///
+    pub fn make_tokens(&self, blob: &str) -> Vec<String> {
+        let tokenizer = Tokenizer::new(blob, vec![], None);
+        let tokens = tokenizer.split_into_words();
+        return tokens;
+    }
 }
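
Note (not part of the patch): a minimal usage sketch of the tokenizer API added above, assuming the crate is consumed as `rustysearch` with the dependencies introduced in this change. The stopword list and sample text are illustrative only; `Rustysearch::make_tokens` wraps the same `split_into_words` call with an empty stopword list.

use rustysearch::analyze::tokenizer::Tokenizer;

fn main() {
    // Illustrative stopword list; pass an empty Vec to keep every word.
    let stopwords = vec!["the".to_string(), "a".to_string()];

    // `None` falls back to the tokenizer's built-in punctuation set.
    let tokenizer = Tokenizer::new(
        "The quick brown fox, jumps. Over the lazy dog!",
        stopwords,
        None,
    );

    // Word tokens come back lowercased, with punctuation and stopwords dropped.
    let words = tokenizer.split_into_words();
    println!("{:?}", words); // e.g. ["quick", "brown", "fox", "jumps", "over", "lazy", "dog"]

    // Sentence-level split: each entry is the filtered words of one sentence joined with spaces.
    let sentences = tokenizer.split_into_sentences();
    println!("{:?}", sentences);
}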