feat: Add Tokenizer, split text into Vec<String>

Alex Wellnitz 2023-10-24 23:17:09 +02:00
parent 383e9aa093
commit bcc8fb3461
6 changed files with 194 additions and 3 deletions

Cargo.lock (generated)

@@ -2,12 +2,27 @@
# It is not intended for manual editing.
version = 3
[[package]]
name = "aho-corasick"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
dependencies = [
"memchr",
]
[[package]]
name = "itoa"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38"
[[package]]
name = "memchr"
version = "2.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
[[package]]
name = "proc-macro2"
version = "1.0.69"
@@ -26,12 +41,43 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "regex"
version = "1.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
[[package]]
name = "rustysearch"
version = "0.1.0"
dependencies = [
"regex",
"serde",
"serde_json",
"unicode-segmentation",
]
[[package]]
@@ -87,3 +133,9 @@ name = "unicode-ident"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
[[package]]
name = "unicode-segmentation"
version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"

Cargo.toml

@@ -11,3 +11,5 @@ license = "MIT"
[dependencies]
serde = { version = "1.0.189", features = ["derive"] }
serde_json = "1.0.107"
unicode-segmentation = "^1"
regex = "^1"

src/analyze/mod.rs (new file)

@@ -0,0 +1 @@
pub mod tokenizer;

src/analyze/tokenizer.rs (new file)

@@ -0,0 +1,127 @@
use std::collections::HashSet;

use regex::Regex;
use unicode_segmentation::UnicodeSegmentation;

pub struct Tokenizer {
    text: String,
    stopwords: HashSet<String>,
    punctuation: HashSet<String>,
}

impl Tokenizer {
    pub fn new(text: &str, stopwords: Vec<String>, punctuation: Option<Vec<String>>) -> Self {
        Self {
            text: text.to_owned(),
            stopwords: stopwords.into_iter().collect::<HashSet<String>>(),
            punctuation: punctuation
                .unwrap_or_else(|| {
                    vec![
                        "!", "\"", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", ";",
                        ".", "/", ":", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_",
                        "`", "{", "|", "}", "~", "-",
                    ]
                    .iter()
                    .map(|s| s.to_string())
                    .collect()
                })
                .into_iter()
                .collect::<HashSet<String>>(),
        }
    }

    /// Split the text into lowercased words, dropping stopwords and punctuation.
    pub fn split_into_words(&self) -> Vec<String> {
        self.text
            .split_word_bounds()
            .filter_map(|w| {
                process_word(
                    w,
                    &get_special_char_regex(),
                    &self.stopwords,
                    &self.punctuation,
                )
            })
            .collect::<Vec<String>>()
    }

    /// Split the text into sentences, each reduced to its filtered words.
    pub fn split_into_sentences(&self) -> Vec<String> {
        let special_char_regex = get_special_char_regex();

        get_sentence_space_regex()
            .replace_all(&self.text, ".")
            .unicode_sentences()
            .map(|s| {
                s.split_word_bounds()
                    .filter_map(|w| {
                        process_word(w, &special_char_regex, &self.stopwords, &self.punctuation)
                    })
                    .collect::<Vec<String>>()
                    .join(" ")
            })
            .collect::<Vec<String>>()
    }

    /// Split the text into paragraphs at line breaks, filtering each one.
    pub fn split_into_paragraphs(&self) -> Vec<String> {
        get_newline_regex()
            .split(&self.text)
            .filter_map(|s| {
                if s.trim().is_empty() {
                    return None;
                }

                Some(
                    s.unicode_sentences()
                        .map(|s| {
                            s.split_word_bounds()
                                .filter_map(|w| {
                                    process_word(
                                        w,
                                        &get_special_char_regex(),
                                        &self.stopwords,
                                        &self.punctuation,
                                    )
                                })
                                .collect::<Vec<String>>()
                                .join(" ")
                        })
                        .collect::<Vec<String>>()
                        .join(" "),
                )
            })
            .collect::<Vec<String>>()
    }
}

/// Normalize a single word: strip special characters, lowercase, and drop it
/// entirely if it is empty, a lone punctuation mark, or a stopword.
fn process_word(
    w: &str,
    special_char_regex: &Regex,
    stopwords: &HashSet<String>,
    punctuation: &HashSet<String>,
) -> Option<String> {
    let word = special_char_regex.replace_all(w.trim(), "").to_lowercase();

    if word.is_empty()
        || (word.graphemes(true).count() == 1 && punctuation.contains(&word))
        || stopwords.contains(&word)
    {
        return None;
    }

    Some(word)
}

fn get_special_char_regex() -> Regex {
    Regex::new(r"('s|,|\.)").unwrap()
}

fn get_sentence_space_regex() -> Regex {
    Regex::new(r"^([\.!?])[\n\t\r]").unwrap()
}

fn get_newline_regex() -> Regex {
    Regex::new(r"(\r|\n|\r\n)").unwrap()
}
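
For orientation, here is a minimal usage sketch of the new tokenizer; the stopword list and the sample sentence are invented for illustration and are not part of the commit:

use rustysearch::analyze::tokenizer::Tokenizer;

fn main() {
    // Hypothetical stopword list; callers supply their own.
    let stopwords = vec!["the".to_string(), "a".to_string(), "of".to_string()];

    // `None` falls back to the tokenizer's built-in punctuation set.
    let tokenizer = Tokenizer::new(
        "The quick brown fox jumps over the lazy dog. It was fast.",
        stopwords,
        None,
    );

    // Words are lowercased; stopwords and punctuation are dropped, e.g.
    // ["quick", "brown", "fox", "jumps", "over", "lazy", "dog", "it", "was", "fast"]
    println!("{:?}", tokenizer.split_into_words());

    // One entry per sentence, each reduced to its filtered words.
    println!("{:?}", tokenizer.split_into_sentences());
}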

src/lib.rs

@@ -1,2 +1,3 @@
pub mod types;
pub mod search;
pub mod analyze;

src/search/mod.rs

@@ -1,6 +1,6 @@
use std::{fs, path::Path};

use crate::{types::Stats, analyze::tokenizer::Tokenizer};

pub struct Rustysearch {
    base_directory: String,
@@ -92,10 +92,18 @@ impl Rustysearch {
        self.write_stats(current_stats);
    }

    /// Returns the total number of documents the index is aware of
    ///
    pub fn get_total_docs(&self) -> i32 {
        let stats = self.read_stats();
        stats.total_docs
    }

    /// Given a string (`blob`) of text, return a vector of tokens.
    ///
    pub fn make_tokens(&self, blob: &str) -> Vec<String> {
        let tokenizer = Tokenizer::new(blob, vec![], None);
        tokenizer.split_into_words()
    }
}
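
And a sketch of the new make_tokens method in context; the Rustysearch constructor shown here is an assumption based on the base_directory field above, since this diff does not include it:

use rustysearch::search::Rustysearch;

fn main() {
    // Hypothetical constructor; only the `base_directory` field appears in this diff.
    let search = Rustysearch::new("/tmp/rustysearch");

    // make_tokens builds a Tokenizer with no stopwords and the default
    // punctuation set, then splits the blob into words.
    let tokens = search.make_tokens("Hello, searchable world.");
    assert_eq!(tokens, ["hello", "searchable", "world"]);
}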