feat: Add Tokenizer, split Text into Vec<String>

2023-10-24 23:17:09 +02:00
parent 383e9aa093
commit bcc8fb3461
6 changed files with 194 additions and 3 deletions

src/analyze/mod.rs Normal file

@@ -0,0 +1 @@
pub mod tokenizer;

src/analyze/tokenizer.rs Normal file

@@ -0,0 +1,127 @@
use std::collections::HashSet;

use regex::Regex;
use unicode_segmentation::UnicodeSegmentation;

pub struct Tokenizer {
    text: String,
    stopwords: HashSet<String>,
    punctuation: HashSet<String>,
}

impl Tokenizer {
    pub fn new(text: &str, stopwords: Vec<String>, punctuation: Option<Vec<String>>) -> Self {
        Self {
            text: text.to_owned(),
            stopwords: stopwords
                .iter()
                .map(|s| s.to_owned())
                .collect::<HashSet<String>>(),
            // Fall back to a default ASCII punctuation set when none is supplied
            punctuation: punctuation
                .unwrap_or(
                    vec![
                        "!", "\"", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", ";", ".", "/",
                        ":", ",", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|",
                        "}", "~", "-",
                    ]
                    .iter()
                    .map(|s| s.to_string())
                    .collect::<Vec<String>>(),
                )
                .iter()
                .map(|s| s.to_string())
                .collect::<HashSet<String>>(),
        }
    }

    // Split text into words
    pub fn split_into_words(&self) -> Vec<String> {
        self.text
            .split_word_bounds()
            .filter_map(|w| {
                process_word(
                    w,
                    &get_special_char_regex(),
                    &self.stopwords,
                    &self.punctuation,
                )
            })
            .collect::<Vec<String>>()
    }

    // Split text into sentences; each sentence becomes a space-joined string of kept tokens
    pub fn split_into_sentences(&self) -> Vec<String> {
        let special_char_regex = get_special_char_regex();

        get_sentence_space_regex()
            .replace_all(&self.text, ".")
            .unicode_sentences()
            .map(|s| {
                s.split_word_bounds()
                    .filter_map(|w| {
                        process_word(w, &special_char_regex, &self.stopwords, &self.punctuation)
                    })
                    .collect::<Vec<String>>()
                    .join(" ")
            })
            .collect::<Vec<String>>()
    }

    // Split text into paragraphs (separated by line breaks), each flattened to a
    // space-joined string of kept tokens
    pub fn split_into_paragraphs(&self) -> Vec<String> {
        get_newline_regex()
            .split(&self.text)
            .filter_map(|s| {
                if s.trim().is_empty() {
                    return None;
                }

                Some(
                    s.unicode_sentences()
                        .map(|s| {
                            s.split_word_bounds()
                                .filter_map(|w| {
                                    process_word(
                                        w,
                                        &get_special_char_regex(),
                                        &self.stopwords,
                                        &self.punctuation,
                                    )
                                })
                                .collect::<Vec<String>>()
                                .join(" ")
                        })
                        .collect::<Vec<String>>()
                        .join(" "),
                )
            })
            .collect::<Vec<String>>()
    }
}

// Trim a token, strip possessive suffixes, commas and periods, and lowercase it;
// drop it if it is empty, a single punctuation grapheme, or a stopword.
fn process_word(
    w: &str,
    special_char_regex: &Regex,
    stopwords: &HashSet<String>,
    punctuation: &HashSet<String>,
) -> Option<String> {
    let word = special_char_regex.replace_all(w.trim(), "").to_lowercase();

    if word.is_empty()
        || (word.graphemes(true).count() == 1) && punctuation.contains(&word)
        || stopwords.contains(&word)
    {
        return None;
    }

    Some(word)
}

fn get_special_char_regex() -> Regex {
    Regex::new(r"('s|,|\.)").unwrap()
}

fn get_sentence_space_regex() -> Regex {
    Regex::new(r"^([\.!?])[\n\t\r]").unwrap()
}

fn get_newline_regex() -> Regex {
    Regex::new(r"(\r|\n|\r\n)").unwrap()
}
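
For reference, a minimal usage sketch of the new Tokenizer (the module path matches the files above; the function name, input text, and stopword list are made up for illustration):

use crate::analyze::tokenizer::Tokenizer;

fn tokenizer_demo() {
    // Words in the stopword list are dropped from every result; passing None
    // for punctuation selects the default set defined in Tokenizer::new.
    let stopwords = vec!["the".to_string(), "is".to_string()];
    let tokenizer = Tokenizer::new("The sky is blue. The grass is green.", stopwords, None);

    let words = tokenizer.split_into_words();
    // -> ["sky", "blue", "grass", "green"]

    let sentences = tokenizer.split_into_sentences();
    // -> ["sky blue", "grass green"]

    let paragraphs = tokenizer.split_into_paragraphs();
    // -> ["sky blue grass green"] (single paragraph, since the input has no line breaks)

    println!("{:?} {:?} {:?}", words, sentences, paragraphs);
}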


@@ -1,2 +1,3 @@
 pub mod types;
-pub mod search;
+pub mod search;
+pub mod analyze;


@@ -1,6 +1,6 @@
 use std::{fs, path::Path};

-use crate::types::Stats;
+use crate::{types::Stats, analyze::tokenizer::Tokenizer};

 pub struct Rustysearch {
     base_directory: String,
@@ -92,10 +92,18 @@ impl Rustysearch {
         self.write_stats(current_stats);
     }

-    /// **Returns the total number of documents the index is aware of**
+    /// Returns the total number of documents the index is aware of
     ///
     pub fn get_total_docs(&self) -> i32 {
        let stats = self.read_stats();
        return stats.total_docs;
    }
+
+    /// Given a string (``blob``) of text, this will return a Vector of tokens.
+    ///
+    pub fn make_tokens(&self, blob: &str) -> Vec<String> {
+        let tokenizer = Tokenizer::new(blob, vec![], None);
+        let tokens = tokenizer.split_into_words();
+        return tokens;
+    }
 }
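
The new make_tokens helper delegates to Tokenizer::split_into_words with an empty stopword list and the default punctuation set, so only punctuation and empty tokens are filtered out. A rough sketch of a call site (the Rustysearch constructor is not part of this diff, so the setup and function name here are assumed):

fn tokenize_example(search: &Rustysearch) {
    let tokens = search.make_tokens("Hello, world! It's rustysearch.");
    // -> ["hello", "world", "it", "rustysearch"]
    println!("{:?}", tokens);
}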