feat: Add Tokenizer, split text into Vec<String>

Alex Wellnitz 2023-10-24 23:17:09 +02:00
parent 383e9aa093
commit bcc8fb3461
6 changed files with 194 additions and 3 deletions

Cargo.lock (generated)

@@ -2,12 +2,27 @@
# It is not intended for manual editing.
version = 3
[[package]]
name = "aho-corasick"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
dependencies = [
"memchr",
]
[[package]]
name = "itoa"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38"
[[package]]
name = "memchr"
version = "2.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
[[package]]
name = "proc-macro2"
version = "1.0.69"
@@ -26,12 +41,43 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "regex"
version = "1.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
[[package]]
name = "rustysearch"
version = "0.1.0"
dependencies = [
"regex",
"serde",
"serde_json",
"unicode-segmentation",
]
[[package]]
@@ -87,3 +133,9 @@ name = "unicode-ident"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
[[package]]
name = "unicode-segmentation"
version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"

Cargo.toml

@@ -11,3 +11,5 @@ license = "MIT"
[dependencies]
serde = { version = "1.0.189", features = ["derive"] }
serde_json = "1.0.107"
unicode-segmentation = "^1"
regex = "^1"

src/analyze/mod.rs (new file)

@@ -0,0 +1 @@
pub mod tokenizer;

src/analyze/tokenizer.rs (new file)

@@ -0,0 +1,127 @@
use std::collections::HashSet;

use regex::Regex;
use unicode_segmentation::UnicodeSegmentation;

pub struct Tokenizer {
    text: String,
    stopwords: HashSet<String>,
    punctuation: HashSet<String>,
}

impl Tokenizer {
    pub fn new(text: &str, stopwords: Vec<String>, punctuation: Option<Vec<String>>) -> Self {
        Self {
            text: text.to_owned(),
            stopwords: stopwords.into_iter().collect::<HashSet<String>>(),
            punctuation: punctuation
                .unwrap_or_else(|| {
                    vec![
                        "!", "\"", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", ";",
                        ".", "/", ":", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_",
                        "`", "{", "|", "}", "~", "-",
                    ]
                    .iter()
                    .map(|s| s.to_string())
                    .collect()
                })
                .into_iter()
                .collect::<HashSet<String>>(),
        }
    }

    /// Split the text into lowercased words, dropping stopwords and punctuation.
    pub fn split_into_words(&self) -> Vec<String> {
        self.text
            .split_word_bounds()
            .filter_map(|w| {
                process_word(
                    w,
                    &get_special_char_regex(),
                    &self.stopwords,
                    &self.punctuation,
                )
            })
            .collect::<Vec<String>>()
    }

    /// Split the text into sentences, each reduced to its filtered words.
    pub fn split_into_sentences(&self) -> Vec<String> {
        let special_char_regex = get_special_char_regex();

        get_sentence_space_regex()
            .replace_all(&self.text, ".")
            .unicode_sentences()
            .map(|s| {
                s.split_word_bounds()
                    .filter_map(|w| {
                        process_word(w, &special_char_regex, &self.stopwords, &self.punctuation)
                    })
                    .collect::<Vec<String>>()
                    .join(" ")
            })
            .collect::<Vec<String>>()
    }

    /// Split the text into paragraphs at line breaks, filtering each one.
    pub fn split_into_paragraphs(&self) -> Vec<String> {
        get_newline_regex()
            .split(&self.text)
            .filter_map(|s| {
                if s.trim().is_empty() {
                    return None;
                }

                Some(
                    s.unicode_sentences()
                        .map(|s| {
                            s.split_word_bounds()
                                .filter_map(|w| {
                                    process_word(
                                        w,
                                        &get_special_char_regex(),
                                        &self.stopwords,
                                        &self.punctuation,
                                    )
                                })
                                .collect::<Vec<String>>()
                                .join(" ")
                        })
                        .collect::<Vec<String>>()
                        .join(" "),
                )
            })
            .collect::<Vec<String>>()
    }
}

/// Normalize a single word: strip special characters, lowercase, and drop it
/// entirely if it is empty, a lone punctuation mark, or a stopword.
fn process_word(
    w: &str,
    special_char_regex: &Regex,
    stopwords: &HashSet<String>,
    punctuation: &HashSet<String>,
) -> Option<String> {
    let word = special_char_regex.replace_all(w.trim(), "").to_lowercase();

    if word.is_empty()
        || (word.graphemes(true).count() == 1 && punctuation.contains(&word))
        || stopwords.contains(&word)
    {
        return None;
    }

    Some(word)
}

fn get_special_char_regex() -> Regex {
    Regex::new(r"('s|,|\.)").unwrap()
}

fn get_sentence_space_regex() -> Regex {
    Regex::new(r"^([\.!?])[\n\t\r]").unwrap()
}

fn get_newline_regex() -> Regex {
    Regex::new(r"(\r|\n|\r\n)").unwrap()
}
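
For orientation, here is a minimal usage sketch of the new tokenizer; the stopword list and the sample sentence are invented for illustration and are not part of the commit:

use rustysearch::analyze::tokenizer::Tokenizer;

fn main() {
    // Hypothetical stopword list; callers supply their own.
    let stopwords = vec!["the".to_string(), "a".to_string(), "of".to_string()];

    // `None` falls back to the tokenizer's built-in punctuation set.
    let tokenizer = Tokenizer::new(
        "The quick brown fox jumps over the lazy dog. It was fast.",
        stopwords,
        None,
    );

    // Words are lowercased; stopwords and punctuation are dropped, e.g.
    // ["quick", "brown", "fox", "jumps", "over", "lazy", "dog", "it", "was", "fast"]
    println!("{:?}", tokenizer.split_into_words());

    // One entry per sentence, each reduced to its filtered words.
    println!("{:?}", tokenizer.split_into_sentences());
}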

src/lib.rs

@@ -1,2 +1,3 @@
pub mod types;
pub mod search;
pub mod analyze;

src/search/mod.rs

@@ -1,6 +1,6 @@
use std::{fs, path::Path};

use crate::{types::Stats, analyze::tokenizer::Tokenizer};

pub struct Rustysearch {
    base_directory: String,
@@ -92,10 +92,18 @@ impl Rustysearch {
        self.write_stats(current_stats);
    }

    /// Returns the total number of documents the index is aware of
    ///
    pub fn get_total_docs(&self) -> i32 {
        let stats = self.read_stats();
        stats.total_docs
    }

    /// Given a string (`blob`) of text, return a vector of tokens.
    ///
    pub fn make_tokens(&self, blob: &str) -> Vec<String> {
        let tokenizer = Tokenizer::new(blob, vec![], None);
        tokenizer.split_into_words()
    }
}
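
And a sketch of the new make_tokens method in context; the Rustysearch constructor shown here is an assumption based on the base_directory field above, since this diff does not include it:

use rustysearch::search::Rustysearch;

fn main() {
    // Hypothetical constructor; only the `base_directory` field appears in this diff.
    let search = Rustysearch::new("/tmp/rustysearch");

    // make_tokens builds a Tokenizer with no stopwords and the default
    // punctuation set, then splits the blob into words.
    let tokens = search.make_tokens("Hello, searchable world.");
    assert_eq!(tokens, ["hello", "searchable", "world"]);
}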