feat: Add Tokenizer, split Text into Vec<String>

parent 383e9aa093
commit bcc8fb3461

Cargo.lock | 52 (generated)
@@ -2,12 +2,27 @@
 # It is not intended for manual editing.
 version = 3

+[[package]]
+name = "aho-corasick"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "itoa"
 version = "1.0.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38"

+[[package]]
+name = "memchr"
+version = "2.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
+
 [[package]]
 name = "proc-macro2"
 version = "1.0.69"
@@ -26,12 +41,43 @@ dependencies = [
 "proc-macro2",
 ]

+[[package]]
+name = "regex"
+version = "1.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
+
 [[package]]
 name = "rustysearch"
 version = "0.1.0"
 dependencies = [
+ "regex",
 "serde",
 "serde_json",
+ "unicode-segmentation",
 ]

 [[package]]
@@ -87,3 +133,9 @@ name = "unicode-ident"
 version = "1.0.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
+
+[[package]]
+name = "unicode-segmentation"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
Cargo.toml

@@ -11,3 +11,5 @@ license = "MIT"
 [dependencies]
 serde = { version = "1.0.189", features = ["derive"] }
 serde_json = "1.0.107"
+unicode-segmentation = "^1"
+regex = "^1"
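The two new dependencies carry the tokenizer: unicode-segmentation supplies the Unicode word and sentence boundary iterators, and regex the cleanup patterns. A minimal sketch of the word-boundary behavior the tokenizer builds on (the input string here is illustrative, not from the commit):

use unicode_segmentation::UnicodeSegmentation;

fn main() {
    // split_word_bounds() yields punctuation and whitespace as separate
    // items, which is why the tokenizer filters them out afterwards.
    let bounds: Vec<&str> = "Hello, world!".split_word_bounds().collect();
    assert_eq!(bounds, vec!["Hello", ",", " ", "world", "!"]);
}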
src/analyze/mod.rs | 1 (new file)

@@ -0,0 +1 @@
+pub mod tokenizer;
src/analyze/tokenizer.rs | 127 (new file)

@@ -0,0 +1,127 @@
+use std::collections::HashSet;
+
+use regex::Regex;
+use unicode_segmentation::UnicodeSegmentation;
+
+pub struct Tokenizer {
+    text: String,
+    stopwords: HashSet<String>,
+    punctuation: HashSet<String>,
+}
+
+impl Tokenizer {
+    pub fn new(text: &str, stopwords: Vec<String>, punctuation: Option<Vec<String>>) -> Self {
+        Self {
+            text: text.to_owned(),
+            stopwords: stopwords
+                .iter()
+                .map(|s| s.to_owned())
+                .collect::<HashSet<String>>(),
+            punctuation: punctuation
+                .unwrap_or(
+                    vec![
+                        "!", "\"", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", ";", ".", "/",
+                        ":", ",", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|",
+                        "}", "~", "-",
+                    ]
+                    .iter()
+                    .map(|s| s.to_string())
+                    .collect::<Vec<String>>(),
+                )
+                .iter()
+                .map(|s| s.to_string())
+                .collect::<HashSet<String>>(),
+        }
+    }
+
+    // Split text into words
+    pub fn split_into_words(&self) -> Vec<String> {
+        self.text
+            .split_word_bounds()
+            .filter_map(|w| {
+                process_word(
+                    w,
+                    &get_special_char_regex(),
+                    &self.stopwords,
+                    &self.punctuation,
+                )
+            })
+            .collect::<Vec<String>>()
+    }
+
+    pub fn split_into_sentences(&self) -> Vec<String> {
+        let special_char_regex = get_special_char_regex();
+        get_sentence_space_regex()
+            .replace_all(&self.text, ".")
+            .unicode_sentences()
+            .map(|s| {
+                s.split_word_bounds()
+                    .filter_map(|w| {
+                        process_word(w, &special_char_regex, &self.stopwords, &self.punctuation)
+                    })
+                    .collect::<Vec<String>>()
+                    .join(" ")
+            })
+            .collect::<Vec<String>>()
+    }
+
+    pub fn split_into_paragraphs(&self) -> Vec<String> {
+        get_newline_regex()
+            .split(&self.text)
+            .filter_map(|s| {
+                if s.trim().is_empty() {
+                    return None;
+                }
+
+                Some(
+                    s.unicode_sentences()
+                        .map(|s| {
+                            s.split_word_bounds()
+                                .filter_map(|w| {
+                                    process_word(
+                                        w,
+                                        &get_special_char_regex(),
+                                        &self.stopwords,
+                                        &self.punctuation,
+                                    )
+                                })
+                                .collect::<Vec<String>>()
+                                .join(" ")
+                        })
+                        .collect::<Vec<String>>()
+                        .join(" "),
+                )
+            })
+            .collect::<Vec<String>>()
+    }
+}
+
+fn process_word(
+    w: &str,
+    special_char_regex: &Regex,
+    stopwords: &HashSet<String>,
+    punctuation: &HashSet<String>,
+) -> Option<String> {
+    let word = special_char_regex.replace_all(w.trim(), "").to_lowercase();
+
+    if word.is_empty()
+        || (word.graphemes(true).count() == 1) && punctuation.contains(&word)
+        || stopwords.contains(&word)
+    {
+        return None;
+    }
+
+    Some(word)
+}
+
+fn get_special_char_regex() -> Regex {
+    Regex::new(r"('s|,|\.)").unwrap()
+}
+
+fn get_sentence_space_regex() -> Regex {
+    Regex::new(r"^([\.!?])[\n\t\r]").unwrap()
+}
+
+fn get_newline_regex() -> Regex {
+    Regex::new(r"(\r|\n|\r\n)").unwrap()
+}
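Not part of the commit, but a minimal usage sketch of the new Tokenizer; the input text and stopword list are made up for illustration:

use rustysearch::analyze::tokenizer::Tokenizer;

fn main() {
    // "the" as the only stopword; None falls back to the default punctuation set.
    let tokenizer = Tokenizer::new(
        "The quick brown fox jumps over the lazy dog.",
        vec!["the".to_string()],
        None,
    );

    // process_word() lowercases each word, strips "'s", commas, and periods,
    // and drops stopwords and single-grapheme punctuation.
    let words = tokenizer.split_into_words();
    assert_eq!(words, vec!["quick", "brown", "fox", "jumps", "over", "lazy", "dog"]);
}

Note that split_into_sentences and split_into_paragraphs return each unit re-joined from its cleaned words, not slices of the original text.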
src/lib.rs

@@ -1,2 +1,3 @@
 pub mod types;
 pub mod search;
+pub mod analyze;
@@ -1,6 +1,6 @@
 use std::{fs, path::Path};

-use crate::types::Stats;
+use crate::{types::Stats, analyze::tokenizer::Tokenizer};

 pub struct Rustysearch {
     base_directory: String,
@@ -92,10 +92,18 @@ impl Rustysearch {
         self.write_stats(current_stats);
     }

-    /// **Returns the total number of documents the index is aware of**
+    /// Returns the total number of documents the index is aware of
     ///
     pub fn get_total_docs(&self) -> i32 {
         let stats = self.read_stats();
         return stats.total_docs;
     }
+
+    /// Given a string (``blob``) of text, this will return a Vector of tokens.
+    ///
+    pub fn make_tokens(&self, blob: &str) -> Vec<String> {
+        let tokenizer = Tokenizer::new(blob, vec![], None);
+        let tokens = tokenizer.split_into_words();
+        return tokens;
+    }
 }
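A sketch of calling the new make_tokens helper. The constructor and its base-directory argument are assumptions here, since Rustysearch::new is not shown in this diff:

use rustysearch::search::Rustysearch;

fn main() {
    // Hypothetical: assumes a constructor that takes the index's base directory.
    let search = Rustysearch::new("/tmp/rustysearch");

    // make_tokens uses no stopwords and the default punctuation set, so this
    // prints ["rustysearch", "splits", "text", "into", "a", "vec", "string"].
    let tokens = search.make_tokens("Rustysearch splits text into a Vec<String>.");
    println!("{:?}", tokens);
}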