feat: Add Tokenizer, split Text into Vec<String>

2023-10-24 23:17:09 +02:00
parent 383e9aa093
commit bcc8fb3461
6 changed files with 194 additions and 3 deletions

src/analyze/mod.rs Normal file

@@ -0,0 +1 @@
pub mod tokenizer;

src/analyze/tokenizer.rs Normal file

@@ -0,0 +1,127 @@
use std::collections::HashSet;

use regex::Regex;
use unicode_segmentation::UnicodeSegmentation;

pub struct Tokenizer {
    text: String,
    stopwords: HashSet<String>,
    punctuation: HashSet<String>,
}

impl Tokenizer {
    pub fn new(text: &str, stopwords: Vec<String>, punctuation: Option<Vec<String>>) -> Self {
        Self {
            text: text.to_owned(),
            stopwords: stopwords
                .iter()
                .map(|s| s.to_owned())
                .collect::<HashSet<String>>(),
            // Fall back to a default ASCII punctuation set when none is supplied
            punctuation: punctuation
                .unwrap_or(
                    vec![
                        "!", "\"", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", ";", ".", "/",
                        ":", ",", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|",
                        "}", "~", "-",
                    ]
                    .iter()
                    .map(|s| s.to_string())
                    .collect::<Vec<String>>(),
                )
                .iter()
                .map(|s| s.to_string())
                .collect::<HashSet<String>>(),
        }
    }

    // Split text into words
    pub fn split_into_words(&self) -> Vec<String> {
        self.text
            .split_word_bounds()
            .filter_map(|w| {
                process_word(
                    w,
                    &get_special_char_regex(),
                    &self.stopwords,
                    &self.punctuation,
                )
            })
            .collect::<Vec<String>>()
    }

    // Split text into sentences; each sentence becomes a space-joined string of kept tokens
    pub fn split_into_sentences(&self) -> Vec<String> {
        let special_char_regex = get_special_char_regex();

        get_sentence_space_regex()
            .replace_all(&self.text, ".")
            .unicode_sentences()
            .map(|s| {
                s.split_word_bounds()
                    .filter_map(|w| {
                        process_word(w, &special_char_regex, &self.stopwords, &self.punctuation)
                    })
                    .collect::<Vec<String>>()
                    .join(" ")
            })
            .collect::<Vec<String>>()
    }

    // Split text into paragraphs (separated by line breaks), each flattened to a
    // space-joined string of kept tokens
    pub fn split_into_paragraphs(&self) -> Vec<String> {
        get_newline_regex()
            .split(&self.text)
            .filter_map(|s| {
                if s.trim().is_empty() {
                    return None;
                }

                Some(
                    s.unicode_sentences()
                        .map(|s| {
                            s.split_word_bounds()
                                .filter_map(|w| {
                                    process_word(
                                        w,
                                        &get_special_char_regex(),
                                        &self.stopwords,
                                        &self.punctuation,
                                    )
                                })
                                .collect::<Vec<String>>()
                                .join(" ")
                        })
                        .collect::<Vec<String>>()
                        .join(" "),
                )
            })
            .collect::<Vec<String>>()
    }
}

// Trim a token, strip possessive suffixes, commas and periods, and lowercase it;
// drop it if it is empty, a single punctuation grapheme, or a stopword.
fn process_word(
    w: &str,
    special_char_regex: &Regex,
    stopwords: &HashSet<String>,
    punctuation: &HashSet<String>,
) -> Option<String> {
    let word = special_char_regex.replace_all(w.trim(), "").to_lowercase();

    if word.is_empty()
        || (word.graphemes(true).count() == 1) && punctuation.contains(&word)
        || stopwords.contains(&word)
    {
        return None;
    }

    Some(word)
}

fn get_special_char_regex() -> Regex {
    Regex::new(r"('s|,|\.)").unwrap()
}

fn get_sentence_space_regex() -> Regex {
    Regex::new(r"^([\.!?])[\n\t\r]").unwrap()
}

fn get_newline_regex() -> Regex {
    Regex::new(r"(\r|\n|\r\n)").unwrap()
}
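
For reference, a minimal usage sketch of the new Tokenizer (the module path matches the files above; the function name, input text, and stopword list are made up for illustration):

use crate::analyze::tokenizer::Tokenizer;

fn tokenizer_demo() {
    // Words in the stopword list are dropped from every result; passing None
    // for punctuation selects the default set defined in Tokenizer::new.
    let stopwords = vec!["the".to_string(), "is".to_string()];
    let tokenizer = Tokenizer::new("The sky is blue. The grass is green.", stopwords, None);

    let words = tokenizer.split_into_words();
    // -> ["sky", "blue", "grass", "green"]

    let sentences = tokenizer.split_into_sentences();
    // -> ["sky blue", "grass green"]

    let paragraphs = tokenizer.split_into_paragraphs();
    // -> ["sky blue grass green"] (single paragraph, since the input has no line breaks)

    println!("{:?} {:?} {:?}", words, sentences, paragraphs);
}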


@@ -1,2 +1,3 @@
 pub mod types;
-pub mod search;
+pub mod search;
+pub mod analyze;


@@ -1,6 +1,6 @@
 use std::{fs, path::Path};

-use crate::types::Stats;
+use crate::{types::Stats, analyze::tokenizer::Tokenizer};

 pub struct Rustysearch {
     base_directory: String,
@@ -92,10 +92,18 @@ impl Rustysearch {
         self.write_stats(current_stats);
     }

-    /// **Returns the total number of documents the index is aware of**
+    /// Returns the total number of documents the index is aware of
     ///
     pub fn get_total_docs(&self) -> i32 {
        let stats = self.read_stats();
        return stats.total_docs;
    }
+
+    /// Given a string (``blob``) of text, this will return a Vector of tokens.
+    ///
+    pub fn make_tokens(&self, blob: &str) -> Vec<String> {
+        let tokenizer = Tokenizer::new(blob, vec![], None);
+        let tokens = tokenizer.split_into_words();
+        return tokens;
+    }
 }
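
The new make_tokens helper delegates to Tokenizer::split_into_words with an empty stopword list and the default punctuation set, so only punctuation and empty tokens are filtered out. A rough sketch of a call site (the Rustysearch constructor is not part of this diff, so the setup and function name here are assumed):

fn tokenize_example(search: &Rustysearch) {
    let tokens = search.make_tokens("Hello, world! It's rustysearch.");
    // -> ["hello", "world", "it", "rustysearch"]
    println!("{:?}", tokens);
}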