feat: Added search functionality with TF-IDF scoring and inverted index data structure for efficient document retrieval.

This commit is contained in:
Alex Wellnitz 2025-03-25 17:04:40 +01:00
parent 4c5a57d109
commit 8746f789c4
5 changed files with 198 additions and 3 deletions

View File

@ -1,9 +1,49 @@
package controller
import (
"fmt"
"os"
"strings"
"git.dev-null.rocks/alexohneander/gosearch/pkg/search"
"github.com/gofiber/fiber/v3"
)
// SearchQuery is the HTTP handler for search requests.
//
// It reads the "query" route parameter, trims surrounding whitespace, lets
// parseQuery split it into terms and a query type, runs search.Search against
// the package-level index in pkg/search, and answers with a plain-text list of
// matching documents and their scores (four decimal places).
//
// The returned error is whatever c.SendString reports.
func SearchQuery(c fiber.Ctx) error {
	// The former placeholder `return c.SendString("Hello, World!")` was removed:
	// it returned unconditionally and made the search logic below unreachable.
	query := strings.TrimSpace(c.Params("query"))
	terms, queryType := parseQuery(query)

	results := search.Search(terms, queryType, search.Index, search.DocFreq, len(search.Files))

	// Build the body incrementally instead of re-allocating the string for
	// every result.
	var response strings.Builder
	fmt.Fprintf(&response, "Search Results (%s query):\n", queryType)
	for _, result := range results {
		fmt.Fprintf(&response, "\n- %s (Score: %.4f)\n", result.Document, result.Score)
	}
	return c.SendString(response.String())
}
// parseQuery parses the query to determine query type and terms.
//
// A query containing the operator " AND " (upper-case, surrounded by spaces)
// is split on it and reported as "AND"; otherwise a query containing " OR "
// is split on it and reported as "OR"; anything else is split on whitespace
// and reported as "SIMPLE". AND takes precedence when both operators occur.
//
// The operators must be space-delimited so that words which merely contain
// the letters (for example "ANDROID" or "ORANGE") are not mistaken for
// operators. Every term is trimmed and lower-cased, because the index built
// by the search package stores lower-cased terms, and empty terms are dropped.
func parseQuery(query string) ([]string, string) {
	var (
		parts     []string
		queryType string
	)
	switch {
	case strings.Contains(query, " AND "):
		parts, queryType = strings.Split(query, " AND "), "AND"
	case strings.Contains(query, " OR "):
		parts, queryType = strings.Split(query, " OR "), "OR"
	default:
		parts, queryType = strings.Fields(query), "SIMPLE"
	}

	terms := make([]string, 0, len(parts))
	for _, part := range parts {
		if term := strings.ToLower(strings.TrimSpace(part)); term != "" {
			terms = append(terms, term)
		}
	}
	return terms, queryType
}
// phraseMatch checks if all terms appear in the given document in sequence.
//
// terms are joined with single spaces into one phrase, and doc is the path of
// the file to read. The comparison is case-insensitive: both the file content
// and the phrase are lower-cased before the substring test (previously only
// the content was, so a phrase with any upper-case letter could never match).
//
// It returns false when the file cannot be read. An empty terms slice yields
// the empty phrase, which is contained in every readable file.
func phraseMatch(terms []string, doc string) bool {
	// Read the full document content
	content, err := os.ReadFile(doc)
	if err != nil {
		return false
	}

	// Check if the exact phrase (joined terms) is in the document content
	phrase := strings.ToLower(strings.Join(terms, " "))
	return strings.Contains(strings.ToLower(string(content)), phrase)
}

View File

@ -12,7 +12,7 @@ func configureRoutes(app *fiber.App) *fiber.App {
app.Get("/test", controller.Index)
// Search
app.Get("/api/search/:index/:query", controller.SearchQuery)
app.Get("/api/search/:query", controller.SearchQuery)
// Monitor
// app.Get("/metrics", monitor.New(monitor.Config{Title: "MyService Metrics Page"}))

View File

@ -1,8 +1,13 @@
package main
import "git.dev-null.rocks/alexohneander/gosearch/internal/http"
import (
"git.dev-null.rocks/alexohneander/gosearch/internal/http"
"git.dev-null.rocks/alexohneander/gosearch/pkg/search"
)
// main populates the package-level search index and then starts the HTTP service.
func main() {
// Build the index first: search.TestIndex fills the search package's
// Files, Index and DocFreq variables (or exits the process on failure).
search.TestIndex()
// Start HTTP Server
http.StartService()
}

63
pkg/search/index.go Normal file
View File

@ -0,0 +1,63 @@
package search
import (
"bufio"
"log"
"os"
"strings"
)
// InvertedIndex maps a term to the files containing it, and for each file to
// the number of times the term occurs there.
// TODO: needs to be saved in a file or database (currently held in memory only).
type InvertedIndex map[string]map[string]int
// DocumentFrequency maps a term to the number of files in which it occurs.
type DocumentFrequency map[string]int
// Index is the package-level inverted index; it is assigned by TestIndex.
var Index InvertedIndex
// DocFreq is the package-level document-frequency table; it is assigned by TestIndex.
var DocFreq DocumentFrequency
// Files is the list of file paths that were indexed; it is assigned by TestIndex.
var Files []string
// BuildIndex reads files and builds an inverted index.
//
// Each file is split into whitespace-separated words; every word has leading
// and trailing ",.!?" characters trimmed and is lower-cased. The returned
// InvertedIndex counts occurrences per term and file, and the returned
// DocumentFrequency counts, per term, the files in which it occurs.
//
// If a file cannot be opened or read, both maps are returned as nil together
// with the error.
func BuildIndex(files []string) (InvertedIndex, DocumentFrequency, error) {
	index := make(InvertedIndex)
	docFreq := make(DocumentFrequency)

	for _, file := range files {
		if err := indexFile(file, index, docFreq); err != nil {
			return nil, nil, err
		}
	}
	return index, docFreq, nil
}

// indexFile tokenizes a single file and merges its term counts into index and docFreq.
//
// It is a separate function so that the deferred Close runs as soon as the
// file has been processed rather than when the whole corpus is done.
func indexFile(file string, index InvertedIndex, docFreq DocumentFrequency) error {
	f, err := os.Open(file)
	if err != nil {
		return err
	}
	defer f.Close()

	seenTerms := make(map[string]bool) // Track terms in this document

	scanner := bufio.NewScanner(f)
	scanner.Split(bufio.ScanWords)
	for scanner.Scan() {
		word := strings.ToLower(strings.Trim(scanner.Text(), ",.!?"))
		// A token such as ... that is nothing but trimmed punctuation normalizes to
		// the empty string; it is not a term and must not be indexed.
		if word == "" {
			continue
		}

		if index[word] == nil {
			index[word] = make(map[string]int)
		}
		index[word][file]++

		if !seenTerms[word] {
			docFreq[word]++
			seenTerms[word] = true
		}
	}
	// Scan stops silently on read errors and over-long tokens; surface them
	// instead of returning a partially built index as if it were complete.
	return scanner.Err()
}
// TestIndex indexes a fixed set of sample documents and publishes the result
// through the package-level Files, Index and DocFreq variables. If the index
// cannot be built, the failure is logged and the process exits.
func TestIndex() {
	// Sample corpus to index
	corpus := []string{"data/doc1.txt", "data/doc2.txt", "data/doc3.txt"}

	built, frequencies, err := BuildIndex(corpus)
	if err != nil {
		log.Fatalf("Error building index: %v", err)
	}

	Files, Index, DocFreq = corpus, built, frequencies
}

87
pkg/search/search.go Normal file
View File

@ -0,0 +1,87 @@
package search
import (
"math"
"sort"
)
// SearchResult stores the document and its relevance score.
type SearchResult struct {
// Document identifies the matched document; Search fills it with the
// document key used in the inverted index.
Document string
// Score is the relevance score computed for the document.
Score float64
}
// Search evaluates a query against the inverted index using TF-IDF scoring
// and returns the matching documents ranked by rankResults.
//
// For queryType "AND" only documents containing every term are scored, each
// against the full term list. For any other queryType ("OR" and simple
// queries behave the same) every document containing at least one term is
// included, and its score is the sum of the per-term scores.
func Search(terms []string, queryType string, index InvertedIndex, docFreq DocumentFrequency, numDocs int) []SearchResult {
	scores := make(map[string]float64)

	switch queryType {
	case "AND":
		// AND logic: restrict to documents in which all terms appear.
		for _, doc := range intersectDocs(terms, index) {
			scores[doc] = scoreDoc(terms, doc, index, docFreq, numDocs)
		}
	default:
		// OR logic and simple queries: accumulate the TF-IDF contribution of
		// each term for every document that contains it.
		for _, term := range terms {
			single := []string{term}
			for doc := range index[term] {
				scores[doc] += scoreDoc(single, doc, index, docFreq, numDocs)
			}
		}
	}

	return rankResults(scores)
}
// scoreDoc computes the TF-IDF score of a single document for the given terms.
//
// For each term the contribution is tf * ln(numDocs / df), where tf is the
// term's occurrence count in doc (from index) and df is the number of
// documents containing the term (from docFreq). The contributions are summed.
//
// Terms that do not occur in doc, or that have no recorded document
// frequency, contribute nothing. Without this guard a term with df == 0
// evaluates to 0 * ln(+Inf) = NaN and turns the whole score into NaN.
func scoreDoc(terms []string, doc string, index InvertedIndex, docFreq DocumentFrequency, numDocs int) float64 {
	score := 0.0
	for _, term := range terms {
		tf := index[term][doc] // indexing a missing term yields 0
		df := docFreq[term]
		if tf == 0 || df == 0 {
			continue
		}
		idf := math.Log(float64(numDocs) / float64(df))
		score += float64(tf) * idf
	}
	return score
}
// intersectDocs returns the documents that contain every one of the given
// terms according to index (the document set used for AND queries).
//
// It returns nil when terms is empty and an empty, non-nil slice when no
// document contains all terms. The order of the returned documents is not
// defined because it follows map iteration order.
func intersectDocs(terms []string, index InvertedIndex) []string {
	if len(terms) == 0 {
		return nil
	}

	first, rest := terms[0], terms[1:]
	matches := []string{}

	// Every candidate comes from the first term's postings; keep it only if
	// each remaining term also lists the document.
	for doc := range index[first] {
		inAll := true
		for _, term := range rest {
			if _, ok := index[term][doc]; !ok {
				inAll = false
				break
			}
		}
		if inAll {
			matches = append(matches, doc)
		}
	}
	return matches
}
// rankResults sorts the documents by score.
//
// It converts the score map into a slice of SearchResult ordered by
// descending score. Documents with equal scores are ordered by ascending
// document name so that the output is deterministic; without the tie-break
// the order of equal scores would vary between calls, since map iteration
// order is random and sort.Slice is not stable.
func rankResults(scores map[string]float64) []SearchResult {
	results := make([]SearchResult, 0, len(scores))
	for doc, score := range scores {
		results = append(results, SearchResult{Document: doc, Score: score})
	}

	sort.Slice(results, func(i, j int) bool {
		if results[i].Score != results[j].Score {
			return results[i].Score > results[j].Score
		}
		return results[i].Document < results[j].Document
	})

	return results
}