feat: Added search functionality with TF-IDF scoring and inverted index data structure for efficient document retrieval.
This commit is contained in:
parent
4c5a57d109
commit
8746f789c4
@ -1,9 +1,49 @@
|
|||||||
package controller
|
package controller
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"git.dev-null.rocks/alexohneander/gosearch/pkg/search"
|
||||||
"github.com/gofiber/fiber/v3"
|
"github.com/gofiber/fiber/v3"
|
||||||
)
|
)
|
||||||
|
|
||||||
func SearchQuery(c fiber.Ctx) error {
|
func SearchQuery(c fiber.Ctx) error {
|
||||||
return c.SendString("Hello, World!")
|
query := c.Params("query")
|
||||||
|
query = strings.TrimSpace(query)
|
||||||
|
|
||||||
|
terms, queryType := parseQuery(query)
|
||||||
|
results := search.Search(terms, queryType, search.Index, search.DocFreq, len(search.Files))
|
||||||
|
|
||||||
|
var response string
|
||||||
|
|
||||||
|
response = fmt.Sprintf("Search Results (%s query):\n", queryType)
|
||||||
|
for _, result := range results {
|
||||||
|
response = response + "\n" + fmt.Sprintf("- %s (Score: %.4f)\n", result.Document, result.Score)
|
||||||
|
}
|
||||||
|
|
||||||
|
return c.SendString(response)
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseQuery splits a raw query string into search terms and reports the
// query type: "AND" or "OR" when the query contains that operator as a
// standalone, space-delimited word, and "SIMPLE" otherwise.
//
// AND is checked before OR, so a query containing both operators is split on
// AND only. The operator must be surrounded by spaces; matching the bare
// substring would misclassify ordinary words such as "SANDWICH" or "ORANGE"
// and leave the whole query as a single, unmatchable term.
//
// Every returned term is trimmed and lower-cased, because the search index
// stores words in lower case. Empty terms are dropped.
func parseQuery(query string) ([]string, string) {
	switch {
	case strings.Contains(query, " AND "):
		return normalizeTerms(strings.Split(query, " AND ")), "AND"
	case strings.Contains(query, " OR "):
		return normalizeTerms(strings.Split(query, " OR ")), "OR"
	}
	return normalizeTerms(strings.Fields(query)), "SIMPLE"
}

// normalizeTerms trims surrounding whitespace from each raw term, lower-cases
// it and drops terms that end up empty.
func normalizeTerms(raw []string) []string {
	terms := make([]string, 0, len(raw))
	for _, r := range raw {
		term := strings.ToLower(strings.TrimSpace(r))
		if term != "" {
			terms = append(terms, term)
		}
	}
	return terms
}
|
||||||
|
|
||||||
|
// phraseMatch reports whether the terms, joined by single spaces, occur as one
// contiguous phrase in the file at path doc. The comparison is
// case-insensitive. It returns false when the file cannot be read. An empty
// terms slice yields the empty phrase, which matches any readable file.
func phraseMatch(terms []string, doc string) bool {
	// Read the full document content.
	content, err := os.ReadFile(doc)
	if err != nil {
		return false
	}

	// Lower-case the phrase as well as the content: lowering only the content
	// makes every phrase that contains an upper-case letter unmatchable.
	phrase := strings.ToLower(strings.Join(terms, " "))
	return strings.Contains(strings.ToLower(string(content)), phrase)
}
|
||||||
|
@ -12,7 +12,7 @@ func configureRoutes(app *fiber.App) *fiber.App {
|
|||||||
app.Get("/test", controller.Index)
|
app.Get("/test", controller.Index)
|
||||||
|
|
||||||
// Search
|
// Search
|
||||||
app.Get("/api/search/:index/:query", controller.SearchQuery)
|
app.Get("/api/search/:query", controller.SearchQuery)
|
||||||
|
|
||||||
// Monitor
|
// Monitor
|
||||||
// app.Get("/metrics", monitor.New(monitor.Config{Title: "MyService Metrics Page"}))
|
// app.Get("/metrics", monitor.New(monitor.Config{Title: "MyService Metrics Page"}))
|
||||||
|
7
main.go
7
main.go
@ -1,8 +1,13 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import "git.dev-null.rocks/alexohneander/gosearch/internal/http"
|
import (
|
||||||
|
"git.dev-null.rocks/alexohneander/gosearch/internal/http"
|
||||||
|
"git.dev-null.rocks/alexohneander/gosearch/pkg/search"
|
||||||
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
search.TestIndex()
|
||||||
|
|
||||||
// Start HTTP Server
|
// Start HTTP Server
|
||||||
http.StartService()
|
http.StartService()
|
||||||
}
|
}
|
||||||
|
63
pkg/search/index.go
Normal file
63
pkg/search/index.go
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
package search
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"log"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// NOTE: the index below lives only in memory; it still needs to be saved in a
// file or database.

type (
	// InvertedIndex maps a term to the documents that contain it, and each of
	// those documents to the number of times the term occurs in it.
	InvertedIndex map[string]map[string]int

	// DocumentFrequency maps a term to the number of documents containing it.
	DocumentFrequency map[string]int
)

// Package-level search state, assigned by TestIndex.
var (
	// Index is the inverted index over Files.
	Index InvertedIndex
	// DocFreq holds the per-term document frequencies for Index.
	DocFreq DocumentFrequency
	// Files lists the paths of the indexed documents.
	Files []string
)
|
||||||
|
|
||||||
|
// BuildIndex reads files and builds an inverted index.
|
||||||
|
func BuildIndex(files []string) (InvertedIndex, DocumentFrequency, error) {
|
||||||
|
index := make(InvertedIndex)
|
||||||
|
docFreq := make(DocumentFrequency)
|
||||||
|
|
||||||
|
for _, file := range files {
|
||||||
|
f, err := os.Open(file)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
seenTerms := make(map[string]bool) // Track terms in this document
|
||||||
|
scanner := bufio.NewScanner(f)
|
||||||
|
scanner.Split(bufio.ScanWords)
|
||||||
|
|
||||||
|
for scanner.Scan() {
|
||||||
|
word := strings.ToLower(strings.Trim(scanner.Text(), ",.!?"))
|
||||||
|
|
||||||
|
if index[word] == nil {
|
||||||
|
index[word] = make(map[string]int)
|
||||||
|
}
|
||||||
|
index[word][file]++
|
||||||
|
|
||||||
|
if !seenTerms[word] {
|
||||||
|
docFreq[word]++
|
||||||
|
seenTerms[word] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return index, docFreq, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestIndex() {
|
||||||
|
// Index files
|
||||||
|
files := []string{"data/doc1.txt", "data/doc2.txt", "data/doc3.txt"}
|
||||||
|
|
||||||
|
index, docFreq, err := BuildIndex(files)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("Error building index: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
Files = files
|
||||||
|
Index = index
|
||||||
|
DocFreq = docFreq
|
||||||
|
}
|
87
pkg/search/search.go
Normal file
87
pkg/search/search.go
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
package search
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math"
|
||||||
|
"sort"
|
||||||
|
)
|
||||||
|
|
||||||
|
// SearchResult pairs a document with the relevance score computed for it.
type SearchResult struct {
	Document string  // document key as it appears in the inverted index
	Score    float64 // relevance score; rankResults puts higher scores first
}
|
||||||
|
|
||||||
|
// Search processes different types of queries using TF-IDF scoring.
|
||||||
|
func Search(terms []string, queryType string, index InvertedIndex, docFreq DocumentFrequency, numDocs int) []SearchResult {
|
||||||
|
scores := make(map[string]float64)
|
||||||
|
|
||||||
|
if queryType == "AND" {
|
||||||
|
// Ensure all terms appear in the document (AND logic)
|
||||||
|
for _, doc := range intersectDocs(terms, index) {
|
||||||
|
scores[doc] = scoreDoc(terms, doc, index, docFreq, numDocs)
|
||||||
|
}
|
||||||
|
} else if queryType == "OR" {
|
||||||
|
// Include any document that contains at least one of the terms (OR logic)
|
||||||
|
for _, term := range terms {
|
||||||
|
for doc := range index[term] {
|
||||||
|
scores[doc] += scoreDoc([]string{term}, doc, index, docFreq, numDocs)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Simple query - score documents based on TF-IDF for any terms
|
||||||
|
for _, term := range terms {
|
||||||
|
for doc := range index[term] {
|
||||||
|
scores[doc] += scoreDoc([]string{term}, doc, index, docFreq, numDocs)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return rankResults(scores)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper function to score a single document based on terms
|
||||||
|
func scoreDoc(terms []string, doc string, index InvertedIndex, docFreq DocumentFrequency, numDocs int) float64 {
|
||||||
|
score := 0.0
|
||||||
|
for _, term := range terms {
|
||||||
|
tf := float64(index[term][doc])
|
||||||
|
idf := math.Log(float64(numDocs) / float64(docFreq[term]))
|
||||||
|
score += tf * idf
|
||||||
|
//fmt.Printf("Score: %f64 %f64 %f64\n", tf, idf, score)
|
||||||
|
}
|
||||||
|
return score
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper function to intersect documents for AND logic
|
||||||
|
func intersectDocs(terms []string, index InvertedIndex) []string {
|
||||||
|
if len(terms) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
docs := make(map[string]bool)
|
||||||
|
for doc := range index[terms[0]] {
|
||||||
|
docs[doc] = true
|
||||||
|
}
|
||||||
|
for _, term := range terms[1:] {
|
||||||
|
for doc := range docs {
|
||||||
|
if _, exists := index[term][doc]; !exists {
|
||||||
|
delete(docs, doc)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result := []string{}
|
||||||
|
for doc := range docs {
|
||||||
|
result = append(result, doc)
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
// rankResults sorts the documents by score
|
||||||
|
func rankResults(scores map[string]float64) []SearchResult {
|
||||||
|
results := make([]SearchResult, 0, len(scores))
|
||||||
|
for doc, score := range scores {
|
||||||
|
results = append(results, SearchResult{Document: doc, Score: score})
|
||||||
|
}
|
||||||
|
sort.Slice(results, func(i, j int) bool {
|
||||||
|
return results[i].Score > results[j].Score
|
||||||
|
})
|
||||||
|
return results
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user