feat: Go backend, enhanced search, new widgets, Docker deploy
Major changes: - Add Go backend (backend/) with microservices architecture - Enhanced master-agents-svc: reranker, content-classifier, stealth-crawler, proxy-manager, media-search, fastClassifier, language detection - New web-svc widgets: KnowledgeCard, ProductCard, ProfileCard, VideoCard, UnifiedCard, CardGallery, InlineImageGallery, SourcesPanel, RelatedQuestions - Improved discover-svc with discover-db integration - Docker deployment improvements (Caddyfile, vendor.sh, BUILD.md) - Library-svc: project_id schema migration - Remove deprecated finance-svc and travel-svc - Localization improvements across services Made-with: Cursor
This commit is contained in:
163
backend/internal/search/reranker.go
Normal file
163
backend/internal/search/reranker.go
Normal file
@@ -0,0 +1,163 @@
|
||||
package search
|
||||
|
||||
import (
|
||||
"math"
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode"
|
||||
|
||||
"github.com/gooseek/backend/internal/types"
|
||||
)
|
||||
|
||||
// RankedItem pairs a retrieved chunk with its BM25 relevance score so that
// chunks can be sorted by score before the top results are returned.
type RankedItem struct {
	Chunk types.Chunk
	Score float64
}
|
||||
|
||||
func RerankBM25(chunks []types.Chunk, query string, topK int) []types.Chunk {
|
||||
if len(chunks) == 0 {
|
||||
return chunks
|
||||
}
|
||||
|
||||
queryTerms := tokenize(query)
|
||||
if len(queryTerms) == 0 {
|
||||
return chunks
|
||||
}
|
||||
|
||||
df := make(map[string]int)
|
||||
for _, chunk := range chunks {
|
||||
seen := make(map[string]bool)
|
||||
terms := tokenize(chunk.Content + " " + chunk.Metadata["title"])
|
||||
for _, term := range terms {
|
||||
if !seen[term] {
|
||||
df[term]++
|
||||
seen[term] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
avgDocLen := 0.0
|
||||
for _, chunk := range chunks {
|
||||
avgDocLen += float64(len(tokenize(chunk.Content)))
|
||||
}
|
||||
avgDocLen /= float64(len(chunks))
|
||||
|
||||
k1 := 1.5
|
||||
b := 0.75
|
||||
n := float64(len(chunks))
|
||||
|
||||
ranked := make([]RankedItem, len(chunks))
|
||||
for i, chunk := range chunks {
|
||||
docTerms := tokenize(chunk.Content + " " + chunk.Metadata["title"])
|
||||
docLen := float64(len(docTerms))
|
||||
|
||||
tf := make(map[string]int)
|
||||
for _, term := range docTerms {
|
||||
tf[term]++
|
||||
}
|
||||
|
||||
score := 0.0
|
||||
for _, qterm := range queryTerms {
|
||||
if termFreq, ok := tf[qterm]; ok {
|
||||
docFreq := float64(df[qterm])
|
||||
idf := math.Log((n - docFreq + 0.5) / (docFreq + 0.5))
|
||||
if idf < 0 {
|
||||
idf = 0
|
||||
}
|
||||
|
||||
tfNorm := float64(termFreq) * (k1 + 1) /
|
||||
(float64(termFreq) + k1*(1-b+b*docLen/avgDocLen))
|
||||
|
||||
score += idf * tfNorm
|
||||
}
|
||||
}
|
||||
|
||||
if title, ok := chunk.Metadata["title"]; ok {
|
||||
titleLower := strings.ToLower(title)
|
||||
for _, qterm := range queryTerms {
|
||||
if strings.Contains(titleLower, qterm) {
|
||||
score += 2.0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ranked[i] = RankedItem{Chunk: chunk, Score: score}
|
||||
}
|
||||
|
||||
sort.Slice(ranked, func(i, j int) bool {
|
||||
return ranked[i].Score > ranked[j].Score
|
||||
})
|
||||
|
||||
if topK > len(ranked) {
|
||||
topK = len(ranked)
|
||||
}
|
||||
|
||||
result := make([]types.Chunk, topK)
|
||||
for i := 0; i < topK; i++ {
|
||||
result[i] = ranked[i].Chunk
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// tokenize lowercases text and splits it into maximal runs of letters and
// digits, discarding runs shorter than two characters.
//
// NOTE: the minimum length is measured in BYTES (strings.Builder.Len), so a
// single multi-byte rune — e.g. Cyrillic "и" — still forms a valid token.
func tokenize(text string) []string {
	var tokens []string
	var word strings.Builder

	// flush appends the accumulated run as a token when long enough,
	// then clears the accumulator for the next run.
	flush := func() {
		if word.Len() >= 2 {
			tokens = append(tokens, word.String())
		}
		word.Reset()
	}

	for _, r := range strings.ToLower(text) {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			word.WriteRune(r)
			continue
		}
		flush()
	}
	flush() // the text may end mid-run

	return tokens
}
|
||||
|
||||
func EstimateQueryComplexity(query string) float64 {
|
||||
terms := tokenize(query)
|
||||
complexity := float64(len(terms)) / 5.0
|
||||
|
||||
if strings.Contains(query, "?") {
|
||||
complexity += 0.2
|
||||
}
|
||||
if strings.Contains(query, " и ") || strings.Contains(query, " или ") {
|
||||
complexity += 0.3
|
||||
}
|
||||
|
||||
if complexity > 1.0 {
|
||||
complexity = 1.0
|
||||
}
|
||||
return complexity
|
||||
}
|
||||
|
||||
// ComputeAdaptiveTopK decides how many reranked results to keep. The base
// budget depends on the speed/quality mode (unknown modes fall back to 15),
// grows by up to 50% with query complexity, and is capped at totalResults.
func ComputeAdaptiveTopK(totalResults int, complexity float64, mode string) int {
	var baseK int
	switch mode {
	case "speed":
		baseK = 10
	case "balanced":
		baseK = 20
	case "quality":
		baseK = 30
	default:
		baseK = 15
	}

	// Scale by 1.0–1.5 depending on complexity, truncating toward zero.
	k := int(float64(baseK) * (1 + complexity*0.5))
	if k > totalResults {
		return totalResults
	}
	return k
}
|
||||
Reference in New Issue
Block a user