Major changes:
- Add Go backend (backend/) with microservices architecture
- Enhanced master-agents-svc: reranker, content-classifier, stealth-crawler, proxy-manager, media-search, fastClassifier, language detection
- New web-svc widgets: KnowledgeCard, ProductCard, ProfileCard, VideoCard, UnifiedCard, CardGallery, InlineImageGallery, SourcesPanel, RelatedQuestions
- Improved discover-svc with discover-db integration
- Docker deployment improvements (Caddyfile, vendor.sh, BUILD.md)
- Library-svc: project_id schema migration
- Remove deprecated finance-svc and travel-svc
- Localization improvements across services

Made-with: Cursor
164 lines
3.0 KiB
Go
164 lines
3.0 KiB
Go
package search
|
|
|
|
import (
|
|
"math"
|
|
"sort"
|
|
"strings"
|
|
"unicode"
|
|
|
|
"github.com/gooseek/backend/internal/types"
|
|
)
|
|
|
|
type RankedItem struct {
|
|
Chunk types.Chunk
|
|
Score float64
|
|
}
|
|
|
|
func RerankBM25(chunks []types.Chunk, query string, topK int) []types.Chunk {
|
|
if len(chunks) == 0 {
|
|
return chunks
|
|
}
|
|
|
|
queryTerms := tokenize(query)
|
|
if len(queryTerms) == 0 {
|
|
return chunks
|
|
}
|
|
|
|
df := make(map[string]int)
|
|
for _, chunk := range chunks {
|
|
seen := make(map[string]bool)
|
|
terms := tokenize(chunk.Content + " " + chunk.Metadata["title"])
|
|
for _, term := range terms {
|
|
if !seen[term] {
|
|
df[term]++
|
|
seen[term] = true
|
|
}
|
|
}
|
|
}
|
|
|
|
avgDocLen := 0.0
|
|
for _, chunk := range chunks {
|
|
avgDocLen += float64(len(tokenize(chunk.Content)))
|
|
}
|
|
avgDocLen /= float64(len(chunks))
|
|
|
|
k1 := 1.5
|
|
b := 0.75
|
|
n := float64(len(chunks))
|
|
|
|
ranked := make([]RankedItem, len(chunks))
|
|
for i, chunk := range chunks {
|
|
docTerms := tokenize(chunk.Content + " " + chunk.Metadata["title"])
|
|
docLen := float64(len(docTerms))
|
|
|
|
tf := make(map[string]int)
|
|
for _, term := range docTerms {
|
|
tf[term]++
|
|
}
|
|
|
|
score := 0.0
|
|
for _, qterm := range queryTerms {
|
|
if termFreq, ok := tf[qterm]; ok {
|
|
docFreq := float64(df[qterm])
|
|
idf := math.Log((n - docFreq + 0.5) / (docFreq + 0.5))
|
|
if idf < 0 {
|
|
idf = 0
|
|
}
|
|
|
|
tfNorm := float64(termFreq) * (k1 + 1) /
|
|
(float64(termFreq) + k1*(1-b+b*docLen/avgDocLen))
|
|
|
|
score += idf * tfNorm
|
|
}
|
|
}
|
|
|
|
if title, ok := chunk.Metadata["title"]; ok {
|
|
titleLower := strings.ToLower(title)
|
|
for _, qterm := range queryTerms {
|
|
if strings.Contains(titleLower, qterm) {
|
|
score += 2.0
|
|
}
|
|
}
|
|
}
|
|
|
|
ranked[i] = RankedItem{Chunk: chunk, Score: score}
|
|
}
|
|
|
|
sort.Slice(ranked, func(i, j int) bool {
|
|
return ranked[i].Score > ranked[j].Score
|
|
})
|
|
|
|
if topK > len(ranked) {
|
|
topK = len(ranked)
|
|
}
|
|
|
|
result := make([]types.Chunk, topK)
|
|
for i := 0; i < topK; i++ {
|
|
result[i] = ranked[i].Chunk
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// tokenize lower-cases text and splits it into maximal runs of Unicode
// letters and digits. Every other rune acts as a separator. Runs shorter than
// two characters are discarded.
//
// Length is measured in runes, not bytes, so a one-letter word is dropped
// regardless of script (a single Cyrillic letter occupies two bytes in UTF-8
// and would otherwise pass a byte-based length check).
//
// The returned tokens are substrings of the lower-cased copy of text. The
// result is nil when no token qualifies.
func tokenize(text string) []string {
	const minTokenRunes = 2

	text = strings.ToLower(text)

	var tokens []string
	start := -1 // byte offset where the current run begins; -1 when outside a run
	runes := 0  // number of runes in the current run

	for i, r := range text {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			if start < 0 {
				start = i
			}
			runes++
			continue
		}

		if runes >= minTokenRunes {
			tokens = append(tokens, text[start:i])
		}
		start, runes = -1, 0
	}

	// Flush a run that extends to the end of the input.
	if runes >= minTokenRunes {
		tokens = append(tokens, text[start:])
	}

	return tokens
}
|
|
|
|
func EstimateQueryComplexity(query string) float64 {
|
|
terms := tokenize(query)
|
|
complexity := float64(len(terms)) / 5.0
|
|
|
|
if strings.Contains(query, "?") {
|
|
complexity += 0.2
|
|
}
|
|
if strings.Contains(query, " и ") || strings.Contains(query, " или ") {
|
|
complexity += 0.3
|
|
}
|
|
|
|
if complexity > 1.0 {
|
|
complexity = 1.0
|
|
}
|
|
return complexity
|
|
}
|
|
|
|
// ComputeAdaptiveTopK picks how many results to keep for a query.
//
// The base size depends on mode: 10 for "speed", 20 for "balanced", 30 for
// "quality", and 15 for any other value. The base is scaled by
// (1 + complexity*0.5) and truncated toward zero, then limited to
// totalResults. The returned value is never negative, so it is always safe to
// use as a slice length even when totalResults or complexity is negative.
func ComputeAdaptiveTopK(totalResults int, complexity float64, mode string) int {
	baseK := 15

	switch mode {
	case "speed":
		baseK = 10
	case "balanced":
		baseK = 20
	case "quality":
		baseK = 30
	}

	adaptiveK := int(float64(baseK) * (1 + complexity*0.5))

	if adaptiveK > totalResults {
		adaptiveK = totalResults
	}
	// Guard against negative inputs producing a negative count.
	if adaptiveK < 0 {
		adaptiveK = 0
	}

	return adaptiveK
}
|