// Package search provides BM25-based reranking of retrieved chunks and
// query-complexity heuristics used to size result sets adaptively.
package search

import (
	"math"
	"sort"
	"strings"
	"unicode"

	"github.com/gooseek/backend/internal/types"
)

// RankedItem pairs a chunk with its computed relevance score.
type RankedItem struct {
	Chunk types.Chunk
	Score float64
}

// RerankBM25 scores chunks against query with Okapi BM25 (k1=1.5, b=0.75)
// and returns the topK highest-scoring chunks in descending score order.
// Both chunk content and the "title" metadata field are indexed; each query
// term found as a substring of the lowercased title adds a flat 2.0 bonus.
// The input is returned unchanged when chunks is empty or the query
// tokenizes to nothing.
func RerankBM25(chunks []types.Chunk, query string, topK int) []types.Chunk {
	if len(chunks) == 0 {
		return chunks
	}
	queryTerms := tokenize(query)
	if len(queryTerms) == 0 {
		return chunks
	}

	// Tokenize each document exactly once (content + title) and reuse the
	// tokens for document frequencies, average length, and scoring.
	// NOTE(fix): the average document length is now measured on the same
	// token stream as docLen below; previously it used content only.
	docTokens := make([][]string, len(chunks))
	df := make(map[string]int) // term -> number of documents containing it
	avgDocLen := 0.0
	for i, chunk := range chunks {
		terms := tokenize(chunk.Content + " " + chunk.Metadata["title"])
		docTokens[i] = terms
		avgDocLen += float64(len(terms))
		seen := make(map[string]bool, len(terms))
		for _, term := range terms {
			if !seen[term] {
				df[term]++
				seen[term] = true
			}
		}
	}
	avgDocLen /= float64(len(chunks))
	if avgDocLen == 0 {
		avgDocLen = 1 // guard: all-empty corpus would divide by zero below
	}

	const (
		k1 = 1.5  // term-frequency saturation
		b  = 0.75 // document-length normalization strength
	)
	n := float64(len(chunks))

	ranked := make([]RankedItem, len(chunks))
	for i, chunk := range chunks {
		terms := docTokens[i]
		docLen := float64(len(terms))
		tf := make(map[string]int, len(terms))
		for _, term := range terms {
			tf[term]++
		}

		score := 0.0
		for _, qterm := range queryTerms {
			termFreq, ok := tf[qterm]
			if !ok {
				continue
			}
			docFreq := float64(df[qterm])
			// Clamp IDF at zero: a term present in more than half the
			// corpus would otherwise contribute a negative weight.
			idf := math.Log((n - docFreq + 0.5) / (docFreq + 0.5))
			if idf < 0 {
				idf = 0
			}
			tfNorm := float64(termFreq) * (k1 + 1) /
				(float64(termFreq) + k1*(1-b+b*docLen/avgDocLen))
			score += idf * tfNorm
		}

		// Flat bonus per query term appearing in the title.
		if title, ok := chunk.Metadata["title"]; ok {
			titleLower := strings.ToLower(title)
			for _, qterm := range queryTerms {
				if strings.Contains(titleLower, qterm) {
					score += 2.0
				}
			}
		}
		ranked[i] = RankedItem{Chunk: chunk, Score: score}
	}

	sort.Slice(ranked, func(i, j int) bool {
		return ranked[i].Score > ranked[j].Score
	})

	if topK > len(ranked) {
		topK = len(ranked)
	}
	if topK < 0 {
		topK = 0 // guard: a negative topK would panic in make below
	}
	result := make([]types.Chunk, topK)
	for i := range result {
		result[i] = ranked[i].Chunk
	}
	return result
}

// tokenize lowercases text and splits it into maximal runs of letters and
// digits, dropping tokens shorter than 2 runes. Rune count (not byte
// count) is used so single non-ASCII letters such as Cyrillic "и" are
// filtered out like their ASCII counterparts.
func tokenize(text string) []string {
	text = strings.ToLower(text)
	var tokens []string
	var current strings.Builder
	runeCount := 0
	flush := func() {
		if runeCount >= 2 {
			tokens = append(tokens, current.String())
		}
		current.Reset()
		runeCount = 0
	}
	for _, r := range text {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			current.WriteRune(r)
			runeCount++
		} else {
			flush()
		}
	}
	flush() // emit a trailing token, if any
	return tokens
}

// EstimateQueryComplexity returns a heuristic complexity score in [0, 1]
// for query: 0.2 per token, +0.2 if it contains a question mark, +0.3 if
// it contains the Russian conjunctions " и " or " или ", capped at 1.0.
func EstimateQueryComplexity(query string) float64 {
	complexity := float64(len(tokenize(query))) / 5.0
	if strings.Contains(query, "?") {
		complexity += 0.2
	}
	if strings.Contains(query, " и ") || strings.Contains(query, " или ") {
		complexity += 0.3
	}
	return math.Min(complexity, 1.0)
}

// ComputeAdaptiveTopK derives a result-set size from the retrieval mode
// ("speed"=10, "balanced"=20, "quality"=30; any other mode uses 15),
// scaled up by as much as 50% according to complexity, and clamped to
// totalResults.
func ComputeAdaptiveTopK(totalResults int, complexity float64, mode string) int {
	baseK := 15
	switch mode {
	case "speed":
		baseK = 10
	case "balanced":
		baseK = 20
	case "quality":
		baseK = 30
	}
	adaptiveK := int(float64(baseK) * (1 + complexity*0.5))
	if adaptiveK > totalResults {
		adaptiveK = totalResults
	}
	return adaptiveK
}