feat: Go backend, enhanced search, new widgets, Docker deploy
Major changes:
- Add Go backend (backend/) with microservices architecture
- Enhanced master-agents-svc: reranker, content-classifier, stealth-crawler, proxy-manager, media-search, fastClassifier, language detection
- New web-svc widgets: KnowledgeCard, ProductCard, ProfileCard, VideoCard, UnifiedCard, CardGallery, InlineImageGallery, SourcesPanel, RelatedQuestions
- Improved discover-svc with discover-db integration
- Docker deployment improvements (Caddyfile, vendor.sh, BUILD.md)
- Library-svc: project_id schema migration
- Remove deprecated finance-svc and travel-svc
- Localization improvements across services

Made-with: Cursor
This commit is contained in:
215
backend/internal/search/media.go
Normal file
215
backend/internal/search/media.go
Normal file
@@ -0,0 +1,215 @@
|
||||
package search
|
||||
|
||||
import (
|
||||
"context"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/gooseek/backend/internal/types"
|
||||
)
|
||||
|
||||
// MediaSearchOptions limits how many media items SearchMedia collects.
type MediaSearchOptions struct {
	// MaxImages caps the number of image results returned.
	MaxImages int
	// MaxVideos caps the number of video results returned.
	MaxVideos int
}
|
||||
|
||||
// MediaSearchResult bundles the image and video results of a media
// search. Both slices are always non-nil so they encode as JSON arrays.
type MediaSearchResult struct {
	Images []types.ImageData `json:"images"`
	Videos []types.VideoData `json:"videos"`
}
|
||||
|
||||
func (c *SearXNGClient) SearchMedia(ctx context.Context, query string, opts *MediaSearchOptions) (*MediaSearchResult, error) {
|
||||
if opts == nil {
|
||||
opts = &MediaSearchOptions{MaxImages: 8, MaxVideos: 6}
|
||||
}
|
||||
|
||||
result := &MediaSearchResult{
|
||||
Images: make([]types.ImageData, 0),
|
||||
Videos: make([]types.VideoData, 0),
|
||||
}
|
||||
|
||||
imageCh := make(chan []types.ImageData, 1)
|
||||
videoCh := make(chan []types.VideoData, 1)
|
||||
errCh := make(chan error, 2)
|
||||
|
||||
go func() {
|
||||
images, err := c.searchImages(ctx, query, opts.MaxImages)
|
||||
if err != nil {
|
||||
errCh <- err
|
||||
imageCh <- nil
|
||||
return
|
||||
}
|
||||
errCh <- nil
|
||||
imageCh <- images
|
||||
}()
|
||||
|
||||
go func() {
|
||||
videos, err := c.searchVideos(ctx, query, opts.MaxVideos)
|
||||
if err != nil {
|
||||
errCh <- err
|
||||
videoCh <- nil
|
||||
return
|
||||
}
|
||||
errCh <- nil
|
||||
videoCh <- videos
|
||||
}()
|
||||
|
||||
<-errCh
|
||||
<-errCh
|
||||
result.Images = <-imageCh
|
||||
result.Videos = <-videoCh
|
||||
|
||||
if result.Images == nil {
|
||||
result.Images = make([]types.ImageData, 0)
|
||||
}
|
||||
if result.Videos == nil {
|
||||
result.Videos = make([]types.VideoData, 0)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (c *SearXNGClient) searchImages(ctx context.Context, query string, max int) ([]types.ImageData, error) {
|
||||
resp, err := c.Search(ctx, query, &SearchOptions{
|
||||
Categories: []string{"images"},
|
||||
PageNo: 1,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
images := make([]types.ImageData, 0, max)
|
||||
seen := make(map[string]bool)
|
||||
|
||||
for _, r := range resp.Results {
|
||||
if len(images) >= max {
|
||||
break
|
||||
}
|
||||
|
||||
imgURL := r.ImgSrc
|
||||
if imgURL == "" {
|
||||
imgURL = r.ThumbnailSrc
|
||||
}
|
||||
if imgURL == "" {
|
||||
imgURL = r.Thumbnail
|
||||
}
|
||||
if imgURL == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
if seen[imgURL] {
|
||||
continue
|
||||
}
|
||||
seen[imgURL] = true
|
||||
|
||||
images = append(images, types.ImageData{
|
||||
URL: imgURL,
|
||||
Title: r.Title,
|
||||
Source: extractDomain(r.URL),
|
||||
SourceURL: r.URL,
|
||||
})
|
||||
}
|
||||
|
||||
return images, nil
|
||||
}
|
||||
|
||||
func (c *SearXNGClient) searchVideos(ctx context.Context, query string, max int) ([]types.VideoData, error) {
|
||||
resp, err := c.Search(ctx, query, &SearchOptions{
|
||||
Categories: []string{"videos"},
|
||||
PageNo: 1,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
videos := make([]types.VideoData, 0, max)
|
||||
seen := make(map[string]bool)
|
||||
|
||||
for _, r := range resp.Results {
|
||||
if len(videos) >= max {
|
||||
break
|
||||
}
|
||||
|
||||
if seen[r.URL] {
|
||||
continue
|
||||
}
|
||||
seen[r.URL] = true
|
||||
|
||||
platform := detectVideoPlatform(r.URL)
|
||||
|
||||
video := types.VideoData{
|
||||
Title: r.Title,
|
||||
URL: r.URL,
|
||||
Thumbnail: r.Thumbnail,
|
||||
Duration: toInt(r.Duration),
|
||||
Views: toInt(r.Views),
|
||||
Author: r.Author,
|
||||
Platform: platform,
|
||||
EmbedURL: r.IframeSrc,
|
||||
}
|
||||
|
||||
videos = append(videos, video)
|
||||
}
|
||||
|
||||
return videos, nil
|
||||
}
|
||||
|
||||
// Host patterns for the video platforms we recognize.
var (
	youtubePattern = regexp.MustCompile(`youtube\.com|youtu\.be`)
	rutubePattern  = regexp.MustCompile(`rutube\.ru`)
	vkPattern      = regexp.MustCompile(`vk\.com`)
	dzenPattern    = regexp.MustCompile(`dzen\.ru`)
)

// detectVideoPlatform maps a video URL to a short platform tag
// ("youtube", "rutube", "vk", "dzen"), or "other" when the host is not
// recognized. Matching is case-insensitive.
func detectVideoPlatform(url string) string {
	lowered := strings.ToLower(url)

	switch {
	case youtubePattern.MatchString(lowered):
		return "youtube"
	case rutubePattern.MatchString(lowered):
		return "rutube"
	case vkPattern.MatchString(lowered):
		return "vk"
	case dzenPattern.MatchString(lowered):
		return "dzen"
	default:
		return "other"
	}
}
|
||||
|
||||
// extractDomain reduces a URL to its bare host: the http/https scheme
// and a leading "www." are stripped, and everything from the first "/"
// onward is dropped. Strings that start with "/" after stripping are
// returned unchanged.
func extractDomain(rawURL string) string {
	host := strings.TrimPrefix(rawURL, "https://")
	host = strings.TrimPrefix(host, "http://")
	host = strings.TrimPrefix(host, "www.")

	// Truncate at the first slash only when it is not the first byte.
	if before, _, found := strings.Cut(host, "/"); found && before != "" {
		host = before
	}

	return host
}
|
||||
|
||||
// toInt best-effort converts a JSON-decoded scalar to int.
// Supported inputs: nil (0), int, int64, float64 (truncated), and
// numeric strings — parsed as an integer first, then as a float
// truncated toward zero, so values like "123.0" (which some engines
// emit for counts) are accepted. Anything else yields 0.
func toInt(v interface{}) int {
	switch val := v.(type) {
	case nil:
		return 0
	case int:
		return val
	case int64:
		return int(val)
	case float64:
		return int(val)
	case string:
		if i, err := strconv.Atoi(val); err == nil {
			return i
		}
		// Fall back to float parsing for "123.0"-style strings.
		if f, err := strconv.ParseFloat(val, 64); err == nil {
			return int(f)
		}
		return 0
	default:
		return 0
	}
}
|
||||
163
backend/internal/search/reranker.go
Normal file
163
backend/internal/search/reranker.go
Normal file
@@ -0,0 +1,163 @@
|
||||
package search
|
||||
|
||||
import (
|
||||
"math"
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode"
|
||||
|
||||
"github.com/gooseek/backend/internal/types"
|
||||
)
|
||||
|
||||
// RankedItem pairs a chunk with its computed BM25 relevance score.
type RankedItem struct {
	Chunk types.Chunk
	Score float64
}
|
||||
|
||||
func RerankBM25(chunks []types.Chunk, query string, topK int) []types.Chunk {
|
||||
if len(chunks) == 0 {
|
||||
return chunks
|
||||
}
|
||||
|
||||
queryTerms := tokenize(query)
|
||||
if len(queryTerms) == 0 {
|
||||
return chunks
|
||||
}
|
||||
|
||||
df := make(map[string]int)
|
||||
for _, chunk := range chunks {
|
||||
seen := make(map[string]bool)
|
||||
terms := tokenize(chunk.Content + " " + chunk.Metadata["title"])
|
||||
for _, term := range terms {
|
||||
if !seen[term] {
|
||||
df[term]++
|
||||
seen[term] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
avgDocLen := 0.0
|
||||
for _, chunk := range chunks {
|
||||
avgDocLen += float64(len(tokenize(chunk.Content)))
|
||||
}
|
||||
avgDocLen /= float64(len(chunks))
|
||||
|
||||
k1 := 1.5
|
||||
b := 0.75
|
||||
n := float64(len(chunks))
|
||||
|
||||
ranked := make([]RankedItem, len(chunks))
|
||||
for i, chunk := range chunks {
|
||||
docTerms := tokenize(chunk.Content + " " + chunk.Metadata["title"])
|
||||
docLen := float64(len(docTerms))
|
||||
|
||||
tf := make(map[string]int)
|
||||
for _, term := range docTerms {
|
||||
tf[term]++
|
||||
}
|
||||
|
||||
score := 0.0
|
||||
for _, qterm := range queryTerms {
|
||||
if termFreq, ok := tf[qterm]; ok {
|
||||
docFreq := float64(df[qterm])
|
||||
idf := math.Log((n - docFreq + 0.5) / (docFreq + 0.5))
|
||||
if idf < 0 {
|
||||
idf = 0
|
||||
}
|
||||
|
||||
tfNorm := float64(termFreq) * (k1 + 1) /
|
||||
(float64(termFreq) + k1*(1-b+b*docLen/avgDocLen))
|
||||
|
||||
score += idf * tfNorm
|
||||
}
|
||||
}
|
||||
|
||||
if title, ok := chunk.Metadata["title"]; ok {
|
||||
titleLower := strings.ToLower(title)
|
||||
for _, qterm := range queryTerms {
|
||||
if strings.Contains(titleLower, qterm) {
|
||||
score += 2.0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ranked[i] = RankedItem{Chunk: chunk, Score: score}
|
||||
}
|
||||
|
||||
sort.Slice(ranked, func(i, j int) bool {
|
||||
return ranked[i].Score > ranked[j].Score
|
||||
})
|
||||
|
||||
if topK > len(ranked) {
|
||||
topK = len(ranked)
|
||||
}
|
||||
|
||||
result := make([]types.Chunk, topK)
|
||||
for i := 0; i < topK; i++ {
|
||||
result[i] = ranked[i].Chunk
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
func tokenize(text string) []string {
|
||||
text = strings.ToLower(text)
|
||||
|
||||
var tokens []string
|
||||
var current strings.Builder
|
||||
|
||||
for _, r := range text {
|
||||
if unicode.IsLetter(r) || unicode.IsDigit(r) {
|
||||
current.WriteRune(r)
|
||||
} else {
|
||||
if current.Len() >= 2 {
|
||||
tokens = append(tokens, current.String())
|
||||
}
|
||||
current.Reset()
|
||||
}
|
||||
}
|
||||
|
||||
if current.Len() >= 2 {
|
||||
tokens = append(tokens, current.String())
|
||||
}
|
||||
|
||||
return tokens
|
||||
}
|
||||
|
||||
func EstimateQueryComplexity(query string) float64 {
|
||||
terms := tokenize(query)
|
||||
complexity := float64(len(terms)) / 5.0
|
||||
|
||||
if strings.Contains(query, "?") {
|
||||
complexity += 0.2
|
||||
}
|
||||
if strings.Contains(query, " и ") || strings.Contains(query, " или ") {
|
||||
complexity += 0.3
|
||||
}
|
||||
|
||||
if complexity > 1.0 {
|
||||
complexity = 1.0
|
||||
}
|
||||
return complexity
|
||||
}
|
||||
|
||||
// ComputeAdaptiveTopK decides how many results to keep. A base value
// depends on mode ("speed" 10, "balanced" 20, "quality" 30, anything
// else 15); it is scaled up by as much as 50% as complexity approaches
// 1, then capped at totalResults.
func ComputeAdaptiveTopK(totalResults int, complexity float64, mode string) int {
	base := 15
	switch mode {
	case "speed":
		base = 10
	case "balanced":
		base = 20
	case "quality":
		base = 30
	}

	scaled := int(float64(base) * (1 + complexity*0.5))
	if scaled > totalResults {
		return totalResults
	}
	return scaled
}
|
||||
177
backend/internal/search/searxng.go
Normal file
177
backend/internal/search/searxng.go
Normal file
@@ -0,0 +1,177 @@
|
||||
package search
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/gooseek/backend/internal/types"
|
||||
"github.com/gooseek/backend/pkg/config"
|
||||
)
|
||||
|
||||
// SearXNGClient queries a SearXNG metasearch instance over HTTP, with
// optional fallback instances tried in order when the primary fails.
type SearXNGClient struct {
	primaryURL   string        // preferred instance base URL
	fallbackURLs []string      // tried in order after the primary
	client       *http.Client  // shared HTTP client (timeout applied)
	timeout      time.Duration // search budget; mirrors client.Timeout
}
|
||||
|
||||
func NewSearXNGClient(cfg *config.Config) *SearXNGClient {
|
||||
return &SearXNGClient{
|
||||
primaryURL: cfg.SearXNGURL,
|
||||
fallbackURLs: cfg.SearXNGFallbackURL,
|
||||
client: &http.Client{Timeout: cfg.SearchTimeout},
|
||||
timeout: cfg.SearchTimeout,
|
||||
}
|
||||
}
|
||||
|
||||
// SearchOptions narrows a SearXNG query. Zero-valued fields are simply
// omitted from the request.
type SearchOptions struct {
	Engines    []string // specific SearXNG engines to use
	Categories []string // e.g. "images", "videos"
	PageNo     int      // 1-based result page; sent only when > 0
	Language   string   // result language code
}
|
||||
|
||||
func (c *SearXNGClient) Search(ctx context.Context, query string, opts *SearchOptions) (*types.SearchResponse, error) {
|
||||
candidates := c.buildCandidates()
|
||||
if len(candidates) == 0 {
|
||||
return nil, fmt.Errorf("no SearXNG URLs configured")
|
||||
}
|
||||
|
||||
var lastErr error
|
||||
for _, baseURL := range candidates {
|
||||
result, err := c.searchWithURL(ctx, baseURL, query, opts)
|
||||
if err == nil {
|
||||
return result, nil
|
||||
}
|
||||
lastErr = err
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("all SearXNG instances failed: %w", lastErr)
|
||||
}
|
||||
|
||||
func (c *SearXNGClient) buildCandidates() []string {
|
||||
candidates := make([]string, 0)
|
||||
|
||||
if c.primaryURL != "" {
|
||||
u := strings.TrimSuffix(c.primaryURL, "/")
|
||||
if !strings.HasPrefix(u, "http") {
|
||||
u = "http://" + u
|
||||
}
|
||||
candidates = append(candidates, u)
|
||||
}
|
||||
|
||||
for _, fb := range c.fallbackURLs {
|
||||
u := strings.TrimSpace(fb)
|
||||
if u == "" {
|
||||
continue
|
||||
}
|
||||
u = strings.TrimSuffix(u, "/")
|
||||
if !strings.HasPrefix(u, "http") {
|
||||
u = "https://" + u
|
||||
}
|
||||
if !contains(candidates, u) {
|
||||
candidates = append(candidates, u)
|
||||
}
|
||||
}
|
||||
|
||||
return candidates
|
||||
}
|
||||
|
||||
func (c *SearXNGClient) searchWithURL(ctx context.Context, baseURL, query string, opts *SearchOptions) (*types.SearchResponse, error) {
|
||||
params := url.Values{}
|
||||
params.Set("format", "json")
|
||||
params.Set("q", query)
|
||||
|
||||
if opts != nil {
|
||||
if len(opts.Engines) > 0 {
|
||||
params.Set("engines", strings.Join(opts.Engines, ","))
|
||||
}
|
||||
if len(opts.Categories) > 0 {
|
||||
params.Set("categories", strings.Join(opts.Categories, ","))
|
||||
}
|
||||
if opts.PageNo > 0 {
|
||||
params.Set("pageno", fmt.Sprintf("%d", opts.PageNo))
|
||||
}
|
||||
if opts.Language != "" {
|
||||
params.Set("language", opts.Language)
|
||||
}
|
||||
}
|
||||
|
||||
reqURL := fmt.Sprintf("%s/search?%s", baseURL, params.Encode())
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", reqURL, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
resp, err := c.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("SearXNG returned status %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
var result struct {
|
||||
Results []types.SearchResult `json:"results"`
|
||||
Suggestions []string `json:"suggestions"`
|
||||
}
|
||||
|
||||
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &types.SearchResponse{
|
||||
Results: result.Results,
|
||||
Suggestions: result.Suggestions,
|
||||
}, nil
|
||||
}
|
||||
|
||||
var (
|
||||
productPattern = regexp.MustCompile(`ozon\.ru/product|wildberries\.ru/catalog/\d|aliexpress\.(ru|com)/item|market\.yandex`)
|
||||
videoPattern = regexp.MustCompile(`rutube\.ru/video|vk\.com/video|vk\.com/clip|youtube\.com/watch|youtu\.be|dzen\.ru/video`)
|
||||
vkProfilePattern = regexp.MustCompile(`vk\.com/[a-zA-Z0-9_.]+$`)
|
||||
tgProfilePattern = regexp.MustCompile(`t\.me/[a-zA-Z0-9_]+$`)
|
||||
)
|
||||
|
||||
func CategorizeResult(result *types.SearchResult) types.ContentCategory {
|
||||
urlLower := strings.ToLower(result.URL)
|
||||
|
||||
if productPattern.MatchString(urlLower) {
|
||||
return types.CategoryProduct
|
||||
}
|
||||
|
||||
if videoPattern.MatchString(urlLower) || result.IframeSrc != "" || result.Category == "videos" {
|
||||
return types.CategoryVideo
|
||||
}
|
||||
|
||||
if tgProfilePattern.MatchString(urlLower) {
|
||||
return types.CategoryProfile
|
||||
}
|
||||
if vkProfilePattern.MatchString(urlLower) && !videoPattern.MatchString(urlLower) {
|
||||
return types.CategoryProfile
|
||||
}
|
||||
|
||||
if result.ImgSrc != "" && result.Category == "images" {
|
||||
return types.CategoryImage
|
||||
}
|
||||
|
||||
return types.CategoryArticle
|
||||
}
|
||||
|
||||
// contains reports whether item appears in slice.
func contains(slice []string, item string) bool {
	for i := range slice {
		if slice[i] == item {
			return true
		}
	}
	return false
}
|
||||
Reference in New Issue
Block a user