Files
gooseek/backend/internal/agent/travel_events_collector.go
home ab48a0632b
Some checks failed
Build and Deploy GooSeek / build-backend (push) Failing after 1m4s
Build and Deploy GooSeek / build-webui (push) Failing after 1m2s
Build and Deploy GooSeek / deploy (push) Has been skipped
feat: CI/CD pipeline + Learning/Medicine/Travel services
- Add Gitea Actions workflow for automated build & deploy
- Add K8s manifests: webui, travel-svc, medicine-svc, sandbox-svc
- Update kustomization for localhost:5000 registry
- Add ingress for gooseek.ru and api.gooseek.ru
- Learning cabinet with onboarding, courses, sandbox integration
- Medicine service with symptom analysis and doctor matching
- Travel service with itinerary planning
- Server setup scripts (NVIDIA/CUDA, K3s, Gitea runner)

Made-with: Cursor
2026-03-02 20:25:44 +03:00

518 lines
14 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package agent
import (
"context"
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"regexp"
"strings"
"time"
"github.com/gooseek/backend/internal/llm"
"github.com/gooseek/backend/internal/search"
"github.com/google/uuid"
)
// CollectEventsEnriched collects real upcoming events/activities for the destination.
// Pipeline: SearXNG (event-focused queries) -> Crawl4AI -> LLM extraction -> geocode.
// Only returns actual events (concerts, exhibitions, festivals, etc.), NOT news articles.
// It always returns a nil error: a missing search client or an empty pipeline
// stage degrades to an empty result rather than failing the caller.
func CollectEventsEnriched(ctx context.Context, cfg TravelOrchestratorConfig, brief *TripBrief) ([]EventCard, error) {
	// Search is the sole data source for this pipeline; without it, bail out.
	if cfg.SearchClient == nil {
		return nil, nil
	}
	rawResults := searchForEvents(ctx, cfg.SearchClient, brief)
	if len(rawResults) == 0 {
		log.Printf("[travel-events] no search results found")
		return nil, nil
	}
	log.Printf("[travel-events] found %d raw search results", len(rawResults))
	// Crawling is optional enrichment; skipped when Crawl4AI is not configured.
	var crawledContent []crawledPage
	if cfg.Crawl4AIURL != "" {
		crawledContent = crawlEventPages(ctx, cfg.Crawl4AIURL, rawResults)
	}
	events := extractEventsWithLLM(ctx, cfg.LLM, brief, rawResults, crawledContent)
	events = geocodeEvents(ctx, cfg, brief, events)
	events = deduplicateEvents(events)
	events = filterFreshEvents(events, brief.StartDate)
	// Hard filter: drop events that ended up in another city/country due to ambiguous geocoding.
	destGeo := geocodeDestinations(ctx, cfg, brief)
	events = filterEventsNearDestinations(events, destGeo, 250)
	// Cap the result at 15 events.
	if len(events) > 15 {
		events = events[:15]
	}
	log.Printf("[travel-events] returning %d events", len(events))
	return events, nil
}
// crawledPage is one page fetched through Crawl4AI, reduced to its markdown content.
type crawledPage struct {
	URL     string // page URL that was crawled
	Title   string // title extracted from the crawl response
	Content string // markdown body (truncated to 10k chars by crawlSinglePage)
}
// eventSearchResult is a single deduplicated SearXNG hit that survived the
// news-site and stale-content filters in searchForEvents.
type eventSearchResult struct {
	Title         string // result title as reported by the search engine
	URL           string // result URL; used as the dedup key
	Content       string // snippet/summary text from the engine
	PublishedDate string // publication date string, may be empty
	Engine        string // which SearXNG engine produced the hit
}
// searchForEvents runs event-focused SearXNG queries for each trip destination
// and returns the union of hits, deduplicated by URL, with news articles and
// stale content filtered out. Individual query failures are logged and skipped.
func searchForEvents(ctx context.Context, client *search.SearXNGClient, brief *TripBrief) []eventSearchResult {
	seenURLs := make(map[string]bool)
	var collected []eventSearchResult
	for _, destination := range brief.Destinations {
		for _, query := range generateEventQueries(destination, brief.StartDate, brief.EndDate) {
			// Bound each query so one slow engine cannot stall the pipeline.
			queryCtx, cancelQuery := context.WithTimeout(ctx, 10*time.Second)
			resp, err := client.Search(queryCtx, query, &search.SearchOptions{
				Categories: []string{"general"},
				PageNo:     1,
			})
			cancelQuery()
			if err != nil {
				log.Printf("[travel-events] search error for '%s': %v", query, err)
				continue
			}
			for _, hit := range resp.Results {
				if hit.URL == "" || seenURLs[hit.URL] ||
					isNewsArticleURL(hit.URL) || isOldContent(hit.PublishedDate) {
					continue
				}
				seenURLs[hit.URL] = true
				collected = append(collected, eventSearchResult{
					Title:         hit.Title,
					URL:           hit.URL,
					Content:       hit.Content,
					PublishedDate: hit.PublishedDate,
					Engine:        hit.Engine,
				})
			}
		}
	}
	return collected
}
// generateEventQueries builds Russian-language search queries targeting event
// listings (afisha, tickets, "where to go") for the given destination. The
// month and year come from startDate ("YYYY-MM-DD"); any piece that cannot be
// derived falls back to the current month/year. endDate is currently unused
// but kept for interface stability.
func generateEventQueries(destination, startDate, endDate string) []string {
	var month, year string
	if len(startDate) >= 7 {
		if parts := strings.Split(startDate, "-"); len(parts) >= 2 {
			// Month names are nominative-case Russian, matching how event
			// sites title their listing pages.
			monthsByNumber := map[string]string{
				"01": "январь", "02": "февраль", "03": "март",
				"04": "апрель", "05": "май", "06": "июнь",
				"07": "июль", "08": "август", "09": "сентябрь",
				"10": "октябрь", "11": "ноябрь", "12": "декабрь",
			}
			year, month = parts[0], monthsByNumber[parts[1]]
		}
	}
	if year == "" {
		year = time.Now().Format("2006")
	}
	if month == "" {
		orderedMonths := []string{"", "январь", "февраль", "март", "апрель", "май", "июнь",
			"июль", "август", "сентябрь", "октябрь", "ноябрь", "декабрь"}
		month = orderedMonths[time.Now().Month()]
	}
	return []string{
		fmt.Sprintf("афиша %s %s %s концерты выставки", destination, month, year),
		fmt.Sprintf("мероприятия %s %s %s расписание", destination, month, year),
		fmt.Sprintf("куда сходить %s %s %s", destination, month, year),
		fmt.Sprintf("site:afisha.ru %s %s", destination, month),
		fmt.Sprintf("site:kassir.ru %s %s %s", destination, month, year),
	}
}
// isNewsArticleURL reports whether u looks like a news/article page — either a
// news-ish URL path segment or a known Russian news outlet domain. Matching is
// case-insensitive substring search.
func isNewsArticleURL(u string) bool {
	markers := []string{
		"/news/", "/novosti/", "/article/", "/stati/",
		"ria.ru", "tass.ru", "rbc.ru", "lenta.ru", "gazeta.ru",
		"interfax.ru", "kommersant.ru", "iz.ru", "mk.ru",
		"regnum.ru", "aif.ru", "kp.ru",
	}
	needle := strings.ToLower(u)
	for _, marker := range markers {
		if strings.Contains(needle, marker) {
			return true
		}
	}
	return false
}
// isOldContent reports whether a search result's published date is more than
// six months in the past. Empty or unparseable dates are treated as fresh
// (false) so results without metadata are not discarded.
func isOldContent(publishedDate string) bool {
	if publishedDate == "" {
		return false
	}
	// Loop-invariant cutoff, computed once instead of per-format.
	cutoff := time.Now().AddDate(0, -6, 0)
	// time.RFC3339 accepts both "Z" and numeric offsets, replacing the two
	// hand-written timestamp layouts; the others cover plain ISO dates and
	// the Russian DD.MM.YYYY convention.
	formats := []string{
		time.RFC3339,
		"2006-01-02",
		"02.01.2006",
	}
	for _, layout := range formats {
		if t, err := time.Parse(layout, publishedDate); err == nil {
			return t.Before(cutoff)
		}
	}
	return false
}
// filterFreshEvents drops events that clearly fall outside the trip window:
// ones that ended more than a month before the trip start, and ones that
// start more than two months after it. Events with empty or unparseable
// dates are kept, as is the whole input when the trip start is unknown.
func filterFreshEvents(events []EventCard, tripStartDate string) []EventCard {
	if tripStartDate == "" {
		return events
	}
	tripStart, err := time.Parse("2006-01-02", tripStartDate)
	if err != nil {
		return events
	}
	earliestAcceptedEnd := tripStart.AddDate(0, -1, 0)
	latestAcceptedStart := tripStart.AddDate(0, 2, 0)
	var kept []EventCard
	for _, ev := range events {
		// An empty date string always fails time.Parse, so it never filters.
		if end, parseErr := time.Parse("2006-01-02", ev.DateEnd); parseErr == nil && end.Before(earliestAcceptedEnd) {
			continue
		}
		if start, parseErr := time.Parse("2006-01-02", ev.DateStart); parseErr == nil && start.After(latestAcceptedStart) {
			continue
		}
		kept = append(kept, ev)
	}
	return kept
}
// crawlEventPages fetches full page content via Crawl4AI for up to four of the
// top search results. Failed crawls are logged and skipped; pages with fewer
// than ~100 bytes of content are discarded as uninformative.
func crawlEventPages(ctx context.Context, crawl4aiURL string, results []eventSearchResult) []crawledPage {
	limit := 4
	if len(results) < limit {
		limit = len(results)
	}
	var pages []crawledPage
	for _, result := range results[:limit] {
		// Per-page timeout so one slow site cannot consume the whole budget.
		fetchCtx, cancelFetch := context.WithTimeout(ctx, 15*time.Second)
		page, err := crawlSinglePage(fetchCtx, crawl4aiURL, result.URL)
		cancelFetch()
		if err != nil {
			log.Printf("[travel-events] crawl failed for %s: %v", result.URL, err)
			continue
		}
		if page == nil || len(page.Content) <= 100 {
			continue
		}
		pages = append(pages, *page)
	}
	return pages
}
// crawlSinglePage asks the Crawl4AI service to fetch one page and returns its
// title and markdown content (content capped at 10k chars). Returns an error
// on transport failures or non-200 responses from Crawl4AI.
func crawlSinglePage(ctx context.Context, crawl4aiURL, pageURL string) (*crawledPage, error) {
	// Build the request payload with encoding/json instead of fmt.Sprintf so a
	// URL containing quotes or backslashes cannot break (or inject into) the JSON.
	payload := map[string]interface{}{
		"urls": []string{pageURL},
		"crawler_config": map[string]interface{}{
			"type": "CrawlerRunConfig",
			"params": map[string]interface{}{
				"cache_mode":   "default",
				"page_timeout": 15000,
			},
		},
	}
	reqBody, err := json.Marshal(payload)
	if err != nil {
		return nil, err
	}
	req, err := http.NewRequestWithContext(ctx, "POST", crawl4aiURL+"/crawl", strings.NewReader(string(reqBody)))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", "application/json")
	// Client-level timeout is a backstop; ctx normally cancels first.
	client := &http.Client{Timeout: 20 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("crawl4ai returned %d", resp.StatusCode)
	}
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	content := extractCrawledMarkdown(string(body))
	title := extractCrawledTitle(string(body))
	// Cap content to keep the downstream LLM prompt bounded.
	if len(content) > 10000 {
		content = content[:10000]
	}
	return &crawledPage{
		URL:     pageURL,
		Title:   title,
		Content: content,
	}, nil
}
// extractCrawledMarkdown pulls the markdown body out of a Crawl4AI JSON
// response, preferring the raw_markdown field over the post-processed
// markdown. Returns "" on malformed JSON or an empty results array.
func extractCrawledMarkdown(response string) string {
	var parsed struct {
		Results []struct {
			RawMarkdown string `json:"raw_markdown"`
			Markdown    string `json:"markdown"`
		} `json:"results"`
	}
	if json.Unmarshal([]byte(response), &parsed) != nil || len(parsed.Results) == 0 {
		return ""
	}
	first := parsed.Results[0]
	if first.RawMarkdown != "" {
		return first.RawMarkdown
	}
	return first.Markdown
}
// extractCrawledTitle pulls the page title out of a Crawl4AI JSON response.
// Returns "" on malformed JSON or an empty results array.
func extractCrawledTitle(response string) string {
	var parsed struct {
		Results []struct {
			Title string `json:"title"`
		} `json:"results"`
	}
	if json.Unmarshal([]byte(response), &parsed) != nil || len(parsed.Results) == 0 {
		return ""
	}
	return parsed.Results[0].Title
}
// extractEventsWithLLM asks the LLM to distill concrete events (title, dates,
// venue, price) out of raw search snippets plus crawled page text. Returns nil
// when the LLM call fails or no JSON array can be recovered from its reply.
func extractEventsWithLLM(ctx context.Context, llmClient llm.Client, brief *TripBrief, searchResults []eventSearchResult, crawled []crawledPage) []EventCard {
	// Assemble a compact context block: up to 10 search snippets (300 chars
	// each) plus up to 3 crawled pages (2000 chars each) to bound prompt size.
	var contextBuilder strings.Builder
	contextBuilder.WriteString("Данные об афише и мероприятиях:\n\n")
	maxSearch := 10
	if len(searchResults) < maxSearch {
		maxSearch = len(searchResults)
	}
	for i := 0; i < maxSearch; i++ {
		r := searchResults[i]
		contextBuilder.WriteString(fmt.Sprintf("### %s\nURL: %s\n%s\n\n", r.Title, r.URL, truncateStr(r.Content, 300)))
	}
	if len(crawled) > 0 {
		contextBuilder.WriteString("\nПодробности со страниц:\n\n")
		maxCrawled := 3
		if len(crawled) < maxCrawled {
			maxCrawled = len(crawled)
		}
		for i := 0; i < maxCrawled; i++ {
			p := crawled[i]
			contextBuilder.WriteString(fmt.Sprintf("### %s (%s)\n%s\n\n", p.Title, p.URL, truncateStr(p.Content, 2000)))
		}
	}
	currentYear := time.Now().Format("2006")
	// Russian-language prompt (matching the search data): demands a bare JSON
	// array of event objects and explicitly forbids news articles, stale
	// events (before currentYear), and fabricated entries.
	prompt := fmt.Sprintf(`Извлеки ТОЛЬКО реальные МЕРОПРИЯТИЯ (концерты, выставки, фестивали, спектакли, спортивные события) в %s на %s — %s.
%s
СТРОГО ЗАПРЕЩЕНО:
- Новостные статьи, обзоры, блог-посты — это НЕ мероприятия
- Устаревшие события (до %s года)
- Выдуманные мероприятия
JSON (ТОЛЬКО массив, без текста):
[{"id":"evt-1","title":"Название","description":"Что за мероприятие, 1 предложение","dateStart":"YYYY-MM-DD","dateEnd":"YYYY-MM-DD","price":500,"currency":"RUB","url":"https://...","address":"Город, Площадка, адрес","tags":["концерт"]}]
Правила:
- ТОЛЬКО конкретные мероприятия с названием, местом и датой
- dateStart/dateEnd в формате YYYY-MM-DD, если дата неизвестна — ""
- price в рублях, 0 если неизвестна
- address — точный адрес площадки для геокодинга
- tags: концерт, выставка, фестиваль, спектакль, спорт, кино, мастер-класс, экскурсия
- Максимум 10 мероприятий`,
		strings.Join(brief.Destinations, ", "),
		brief.StartDate,
		brief.EndDate,
		contextBuilder.String(),
		currentYear,
	)
	llmCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
	defer cancel()
	// Low temperature: extraction, not generation.
	response, err := llmClient.GenerateText(llmCtx, llm.StreamRequest{
		Messages: []llm.Message{{Role: llm.RoleUser, Content: prompt}},
		Options:  llm.StreamOptions{MaxTokens: 3000, Temperature: 0.1},
	})
	if err != nil {
		log.Printf("[travel-events] LLM extraction failed: %v", err)
		return nil
	}
	// Tolerate prose around the answer: grab the widest [...] span.
	jsonMatch := regexp.MustCompile(`\[[\s\S]*\]`).FindString(response)
	if jsonMatch == "" {
		log.Printf("[travel-events] no JSON array in LLM response (len=%d)", len(response))
		return nil
	}
	var events []EventCard
	if err := json.Unmarshal([]byte(jsonMatch), &events); err != nil {
		log.Printf("[travel-events] JSON parse error: %v", err)
		// Salvage well-formed objects from a truncated/mangled array.
		events = tryPartialEventParse(jsonMatch)
		if len(events) == 0 {
			return nil
		}
	}
	// Guarantee non-empty IDs for downstream dedup/UI keys.
	for i := range events {
		if events[i].ID == "" {
			events[i].ID = uuid.New().String()
		}
	}
	log.Printf("[travel-events] extracted %d events from LLM", len(events))
	return events
}
// tryPartialEventParse salvages individual event objects from a JSON array
// that failed to parse as a whole (e.g. truncated LLM output). Only flat,
// brace-balanced objects containing a non-empty "title" are recovered.
func tryPartialEventParse(jsonStr string) []EventCard {
	objPattern := regexp.MustCompile(`\{[^{}]*"title"\s*:\s*"[^"]+[^{}]*\}`)
	var recovered []EventCard
	for _, candidate := range objPattern.FindAllString(jsonStr, -1) {
		var card EventCard
		if json.Unmarshal([]byte(candidate), &card) == nil && card.Title != "" {
			recovered = append(recovered, card)
		}
	}
	if len(recovered) > 0 {
		log.Printf("[travel-events] partial parse recovered %d events", len(recovered))
	}
	return recovered
}
// geocodeEvents fills in Lat/Lng for events that carry an address but no
// coordinates yet. For each event it tries, in order: the raw address, the
// address suffixed with the trip destinations (only when the address doesn't
// already mention them), and finally "title, destinations" as a last resort.
// Events that still cannot be geocoded are left at (0,0) and logged; they are
// expected to be dropped later by filterEventsNearDestinations.
func geocodeEvents(ctx context.Context, cfg TravelOrchestratorConfig, brief *TripBrief, events []EventCard) []EventCard {
	destSuffix := strings.Join(brief.Destinations, ", ")
	for i := range events {
		// Skip events with no address, and ones already positioned upstream.
		if events[i].Address == "" || (events[i].Lat != 0 && events[i].Lng != 0) {
			continue
		}
		queries := []string{events[i].Address}
		if destSuffix != "" && !strings.Contains(strings.ToLower(events[i].Address), strings.ToLower(destSuffix)) {
			queries = append(queries, fmt.Sprintf("%s, %s", events[i].Address, destSuffix))
		}
		queries = append(queries, fmt.Sprintf("%s, %s", events[i].Title, destSuffix))
		var lastErr error
		for _, q := range queries {
			// 5s per attempt keeps a slow geocoder from stalling the pipeline.
			geoCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
			geo, err := cfg.TravelData.Geocode(geoCtx, q)
			cancel()
			if err != nil {
				lastErr = err
				continue
			}
			events[i].Lat = geo.Lat
			events[i].Lng = geo.Lng
			break
		}
		// (0,0) after all attempts means every query failed or returned nothing.
		if events[i].Lat == 0 && events[i].Lng == 0 {
			if lastErr != nil {
				log.Printf("[travel-events] geocode failed for '%s': %v", events[i].Address, lastErr)
			} else {
				log.Printf("[travel-events] geocode failed for '%s'", events[i].Address)
			}
			continue
		}
	}
	return events
}
// filterEventsNearDestinations keeps only events whose geocoded position lies
// within maxKm of at least one destination entry with coordinates; events
// lacking coordinates entirely are dropped. When the destination list is
// empty the input is returned unfiltered.
func filterEventsNearDestinations(events []EventCard, destinations []destGeoEntry, maxKm float64) []EventCard {
	if len(destinations) == 0 {
		return events
	}
	kept := make([]EventCard, 0, len(events))
	for _, ev := range events {
		if ev.Lat == 0 && ev.Lng == 0 {
			continue // never geocoded — cannot verify proximity
		}
		nearest := 1e18
		for _, dest := range destinations {
			if dest.Lat == 0 && dest.Lng == 0 {
				continue // destination itself failed geocoding
			}
			if d := distanceKm(ev.Lat, ev.Lng, dest.Lat, dest.Lng); d < nearest {
				nearest = d
			}
		}
		if nearest > maxKm {
			log.Printf("[travel-events] dropped far event '%s' (%.0fkm from destinations)", ev.Title, nearest)
			continue
		}
		kept = append(kept, ev)
	}
	return kept
}
// deduplicateEvents removes events whose lowercased titles collide on the
// first 50 bytes, keeping the first occurrence in input order.
func deduplicateEvents(events []EventCard) []EventCard {
	seenTitles := make(map[string]bool, len(events))
	var unique []EventCard
	for _, ev := range events {
		// NOTE(review): byte-truncation can split a multi-byte rune in
		// Russian titles; the key stays deterministic, so dedup still works.
		fingerprint := strings.ToLower(ev.Title)
		if len(fingerprint) > 50 {
			fingerprint = fingerprint[:50]
		}
		if seenTitles[fingerprint] {
			continue
		}
		seenTitles[fingerprint] = true
		unique = append(unique, ev)
	}
	return unique
}