Files
gooseek/backend/internal/agent/travel_events_collector.go
home 08bd41e75c feat: travel service with 2GIS routing, POI, hotels + finance providers + UI overhaul
- Add travel-svc microservice (Amadeus, TravelPayouts, 2GIS, OpenRouteService)
- Add travel orchestrator with parallel collectors (events, POI, hotels, flights)
- Add 2GIS road routing with transport cost calculation (car/bus/taxi)
- Add TravelMap (2GIS MapGL) and TravelWidgets components
- Add useTravelChat hook for streaming travel agent responses
- Add finance heatmap providers refactor
- Add SearXNG settings, API proxy routes, Docker compose updates
- Update Dockerfiles, config, types, and all UI pages for consistency

Made-with: Cursor
2026-03-01 21:58:32 +03:00

468 lines
12 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package agent
import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/gooseek/backend/internal/llm"
	"github.com/gooseek/backend/internal/search"
	"github.com/google/uuid"
)
// CollectEventsEnriched collects real upcoming events/activities for the destination.
// Pipeline: SearXNG (event-focused queries) -> Crawl4AI -> LLM extraction -> geocode.
// Only returns actual events (concerts, exhibitions, festivals, etc.), NOT news articles.
// Missing dependencies are treated as a soft failure: the function returns
// (nil, nil) so the orchestrator simply shows no events.
func CollectEventsEnriched(ctx context.Context, cfg TravelOrchestratorConfig, brief *TripBrief) ([]EventCard, error) {
	// Both the search client and the LLM are required: extractEventsWithLLM
	// calls cfg.LLM.GenerateText directly, so a nil LLM would panic.
	if cfg.SearchClient == nil || cfg.LLM == nil {
		return nil, nil
	}
	rawResults := searchForEvents(ctx, cfg.SearchClient, brief)
	if len(rawResults) == 0 {
		log.Printf("[travel-events] no search results found")
		return nil, nil
	}
	log.Printf("[travel-events] found %d raw search results", len(rawResults))
	// Crawling is optional enrichment; skip it when Crawl4AI is not configured.
	var crawledContent []crawledPage
	if cfg.Crawl4AIURL != "" {
		crawledContent = crawlEventPages(ctx, cfg.Crawl4AIURL, rawResults)
	}
	events := extractEventsWithLLM(ctx, cfg.LLM, brief, rawResults, crawledContent)
	events = geocodeEvents(ctx, cfg, events)
	events = deduplicateEvents(events)
	events = filterFreshEvents(events, brief.StartDate)
	// Cap the payload sent to the UI.
	if len(events) > 15 {
		events = events[:15]
	}
	log.Printf("[travel-events] returning %d events", len(events))
	return events, nil
}
// crawledPage is one page fetched through Crawl4AI, reduced to the fields the
// LLM prompt needs.
type crawledPage struct {
	URL     string // source page URL
	Title   string // page title as reported by the crawler (may be empty)
	Content string // markdown content; truncated to 10000 bytes by crawlSinglePage
}
// eventSearchResult is a single SearXNG hit kept after news/staleness filtering.
type eventSearchResult struct {
	Title         string // result title
	URL           string // result URL, unique within one collection run
	Content       string // snippet text returned by the search engine
	PublishedDate string // publication date string; format varies by engine (see isOldContent)
	Engine        string // name of the SearXNG engine that produced the hit
}
// searchForEvents runs event-focused SearXNG queries for every destination in
// the brief and returns the deduplicated hits, excluding news articles and
// stale content. Individual query failures are logged and skipped.
func searchForEvents(ctx context.Context, client *search.SearXNGClient, brief *TripBrief) []eventSearchResult {
	visited := make(map[string]bool)
	var collected []eventSearchResult
	for _, destination := range brief.Destinations {
		for _, query := range generateEventQueries(destination, brief.StartDate, brief.EndDate) {
			// Each query gets its own short timeout so one slow engine
			// cannot stall the whole collection pass.
			qCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
			resp, err := client.Search(qCtx, query, &search.SearchOptions{
				Categories: []string{"general"},
				PageNo:     1,
			})
			cancel()
			if err != nil {
				log.Printf("[travel-events] search error for '%s': %v", query, err)
				continue
			}
			for _, hit := range resp.Results {
				if hit.URL == "" || visited[hit.URL] {
					continue
				}
				if isNewsArticleURL(hit.URL) || isOldContent(hit.PublishedDate) {
					continue
				}
				visited[hit.URL] = true
				collected = append(collected, eventSearchResult{
					Title:         hit.Title,
					URL:           hit.URL,
					Content:       hit.Content,
					PublishedDate: hit.PublishedDate,
					Engine:        hit.Engine,
				})
			}
		}
	}
	return collected
}
// generateEventQueries builds Russian-language search queries for event
// listings ("afisha") in the destination around the trip start date.
// startDate is expected as "YYYY-MM-DD"; when it is missing or unparsable the
// current month/year are used. endDate is currently unused but kept for
// interface stability with callers.
func generateEventQueries(destination, startDate, endDate string) []string {
	// Single source of truth for Russian month names, indexed by month
	// number 1..12 (index 0 unused). The original kept two parallel copies
	// (a "01"->name map and this slice); one table avoids drift.
	ruMonths := [...]string{"", "январь", "февраль", "март", "апрель", "май", "июнь",
		"июль", "август", "сентябрь", "октябрь", "ноябрь", "декабрь"}
	month := ""
	year := ""
	if len(startDate) >= 7 {
		if parts := strings.Split(startDate, "-"); len(parts) >= 2 {
			year = parts[0]
			// strconv.Atoi also accepts non-zero-padded months ("2025-5-10").
			if m, err := strconv.Atoi(parts[1]); err == nil && m >= 1 && m <= 12 {
				month = ruMonths[m]
			}
		}
	}
	now := time.Now()
	if year == "" {
		year = now.Format("2006")
	}
	if month == "" {
		month = ruMonths[now.Month()]
	}
	return []string{
		fmt.Sprintf("афиша %s %s %s концерты выставки", destination, month, year),
		fmt.Sprintf("мероприятия %s %s %s расписание", destination, month, year),
		fmt.Sprintf("куда сходить %s %s %s", destination, month, year),
		fmt.Sprintf("site:afisha.ru %s %s", destination, month),
		fmt.Sprintf("site:kassir.ru %s %s %s", destination, month, year),
	}
}
// newsSourcePatterns lists URL substrings (path fragments and Russian news
// domains) that mark a result as a news article rather than an event page.
// Package-level so the slice is built once, not on every call.
var newsSourcePatterns = []string{
	"/news/", "/novosti/", "/article/", "/stati/",
	"ria.ru", "tass.ru", "rbc.ru", "lenta.ru", "gazeta.ru",
	"interfax.ru", "kommersant.ru", "iz.ru", "mk.ru",
	"regnum.ru", "aif.ru", "kp.ru",
}

// isNewsArticleURL reports whether the URL points to a news article or a
// known news outlet. Matching is case-insensitive substring search.
func isNewsArticleURL(u string) bool {
	lower := strings.ToLower(u)
	for _, p := range newsSourcePatterns {
		if strings.Contains(lower, p) {
			return true
		}
	}
	return false
}
func isOldContent(publishedDate string) bool {
if publishedDate == "" {
return false
}
formats := []string{
"2006-01-02T15:04:05Z",
"2006-01-02T15:04:05-07:00",
"2006-01-02",
"02.01.2006",
}
for _, f := range formats {
if t, err := time.Parse(f, publishedDate); err == nil {
sixMonthsAgo := time.Now().AddDate(0, -6, 0)
return t.Before(sixMonthsAgo)
}
}
return false
}
// filterFreshEvents drops events that clearly fall outside the trip window:
// events ending more than one month before the trip starts, or starting more
// than two months after. Events with empty or unparsable dates are kept —
// better to show a possibly-relevant event than to hide it. When the trip
// start date itself is missing or unparsable, the list is returned unchanged.
func filterFreshEvents(events []EventCard, tripStartDate string) []EventCard {
	if tripStartDate == "" {
		return events
	}
	tripStart, err := time.Parse("2006-01-02", tripStartDate)
	if err != nil {
		return events
	}
	// Window bounds are loop-invariant; compute them once instead of
	// re-deriving the upper bound for every event (as the original did).
	earliestEnd := tripStart.AddDate(0, -1, 0) // events must not have ended before this
	latestStart := tripStart.AddDate(0, 2, 0)  // events must not start after this
	var fresh []EventCard
	for _, e := range events {
		if e.DateEnd != "" {
			if end, perr := time.Parse("2006-01-02", e.DateEnd); perr == nil && end.Before(earliestEnd) {
				continue
			}
		}
		if e.DateStart != "" {
			if start, perr := time.Parse("2006-01-02", e.DateStart); perr == nil && start.After(latestStart) {
				continue
			}
		}
		fresh = append(fresh, e)
	}
	return fresh
}
// crawlEventPages fetches the first few search results through Crawl4AI and
// returns the pages that yielded substantive content. Crawl failures are
// logged and skipped; near-empty pages (<=100 bytes) are discarded.
func crawlEventPages(ctx context.Context, crawl4aiURL string, results []eventSearchResult) []crawledPage {
	// Crawl at most four pages to bound latency.
	limit := len(results)
	if limit > 4 {
		limit = 4
	}
	var pages []crawledPage
	for _, res := range results[:limit] {
		crawlCtx, cancel := context.WithTimeout(ctx, 15*time.Second)
		page, err := crawlSinglePage(crawlCtx, crawl4aiURL, res.URL)
		cancel()
		if err != nil {
			log.Printf("[travel-events] crawl failed for %s: %v", res.URL, err)
			continue
		}
		if page == nil || len(page.Content) <= 100 {
			continue
		}
		pages = append(pages, *page)
	}
	return pages
}
// crawlSinglePage asks the Crawl4AI service to fetch pageURL and returns the
// extracted markdown (truncated to 10000 bytes) plus the page title.
func crawlSinglePage(ctx context.Context, crawl4aiURL, pageURL string) (*crawledPage, error) {
	// Build the request body with encoding/json instead of fmt.Sprintf
	// string interpolation: a URL containing quotes or backslashes would
	// otherwise break (or inject into) the JSON payload.
	payload := map[string]interface{}{
		"urls": []string{pageURL},
		"crawler_config": map[string]interface{}{
			"type": "CrawlerRunConfig",
			"params": map[string]interface{}{
				"cache_mode":   "default",
				"page_timeout": 15000,
			},
		},
	}
	reqBody, err := json.Marshal(payload)
	if err != nil {
		return nil, err
	}
	req, err := http.NewRequestWithContext(ctx, "POST", crawl4aiURL+"/crawl", strings.NewReader(string(reqBody)))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", "application/json")
	// Client timeout backstops the per-request ctx deadline.
	client := &http.Client{Timeout: 20 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("crawl4ai returned %d", resp.StatusCode)
	}
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	content := extractCrawledMarkdown(string(body))
	title := extractCrawledTitle(string(body))
	// Cap the content so the LLM prompt stays bounded.
	if len(content) > 10000 {
		content = content[:10000]
	}
	return &crawledPage{
		URL:     pageURL,
		Title:   title,
		Content: content,
	}, nil
}
// extractCrawledMarkdown pulls the markdown text of the first result out of a
// Crawl4AI JSON response, preferring raw_markdown over markdown. Returns ""
// when the response is not valid JSON or contains no results.
func extractCrawledMarkdown(response string) string {
	var payload struct {
		Results []struct {
			RawMarkdown string `json:"raw_markdown"`
			Markdown    string `json:"markdown"`
		} `json:"results"`
	}
	if json.Unmarshal([]byte(response), &payload) != nil || len(payload.Results) == 0 {
		return ""
	}
	first := payload.Results[0]
	if first.RawMarkdown != "" {
		return first.RawMarkdown
	}
	return first.Markdown
}
// extractCrawledTitle pulls the title of the first result out of a Crawl4AI
// JSON response, or "" when the response is invalid or empty.
func extractCrawledTitle(response string) string {
	var payload struct {
		Results []struct {
			Title string `json:"title"`
		} `json:"results"`
	}
	if json.Unmarshal([]byte(response), &payload) != nil || len(payload.Results) == 0 {
		return ""
	}
	return payload.Results[0].Title
}
// eventsJSONArrayRe matches the outermost JSON array in an LLM response.
// Compiled once at package scope instead of on every call.
var eventsJSONArrayRe = regexp.MustCompile(`\[[\s\S]*\]`)

// extractEventsWithLLM asks the LLM to turn raw search snippets and crawled
// page text into structured EventCard entries. It returns nil on any failure
// (LLM error, no JSON array in the response, unrecoverable parse error) and
// guarantees every returned event has a non-empty ID.
func extractEventsWithLLM(ctx context.Context, llmClient llm.Client, brief *TripBrief, searchResults []eventSearchResult, crawled []crawledPage) []EventCard {
	// Assemble the evidence block: up to 10 search snippets (300 chars each)
	// plus up to 3 crawled pages (2000 chars each) to keep the prompt bounded.
	var contextBuilder strings.Builder
	contextBuilder.WriteString("Данные об афише и мероприятиях:\n\n")
	maxSearch := 10
	if len(searchResults) < maxSearch {
		maxSearch = len(searchResults)
	}
	for i := 0; i < maxSearch; i++ {
		r := searchResults[i]
		contextBuilder.WriteString(fmt.Sprintf("### %s\nURL: %s\n%s\n\n", r.Title, r.URL, truncateStr(r.Content, 300)))
	}
	if len(crawled) > 0 {
		contextBuilder.WriteString("\nПодробности со страниц:\n\n")
		maxCrawled := 3
		if len(crawled) < maxCrawled {
			maxCrawled = len(crawled)
		}
		for i := 0; i < maxCrawled; i++ {
			p := crawled[i]
			contextBuilder.WriteString(fmt.Sprintf("### %s (%s)\n%s\n\n", p.Title, p.URL, truncateStr(p.Content, 2000)))
		}
	}
	currentYear := time.Now().Format("2006")
	prompt := fmt.Sprintf(`Извлеки ТОЛЬКО реальные МЕРОПРИЯТИЯ (концерты, выставки, фестивали, спектакли, спортивные события) в %s на %s — %s.
%s
СТРОГО ЗАПРЕЩЕНО:
- Новостные статьи, обзоры, блог-посты — это НЕ мероприятия
- Устаревшие события (до %s года)
- Выдуманные мероприятия
JSON (ТОЛЬКО массив, без текста):
[{"id":"evt-1","title":"Название","description":"Что за мероприятие, 1 предложение","dateStart":"YYYY-MM-DD","dateEnd":"YYYY-MM-DD","price":500,"currency":"RUB","url":"https://...","address":"Город, Площадка, адрес","tags":["концерт"]}]
Правила:
- ТОЛЬКО конкретные мероприятия с названием, местом и датой
- dateStart/dateEnd в формате YYYY-MM-DD, если дата неизвестна — ""
- price в рублях, 0 если неизвестна
- address — точный адрес площадки для геокодинга
- tags: концерт, выставка, фестиваль, спектакль, спорт, кино, мастер-класс, экскурсия
- Максимум 10 мероприятий`,
		strings.Join(brief.Destinations, ", "),
		brief.StartDate,
		brief.EndDate,
		contextBuilder.String(),
		currentYear,
	)
	llmCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
	defer cancel()
	// Low temperature: we want extraction, not creativity.
	response, err := llmClient.GenerateText(llmCtx, llm.StreamRequest{
		Messages: []llm.Message{{Role: llm.RoleUser, Content: prompt}},
		Options:  llm.StreamOptions{MaxTokens: 3000, Temperature: 0.1},
	})
	if err != nil {
		log.Printf("[travel-events] LLM extraction failed: %v", err)
		return nil
	}
	jsonMatch := eventsJSONArrayRe.FindString(response)
	if jsonMatch == "" {
		log.Printf("[travel-events] no JSON array in LLM response (len=%d)", len(response))
		return nil
	}
	var events []EventCard
	if err := json.Unmarshal([]byte(jsonMatch), &events); err != nil {
		log.Printf("[travel-events] JSON parse error: %v", err)
		// Salvage whatever well-formed event objects exist in the array.
		events = tryPartialEventParse(jsonMatch)
		if len(events) == 0 {
			return nil
		}
	}
	// Backfill IDs the model omitted so downstream dedup/UI keys are stable.
	for i := range events {
		if events[i].ID == "" {
			events[i].ID = uuid.New().String()
		}
	}
	log.Printf("[travel-events] extracted %d events from LLM", len(events))
	return events
}
// partialEventObjRe matches individual flat JSON objects containing a "title"
// key. Compiled once at package scope instead of on every call.
// NOTE: it cannot match objects with nested braces — acceptable here because
// the prompt asks for flat event objects (tags arrays use brackets, not braces).
var partialEventObjRe = regexp.MustCompile(`\{[^{}]*"title"\s*:\s*"[^"]+[^{}]*\}`)

// tryPartialEventParse salvages individual event objects from a malformed
// JSON array (e.g. an LLM response truncated mid-array). Objects that fail to
// unmarshal or lack a title are skipped.
func tryPartialEventParse(jsonStr string) []EventCard {
	var events []EventCard
	for _, candidate := range partialEventObjRe.FindAllString(jsonStr, -1) {
		var e EventCard
		if err := json.Unmarshal([]byte(candidate), &e); err == nil && e.Title != "" {
			events = append(events, e)
		}
	}
	if len(events) > 0 {
		log.Printf("[travel-events] partial parse recovered %d events", len(events))
	}
	return events
}
// geocodeEvents fills in Lat/Lng for events that have an address but no
// coordinates yet. Geocoding failures are logged and the event is kept
// without coordinates.
func geocodeEvents(ctx context.Context, cfg TravelOrchestratorConfig, events []EventCard) []EventCard {
	for idx := range events {
		ev := &events[idx]
		// Skip events with no address to look up, or coordinates already set.
		if ev.Address == "" || (ev.Lat != 0 && ev.Lng != 0) {
			continue
		}
		geoCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
		result, err := cfg.TravelData.Geocode(geoCtx, ev.Address)
		cancel()
		if err != nil {
			log.Printf("[travel-events] geocode failed for '%s': %v", ev.Address, err)
			continue
		}
		ev.Lat = result.Lat
		ev.Lng = result.Lng
	}
	return events
}
// deduplicateEvents removes events whose lowercased titles collide on the
// first 50 bytes, keeping the first occurrence of each.
// NOTE(review): the 50-byte cut can land mid-rune for Cyrillic titles; the
// key stays deterministic so dedup still works, but byte length != 50 chars.
func deduplicateEvents(events []EventCard) []EventCard {
	byTitle := make(map[string]bool, len(events))
	var unique []EventCard
	for _, ev := range events {
		k := strings.ToLower(ev.Title)
		if len(k) > 50 {
			k = k[:50]
		}
		if byTitle[k] {
			continue
		}
		byTitle[k] = true
		unique = append(unique, ev)
	}
	return unique
}