// gooseek/backend/internal/podcast/generator.go

package podcast

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"

	"github.com/google/uuid"
	"github.com/gooseek/backend/internal/llm"
)

// PodcastType identifies the kind of episode being produced. It is stored on
// Podcast.Type and serialized as the plain string value.
type PodcastType string

// Known podcast kinds.
const (
	// PodcastDaily is the daily news digest.
	PodcastDaily     PodcastType = "daily"
	// PodcastWeekly is the weekly summary produced by GenerateWeeklySummary.
	PodcastWeekly    PodcastType = "weekly"
	// PodcastTopicDeep is a single-topic episode produced by GenerateTopicDeepDive.
	PodcastTopicDeep PodcastType = "topic_deep"
	// PodcastBreaking is declared for breaking-news episodes.
	// NOTE(review): no generator in this file produces this type.
	PodcastBreaking  PodcastType = "breaking"
)

// VoiceStyle names the speaking style requested for synthesized audio. It is
// carried in VoiceConfig.Style.
// NOTE(review): ElevenLabsTTS.GenerateSpeech in this file does not read the
// style; confirm whether another TTS client honors it.
type VoiceStyle string

// Known voice styles.
const (
	VoiceNeutral      VoiceStyle = "neutral"
	VoiceEnthusiastic VoiceStyle = "enthusiastic"
	VoiceProfessional VoiceStyle = "professional"
	VoiceCasual       VoiceStyle = "casual"
	VoiceStorytelling VoiceStyle = "storytelling"
)

// Podcast is a generated episode: its metadata, the spoken transcript split
// into segments, the sources it cites, and the voice settings to use for
// audio synthesis. It round-trips through JSON via ToJSON and ParsePodcast.
type Podcast struct {
	// ID is a UUID string assigned when the episode is generated.
	ID           string           `json:"id"`
	Title        string           `json:"title"`
	Description  string           `json:"description"`
	Type         PodcastType      `json:"type"`
	// Date is the day the episode covers.
	Date         time.Time        `json:"date"`
	// Duration is the episode length in seconds (see the JSON key).
	Duration     int              `json:"durationSeconds"`
	AudioURL     string           `json:"audioUrl,omitempty"`
	// Transcript is the full text that is sent to the TTS client.
	Transcript   string           `json:"transcript"`
	Segments     []PodcastSegment `json:"segments"`
	Topics       []string         `json:"topics"`
	Sources      []Source         `json:"sources"`
	Thumbnail    string           `json:"thumbnail,omitempty"`
	Status       PodcastStatus    `json:"status"`
	// GeneratedAt is the wall-clock time the episode record was created.
	GeneratedAt  time.Time        `json:"generatedAt"`
	// PublishedAt is nil until the episode is published; it is omitted from
	// JSON while nil.
	PublishedAt  *time.Time       `json:"publishedAt,omitempty"`
	Locale       string           `json:"locale"`
	VoiceConfig  VoiceConfig      `json:"voiceConfig"`
}

// PodcastStatus is the lifecycle state of a Podcast.
type PodcastStatus string

// Lifecycle states. In this file GenerateDailyPodcast creates episodes as
// StatusDraft, and GenerateAudio moves them through StatusGenerating to
// StatusReady or StatusFailed.
const (
	StatusDraft      PodcastStatus = "draft"
	StatusGenerating PodcastStatus = "generating"
	StatusReady      PodcastStatus = "ready"
	// NOTE(review): StatusPublished is not assigned anywhere in this file.
	StatusPublished  PodcastStatus = "published"
	StatusFailed     PodcastStatus = "failed"
)

// PodcastSegment is one contiguous part of an episode (for example an intro,
// a news story, or an outro) together with its timing inside the episode.
type PodcastSegment struct {
	ID         string    `json:"id"`
	// Type is a free-form label; the LLM prompt in generateScript asks for
	// one of intro, news, analysis, transition or outro.
	Type       string    `json:"type"`
	Title      string    `json:"title"`
	// Content is the text to be spoken for this segment.
	Content    string    `json:"content"`
	// Duration is the segment length in seconds (see the JSON key).
	Duration   int       `json:"durationSeconds"`
	// StartTime and EndTime are offsets from the start of the episode, in the
	// same unit as Duration.
	StartTime  int       `json:"startTime"`
	EndTime    int       `json:"endTime"`
	Sources    []Source  `json:"sources,omitempty"`
	Highlights []string  `json:"highlights,omitempty"`
}

// Source is a reference to an article or publication cited by an episode or
// by one of its segments.
type Source struct {
	Title     string `json:"title"`
	URL       string `json:"url"`
	Publisher string `json:"publisher"`
	// Date is an optional free-form date string; its format is not enforced
	// in this file.
	Date      string `json:"date,omitempty"`
}

// VoiceConfig describes the voice a TTSClient should use when synthesizing
// an episode.
// NOTE(review): ElevenLabsTTS.GenerateSpeech in this file reads only VoiceID;
// the remaining fields are carried through but not applied there.
type VoiceConfig struct {
	// Provider names the TTS backend, e.g. "elevenlabs".
	Provider   string     `json:"provider"`
	// VoiceID is the provider-specific voice identifier.
	VoiceID    string     `json:"voiceId"`
	Style      VoiceStyle `json:"style"`
	// Speed and Pitch default to 1.0 in NewPodcastGenerator.
	Speed      float64    `json:"speed"`
	Pitch      float64    `json:"pitch"`
	Language   string     `json:"language"`
}

// PodcastGenerator builds podcast episodes: it asks an LLM for a script and
// can hand the resulting transcript to a TTS client for audio synthesis.
// Construct it with NewPodcastGenerator so the configuration defaults are
// applied.
type PodcastGenerator struct {
	// llm produces the episode script.
	llm        llm.Client
	// ttsClient synthesizes audio; it may be nil, in which case GenerateAudio
	// returns an error.
	ttsClient  TTSClient
	// NOTE(review): httpClient is initialized by the constructor but is not
	// used by any method visible in this file.
	httpClient *http.Client
	config     GeneratorConfig
}

// GeneratorConfig holds the tunable settings of a PodcastGenerator. Zero
// values for DefaultDuration, MaxDuration and DefaultVoice are replaced with
// defaults by NewPodcastGenerator.
type GeneratorConfig struct {
	// DefaultDuration is the target episode length in seconds used when the
	// caller does not request one; defaults to 300.
	DefaultDuration int
	// MaxDuration is the upper bound on episode length in seconds; defaults
	// to 1800.
	MaxDuration     int
	// DefaultVoice is used when GenerateOptions.VoiceConfig is nil.
	DefaultVoice    VoiceConfig
	// OutputDir is presumably where generated audio is written.
	// NOTE(review): it is not read anywhere in this file — confirm usage.
	OutputDir       string
}

// TTSClient converts text into audio. Implementations in this file are
// ElevenLabsTTS and DummyTTS.
type TTSClient interface {
	// GenerateSpeech synthesizes text using the given voice configuration and
	// returns the raw audio bytes.
	GenerateSpeech(ctx context.Context, text string, config VoiceConfig) ([]byte, error)
}

// NewPodcastGenerator wires an LLM client and a TTS client into a ready-to-use
// PodcastGenerator.
//
// Unset configuration values are filled in: DefaultDuration becomes 300
// seconds, MaxDuration becomes 1800 seconds, and a DefaultVoice without a
// Provider is replaced wholesale by the built-in ElevenLabs voice. cfg is
// received by value, so the caller's copy is not modified. ttsClient may be
// nil when audio synthesis is not needed.
func NewPodcastGenerator(llmClient llm.Client, ttsClient TTSClient, cfg GeneratorConfig) *PodcastGenerator {
	const (
		fallbackDuration    = 300
		fallbackMaxDuration = 1800
	)

	if cfg.DefaultDuration == 0 {
		cfg.DefaultDuration = fallbackDuration
	}
	if cfg.MaxDuration == 0 {
		cfg.MaxDuration = fallbackMaxDuration
	}

	// An empty provider means no voice was configured at all, so the whole
	// voice block is replaced rather than patched field by field.
	if cfg.DefaultVoice.Provider == "" {
		var voice VoiceConfig
		voice.Provider = "elevenlabs"
		voice.VoiceID = "21m00Tcm4TlvDq8ikWAM"
		voice.Style = VoiceNeutral
		voice.Speed = 1.0
		voice.Pitch = 1.0
		voice.Language = "ru"
		cfg.DefaultVoice = voice
	}

	generator := new(PodcastGenerator)
	generator.llm = llmClient
	generator.ttsClient = ttsClient
	generator.httpClient = &http.Client{Timeout: 60 * time.Second}
	generator.config = cfg
	return generator
}

// GenerateOptions describes a single episode request passed to
// GenerateDailyPodcast.
type GenerateOptions struct {
	Type          PodcastType
	Topics        []string
	// NewsItems are the stories the script should cover; they are sent to
	// the LLM as JSON.
	NewsItems     []NewsItem
	// Date is the day the episode covers; the zero value means "now".
	Date          time.Time
	// Duration is the target length in seconds; zero selects the generator's
	// default duration.
	Duration      int
	// Locale defaults to "ru" when empty.
	Locale        string
	// VoiceConfig overrides the generator's default voice when non-nil.
	VoiceConfig   *VoiceConfig
	// NOTE(review): IncludeIntro, IncludeOutro and PersonalizeFor are not
	// read by the generation code in this file.
	IncludeIntro  bool
	IncludeOutro  bool
	PersonalizeFor string
}

// NewsItem is one story supplied as input material for an episode. The whole
// slice of items is JSON-encoded into the LLM prompt.
type NewsItem struct {
	Title       string   `json:"title"`
	Summary     string   `json:"summary"`
	URL         string   `json:"url"`
	// Source is the name of the publisher of the story.
	Source      string   `json:"source"`
	// PublishedAt is a free-form timestamp string; its format is not
	// validated in this file.
	PublishedAt string   `json:"publishedAt"`
	Topics      []string `json:"topics"`
	// Importance ranks the story.
	// NOTE(review): scale and direction are not defined in this file.
	Importance  int      `json:"importance"`
}

// GenerateDailyPodcast produces a draft episode from opts.
//
// Missing options are defaulted before the script is requested: an empty Type
// becomes PodcastDaily, a zero Date becomes the current time, a non-positive
// Duration becomes the configured default, and an empty Locale becomes "ru".
// The duration is additionally capped at the configured MaxDuration when one
// is set. The script is then generated via the LLM and wrapped in a Podcast
// with status StatusDraft; no audio is produced here (see GenerateAudio).
//
// The voice configuration is opts.VoiceConfig when provided, otherwise the
// generator's default voice. An error is returned only when script generation
// fails.
func (g *PodcastGenerator) GenerateDailyPodcast(ctx context.Context, opts GenerateOptions) (*Podcast, error) {
	if opts.Type == "" {
		opts.Type = PodcastDaily
	}
	if opts.Date.IsZero() {
		opts.Date = time.Now()
	}
	// Treat negative durations like "unset" instead of passing them on to the
	// prompt and the segment arithmetic.
	if opts.Duration <= 0 {
		opts.Duration = g.config.DefaultDuration
	}
	// Enforce the configured ceiling; a zero MaxDuration (generator built
	// without the constructor) means "no limit".
	if limit := g.config.MaxDuration; limit > 0 && opts.Duration > limit {
		opts.Duration = limit
	}
	if opts.Locale == "" {
		opts.Locale = "ru"
	}

	script, err := g.generateScript(ctx, opts)
	if err != nil {
		return nil, fmt.Errorf("failed to generate script: %w", err)
	}

	voice := g.config.DefaultVoice
	if opts.VoiceConfig != nil {
		voice = *opts.VoiceConfig
	}

	// Keep "topics" a JSON array even when the caller supplied none.
	topics := opts.Topics
	if topics == nil {
		topics = []string{}
	}

	return &Podcast{
		ID:          uuid.New().String(),
		Title:       script.Title,
		Description: script.Description,
		Type:        opts.Type,
		Date:        opts.Date,
		Duration:    opts.Duration,
		Transcript:  script.FullText,
		Segments:    script.Segments,
		Topics:      topics,
		Sources:     script.Sources,
		Status:      StatusDraft,
		GeneratedAt: time.Now(),
		Locale:      opts.Locale,
		VoiceConfig: voice,
	}, nil
}

// PodcastScript is the intermediate result of script generation, before it is
// wrapped into a Podcast.
type PodcastScript struct {
	Title       string
	Description string
	// FullText is the complete transcript: the segment contents joined by
	// blank lines.
	FullText    string
	Segments    []PodcastSegment
	// Sources aggregates the sources of all segments.
	Sources     []Source
}

// generateScript asks the LLM for an episode script and converts the reply
// into a PodcastScript.
//
// The prompt embeds the date, target duration, topics and the JSON-encoded
// news items, and requests a JSON answer. The first JSON object found in the
// reply is parsed. If it cannot be parsed, or contains no segments (for
// example because the reply held no JSON at all), the built-in default script
// from generateDefaultScript is returned instead.
//
// Segment timings are estimates: each segment's duration is its word count
// divided by 2.5 words per second, with a floor of 10 seconds, and start/end
// offsets are accumulated in order.
//
// An error is returned when the news items cannot be encoded or the LLM call
// fails.
func (g *PodcastGenerator) generateScript(ctx context.Context, opts GenerateOptions) (*PodcastScript, error) {
	const (
		avgWordsPerSecond  = 2.5
		minSegmentDuration = 10
	)

	langInstruction := ""
	if opts.Locale == "ru" {
		langInstruction = "Generate the entire script in Russian language. Use natural Russian speech patterns."
	}

	newsJSON, err := json.Marshal(opts.NewsItems)
	if err != nil {
		return nil, fmt.Errorf("encoding news items: %w", err)
	}

	prompt := fmt.Sprintf(`Create a podcast script for a daily news digest.

Date: %s
Duration target: %d seconds (approximately %d minutes)
Topics: %v
%s

News items to cover:
%s

Create an engaging podcast script with these requirements:
1. Start with a catchy introduction greeting the audience
2. Cover the most important news first
3. Transition smoothly between stories
4. Add brief analysis or context where appropriate
5. End with a summary and sign-off

The script should sound natural when read aloud - use conversational language, not formal news anchor style.

Respond in JSON format:
{
  "title": "Podcast title for this episode",
  "description": "Brief episode description",
  "segments": [
    {
      "type": "intro|news|analysis|transition|outro",
      "title": "Segment title",
      "content": "Full text to be spoken",
      "highlights": ["Key point 1", "Key point 2"],
      "sources": [{"title": "Source title", "url": "url", "publisher": "publisher"}]
    }
  ]
}`, opts.Date.Format("2006-01-02"), opts.Duration, opts.Duration/60, opts.Topics, langInstruction, string(newsJSON))

	result, err := g.llm.GenerateText(ctx, llm.StreamRequest{
		Messages: []llm.Message{{Role: "user", Content: prompt}},
	})
	if err != nil {
		return nil, err
	}

	jsonStr := extractJSON(result)

	var parsed struct {
		Title       string `json:"title"`
		Description string `json:"description"`
		Segments    []struct {
			Type       string   `json:"type"`
			Title      string   `json:"title"`
			Content    string   `json:"content"`
			Highlights []string `json:"highlights"`
			Sources    []struct {
				Title     string `json:"title"`
				URL       string `json:"url"`
				Publisher string `json:"publisher"`
			} `json:"sources"`
		} `json:"segments"`
	}

	// extractJSON yields "{}" when the reply contains no JSON; that parses
	// without error, so an empty segment list must also trigger the fallback.
	if err := json.Unmarshal([]byte(jsonStr), &parsed); err != nil || len(parsed.Segments) == 0 {
		return g.generateDefaultScript(opts)
	}

	script := &PodcastScript{
		Title:       parsed.Title,
		Description: parsed.Description,
		Segments:    make([]PodcastSegment, 0, len(parsed.Segments)),
		Sources:     make([]Source, 0),
	}

	var fullTextBuilder strings.Builder
	currentTime := 0

	for i, seg := range parsed.Segments {
		// Estimate speaking time from the word count; very short segments
		// still get a minimum slot.
		wordCount := len(strings.Fields(seg.Content))
		segDuration := int(float64(wordCount) / avgWordsPerSecond)
		if segDuration < minSegmentDuration {
			segDuration = minSegmentDuration
		}

		segment := PodcastSegment{
			ID:         uuid.New().String(),
			Type:       seg.Type,
			Title:      seg.Title,
			Content:    seg.Content,
			Duration:   segDuration,
			StartTime:  currentTime,
			EndTime:    currentTime + segDuration,
			Highlights: seg.Highlights,
		}

		for _, src := range seg.Sources {
			source := Source{
				Title:     src.Title,
				URL:       src.URL,
				Publisher: src.Publisher,
			}
			segment.Sources = append(segment.Sources, source)
			script.Sources = append(script.Sources, source)
		}

		script.Segments = append(script.Segments, segment)

		if i > 0 {
			fullTextBuilder.WriteString("\n\n")
		}
		fullTextBuilder.WriteString(seg.Content)

		currentTime += segDuration
	}

	script.FullText = fullTextBuilder.String()

	return script, nil
}

// generateDefaultScript builds a fixed-template Russian script without using
// the LLM. It is the fallback when the LLM reply cannot be used.
//
// The script has three segments: a 15-second intro, a news segment that reads
// every news item as "<title>. <summary>", and a 15-second outro. The news
// segment receives whatever remains of opts.Duration after intro and outro,
// never less than zero. Segment start/end offsets are filled in consecutively,
// and each news item is recorded as a source on both the news segment and the
// script. It never returns an error.
func (g *PodcastGenerator) generateDefaultScript(opts GenerateOptions) (*PodcastScript, error) {
	const (
		introDuration = 15
		outroDuration = 15
	)

	date := opts.Date.Format("2 January 2006")

	intro := fmt.Sprintf("Добрый день! С вами GooSeek Daily — ваш ежедневный подкаст с главными новостями. Сегодня %s, и вот что происходит в мире.", date)
	outro := "На этом всё на сегодня. Спасибо, что слушаете GooSeek Daily! Подписывайтесь на наш подкаст и до встречи завтра."

	var newsContent strings.Builder
	sources := make([]Source, 0, len(opts.NewsItems))
	for i, news := range opts.NewsItems {
		if i > 0 {
			newsContent.WriteString("\n\n")
		}
		fmt.Fprintf(&newsContent, "%s. %s", news.Title, news.Summary)
		sources = append(sources, Source{
			Title:     news.Title,
			URL:       news.URL,
			Publisher: news.Source,
			Date:      news.PublishedAt,
		})
	}
	newsText := newsContent.String()

	// A target shorter than intro+outro must not produce a negative duration.
	newsDuration := opts.Duration - introDuration - outroDuration
	if newsDuration < 0 {
		newsDuration = 0
	}

	// Join only non-empty parts so an episode without news items does not
	// contain a blank gap in the transcript.
	parts := make([]string, 0, 3)
	for _, part := range []string{intro, newsText, outro} {
		if part != "" {
			parts = append(parts, part)
		}
	}

	newsStart := introDuration
	outroStart := newsStart + newsDuration

	return &PodcastScript{
		Title:       fmt.Sprintf("GooSeek Daily — %s", date),
		Description: "Ежедневный подкаст с главными новостями",
		FullText:    strings.Join(parts, "\n\n"),
		Segments: []PodcastSegment{
			{
				ID:        uuid.New().String(),
				Type:      "intro",
				Title:     "Вступление",
				Content:   intro,
				Duration:  introDuration,
				StartTime: 0,
				EndTime:   newsStart,
			},
			{
				ID:        uuid.New().String(),
				Type:      "news",
				Title:     "Новости",
				Content:   newsText,
				Duration:  newsDuration,
				StartTime: newsStart,
				EndTime:   outroStart,
				Sources:   sources,
			},
			{
				ID:        uuid.New().String(),
				Type:      "outro",
				Title:     "Завершение",
				Content:   outro,
				Duration:  outroDuration,
				StartTime: outroStart,
				EndTime:   outroStart + outroDuration,
			},
		},
		Sources: sources,
	}, nil
}

// GenerateAudio synthesizes the podcast's transcript with the configured TTS
// client and returns the raw audio bytes.
//
// The podcast's Status is updated in place: StatusGenerating while the TTS
// call runs, then StatusReady on success or StatusFailed on error. The audio
// is only returned; AudioURL is not set here.
//
// An error is returned when no TTS client is configured, when podcast is nil,
// or when the TTS client fails.
func (g *PodcastGenerator) GenerateAudio(ctx context.Context, podcast *Podcast) ([]byte, error) {
	if g.ttsClient == nil {
		return nil, fmt.Errorf("TTS client not configured")
	}
	// Guard against a nil episode instead of panicking on the status write.
	if podcast == nil {
		return nil, fmt.Errorf("podcast is nil")
	}

	podcast.Status = StatusGenerating

	audioData, err := g.ttsClient.GenerateSpeech(ctx, podcast.Transcript, podcast.VoiceConfig)
	if err != nil {
		podcast.Status = StatusFailed
		return nil, fmt.Errorf("failed to generate audio: %w", err)
	}

	podcast.Status = StatusReady

	return audioData, nil
}

// GenerateWeeklySummary produces a weekly episode from the given news items.
// It delegates to GenerateDailyPodcast with type PodcastWeekly, a 900-second
// target duration, and intro and outro enabled.
func (g *PodcastGenerator) GenerateWeeklySummary(ctx context.Context, weeklyNews []NewsItem, locale string) (*Podcast, error) {
	var request GenerateOptions
	request.Type = PodcastWeekly
	request.NewsItems = weeklyNews
	request.Duration = 900
	request.Locale = locale
	request.IncludeIntro = true
	request.IncludeOutro = true

	return g.GenerateDailyPodcast(ctx, request)
}

// GenerateTopicDeepDive produces a single-topic episode from the given
// articles. It delegates to GenerateDailyPodcast with type PodcastTopicDeep,
// the topic as the only entry in Topics, a 600-second target duration, and
// intro and outro enabled.
func (g *PodcastGenerator) GenerateTopicDeepDive(ctx context.Context, topic string, articles []NewsItem, locale string) (*Podcast, error) {
	var request GenerateOptions
	request.Type = PodcastTopicDeep
	request.Topics = []string{topic}
	request.NewsItems = articles
	request.Duration = 600
	request.Locale = locale
	request.IncludeIntro = true
	request.IncludeOutro = true

	return g.GenerateDailyPodcast(ctx, request)
}

// extractJSON returns the first JSON object embedded in text, or "{}" when
// text contains no balanced {...} span.
//
// LLM replies often wrap the JSON in prose or code fences, so the text is
// scanned for brace-balanced spans. Braces inside JSON string literals are
// ignored, so a "}" in a spoken sentence does not cut the object short. The
// first span that is valid JSON wins; spans that are balanced but invalid
// (e.g. "{draft}" in surrounding prose) are skipped. If no span is valid
// JSON, the first balanced span is returned unchanged so the caller's
// unmarshal fails and it can fall back.
func extractJSON(text string) string {
	firstBalanced := ""

	for pos := 0; pos < len(text); {
		offset := strings.IndexByte(text[pos:], '{')
		if offset == -1 {
			break
		}
		start := pos + offset

		end := jsonObjectEnd(text, start)
		if end == -1 {
			// This brace never closes; a later one still might.
			pos = start + 1
			continue
		}

		candidate := text[start : end+1]
		if json.Valid([]byte(candidate)) {
			return candidate
		}
		if firstBalanced == "" {
			firstBalanced = candidate
		}
		// Skip the whole invalid span so its nested objects are not mistaken
		// for the top-level answer.
		pos = end + 1
	}

	if firstBalanced != "" {
		return firstBalanced
	}
	return "{}"
}

// jsonObjectEnd returns the index of the '}' that closes the '{' at
// text[start], ignoring braces inside double-quoted strings (with backslash
// escapes), or -1 when the object is never closed.
func jsonObjectEnd(text string, start int) int {
	depth := 0
	inString := false
	escaped := false

	for i := start; i < len(text); i++ {
		c := text[i]

		if inString {
			switch {
			case escaped:
				escaped = false
			case c == '\\':
				escaped = true
			case c == '"':
				inString = false
			}
			continue
		}

		switch c {
		case '"':
			inString = true
		case '{':
			depth++
		case '}':
			depth--
			if depth == 0 {
				return i
			}
		}
	}

	return -1
}

// ToJSON encodes the podcast as JSON using the struct's field tags.
func (p *Podcast) ToJSON() ([]byte, error) {
	encoded, err := json.Marshal(p)
	return encoded, err
}

// ParsePodcast decodes a JSON document into a Podcast. It returns a nil
// podcast together with the decoding error when data is not valid for the
// Podcast schema.
func ParsePodcast(data []byte) (*Podcast, error) {
	decoded := new(Podcast)
	if err := json.Unmarshal(data, decoded); err != nil {
		return nil, err
	}
	return decoded, nil
}

// ElevenLabsTTS is a TTSClient backed by the ElevenLabs HTTP API.
type ElevenLabsTTS struct {
	// apiKey is sent in the "xi-api-key" request header.
	apiKey     string
	httpClient *http.Client
	// baseURL is the API root that endpoint paths are appended to.
	baseURL    string
}

// NewElevenLabsTTS returns an ElevenLabs client that authenticates with
// apiKey, targets the public v1 API, and uses an HTTP client with a
// 120-second timeout.
func NewElevenLabsTTS(apiKey string) *ElevenLabsTTS {
	client := new(ElevenLabsTTS)
	client.apiKey = apiKey
	client.httpClient = &http.Client{Timeout: 120 * time.Second}
	client.baseURL = "https://api.elevenlabs.io/v1"
	return client
}

// GenerateSpeech sends text to the ElevenLabs text-to-speech endpoint and
// returns the audio bytes of the response (requested as audio/mpeg).
//
// Only config.VoiceID is used; when it is empty a built-in default voice is
// selected. The model and voice settings are fixed in the request body.
//
// An error is returned when the request cannot be built or sent, when the API
// answers with a status other than 200 (the beginning of the response body is
// included in the error when available), or when reading the audio stream
// fails. Partial audio is never returned.
func (t *ElevenLabsTTS) GenerateSpeech(ctx context.Context, text string, config VoiceConfig) ([]byte, error) {
	voiceID := config.VoiceID
	if voiceID == "" {
		voiceID = "21m00Tcm4TlvDq8ikWAM"
	}

	endpoint := fmt.Sprintf("%s/text-to-speech/%s", t.baseURL, voiceID)

	payload := map[string]interface{}{
		"text":     text,
		"model_id": "eleven_multilingual_v2",
		"voice_settings": map[string]interface{}{
			"stability":         0.5,
			"similarity_boost":  0.75,
			"style":             0.5,
			"use_speaker_boost": true,
		},
	}

	bodyJSON, err := json.Marshal(payload)
	if err != nil {
		return nil, fmt.Errorf("encoding ElevenLabs request: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, strings.NewReader(string(bodyJSON)))
	if err != nil {
		return nil, err
	}

	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("xi-api-key", t.apiKey)
	req.Header.Set("Accept", "audio/mpeg")

	resp, err := t.httpClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: surface a bounded slice of the error payload; a read
		// failure here must not mask the status code.
		detail, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
		if msg := strings.TrimSpace(string(detail)); msg != "" {
			return nil, fmt.Errorf("ElevenLabs API error: %d: %s", resp.StatusCode, msg)
		}
		return nil, fmt.Errorf("ElevenLabs API error: %d", resp.StatusCode)
	}

	// io.ReadAll treats EOF as success and reports every other read error, so
	// a connection dropped mid-stream is not mistaken for complete audio.
	audioData, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("reading ElevenLabs audio response: %w", err)
	}

	return audioData, nil
}

// DummyTTS is a no-op TTSClient that produces no audio.
type DummyTTS struct{}

// GenerateSpeech ignores its arguments and returns an empty, non-nil byte
// slice with no error.
func (t *DummyTTS) GenerateSpeech(ctx context.Context, text string, config VoiceConfig) ([]byte, error) {
	silence := make([]byte, 0)
	return silence, nil
}