Files
home 06fe57c765 feat: Go backend, enhanced search, new widgets, Docker deploy
Major changes:
- Add Go backend (backend/) with microservices architecture
- Enhanced master-agents-svc: reranker, content-classifier, stealth-crawler,
  proxy-manager, media-search, fastClassifier, language detection
- New web-svc widgets: KnowledgeCard, ProductCard, ProfileCard, VideoCard,
  UnifiedCard, CardGallery, InlineImageGallery, SourcesPanel, RelatedQuestions
- Improved discover-svc with discover-db integration
- Docker deployment improvements (Caddyfile, vendor.sh, BUILD.md)
- Library-svc: project_id schema migration
- Remove deprecated finance-svc and travel-svc
- Localization improvements across services

Made-with: Cursor
2026-02-27 04:15:32 +03:00

285 lines
6.7 KiB
Go

package main
import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"regexp"
	"strings"
	"time"
	"unicode/utf8"

	"github.com/gofiber/fiber/v2"
	"github.com/gofiber/fiber/v2/middleware/cors"
	"github.com/gofiber/fiber/v2/middleware/logger"
	"github.com/gooseek/backend/pkg/config"
)
// ScrapeRequest is the JSON body accepted by POST /api/v1/scrape.
type ScrapeRequest struct {
	URL string `json:"url"` // page to fetch; required, rejected with 400 when empty
}
// ScrapeResponse is the per-URL result returned by both scrape endpoints.
type ScrapeResponse struct {
	URL     string `json:"url"`             // the URL that was scraped
	Title   string `json:"title"`           // extracted page title; "" when none was found
	Content string `json:"content"`         // extracted markdown/plain text, capped at 15000 bytes
	Success bool   `json:"success"`         // true when usable content was extracted
	Error   string `json:"error,omitempty"` // failure reason, set only when Success is false
}
// main wires up and runs the scraper service: it loads configuration, builds
// the Fiber app with request logging and CORS middleware, registers the
// health and scrape routes, and serves on the configured port until a fatal
// error occurs.
func main() {
	cfg, err := config.Load()
	if err != nil {
		log.Fatal("Failed to load config:", err)
	}

	app := fiber.New(fiber.Config{
		BodyLimit:    10 * 1024 * 1024, // 10 MiB request body cap
		ReadTimeout:  30 * time.Second,
		WriteTimeout: 30 * time.Second,
		IdleTimeout:  60 * time.Second,
	})
	app.Use(logger.New())
	app.Use(cors.New())

	// Liveness probe for deployment health checks.
	app.Get("/health", func(c *fiber.Ctx) error {
		return c.JSON(fiber.Map{"status": "ok"})
	})
	app.Post("/api/v1/scrape", handleScrape(cfg))
	app.Post("/api/v1/scrape/batch", handleScrapeBatch(cfg))

	port := cfg.ScraperSvcPort
	log.Printf("scraper-svc listening on :%d", port)
	log.Fatal(app.Listen(fmt.Sprintf(":%d", port)))
}

// handleScrape builds the POST /api/v1/scrape handler: it validates the JSON
// body and scrapes a single URL within cfg.ScrapeTimeout.
func handleScrape(cfg *config.Config) fiber.Handler {
	return func(c *fiber.Ctx) error {
		var req ScrapeRequest
		if err := c.BodyParser(&req); err != nil {
			return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{"error": "Invalid request body"})
		}
		if req.URL == "" {
			return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{"error": "URL is required"})
		}
		ctx, cancel := context.WithTimeout(context.Background(), cfg.ScrapeTimeout)
		defer cancel()
		return c.JSON(scrapeURL(ctx, req.URL, cfg))
	}
}

// handleScrapeBatch builds the POST /api/v1/scrape/batch handler: it scrapes
// up to maxBatchURLs URLs concurrently and returns the results in request
// order.
func handleScrapeBatch(cfg *config.Config) fiber.Handler {
	// indexedResult pairs a scrape result with its position in the request
	// slice so concurrent completions can be reassembled in order.
	type indexedResult struct {
		index  int
		result ScrapeResponse
	}
	const maxBatchURLs = 10

	return func(c *fiber.Ctx) error {
		var req struct {
			URLs []string `json:"urls"`
		}
		if err := c.BodyParser(&req); err != nil {
			return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{"error": "Invalid request body"})
		}
		if len(req.URLs) == 0 {
			return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{"error": "URLs are required"})
		}
		// Cap the fan-out; URLs beyond the cap are silently dropped.
		// NOTE(review): callers get no indication of the truncation — confirm
		// clients tolerate this before changing it to a 400.
		if len(req.URLs) > maxBatchURLs {
			req.URLs = req.URLs[:maxBatchURLs]
		}

		// One shared deadline for the whole batch: 3x the single-URL timeout.
		ctx, cancel := context.WithTimeout(context.Background(), cfg.ScrapeTimeout*3)
		defer cancel()

		// Fan out one goroutine per URL. The channel is buffered to the batch
		// size so every sender can complete even if the receiver lags.
		resultCh := make(chan indexedResult, len(req.URLs))
		for i, u := range req.URLs {
			go func(idx int, target string) {
				resultCh <- indexedResult{index: idx, result: scrapeURL(ctx, target, cfg)}
			}(i, u)
		}

		// Collect exactly one result per URL, slotting each into its original
		// position.
		results := make([]ScrapeResponse, len(req.URLs))
		for range req.URLs {
			r := <-resultCh
			results[r.index] = r.result
		}
		return c.JSON(fiber.Map{"results": results})
	}
}
// scrapeURL fetches a page, preferring the Crawl4AI service when one is
// configured and falling back to a direct HTTP fetch when Crawl4AI is
// unconfigured, errors out, or reports an unsuccessful result.
func scrapeURL(ctx context.Context, url string, cfg *config.Config) ScrapeResponse {
	if cfg.Crawl4AIURL == "" {
		return scrapeDirectly(ctx, url)
	}
	if result, err := scrapeWithCrawl4AI(ctx, url, cfg.Crawl4AIURL); err == nil && result.Success {
		return *result
	}
	return scrapeDirectly(ctx, url)
}
// scrapeWithCrawl4AI asks a Crawl4AI instance to render and extract the page
// at url, returning a successful ScrapeResponse only when the service yields
// more than 100 bytes of markdown. Any transport failure, non-200 status, or
// insufficient content is returned as an error so the caller can fall back
// to a direct fetch.
func scrapeWithCrawl4AI(ctx context.Context, url, crawl4aiURL string) (*ScrapeResponse, error) {
	// Marshal the payload instead of fmt.Sprintf-ing it: a URL containing
	// quotes, backslashes, or control characters would otherwise corrupt the
	// JSON body (or inject extra fields into the request).
	payload, err := json.Marshal(map[string]interface{}{
		"urls": []string{url},
		"crawler_config": map[string]interface{}{
			"type": "CrawlerRunConfig",
			"params": map[string]interface{}{
				"cache_mode":   "default",
				"page_timeout": 20000, // ms; Crawl4AI-side render budget
			},
		},
	})
	if err != nil {
		return nil, err
	}
	req, err := http.NewRequestWithContext(ctx, "POST", crawl4aiURL+"/crawl", strings.NewReader(string(payload)))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", "application/json")
	// Client timeout slightly above the 20s page_timeout; ctx may cancel
	// earlier.
	client := &http.Client{Timeout: 25 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("Crawl4AI returned status %d", resp.StatusCode)
	}
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	raw := string(body)
	markdown := extractMarkdown(raw)
	if len(markdown) <= 100 {
		return nil, fmt.Errorf("insufficient content from Crawl4AI")
	}
	return &ScrapeResponse{
		URL:     url,
		Title:   extractTitle(raw),
		Content: truncate(markdown, 15000),
		Success: true,
	}, nil
}
// scrapeDirectly fetches the URL itself and strips the returned HTML down to
// plain text. Every failure mode — request construction, transport error,
// non-200 status, unreadable body, or near-empty content — is reported in
// the response's Error field rather than as a Go error.
func scrapeDirectly(ctx context.Context, url string) ScrapeResponse {
	fail := func(msg string) ScrapeResponse {
		return ScrapeResponse{URL: url, Success: false, Error: msg}
	}

	req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
	if err != nil {
		return fail(err.Error())
	}
	req.Header.Set("User-Agent", "GooSeek-Scraper/1.0")
	req.Header.Set("Accept", "text/html,application/xhtml+xml")

	resp, err := (&http.Client{Timeout: 10 * time.Second}).Do(req)
	if err != nil {
		return fail(err.Error())
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fail(fmt.Sprintf("HTTP %d", resp.StatusCode))
	}

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return fail(err.Error())
	}
	page := string(raw)
	text := extractTextContent(page)
	if len(text) < 100 {
		return fail("Insufficient content")
	}
	return ScrapeResponse{
		URL:     url,
		Title:   extractHTMLTitle(page),
		Content: truncate(text, 15000),
		Success: true,
	}
}
// HTML-processing regexes, compiled once at package scope rather than per
// request.
var (
	// titleRegex captures the text of the first <title> element.
	titleRegex = regexp.MustCompile(`<title[^>]*>([^<]+)</title>`)
	// scriptRegex and styleRegex match whole <script>/<style> blocks
	// including their contents; (?s) lets . span newlines.
	scriptRegex = regexp.MustCompile(`(?s)<script[^>]*>.*?</script>`)
	styleRegex  = regexp.MustCompile(`(?s)<style[^>]*>.*?</style>`)
	// tagRegex matches any remaining HTML tag so it can be replaced by a space.
	tagRegex = regexp.MustCompile(`<[^>]+>`)
	// spaceRegex matches runs of whitespace for collapsing to single spaces.
	spaceRegex = regexp.MustCompile(`\s+`)
)
// extractHTMLTitle pulls the text of the first <title> element out of an
// HTML document, trimmed of surrounding whitespace. It returns "" when no
// title element is present.
func extractHTMLTitle(html string) string {
	m := titleRegex.FindStringSubmatch(html)
	if len(m) < 2 {
		return ""
	}
	return strings.TrimSpace(m[1])
}
// extractTextContent reduces an HTML document to whitespace-normalized plain
// text: it narrows to the <body> element when one can be located, removes
// <script> and <style> blocks, drops all remaining tags, and collapses runs
// of whitespace into single spaces.
func extractTextContent(html string) string {
	lower := strings.ToLower(html)
	bodyOpen := strings.Index(lower, "<body")
	bodyClose := strings.Index(lower, "</body>")
	if bodyOpen != -1 && bodyClose > bodyOpen {
		html = html[bodyOpen:bodyClose]
	}
	// Strip executable/style content first so its text doesn't leak into the
	// extracted body text.
	for _, re := range []*regexp.Regexp{scriptRegex, styleRegex} {
		html = re.ReplaceAllString(html, "")
	}
	html = tagRegex.ReplaceAllString(html, " ")
	html = spaceRegex.ReplaceAllString(html, " ")
	return strings.TrimSpace(html)
}
// extractMarkdown returns the first "raw_markdown" string value found
// anywhere in the Crawl4AI JSON response, or "" when the response is not
// valid JSON or contains no such value.
//
// The previous hand-rolled quote scanner was broken: after locating the key
// it skipped only the key's own opening and closing quotes and then captured
// up to the *opening* quote of the value, returning the ":" separator
// instead of the value — and it could not handle JSON escape sequences.
// Decoding the document properly fixes both problems.
func extractMarkdown(response string) string {
	var doc interface{}
	if err := json.Unmarshal([]byte(response), &doc); err != nil {
		return ""
	}
	if s, ok := findJSONString(doc, "raw_markdown"); ok {
		return s
	}
	return ""
}

// extractTitle returns the first "title" string value found anywhere in the
// Crawl4AI JSON response, or "" when the response is not valid JSON or
// contains no such value. (Same bug fix as extractMarkdown.)
func extractTitle(response string) string {
	var doc interface{}
	if err := json.Unmarshal([]byte(response), &doc); err != nil {
		return ""
	}
	if s, ok := findJSONString(doc, "title"); ok {
		return s
	}
	return ""
}

// findJSONString depth-first searches a decoded JSON document for a string
// value stored under key at any nesting level, checking each object's own
// key before descending into its children.
func findJSONString(v interface{}, key string) (string, bool) {
	switch node := v.(type) {
	case map[string]interface{}:
		if s, ok := node[key].(string); ok {
			return s, true
		}
		for _, child := range node {
			if s, ok := findJSONString(child, key); ok {
				return s, true
			}
		}
	case []interface{}:
		for _, child := range node {
			if s, ok := findJSONString(child, key); ok {
				return s, true
			}
		}
	}
	return "", false
}
// truncate caps s at maxLen bytes without splitting a multi-byte UTF-8
// sequence: when the byte cut would land inside a rune, it backs up to the
// previous rune boundary so the result stays valid UTF-8. (The previous
// plain s[:maxLen] could emit a mangled partial character at the boundary,
// which then round-trips badly through the JSON response.)
func truncate(s string, maxLen int) string {
	if len(s) <= maxLen {
		return s
	}
	cut := maxLen
	// Step back over UTF-8 continuation bytes until we sit on a rune start.
	for cut > 0 && !utf8.RuneStart(s[cut]) {
		cut--
	}
	return s[:cut]
}
// init seeds a default PORT so the service can start in environments (e.g.
// local development) where no port was configured explicitly.
func init() {
	if current := os.Getenv("PORT"); current == "" {
		os.Setenv("PORT", "3021")
	}
}