package main import ( "context" "fmt" "io" "log" "net/http" "os" "regexp" "strings" "time" "github.com/gofiber/fiber/v2" "github.com/gofiber/fiber/v2/middleware/cors" "github.com/gofiber/fiber/v2/middleware/logger" "github.com/gooseek/backend/pkg/config" ) type ScrapeRequest struct { URL string `json:"url"` } type ScrapeResponse struct { URL string `json:"url"` Title string `json:"title"` Content string `json:"content"` Success bool `json:"success"` Error string `json:"error,omitempty"` } func main() { cfg, err := config.Load() if err != nil { log.Fatal("Failed to load config:", err) } app := fiber.New(fiber.Config{ BodyLimit: 10 * 1024 * 1024, ReadTimeout: 30 * time.Second, WriteTimeout: 30 * time.Second, IdleTimeout: 60 * time.Second, }) app.Use(logger.New()) app.Use(cors.New()) app.Get("/health", func(c *fiber.Ctx) error { return c.JSON(fiber.Map{"status": "ok"}) }) app.Post("/api/v1/scrape", func(c *fiber.Ctx) error { var req ScrapeRequest if err := c.BodyParser(&req); err != nil { return c.Status(400).JSON(fiber.Map{"error": "Invalid request body"}) } if req.URL == "" { return c.Status(400).JSON(fiber.Map{"error": "URL is required"}) } ctx, cancel := context.WithTimeout(context.Background(), cfg.ScrapeTimeout) defer cancel() result := scrapeURL(ctx, req.URL, cfg) return c.JSON(result) }) app.Post("/api/v1/scrape/batch", func(c *fiber.Ctx) error { var req struct { URLs []string `json:"urls"` } if err := c.BodyParser(&req); err != nil { return c.Status(400).JSON(fiber.Map{"error": "Invalid request body"}) } if len(req.URLs) == 0 { return c.Status(400).JSON(fiber.Map{"error": "URLs are required"}) } if len(req.URLs) > 10 { req.URLs = req.URLs[:10] } ctx, cancel := context.WithTimeout(context.Background(), cfg.ScrapeTimeout*3) defer cancel() results := make([]ScrapeResponse, len(req.URLs)) resultCh := make(chan struct { index int result ScrapeResponse }, len(req.URLs)) for i, url := range req.URLs { go func(idx int, u string) { resultCh <- struct { index int result ScrapeResponse }{idx, scrapeURL(ctx, u, cfg)} }(i, url) } for range req.URLs { r := <-resultCh results[r.index] = r.result } return c.JSON(fiber.Map{"results": results}) }) port := cfg.ScraperSvcPort log.Printf("scraper-svc listening on :%d", port) log.Fatal(app.Listen(fmt.Sprintf(":%d", port))) } func scrapeURL(ctx context.Context, url string, cfg *config.Config) ScrapeResponse { if cfg.Crawl4AIURL != "" { result, err := scrapeWithCrawl4AI(ctx, url, cfg.Crawl4AIURL) if err == nil && result.Success { return *result } } return scrapeDirectly(ctx, url) } func scrapeWithCrawl4AI(ctx context.Context, url, crawl4aiURL string) (*ScrapeResponse, error) { reqBody := fmt.Sprintf(`{ "urls": ["%s"], "crawler_config": { "type": "CrawlerRunConfig", "params": { "cache_mode": "default", "page_timeout": 20000 } } }`, url) req, err := http.NewRequestWithContext(ctx, "POST", crawl4aiURL+"/crawl", strings.NewReader(reqBody)) if err != nil { return nil, err } req.Header.Set("Content-Type", "application/json") client := &http.Client{Timeout: 25 * time.Second} resp, err := client.Do(req) if err != nil { return nil, err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { return nil, fmt.Errorf("Crawl4AI returned status %d", resp.StatusCode) } body, err := io.ReadAll(resp.Body) if err != nil { return nil, err } markdown := extractMarkdown(string(body)) title := extractTitle(string(body)) if len(markdown) > 100 { return &ScrapeResponse{ URL: url, Title: title, Content: truncate(markdown, 15000), Success: true, }, nil } return nil, fmt.Errorf("insufficient content from Crawl4AI") } func scrapeDirectly(ctx context.Context, url string) ScrapeResponse { req, err := http.NewRequestWithContext(ctx, "GET", url, nil) if err != nil { return ScrapeResponse{URL: url, Success: false, Error: err.Error()} } req.Header.Set("User-Agent", "GooSeek-Scraper/1.0") req.Header.Set("Accept", "text/html,application/xhtml+xml") client := &http.Client{Timeout: 10 * time.Second} resp, err := client.Do(req) if err != nil { return ScrapeResponse{URL: url, Success: false, Error: err.Error()} } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { return ScrapeResponse{URL: url, Success: false, Error: fmt.Sprintf("HTTP %d", resp.StatusCode)} } body, err := io.ReadAll(resp.Body) if err != nil { return ScrapeResponse{URL: url, Success: false, Error: err.Error()} } html := string(body) title := extractHTMLTitle(html) content := extractTextContent(html) if len(content) < 100 { return ScrapeResponse{URL: url, Success: false, Error: "Insufficient content"} } return ScrapeResponse{ URL: url, Title: title, Content: truncate(content, 15000), Success: true, } } var ( titleRegex = regexp.MustCompile(`