Major changes: - Add Go backend (backend/) with microservices architecture - Enhanced master-agents-svc: reranker, content-classifier, stealth-crawler, proxy-manager, media-search, fastClassifier, language detection - New web-svc widgets: KnowledgeCard, ProductCard, ProfileCard, VideoCard, UnifiedCard, CardGallery, InlineImageGallery, SourcesPanel, RelatedQuestions - Improved discover-svc with discover-db integration - Docker deployment improvements (Caddyfile, vendor.sh, BUILD.md) - Library-svc: project_id schema migration - Remove deprecated finance-svc and travel-svc - Localization improvements across services Made-with: Cursor
285 lines
6.7 KiB
Go
285 lines
6.7 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/gofiber/fiber/v2"
|
|
"github.com/gofiber/fiber/v2/middleware/cors"
|
|
"github.com/gofiber/fiber/v2/middleware/logger"
|
|
"github.com/gooseek/backend/pkg/config"
|
|
)
|
|
|
|
// ScrapeRequest is the JSON body accepted by POST /api/v1/scrape.
type ScrapeRequest struct {
	// URL is the page to fetch and extract content from.
	URL string `json:"url"`
}
|
|
|
|
// ScrapeResponse is the per-URL result returned by the scrape endpoints.
// Failures are reported in-band: Success=false with Error set, alongside
// an HTTP 200 from the handler.
type ScrapeResponse struct {
	// URL echoes the requested URL.
	URL string `json:"url"`
	// Title is the extracted page title ("" when none was found).
	Title string `json:"title"`
	// Content is the extracted text/markdown, truncated to 15000 bytes.
	Content string `json:"content"`
	// Success reports whether usable content was extracted.
	Success bool `json:"success"`
	// Error carries the failure reason when Success is false.
	Error string `json:"error,omitempty"`
}
|
|
|
|
// main boots the scraper-svc HTTP server: loads config, builds a Fiber app
// with body/time limits, installs logging and CORS middleware, and exposes a
// health probe plus single and batch scrape endpoints.
func main() {
	cfg, err := config.Load()
	if err != nil {
		log.Fatal("Failed to load config:", err)
	}

	app := fiber.New(fiber.Config{
		BodyLimit:    10 * 1024 * 1024, // 10 MiB max request body
		ReadTimeout:  30 * time.Second,
		WriteTimeout: 30 * time.Second,
		IdleTimeout:  60 * time.Second,
	})

	app.Use(logger.New())
	app.Use(cors.New())

	// Liveness/readiness probe.
	app.Get("/health", func(c *fiber.Ctx) error {
		return c.JSON(fiber.Map{"status": "ok"})
	})

	// POST /api/v1/scrape — scrape a single URL. Scrape failures come back
	// in-band (Success=false) with HTTP 200; only malformed requests get 400.
	app.Post("/api/v1/scrape", func(c *fiber.Ctx) error {
		var req ScrapeRequest
		if err := c.BodyParser(&req); err != nil {
			return c.Status(400).JSON(fiber.Map{"error": "Invalid request body"})
		}

		if req.URL == "" {
			return c.Status(400).JSON(fiber.Map{"error": "URL is required"})
		}

		// Bound the whole scrape (Crawl4AI + fallback) by the configured timeout.
		ctx, cancel := context.WithTimeout(context.Background(), cfg.ScrapeTimeout)
		defer cancel()

		result := scrapeURL(ctx, req.URL, cfg)
		return c.JSON(result)
	})

	// POST /api/v1/scrape/batch — scrape up to 10 URLs concurrently,
	// preserving input order in the results array.
	app.Post("/api/v1/scrape/batch", func(c *fiber.Ctx) error {
		var req struct {
			URLs []string `json:"urls"`
		}
		if err := c.BodyParser(&req); err != nil {
			return c.Status(400).JSON(fiber.Map{"error": "Invalid request body"})
		}

		if len(req.URLs) == 0 {
			return c.Status(400).JSON(fiber.Map{"error": "URLs are required"})
		}

		// NOTE(review): URLs beyond the first 10 are silently dropped; the
		// caller gets no indication the list was truncated.
		if len(req.URLs) > 10 {
			req.URLs = req.URLs[:10]
		}

		// The batch budget is 3x the single-scrape timeout, shared by all URLs.
		ctx, cancel := context.WithTimeout(context.Background(), cfg.ScrapeTimeout*3)
		defer cancel()

		results := make([]ScrapeResponse, len(req.URLs))
		// Buffered to len(URLs) so no scraper goroutine can block on send,
		// even if the handler were to bail early.
		resultCh := make(chan struct {
			index int
			result ScrapeResponse
		}, len(req.URLs))

		// One goroutine per URL (bounded at 10 by the truncation above).
		// i/url are passed as arguments to avoid the pre-Go-1.22
		// loop-variable capture pitfall.
		for i, url := range req.URLs {
			go func(idx int, u string) {
				resultCh <- struct {
					index int
					result ScrapeResponse
				}{idx, scrapeURL(ctx, u, cfg)}
			}(i, url)
		}

		// Collect exactly len(URLs) results; the carried index restores
		// input order regardless of completion order.
		for range req.URLs {
			r := <-resultCh
			results[r.index] = r.result
		}

		return c.JSON(fiber.Map{"results": results})
	})

	port := cfg.ScraperSvcPort
	log.Printf("scraper-svc listening on :%d", port)
	log.Fatal(app.Listen(fmt.Sprintf(":%d", port)))
}
|
|
|
|
func scrapeURL(ctx context.Context, url string, cfg *config.Config) ScrapeResponse {
|
|
if cfg.Crawl4AIURL != "" {
|
|
result, err := scrapeWithCrawl4AI(ctx, url, cfg.Crawl4AIURL)
|
|
if err == nil && result.Success {
|
|
return *result
|
|
}
|
|
}
|
|
|
|
return scrapeDirectly(ctx, url)
|
|
}
|
|
|
|
func scrapeWithCrawl4AI(ctx context.Context, url, crawl4aiURL string) (*ScrapeResponse, error) {
|
|
reqBody := fmt.Sprintf(`{
|
|
"urls": ["%s"],
|
|
"crawler_config": {
|
|
"type": "CrawlerRunConfig",
|
|
"params": {
|
|
"cache_mode": "default",
|
|
"page_timeout": 20000
|
|
}
|
|
}
|
|
}`, url)
|
|
|
|
req, err := http.NewRequestWithContext(ctx, "POST", crawl4aiURL+"/crawl", strings.NewReader(reqBody))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
req.Header.Set("Content-Type", "application/json")
|
|
|
|
client := &http.Client{Timeout: 25 * time.Second}
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("Crawl4AI returned status %d", resp.StatusCode)
|
|
}
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
markdown := extractMarkdown(string(body))
|
|
title := extractTitle(string(body))
|
|
|
|
if len(markdown) > 100 {
|
|
return &ScrapeResponse{
|
|
URL: url,
|
|
Title: title,
|
|
Content: truncate(markdown, 15000),
|
|
Success: true,
|
|
}, nil
|
|
}
|
|
|
|
return nil, fmt.Errorf("insufficient content from Crawl4AI")
|
|
}
|
|
|
|
func scrapeDirectly(ctx context.Context, url string) ScrapeResponse {
|
|
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
|
if err != nil {
|
|
return ScrapeResponse{URL: url, Success: false, Error: err.Error()}
|
|
}
|
|
|
|
req.Header.Set("User-Agent", "GooSeek-Scraper/1.0")
|
|
req.Header.Set("Accept", "text/html,application/xhtml+xml")
|
|
|
|
client := &http.Client{Timeout: 10 * time.Second}
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return ScrapeResponse{URL: url, Success: false, Error: err.Error()}
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return ScrapeResponse{URL: url, Success: false, Error: fmt.Sprintf("HTTP %d", resp.StatusCode)}
|
|
}
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return ScrapeResponse{URL: url, Success: false, Error: err.Error()}
|
|
}
|
|
|
|
html := string(body)
|
|
title := extractHTMLTitle(html)
|
|
content := extractTextContent(html)
|
|
|
|
if len(content) < 100 {
|
|
return ScrapeResponse{URL: url, Success: false, Error: "Insufficient content"}
|
|
}
|
|
|
|
return ScrapeResponse{
|
|
URL: url,
|
|
Title: title,
|
|
Content: truncate(content, 15000),
|
|
Success: true,
|
|
}
|
|
}
|
|
|
|
// Precompiled at package scope so handlers never pay a compile cost per
// request. The (?i) flag makes tag-name matching case-insensitive — HTML tag
// names are not case-sensitive, so <TITLE>/<SCRIPT>/<STYLE> must match too
// (extractTextContent already lowercases for its <body> search, showing the
// same intent). (?s) lets .*? span newlines inside script/style blocks.
var (
	titleRegex  = regexp.MustCompile(`(?i)<title[^>]*>([^<]+)</title>`)
	scriptRegex = regexp.MustCompile(`(?is)<script[^>]*>.*?</script>`)
	styleRegex  = regexp.MustCompile(`(?is)<style[^>]*>.*?</style>`)
	tagRegex    = regexp.MustCompile(`<[^>]+>`)
	spaceRegex  = regexp.MustCompile(`\s+`)
)
|
|
|
|
func extractHTMLTitle(html string) string {
|
|
matches := titleRegex.FindStringSubmatch(html)
|
|
if len(matches) > 1 {
|
|
return strings.TrimSpace(matches[1])
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func extractTextContent(html string) string {
|
|
bodyStart := strings.Index(strings.ToLower(html), "<body")
|
|
bodyEnd := strings.Index(strings.ToLower(html), "</body>")
|
|
|
|
if bodyStart != -1 && bodyEnd != -1 && bodyEnd > bodyStart {
|
|
html = html[bodyStart:bodyEnd]
|
|
}
|
|
|
|
html = scriptRegex.ReplaceAllString(html, "")
|
|
html = styleRegex.ReplaceAllString(html, "")
|
|
html = tagRegex.ReplaceAllString(html, " ")
|
|
html = spaceRegex.ReplaceAllString(html, " ")
|
|
|
|
return strings.TrimSpace(html)
|
|
}
|
|
|
|
// extractMarkdown pulls the value of the first "raw_markdown" field out of a
// raw Crawl4AI JSON response without fully decoding it.
//
// Bug fix: the previous quote-seeking stopped one quote too early — after the
// key's closing quote it took the text up to the value's *opening* quote,
// returning the `": "` separator instead of the content, so Crawl4AI results
// were always rejected by the caller's length check. This version skips past
// the key, locates the value's opening quote, scans to the first unescaped
// closing quote, and best-effort unescapes common JSON escapes.
// Returns "" when the field is missing or malformed.
func extractMarkdown(response string) string {
	const key = `"raw_markdown"`
	idx := strings.Index(response, key)
	if idx == -1 {
		return ""
	}
	rest := response[idx+len(key):]
	open := strings.Index(rest, `"`)
	if open == -1 {
		return ""
	}
	rest = rest[open+1:]
	for i := 0; i < len(rest); i++ {
		switch rest[i] {
		case '\\':
			i++ // skip the escaped character so \" does not end the scan
		case '"':
			// Unescape the sequences commonly present in markdown payloads;
			// anything exotic (e.g. \uXXXX) is left as-is rather than dropped.
			return strings.NewReplacer(
				`\"`, `"`,
				`\\`, `\`,
				`\n`, "\n",
				`\r`, "\r",
				`\t`, "\t",
			).Replace(rest[:i])
		}
	}
	return ""
}
|
|
|
|
// extractTitle pulls the value of the first "title" field out of a raw
// Crawl4AI JSON response without fully decoding it.
//
// Bug fix: like extractMarkdown, the old quote-seeking stopped one quote too
// early and returned the `": "` separator between key and value instead of
// the title text. This version skips past the key, finds the value's opening
// quote, scans to the first unescaped closing quote, and unescapes common
// JSON escapes. Returns "" when the field is missing or malformed.
func extractTitle(response string) string {
	const key = `"title"`
	idx := strings.Index(response, key)
	if idx == -1 {
		return ""
	}
	rest := response[idx+len(key):]
	open := strings.Index(rest, `"`)
	if open == -1 {
		return ""
	}
	rest = rest[open+1:]
	for i := 0; i < len(rest); i++ {
		switch rest[i] {
		case '\\':
			i++ // skip the escaped character so \" does not end the scan
		case '"':
			return strings.NewReplacer(
				`\"`, `"`,
				`\\`, `\`,
				`\n`, "\n",
				`\r`, "\r",
				`\t`, "\t",
			).Replace(rest[:i])
		}
	}
	return ""
}
|
|
|
|
// truncate caps s at maxLen bytes, returning s unchanged when it already fits.
// NOTE(review): this is a byte-level cut and may split a multi-byte UTF-8
// rune at the boundary — confirm downstream consumers tolerate that.
func truncate(s string, maxLen int) string {
	if len(s) > maxLen {
		return s[:maxLen]
	}
	return s
}
|
|
|
|
// init seeds a default PORT env var (3021) when none is set, before main
// calls config.Load.
// NOTE(review): package init with environment side effects is hard to reason
// about and its ordering is implicit — consider moving this default into
// config.Load instead. The os.Setenv error is intentionally ignored
// (best-effort default).
func init() {
	if os.Getenv("PORT") == "" {
		os.Setenv("PORT", "3021")
	}
}
|