feat: Go backend, enhanced search, new widgets, Docker deploy
Major changes:

- Add Go backend (backend/) with microservices architecture
- Enhanced master-agents-svc: reranker, content-classifier, stealth-crawler, proxy-manager, media-search, fastClassifier, language detection
- New web-svc widgets: KnowledgeCard, ProductCard, ProfileCard, VideoCard, UnifiedCard, CardGallery, InlineImageGallery, SourcesPanel, RelatedQuestions
- Improved discover-svc with discover-db integration
- Docker deployment improvements (Caddyfile, vendor.sh, BUILD.md)
- Library-svc: project_id schema migration
- Removed deprecated finance-svc and travel-svc
- Localization improvements across services

Made-with: Cursor
This commit is contained in:
284
backend/cmd/scraper-svc/main.go
Normal file
284
backend/cmd/scraper-svc/main.go
Normal file
@@ -0,0 +1,284 @@
|
||||
package main
|
||||
|
||||
import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"strings"
	"time"
	"unicode/utf8"

	"github.com/gofiber/fiber/v2"
	"github.com/gofiber/fiber/v2/middleware/cors"
	"github.com/gofiber/fiber/v2/middleware/logger"
	"github.com/gooseek/backend/pkg/config"
)
|
||||
|
||||
// ScrapeRequest is the JSON body accepted by POST /api/v1/scrape.
type ScrapeRequest struct {
	URL string `json:"url"` // URL of the page to fetch; must be non-empty
}
|
||||
|
||||
// ScrapeResponse is the JSON result returned for a single scraped URL.
// On success, Content holds up to 15000 bytes of extracted text/markdown;
// on failure, Success is false and Error carries a short description.
type ScrapeResponse struct {
	URL     string `json:"url"`             // the URL that was scraped
	Title   string `json:"title"`           // page title, empty when none was found
	Content string `json:"content"`         // extracted text, truncated to 15000 bytes
	Success bool   `json:"success"`         // whether extraction yielded usable content
	Error   string `json:"error,omitempty"` // failure reason, omitted on success
}
|
||||
|
||||
// main wires up and runs the scraper HTTP service: a health probe, a
// single-URL scrape endpoint, and a batch endpoint that scrapes up to
// ten URLs concurrently.
func main() {
	cfg, err := config.Load()
	if err != nil {
		log.Fatal("Failed to load config:", err)
	}

	app := fiber.New(fiber.Config{
		BodyLimit:    10 * 1024 * 1024, // 10 MiB request-body cap
		ReadTimeout:  30 * time.Second,
		WriteTimeout: 30 * time.Second,
		IdleTimeout:  60 * time.Second,
	})

	// Middleware must be registered before the routes it should apply to.
	app.Use(logger.New())
	app.Use(cors.New())

	// Liveness probe.
	app.Get("/health", func(c *fiber.Ctx) error {
		return c.JSON(fiber.Map{"status": "ok"})
	})

	// Scrape a single URL, bounded by cfg.ScrapeTimeout.
	app.Post("/api/v1/scrape", func(c *fiber.Ctx) error {
		var req ScrapeRequest
		if err := c.BodyParser(&req); err != nil {
			return c.Status(400).JSON(fiber.Map{"error": "Invalid request body"})
		}

		if req.URL == "" {
			return c.Status(400).JSON(fiber.Map{"error": "URL is required"})
		}

		ctx, cancel := context.WithTimeout(context.Background(), cfg.ScrapeTimeout)
		defer cancel()

		result := scrapeURL(ctx, req.URL, cfg)
		return c.JSON(result)
	})

	// Scrape several URLs at once. Extra URLs past the cap of 10 are
	// silently dropped; results come back in request order.
	app.Post("/api/v1/scrape/batch", func(c *fiber.Ctx) error {
		var req struct {
			URLs []string `json:"urls"`
		}
		if err := c.BodyParser(&req); err != nil {
			return c.Status(400).JSON(fiber.Map{"error": "Invalid request body"})
		}

		if len(req.URLs) == 0 {
			return c.Status(400).JSON(fiber.Map{"error": "URLs are required"})
		}

		if len(req.URLs) > 10 {
			req.URLs = req.URLs[:10] // hard cap to bound fan-out
		}

		// One shared deadline for the whole batch: 3x the single timeout.
		ctx, cancel := context.WithTimeout(context.Background(), cfg.ScrapeTimeout*3)
		defer cancel()

		results := make([]ScrapeResponse, len(req.URLs))
		// Buffered to len(req.URLs) so every send completes and no
		// goroutine leaks even if the handler returned early.
		resultCh := make(chan struct {
			index  int
			result ScrapeResponse
		}, len(req.URLs))

		// One goroutine per URL; idx/u passed as arguments to avoid the
		// shared-loop-variable capture pitfall (pre-Go 1.22).
		for i, url := range req.URLs {
			go func(idx int, u string) {
				resultCh <- struct {
					index  int
					result ScrapeResponse
				}{idx, scrapeURL(ctx, u, cfg)}
			}(i, url)
		}

		// Collect exactly one result per URL, placed by original index.
		for range req.URLs {
			r := <-resultCh
			results[r.index] = r.result
		}

		return c.JSON(fiber.Map{"results": results})
	})

	port := cfg.ScraperSvcPort
	log.Printf("scraper-svc listening on :%d", port)
	log.Fatal(app.Listen(fmt.Sprintf(":%d", port)))
}
|
||||
|
||||
func scrapeURL(ctx context.Context, url string, cfg *config.Config) ScrapeResponse {
|
||||
if cfg.Crawl4AIURL != "" {
|
||||
result, err := scrapeWithCrawl4AI(ctx, url, cfg.Crawl4AIURL)
|
||||
if err == nil && result.Success {
|
||||
return *result
|
||||
}
|
||||
}
|
||||
|
||||
return scrapeDirectly(ctx, url)
|
||||
}
|
||||
|
||||
func scrapeWithCrawl4AI(ctx context.Context, url, crawl4aiURL string) (*ScrapeResponse, error) {
|
||||
reqBody := fmt.Sprintf(`{
|
||||
"urls": ["%s"],
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"cache_mode": "default",
|
||||
"page_timeout": 20000
|
||||
}
|
||||
}
|
||||
}`, url)
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "POST", crawl4aiURL+"/crawl", strings.NewReader(reqBody))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
client := &http.Client{Timeout: 25 * time.Second}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("Crawl4AI returned status %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
markdown := extractMarkdown(string(body))
|
||||
title := extractTitle(string(body))
|
||||
|
||||
if len(markdown) > 100 {
|
||||
return &ScrapeResponse{
|
||||
URL: url,
|
||||
Title: title,
|
||||
Content: truncate(markdown, 15000),
|
||||
Success: true,
|
||||
}, nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("insufficient content from Crawl4AI")
|
||||
}
|
||||
|
||||
func scrapeDirectly(ctx context.Context, url string) ScrapeResponse {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return ScrapeResponse{URL: url, Success: false, Error: err.Error()}
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", "GooSeek-Scraper/1.0")
|
||||
req.Header.Set("Accept", "text/html,application/xhtml+xml")
|
||||
|
||||
client := &http.Client{Timeout: 10 * time.Second}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return ScrapeResponse{URL: url, Success: false, Error: err.Error()}
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return ScrapeResponse{URL: url, Success: false, Error: fmt.Sprintf("HTTP %d", resp.StatusCode)}
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return ScrapeResponse{URL: url, Success: false, Error: err.Error()}
|
||||
}
|
||||
|
||||
html := string(body)
|
||||
title := extractHTMLTitle(html)
|
||||
content := extractTextContent(html)
|
||||
|
||||
if len(content) < 100 {
|
||||
return ScrapeResponse{URL: url, Success: false, Error: "Insufficient content"}
|
||||
}
|
||||
|
||||
return ScrapeResponse{
|
||||
URL: url,
|
||||
Title: title,
|
||||
Content: truncate(content, 15000),
|
||||
Success: true,
|
||||
}
|
||||
}
|
||||
|
||||
// Pre-compiled patterns for the lightweight HTML text extraction below.
var (
	reTitle  = regexp.MustCompile(`<title[^>]*>([^<]+)</title>`)
	reScript = regexp.MustCompile(`(?s)<script[^>]*>.*?</script>`)
	reStyle  = regexp.MustCompile(`(?s)<style[^>]*>.*?</style>`)
	reTag    = regexp.MustCompile(`<[^>]+>`)
	reSpace  = regexp.MustCompile(`\s+`)
)

// extractHTMLTitle returns the trimmed contents of the first <title> element,
// or "" when the document has no title.
func extractHTMLTitle(html string) string {
	if m := reTitle.FindStringSubmatch(html); len(m) > 1 {
		return strings.TrimSpace(m[1])
	}
	return ""
}

// extractTextContent strips scripts, styles, and markup from an HTML document
// and returns the remaining visible text collapsed onto single spaces.
func extractTextContent(html string) string {
	// Narrow to the <body> element when both delimiters are present.
	lower := strings.ToLower(html)
	bodyOpen := strings.Index(lower, "<body")
	bodyEnd := strings.Index(lower, "</body>")
	if bodyOpen != -1 && bodyEnd != -1 && bodyEnd > bodyOpen {
		html = html[bodyOpen:bodyEnd]
	}

	stripped := reScript.ReplaceAllString(html, "")
	stripped = reStyle.ReplaceAllString(stripped, "")
	stripped = reTag.ReplaceAllString(stripped, " ")
	stripped = reSpace.ReplaceAllString(stripped, " ")

	return strings.TrimSpace(stripped)
}
|
||||
|
||||
// extractMarkdown pulls the string value of the first "raw_markdown" key out
// of a Crawl4AI JSON response without decoding the whole document. It returns
// "" when the key is absent or its value is not a well-formed JSON string.
//
// The previous index arithmetic skipped only past the key's closing quote and
// returned the ":" separator instead of the value, so Crawl4AI content was
// always discarded; it also broke on escaped quotes inside the markdown.
func extractMarkdown(response string) string {
	const key = `"raw_markdown"`
	idx := strings.Index(response, key)
	if idx == -1 {
		return ""
	}

	// Step over the key, optional whitespace, and the ':' separator.
	rest := strings.TrimLeft(response[idx+len(key):], " \t\r\n")
	if !strings.HasPrefix(rest, ":") {
		return ""
	}
	rest = strings.TrimLeft(rest[1:], " \t\r\n")
	if !strings.HasPrefix(rest, `"`) {
		return "" // value is not a string (null, object, ...)
	}

	// Scan for the closing quote, honouring backslash escapes, then let
	// encoding/json decode the quoted token (handles \", \\, \uXXXX, ...).
	for i := 1; i < len(rest); i++ {
		switch rest[i] {
		case '\\':
			i++ // skip the escaped character
		case '"':
			var s string
			if err := json.Unmarshal([]byte(rest[:i+1]), &s); err == nil {
				return s
			}
			return ""
		}
	}
	return ""
}
|
||||
|
||||
// extractTitle pulls the string value of the first "title" key out of a
// Crawl4AI JSON response without decoding the whole document. It returns ""
// when the key is absent or its value is not a well-formed JSON string.
//
// The previous index arithmetic skipped only past the key's closing quote and
// returned the ":" separator instead of the value; it also broke on escaped
// quotes inside the title.
func extractTitle(response string) string {
	const key = `"title"`
	idx := strings.Index(response, key)
	if idx == -1 {
		return ""
	}

	// Step over the key, optional whitespace, and the ':' separator.
	rest := strings.TrimLeft(response[idx+len(key):], " \t\r\n")
	if !strings.HasPrefix(rest, ":") {
		return ""
	}
	rest = strings.TrimLeft(rest[1:], " \t\r\n")
	if !strings.HasPrefix(rest, `"`) {
		return "" // value is not a string (null, object, ...)
	}

	// Scan for the closing quote, honouring backslash escapes, then let
	// encoding/json decode the quoted token (handles \", \\, \uXXXX, ...).
	for i := 1; i < len(rest); i++ {
		switch rest[i] {
		case '\\':
			i++ // skip the escaped character
		case '"':
			var s string
			if err := json.Unmarshal([]byte(rest[:i+1]), &s); err == nil {
				return s
			}
			return ""
		}
	}
	return ""
}
|
||||
|
||||
// truncate returns s limited to at most maxLen bytes, cutting back to the
// nearest UTF-8 rune boundary so the result is never invalid UTF-8 (the
// previous raw byte slice could split a multi-byte rune). A non-positive
// maxLen yields "".
func truncate(s string, maxLen int) string {
	if maxLen <= 0 {
		return ""
	}
	if len(s) <= maxLen {
		return s
	}
	cut := maxLen
	// Back up over continuation bytes until the cut lands on a rune start.
	for cut > 0 && !utf8.RuneStart(s[cut]) {
		cut--
	}
	return s[:cut]
}
|
||||
|
||||
// init seeds a default PORT for environments that do not set one, so the
// service can start with zero configuration.
func init() {
	const defaultPort = "3021"
	if os.Getenv("PORT") == "" {
		// Best effort: a Setenv failure here is not actionable at startup.
		_ = os.Setenv("PORT", defaultPort)
	}
}
|
||||
Reference in New Issue
Block a user