Files
home 06fe57c765 feat: Go backend, enhanced search, new widgets, Docker deploy
Major changes:
- Add Go backend (backend/) with microservices architecture
- Enhanced master-agents-svc: reranker, content-classifier, stealth-crawler,
  proxy-manager, media-search, fastClassifier, language detection
- New web-svc widgets: KnowledgeCard, ProductCard, ProfileCard, VideoCard,
  UnifiedCard, CardGallery, InlineImageGallery, SourcesPanel, RelatedQuestions
- Improved discover-svc with discover-db integration
- Docker deployment improvements (Caddyfile, vendor.sh, BUILD.md)
- Library-svc: project_id schema migration
- Remove deprecated finance-svc and travel-svc
- Localization improvements across services

Made-with: Cursor
2026-02-27 04:15:32 +03:00

285 lines
6.7 KiB
Go

package main
import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"regexp"
	"strings"
	"time"
	"unicode/utf8"

	"github.com/gofiber/fiber/v2"
	"github.com/gofiber/fiber/v2/middleware/cors"
	"github.com/gofiber/fiber/v2/middleware/logger"
	"github.com/gooseek/backend/pkg/config"
)
// ScrapeRequest is the JSON body accepted by POST /api/v1/scrape.
type ScrapeRequest struct {
	URL string `json:"url"` // page to fetch; required, rejected with 400 when empty
}
// ScrapeResponse is the per-URL result returned by both scrape endpoints.
type ScrapeResponse struct {
	URL     string `json:"url"`             // the URL that was scraped
	Title   string `json:"title"`           // extracted page title; "" when none was found
	Content string `json:"content"`         // extracted markdown/plain text, capped at 15000 bytes
	Success bool   `json:"success"`         // true when usable content was extracted
	Error   string `json:"error,omitempty"` // failure reason, set only when Success is false
}
// main wires up and runs the scraper service: it loads configuration, builds
// the Fiber app with request logging and CORS middleware, registers the
// health and scrape routes, and serves on the configured port until a fatal
// error occurs.
func main() {
	cfg, err := config.Load()
	if err != nil {
		log.Fatal("Failed to load config:", err)
	}

	app := fiber.New(fiber.Config{
		BodyLimit:    10 * 1024 * 1024, // 10 MiB request body cap
		ReadTimeout:  30 * time.Second,
		WriteTimeout: 30 * time.Second,
		IdleTimeout:  60 * time.Second,
	})
	app.Use(logger.New())
	app.Use(cors.New())

	// Liveness probe for deployment health checks.
	app.Get("/health", func(c *fiber.Ctx) error {
		return c.JSON(fiber.Map{"status": "ok"})
	})
	app.Post("/api/v1/scrape", handleScrape(cfg))
	app.Post("/api/v1/scrape/batch", handleScrapeBatch(cfg))

	port := cfg.ScraperSvcPort
	log.Printf("scraper-svc listening on :%d", port)
	log.Fatal(app.Listen(fmt.Sprintf(":%d", port)))
}

// handleScrape builds the POST /api/v1/scrape handler: it validates the JSON
// body and scrapes a single URL within cfg.ScrapeTimeout.
func handleScrape(cfg *config.Config) fiber.Handler {
	return func(c *fiber.Ctx) error {
		var req ScrapeRequest
		if err := c.BodyParser(&req); err != nil {
			return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{"error": "Invalid request body"})
		}
		if req.URL == "" {
			return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{"error": "URL is required"})
		}
		ctx, cancel := context.WithTimeout(context.Background(), cfg.ScrapeTimeout)
		defer cancel()
		return c.JSON(scrapeURL(ctx, req.URL, cfg))
	}
}

// handleScrapeBatch builds the POST /api/v1/scrape/batch handler: it scrapes
// up to maxBatchURLs URLs concurrently and returns the results in request
// order.
func handleScrapeBatch(cfg *config.Config) fiber.Handler {
	// indexedResult pairs a scrape result with its position in the request
	// slice so concurrent completions can be reassembled in order.
	type indexedResult struct {
		index  int
		result ScrapeResponse
	}
	const maxBatchURLs = 10

	return func(c *fiber.Ctx) error {
		var req struct {
			URLs []string `json:"urls"`
		}
		if err := c.BodyParser(&req); err != nil {
			return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{"error": "Invalid request body"})
		}
		if len(req.URLs) == 0 {
			return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{"error": "URLs are required"})
		}
		// Cap the fan-out; URLs beyond the cap are silently dropped.
		// NOTE(review): callers get no indication of the truncation — confirm
		// clients tolerate this before changing it to a 400.
		if len(req.URLs) > maxBatchURLs {
			req.URLs = req.URLs[:maxBatchURLs]
		}

		// One shared deadline for the whole batch: 3x the single-URL timeout.
		ctx, cancel := context.WithTimeout(context.Background(), cfg.ScrapeTimeout*3)
		defer cancel()

		// Fan out one goroutine per URL. The channel is buffered to the batch
		// size so every sender can complete even if the receiver lags.
		resultCh := make(chan indexedResult, len(req.URLs))
		for i, u := range req.URLs {
			go func(idx int, target string) {
				resultCh <- indexedResult{index: idx, result: scrapeURL(ctx, target, cfg)}
			}(i, u)
		}

		// Collect exactly one result per URL, slotting each into its original
		// position.
		results := make([]ScrapeResponse, len(req.URLs))
		for range req.URLs {
			r := <-resultCh
			results[r.index] = r.result
		}
		return c.JSON(fiber.Map{"results": results})
	}
}
// scrapeURL fetches a page, preferring the Crawl4AI service when one is
// configured and falling back to a direct HTTP fetch when Crawl4AI is
// unconfigured, errors out, or reports an unsuccessful result.
func scrapeURL(ctx context.Context, url string, cfg *config.Config) ScrapeResponse {
	if cfg.Crawl4AIURL == "" {
		return scrapeDirectly(ctx, url)
	}
	if result, err := scrapeWithCrawl4AI(ctx, url, cfg.Crawl4AIURL); err == nil && result.Success {
		return *result
	}
	return scrapeDirectly(ctx, url)
}
// scrapeWithCrawl4AI asks a Crawl4AI instance to render and extract the page
// at url, returning a successful ScrapeResponse only when the service yields
// more than 100 bytes of markdown. Any transport failure, non-200 status, or
// insufficient content is returned as an error so the caller can fall back
// to a direct fetch.
func scrapeWithCrawl4AI(ctx context.Context, url, crawl4aiURL string) (*ScrapeResponse, error) {
	// Marshal the payload instead of fmt.Sprintf-ing it: a URL containing
	// quotes, backslashes, or control characters would otherwise corrupt the
	// JSON body (or inject extra fields into the request).
	payload, err := json.Marshal(map[string]interface{}{
		"urls": []string{url},
		"crawler_config": map[string]interface{}{
			"type": "CrawlerRunConfig",
			"params": map[string]interface{}{
				"cache_mode":   "default",
				"page_timeout": 20000, // ms; Crawl4AI-side render budget
			},
		},
	})
	if err != nil {
		return nil, err
	}
	req, err := http.NewRequestWithContext(ctx, "POST", crawl4aiURL+"/crawl", strings.NewReader(string(payload)))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", "application/json")
	// Client timeout slightly above the 20s page_timeout; ctx may cancel
	// earlier.
	client := &http.Client{Timeout: 25 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("Crawl4AI returned status %d", resp.StatusCode)
	}
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	raw := string(body)
	markdown := extractMarkdown(raw)
	if len(markdown) <= 100 {
		return nil, fmt.Errorf("insufficient content from Crawl4AI")
	}
	return &ScrapeResponse{
		URL:     url,
		Title:   extractTitle(raw),
		Content: truncate(markdown, 15000),
		Success: true,
	}, nil
}
// scrapeDirectly fetches the URL itself and strips the returned HTML down to
// plain text. Every failure mode — request construction, transport error,
// non-200 status, unreadable body, or near-empty content — is reported in
// the response's Error field rather than as a Go error.
func scrapeDirectly(ctx context.Context, url string) ScrapeResponse {
	fail := func(msg string) ScrapeResponse {
		return ScrapeResponse{URL: url, Success: false, Error: msg}
	}

	req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
	if err != nil {
		return fail(err.Error())
	}
	req.Header.Set("User-Agent", "GooSeek-Scraper/1.0")
	req.Header.Set("Accept", "text/html,application/xhtml+xml")

	resp, err := (&http.Client{Timeout: 10 * time.Second}).Do(req)
	if err != nil {
		return fail(err.Error())
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fail(fmt.Sprintf("HTTP %d", resp.StatusCode))
	}

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return fail(err.Error())
	}
	page := string(raw)
	text := extractTextContent(page)
	if len(text) < 100 {
		return fail("Insufficient content")
	}
	return ScrapeResponse{
		URL:     url,
		Title:   extractHTMLTitle(page),
		Content: truncate(text, 15000),
		Success: true,
	}
}
// HTML-processing regexes, compiled once at package scope rather than per
// request.
var (
	// titleRegex captures the text of the first <title> element.
	titleRegex = regexp.MustCompile(`<title[^>]*>([^<]+)</title>`)
	// scriptRegex and styleRegex match whole <script>/<style> blocks
	// including their contents; (?s) lets . span newlines.
	scriptRegex = regexp.MustCompile(`(?s)<script[^>]*>.*?</script>`)
	styleRegex  = regexp.MustCompile(`(?s)<style[^>]*>.*?</style>`)
	// tagRegex matches any remaining HTML tag so it can be replaced by a space.
	tagRegex = regexp.MustCompile(`<[^>]+>`)
	// spaceRegex matches runs of whitespace for collapsing to single spaces.
	spaceRegex = regexp.MustCompile(`\s+`)
)
// extractHTMLTitle pulls the text of the first <title> element out of an
// HTML document, trimmed of surrounding whitespace. It returns "" when no
// title element is present.
func extractHTMLTitle(html string) string {
	m := titleRegex.FindStringSubmatch(html)
	if len(m) < 2 {
		return ""
	}
	return strings.TrimSpace(m[1])
}
// extractTextContent reduces an HTML document to whitespace-normalized plain
// text: it narrows to the <body> element when one can be located, removes
// <script> and <style> blocks, drops all remaining tags, and collapses runs
// of whitespace into single spaces.
func extractTextContent(html string) string {
	lower := strings.ToLower(html)
	bodyOpen := strings.Index(lower, "<body")
	bodyClose := strings.Index(lower, "</body>")
	if bodyOpen != -1 && bodyClose > bodyOpen {
		html = html[bodyOpen:bodyClose]
	}
	// Strip executable/style content first so its text doesn't leak into the
	// extracted body text.
	for _, re := range []*regexp.Regexp{scriptRegex, styleRegex} {
		html = re.ReplaceAllString(html, "")
	}
	html = tagRegex.ReplaceAllString(html, " ")
	html = spaceRegex.ReplaceAllString(html, " ")
	return strings.TrimSpace(html)
}
// extractMarkdown returns the first "raw_markdown" string value found
// anywhere in the Crawl4AI JSON response, or "" when the response is not
// valid JSON or contains no such value.
//
// The previous hand-rolled quote scanner was broken: after locating the key
// it skipped only the key's own opening and closing quotes and then captured
// up to the *opening* quote of the value, returning the ":" separator
// instead of the value — and it could not handle JSON escape sequences.
// Decoding the document properly fixes both problems.
func extractMarkdown(response string) string {
	var doc interface{}
	if err := json.Unmarshal([]byte(response), &doc); err != nil {
		return ""
	}
	if s, ok := findJSONString(doc, "raw_markdown"); ok {
		return s
	}
	return ""
}

// extractTitle returns the first "title" string value found anywhere in the
// Crawl4AI JSON response, or "" when the response is not valid JSON or
// contains no such value. (Same bug fix as extractMarkdown.)
func extractTitle(response string) string {
	var doc interface{}
	if err := json.Unmarshal([]byte(response), &doc); err != nil {
		return ""
	}
	if s, ok := findJSONString(doc, "title"); ok {
		return s
	}
	return ""
}

// findJSONString depth-first searches a decoded JSON document for a string
// value stored under key at any nesting level, checking each object's own
// key before descending into its children.
func findJSONString(v interface{}, key string) (string, bool) {
	switch node := v.(type) {
	case map[string]interface{}:
		if s, ok := node[key].(string); ok {
			return s, true
		}
		for _, child := range node {
			if s, ok := findJSONString(child, key); ok {
				return s, true
			}
		}
	case []interface{}:
		for _, child := range node {
			if s, ok := findJSONString(child, key); ok {
				return s, true
			}
		}
	}
	return "", false
}
// truncate caps s at maxLen bytes without splitting a multi-byte UTF-8
// sequence: when the byte cut would land inside a rune, it backs up to the
// previous rune boundary so the result stays valid UTF-8. (The previous
// plain s[:maxLen] could emit a mangled partial character at the boundary,
// which then round-trips badly through the JSON response.)
func truncate(s string, maxLen int) string {
	if len(s) <= maxLen {
		return s
	}
	cut := maxLen
	// Step back over UTF-8 continuation bytes until we sit on a rune start.
	for cut > 0 && !utf8.RuneStart(s[cut]) {
		cut--
	}
	return s[:cut]
}
// init seeds a default PORT so the service can start in environments (e.g.
// local development) where no port was configured explicitly.
func init() {
	if current := os.Getenv("PORT"); current == "" {
		os.Setenv("PORT", "3021")
	}
}