Major changes: - Add Go backend (backend/) with microservices architecture - Enhanced master-agents-svc: reranker, content-classifier, stealth-crawler, proxy-manager, media-search, fastClassifier, language detection - New web-svc widgets: KnowledgeCard, ProductCard, ProfileCard, VideoCard, UnifiedCard, CardGallery, InlineImageGallery, SourcesPanel, RelatedQuestions - Improved discover-svc with discover-db integration - Docker deployment improvements (Caddyfile, vendor.sh, BUILD.md) - Library-svc: project_id schema migration - Remove deprecated finance-svc and travel-svc - Localization improvements across services Made-with: Cursor
344 lines
7.6 KiB
Go
344 lines
7.6 KiB
Go
package files
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/base64"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"github.com/gooseek/backend/internal/llm"
|
|
"github.com/ledongthuc/pdf"
|
|
)
|
|
|
|
// FileAnalyzer extracts text from uploaded files (PDF, image, plain text)
// and produces LLM-generated summaries and descriptions. Uploaded files are
// persisted under storagePath (see SaveFile / DeleteFile).
type FileAnalyzer struct {
	llmClient   llm.Client // LLM used for summaries (generateSummary) and image description (describeImage)
	storagePath string     // directory where uploaded files are stored; created in NewFileAnalyzer
}
|
|
|
|
// AnalysisResult is the outcome of analyzing one file.
type AnalysisResult struct {
	FileType      string                 `json:"fileType"`      // "pdf", "image", or "text"
	ExtractedText string                 `json:"extractedText"` // raw extracted text (or image description); may be truncated
	Summary       string                 `json:"summary"`       // LLM summary; empty when summarization failed or was skipped
	KeyPoints     []string               `json:"keyPoints"`     // bullet points parsed from the LLM response; nil on failure
	Metadata      map[string]interface{} `json:"metadata"`      // per-type extras, e.g. "numPages" (pdf), "size", "lineCount"
}
|
|
|
|
func NewFileAnalyzer(llmClient llm.Client, storagePath string) *FileAnalyzer {
|
|
if storagePath == "" {
|
|
storagePath = "/tmp/gooseek-files"
|
|
}
|
|
os.MkdirAll(storagePath, 0755)
|
|
|
|
return &FileAnalyzer{
|
|
llmClient: llmClient,
|
|
storagePath: storagePath,
|
|
}
|
|
}
|
|
|
|
func (fa *FileAnalyzer) AnalyzeFile(ctx context.Context, filePath string, fileType string) (*AnalysisResult, error) {
|
|
switch {
|
|
case strings.HasPrefix(fileType, "application/pdf"):
|
|
return fa.analyzePDF(ctx, filePath)
|
|
case strings.HasPrefix(fileType, "image/"):
|
|
return fa.analyzeImage(ctx, filePath, fileType)
|
|
case strings.HasPrefix(fileType, "text/"):
|
|
return fa.analyzeText(ctx, filePath)
|
|
default:
|
|
return nil, fmt.Errorf("unsupported file type: %s", fileType)
|
|
}
|
|
}
|
|
|
|
func (fa *FileAnalyzer) analyzePDF(ctx context.Context, filePath string) (*AnalysisResult, error) {
|
|
text, metadata, err := extractPDFContent(filePath)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to extract PDF content: %w", err)
|
|
}
|
|
|
|
if len(text) > 50000 {
|
|
text = text[:50000] + "\n\n[Content truncated...]"
|
|
}
|
|
|
|
summary, keyPoints, err := fa.generateSummary(ctx, text, "PDF document")
|
|
if err != nil {
|
|
summary = ""
|
|
keyPoints = nil
|
|
}
|
|
|
|
return &AnalysisResult{
|
|
FileType: "pdf",
|
|
ExtractedText: text,
|
|
Summary: summary,
|
|
KeyPoints: keyPoints,
|
|
Metadata: metadata,
|
|
}, nil
|
|
}
|
|
|
|
func extractPDFContent(filePath string) (string, map[string]interface{}, error) {
|
|
f, r, err := pdf.Open(filePath)
|
|
if err != nil {
|
|
return "", nil, err
|
|
}
|
|
defer f.Close()
|
|
|
|
var textBuilder strings.Builder
|
|
numPages := r.NumPage()
|
|
|
|
for i := 1; i <= numPages; i++ {
|
|
p := r.Page(i)
|
|
if p.V.IsNull() {
|
|
continue
|
|
}
|
|
|
|
text, err := p.GetPlainText(nil)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
textBuilder.WriteString(text)
|
|
textBuilder.WriteString("\n\n")
|
|
|
|
if textBuilder.Len() > 100000 {
|
|
break
|
|
}
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"numPages": numPages,
|
|
}
|
|
|
|
return textBuilder.String(), metadata, nil
|
|
}
|
|
|
|
func (fa *FileAnalyzer) analyzeImage(ctx context.Context, filePath string, mimeType string) (*AnalysisResult, error) {
|
|
imageData, err := os.ReadFile(filePath)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to read image: %w", err)
|
|
}
|
|
|
|
base64Image := base64.StdEncoding.EncodeToString(imageData)
|
|
|
|
description, err := fa.describeImage(ctx, base64Image, mimeType)
|
|
if err != nil {
|
|
description = "Image analysis unavailable"
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"size": len(imageData),
|
|
}
|
|
|
|
return &AnalysisResult{
|
|
FileType: "image",
|
|
ExtractedText: description,
|
|
Summary: description,
|
|
KeyPoints: extractKeyPointsFromDescription(description),
|
|
Metadata: metadata,
|
|
}, nil
|
|
}
|
|
|
|
func (fa *FileAnalyzer) describeImage(ctx context.Context, base64Image, mimeType string) (string, error) {
|
|
prompt := `Analyze this image and provide:
|
|
1. A detailed description of what's shown
|
|
2. Any text visible in the image (OCR)
|
|
3. Key elements and their relationships
|
|
4. Any data, charts, or diagrams and their meaning
|
|
|
|
Be thorough but concise.`
|
|
|
|
messages := []llm.Message{
|
|
{
|
|
Role: "user",
|
|
Content: prompt,
|
|
Images: []llm.ImageContent{
|
|
{
|
|
Type: mimeType,
|
|
Data: base64Image,
|
|
IsBase64: true,
|
|
},
|
|
},
|
|
},
|
|
}
|
|
|
|
result, err := fa.llmClient.GenerateText(ctx, llm.StreamRequest{
|
|
Messages: messages,
|
|
})
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
return result, nil
|
|
}
|
|
|
|
func (fa *FileAnalyzer) analyzeText(ctx context.Context, filePath string) (*AnalysisResult, error) {
|
|
content, err := os.ReadFile(filePath)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to read file: %w", err)
|
|
}
|
|
|
|
text := string(content)
|
|
if len(text) > 50000 {
|
|
text = text[:50000] + "\n\n[Content truncated...]"
|
|
}
|
|
|
|
summary, keyPoints, err := fa.generateSummary(ctx, text, "text document")
|
|
if err != nil {
|
|
summary = ""
|
|
keyPoints = nil
|
|
}
|
|
|
|
return &AnalysisResult{
|
|
FileType: "text",
|
|
ExtractedText: text,
|
|
Summary: summary,
|
|
KeyPoints: keyPoints,
|
|
Metadata: map[string]interface{}{
|
|
"size": len(content),
|
|
"lineCount": strings.Count(text, "\n") + 1,
|
|
},
|
|
}, nil
|
|
}
|
|
|
|
func (fa *FileAnalyzer) generateSummary(ctx context.Context, text, docType string) (string, []string, error) {
|
|
if len(text) < 100 {
|
|
return text, nil, nil
|
|
}
|
|
|
|
truncatedText := text
|
|
if len(text) > 15000 {
|
|
truncatedText = text[:15000] + "\n\n[Content truncated for analysis...]"
|
|
}
|
|
|
|
prompt := fmt.Sprintf(`Analyze this %s and provide:
|
|
|
|
1. A concise summary (2-3 paragraphs)
|
|
2. 5-7 key points as bullet points
|
|
|
|
Document content:
|
|
%s
|
|
|
|
Format your response as:
|
|
SUMMARY:
|
|
[your summary here]
|
|
|
|
KEY POINTS:
|
|
- [point 1]
|
|
- [point 2]
|
|
...`, docType, truncatedText)
|
|
|
|
result, err := fa.llmClient.GenerateText(ctx, llm.StreamRequest{
|
|
Messages: []llm.Message{
|
|
{Role: llm.RoleUser, Content: prompt},
|
|
},
|
|
})
|
|
if err != nil {
|
|
return "", nil, err
|
|
}
|
|
|
|
summary, keyPoints := parseSummaryResponse(result)
|
|
return summary, keyPoints, nil
|
|
}
|
|
|
|
// parseSummaryResponse splits an LLM response of the form
// "SUMMARY:\n...\nKEY POINTS:\n- ..." into the summary text and a slice of
// key points. Lines starting with "-", "•", or "*" under KEY POINTS become
// points; other lines are ignored. When no "KEY POINTS:" marker is present,
// the whole response is returned as the summary with nil points.
func parseSummaryResponse(response string) (string, []string) {
	// strings.Cut keeps everything after the first marker, so points are not
	// lost if the model emits "KEY POINTS:" more than once (the previous
	// Split+parts[1] dropped text after a second occurrence).
	summaryPart, keyPointsPart, found := strings.Cut(response, "KEY POINTS:")
	if !found {
		return response, nil
	}

	summary := strings.TrimSpace(strings.TrimPrefix(summaryPart, "SUMMARY:"))

	var keyPoints []string
	for _, line := range strings.Split(keyPointsPart, "\n") {
		line = strings.TrimSpace(line)

		// Trim exactly one bullet marker (the previous chained TrimPrefix
		// could strip several markers from a single line).
		var point string
		switch {
		case strings.HasPrefix(line, "-"):
			point = strings.TrimPrefix(line, "-")
		case strings.HasPrefix(line, "•"):
			point = strings.TrimPrefix(line, "•")
		case strings.HasPrefix(line, "*"):
			point = strings.TrimPrefix(line, "*")
		default:
			continue
		}

		if point = strings.TrimSpace(point); point != "" {
			keyPoints = append(keyPoints, point)
		}
	}

	return summary, keyPoints
}
|
|
|
|
// extractKeyPointsFromDescription derives cheap key points from an image
// description by taking non-trivial sentences (more than 20 bytes after
// trimming) from the first five "."-separated segments.
func extractKeyPointsFromDescription(description string) []string {
	const maxSentences = 5

	var points []string
	for i, s := range strings.Split(description, ".") {
		// Only the first maxSentences segments can qualify; stop scanning
		// instead of iterating the rest of a long description for nothing.
		if i >= maxSentences {
			break
		}
		s = strings.TrimSpace(s)
		if len(s) > 20 {
			points = append(points, s+".")
		}
	}

	return points
}
|
|
|
|
// DetectMimeType guesses a file's MIME type, preferring its (lowercased)
// extension and falling back to net/http content sniffing over the first
// 512 bytes for unknown extensions.
func DetectMimeType(filename string, content []byte) string {
	byExt := map[string]string{
		".pdf":  "application/pdf",
		".png":  "image/png",
		".jpg":  "image/jpeg",
		".jpeg": "image/jpeg",
		".gif":  "image/gif",
		".webp": "image/webp",
		".txt":  "text/plain",
		".md":   "text/markdown",
		".csv":  "text/csv",
		".json": "application/json",
	}

	if mt, ok := byExt[strings.ToLower(filepath.Ext(filename))]; ok {
		return mt
	}

	// http.DetectContentType only considers the first 512 bytes anyway.
	sniff := content
	if len(sniff) > 512 {
		sniff = sniff[:512]
	}
	return http.DetectContentType(sniff)
}
|
|
|
|
// min returns the smaller of two ints.
//
// NOTE(review): Go 1.21+ provides a built-in min; this helper shadows it and
// could be removed once all callers rely on the built-in.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
|
|
|
|
func (fa *FileAnalyzer) SaveFile(filename string, content io.Reader) (string, int64, error) {
|
|
safeName := filepath.Base(filename)
|
|
destPath := filepath.Join(fa.storagePath, safeName)
|
|
|
|
file, err := os.Create(destPath)
|
|
if err != nil {
|
|
return "", 0, err
|
|
}
|
|
defer file.Close()
|
|
|
|
var buf bytes.Buffer
|
|
size, err := io.Copy(io.MultiWriter(file, &buf), content)
|
|
if err != nil {
|
|
return "", 0, err
|
|
}
|
|
|
|
return destPath, size, nil
|
|
}
|
|
|
|
func (fa *FileAnalyzer) DeleteFile(filePath string) error {
|
|
if !strings.HasPrefix(filePath, fa.storagePath) {
|
|
return fmt.Errorf("invalid file path")
|
|
}
|
|
return os.Remove(filePath)
|
|
}
|