feat：增加多模态ai测试接口

2026-01-15 14:13:00 +08:00 · 2026-01-15 14:13:00 +08:00 · 222b294101
commit 222b294101
parent 9170c77e32
5 changed files with 200 additions and 0 deletions
--- a/pkg/common/qwen/qwen_vl.go
+++ b/pkg/common/qwen/qwen_vl.go
@ -0,0 +1,89 @@
+package qwen
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	modelQwen "fonchain-fiee/pkg/model/qwen"
+	"fonchain-fiee/pkg/utils"
+
+	"go.uber.org/zap"
+)
+
+// VL calls the Qwen (DashScope) vision-language chat-completions endpoint with
+// any mix of videos, images and a text prompt, all sent as a single "user"
+// message.
+//
+// Parameters:
+//   - videoURLs: video URLs; empty strings are skipped. Every video part is sent
+//     with a fixed fps of 2 (callers cannot currently override it).
+//   - imageURLs: image URLs; empty strings are skipped.
+//   - text: optional prompt; left out of the message when empty.
+//   - model: model name; falls back to "qwen3-vl-plus" when empty.
+//
+// It returns the decoded response, or a non-nil error when there is nothing to
+// send, the request cannot be serialized, the HTTP call fails, or the response
+// body cannot be decoded into modelQwen.VLResponse.
+func VL(videoURLs []string, imageURLs []string, text string, model string) (resp *modelQwen.VLResponse, err error) {
+	// fps value attached to every video part.
+	const defaultVideoFPS = 2
+
+	if model == "" {
+		model = "qwen3-vl-plus"
+	}
+
+	// Build the message parts in a fixed order: videos, images, then text.
+	content := make([]modelQwen.VLContent, 0, len(videoURLs)+len(imageURLs)+1)
+
+	for _, videoURL := range videoURLs {
+		if videoURL == "" {
+			continue // an empty URL carries no media to analyse
+		}
+		content = append(content, modelQwen.VLContent{
+			Type:     "video_url",
+			VideoURL: &modelQwen.VideoURL{URL: videoURL},
+			FPS:      defaultVideoFPS,
+		})
+	}
+
+	for _, imageURL := range imageURLs {
+		if imageURL == "" {
+			continue // an empty URL carries no media to analyse
+		}
+		content = append(content, modelQwen.VLContent{
+			Type:     "image_url",
+			ImageURL: &modelQwen.ImageURL{URL: imageURL},
+		})
+	}
+
+	if text != "" {
+		content = append(content, modelQwen.VLContent{
+			Type: "text",
+			Text: text,
+		})
+	}
+
+	// Do not spend a network round trip on a message with no parts at all.
+	if len(content) == 0 {
+		return nil, errors.New("至少需要提供一个视频、图片或文本")
+	}
+
+	req := modelQwen.VLRequest{
+		Model: model,
+		Messages: []modelQwen.VLMessage{
+			{
+				Role:    "user",
+				Content: content,
+			},
+		},
+	}
+
+	jsonData, err := json.Marshal(req)
+	if err != nil {
+		// The detailed cause goes to the log; the caller gets a generic message.
+		zap.L().Error("VL Marshal failed", zap.Error(err))
+		return nil, errors.New("序列化请求失败")
+	}
+
+	// NOTE(review): the bearer token is the package-level constant
+	// modelQwen.DashscopeAPIKey — confirm that a credential hard-coded in source
+	// is intended rather than loaded from configuration.
+	body, err := utils.PostBytes(modelQwen.DashscopeVLURL, map[string]interface{}{
+		"Authorization": "Bearer " + modelQwen.DashscopeAPIKey,
+		"Content-Type":  "application/json",
+	}, jsonData)
+	if err != nil {
+		zap.L().Error("VL Post failed", zap.Error(err))
+		return nil, errors.New("请求视觉AI失败")
+	}
+
+	// NOTE(review): whether utils.PostBytes turns non-2xx statuses into errors is
+	// not visible here; if it does not, an API error body may decode into a
+	// response with no choices — verify against utils.PostBytes.
+	var result modelQwen.VLResponse
+	if err = json.Unmarshal(body, &result); err != nil {
+		zap.L().Error("VL Unmarshal failed", zap.Error(err), zap.String("body", string(body)))
+		// %w keeps the decode error inspectable with errors.Is / errors.As.
+		return nil, fmt.Errorf("解析响应失败: %w", err)
+	}
+
+	return &result, nil
+}
--- a/pkg/model/qwen/image.go
+++ b/pkg/model/qwen/image.go
@ -4,6 +4,7 @@ const (
 	DashscopeAPIKey        string = "sk-5ae9df5d3bcf4755ad5d12012058a2e7"
 	DashscopeText2ImageURL string = "https://dashscope.aliyuncs.com/api/v1/services/aigc/text2image/image-synthesis"
 	DashscopeEditImageURL  string = "https://dashscope.aliyuncs.com/api/v1/services/aigc/image2image/image-synthesis"
+	DashscopeVLURL         string = "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions"
 )

 // QwenImageRequest 通义千问文生图请求
--- a/pkg/model/qwen/qwen_vl.go
+++ b/pkg/model/qwen/qwen_vl.go
@ -0,0 +1,47 @@
+package qwen
+
+// VLContent is one part of a multimodal message: text, an image, or a video.
+// Type says which of the optional fields applies; the remaining fields are
+// tagged omitempty, so they are left out of the JSON when at their zero value.
+type VLContent struct {
+	Type     string    `json:"type"`                // text, image_url, video_url
+	Text     string    `json:"text,omitempty"`      // used when type=text
+	ImageURL *ImageURL `json:"image_url,omitempty"` // used when type=image_url
+	VideoURL *VideoURL `json:"video_url,omitempty"` // used when type=video_url
+	FPS      int       `json:"fps,omitempty"`       // optional when type=video_url; video frame rate
+}
+
+// VideoURL wraps the URL of a video input; it is serialized as {"url": ...}.
+type VideoURL struct {
+	URL string `json:"url"`
+}
+
+// VLRequest is the JSON request body for the vision-language endpoint.
+// Seed and EnableSearch are tagged omitempty, so a zero seed and a false
+// enable_search are not sent at all.
+type VLRequest struct {
+	Model        string      `json:"model"`                   // model name, e.g. qwen3-vl-plus
+	Messages     []VLMessage `json:"messages"`                // list of messages
+	Seed         int64       `json:"seed,omitempty"`          // random seed
+	EnableSearch bool        `json:"enable_search,omitempty"` // whether to enable search
+}
+
+// VLMessage is a single message in a vision-language request: a role plus an
+// ordered list of content parts.
+type VLMessage struct {
+	Role    string      `json:"role"`    // user, assistant, system
+	Content []VLContent `json:"content"` // content parts; may mix text, images and videos
+}
+
+// VLResponse is the decoded response of the vision-language endpoint.
+// Only the fields declared here are captured; any other keys in the response
+// JSON are ignored by encoding/json.
+type VLResponse struct {
+	Choices []VLChoice `json:"choices"`
+	Model   string     `json:"model,omitempty"`
+	ID      string     `json:"id,omitempty"`
+}
+
+// VLChoice is one entry of VLResponse.Choices: the returned message together
+// with its finish reason and index.
+type VLChoice struct {
+	// Message holds the "message" object of the choice.
+	Message struct {
+		Content          string `json:"content"`
+		ReasoningContent string `json:"reasoning_content"`
+		Role             string `json:"role"`
+	} `json:"message"`
+	FinishReason string `json:"finish_reason"`
+	Index        int    `json:"index,omitempty"`
+}
--- a/pkg/router/media.go
+++ b/pkg/router/media.go
@ -86,6 +86,7 @@ func MediaRouter(r *gin.RouterGroup) {
 	{
 		aiNoAuth.POST("image-generate", serviceAI.AIImageGenerate)
 		aiNoAuth.POST("text-generate", serviceAI.AIChat)
+		aiNoAuth.POST("video-vl", serviceAI.AIVideoVL)
 	}
 	aiAuth := auth.Group("ai")
 	{
--- a/pkg/service/ai/video_vl.go
+++ b/pkg/service/ai/video_vl.go
@ -0,0 +1,62 @@
+package ai
+
+import (
+	"errors"
+	"fonchain-fiee/pkg/common/qwen"
+	"fonchain-fiee/pkg/service"
+	"fonchain-fiee/pkg/utils"
+
+	"github.com/gin-gonic/gin"
+)
+
+// VideoVLRequest is the JSON body accepted by the video/image understanding
+// endpoint.
+type VideoVLRequest struct {
+	Videos []string `json:"videos"` // list of video URLs
+	Images []string `json:"images"` // list of image URLs
+	Text   string   `json:"text"`   // optional text prompt
+	Model  string   `json:"model"`  // optional model name; defaults to qwen3-vl-plus
+}
+
+// AIVideoVL is the gin handler for AI video/image understanding.
+//
+// It binds a JSON VideoVLRequest, discards empty URL entries, requires at least
+// one remaining video or image, rejects any video whose size as reported by
+// utils.GetRemoteFileSize exceeds the limit, and then forwards the request to
+// qwen.VL. Every failure is reported through service.Error; on success the
+// qwen.VL result is returned through service.Success.
+//
+// NOTE(review): this handler appears to be registered on a no-auth route group
+// and presumably makes the server contact caller-supplied URLs — consider
+// restricting the allowed schemes/hosts.
+func AIVideoVL(ctx *gin.Context) {
+	var req VideoVLRequest
+	if err := ctx.ShouldBindJSON(&req); err != nil {
+		service.Error(ctx, errors.New("参数错误"))
+		return
+	}
+
+	// Drop empty entries up front so that they neither satisfy the
+	// "at least one input" check nor get forwarded to the model.
+	nonEmpty := func(urls []string) []string {
+		kept := make([]string, 0, len(urls))
+		for _, u := range urls {
+			if u != "" {
+				kept = append(kept, u)
+			}
+		}
+		return kept
+	}
+	videos, images := nonEmpty(req.Videos), nonEmpty(req.Images)
+
+	if len(videos) == 0 && len(images) == 0 {
+		service.Error(ctx, errors.New("至少需要提供一个视频或图片"))
+		return
+	}
+
+	// Per-video size limit; the value returned by utils.GetRemoteFileSize is
+	// assumed to be in MB — TODO confirm against that helper.
+	const maxVideoSizeMB = 55
+	for _, videoURL := range videos {
+		sizeMB, err := utils.GetRemoteFileSize(videoURL)
+		if err != nil {
+			service.Error(ctx, errors.New("获取视频大小失败: "+err.Error()))
+			return
+		}
+		if sizeMB > maxVideoSizeMB {
+			// The limit is on size, not on the number of videos.
+			service.Error(ctx, errors.New("作品视频大小不能超过55MB"))
+			return
+		}
+	}
+
+	result, err := qwen.VL(videos, images, req.Text, req.Model)
+	if err != nil {
+		service.Error(ctx, err)
+		return
+	}
+
+	service.Success(ctx, result)
+}