diff --git a/packages/opencode/src/provider/models.ts b/packages/opencode/src/provider/models.ts index 514203e91..97310dd19 100644 --- a/packages/opencode/src/provider/models.ts +++ b/packages/opencode/src/provider/models.ts @@ -28,6 +28,12 @@ export namespace ModelsDev { context: z.number(), output: z.number(), }), + modalities: z + .object({ + input: z.array(z.enum(["text", "audio", "image", "video", "pdf"])), + output: z.array(z.enum(["text", "audio", "image", "video", "pdf"])), + }) + .optional(), experimental: z.boolean().optional(), options: z.record(z.string(), z.any()), provider: z.object({ npm: z.string() }).optional(), diff --git a/packages/opencode/src/provider/provider.ts b/packages/opencode/src/provider/provider.ts index 188639830..cf7c4c20d 100644 --- a/packages/opencode/src/provider/provider.ts +++ b/packages/opencode/src/provider/provider.ts @@ -245,6 +245,11 @@ export namespace Provider { context: 0, output: 0, }, + modalities: model.modalities ?? + existing?.modalities ?? { + input: ["text"], + output: ["text"], + }, provider: model.provider ?? existing?.provider, } parsed.models[modelID] = parsedModel diff --git a/packages/opencode/src/session/prompt.ts b/packages/opencode/src/session/prompt.ts index 05fffb3af..632c2ba2f 100644 --- a/packages/opencode/src/session/prompt.ts +++ b/packages/opencode/src/session/prompt.ts @@ -453,6 +453,10 @@ export namespace SessionPrompt { abort: options.abortSignal!, messageID: input.processor.message.id, callID: options.toolCallId, + extra: { + modelID: input.modelID, + providerID: input.providerID, + }, agent: input.agent.name, metadata: async (val) => { const match = input.processor.partFromToolCall(options.toolCallId) @@ -485,22 +489,24 @@ export namespace SessionPrompt { }, toModelOutput: (result: any) => { const res = result as Tool.ExecuteResult - if (res.part) { - if (res.part.type === "text") { - return { - type: "text", - value: res.part.text, + if (res.parts) { + const parts = res.parts.map((part) => { + if (part.type === "text") { + return { + type: "text", + text: part.text, + } as const } - } + return { + type: "media", + mediaType: part.mime, + data: part.url, + } as const + }) + return { type: "content", - value: [ - { - type: "media", - mediaType: res.part.mime, - data: res.part.url, - }, - ], + value: parts, } } diff --git a/packages/opencode/src/tool/read.ts b/packages/opencode/src/tool/read.ts index 95174132d..40b30650e 100644 --- a/packages/opencode/src/tool/read.ts +++ b/packages/opencode/src/tool/read.ts @@ -7,6 +7,7 @@ import { FileTime } from "../file/time" import DESCRIPTION from "./read.txt" import { Filesystem } from "../util/filesystem" import { Instance } from "../project/instance" +import { Provider } from "../provider/provider" const DEFAULT_READ_LIMIT = 2000 const MAX_LINE_LENGTH = 2000 @@ -51,18 +52,30 @@ export const ReadTool = Tool.define("read", { } const isImage = isImageFile(filepath) + const supportsImages = await (async () => { + if (!ctx.extra?.["providerID"] || !ctx.extra?.["modelID"]) return false + const providerID = ctx.extra["providerID"] as string + const modelID = ctx.extra["modelID"] as string + const model = await Provider.getModel(providerID, modelID).catch(() => undefined) + if (!model) return false + return model.info.modalities?.input?.includes("image") ?? false + })() if (isImage) { + if (!supportsImages) { + throw new Error(`Model may not be able to read images`) + } const mime = file.type - const msg = `Image read successfully` + const msg = "Image read successfully" return { title, output: msg, - part: { - type: "file", - url: Buffer.from(await file.bytes()).toString("base64"), - mime, - filename: filepath, - }, + parts: [ + { + type: "file", + url: Buffer.from(await file.bytes()).toString("base64"), + mime, + }, + ], metadata: { preview: msg, }, diff --git a/packages/opencode/src/tool/read.txt b/packages/opencode/src/tool/read.txt index d46a5a52e..b5bffee26 100644 --- a/packages/opencode/src/tool/read.txt +++ b/packages/opencode/src/tool/read.txt @@ -9,3 +9,4 @@ Usage: - Results are returned using cat -n format, with line numbers starting at 1 - You have the capability to call multiple tools in a single response. It is always better to speculatively read multiple files as a batch that are potentially useful. - If you read a file that exists but has empty contents you will receive a system reminder warning in place of file contents. +- You can read image files using this tool. diff --git a/packages/opencode/src/tool/tool.ts b/packages/opencode/src/tool/tool.ts index 253d841ff..3c71f2f12 100644 --- a/packages/opencode/src/tool/tool.ts +++ b/packages/opencode/src/tool/tool.ts @@ -5,15 +5,13 @@ export namespace Tool { [key: string]: any } - export type PartOutput = - | { type: "text"; text: string } - | { type: "file"; url: string; mime: string; filename?: string } + export type PartOutput = { type: "text"; text: string } | { type: "file"; url: string; mime: string } export type ExecuteResult = { title: string metadata: M output: string - part?: PartOutput + parts?: PartOutput[] } export type Context = {