pdf support in read tool (#5222)

Co-authored-by: ammi1378 <ammi1378@users.noreply.github.com>
This commit is contained in:
Aiden Cline 2025-12-07 19:54:00 -08:00 committed by GitHub
parent 06ba1f76dc
commit a3bb4a3c85
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 40 additions and 169 deletions

View file

@ -2,6 +2,17 @@ import type { APICallError, ModelMessage } from "ai"
import { unique } from "remeda"
import type { JSONSchema } from "zod/v4/core"
import type { Provider } from "./provider"
import type { ModelsDev } from "./models"
type Modality = NonNullable<ModelsDev.Model["modalities"]>["input"][number]
function mimeToModality(mime: string): Modality | undefined {
if (mime.startsWith("image/")) return "image"
if (mime.startsWith("audio/")) return "audio"
if (mime.startsWith("video/")) return "video"
if (mime === "application/pdf") return "pdf"
return undefined
}
export namespace ProviderTransform {
function normalizeMessages(msgs: ModelMessage[], model: Provider.Model): ModelMessage[] {
@ -148,7 +159,32 @@ export namespace ProviderTransform {
return msgs
}
function unsupportedParts(msgs: ModelMessage[], model: Provider.Model): ModelMessage[] {
return msgs.map((msg) => {
if (msg.role !== "user" || !Array.isArray(msg.content)) return msg
const filtered = msg.content.map((part) => {
if (part.type !== "file" && part.type !== "image") return part
const mime = part.type === "image" ? part.image.toString().split(";")[0].replace("data:", "") : part.mediaType
const filename = part.type === "file" ? part.filename : undefined
const modality = mimeToModality(mime)
if (!modality) return part
if (model.capabilities.input[modality]) return part
const name = filename ? `"${filename}"` : modality
return {
type: "text" as const,
text: `ERROR: Cannot read ${name} (this model does not support ${modality} input). Inform the user.`,
}
})
return { ...msg, content: filtered }
})
}
export function message(msgs: ModelMessage[], model: Provider.Model) {
msgs = unsupportedParts(msgs, model)
msgs = normalizeMessages(msgs, model)
if (model.providerID === "anthropic" || model.api.id.includes("anthropic") || model.api.id.includes("claude")) {
msgs = applyCaching(msgs, model.providerID)

View file

@ -411,147 +411,6 @@ export namespace MessageV2 {
})
export type WithParts = z.infer<typeof WithParts>
export function fromV1(v1: Message.Info) {
if (v1.role === "assistant") {
const info: Assistant = {
id: v1.id,
parentID: "",
sessionID: v1.metadata.sessionID,
role: "assistant",
time: {
created: v1.metadata.time.created,
completed: v1.metadata.time.completed,
},
cost: v1.metadata.assistant!.cost,
path: v1.metadata.assistant!.path,
summary: v1.metadata.assistant!.summary,
tokens: v1.metadata.assistant!.tokens,
modelID: v1.metadata.assistant!.modelID,
providerID: v1.metadata.assistant!.providerID,
mode: "build",
error: v1.metadata.error,
}
const parts = v1.parts.flatMap((part): Part[] => {
const base = {
id: Identifier.ascending("part"),
messageID: v1.id,
sessionID: v1.metadata.sessionID,
}
if (part.type === "text") {
return [
{
...base,
type: "text",
text: part.text,
},
]
}
if (part.type === "step-start") {
return [
{
...base,
type: "step-start",
},
]
}
if (part.type === "tool-invocation") {
return [
{
...base,
type: "tool",
callID: part.toolInvocation.toolCallId,
tool: part.toolInvocation.toolName,
state: (() => {
if (part.toolInvocation.state === "partial-call") {
return {
status: "pending",
input: {},
raw: "",
}
}
const { title, time, ...metadata } = v1.metadata.tool[part.toolInvocation.toolCallId] ?? {}
if (part.toolInvocation.state === "call") {
return {
status: "running",
input: part.toolInvocation.args,
time: {
start: time?.start,
},
}
}
if (part.toolInvocation.state === "result") {
return {
status: "completed",
input: part.toolInvocation.args,
output: part.toolInvocation.result,
title,
time,
metadata,
}
}
throw new Error("unknown tool invocation state")
})(),
},
]
}
return []
})
return {
info,
parts,
}
}
if (v1.role === "user") {
const info: User = {
id: v1.id,
sessionID: v1.metadata.sessionID,
role: "user",
time: {
created: v1.metadata.time.created,
},
agent: "build",
model: {
providerID: "opencode",
modelID: "opencode",
},
}
const parts = v1.parts.flatMap((part): Part[] => {
const base = {
id: Identifier.ascending("part"),
messageID: v1.id,
sessionID: v1.metadata.sessionID,
}
if (part.type === "text") {
return [
{
...base,
type: "text",
text: part.text,
},
]
}
if (part.type === "file") {
return [
{
...base,
type: "file",
mime: part.mediaType,
filename: part.filename,
url: part.url,
},
]
}
return []
})
return { info, parts }
}
throw new Error("unknown message type")
}
export function toModelMessage(
input: {
info: Info

View file

@ -7,7 +7,6 @@ import { FileTime } from "../file/time"
import DESCRIPTION from "./read.txt"
import { Filesystem } from "../util/filesystem"
import { Instance } from "../project/instance"
import { Provider } from "../provider/provider"
import { Identifier } from "../id/id"
import { Permission } from "../permission"
import { Agent } from "@/agent/agent"
@ -94,15 +93,11 @@ export const ReadTool = Tool.define("read", {
throw new Error(`File not found: ${filepath}`)
}
const isImage = isImageFile(filepath)
const model = ctx.extra?.model as Provider.Model | undefined
const supportsImages = model?.capabilities.input.image ?? false
if (isImage) {
if (!supportsImages) {
throw new Error(`Failed to read image: ${filepath}, model may not be able to read images`)
}
const isImage = file.type.startsWith("image/")
const isPdf = file.type === "application/pdf"
if (isImage || isPdf) {
const mime = file.type
const msg = "Image read successfully"
const msg = `${isImage ? "Image" : "PDF"} read successfully`
return {
title,
output: msg,
@ -164,25 +159,6 @@ export const ReadTool = Tool.define("read", {
},
})
function isImageFile(filePath: string): string | false {
const ext = path.extname(filePath).toLowerCase()
switch (ext) {
case ".jpg":
case ".jpeg":
return "JPEG"
case ".png":
return "PNG"
case ".gif":
return "GIF"
case ".bmp":
return "BMP"
case ".webp":
return "WebP"
default:
return false
}
}
async function isBinaryFile(filepath: string, file: Bun.BunFile): Promise<boolean> {
const ext = path.extname(filepath).toLowerCase()
// binary check for common non-text extensions