feat(look_at): add image_data parameter for clipboard/pasted image support
Closes #704

Add support for base64-encoded image data in the look_at tool, enabling analysis of clipboard/pasted images without requiring a file path.

Changes:
- Add optional image_data parameter to LookAtArgs type
- Update validateArgs to accept either file_path or image_data
- Add inferMimeTypeFromBase64 function to detect image format
- Add try/catch around atob() to handle invalid base64 gracefully
- Update execute to handle both file path and data URL inputs
- Add comprehensive tests for image_data functionality
This commit is contained in:
parent
4330f25fee
commit
d099b0255f
@ -31,25 +31,52 @@ describe("look-at tool", () => {
|
|||||||
const normalized = normalizeArgs(args as any)
|
const normalized = normalizeArgs(args as any)
|
||||||
expect(normalized.file_path).toBe("/preferred.png")
|
expect(normalized.file_path).toBe("/preferred.png")
|
||||||
})
|
})
|
||||||
|
|
||||||
|
// given image_data provided
|
||||||
|
// when called with base64 image data
|
||||||
|
// then preserve image_data in normalized args
|
||||||
|
test("preserves image_data when provided", () => {
|
||||||
|
const args = { image_data: "data:image/png;base64,iVBORw0KGgo=", goal: "analyze" }
|
||||||
|
const normalized = normalizeArgs(args as any)
|
||||||
|
expect(normalized.image_data).toBe("data:image/png;base64,iVBORw0KGgo=")
|
||||||
|
expect(normalized.file_path).toBeUndefined()
|
||||||
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
describe("validateArgs", () => {
|
describe("validateArgs", () => {
|
||||||
// given valid arguments
|
// given valid arguments with file_path
|
||||||
// when validated
|
// when validated
|
||||||
// then return null (no error)
|
// then return null (no error)
|
||||||
test("returns null for valid args", () => {
|
test("returns null for valid args with file_path", () => {
|
||||||
const args = { file_path: "/valid/path.png", goal: "analyze" }
|
const args = { file_path: "/valid/path.png", goal: "analyze" }
|
||||||
expect(validateArgs(args)).toBeNull()
|
expect(validateArgs(args)).toBeNull()
|
||||||
})
|
})
|
||||||
|
|
||||||
// given file_path missing
|
// given valid arguments with image_data
|
||||||
|
// when validated
|
||||||
|
// then return null (no error)
|
||||||
|
test("returns null for valid args with image_data", () => {
|
||||||
|
const args = { image_data: "data:image/png;base64,iVBORw0KGgo=", goal: "analyze" }
|
||||||
|
expect(validateArgs(args)).toBeNull()
|
||||||
|
})
|
||||||
|
|
||||||
|
// given neither file_path nor image_data
|
||||||
// when validated
|
// when validated
|
||||||
// then clear error message
|
// then clear error message
|
||||||
test("returns error when file_path is missing", () => {
|
test("returns error when neither file_path nor image_data provided", () => {
|
||||||
const args = { goal: "analyze" } as any
|
const args = { goal: "analyze" } as any
|
||||||
const error = validateArgs(args)
|
const error = validateArgs(args)
|
||||||
expect(error).toContain("file_path")
|
expect(error).toContain("file_path")
|
||||||
expect(error).toContain("required")
|
expect(error).toContain("image_data")
|
||||||
|
})
|
||||||
|
|
||||||
|
// given both file_path and image_data
|
||||||
|
// when validated
|
||||||
|
// then return error (mutually exclusive)
|
||||||
|
test("returns error when both file_path and image_data provided", () => {
|
||||||
|
const args = { file_path: "/path.png", image_data: "base64data", goal: "analyze" }
|
||||||
|
const error = validateArgs(args)
|
||||||
|
expect(error).toContain("only one")
|
||||||
})
|
})
|
||||||
|
|
||||||
// given goal missing
|
// given goal missing
|
||||||
@ -69,6 +96,17 @@ describe("look-at tool", () => {
|
|||||||
const args = { file_path: "", goal: "analyze" }
|
const args = { file_path: "", goal: "analyze" }
|
||||||
const error = validateArgs(args)
|
const error = validateArgs(args)
|
||||||
expect(error).toContain("file_path")
|
expect(error).toContain("file_path")
|
||||||
|
expect(error).toContain("image_data")
|
||||||
|
})
|
||||||
|
|
||||||
|
// given image_data is empty string
|
||||||
|
// when validated
|
||||||
|
// then return error
|
||||||
|
test("returns error when image_data is empty string", () => {
|
||||||
|
const args = { image_data: "", goal: "analyze" }
|
||||||
|
const error = validateArgs(args)
|
||||||
|
expect(error).toContain("file_path")
|
||||||
|
expect(error).toContain("image_data")
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
@ -109,7 +147,7 @@ describe("look-at tool", () => {
|
|||||||
toolContext
|
toolContext
|
||||||
)
|
)
|
||||||
|
|
||||||
expect(result).toContain("Error: Failed to analyze file")
|
expect(result).toContain("Error: Failed to analyze")
|
||||||
expect(result).toContain("malformed response")
|
expect(result).toContain("malformed response")
|
||||||
expect(result).toContain("multimodal-looker")
|
expect(result).toContain("multimodal-looker")
|
||||||
expect(result).toContain("image/png")
|
expect(result).toContain("image/png")
|
||||||
@ -217,4 +255,111 @@ describe("look-at tool", () => {
|
|||||||
})
|
})
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
describe("createLookAt with image_data", () => {
|
||||||
|
// given base64 image data is provided
|
||||||
|
// when LookAt tool executed
|
||||||
|
// then should send data URL to session.prompt
|
||||||
|
test("sends data URL when image_data provided", async () => {
|
||||||
|
let promptBody: any
|
||||||
|
|
||||||
|
const mockClient = {
|
||||||
|
app: {
|
||||||
|
agents: async () => ({ data: [] }),
|
||||||
|
},
|
||||||
|
session: {
|
||||||
|
get: async () => ({ data: { directory: "/project" } }),
|
||||||
|
create: async () => ({ data: { id: "ses_image_data_test" } }),
|
||||||
|
prompt: async (input: any) => {
|
||||||
|
promptBody = input.body
|
||||||
|
return { data: {} }
|
||||||
|
},
|
||||||
|
messages: async () => ({
|
||||||
|
data: [
|
||||||
|
{ info: { role: "assistant", time: { created: 1 } }, parts: [{ type: "text", text: "analyzed" }] },
|
||||||
|
],
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
const tool = createLookAt({
|
||||||
|
client: mockClient,
|
||||||
|
directory: "/project",
|
||||||
|
} as any)
|
||||||
|
|
||||||
|
const toolContext: ToolContext = {
|
||||||
|
sessionID: "parent-session",
|
||||||
|
messageID: "parent-message",
|
||||||
|
agent: "sisyphus",
|
||||||
|
directory: "/project",
|
||||||
|
worktree: "/project",
|
||||||
|
abort: new AbortController().signal,
|
||||||
|
metadata: () => {},
|
||||||
|
ask: async () => {},
|
||||||
|
}
|
||||||
|
|
||||||
|
await tool.execute(
|
||||||
|
{ image_data: "data:image/png;base64,iVBORw0KGgo=", goal: "describe this image" },
|
||||||
|
toolContext
|
||||||
|
)
|
||||||
|
|
||||||
|
const filePart = promptBody.parts.find((p: any) => p.type === "file")
|
||||||
|
expect(filePart).toBeDefined()
|
||||||
|
expect(filePart.url).toContain("data:image/png;base64")
|
||||||
|
expect(filePart.mime).toBe("image/png")
|
||||||
|
expect(filePart.filename).toContain("clipboard-image")
|
||||||
|
})
|
||||||
|
|
||||||
|
// given raw base64 without data URI prefix
|
||||||
|
// when LookAt tool executed
|
||||||
|
// then should detect mime type and create proper data URL
|
||||||
|
test("handles raw base64 without data URI prefix", async () => {
|
||||||
|
let promptBody: any
|
||||||
|
|
||||||
|
const mockClient = {
|
||||||
|
app: {
|
||||||
|
agents: async () => ({ data: [] }),
|
||||||
|
},
|
||||||
|
session: {
|
||||||
|
get: async () => ({ data: { directory: "/project" } }),
|
||||||
|
create: async () => ({ data: { id: "ses_raw_base64_test" } }),
|
||||||
|
prompt: async (input: any) => {
|
||||||
|
promptBody = input.body
|
||||||
|
return { data: {} }
|
||||||
|
},
|
||||||
|
messages: async () => ({
|
||||||
|
data: [
|
||||||
|
{ info: { role: "assistant", time: { created: 1 } }, parts: [{ type: "text", text: "analyzed" }] },
|
||||||
|
],
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
const tool = createLookAt({
|
||||||
|
client: mockClient,
|
||||||
|
directory: "/project",
|
||||||
|
} as any)
|
||||||
|
|
||||||
|
const toolContext: ToolContext = {
|
||||||
|
sessionID: "parent-session",
|
||||||
|
messageID: "parent-message",
|
||||||
|
agent: "sisyphus",
|
||||||
|
directory: "/project",
|
||||||
|
worktree: "/project",
|
||||||
|
abort: new AbortController().signal,
|
||||||
|
metadata: () => {},
|
||||||
|
ask: async () => {},
|
||||||
|
}
|
||||||
|
|
||||||
|
await tool.execute(
|
||||||
|
{ image_data: "iVBORw0KGgo=", goal: "analyze" },
|
||||||
|
toolContext
|
||||||
|
)
|
||||||
|
|
||||||
|
const filePart = promptBody.parts.find((p: any) => p.type === "file")
|
||||||
|
expect(filePart).toBeDefined()
|
||||||
|
expect(filePart.url).toContain("data:")
|
||||||
|
expect(filePart.url).toContain("base64")
|
||||||
|
})
|
||||||
|
})
|
||||||
})
|
})
|
||||||
|
|||||||
@ -11,14 +11,23 @@ interface LookAtArgsWithAlias extends LookAtArgs {
|
|||||||
|
|
||||||
export function normalizeArgs(args: LookAtArgsWithAlias): LookAtArgs {
|
export function normalizeArgs(args: LookAtArgsWithAlias): LookAtArgs {
|
||||||
return {
|
return {
|
||||||
file_path: args.file_path ?? args.path ?? "",
|
file_path: args.file_path ?? args.path,
|
||||||
|
image_data: args.image_data,
|
||||||
goal: args.goal ?? "",
|
goal: args.goal ?? "",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export function validateArgs(args: LookAtArgs): string | null {
|
export function validateArgs(args: LookAtArgs): string | null {
|
||||||
if (!args.file_path) {
|
const hasFilePath = args.file_path && args.file_path.length > 0
|
||||||
return `Error: Missing required parameter 'file_path'. Usage: look_at(file_path="/path/to/file", goal="what to extract")`
|
const hasImageData = args.image_data && args.image_data.length > 0
|
||||||
|
|
||||||
|
if (!hasFilePath && !hasImageData) {
|
||||||
|
return `Error: Must provide either 'file_path' or 'image_data'. Usage:
|
||||||
|
- look_at(file_path="/path/to/file", goal="what to extract")
|
||||||
|
- look_at(image_data="base64_encoded_data", goal="what to extract")`
|
||||||
|
}
|
||||||
|
if (hasFilePath && hasImageData) {
|
||||||
|
return `Error: Provide only one of 'file_path' or 'image_data', not both.`
|
||||||
}
|
}
|
||||||
if (!args.goal) {
|
if (!args.goal) {
|
||||||
return `Error: Missing required parameter 'goal'. Usage: look_at(file_path="/path/to/file", goal="what to extract")`
|
return `Error: Missing required parameter 'goal'. Usage: look_at(file_path="/path/to/file", goal="what to extract")`
|
||||||
@ -26,6 +35,28 @@ export function validateArgs(args: LookAtArgs): string | null {
|
|||||||
return null
|
return null
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Best-effort MIME type detection for inline image input.
 *
 * Accepts either a data URL (`data:<mime>[;params][;base64],<payload>`) or a
 * raw base64 string.
 *
 * Resolution order:
 * 1. If the input is a data URL that declares a media type, return it as written.
 * 2. Otherwise decode the first bytes of the payload and match them against
 *    the PNG, JPEG, GIF, WebP and PDF signatures.
 * 3. If nothing matches, or the payload is not valid base64, return "image/png".
 *
 * @param base64Data - Data URL or raw base64 string.
 * @returns The declared or sniffed MIME type, or "image/png" as the default.
 */
function inferMimeTypeFromBase64(base64Data: string): string {
  // Pasted values often carry stray leading/trailing whitespace.
  const input = base64Data.trim()
  let payload = input

  if (input.startsWith("data:")) {
    // Split on the first comma, the same delimiter extractBase64Data uses,
    // so forms such as "data:;base64,..." or "data:image/jpeg,..." are also
    // handled instead of falling into atob() with the prefix still attached.
    const commaIndex = input.indexOf(",")
    const mediaSection = commaIndex === -1 ? input.slice(5) : input.slice(5, commaIndex)
    const declared = (mediaSection.split(";")[0] ?? "").trim()

    // Only trust a declared value that looks like "type/subtype".
    if (declared.includes("/")) return declared

    if (commaIndex !== -1) payload = input.slice(commaIndex + 1)
  }

  try {
    // 16 base64 characters decode to 12 bytes, enough to reach the "WEBP"
    // tag at byte offset 8. Whitespace is removed first so line-wrapped
    // base64 still yields a full header; only a small prefix is scanned to
    // avoid copying a large payload.
    const head = atob(payload.slice(0, 64).replace(/\s+/g, "").slice(0, 16))

    if (head.startsWith("\x89PNG")) return "image/png"
    if (head.startsWith("\xFF\xD8\xFF")) return "image/jpeg"
    if (head.startsWith("GIF8")) return "image/gif"
    if (head.startsWith("RIFF") && head.includes("WEBP")) return "image/webp"
    if (head.startsWith("%PDF")) return "application/pdf"
  } catch {
    // Invalid base64 - deliberately fall through to the default below.
  }

  return "image/png"
}
|
||||||
|
|
||||||
function inferMimeType(filePath: string): string {
|
function inferMimeType(filePath: string): string {
|
||||||
const ext = extname(filePath).toLowerCase()
|
const ext = extname(filePath).toLowerCase()
|
||||||
const mimeTypes: Record<string, string> = {
|
const mimeTypes: Record<string, string> = {
|
||||||
@ -64,11 +95,22 @@ function inferMimeType(filePath: string): string {
|
|||||||
return mimeTypes[ext] || "application/octet-stream"
|
return mimeTypes[ext] || "application/octet-stream"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Returns the bare base64 payload of an image input.
 *
 * - For a data URL (`data:...,<payload>`), returns everything after the first
 *   comma.
 * - For any other input, or a `data:` string with no comma, returns the
 *   input itself.
 *
 * All ASCII whitespace is removed from the result. Whitespace carries no
 * meaning in base64, but clipboard or CLI-generated data is frequently
 * line-wrapped or padded with newlines, and the result is embedded verbatim
 * into a new data URL by the caller.
 *
 * @param imageData - Data URL or raw base64 string.
 * @returns The base64 payload with whitespace stripped.
 */
function extractBase64Data(imageData: string): string {
  // Trim first so a leading newline/space does not hide the "data:" prefix.
  const input = imageData.trim()

  if (input.startsWith("data:")) {
    const commaIndex = input.indexOf(",")
    if (commaIndex !== -1) {
      return input.slice(commaIndex + 1).replace(/\s+/g, "")
    }
  }

  return input.replace(/\s+/g, "")
}
|
||||||
|
|
||||||
export function createLookAt(ctx: PluginInput): ToolDefinition {
|
export function createLookAt(ctx: PluginInput): ToolDefinition {
|
||||||
return tool({
|
return tool({
|
||||||
description: LOOK_AT_DESCRIPTION,
|
description: LOOK_AT_DESCRIPTION,
|
||||||
args: {
|
args: {
|
||||||
file_path: tool.schema.string().describe("Absolute path to the file to analyze"),
|
file_path: tool.schema.string().optional().describe("Absolute path to the file to analyze"),
|
||||||
|
image_data: tool.schema.string().optional().describe("Base64 encoded image data (for clipboard/pasted images)"),
|
||||||
goal: tool.schema.string().describe("What specific information to extract from the file"),
|
goal: tool.schema.string().describe("What specific information to extract from the file"),
|
||||||
},
|
},
|
||||||
async execute(rawArgs: LookAtArgs, toolContext) {
|
async execute(rawArgs: LookAtArgs, toolContext) {
|
||||||
@ -79,12 +121,34 @@ export function createLookAt(ctx: PluginInput): ToolDefinition {
|
|||||||
return validationError
|
return validationError
|
||||||
}
|
}
|
||||||
|
|
||||||
log(`[look_at] Analyzing file: ${args.file_path}, goal: ${args.goal}`)
|
const isBase64Input = Boolean(args.image_data)
|
||||||
|
const sourceDescription = isBase64Input ? "clipboard/pasted image" : args.file_path
|
||||||
|
log(`[look_at] Analyzing ${sourceDescription}, goal: ${args.goal}`)
|
||||||
|
|
||||||
const mimeType = inferMimeType(args.file_path)
|
let mimeType: string
|
||||||
const filename = basename(args.file_path)
|
let filePart: { type: "file"; mime: string; url: string; filename: string }
|
||||||
|
|
||||||
const prompt = `Analyze this file and extract the requested information.
|
if (isBase64Input) {
|
||||||
|
mimeType = inferMimeTypeFromBase64(args.image_data!)
|
||||||
|
const base64Content = extractBase64Data(args.image_data!)
|
||||||
|
const dataUrl = `data:${mimeType};base64,${base64Content}`
|
||||||
|
filePart = {
|
||||||
|
type: "file",
|
||||||
|
mime: mimeType,
|
||||||
|
url: dataUrl,
|
||||||
|
filename: `clipboard-image.${mimeType.split("/")[1] || "png"}`,
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
mimeType = inferMimeType(args.file_path!)
|
||||||
|
filePart = {
|
||||||
|
type: "file",
|
||||||
|
mime: mimeType,
|
||||||
|
url: pathToFileURL(args.file_path!).href,
|
||||||
|
filename: basename(args.file_path!),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const prompt = `Analyze this ${isBase64Input ? "image" : "file"} and extract the requested information.
|
||||||
|
|
||||||
Goal: ${args.goal}
|
Goal: ${args.goal}
|
||||||
|
|
||||||
@ -157,7 +221,7 @@ Original error: ${createResult.error}`
|
|||||||
log("[look_at] Failed to resolve multimodal-looker model info", error)
|
log("[look_at] Failed to resolve multimodal-looker model info", error)
|
||||||
}
|
}
|
||||||
|
|
||||||
log(`[look_at] Sending prompt with file passthrough to session ${sessionID}`)
|
log(`[look_at] Sending prompt with ${isBase64Input ? "base64 image" : "file"} to session ${sessionID}`)
|
||||||
try {
|
try {
|
||||||
await promptWithModelSuggestionRetry(ctx.client, {
|
await promptWithModelSuggestionRetry(ctx.client, {
|
||||||
path: { id: sessionID },
|
path: { id: sessionID },
|
||||||
@ -171,7 +235,7 @@ Original error: ${createResult.error}`
|
|||||||
},
|
},
|
||||||
parts: [
|
parts: [
|
||||||
{ type: "text", text: prompt },
|
{ type: "text", text: prompt },
|
||||||
{ type: "file", mime: mimeType, url: pathToFileURL(args.file_path).href, filename },
|
filePart,
|
||||||
],
|
],
|
||||||
...(agentModel ? { model: { providerID: agentModel.providerID, modelID: agentModel.modelID } } : {}),
|
...(agentModel ? { model: { providerID: agentModel.providerID, modelID: agentModel.modelID } } : {}),
|
||||||
...(agentVariant ? { variant: agentVariant } : {}),
|
...(agentVariant ? { variant: agentVariant } : {}),
|
||||||
@ -183,20 +247,20 @@ Original error: ${createResult.error}`
|
|||||||
|
|
||||||
const isJsonParseError = errorMessage.includes("JSON") && (errorMessage.includes("EOF") || errorMessage.includes("parse"))
|
const isJsonParseError = errorMessage.includes("JSON") && (errorMessage.includes("EOF") || errorMessage.includes("parse"))
|
||||||
if (isJsonParseError) {
|
if (isJsonParseError) {
|
||||||
return `Error: Failed to analyze file - received malformed response from multimodal-looker agent.
|
return `Error: Failed to analyze ${isBase64Input ? "image" : "file"} - received malformed response from multimodal-looker agent.
|
||||||
|
|
||||||
This typically occurs when:
|
This typically occurs when:
|
||||||
1. The multimodal-looker model is not available or not connected
|
1. The multimodal-looker model is not available or not connected
|
||||||
2. The model does not support this file type (${mimeType})
|
2. The model does not support this ${isBase64Input ? "image format" : `file type (${mimeType})`}
|
||||||
3. The API returned an empty or truncated response
|
3. The API returned an empty or truncated response
|
||||||
|
|
||||||
File: ${args.file_path}
|
${isBase64Input ? "Source: clipboard/pasted image" : `File: ${args.file_path}`}
|
||||||
MIME type: ${mimeType}
|
MIME type: ${mimeType}
|
||||||
|
|
||||||
Try:
|
Try:
|
||||||
- Ensure a vision-capable model (e.g., gemini-3-flash, gpt-5.2) is available
|
- Ensure a vision-capable model (e.g., gemini-3-flash, gpt-5.2) is available
|
||||||
- Check provider connections in opencode settings
|
- Check provider connections in opencode settings
|
||||||
- For text files like .md, .txt, use the Read tool instead
|
${!isBase64Input ? "- For text files like .md, .txt, use the Read tool instead" : ""}
|
||||||
|
|
||||||
Original error: ${errorMessage}`
|
Original error: ${errorMessage}`
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
export interface LookAtArgs {
|
export interface LookAtArgs {
|
||||||
file_path: string
|
file_path?: string
|
||||||
|
image_data?: string // base64 encoded image data (for clipboard images)
|
||||||
goal: string
|
goal: string
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user