diff --git a/src/tools/look-at/assistant-message-extractor.ts b/src/tools/look-at/assistant-message-extractor.ts new file mode 100644 index 00000000..f7db57b7 --- /dev/null +++ b/src/tools/look-at/assistant-message-extractor.ts @@ -0,0 +1,67 @@ +type MessageTime = { created?: number } + +type MessageInfo = { + role?: string + time?: MessageTime +} + +type MessagePart = { + type?: string + text?: string +} + +type SessionMessage = { + info?: MessageInfo + parts?: unknown +} + +function isObject(value: unknown): value is Record { + return typeof value === "object" && value !== null +} + +function asSessionMessage(value: unknown): SessionMessage | null { + if (!isObject(value)) return null + const info = value["info"] + const parts = value["parts"] + return { + info: isObject(info) + ? { + role: typeof info["role"] === "string" ? info["role"] : undefined, + time: isObject(info["time"]) ? { created: typeof info["time"]["created"] === "number" ? info["time"]["created"] : undefined } : undefined, + } + : undefined, + parts, + } +} + +function getCreatedTime(message: SessionMessage): number { + return message.info?.time?.created ?? 0 +} + +function getTextParts(message: SessionMessage): MessagePart[] { + if (!Array.isArray(message.parts)) return [] + return message.parts + .filter((part): part is Record => isObject(part)) + .map((part) => ({ + type: typeof part["type"] === "string" ? part["type"] : undefined, + text: typeof part["text"] === "string" ? part["text"] : undefined, + })) + .filter((part) => part.type === "text" && Boolean(part.text)) +} + +export function extractLatestAssistantText(messages: unknown): string | null { + if (!Array.isArray(messages) || messages.length === 0) return null + + const assistantMessages = messages + .map(asSessionMessage) + .filter((message): message is SessionMessage => message !== null) + .filter((message) => message.info?.role === "assistant") + .sort((a, b) => getCreatedTime(b) - getCreatedTime(a)) + + const lastAssistantMessage = assistantMessages[0] + if (!lastAssistantMessage) return null + + const textParts = getTextParts(lastAssistantMessage) + const responseText = textParts.map((part) => part.text).join("\n") + return responseText +} diff --git a/src/tools/look-at/look-at-arguments.ts b/src/tools/look-at/look-at-arguments.ts new file mode 100644 index 00000000..dc98f692 --- /dev/null +++ b/src/tools/look-at/look-at-arguments.ts @@ -0,0 +1,31 @@ +import type { LookAtArgs } from "./types" + +export interface LookAtArgsWithAlias extends LookAtArgs { + path?: string +} + +export function normalizeArgs(args: LookAtArgsWithAlias): LookAtArgs { + return { + file_path: args.file_path ?? args.path, + image_data: args.image_data, + goal: args.goal ?? "", + } +} + +export function validateArgs(args: LookAtArgs): string | null { + const hasFilePath = Boolean(args.file_path && args.file_path.length > 0) + const hasImageData = Boolean(args.image_data && args.image_data.length > 0) + + if (!hasFilePath && !hasImageData) { + return `Error: Must provide either 'file_path' or 'image_data'. Usage: +- look_at(file_path="/path/to/file", goal="what to extract") +- look_at(image_data="base64_encoded_data", goal="what to extract")` + } + if (hasFilePath && hasImageData) { + return "Error: Provide only one of 'file_path' or 'image_data', not both." + } + if (!args.goal) { + return "Error: Missing required parameter 'goal'. Usage: look_at(file_path=\"/path/to/file\", goal=\"what to extract\")" + } + return null +} diff --git a/src/tools/look-at/mime-type-inference.ts b/src/tools/look-at/mime-type-inference.ts new file mode 100644 index 00000000..18954c46 --- /dev/null +++ b/src/tools/look-at/mime-type-inference.ts @@ -0,0 +1,71 @@ +import { extname } from "node:path" + +export function inferMimeTypeFromBase64(base64Data: string): string { + if (base64Data.startsWith("data:")) { + const match = base64Data.match(/^data:([^;]+);/) + if (match) return match[1] + } + + try { + const cleanData = base64Data.replace(/^data:[^;]+;base64,/, "") + const header = atob(cleanData.slice(0, 16)) + + if (header.startsWith("\x89PNG")) return "image/png" + if (header.startsWith("\xFF\xD8\xFF")) return "image/jpeg" + if (header.startsWith("GIF8")) return "image/gif" + if (header.startsWith("RIFF") && header.includes("WEBP")) return "image/webp" + if (header.startsWith("%PDF")) return "application/pdf" + } catch { + // invalid base64 - fall through + } + + return "image/png" +} + +export function inferMimeTypeFromFilePath(filePath: string): string { + const ext = extname(filePath).toLowerCase() + const mimeTypes: Record = { + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".png": "image/png", + ".webp": "image/webp", + ".heic": "image/heic", + ".heif": "image/heif", + ".mp4": "video/mp4", + ".mpeg": "video/mpeg", + ".mpg": "video/mpeg", + ".mov": "video/mov", + ".avi": "video/avi", + ".flv": "video/x-flv", + ".webm": "video/webm", + ".wmv": "video/wmv", + ".3gpp": "video/3gpp", + ".3gp": "video/3gpp", + ".wav": "audio/wav", + ".mp3": "audio/mp3", + ".aiff": "audio/aiff", + ".aac": "audio/aac", + ".ogg": "audio/ogg", + ".flac": "audio/flac", + ".pdf": "application/pdf", + ".txt": "text/plain", + ".csv": "text/csv", + ".md": "text/md", + ".html": "text/html", + ".json": "application/json", + ".xml": "application/xml", + ".js": "text/javascript", + ".py": "text/x-python", + } + return mimeTypes[ext] || "application/octet-stream" +} + +export function extractBase64Data(imageData: string): string { + if (imageData.startsWith("data:")) { + const commaIndex = imageData.indexOf(",") + if (commaIndex !== -1) { + return imageData.slice(commaIndex + 1) + } + } + return imageData +} diff --git a/src/tools/look-at/multimodal-agent-metadata.ts b/src/tools/look-at/multimodal-agent-metadata.ts new file mode 100644 index 00000000..e24c8b6f --- /dev/null +++ b/src/tools/look-at/multimodal-agent-metadata.ts @@ -0,0 +1,56 @@ +import type { PluginInput } from "@opencode-ai/plugin" +import { MULTIMODAL_LOOKER_AGENT } from "./constants" +import { log } from "../../shared" + +type AgentModel = { providerID: string; modelID: string } + +type ResolvedAgentMetadata = { + agentModel?: AgentModel + agentVariant?: string +} + +type AgentInfo = { + name?: string + model?: AgentModel + variant?: string +} + +function isObject(value: unknown): value is Record { + return typeof value === "object" && value !== null +} + +function toAgentInfo(value: unknown): AgentInfo | null { + if (!isObject(value)) return null + const name = typeof value["name"] === "string" ? value["name"] : undefined + const variant = typeof value["variant"] === "string" ? value["variant"] : undefined + const modelValue = value["model"] + const model = + isObject(modelValue) && + typeof modelValue["providerID"] === "string" && + typeof modelValue["modelID"] === "string" + ? { providerID: modelValue["providerID"], modelID: modelValue["modelID"] } + : undefined + return { name, model, variant } +} + +export async function resolveMultimodalLookerAgentMetadata( + ctx: PluginInput +): Promise { + try { + const agentsResult = await ctx.client.app?.agents?.() + const agentsRaw = isObject(agentsResult) ? agentsResult["data"] : undefined + const agents = Array.isArray(agentsRaw) ? agentsRaw.map(toAgentInfo).filter(Boolean) : [] + + const matched = agents.find( + (agent) => agent?.name?.toLowerCase() === MULTIMODAL_LOOKER_AGENT.toLowerCase() + ) + + return { + agentModel: matched?.model, + agentVariant: matched?.variant, + } + } catch (error) { + log("[look_at] Failed to resolve multimodal-looker model info", error) + return {} + } +} diff --git a/src/tools/look-at/tools.ts b/src/tools/look-at/tools.ts index 28e6edf8..c9ae3448 100644 --- a/src/tools/look-at/tools.ts +++ b/src/tools/look-at/tools.ts @@ -1,109 +1,20 @@ -import { extname, basename } from "node:path" +import { basename } from "node:path" import { pathToFileURL } from "node:url" import { tool, type PluginInput, type ToolDefinition } from "@opencode-ai/plugin" import { LOOK_AT_DESCRIPTION, MULTIMODAL_LOOKER_AGENT } from "./constants" import type { LookAtArgs } from "./types" import { log, promptSyncWithModelSuggestionRetry } from "../../shared" +import { extractLatestAssistantText } from "./assistant-message-extractor" +import type { LookAtArgsWithAlias } from "./look-at-arguments" +import { normalizeArgs, validateArgs } from "./look-at-arguments" +import { + extractBase64Data, + inferMimeTypeFromBase64, + inferMimeTypeFromFilePath, +} from "./mime-type-inference" +import { resolveMultimodalLookerAgentMetadata } from "./multimodal-agent-metadata" -interface LookAtArgsWithAlias extends LookAtArgs { - path?: string -} - -export function normalizeArgs(args: LookAtArgsWithAlias): LookAtArgs { - return { - file_path: args.file_path ?? args.path, - image_data: args.image_data, - goal: args.goal ?? "", - } -} - -export function validateArgs(args: LookAtArgs): string | null { - const hasFilePath = args.file_path && args.file_path.length > 0 - const hasImageData = args.image_data && args.image_data.length > 0 - - if (!hasFilePath && !hasImageData) { - return `Error: Must provide either 'file_path' or 'image_data'. Usage: -- look_at(file_path="/path/to/file", goal="what to extract") -- look_at(image_data="base64_encoded_data", goal="what to extract")` - } - if (hasFilePath && hasImageData) { - return `Error: Provide only one of 'file_path' or 'image_data', not both.` - } - if (!args.goal) { - return `Error: Missing required parameter 'goal'. Usage: look_at(file_path="/path/to/file", goal="what to extract")` - } - return null -} - -function inferMimeTypeFromBase64(base64Data: string): string { - if (base64Data.startsWith("data:")) { - const match = base64Data.match(/^data:([^;]+);/) - if (match) return match[1] - } - - try { - const cleanData = base64Data.replace(/^data:[^;]+;base64,/, "") - const header = atob(cleanData.slice(0, 16)) - - if (header.startsWith("\x89PNG")) return "image/png" - if (header.startsWith("\xFF\xD8\xFF")) return "image/jpeg" - if (header.startsWith("GIF8")) return "image/gif" - if (header.startsWith("RIFF") && header.includes("WEBP")) return "image/webp" - if (header.startsWith("%PDF")) return "application/pdf" - } catch { - // Invalid base64 - fall through to default - } - - return "image/png" -} - -function inferMimeType(filePath: string): string { - const ext = extname(filePath).toLowerCase() - const mimeTypes: Record = { - ".jpg": "image/jpeg", - ".jpeg": "image/jpeg", - ".png": "image/png", - ".webp": "image/webp", - ".heic": "image/heic", - ".heif": "image/heif", - ".mp4": "video/mp4", - ".mpeg": "video/mpeg", - ".mpg": "video/mpeg", - ".mov": "video/mov", - ".avi": "video/avi", - ".flv": "video/x-flv", - ".webm": "video/webm", - ".wmv": "video/wmv", - ".3gpp": "video/3gpp", - ".3gp": "video/3gpp", - ".wav": "audio/wav", - ".mp3": "audio/mp3", - ".aiff": "audio/aiff", - ".aac": "audio/aac", - ".ogg": "audio/ogg", - ".flac": "audio/flac", - ".pdf": "application/pdf", - ".txt": "text/plain", - ".csv": "text/csv", - ".md": "text/md", - ".html": "text/html", - ".json": "application/json", - ".xml": "application/xml", - ".js": "text/javascript", - ".py": "text/x-python", - } - return mimeTypes[ext] || "application/octet-stream" -} - -function extractBase64Data(imageData: string): string { - if (imageData.startsWith("data:")) { - const commaIndex = imageData.indexOf(",") - if (commaIndex !== -1) { - return imageData.slice(commaIndex + 1) - } - } - return imageData -} +export { normalizeArgs, validateArgs } from "./look-at-arguments" export function createLookAt(ctx: PluginInput): ToolDefinition { return tool({ @@ -125,27 +36,30 @@ export function createLookAt(ctx: PluginInput): ToolDefinition { const sourceDescription = isBase64Input ? "clipboard/pasted image" : args.file_path log(`[look_at] Analyzing ${sourceDescription}, goal: ${args.goal}`) + const imageData = args.image_data + const filePath = args.file_path + let mimeType: string let filePart: { type: "file"; mime: string; url: string; filename: string } - if (isBase64Input) { - mimeType = inferMimeTypeFromBase64(args.image_data!) - const base64Content = extractBase64Data(args.image_data!) - const dataUrl = `data:${mimeType};base64,${base64Content}` + if (imageData) { + mimeType = inferMimeTypeFromBase64(imageData) filePart = { type: "file", mime: mimeType, - url: dataUrl, + url: `data:${mimeType};base64,${extractBase64Data(imageData)}`, filename: `clipboard-image.${mimeType.split("/")[1] || "png"}`, } - } else { - mimeType = inferMimeType(args.file_path!) + } else if (filePath) { + mimeType = inferMimeTypeFromFilePath(filePath) filePart = { type: "file", mime: mimeType, - url: pathToFileURL(args.file_path!).href, - filename: basename(args.file_path!), + url: pathToFileURL(filePath).href, + filename: basename(filePath), } + } else { + return "Error: Must provide either 'file_path' or 'image_data'." } const prompt = `Analyze this ${isBase64Input ? "image" : "file"} and extract the requested information. @@ -166,13 +80,8 @@ If the requested information is not found, clearly state what is missing.` body: { parentID: toolContext.sessionID, title: `look_at: ${args.goal.substring(0, 50)}`, - permission: [ - { permission: "question", action: "deny" as const, pattern: "*" }, - ], - } as any, - query: { - directory: parentDirectory, }, + query: { directory: parentDirectory }, }) if (createResult.error) { @@ -194,32 +103,7 @@ Original error: ${createResult.error}` const sessionID = createResult.data.id log(`[look_at] Created session: ${sessionID}`) - let agentModel: { providerID: string; modelID: string } | undefined - let agentVariant: string | undefined - - try { - const agentsResult = await ctx.client.app?.agents?.() - type AgentInfo = { - name: string - mode?: "subagent" | "primary" | "all" - model?: { providerID: string; modelID: string } - variant?: string - } - const agents = ((agentsResult as { data?: AgentInfo[] })?.data ?? agentsResult) as AgentInfo[] | undefined - if (agents?.length) { - const matchedAgent = agents.find( - (agent) => agent.name.toLowerCase() === MULTIMODAL_LOOKER_AGENT.toLowerCase() - ) - if (matchedAgent?.model) { - agentModel = matchedAgent.model - } - if (matchedAgent?.variant) { - agentVariant = matchedAgent.variant - } - } - } catch (error) { - log("[look_at] Failed to resolve multimodal-looker model info", error) - } + const { agentModel, agentVariant } = await resolveMultimodalLookerAgentMetadata(ctx) log(`[look_at] Sending prompt with ${isBase64Input ? "base64 image" : "file"} to session ${sessionID}`) try { @@ -242,7 +126,6 @@ Original error: ${createResult.error}` }, }) } catch (promptError) { - const errorMessage = promptError instanceof Error ? promptError.message : String(promptError) log(`[look_at] Prompt error:`, promptError) throw promptError @@ -262,25 +145,13 @@ Original error: ${createResult.error}` const messages = messagesResult.data log(`[look_at] Got ${messages.length} messages`) - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const lastAssistantMessage = messages - .filter((m: any) => m.info.role === "assistant") - .sort((a: any, b: any) => (b.info.time?.created || 0) - (a.info.time?.created || 0))[0] - - if (!lastAssistantMessage) { - log(`[look_at] No assistant message found`) - return `Error: No response from multimodal-looker agent` + const responseText = extractLatestAssistantText(messages) + if (!responseText) { + log("[look_at] No assistant message found") + return "Error: No response from multimodal-looker agent" } - log(`[look_at] Found assistant message with ${lastAssistantMessage.parts.length} parts`) - - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const textParts = lastAssistantMessage.parts.filter((p: any) => p.type === "text") - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const responseText = textParts.map((p: any) => p.text).join("\n") - log(`[look_at] Got response, length: ${responseText.length}`) - return responseText }, })