From 3af30b0a211830750e2f695c38c234d1ee6c8b65 Mon Sep 17 00:00:00 2001 From: YeonGyu-Kim Date: Sun, 25 Jan 2026 15:02:41 +0900 Subject: [PATCH] feat(skills): add agent-browser option for browser automation (#1090) Add configurable browser automation allowing users to choose between Playwright MCP (default) and Vercel's agent-browser CLI. Changes: - Add browser_automation_engine.provider config option - Dynamic skill loading based on provider selection - Comprehensive agent-browser CLI reference (inline in skills.ts) - Propagate browserProvider to delegate_task and buildAgent - Update documentation with provider comparison Co-authored-by: Suyeol Jeon Co-authored-by: YeonGyu Kim --- assets/oh-my-opencode.schema.json | 14 + docs/configurations.md | 51 ++- docs/features.md | 43 ++- src/agents/utils.test.ts | 45 ++- src/agents/utils.ts | 13 +- src/config/schema.test.ts | 100 +++++- src/config/schema.ts | 15 + .../builtin-skills/agent-browser/SKILL.md | 296 +++++++++++++++++ src/features/builtin-skills/index.ts | 2 +- src/features/builtin-skills/skills.test.ts | 89 +++++ src/features/builtin-skills/skills.ts | 310 +++++++++++++++++- .../skill-content.test.ts | 63 ++++ .../opencode-skill-loader/skill-content.ts | 28 +- src/index.ts | 4 +- src/plugin-handlers/config-handler.ts | 4 +- src/tools/delegate-task/tools.test.ts | 109 +++++- src/tools/delegate-task/tools.ts | 7 +- 17 files changed, 1155 insertions(+), 38 deletions(-) create mode 100644 src/features/builtin-skills/agent-browser/SKILL.md create mode 100644 src/features/builtin-skills/skills.test.ts diff --git a/assets/oh-my-opencode.schema.json b/assets/oh-my-opencode.schema.json index a57bc2e0..126fcc1e 100644 --- a/assets/oh-my-opencode.schema.json +++ b/assets/oh-my-opencode.schema.json @@ -38,6 +38,7 @@ "type": "string", "enum": [ "playwright", + "agent-browser", "frontend-ui-ux", "git-master" ] @@ -2172,6 +2173,19 @@ "type": "boolean" } } + }, + "browser_automation_engine": { + "type": "object", + "properties": 
{ + "provider": { + "default": "playwright", + "type": "string", + "enum": [ + "playwright", + "agent-browser" + ] + } + } } } } \ No newline at end of file diff --git a/docs/configurations.md b/docs/configurations.md index 67735089..39bfbbc3 100644 --- a/docs/configurations.md +++ b/docs/configurations.md @@ -159,7 +159,7 @@ Available agents: `oracle`, `librarian`, `explore`, `multimodal-looker` Oh My OpenCode includes built-in skills that provide additional capabilities: -- **playwright**: Browser automation with Playwright MCP. Use for web scraping, testing, screenshots, and browser interactions. +- **playwright** (default) / **agent-browser**: Browser automation for web scraping, testing, screenshots, and browser interactions. See [Browser Automation](#browser-automation) for switching between providers. - **git-master**: Git expert for atomic commits, rebase/squash, and history search (blame, bisect, log -S). STRONGLY RECOMMENDED: Use with `delegate_task(category='quick', load_skills=['git-master'], ...)` to save context. 
Disable built-in skills via `disabled_skills` in `~/.config/opencode/oh-my-opencode.json` or `.opencode/oh-my-opencode.json`: @@ -170,7 +170,54 @@ Disable built-in skills via `disabled_skills` in `~/.config/opencode/oh-my-openc } ``` -Available built-in skills: `playwright`, `git-master` +Available built-in skills: `playwright`, `agent-browser`, `git-master` + +## Browser Automation + +Choose between two browser automation providers: + +| Provider | Interface | Features | Installation | +|----------|-----------|----------|--------------| +| **playwright** (default) | MCP tools | Playwright MCP server with structured tool calls | Auto-installed via npx | +| **agent-browser** | Bash CLI | Vercel's CLI with session management, parallel browsers | Requires `bun add -g agent-browser` | + +**Switch providers** via `browser_automation_engine` in `oh-my-opencode.json`: + +```json +{ + "browser_automation_engine": { + "provider": "agent-browser" + } +} +``` + +### Playwright (Default) + +Uses the official Playwright MCP server (`@playwright/mcp`). Browser automation happens through structured MCP tool calls. + +### agent-browser + +Uses [Vercel's agent-browser CLI](https://github.com/vercel-labs/agent-browser). Key advantages: +- **Session management**: Run multiple isolated browser instances with `--session` flag +- **Persistent profiles**: Keep browser state across restarts with `--profile` +- **Snapshot-based workflow**: Get element refs via `snapshot -i`, interact with `@e1`, `@e2`, etc. 
+- **CLI-first**: All commands via Bash - great for scripting + +**Installation required**: +```bash +bun add -g agent-browser +agent-browser install # Download Chromium +``` + +**Example workflow**: +```bash +agent-browser open https://example.com +agent-browser snapshot -i # Get interactive elements with refs +agent-browser fill @e1 "user@example.com" +agent-browser click @e2 +agent-browser screenshot result.png +agent-browser close +``` ## Git Master diff --git a/docs/features.md b/docs/features.md index 2e5990dc..21cfabc9 100644 --- a/docs/features.md +++ b/docs/features.md @@ -78,11 +78,15 @@ Skills provide specialized workflows with embedded MCP servers and detailed inst | **frontend-ui-ux** | UI/UX tasks, styling | Designer-turned-developer persona. Crafts stunning UI/UX even without design mockups. Emphasizes bold aesthetic direction, distinctive typography, cohesive color palettes. | | **git-master** | commit, rebase, squash, blame | MUST USE for ANY git operations. Atomic commits with automatic splitting, rebase/squash workflows, history search (blame, bisect, log -S). 
| -### Skill: playwright +### Skill: Browser Automation (playwright / agent-browser) **Trigger**: Any browser-related request -Provides browser automation via Playwright MCP server: +Oh-My-OpenCode provides two browser automation providers, configurable via `browser_automation_engine.provider`: + +#### Option 1: Playwright MCP (Default) + +The default provider uses Playwright MCP server: ```yaml mcp: @@ -91,18 +95,41 @@ mcp: args: ["@playwright/mcp@latest"] ``` -**Capabilities**: +**Usage**: +``` +/playwright Navigate to example.com and take a screenshot +``` + +#### Option 2: Agent Browser CLI (Vercel) + +Alternative provider using [Vercel's agent-browser CLI](https://github.com/vercel-labs/agent-browser): + +```json +{ + "browser_automation_engine": { + "provider": "agent-browser" + } +} +``` + +**Requires installation**: +```bash +bun add -g agent-browser +``` + +**Usage**: +``` +Use agent-browser to navigate to example.com and extract the main heading +``` + +#### Capabilities (Both Providers) + - Navigate and interact with web pages - Take screenshots and PDFs - Fill forms and click elements - Wait for network requests - Scrape content -**Usage**: -``` -/playwright Navigate to example.com and take a screenshot -``` - ### Skill: frontend-ui-ux **Trigger**: UI design tasks, visual changes diff --git a/src/agents/utils.test.ts b/src/agents/utils.test.ts index e2f5082f..a5d3ec7a 100644 --- a/src/agents/utils.test.ts +++ b/src/agents/utils.test.ts @@ -1,6 +1,7 @@ -import { describe, test, expect } from "bun:test" +import { describe, test, expect, beforeEach } from "bun:test" import { createBuiltinAgents } from "./utils" import type { AgentConfig } from "@opencode-ai/sdk" +import { clearSkillCache } from "../features/opencode-skill-loader/skill-content" const TEST_DEFAULT_MODEL = "anthropic/claude-opus-4-5" @@ -109,6 +110,10 @@ describe("buildAgent with category and skills", () => { const { buildAgent } = require("./utils") const TEST_MODEL = 
"anthropic/claude-opus-4-5" + beforeEach(() => { + clearSkillCache() + }) + test("agent with category inherits category settings", () => { // #given - agent factory that sets category but no model const source = { @@ -308,4 +313,42 @@ describe("buildAgent with category and skills", () => { // #then expect(agent.prompt).toBe("Base prompt") }) + + test("agent with agent-browser skill resolves when browserProvider is set", () => { + // #given + const source = { + "test-agent": () => + ({ + description: "Test agent", + skills: ["agent-browser"], + prompt: "Base prompt", + }) as AgentConfig, + } + + // #when - browserProvider is "agent-browser" + const agent = buildAgent(source["test-agent"], TEST_MODEL, undefined, undefined, "agent-browser") + + // #then - agent-browser skill content should be in prompt + expect(agent.prompt).toContain("agent-browser") + expect(agent.prompt).toContain("Base prompt") + }) + + test("agent with agent-browser skill NOT resolved when browserProvider not set", () => { + // #given + const source = { + "test-agent": () => + ({ + description: "Test agent", + skills: ["agent-browser"], + prompt: "Base prompt", + }) as AgentConfig, + } + + // #when - no browserProvider (defaults to playwright) + const agent = buildAgent(source["test-agent"], TEST_MODEL) + + // #then - agent-browser skill not found, only base prompt remains + expect(agent.prompt).toBe("Base prompt") + expect(agent.prompt).not.toContain("agent-browser open") + }) }) diff --git a/src/agents/utils.ts b/src/agents/utils.ts index f3959e3a..6e86f348 100644 --- a/src/agents/utils.ts +++ b/src/agents/utils.ts @@ -15,6 +15,7 @@ import { DEFAULT_CATEGORIES, CATEGORY_DESCRIPTIONS } from "../tools/delegate-tas import { resolveMultipleSkills } from "../features/opencode-skill-loader/skill-content" import { createBuiltinSkills } from "../features/builtin-skills" import type { LoadedSkill, SkillScope } from "../features/opencode-skill-loader/types" +import type { BrowserAutomationProvider } from 
"../config/schema" type AgentSource = AgentFactory | AgentConfig @@ -50,7 +51,8 @@ export function buildAgent( source: AgentSource, model: string, categories?: CategoriesConfig, - gitMasterConfig?: GitMasterConfig + gitMasterConfig?: GitMasterConfig, + browserProvider?: BrowserAutomationProvider ): AgentConfig { const base = isFactory(source) ? source(model) : source const categoryConfigs: Record = categories @@ -74,7 +76,7 @@ export function buildAgent( } if (agentWithCategory.skills?.length) { - const { resolved } = resolveMultipleSkills(agentWithCategory.skills, { gitMasterConfig }) + const { resolved } = resolveMultipleSkills(agentWithCategory.skills, { gitMasterConfig, browserProvider }) if (resolved.size > 0) { const skillContent = Array.from(resolved.values()).join("\n\n") base.prompt = skillContent + (base.prompt ? "\n\n" + base.prompt : "") @@ -146,7 +148,8 @@ export async function createBuiltinAgents( categories?: CategoriesConfig, gitMasterConfig?: GitMasterConfig, discoveredSkills: LoadedSkill[] = [], - client?: any + client?: any, + browserProvider?: BrowserAutomationProvider ): Promise> { if (!systemDefaultModel) { throw new Error("createBuiltinAgents requires systemDefaultModel") @@ -167,7 +170,7 @@ export async function createBuiltinAgents( description: categories?.[name]?.description ?? CATEGORY_DESCRIPTIONS[name] ?? 
"General tasks", })) - const builtinSkills = createBuiltinSkills() + const builtinSkills = createBuiltinSkills({ browserProvider }) const builtinSkillNames = new Set(builtinSkills.map(s => s.name)) const builtinAvailable: AvailableSkill[] = builtinSkills.map((skill) => ({ @@ -204,7 +207,7 @@ export async function createBuiltinAgents( systemDefaultModel, }) - let config = buildAgent(source, model, mergedCategories, gitMasterConfig) + let config = buildAgent(source, model, mergedCategories, gitMasterConfig, browserProvider) // Apply variant from override or resolved fallback chain if (override?.variant) { diff --git a/src/config/schema.test.ts b/src/config/schema.test.ts index 43e1d4e5..6f72d157 100644 --- a/src/config/schema.test.ts +++ b/src/config/schema.test.ts @@ -1,5 +1,12 @@ import { describe, expect, test } from "bun:test" -import { AgentOverrideConfigSchema, BuiltinCategoryNameSchema, CategoryConfigSchema, OhMyOpenCodeConfigSchema } from "./schema" +import { + AgentOverrideConfigSchema, + BrowserAutomationConfigSchema, + BrowserAutomationProviderSchema, + BuiltinCategoryNameSchema, + CategoryConfigSchema, + OhMyOpenCodeConfigSchema, +} from "./schema" describe("disabled_mcps schema", () => { test("should accept built-in MCP names", () => { @@ -508,3 +515,94 @@ describe("Sisyphus-Junior agent override", () => { } }) }) + +describe("BrowserAutomationProviderSchema", () => { + test("accepts 'playwright' as valid provider", () => { + // #given + const input = "playwright" + + // #when + const result = BrowserAutomationProviderSchema.safeParse(input) + + // #then + expect(result.success).toBe(true) + expect(result.data).toBe("playwright") + }) + + test("accepts 'agent-browser' as valid provider", () => { + // #given + const input = "agent-browser" + + // #when + const result = BrowserAutomationProviderSchema.safeParse(input) + + // #then + expect(result.success).toBe(true) + expect(result.data).toBe("agent-browser") + }) + + test("rejects invalid provider", () => 
{ + // #given + const input = "invalid-provider" + + // #when + const result = BrowserAutomationProviderSchema.safeParse(input) + + // #then + expect(result.success).toBe(false) + }) +}) + +describe("BrowserAutomationConfigSchema", () => { + test("defaults provider to 'playwright' when not specified", () => { + // #given + const input = {} + + // #when + const result = BrowserAutomationConfigSchema.parse(input) + + // #then + expect(result.provider).toBe("playwright") + }) + + test("accepts agent-browser provider", () => { + // #given + const input = { provider: "agent-browser" } + + // #when + const result = BrowserAutomationConfigSchema.parse(input) + + // #then + expect(result.provider).toBe("agent-browser") + }) +}) + +describe("OhMyOpenCodeConfigSchema - browser_automation_engine", () => { + test("accepts browser_automation_engine config", () => { + // #given + const input = { + browser_automation_engine: { + provider: "agent-browser", + }, + } + + // #when + const result = OhMyOpenCodeConfigSchema.safeParse(input) + + // #then + expect(result.success).toBe(true) + expect(result.data?.browser_automation_engine?.provider).toBe("agent-browser") + }) + + test("accepts config without browser_automation_engine", () => { + // #given + const input = {} + + // #when + const result = OhMyOpenCodeConfigSchema.safeParse(input) + + // #then + expect(result.success).toBe(true) + expect(result.data?.browser_automation_engine).toBeUndefined() + }) +}) diff --git a/src/config/schema.ts b/src/config/schema.ts index 126355b7..44d7f2d8 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -30,6 +30,7 @@ export const BuiltinAgentNameSchema = z.enum([ export const BuiltinSkillNameSchema = z.enum([ "playwright", + "agent-browser", "frontend-ui-ux", "git-master", ]) @@ -298,6 +299,17 @@ export const GitMasterConfigSchema = z.object({ include_co_authored_by: z.boolean().default(true), }) +export const BrowserAutomationProviderSchema = z.enum(["playwright", "agent-browser"]) 
+ +export const BrowserAutomationConfigSchema = z.object({ + /** + * Browser automation provider; selects which built-in browser skill ("playwright" or "agent-browser") is loaded. + * - "playwright": Uses Playwright MCP server (@playwright/mcp) - default + * - "agent-browser": Uses Vercel's agent-browser CLI (requires: bun add -g agent-browser) + */ + provider: BrowserAutomationProviderSchema.default("playwright"), +}) + export const OhMyOpenCodeConfigSchema = z.object({ $schema: z.string().optional(), disabled_mcps: z.array(AnyMcpNameSchema).optional(), @@ -317,6 +329,7 @@ export const OhMyOpenCodeConfigSchema = z.object({ background_task: BackgroundTaskConfigSchema.optional(), notification: NotificationConfigSchema.optional(), git_master: GitMasterConfigSchema.optional(), + browser_automation_engine: BrowserAutomationConfigSchema.optional(), }) export type OhMyOpenCodeConfig = z.infer<typeof OhMyOpenCodeConfigSchema> @@ -339,5 +352,7 @@ export type CategoryConfig = z.infer<typeof CategoryConfigSchema> export type CategoriesConfig = z.infer<typeof CategoriesConfigSchema> export type BuiltinCategoryName = z.infer<typeof BuiltinCategoryNameSchema> export type GitMasterConfig = z.infer<typeof GitMasterConfigSchema> +export type BrowserAutomationProvider = z.infer<typeof BrowserAutomationProviderSchema> +export type BrowserAutomationConfig = z.infer<typeof BrowserAutomationConfigSchema> export { AnyMcpNameSchema, type AnyMcpName, McpNameSchema, type McpName } from "../mcp/types" diff --git a/src/features/builtin-skills/agent-browser/SKILL.md b/src/features/builtin-skills/agent-browser/SKILL.md new file mode 100644 index 00000000..508c9ed9 --- /dev/null +++ b/src/features/builtin-skills/agent-browser/SKILL.md @@ -0,0 +1,296 @@ +--- +name: agent-browser +description: Automates browser interactions for web testing, form filling, screenshots, and data extraction. Use when the user needs to navigate websites, interact with web pages, fill forms, take screenshots, test web applications, or extract information from web pages. 
+--- + +# Browser Automation with agent-browser + +## Quick start + +```bash +agent-browser open <url> # Navigate to page +agent-browser snapshot -i # Get interactive elements with refs +agent-browser click @e1 # Click element by ref +agent-browser fill @e2 "text" # Fill input by ref +agent-browser close # Close browser +``` + +## Core workflow + +1. Navigate: `agent-browser open <url>` +2. Snapshot: `agent-browser snapshot -i` (returns elements with refs like `@e1`, `@e2`) +3. Interact using refs from the snapshot +4. Re-snapshot after navigation or significant DOM changes + +## Commands + +### Navigation +```bash +agent-browser open <url> # Navigate to URL +agent-browser back # Go back +agent-browser forward # Go forward +agent-browser reload # Reload page +agent-browser close # Close browser +``` + +### Snapshot (page analysis) +```bash +agent-browser snapshot # Full accessibility tree +agent-browser snapshot -i # Interactive elements only (recommended) +agent-browser snapshot -c # Compact output +agent-browser snapshot -d 3 # Limit depth to 3 +agent-browser snapshot -s "#main" # Scope to CSS selector +``` + +### Interactions (use @refs from snapshot) +```bash +agent-browser click @e1 # Click +agent-browser dblclick @e1 # Double-click +agent-browser focus @e1 # Focus element +agent-browser fill @e2 "text" # Clear and type +agent-browser type @e2 "text" # Type without clearing +agent-browser press Enter # Press key +agent-browser press Control+a # Key combination +agent-browser keydown Shift # Hold key down +agent-browser keyup Shift # Release key +agent-browser hover @e1 # Hover +agent-browser check @e1 # Check checkbox +agent-browser uncheck @e1 # Uncheck checkbox +agent-browser select @e1 "value" # Select dropdown +agent-browser scroll down 500 # Scroll page +agent-browser scrollintoview @e1 # Scroll element into view +agent-browser drag @e1 @e2 # Drag and drop +agent-browser upload @e1 file.pdf # Upload files +``` + +### Get information +```bash +agent-browser get text @e1 # 
Get element text +agent-browser get html @e1 # Get innerHTML +agent-browser get value @e1 # Get input value +agent-browser get attr @e1 href # Get attribute +agent-browser get title # Get page title +agent-browser get url # Get current URL +agent-browser get count ".item" # Count matching elements +agent-browser get box @e1 # Get bounding box +``` + +### Check state +```bash +agent-browser is visible @e1 # Check if visible +agent-browser is enabled @e1 # Check if enabled +agent-browser is checked @e1 # Check if checked +``` + +### Screenshots & PDF +```bash +agent-browser screenshot # Screenshot to stdout +agent-browser screenshot path.png # Save to file +agent-browser screenshot --full # Full page +agent-browser pdf output.pdf # Save as PDF +``` + +### Video recording +```bash +agent-browser record start ./demo.webm # Start recording (uses current URL + state) +agent-browser click @e1 # Perform actions +agent-browser record stop # Stop and save video +agent-browser record restart ./take2.webm # Stop current + start new recording +``` +Recording creates a fresh context but preserves cookies/storage from your session. 
+ +### Wait +```bash +agent-browser wait @e1 # Wait for element +agent-browser wait 2000 # Wait milliseconds +agent-browser wait --text "Success" # Wait for text +agent-browser wait --url "**/dashboard" # Wait for URL pattern +agent-browser wait --load networkidle # Wait for network idle +agent-browser wait --fn "window.ready" # Wait for JS condition +``` + +### Mouse control +```bash +agent-browser mouse move 100 200 # Move mouse +agent-browser mouse down left # Press button +agent-browser mouse up left # Release button +agent-browser mouse wheel 100 # Scroll wheel +``` + +### Semantic locators (alternative to refs) +```bash +agent-browser find role button click --name "Submit" +agent-browser find text "Sign In" click +agent-browser find label "Email" fill "user@test.com" +agent-browser find first ".item" click +agent-browser find nth 2 "a" text +``` + +### Browser settings +```bash +agent-browser set viewport 1920 1080 # Set viewport size +agent-browser set device "iPhone 14" # Emulate device +agent-browser set geo 37.7749 -122.4194 # Set geolocation +agent-browser set offline on # Toggle offline mode +agent-browser set headers '{"X-Key":"v"}' # Extra HTTP headers +agent-browser set credentials user pass # HTTP basic auth +agent-browser set media dark # Emulate color scheme +``` + +### Cookies & Storage +```bash +agent-browser cookies # Get all cookies +agent-browser cookies set name value # Set cookie +agent-browser cookies clear # Clear cookies +agent-browser storage local # Get all localStorage +agent-browser storage local key # Get specific key +agent-browser storage local set k v # Set value +agent-browser storage local clear # Clear all +agent-browser storage session # Get all sessionStorage +agent-browser storage session key # Get specific key +agent-browser storage session set k v # Set value +agent-browser storage session clear # Clear all +``` + +### Network +```bash +agent-browser network route <url> # Intercept requests +agent-browser network route <url> --abort 
# Block requests +agent-browser network route <url> --body '{}' # Mock response +agent-browser network unroute [url] # Remove routes +agent-browser network requests # View tracked requests +agent-browser network requests --filter api # Filter requests +``` + +### Tabs & Windows +```bash +agent-browser tab # List tabs +agent-browser tab new [url] # New tab +agent-browser tab 2 # Switch to tab +agent-browser tab close # Close tab +agent-browser window new # New window +``` + +### Frames +```bash +agent-browser frame "#iframe" # Switch to iframe +agent-browser frame main # Back to main frame +``` + +### Dialogs +```bash +agent-browser dialog accept [text] # Accept dialog +agent-browser dialog dismiss # Dismiss dialog +``` + +### JavaScript +```bash +agent-browser eval "document.title" # Run JavaScript +``` + +## Global Options + +| Option | Description | +|--------|-------------| +| `--session <name>` | Isolated browser session (`AGENT_BROWSER_SESSION` env) | +| `--profile <path>` | Persistent browser profile (`AGENT_BROWSER_PROFILE` env) | +| `--headers <json>` | HTTP headers scoped to URL's origin | +| `--executable-path <path>` | Custom browser binary (`AGENT_BROWSER_EXECUTABLE_PATH` env) | +| `--args <args>` | Browser launch args (`AGENT_BROWSER_ARGS` env) | +| `--user-agent <ua>` | Custom User-Agent (`AGENT_BROWSER_USER_AGENT` env) | +| `--proxy <url>` | Proxy server (`AGENT_BROWSER_PROXY` env) | +| `--proxy-bypass <hosts>` | Hosts to bypass proxy (`AGENT_BROWSER_PROXY_BYPASS` env) | +| `-p, --provider <name>` | Cloud browser provider (`AGENT_BROWSER_PROVIDER` env) | +| `--json` | Machine-readable JSON output | +| `--headed` | Show browser window (not headless) | +| `--cdp <port-or-url>` | Connect via Chrome DevTools Protocol | +| `--debug` | Debug output | + +## Example: Form submission + +```bash +agent-browser open https://example.com/form +agent-browser snapshot -i +# Output shows: textbox "Email" [ref=e1], textbox "Password" [ref=e2], button "Submit" [ref=e3] + +agent-browser fill @e1 "user@example.com" +agent-browser fill @e2 
"password123" +agent-browser click @e3 +agent-browser wait --load networkidle +agent-browser snapshot -i # Check result +``` + +## Example: Authentication with saved state + +```bash +# Login once +agent-browser open https://app.example.com/login +agent-browser snapshot -i +agent-browser fill @e1 "username" +agent-browser fill @e2 "password" +agent-browser click @e3 +agent-browser wait --url "**/dashboard" +agent-browser state save auth.json + +# Later sessions: load saved state +agent-browser state load auth.json +agent-browser open https://app.example.com/dashboard +``` + +### Header-based Auth (Skip login flows) +```bash +# Headers scoped to api.example.com only +agent-browser open api.example.com --headers '{"Authorization": "Bearer <token>"}' +# Navigate to another domain - headers NOT sent (safe) +agent-browser open other-site.com +# Global headers (all domains) +agent-browser set headers '{"X-Custom-Header": "value"}' +``` + +## Sessions & Persistent Profiles + +### Sessions (parallel browsers) +```bash +agent-browser --session test1 open site-a.com +agent-browser --session test2 open site-b.com +agent-browser session list +``` + +### Persistent Profiles +Persists cookies, localStorage, IndexedDB, service workers, cache, login sessions across browser restarts. 
+```bash +agent-browser --profile ~/.myapp-profile open myapp.com +# Or via env var +AGENT_BROWSER_PROFILE=~/.myapp-profile agent-browser open myapp.com +``` +- Use different profile paths for different projects +- Login once → restart browser → still logged in +- Stores: cookies, localStorage, IndexedDB, service workers, browser cache + +## JSON output (for parsing) + +Add `--json` for machine-readable output: +```bash +agent-browser snapshot -i --json +agent-browser get text @e1 --json +``` + +## Debugging + +```bash +agent-browser open example.com --headed # Show browser window +agent-browser console # View console messages +agent-browser errors # View page errors +agent-browser record start ./debug.webm # Record from current page +agent-browser record stop # Save recording +agent-browser connect 9222 # Local CDP port +agent-browser --cdp "wss://browser-service.com/cdp?token=..." snapshot # Remote via WebSocket +agent-browser console --clear # Clear console +agent-browser errors --clear # Clear errors +agent-browser highlight @e1 # Highlight element +agent-browser trace start # Start recording trace +agent-browser trace stop trace.zip # Stop and save trace +``` + +--- +Install: `bun add -g agent-browser && agent-browser install`. Run `agent-browser --help` for all commands. 
Repo: https://github.com/vercel-labs/agent-browser diff --git a/src/features/builtin-skills/index.ts b/src/features/builtin-skills/index.ts index 7ca1facc..7bd296b4 100644 --- a/src/features/builtin-skills/index.ts +++ b/src/features/builtin-skills/index.ts @@ -1,2 +1,2 @@ export * from "./types" -export { createBuiltinSkills } from "./skills" +export { createBuiltinSkills, type CreateBuiltinSkillsOptions } from "./skills" diff --git a/src/features/builtin-skills/skills.test.ts b/src/features/builtin-skills/skills.test.ts new file mode 100644 index 00000000..196df2e3 --- /dev/null +++ b/src/features/builtin-skills/skills.test.ts @@ -0,0 +1,89 @@ +import { describe, test, expect } from "bun:test" +import { createBuiltinSkills } from "./skills" + +describe("createBuiltinSkills", () => { + test("returns playwright skill by default", () => { + // #given - no options (default) + + // #when + const skills = createBuiltinSkills() + + // #then + const browserSkill = skills.find((s) => s.name === "playwright") + expect(browserSkill).toBeDefined() + expect(browserSkill!.description).toContain("browser") + expect(browserSkill!.mcpConfig).toHaveProperty("playwright") + }) + + test("returns playwright skill when browserProvider is 'playwright'", () => { + // #given + const options = { browserProvider: "playwright" as const } + + // #when + const skills = createBuiltinSkills(options) + + // #then + const playwrightSkill = skills.find((s) => s.name === "playwright") + const agentBrowserSkill = skills.find((s) => s.name === "agent-browser") + expect(playwrightSkill).toBeDefined() + expect(agentBrowserSkill).toBeUndefined() + }) + + test("returns agent-browser skill when browserProvider is 'agent-browser'", () => { + // #given + const options = { browserProvider: "agent-browser" as const } + + // #when + const skills = createBuiltinSkills(options) + + // #then + const agentBrowserSkill = skills.find((s) => s.name === "agent-browser") + const playwrightSkill = skills.find((s) => 
s.name === "playwright") + expect(agentBrowserSkill).toBeDefined() + expect(agentBrowserSkill!.description).toContain("browser") + expect(agentBrowserSkill!.allowedTools).toContain("Bash(agent-browser:*)") + expect(agentBrowserSkill!.template).toContain("agent-browser") + expect(playwrightSkill).toBeUndefined() + }) + + test("agent-browser skill template is inlined (not loaded from file)", () => { + // #given + const options = { browserProvider: "agent-browser" as const } + + // #when + const skills = createBuiltinSkills(options) + const agentBrowserSkill = skills.find((s) => s.name === "agent-browser") + + // #then - template should contain substantial content (inlined, not fallback) + expect(agentBrowserSkill!.template).toContain("## Quick start") + expect(agentBrowserSkill!.template).toContain("## Commands") + expect(agentBrowserSkill!.template).toContain("agent-browser open") + expect(agentBrowserSkill!.template).toContain("agent-browser snapshot") + }) + + test("always includes frontend-ui-ux and git-master skills", () => { + // #given - both provider options + + // #when + const defaultSkills = createBuiltinSkills() + const agentBrowserSkills = createBuiltinSkills({ browserProvider: "agent-browser" }) + + // #then + for (const skills of [defaultSkills, agentBrowserSkills]) { + expect(skills.find((s) => s.name === "frontend-ui-ux")).toBeDefined() + expect(skills.find((s) => s.name === "git-master")).toBeDefined() + } + }) + + test("returns exactly 3 skills regardless of provider", () => { + // #given + + // #when + const defaultSkills = createBuiltinSkills() + const agentBrowserSkills = createBuiltinSkills({ browserProvider: "agent-browser" }) + + // #then + expect(defaultSkills).toHaveLength(3) + expect(agentBrowserSkills).toHaveLength(3) + }) +}) diff --git a/src/features/builtin-skills/skills.ts b/src/features/builtin-skills/skills.ts index 66bff8e9..da121885 100644 --- a/src/features/builtin-skills/skills.ts +++ b/src/features/builtin-skills/skills.ts @@ 
-1,4 +1,5 @@ import type { BuiltinSkill } from "./types" +import type { BrowserAutomationProvider } from "../../config/schema" const playwrightSkill: BuiltinSkill = { name: "playwright", @@ -14,6 +15,303 @@ This skill provides browser automation capabilities via the Playwright MCP serve }, } +const agentBrowserSkill: BuiltinSkill = { + name: "agent-browser", + description: "MUST USE for any browser-related tasks. Browser automation via agent-browser CLI - verification, browsing, information gathering, web scraping, testing, screenshots, and all browser interactions.", + template: `# Browser Automation with agent-browser + +## Quick start + +\`\`\`bash +agent-browser open # Navigate to page +agent-browser snapshot -i # Get interactive elements with refs +agent-browser click @e1 # Click element by ref +agent-browser fill @e2 "text" # Fill input by ref +agent-browser close # Close browser +\`\`\` + +## Core workflow + +1. Navigate: \`agent-browser open \` +2. Snapshot: \`agent-browser snapshot -i\` (returns elements with refs like \`@e1\`, \`@e2\`) +3. Interact using refs from the snapshot +4. 
Re-snapshot after navigation or significant DOM changes + +## Commands + +### Navigation +\`\`\`bash +agent-browser open # Navigate to URL +agent-browser back # Go back +agent-browser forward # Go forward +agent-browser reload # Reload page +agent-browser close # Close browser +\`\`\` + +### Snapshot (page analysis) +\`\`\`bash +agent-browser snapshot # Full accessibility tree +agent-browser snapshot -i # Interactive elements only (recommended) +agent-browser snapshot -c # Compact output +agent-browser snapshot -d 3 # Limit depth to 3 +agent-browser snapshot -s "#main" # Scope to CSS selector +\`\`\` + +### Interactions (use @refs from snapshot) +\`\`\`bash +agent-browser click @e1 # Click +agent-browser dblclick @e1 # Double-click +agent-browser focus @e1 # Focus element +agent-browser fill @e2 "text" # Clear and type +agent-browser type @e2 "text" # Type without clearing +agent-browser press Enter # Press key +agent-browser press Control+a # Key combination +agent-browser keydown Shift # Hold key down +agent-browser keyup Shift # Release key +agent-browser hover @e1 # Hover +agent-browser check @e1 # Check checkbox +agent-browser uncheck @e1 # Uncheck checkbox +agent-browser select @e1 "value" # Select dropdown +agent-browser scroll down 500 # Scroll page +agent-browser scrollintoview @e1 # Scroll element into view +agent-browser drag @e1 @e2 # Drag and drop +agent-browser upload @e1 file.pdf # Upload files +\`\`\` + +### Get information +\`\`\`bash +agent-browser get text @e1 # Get element text +agent-browser get html @e1 # Get innerHTML +agent-browser get value @e1 # Get input value +agent-browser get attr @e1 href # Get attribute +agent-browser get title # Get page title +agent-browser get url # Get current URL +agent-browser get count ".item" # Count matching elements +agent-browser get box @e1 # Get bounding box +\`\`\` + +### Check state +\`\`\`bash +agent-browser is visible @e1 # Check if visible +agent-browser is enabled @e1 # Check if enabled 
+agent-browser is checked @e1 # Check if checked +\`\`\` + +### Screenshots & PDF +\`\`\`bash +agent-browser screenshot # Screenshot to stdout +agent-browser screenshot path.png # Save to file +agent-browser screenshot --full # Full page +agent-browser pdf output.pdf # Save as PDF +\`\`\` + +### Video recording +\`\`\`bash +agent-browser record start ./demo.webm # Start recording (uses current URL + state) +agent-browser click @e1 # Perform actions +agent-browser record stop # Stop and save video +agent-browser record restart ./take2.webm # Stop current + start new recording +\`\`\` +Recording creates a fresh context but preserves cookies/storage from your session. + +### Wait +\`\`\`bash +agent-browser wait @e1 # Wait for element +agent-browser wait 2000 # Wait milliseconds +agent-browser wait --text "Success" # Wait for text +agent-browser wait --url "**/dashboard" # Wait for URL pattern +agent-browser wait --load networkidle # Wait for network idle +agent-browser wait --fn "window.ready" # Wait for JS condition +\`\`\` + +### Mouse control +\`\`\`bash +agent-browser mouse move 100 200 # Move mouse +agent-browser mouse down left # Press button +agent-browser mouse up left # Release button +agent-browser mouse wheel 100 # Scroll wheel +\`\`\` + +### Semantic locators (alternative to refs) +\`\`\`bash +agent-browser find role button click --name "Submit" +agent-browser find text "Sign In" click +agent-browser find label "Email" fill "user@test.com" +agent-browser find first ".item" click +agent-browser find nth 2 "a" text +\`\`\` + +### Browser settings +\`\`\`bash +agent-browser set viewport 1920 1080 # Set viewport size +agent-browser set device "iPhone 14" # Emulate device +agent-browser set geo 37.7749 -122.4194 # Set geolocation +agent-browser set offline on # Toggle offline mode +agent-browser set headers '{"X-Key":"v"}' # Extra HTTP headers +agent-browser set credentials user pass # HTTP basic auth +agent-browser set media dark # Emulate color scheme +\`\`\` 
+ +### Cookies & Storage +\`\`\`bash +agent-browser cookies # Get all cookies +agent-browser cookies set name value # Set cookie +agent-browser cookies clear # Clear cookies +agent-browser storage local # Get all localStorage +agent-browser storage local key # Get specific key +agent-browser storage local set k v # Set value +agent-browser storage local clear # Clear all +agent-browser storage session # Get all sessionStorage +agent-browser storage session key # Get specific key +agent-browser storage session set k v # Set value +agent-browser storage session clear # Clear all +\`\`\` + +### Network +\`\`\`bash +agent-browser network route <url> # Intercept requests +agent-browser network route <url> --abort # Block requests +agent-browser network route <url> --body '{}' # Mock response +agent-browser network unroute [url] # Remove routes +agent-browser network requests # View tracked requests +agent-browser network requests --filter api # Filter requests +\`\`\` + +### Tabs & Windows +\`\`\`bash +agent-browser tab # List tabs +agent-browser tab new [url] # New tab +agent-browser tab 2 # Switch to tab +agent-browser tab close # Close tab +agent-browser window new # New window +\`\`\` + +### Frames +\`\`\`bash +agent-browser frame "#iframe" # Switch to iframe +agent-browser frame main # Back to main frame +\`\`\` + +### Dialogs +\`\`\`bash +agent-browser dialog accept [text] # Accept dialog +agent-browser dialog dismiss # Dismiss dialog +\`\`\` + +### JavaScript +\`\`\`bash +agent-browser eval "document.title" # Run JavaScript +\`\`\` + +## Global Options + +| Option | Description | +|--------|-------------| +| \`--session <name>\` | Isolated browser session (\`AGENT_BROWSER_SESSION\` env) | +| \`--profile <path>\` | Persistent browser profile (\`AGENT_BROWSER_PROFILE\` env) | +| \`--headers <json>\` | HTTP headers scoped to URL's origin | +| \`--executable-path <path>\` | Custom browser binary (\`AGENT_BROWSER_EXECUTABLE_PATH\` env) | +| \`--args <args>\` | Browser launch args (\`AGENT_BROWSER_ARGS\` env) | +| 
\`--user-agent <ua>\` | Custom User-Agent (\`AGENT_BROWSER_USER_AGENT\` env) | +| \`--proxy <url>\` | Proxy server (\`AGENT_BROWSER_PROXY\` env) | +| \`--proxy-bypass <hosts>\` | Hosts to bypass proxy (\`AGENT_BROWSER_PROXY_BYPASS\` env) | +| \`-p, --provider <name>\` | Cloud browser provider (\`AGENT_BROWSER_PROVIDER\` env) | +| \`--json\` | Machine-readable JSON output | +| \`--headed\` | Show browser window (not headless) | +| \`--cdp <port>\` | Connect via Chrome DevTools Protocol | +| \`--debug\` | Debug output | + +## Example: Form submission + +\`\`\`bash +agent-browser open https://example.com/form +agent-browser snapshot -i +# Output shows: textbox "Email" [ref=e1], textbox "Password" [ref=e2], button "Submit" [ref=e3] + +agent-browser fill @e1 "user@example.com" +agent-browser fill @e2 "password123" +agent-browser click @e3 +agent-browser wait --load networkidle +agent-browser snapshot -i # Check result +\`\`\` + +## Example: Authentication with saved state + +\`\`\`bash +# Login once +agent-browser open https://app.example.com/login +agent-browser snapshot -i +agent-browser fill @e1 "username" +agent-browser fill @e2 "password" +agent-browser click @e3 +agent-browser wait --url "**/dashboard" +agent-browser state save auth.json + +# Later sessions: load saved state +agent-browser state load auth.json +agent-browser open https://app.example.com/dashboard +\`\`\` + +### Header-based Auth (Skip login flows) +\`\`\`bash +# Headers scoped to api.example.com only +agent-browser open api.example.com --headers '{"Authorization": "Bearer <token>"}' +# Navigate to another domain - headers NOT sent (safe) +agent-browser open other-site.com +# Global headers (all domains) +agent-browser set headers '{"X-Custom-Header": "value"}' +\`\`\` + +## Sessions & Persistent Profiles + +### Sessions (parallel browsers) +\`\`\`bash +agent-browser --session test1 open site-a.com +agent-browser --session test2 open site-b.com +agent-browser session list +\`\`\` + +### Persistent Profiles +Persists cookies, 
localStorage, IndexedDB, service workers, cache, login sessions across browser restarts. +\`\`\`bash +agent-browser --profile ~/.myapp-profile open myapp.com +# Or via env var +AGENT_BROWSER_PROFILE=~/.myapp-profile agent-browser open myapp.com +\`\`\` +- Use different profile paths for different projects +- Login once → restart browser → still logged in +- Stores: cookies, localStorage, IndexedDB, service workers, browser cache + +## JSON output (for parsing) + +Add \`--json\` for machine-readable output: +\`\`\`bash +agent-browser snapshot -i --json +agent-browser get text @e1 --json +\`\`\` + +## Debugging + +\`\`\`bash +agent-browser open example.com --headed # Show browser window +agent-browser console # View console messages +agent-browser errors # View page errors +agent-browser record start ./debug.webm # Record from current page +agent-browser record stop # Save recording +agent-browser connect 9222 # Local CDP port +agent-browser --cdp "wss://browser-service.com/cdp?token=..." snapshot # Remote via WebSocket +agent-browser console --clear # Clear console +agent-browser errors --clear # Clear errors +agent-browser highlight @e1 # Highlight element +agent-browser trace start # Start recording trace +agent-browser trace stop trace.zip # Stop and save trace +\`\`\` + +--- +Install: \`bun add -g agent-browser && agent-browser install\`. Run \`agent-browser --help\` for all commands. 
Repo: https://github.com/vercel-labs/agent-browser`, + allowedTools: ["Bash(agent-browser:*)"], +} + const frontendUiUxSkill: BuiltinSkill = { name: "frontend-ui-ux", description: "Designer-turned-developer who crafts stunning UI/UX even without design mockups", @@ -1198,6 +1496,14 @@ POTENTIAL ACTIONS: - Bisect without proper good/bad boundaries -> Wasted time`, } -export function createBuiltinSkills(): BuiltinSkill[] { - return [playwrightSkill, frontendUiUxSkill, gitMasterSkill] +export interface CreateBuiltinSkillsOptions { + browserProvider?: BrowserAutomationProvider +} + +export function createBuiltinSkills(options: CreateBuiltinSkillsOptions = {}): BuiltinSkill[] { + const { browserProvider = "playwright" } = options + + const browserSkill = browserProvider === "agent-browser" ? agentBrowserSkill : playwrightSkill + + return [browserSkill, frontendUiUxSkill, gitMasterSkill] } diff --git a/src/features/opencode-skill-loader/skill-content.test.ts b/src/features/opencode-skill-loader/skill-content.test.ts index fd8c597d..beca2678 100644 --- a/src/features/opencode-skill-loader/skill-content.test.ts +++ b/src/features/opencode-skill-loader/skill-content.test.ts @@ -265,3 +265,66 @@ describe("resolveMultipleSkillsAsync", () => { expect(result.notFound).toEqual([]) }) }) + +describe("resolveSkillContent with browserProvider", () => { + it("should resolve agent-browser skill when browserProvider is 'agent-browser'", () => { + // #given: browserProvider set to agent-browser + const options = { browserProvider: "agent-browser" as const } + + // #when: resolving content for 'agent-browser' + const result = resolveSkillContent("agent-browser", options) + + // #then: returns agent-browser template + expect(result).not.toBeNull() + expect(result).toContain("agent-browser") + }) + + it("should return null for agent-browser when browserProvider is default", () => { + // #given: no browserProvider (defaults to playwright) + + // #when: resolving content for 'agent-browser' 
+ const result = resolveSkillContent("agent-browser") + + // #then: returns null because agent-browser is not in default builtin skills + expect(result).toBeNull() + }) + + it("should return null for playwright when browserProvider is agent-browser", () => { + // #given: browserProvider set to agent-browser + const options = { browserProvider: "agent-browser" as const } + + // #when: resolving content for 'playwright' + const result = resolveSkillContent("playwright", options) + + // #then: returns null because playwright is replaced by agent-browser + expect(result).toBeNull() + }) +}) + +describe("resolveMultipleSkills with browserProvider", () => { + it("should resolve agent-browser when browserProvider is set", () => { + // #given: agent-browser and git-master requested with browserProvider + const skillNames = ["agent-browser", "git-master"] + const options = { browserProvider: "agent-browser" as const } + + // #when: resolving multiple skills + const result = resolveMultipleSkills(skillNames, options) + + // #then: both resolved + expect(result.resolved.has("agent-browser")).toBe(true) + expect(result.resolved.has("git-master")).toBe(true) + expect(result.notFound).toHaveLength(0) + }) + + it("should not resolve agent-browser without browserProvider option", () => { + // #given: agent-browser requested without browserProvider + const skillNames = ["agent-browser"] + + // #when: resolving multiple skills + const result = resolveMultipleSkills(skillNames) + + // #then: agent-browser not found + expect(result.resolved.has("agent-browser")).toBe(false) + expect(result.notFound).toContain("agent-browser") + }) +}) diff --git a/src/features/opencode-skill-loader/skill-content.ts b/src/features/opencode-skill-loader/skill-content.ts index 18294750..0a4bf81b 100644 --- a/src/features/opencode-skill-loader/skill-content.ts +++ b/src/features/opencode-skill-loader/skill-content.ts @@ -3,24 +3,27 @@ import { discoverSkills } from "./loader" import type { LoadedSkill } 
from "./types" import { parseFrontmatter } from "../../shared/frontmatter" import { readFileSync } from "node:fs" -import type { GitMasterConfig } from "../../config/schema" +import type { GitMasterConfig, BrowserAutomationProvider } from "../../config/schema" export interface SkillResolutionOptions { gitMasterConfig?: GitMasterConfig + browserProvider?: BrowserAutomationProvider } -let cachedSkills: LoadedSkill[] | null = null +const cachedSkillsByProvider = new Map() function clearSkillCache(): void { - cachedSkills = null + cachedSkillsByProvider.clear() } -async function getAllSkills(): Promise { - if (cachedSkills) return cachedSkills +async function getAllSkills(options?: SkillResolutionOptions): Promise { + const cacheKey = options?.browserProvider ?? "playwright" + const cached = cachedSkillsByProvider.get(cacheKey) + if (cached) return cached const [discoveredSkills, builtinSkillDefs] = await Promise.all([ discoverSkills({ includeClaudeCodePaths: true }), - Promise.resolve(createBuiltinSkills()), + Promise.resolve(createBuiltinSkills({ browserProvider: options?.browserProvider })), ]) const builtinSkillsAsLoaded: LoadedSkill[] = builtinSkillDefs.map((skill) => ({ @@ -44,8 +47,9 @@ async function getAllSkills(): Promise { const discoveredNames = new Set(discoveredSkills.map((s) => s.name)) const uniqueBuiltins = builtinSkillsAsLoaded.filter((s) => !discoveredNames.has(s.name)) - cachedSkills = [...discoveredSkills, ...uniqueBuiltins] - return cachedSkills + const allSkills = [...discoveredSkills, ...uniqueBuiltins] + cachedSkillsByProvider.set(cacheKey, allSkills) + return allSkills } async function extractSkillTemplate(skill: LoadedSkill): Promise { @@ -118,7 +122,7 @@ export function injectGitMasterConfig(template: string, config?: GitMasterConfig } export function resolveSkillContent(skillName: string, options?: SkillResolutionOptions): string | null { - const skills = createBuiltinSkills() + const skills = createBuiltinSkills({ browserProvider: 
options?.browserProvider }) const skill = skills.find((s) => s.name === skillName) if (!skill) return null @@ -133,7 +137,7 @@ export function resolveMultipleSkills(skillNames: string[], options?: SkillResol resolved: Map notFound: string[] } { - const skills = createBuiltinSkills() + const skills = createBuiltinSkills({ browserProvider: options?.browserProvider }) const skillMap = new Map(skills.map((s) => [s.name, s.template])) const resolved = new Map() @@ -159,7 +163,7 @@ export async function resolveSkillContentAsync( skillName: string, options?: SkillResolutionOptions ): Promise { - const allSkills = await getAllSkills() + const allSkills = await getAllSkills(options) const skill = allSkills.find((s) => s.name === skillName) if (!skill) return null @@ -179,7 +183,7 @@ export async function resolveMultipleSkillsAsync( resolved: Map notFound: string[] }> { - const allSkills = await getAllSkills() + const allSkills = await getAllSkills(options) const skillMap = new Map() for (const skill of allSkills) { skillMap.set(skill.name, skill) diff --git a/src/index.ts b/src/index.ts index 3210602d..b05876e9 100644 --- a/src/index.ts +++ b/src/index.ts @@ -243,6 +243,7 @@ const OhMyOpenCodePlugin: Plugin = async (ctx) => { "multimodal-looker" ); const lookAt = isMultimodalLookerEnabled ? createLookAt(ctx) : null; + const browserProvider = pluginConfig.browser_automation_engine?.provider ?? "playwright"; const delegateTask = createDelegateTask({ manager: backgroundManager, client: ctx.client, @@ -250,10 +251,11 @@ const OhMyOpenCodePlugin: Plugin = async (ctx) => { userCategories: pluginConfig.categories, gitMasterConfig: pluginConfig.git_master, sisyphusJuniorModel: pluginConfig.agents?.["sisyphus-junior"]?.model, + browserProvider, }); const disabledSkills = new Set(pluginConfig.disabled_skills ?? 
[]); const systemMcpNames = getSystemMcpServerNames(); - const builtinSkills = createBuiltinSkills().filter((skill) => { + const builtinSkills = createBuiltinSkills({ browserProvider }).filter((skill) => { if (disabledSkills.has(skill.name as never)) return false; if (skill.mcpConfig) { for (const mcpName of Object.keys(skill.mcpConfig)) { diff --git a/src/plugin-handlers/config-handler.ts b/src/plugin-handlers/config-handler.ts index f571d05f..54200745 100644 --- a/src/plugin-handlers/config-handler.ts +++ b/src/plugin-handlers/config-handler.ts @@ -165,6 +165,7 @@ export function createConfigHandler(deps: ConfigHandlerDeps) { ...discoveredUserSkills, ]; + const browserProvider = pluginConfig.browser_automation_engine?.provider ?? "playwright"; const builtinAgents = await createBuiltinAgents( migratedDisabledAgents, pluginConfig.agents, @@ -173,7 +174,8 @@ export function createConfigHandler(deps: ConfigHandlerDeps) { pluginConfig.categories, pluginConfig.git_master, allDiscoveredSkills, - ctx.client + ctx.client, + browserProvider ); // Claude Code agents: Do NOT apply permission migration diff --git a/src/tools/delegate-task/tools.test.ts b/src/tools/delegate-task/tools.test.ts index 3e051f85..7f52cf74 100644 --- a/src/tools/delegate-task/tools.test.ts +++ b/src/tools/delegate-task/tools.test.ts @@ -3,14 +3,15 @@ import { DEFAULT_CATEGORIES, CATEGORY_PROMPT_APPENDS, CATEGORY_DESCRIPTIONS } fr import { resolveCategoryConfig } from "./tools" import type { CategoryConfig } from "../../config/schema" import { __resetModelCache } from "../../shared/model-availability" +import { clearSkillCache } from "../../features/opencode-skill-loader/skill-content" // Test constants - systemDefaultModel is required by resolveCategoryConfig const SYSTEM_DEFAULT_MODEL = "anthropic/claude-sonnet-4-5" describe("sisyphus-task", () => { - // Reset model cache before each test to prevent cross-test pollution beforeEach(() => { __resetModelCache() + clearSkillCache() }) 
describe("DEFAULT_CATEGORIES", () => { @@ -1324,6 +1325,112 @@ describe("sisyphus-task", () => { }, { timeout: 20000 }) }) + describe("browserProvider propagation", () => { + test("should resolve agent-browser skill when browserProvider is passed", async () => { + // #given - delegate_task configured with browserProvider: "agent-browser" + const { createDelegateTask } = require("./tools") + let promptBody: any + + const mockManager = { launch: async () => ({}) } + const mockClient = { + app: { agents: async () => ({ data: [] }) }, + config: { get: async () => ({ data: { model: SYSTEM_DEFAULT_MODEL } }) }, + session: { + get: async () => ({ data: { directory: "/project" } }), + create: async () => ({ data: { id: "ses_browser_provider" } }), + prompt: async (input: any) => { + promptBody = input.body + return { data: {} } + }, + messages: async () => ({ + data: [{ info: { role: "assistant" }, parts: [{ type: "text", text: "Done" }] }] + }), + status: async () => ({ data: {} }), + }, + } + + // Pass browserProvider to createDelegateTask + const tool = createDelegateTask({ + manager: mockManager, + client: mockClient, + browserProvider: "agent-browser", + }) + + const toolContext = { + sessionID: "parent-session", + messageID: "parent-message", + agent: "Sisyphus", + abort: new AbortController().signal, + } + + // #when - request agent-browser skill + await tool.execute( + { + description: "Test browserProvider propagation", + prompt: "Do something", + category: "ultrabrain", + run_in_background: false, + load_skills: ["agent-browser"], + }, + toolContext + ) + + // #then - agent-browser skill should be resolved (not in notFound) + expect(promptBody).toBeDefined() + expect(promptBody.system).toBeDefined() + expect(promptBody.system).toContain("agent-browser") + }, { timeout: 20000 }) + + test("should NOT resolve agent-browser skill when browserProvider is not set", async () => { + // #given - delegate_task without browserProvider (defaults to playwright) + const { 
createDelegateTask } = require("./tools") + + const mockManager = { launch: async () => ({}) } + const mockClient = { + app: { agents: async () => ({ data: [] }) }, + config: { get: async () => ({ data: { model: SYSTEM_DEFAULT_MODEL } }) }, + session: { + get: async () => ({ data: { directory: "/project" } }), + create: async () => ({ data: { id: "ses_no_browser_provider" } }), + prompt: async () => ({ data: {} }), + messages: async () => ({ + data: [{ info: { role: "assistant" }, parts: [{ type: "text", text: "Done" }] }] + }), + status: async () => ({ data: {} }), + }, + } + + // No browserProvider passed + const tool = createDelegateTask({ + manager: mockManager, + client: mockClient, + }) + + const toolContext = { + sessionID: "parent-session", + messageID: "parent-message", + agent: "Sisyphus", + abort: new AbortController().signal, + } + + // #when - request agent-browser skill without browserProvider + const result = await tool.execute( + { + description: "Test missing browserProvider", + prompt: "Do something", + category: "ultrabrain", + run_in_background: false, + load_skills: ["agent-browser"], + }, + toolContext + ) + + // #then - should return skill not found error + expect(result).toContain("Skills not found") + expect(result).toContain("agent-browser") + }) + }) + describe("buildSystemContent", () => { test("returns undefined when no skills and no category promptAppend", () => { // #given diff --git a/src/tools/delegate-task/tools.ts b/src/tools/delegate-task/tools.ts index 390d1983..574cd6b9 100644 --- a/src/tools/delegate-task/tools.ts +++ b/src/tools/delegate-task/tools.ts @@ -3,7 +3,7 @@ import { existsSync, readdirSync } from "node:fs" import { join } from "node:path" import type { BackgroundManager } from "../../features/background-agent" import type { DelegateTaskArgs } from "./types" -import type { CategoryConfig, CategoriesConfig, GitMasterConfig } from "../../config/schema" +import type { CategoryConfig, CategoriesConfig, GitMasterConfig, 
BrowserAutomationProvider } from "../../config/schema" import { DEFAULT_CATEGORIES, CATEGORY_PROMPT_APPENDS, CATEGORY_DESCRIPTIONS } from "./constants" import { findNearestMessageWithFields, findFirstMessageWithAgent, MESSAGE_STORAGE } from "../../features/hook-message-injector" import { resolveMultipleSkillsAsync } from "../../features/opencode-skill-loader/skill-content" @@ -157,6 +157,7 @@ export interface DelegateTaskToolOptions { userCategories?: CategoriesConfig gitMasterConfig?: GitMasterConfig sisyphusJuniorModel?: string + browserProvider?: BrowserAutomationProvider } export interface BuildSystemContentInput { @@ -179,7 +180,7 @@ export function buildSystemContent(input: BuildSystemContentInput): string | und } export function createDelegateTask(options: DelegateTaskToolOptions): ToolDefinition { - const { manager, client, directory, userCategories, gitMasterConfig, sisyphusJuniorModel } = options + const { manager, client, directory, userCategories, gitMasterConfig, sisyphusJuniorModel, browserProvider } = options const allCategories = { ...DEFAULT_CATEGORIES, ...userCategories } const categoryNames = Object.keys(allCategories) @@ -239,7 +240,7 @@ Prompts MUST be in English.` let skillContent: string | undefined if (args.load_skills.length > 0) { - const { resolved, notFound } = await resolveMultipleSkillsAsync(args.load_skills, { gitMasterConfig }) + const { resolved, notFound } = await resolveMultipleSkillsAsync(args.load_skills, { gitMasterConfig, browserProvider }) if (notFound.length > 0) { const allSkills = await discoverSkills({ includeClaudeCodePaths: true }) const available = allSkills.map(s => s.name).join(", ")