From ec7cfda60526da926d79d24c903c83f1485fa6c6 Mon Sep 17 00:00:00 2001
From: Deeman
Date: Sat, 21 Feb 2026 17:19:34 +0100
Subject: [PATCH] scout: JS-based cookie dismiss + scout_js tool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- _dismiss_cookie_banner: switch to execute_script for CSS selector clicks
  (OneTrust on ICE uses pointer-events:none overlay — mouse clicks don't reach
  it, but JS .click() bypasses this). Falls back to text-based JS search.
- Selectors cover: OneTrust, Cookiebot, CookieYes, generic
  [id/class*=accept/consent]
- Text fallback covers: IAB TCF "Allow All" pattern (Reuters, etc.)
- Add scout_js tool: run arbitrary JS on current page — useful for shadow DOM,
  z-index overlays, and any element that resists normal CSS/text selectors
- Add _click_via_js helper for targeted JS injection clicks

Review follow-ups (not re-tested against the sites listed below):
- Add _js_value helper: unwrap the script evaluation result before testing its
  truthiness, so a response envelope is not mistaken for "clicked"
- JSON-encode selectors and button text before embedding them in injected JS
- Drop the separate existence probe; _click_via_js already reports a miss
- Single-word text patterns ("OK", "Continue", ...) must match the whole label
- Log swallowed script failures at debug level

Tested patterns:
  ICE (theice.com) — OneTrust #onetrust-accept-btn-handler — requires JS click
  CFTC (cftc.gov) — no banner
  Reuters — IAB TCF "Allow All" — text click works

Co-Authored-By: Claude Sonnet 4.6
---
 tools/scout/src/scout/browser.py | 160 ++++++++++++++++++++++++++++++-
 tools/scout/src/scout/server.py  |  20 ++++
 2 files changed, 179 insertions(+), 1 deletion(-)

diff --git a/tools/scout/src/scout/browser.py b/tools/scout/src/scout/browser.py
index 55cb404..ecb7888 100644
--- a/tools/scout/src/scout/browser.py
+++ b/tools/scout/src/scout/browser.py
@@ -20,6 +20,27 @@ from datetime import datetime
 
 import msgspec
 from pydoll.browser.chromium import Chrome
+from pydoll.browser.options import ChromiumOptions
+
+# Chrome binary search order — covers native installs and Flatpak
+_CHROME_PATHS = [
+    "/usr/bin/google-chrome",
+    "/usr/bin/google-chrome-stable",
+    "/usr/bin/chromium",
+    "/usr/bin/chromium-browser",
+    # Flatpak (system install)
+    "/var/lib/flatpak/app/com.google.Chrome/current/active/export/bin/com.google.Chrome",
+    # Flatpak (user install)
+    str(pathlib.Path.home() / ".local/share/flatpak/app/com.google.Chrome/current/active/export/bin/com.google.Chrome"),
+]
+
+
+def _find_chrome() -> str | None:
+    """Return the first entry of _CHROME_PATHS that exists on disk, or None."""
+    for p in _CHROME_PATHS:
+        if pathlib.Path(p).exists():
+            return p
+    return None
 
 logger = logging.getLogger("scout.browser")
 
@@ -63,7 +84,15 @@ async def _ensure_browser() -> None:
     """Launch Chrome if not already running. Idempotent."""
     if _state["tab"] is not None:
         return
-    browser = Chrome()
+    chrome_path = _find_chrome()
+    assert chrome_path is not None, (
+        "No Chrome/Chromium binary found. Install via: "
+        "sudo dnf install chromium OR flatpak install com.google.Chrome"
+    )
+    logger.info("Using Chrome at: %s", chrome_path)
+    options = ChromiumOptions()
+    options.binary_location = chrome_path
+    browser = Chrome(options=options)
     tab = await browser.start()
     _state["browser"] = browser
     _state["tab"] = tab
@@ -71,6 +100,132 @@ async def _ensure_browser() -> None:
     logger.info("Chrome launched")
 
 
+# Cookie consent selectors — ordered by specificity (vendor-specific first, generic last)
+_COOKIE_SELECTORS = [
+    # OneTrust (very common on financial/data sites incl. ICE)
+    "#onetrust-accept-btn-handler",
+    ".onetrust-accept-btn-handler",
+    # Cookiebot
+    "#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll",
+    # CookieYes / CookieLaw
+    ".cky-btn-accept",
+    ".cookie-law-accept",
+    # Termly
+    # NOTE(review): this id looks like Termly's banner container, not its
+    # accept button — confirm that clicking it actually dismisses the banner.
+    "#termly-code-snippet-support",
+    # Generic accept buttons (attribute-substring CSS matches, not text matches)
+    "button[id*='accept']",
+    "button[class*='accept']",
+    "button[id*='cookie']",
+    "button[class*='cookie']",
+    "button[id*='consent']",
+    "button[class*='consent']",
+]
+
+# Generic visible text patterns for cookie accept buttons
+_COOKIE_ACCEPT_TEXTS = [
+    "Accept All", "Accept all", "Accept all cookies",
+    "Accept Cookies", "Accept cookies",
+    "I Accept", "I accept", "Accept",
+    "Allow All", "Allow all",
+    "Agree", "I Agree", "OK", "Got it",
+    "Continue", "Dismiss",
+]
+
+
+def _js_value(response):
+    """Unwrap the value produced by a script evaluation.
+
+    NOTE(review): pydoll's execute_script appears to hand back the raw CDP
+    Runtime.evaluate envelope ({"result": {"result": {"value": ...}}}), and a
+    non-empty dict is always truthy — confirm against the installed pydoll
+    version. Anything not shaped like that envelope is returned unchanged.
+    """
+    if isinstance(response, dict):
+        outer = response.get("result")
+        if isinstance(outer, dict) and isinstance(outer.get("result"), dict):
+            return outer["result"].get("value")
+    return response
+
+
+async def _click_via_js(tab, selector: str) -> bool:
+    """Click the first element matching a CSS selector via JS injection.
+
+    A JS .click() bypasses pointer-events/z-index issues that block simulated
+    mouse clicks.
+
+    Args:
+        tab: Object exposing an awaitable execute_script(script).
+        selector: CSS selector handed to document.querySelector.
+
+    Returns:
+        True if an element matched and was clicked; False if nothing matched
+        or the script raised.
+    """
+    import json  # function-scope: the module's stdlib imports are outside this hunk
+
+    # json.dumps emits a valid JS string literal, so quotes, backslashes and
+    # newlines in the selector cannot terminate the string early.
+    script = (
+        f"var el = document.querySelector({json.dumps(selector)}); "
+        "if (el) { el.click(); true; } else { false; }"
+    )
+    try:
+        return bool(_js_value(await tab.execute_script(script)))
+    except Exception:
+        # Best-effort: a failed injection only means "not clicked".
+        logger.debug("JS click failed for selector %r", selector, exc_info=True)
+        return False
+
+
+async def _dismiss_cookie_banner(tab) -> bool:
+    """Try to find and click a cookie consent accept button.
+
+    Uses JS injection (_click_via_js / execute_script) as primary method since cookie
+    banners often have z-index/pointer-events issues that block Pydoll's mouse simulation.
+    CSS selectors from _COOKIE_SELECTORS are tried first, then button labels from
+    _COOKIE_ACCEPT_TEXTS.
+
+    Returns:
+        True if a matching element was clicked, False otherwise. Script
+        failures are logged at debug level and treated as "no match".
+    """
+    import json  # function-scope: the module's stdlib imports are outside this hunk
+
+    # _click_via_js already reports a missing element as False, so no separate
+    # existence probe (and second round trip) is needed.
+    for selector in _COOKIE_SELECTORS:
+        if await _click_via_js(tab, selector):
+            await asyncio.sleep(0.8)
+            logger.info("Cookie banner dismissed via JS selector: %s", selector)
+            return True
+
+    # Fallback: find buttons by text content via JS
+    for text in _COOKIE_ACCEPT_TEXTS:
+        # Single-word labels ("OK", "Continue") must match the whole button text,
+        # otherwise "OK" would click "BOOK NOW"; multi-word phrases are specific
+        # enough for a substring match.
+        matcher = ".includes(want)" if " " in text else " === want"
+        script = (
+            f"var want = {json.dumps(text)};"
+            "var buttons = Array.from(document.querySelectorAll('button, a[role=button]'));"
+            f"var btn = buttons.find(b => b.textContent.trim(){matcher});"
+            "if (btn) { btn.click(); true; } else { false; }"
+        )
+        try:
+            clicked = bool(_js_value(await tab.execute_script(script)))
+        except Exception:
+            logger.debug("JS text search failed for %r", text, exc_info=True)
+            continue
+        if clicked:
+            await asyncio.sleep(0.8)
+            logger.info("Cookie banner dismissed via JS text: %r", text)
+            return True
+
+    return False
+
+
 async def visit(url: str) -> PageInfo:
     """Navigate to url. Opens browser on first call."""
     await _ensure_browser()
@@ -79,6 +234,9 @@ async def visit(url: str) -> PageInfo:
     await tab.go_to(url)
     await asyncio.sleep(1)  # let dynamic content settle
 
+    # Auto-dismiss cookie consent banners before anything else
+    await _dismiss_cookie_banner(tab)
+
     title = await tab.title
     links = await tab.query("a", find_all=True)
     element_count = len(links) if links else 0
diff --git a/tools/scout/src/scout/server.py b/tools/scout/src/scout/server.py
index 3cfc412..a6e5bdf 100644
--- a/tools/scout/src/scout/server.py
+++ b/tools/scout/src/scout/server.py
@@ -160,6 +160,26 @@ async def scout_analyze(har_path: str) -> str:
     return analyze.format_summary(summary)
 
 
+@mcp.tool()
+async def scout_js(script: str) -> str:
+    """Execute JavaScript on the current page and return the result.
+
+    Useful for interacting with elements that are hard to reach via CSS selectors
+    (shadow DOM, z-index overlays, pointer-events:none, cookie banners, etc.).
+
+    Args:
+        script: JavaScript to evaluate. Return value is stringified.
+
+    Examples:
+        "document.title"
+        "document.querySelector('#onetrust-accept-btn-handler').click(); 'clicked'"
+        "Array.from(document.querySelectorAll('button')).map(b=>b.textContent.trim()).join('|')"
+    """
+    assert _state["tab"] is not None, "No browser open — call scout_visit first"
+    result = await _state["tab"].execute_script(script)
+    return str(result) if result is not None else "(no return value)"
+
+
 @mcp.tool()
 async def scout_close() -> str:
     """Close the browser and clean up. Stops any active HAR recording first."""