scout: JS-based cookie dismiss + scout_js tool

- _dismiss_cookie_banner: switch to execute_script for CSS selector clicks
  (OneTrust on ICE uses pointer-events:none overlay — mouse clicks don't reach it,
  but JS .click() bypasses this). Falls back to text-based JS search.
- Selectors cover: OneTrust, Cookiebot, CookieYes, Termly, generic button[id/class*=accept/cookie/consent]
- Text fallback covers: IAB TCF "Allow All" pattern (Reuters, etc.)
- Add scout_js tool: run arbitrary JS on current page — useful for shadow DOM,
  z-index overlays, and any element that resists normal CSS/text selectors
- Add _click_via_js helper for targeted JS injection clicks

Tested patterns:
  ICE (theice.com) — OneTrust #onetrust-accept-btn-handler — requires JS click
  CFTC (cftc.gov) — no banner
  Reuters — IAB TCF "Allow All" — text click works

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-21 17:19:34 +01:00
parent 3d3f375e01
commit ec7cfda605
2 changed files with 139 additions and 1 deletions

View File

@@ -20,6 +20,26 @@ from datetime import datetime
import msgspec
from pydoll.browser.chromium import Chrome
from pydoll.browser.options import ChromiumOptions
# Candidate Chrome/Chromium executables, probed in list order: distro package
# locations first, then the system-wide and per-user Flatpak exports.
_CHROME_PATHS = [
    # Native package installs
    "/usr/bin/google-chrome",
    "/usr/bin/google-chrome-stable",
    "/usr/bin/chromium",
    "/usr/bin/chromium-browser",
    # Flatpak export, system-wide installation
    "/var/lib/flatpak/app/com.google.Chrome/current/active/export/bin/com.google.Chrome",
    # Flatpak export, per-user installation (resolved against the home directory)
    str(
        pathlib.Path.home().joinpath(
            ".local/share/flatpak/app/com.google.Chrome/current/active/export/bin/com.google.Chrome"
        )
    ),
]
def _find_chrome() -> str | None:
    """Return the first entry of ``_CHROME_PATHS`` that exists on disk.

    Candidates are checked in list order; ``None`` is returned when none of
    them exists.
    """
    existing = (
        candidate for candidate in _CHROME_PATHS if pathlib.Path(candidate).exists()
    )
    return next(existing, None)
# Module-level logger; records are emitted under the "scout.browser" name.
logger = logging.getLogger("scout.browser")
@@ -63,7 +83,15 @@ async def _ensure_browser() -> None:
"""Launch Chrome if not already running. Idempotent."""
if _state["tab"] is not None:
return
browser = Chrome()
chrome_path = _find_chrome()
assert chrome_path is not None, (
"No Chrome/Chromium binary found. Install via: "
"sudo dnf install chromium OR flatpak install com.google.Chrome"
)
logger.info("Using Chrome at: %s", chrome_path)
options = ChromiumOptions()
options.binary_location = chrome_path
browser = Chrome(options=options)
tab = await browser.start()
_state["browser"] = browser
_state["tab"] = tab
@@ -71,6 +99,93 @@ async def _ensure_browser() -> None:
logger.info("Chrome launched")
# Cookie consent selectors — ordered by specificity (vendor-specific first,
# generic last). Every entry must target the *accept button itself*: the first
# selector that matches is clicked and the search stops, so a selector that
# matches a mere container would report success without granting consent.
_COOKIE_SELECTORS = [
    # OneTrust (very common on financial/data sites incl. ICE)
    "#onetrust-accept-btn-handler",
    ".onetrust-accept-btn-handler",
    # Cookiebot
    "#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll",
    # CookieYes / CookieLaw
    ".cky-btn-accept",
    ".cookie-law-accept",
    # Termly — "#termly-code-snippet-support" is only the banner container;
    # the accept button inside it carries data-tid="banner-accept".
    # NOTE(review): verify the data-tid value against a live Termly banner.
    "#termly-code-snippet-support [data-tid=banner-accept]",
    # Generic buttons matched by id/class substring (CSS attribute selectors,
    # not visible text). These are deliberately last: "cookie"/"consent" can
    # also match settings or reject buttons on some sites.
    "button[id*='accept']",
    "button[class*='accept']",
    "button[id*='cookie']",
    "button[class*='cookie']",
    "button[id*='consent']",
    "button[class*='consent']",
]
# Visible button labels that usually mean "accept cookies". The list is tried
# in order, so the more specific labels come before the short generic ones.
_COOKIE_ACCEPT_TEXTS = [
    # "Accept ..." variants
    "Accept All",
    "Accept all",
    "Accept all cookies",
    "Accept Cookies",
    "Accept cookies",
    "I Accept",
    "I accept",
    "Accept",
    # "Allow ..." variants
    "Allow All",
    "Allow all",
    # Short acknowledgements
    "Agree",
    "I Agree",
    "OK",
    "Got it",
    "Continue",
    "Dismiss",
]
async def _click_via_js(tab, selector: str) -> bool:
    """Click the first element matching *selector* by calling ``.click()`` in page JS.

    A JS-dispatched click bypasses pointer-events/z-index overlays that can
    stop simulated mouse clicks from reaching the element.

    Args:
        tab: Object exposing an awaitable ``execute_script(script)`` method.
        selector: CSS selector passed to ``document.querySelector``.

    Returns:
        True only if the script reported that an element was found and
        clicked. False if nothing matched, the script evaluated to a falsy
        value, or ``execute_script`` raised.
    """
    import json  # function-scope import keeps this helper self-contained

    # json.dumps yields a valid JS string literal: quotes, backslashes and
    # control characters are all escaped, not just single quotes.
    script = (
        f"var el = document.querySelector({json.dumps(selector)}); "
        "if (el) { el.click(); true; } else { false; }"
    )
    try:
        outcome = await tab.execute_script(script)
    except Exception:
        # Best-effort helper: any evaluation failure simply means "not clicked".
        return False

    # execute_script may hand back the raw CDP response, i.e. a dict shaped
    # like {"result": {"result": {"type": "boolean", "value": ...}}}. A
    # non-empty dict is always truthy, so unwrap to the actual JS value first.
    while isinstance(outcome, dict) and isinstance(outcome.get("result"), dict):
        outcome = outcome["result"]
    if isinstance(outcome, dict):
        return bool(outcome.get("value"))
    return bool(outcome)
def _js_result_truthy(result) -> bool:
    """Return True when an ``execute_script`` result carries a truthy JS value.

    The result may be the raw CDP response, a dict shaped like
    ``{"result": {"result": {"type": ..., "value": ...}}}``. Such a dict is
    always truthy in Python, so the nested ``value`` is extracted first.
    Non-dict results are judged by their own truthiness.
    """
    payload = result
    while isinstance(payload, dict) and isinstance(payload.get("result"), dict):
        payload = payload["result"]
    if isinstance(payload, dict):
        return bool(payload.get("value"))
    return bool(payload)


async def _dismiss_cookie_banner(tab) -> bool:
    """Try to find and click a cookie consent accept button.

    Uses JS injection (``execute_script``) as the primary method since cookie
    banners often have z-index/pointer-events issues that block simulated
    mouse clicks.

    The CSS selectors in ``_COOKIE_SELECTORS`` are tried first, in order; if
    none produces a click, buttons are searched by visible text using
    ``_COOKIE_ACCEPT_TEXTS``. A failure on one candidate is logged at debug
    level and the next candidate is tried.

    Args:
        tab: Object exposing an awaitable ``execute_script(script)`` method.

    Returns:
        True if some accept button was clicked, False otherwise.
    """
    import json  # function-scope import; used to build safe JS string literals

    # Pass 1: known CSS selectors, clicked through JS.
    for selector in _COOKIE_SELECTORS:
        try:
            # Probe first so a selector that matches nothing is skipped
            # without attempting a click.
            probe = await tab.execute_script(
                f"!!document.querySelector({json.dumps(selector)})"
            )
            clicked = _js_result_truthy(probe) and await _click_via_js(tab, selector)
        except Exception:
            logger.debug("Cookie selector %s failed", selector, exc_info=True)
            continue
        if clicked:
            await asyncio.sleep(0.8)  # give the banner time to close
            logger.info("Cookie banner dismissed via JS selector: %s", selector)
            return True

    # Pass 2: fall back to matching buttons by their visible text.
    for text in _COOKIE_ACCEPT_TEXTS:
        script = (
            "var buttons = Array.from(document.querySelectorAll('button, a[role=button]'));"
            f"var btn = buttons.find(b => b.textContent.trim().includes({json.dumps(text)}));"
            "if (btn) { btn.click(); true; } else { false; }"
        )
        try:
            clicked = _js_result_truthy(await tab.execute_script(script))
        except Exception:
            logger.debug("Cookie text search %r failed", text, exc_info=True)
            continue
        if clicked:
            await asyncio.sleep(0.8)  # give the banner time to close
            logger.info("Cookie banner dismissed via JS text: %r", text)
            return True

    return False
async def visit(url: str) -> PageInfo:
"""Navigate to url. Opens browser on first call."""
await _ensure_browser()
@@ -79,6 +194,9 @@ async def visit(url: str) -> PageInfo:
await tab.go_to(url)
await asyncio.sleep(1) # let dynamic content settle
# Auto-dismiss cookie consent banners before anything else
await _dismiss_cookie_banner(tab)
title = await tab.title
links = await tab.query("a", find_all=True)
element_count = len(links) if links else 0

View File

@@ -160,6 +160,26 @@ async def scout_analyze(har_path: str) -> str:
return analyze.format_summary(summary)
def _format_js_result(result) -> str:
    """Render an ``execute_script`` result as text for the tool response.

    Handles the raw CDP ``Runtime.evaluate`` response shape
    (``{"result": {"result": {...RemoteObject...}, "exceptionDetails": ...}}``)
    by extracting the JS value or error message. Anything that does not look
    like that envelope is stringified unchanged.
    """
    if result is None:
        return "(no return value)"
    if not isinstance(result, dict):
        return str(result)

    envelope = result.get("result")
    if not isinstance(envelope, dict):
        return str(result)

    # A thrown JS exception is reported alongside the result object.
    details = envelope.get("exceptionDetails")
    if isinstance(details, dict):
        thrown = details.get("exception")
        message = thrown.get("description") if isinstance(thrown, dict) else None
        return f"JavaScript error: {message or details.get('text') or details}"

    remote = envelope.get("result")
    if not isinstance(remote, dict) or "type" not in remote:
        return str(result)
    if remote.get("type") == "undefined":
        return "(no return value)"
    if "value" in remote:
        value = remote["value"]
        return str(value) if value is not None else "(no return value)"
    # Values not returned by value (DOM nodes, functions, NaN, ...) only carry
    # a textual description.
    return str(
        remote.get("unserializableValue") or remote.get("description") or remote
    )


@mcp.tool()
async def scout_js(script: str) -> str:
    """Execute JavaScript on the current page and return the result.

    Useful for interacting with elements that are hard to reach via CSS selectors
    (shadow DOM, z-index overlays, pointer-events:none, cookie banners, etc.).

    Args:
        script: JavaScript to evaluate. Return value is stringified.

    Returns:
        The stringified JavaScript value, "(no return value)" when the script
        produced nothing, or "JavaScript error: ..." when the script threw.

    Examples:
        "document.title"
        "document.querySelector('#onetrust-accept-btn-handler').click(); 'clicked'"
        "Array.from(document.querySelectorAll('button')).map(b=>b.textContent.trim()).join('|')"
    """
    tab = _state["tab"]
    if tab is None:
        # Raised explicitly (not via `assert`) so the check survives `python -O`;
        # the exception type and message are the same as before.
        raise AssertionError("No browser open — call scout_visit first")
    result = await tab.execute_script(script)
    return _format_js_result(result)
@mcp.tool()
async def scout_close() -> str:
"""Close the browser and clean up. Stops any active HAR recording first."""