scout: JS-based cookie dismiss + scout_js tool
- _dismiss_cookie_banner: switch to execute_script for CSS selector clicks
  (OneTrust on ICE uses a pointer-events:none overlay — mouse clicks don't
  reach it, but JS .click() bypasses this). Falls back to a text-based JS search.
- Selectors cover: OneTrust, Cookiebot, CookieYes, generic [id/class*=accept/consent]
- Text fallback covers: IAB TCF "Allow All" pattern (Reuters, etc.)
- Add scout_js tool: run arbitrary JS on the current page — useful for shadow DOM,
  z-index overlays, and any element that resists normal CSS/text selectors
- Add _click_via_js helper for targeted JS injection clicks

Tested patterns:
- ICE (theice.com) — OneTrust #onetrust-accept-btn-handler — requires JS click
- CFTC (cftc.gov) — no banner
- Reuters — IAB TCF "Allow All" — text click works

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -20,6 +20,26 @@ from datetime import datetime
|
|||||||
|
|
||||||
import msgspec
|
import msgspec
|
||||||
from pydoll.browser.chromium import Chrome
|
from pydoll.browser.chromium import Chrome
|
||||||
|
from pydoll.browser.options import ChromiumOptions
|
||||||
|
|
||||||
|
# Candidate Chrome/Chromium executables, probed in list order by _find_chrome:
# distro-packaged binaries first, then the Flatpak export wrappers.
_CHROME_PATHS = [
    "/usr/bin/google-chrome",
    "/usr/bin/google-chrome-stable",
    "/usr/bin/chromium",
    "/usr/bin/chromium-browser",
    # Flatpak export, system-wide installation
    "/var/lib/flatpak/app/com.google.Chrome/current/active/export/bin/com.google.Chrome",
    # Flatpak export, per-user installation (resolved against the current home directory)
    str(
        pathlib.Path.home().joinpath(
            ".local/share/flatpak/app/com.google.Chrome/current/active/export/bin/com.google.Chrome"
        )
    ),
]
|
||||||
|
|
||||||
|
|
||||||
|
def _find_chrome() -> str | None:
|
||||||
|
for p in _CHROME_PATHS:
|
||||||
|
if pathlib.Path(p).exists():
|
||||||
|
return p
|
||||||
|
return None
|
||||||
|
|
||||||
logger = logging.getLogger("scout.browser")
|
logger = logging.getLogger("scout.browser")
|
||||||
|
|
||||||
@@ -63,7 +83,15 @@ async def _ensure_browser() -> None:
|
|||||||
"""Launch Chrome if not already running. Idempotent."""
|
"""Launch Chrome if not already running. Idempotent."""
|
||||||
if _state["tab"] is not None:
|
if _state["tab"] is not None:
|
||||||
return
|
return
|
||||||
browser = Chrome()
|
chrome_path = _find_chrome()
|
||||||
|
assert chrome_path is not None, (
|
||||||
|
"No Chrome/Chromium binary found. Install via: "
|
||||||
|
"sudo dnf install chromium OR flatpak install com.google.Chrome"
|
||||||
|
)
|
||||||
|
logger.info("Using Chrome at: %s", chrome_path)
|
||||||
|
options = ChromiumOptions()
|
||||||
|
options.binary_location = chrome_path
|
||||||
|
browser = Chrome(options=options)
|
||||||
tab = await browser.start()
|
tab = await browser.start()
|
||||||
_state["browser"] = browser
|
_state["browser"] = browser
|
||||||
_state["tab"] = tab
|
_state["tab"] = tab
|
||||||
@@ -71,6 +99,93 @@ async def _ensure_browser() -> None:
|
|||||||
logger.info("Chrome launched")
|
logger.info("Chrome launched")
|
||||||
|
|
||||||
|
|
||||||
|
# Cookie consent selectors — ordered by specificity (vendor-specific first, generic last).
# _dismiss_cookie_banner walks this list in order and stops at the first selector
# it manages to click, so earlier entries take priority.
_COOKIE_SELECTORS = [
    # OneTrust (very common on financial/data sites incl. ICE)
    "#onetrust-accept-btn-handler",
    ".onetrust-accept-btn-handler",
    # Cookiebot
    "#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll",
    # CookieYes / CookieLaw
    ".cky-btn-accept",
    ".cookie-law-accept",
    # Termly
    # NOTE(review): this id reads like a container element rather than an accept
    # button — confirm that clicking it actually dismisses the banner.
    "#termly-code-snippet-support",
    # Generic accept buttons, matched by id/class substring (CSS attribute
    # selectors; visible-text matching is handled separately via _COOKIE_ACCEPT_TEXTS)
    # NOTE(review): the *cookie* / *consent* substrings can presumably also match
    # "reject" or "settings" buttons — verify on real pages.
    "button[id*='accept']",
    "button[class*='accept']",
    "button[id*='cookie']",
    "button[class*='cookie']",
    "button[id*='consent']",
    "button[class*='consent']",
]
|
||||||
|
|
||||||
|
# Generic visible text patterns for cookie accept buttons.
# Used only after every CSS selector has failed. _dismiss_cookie_banner tries the
# entries in list order and matches each one as a case-sensitive substring of a
# button's trimmed text, which is why both capitalisations are listed.
# NOTE(review): short entries such as "OK", "Accept" or "Continue" are substring
# matches and may hit unrelated buttons — confirm this is acceptable.
_COOKIE_ACCEPT_TEXTS = [
    "Accept All", "Accept all", "Accept all cookies",
    "Accept Cookies", "Accept cookies",
    "I Accept", "I accept", "Accept",
    "Allow All", "Allow all",
    "Agree", "I Agree", "OK", "Got it",
    "Continue", "Dismiss",
]
|
||||||
|
|
||||||
|
|
||||||
|
async def _click_via_js(tab, selector: str) -> bool:
    """Click the first element matching ``selector`` via injected JavaScript.

    A scripted ``el.click()`` is used instead of simulated mouse input so that
    overlays (``pointer-events: none``, z-index stacking) cannot intercept it.

    Args:
        tab: Browser tab object exposing an awaitable ``execute_script(str)``.
        selector: CSS selector passed to ``document.querySelector``.

    Returns:
        True if an element was found and ``click()`` was invoked on it; False if
        nothing matched or the script could not be executed. Never raises for
        script failures (best-effort helper).
    """
    import json  # local import: only needed to build a safe JS string literal

    # json.dumps emits a valid JS string literal, escaping quotes, backslashes
    # and control characters (the previous manual escape handled only "'").
    script = (
        f"var el = document.querySelector({json.dumps(selector)}); "
        "if (el) { el.click(); true; } else { false; }"
    )
    try:
        response = await tab.execute_script(script)
    except Exception:
        # Best effort: a failed injection counts as "not clicked", but leave a trace.
        logging.getLogger("scout.browser").debug(
            "JS click failed for selector %r", selector, exc_info=True
        )
        return False

    # execute_script may hand back the raw DevTools response envelope
    # ({"result": {"result": {"type": ..., "value": ...}}}). A non-empty dict is
    # always truthy, so unwrap down to the actual JS value before testing it.
    value = response
    while isinstance(value, dict) and "result" in value:
        value = value["result"]
    if isinstance(value, dict):
        value = value.get("value")
    return bool(value)
|
||||||
|
|
||||||
|
|
||||||
|
async def _dismiss_cookie_banner(tab) -> bool:
    """Try to find and click a cookie consent accept button.

    Clicks are performed through JS injection (``_click_via_js`` /
    ``execute_script``) because cookie banners often have z-index or
    pointer-events issues that block simulated mouse input.

    Strategy, stopping at the first success:
      1. Each CSS selector in ``_COOKIE_SELECTORS``, in order.
      2. Each label in ``_COOKIE_ACCEPT_TEXTS``, in order, matched as a
         case-sensitive substring of the trimmed text of ``button`` and
         ``a[role=button]`` elements.

    Args:
        tab: Browser tab object exposing an awaitable ``execute_script(str)``.

    Returns:
        True if a button was clicked (after a 0.8 s pause to let the banner
        close), False if nothing matched. Script errors are logged at debug
        level and treated as "no match"; they are never raised.
    """
    import json  # local import: builds safe JS string literals

    def _js_value(response):
        """Unwrap a DevTools response envelope down to the JS value it carries."""
        # A raw envelope ({"result": {"result": {"value": ...}}}) is a non-empty
        # dict and therefore always truthy; testing it directly would report
        # every probe as a hit.
        value = response
        while isinstance(value, dict) and "result" in value:
            value = value["result"]
        if isinstance(value, dict):
            value = value.get("value")
        return value

    # Pass 1: CSS selectors, clicked via JS (bypasses visibility/pointer-events issues).
    for selector in _COOKIE_SELECTORS:
        try:
            # Probe first so the click helper is only invoked for elements in the DOM.
            exists = _js_value(
                await tab.execute_script(
                    f"!!document.querySelector({json.dumps(selector)})"
                )
            )
            if not exists:
                continue
            if await _click_via_js(tab, selector):
                await asyncio.sleep(0.8)  # give the banner time to close
                logger.info("Cookie banner dismissed via JS selector: %s", selector)
                return True
        except Exception:
            logger.debug("Cookie selector %r failed", selector, exc_info=True)
            continue

    # Pass 2: fall back to matching buttons by their visible text.
    for text in _COOKIE_ACCEPT_TEXTS:
        try:
            result = _js_value(
                await tab.execute_script(
                    "var buttons = Array.from(document.querySelectorAll('button, a[role=button]'));"
                    f"var btn = buttons.find(b => b.textContent.trim().includes({json.dumps(text)}));"
                    "if (btn) { btn.click(); true; } else { false; }"
                )
            )
            if result:
                await asyncio.sleep(0.8)  # give the banner time to close
                logger.info("Cookie banner dismissed via JS text: %r", text)
                return True
        except Exception:
            logger.debug("Cookie text match %r failed", text, exc_info=True)
            continue

    return False
|
||||||
|
|
||||||
|
|
||||||
async def visit(url: str) -> PageInfo:
|
async def visit(url: str) -> PageInfo:
|
||||||
"""Navigate to url. Opens browser on first call."""
|
"""Navigate to url. Opens browser on first call."""
|
||||||
await _ensure_browser()
|
await _ensure_browser()
|
||||||
@@ -79,6 +194,9 @@ async def visit(url: str) -> PageInfo:
|
|||||||
await tab.go_to(url)
|
await tab.go_to(url)
|
||||||
await asyncio.sleep(1) # let dynamic content settle
|
await asyncio.sleep(1) # let dynamic content settle
|
||||||
|
|
||||||
|
# Auto-dismiss cookie consent banners before anything else
|
||||||
|
await _dismiss_cookie_banner(tab)
|
||||||
|
|
||||||
title = await tab.title
|
title = await tab.title
|
||||||
links = await tab.query("a", find_all=True)
|
links = await tab.query("a", find_all=True)
|
||||||
element_count = len(links) if links else 0
|
element_count = len(links) if links else 0
|
||||||
|
|||||||
@@ -160,6 +160,26 @@ async def scout_analyze(har_path: str) -> str:
|
|||||||
return analyze.format_summary(summary)
|
return analyze.format_summary(summary)
|
||||||
|
|
||||||
|
|
||||||
|
@mcp.tool()
async def scout_js(script: str) -> str:
    """Execute JavaScript on the current page and return the result.

    Useful for interacting with elements that are hard to reach via CSS selectors
    (shadow DOM, z-index overlays, pointer-events:none, cookie banners, etc.).

    Args:
        script: JavaScript to evaluate. Return value is stringified.

    Returns:
        The stringified JS result, or "(no return value)" when the script
        produced nothing.

    Raises:
        AssertionError: If no browser tab is open yet.

    Examples:
        "document.title"
        "document.querySelector('#onetrust-accept-btn-handler').click(); 'clicked'"
        "Array.from(document.querySelectorAll('button')).map(b=>b.textContent.trim()).join('|')"
    """
    tab = _state["tab"]
    if tab is None:
        # Explicit raise instead of `assert`, so the guard is not stripped under
        # `python -O`; the exception type is kept for existing callers.
        raise AssertionError("No browser open — call scout_visit first")

    result = await tab.execute_script(script)

    # execute_script may return the raw DevTools response envelope
    # ({"result": {"result": {"type": ..., "value": ...}}}). Unwrap it so the
    # caller gets the JS value rather than the stringified envelope; results
    # that are not envelopes pass through unchanged.
    # NOTE(review): envelope shape assumed from the DevTools Runtime.evaluate
    # response — confirm against the pydoll version in use.
    value = result
    while isinstance(value, dict) and "result" in value:
        value = value["result"]
    if isinstance(value, dict) and ("type" in value or "value" in value):
        # Objects and errors that are not returned by value carry only a description.
        value = value.get("value", value.get("description"))

    return str(value) if value is not None else "(no return value)"
|
||||||
|
|
||||||
|
|
||||||
@mcp.tool()
|
@mcp.tool()
|
||||||
async def scout_close() -> str:
|
async def scout_close() -> str:
|
||||||
"""Close the browser and clean up. Stops any active HAR recording first."""
|
"""Close the browser and clean up. Stops any active HAR recording first."""
|
||||||
|
|||||||
Reference in New Issue
Block a user