Playwright 浏览器自动化
Playwright 是微软开发的现代浏览器自动化库,支持 Chromium、Firefox 和 WebKit 三大浏览器引擎。相比 Selenium,Playwright 更快、更可靠,API 更现代化,是新一代浏览器自动化的首选工具。
官方文档
本教程内容基于 Playwright Python 官方文档。
当前版本: v1.49.x | Python 要求: 3.9+ | 系统要求: Windows 11+, macOS 14+, Debian 12/Ubuntu 22.04+
为什么选择 Playwright?
Playwright vs Selenium
| 特性 | Playwright | Selenium |
|---|---|---|
| 架构 | 直接使用浏览器协议 | WebDriver 协议 |
| 速度 | 更快 | 较慢 |
| 等待机制 | 自动智能等待 | 需要手动配置 |
| 多浏览器 | Chromium、Firefox、WebKit | 各浏览器独立驱动 |
| 选择器 | CSS、XPath、Text、React 等 | CSS、XPath |
| 并行执行 | 内置支持 | 需要额外框架 |
| 网络拦截 | 内置支持 | 需要额外配置 |
| 录制功能 | 内置 Codegen | 无 |
| 调试工具 | Trace Viewer | 无 |
Playwright 的优势
- 自动等待:元素操作前自动等待元素可操作,减少显式等待
- 多浏览器支持:一套代码支持 Chromium、Firefox 和 WebKit
- 强大的选择器:支持文本选择器、React/Vue 组件选择器
- 网络拦截:可以拦截、修改、模拟网络请求
- 并发执行:内置并行测试支持
- 录制回放:Codegen 工具可录制操作生成代码
- 调试工具:Trace Viewer 提供完整执行回放
安装配置
安装 Playwright
# 安装 Playwright Python 包
pip install playwright
# 安装浏览器(首次使用必须执行)
playwright install
# 只安装特定浏览器
playwright install chromium
playwright install firefox
playwright install webkit
验证安装
# 查看已安装的 Playwright 版本
playwright --version
# 验证 Python 包能否正常导入
python -c "from playwright.sync_api import sync_playwright; print('安装成功')"
快速开始
第一个示例
from playwright.sync_api import sync_playwright
def main():
    """Open example.com in a visible Chromium window, print its title and save a screenshot."""
    with sync_playwright() as playwright:
        # headless=False keeps the browser window visible while the script runs.
        chromium = playwright.chromium.launch(headless=False)
        tab = chromium.new_page()
        tab.goto('https://example.com')

        title = tab.title()
        print(f'页面标题: {title}')

        # Save a screenshot of the page to disk.
        tab.screenshot(path='screenshot.png')
        chromium.close()


if __name__ == '__main__':
    main()
使用不同浏览器
from playwright.sync_api import sync_playwright
def demo_browsers():
    """Load example.com in Chromium, Firefox and WebKit in turn and print each page title."""
    with sync_playwright() as p:
        # (label used in the output, browser type) pairs, visited in this order.
        engines = (
            ('Chromium', p.chromium),
            ('Firefox', p.firefox),
            ('WebKit', p.webkit),  # WebKit is the Safari engine
        )
        for label, engine in engines:
            browser = engine.launch(headless=True)
            page = browser.new_page()
            page.goto('https://example.com')
            print(f'{label}: {page.title()}')
            browser.close()
异步 API
Playwright 同时提供同步和异步 API。异步 API 基于 asyncio,更适合需要并发处理多个页面或任务的场景:
import asyncio
from playwright.async_api import async_playwright
async def main():
    """Fetch example.com with headless Chromium through the async API and print the title."""
    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch(headless=True)
        tab = await chromium.new_page()
        await tab.goto('https://example.com')

        title = await tab.title()
        print(f'页面标题: {title}')

        await chromium.close()


asyncio.run(main())
浏览器配置
Launch 选项
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch(
headless=False, # 是否无头模式
slow_mo=100, # 操作间隔(毫秒)
devtools=True, # 自动打开开发者工具
downloads_path='./downloads', # 下载目录
proxy={
'server': 'http://proxy.example.com:8080',
'username': 'user',
'password': 'pass'
},
args=[
'--disable-blink-features=AutomationControlled', # 隐藏自动化特征
'--window-size=1920,1080',
]
)
browser.close()
Context 配置
Browser Context 是独立的浏览器会话,类似隐身模式:
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch()
# 创建带有特定配置的上下文
context = browser.new_context(
viewport={'width': 1920, 'height': 1080}, # 视口大小
user_agent='Mozilla/5.0 ...', # User-Agent
locale='zh-CN', # 语言
timezone_id='Asia/Shanghai', # 时区
geolocation={'latitude': 39.9, 'longitude': 116.4}, # 位置
permissions=['geolocation'], # 权限
ignore_https_errors=True, # 忽略 HTTPS 错误
java_script_enabled=True, # 启用 JavaScript
accept_downloads=True, # 允许下载
)
page = context.new_page()
page.goto('https://example.com')
context.close()
browser.close()
视口和设备模拟
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch()
# 模拟 iPhone 13
iphone_13 = p.devices['iPhone 13']
context = browser.new_context(**iphone_13)
page = context.new_page()
page.goto('https://example.com')
print(f'设备: {iphone_13["user_agent"]}')
context.close()
browser.close()
Cookie 和存储
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()

    # Add a cookie to a fresh context.
    context = browser.new_context()
    context.add_cookies([
        {
            'name': 'session',
            'value': 'abc123',
            'domain': 'example.com',
            'path': '/',
        }
    ])

    # Save the authentication state to a file so it can be reused later.
    context.storage_state(path='state.json')
    # Close this context before creating another one so it is not left open.
    context.close()

    # Restore the saved state in a new context. The file must already exist,
    # which is why the state is saved above before it is loaded here.
    context = browser.new_context(
        storage_state='state.json'  # load state from file
    )
    context.close()

    browser.close()
元素定位
基本选择器
Playwright 支持多种选择器语法:
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
page.goto('https://example.com')
# CSS 选择器
element = page.query_selector('div.content')
# XPath 选择器(使用 xpath= 前缀)
element = page.query_selector('xpath=//div[@class="content"]')
# 文本选择器(使用 text= 前缀)
element = page.query_selector('text=登录')
element = page.query_selector('text=/登录/i') # 正则表达式
# 组合选择器
element = page.query_selector('div.content >> text=登录')
browser.close()
locator 方法
推荐使用 locator 方法,它具有自动等待和重试功能:
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch(headless=False)
page = browser.new_page()
page.goto('https://example.com')
# 使用 locator
button = page.locator('button:has-text("提交")')
button.click()
# 链式调用
form = page.locator('form#login')
username = form.locator('input[name="username"]')
password = form.locator('input[name="password"]')
submit = form.locator('button[type="submit"]')
username.fill('admin')
password.fill('password')
submit.click()
browser.close()
选择器语法详解
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()

    # CSS selectors
    page.locator('div')              # tag
    page.locator('#main')            # ID
    page.locator('.content')         # class
    page.locator('[data-id="123"]')  # attribute

    # Text selectors
    page.locator('text=Hello')     # text match
    page.locator('text=/hello/i')  # regular expression (case-insensitive)
    page.locator('text=/^hello/')  # regular expression (anchored at the start)

    # Combined CSS selectors
    page.locator('div.content')         # tag + class
    page.locator('div#main')            # tag + ID
    page.locator('input[type="text"]')  # tag + attribute

    # Pseudo-class selectors
    page.locator('button:visible')   # visible elements
    page.locator('button:hidden')    # hidden elements
    page.locator('button:enabled')   # enabled elements
    page.locator('button:disabled')  # disabled elements
    page.locator('input:focus')      # focused element

    # Combined pseudo-classes
    page.locator('button:visible:enabled')

    # has-text and has
    page.locator('article:has-text("Python")')  # contains the text
    page.locator('div:has(h1)')                 # contains a matching descendant

    # nth selector
    page.locator('div >> nth=0')   # first match
    page.locator('div >> nth=-1')  # last match

    # Chained locators. `first` is a property in the Python API, not a method,
    # so it must not be called with parentheses.
    page.locator('div.list').locator('li').first

    browser.close()
多元素处理
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
page.goto('https://example.com')
# 获取元素数量
count = page.locator('li').count()
print(f'共有 {count} 个列表项')
# 遍历所有元素
items = page.locator('li')
for i in range(items.count()):
print(items.nth(i).text_content())
# 使用 all() 方法
for item in page.locator('li').all():
print(item.text_content())
# 获取第一个/最后一个
first = page.locator('li').first
last = page.locator('li').last
# 筛选元素
visible_items = page.locator('li >> visible=true')
browser.close()
元素操作
基本操作
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch(headless=False)
page = browser.new_page()
page.goto('https://example.com/form')
# 点击
page.click('button#submit')
page.locator('button#submit').click()
# 双击
page.dblclick('div.item')
# 右键点击
page.click('div.item', button='right')
# 填写表单
page.fill('input[name="username"]', 'admin')
page.fill('input[name="password"]', 'password123')
# 清空后填写
page.locator('input[name="username"]').clear()
page.locator('input[name="username"]').fill('new_user')
# 类型输入(模拟逐字符输入)
page.type('input[name="search"]', 'Python', delay=100) # 每字符间隔100ms
# 选择下拉框
page.select_option('select#country', 'china') # 按值
page.select_option('select#country', label='中国') # 按文本
page.select_option('select#country', index=0) # 按索引
# 复选框和单选框
page.check('input[type="checkbox"]')
page.uncheck('input[type="checkbox"]')
page.is_checked('input[type="checkbox"]') # 检查是否选中
# 上传文件
page.set_input_files('input[type="file"]', 'test.pdf')
page.set_input_files('input[type="file"]', ['file1.pdf', 'file2.pdf'])
browser.close()
获取元素信息
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto('https://example.com')

    element = page.locator('div.content')

    # Text content
    text = element.text_content()      # node.textContent: all descendant text, including hidden text
    inner_text = element.inner_text()  # element.innerText: the text as rendered on the page

    # Attributes (get_attribute returns None when the attribute is absent)
    href = element.get_attribute('href')
    data_id = element.get_attribute('data-id')

    # HTML
    html = element.inner_html()
    outer_html = element.evaluate('el => el.outerHTML')

    # Value of a form control
    value = page.input_value('input[name="username"]')

    # Element state
    is_visible = element.is_visible()
    is_enabled = element.is_enabled()
    is_editable = element.is_editable()
    # is_checked() raises unless the target is a checkbox or radio input,
    # so query an actual checkbox rather than the div above.
    is_checked = page.locator('input[type="checkbox"]').is_checked()

    # Bounding box (None when the element is not visible)
    box = element.bounding_box()
    if box:
        print(f'位置: ({box["x"]}, {box["y"]})')
        print(f'大小: {box["width"]} x {box["height"]}')

    browser.close()
高级交互
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    # touchscreen.tap() below only works when the context is created with has_touch=True.
    context = browser.new_context(has_touch=True)
    page = context.new_page()
    page.goto('https://example.com')

    # Hover
    page.hover('div.menu-item')

    # Focus
    page.focus('input[name="search"]')

    # Drag and drop
    source = page.locator('div.source')
    target = page.locator('div.target')
    source.drag_to(target)

    # Keyboard
    page.keyboard.press('Enter')
    page.keyboard.press('Control+a')  # select all
    page.keyboard.press('Control+c')  # copy
    page.keyboard.press('Control+v')  # paste
    page.keyboard.type('Hello World', delay=100)

    # Mouse
    page.mouse.click(100, 200)
    page.mouse.dblclick(100, 200)
    page.mouse.move(300, 400)
    page.mouse.down()
    page.mouse.move(500, 600)
    page.mouse.up()

    # Scrolling
    page.mouse.wheel(0, 500)  # scroll down by 500 pixels
    target.scroll_into_view_if_needed()  # scroll until the element is in view

    # Touch (mobile emulation)
    page.touchscreen.tap(100, 200)

    context.close()
    browser.close()
等待机制
Playwright 具有自动等待机制,大多数情况下无需手动等待:
自动等待
Playwright 在执行操作前会自动等待:
- 元素附加到 DOM
- 元素可见
- 元素稳定(不在动画中)
- 元素可接收事件
- 元素启用
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
page.goto('https://example.com')
# 自动等待元素可点击
page.click('button#submit') # 自动等待按钮可点击
# 自动等待元素可填写
page.fill('input[name="username"]', 'admin') # 自动等待输入框可用
browser.close()
显式等待
对于特殊情况,可以使用显式等待:
import re  # needed for the regular-expression URL pattern below

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto('https://example.com')

    # Wait for an element to appear
    page.wait_for_selector('div.content')

    # Wait for an element to become visible
    page.wait_for_selector('div.content', state='visible')

    # Wait for an element to become hidden
    page.wait_for_selector('div.loading', state='hidden')

    # Wait for an element to be removed from the DOM
    page.wait_for_selector('div.notification', state='detached')

    # Wait for navigation to finish
    page.click('a.link')
    page.wait_for_load_state('networkidle')

    # Wait for the URL to change (glob pattern or compiled regular expression)
    page.wait_for_url('**/dashboard')
    page.wait_for_url(re.compile(r'.*/dashboard/.*'))

    # Wait for a request: the triggering action goes inside the `with` block
    with page.expect_request('**/api/data') as req_info:
        page.click('button#load')
    request = req_info.value

    # Wait for a response
    with page.expect_response('**/api/data') as resp_info:
        page.click('button#load')
    response = resp_info.value

    # Wait for a popup window
    with page.expect_popup() as popup_info:
        page.click('a[target="_blank"]')
    popup = popup_info.value

    # Wait for a file download
    with page.expect_download() as download_info:
        page.click('a.download')
    download = download_info.value

    # Wait until a JavaScript predicate returns a truthy value
    page.wait_for_function('() => document.title.includes("Dashboard")')

    # Fixed delay of 3 seconds (discouraged; debugging only)
    page.wait_for_timeout(3000)

    browser.close()
设置超时
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
# 设置全局超时
page.set_default_timeout(60000) # 60秒
# 单次操作设置超时
page.click('button', timeout=10000) # 10秒
# 等待时设置超时
page.wait_for_selector('div.content', timeout=30000) # 30秒
browser.close()
网络拦截
Playwright 可以拦截和修改网络请求,这是非常强大的功能:
拦截请求
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
# 拦截所有请求
def handle_route(route):
print(f'请求: {route.request.url}')
route.continue_() # 继续请求
page.route('**', handle_route)
# 拦截特定请求
def block_images(route):
if route.request.resource_type == 'image':
route.abort() # 阻止请求
else:
route.continue_()
page.route('**/*.{png,jpg,jpeg,gif,svg}', block_images)
# 修改请求
def modify_request(route):
headers = route.request.headers
headers['X-Custom-Header'] = 'value'
route.continue_(headers=headers)
page.route('**/api/*', modify_request)
# Mock 响应
def mock_api(route):
route.fulfill(
status=200,
content_type='application/json',
body='{"status": "success", "data": []}'
)
page.route('**/api/data', mock_api)
page.goto('https://example.com')
browser.close()
监听请求和响应
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
# 监听请求
def on_request(request):
print(f'请求: {request.method} {request.url}')
page.on('request', on_request)
# 监听响应
def on_response(response):
print(f'响应: {response.status} {response.url}')
if 'application/json' in response.headers.get('content-type', ''):
print(response.json())
page.on('response', on_response)
page.goto('https://example.com')
browser.close()
WebSocket 路由(Playwright 1.48+)
Playwright 1.48 引入了 WebSocket 路由功能,可以拦截、修改和模拟 WebSocket 连接:
基本 WebSocket 拦截
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()

    # Intercept WebSocket connections. The handler receives a WebSocketRoute,
    # which exposes on_message()/on_close()/send() rather than frame events.
    def handle_websocket(ws):
        print(f'WebSocket 连接: {ws.url}')

        # Connect to the real server; without this call the route acts as a mock
        # and nothing ever reaches the server.
        server = ws.connect_to_server()

        # Messages sent by the page. Once a handler is registered, forwarding
        # is no longer automatic, so pass each message on explicitly.
        def on_page_message(message):
            print(f'发送消息: {message}')
            server.send(message)

        # Messages coming from the server, forwarded to the page.
        def on_server_message(message):
            print(f'收到消息: {message}')
            ws.send(message)

        # Page closed the connection. Registering on_close disables the default
        # close forwarding, so close the server side here.
        def on_page_close(code, reason):
            print('WebSocket 关闭')
            server.close()

        ws.on_message(on_page_message)
        server.on_message(on_server_message)
        ws.on_close(on_page_close)

    page.route_web_socket('**/ws', handle_websocket)
    page.goto('https://example.com')

    browser.close()
模拟 WebSocket 响应
from playwright.sync_api import sync_playwright
from typing import Union

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()

    # Mock the WebSocket server: connect_to_server() is never called, so the
    # page only ever talks to this handler.
    def mock_websocket(ws):
        def on_message(message: Union[str, bytes]):
            print(f'收到: {message}')
            # Reply according to the message content.
            if message == 'ping':
                ws.send('pong')
            elif message == 'get_data':
                ws.send('{"data": [1, 2, 3]}')

        # WebSocketRoute registers handlers through on_message(), not on('message', ...).
        ws.on_message(on_message)

    page.route_web_socket('**/ws', mock_websocket)
    page.goto('https://example.com')

    browser.close()
修改 WebSocket 消息
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()

    def modify_websocket(ws):
        # Sit between the page and the real server so traffic in both
        # directions can be rewritten or dropped.
        server = ws.connect_to_server()

        # Page -> server: rewrite the message before sending it on.
        def on_page_message(message):
            # Only text messages are rewritten; binary messages pass through unchanged.
            if isinstance(message, str) and 'secret' in message:
                modified = message.replace('secret', '***')
                print(f'修改消息: {message} -> {modified}')
                message = modified
            server.send(message)

        # Server -> page: drop messages that carry sensitive data.
        def on_server_message(message):
            if 'sensitive_data' in str(message):
                print('过滤敏感数据')
                return  # not forwarded to the page
            ws.send(message)

        ws.on_message(on_page_message)
        server.on_message(on_server_message)

    page.route_web_socket('**', modify_websocket)
    page.goto('https://example.com')

    browser.close()
认证和请求头
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch()
# 设置基本认证
context = browser.new_context(
http_credentials={
'username': 'user',
'password': 'pass'
}
)
page = context.new_page()
# 设置额外请求头
page.set_extra_http_headers({
'X-API-Key': 'your-api-key',
'Authorization': 'Bearer token'
})
page.goto('https://api.example.com/data')
browser.close()
多页面和框架
多标签页处理
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    context = browser.new_context()
    page = context.new_page()
    page.goto('https://example.com')

    # Click a link that opens a new tab and capture the new page.
    with context.expect_page() as new_page_info:
        page.click('a[target="_blank"]')
    new_page = new_page_info.value
    print(f'新页面标题: {new_page.title()}')

    # List every page in the context. The loop variable is deliberately not
    # named `p`, which would shadow the Playwright instance bound above.
    for open_page in context.pages:
        print(f'页面: {open_page.url}')

    # Switch back to the first page.
    page.bring_to_front()

    browser.close()
iframe 处理
import re  # needed for the URL pattern used with page.frame()

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto('https://example.com')

    # Locate the iframe by selector.
    content_frame = page.frame_locator('iframe#content')

    # Act inside the iframe. A FrameLocator has no fill()/click() of its own;
    # locate the element within the frame first, then act on that locator.
    content_frame.locator('input[name="username"]').fill('admin')
    content_frame.locator('button[type="submit"]').click()

    # Iterate over all frames of the page.
    for child_frame in page.frames:
        print(f'Frame URL: {child_frame.url}')

    # Look up a frame by name (returns None when no frame matches).
    named_frame = page.frame('frame-name')

    # Look up a frame by URL pattern.
    embed_frame = page.frame(url=re.compile(r'.*/embed/.*'))

    browser.close()
执行 JavaScript
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
page.goto('https://example.com')
# 执行 JavaScript
result = page.evaluate('() => document.title')
print(f'标题: {result}')
# 传递参数
result = page.evaluate(
'value => document.querySelector("h1").textContent + value',
'!'
)
print(f'结果: {result}')
# 返回复杂对象
info = page.evaluate('''() => {
return {
title: document.title,
url: location.href,
width: window.innerWidth,
height: window.innerHeight
}
}''')
print(info)
# 在元素上执行
element = page.locator('div.content')
result = element.evaluate('el => el.innerText')
# 执行脚本
page.evaluate('''() => {
document.querySelector("button").click();
}''')
# 添加脚本到页面
page.add_script_tag(url='https://cdn.example.com/script.js')
page.add_script_tag(content='console.log("Hello");')
browser.close()
Clock API(Playwright 1.45+)
Clock API 允许在测试中控制和操作时间,非常适合测试倒计时、定时器、日程提醒等时间相关功能:
基本用法
from playwright.sync_api import sync_playwright
import datetime
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
# 安装时钟,设置初始时间
page.clock.install(time=datetime.datetime(2024, 2, 2, 8, 0, 0))
page.goto('https://example.com')
# 此时页面中的 Date.now() 返回 2024-02-02 08:00:00
browser.close()
暂停和恢复时间
from playwright.sync_api import sync_playwright
import datetime

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()

    # Set the initial time.
    page.clock.install(time=datetime.datetime(2024, 2, 2, 8, 0, 0))
    page.goto('https://example.com')

    # Jump to the given moment and pause the clock there.
    page.clock.pause_at(datetime.datetime(2024, 2, 2, 10, 0, 0))
    # Date.now() is now frozen at 10:00, so the time shown on the page can be verified.

    # Fast-forward (simulates the user leaving for a while and coming back).
    page.clock.fast_forward('30:00')  # skip ahead 30 minutes

    # Jump to a specific moment. fast_forward() only accepts a duration
    # (milliseconds or a "mm:ss" / "hh:mm:ss" string), so use pause_at() for
    # an absolute time.
    page.clock.pause_at(datetime.datetime(2024, 2, 2, 11, 0, 0))

    browser.close()
运行定时器
from playwright.sync_api import sync_playwright
import datetime
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
page.clock.install(time=datetime.datetime(2024, 1, 1, 0, 0, 0))
page.goto('https://example.com')
# 快进时间,同时运行中间的所有定时器
page.clock.run_for('05:00') # 运行 5 分钟内的所有定时器
# 或者按毫秒运行
page.clock.run_for(60000) # 运行 1 分钟
browser.close()
测试倒计时场景
from playwright.sync_api import sync_playwright
import datetime
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
# 设置初始时间
page.clock.install(time=datetime.datetime(2024, 1, 1, 0, 0, 0))
page.goto('https://example.com/countdown')
# 假设页面有一个 10 分钟倒计时
# 快进 9 分钟
page.clock.fast_forward('09:00')
# 验证剩余时间显示为 1 分钟
remaining = page.locator('.countdown').text_content()
assert '1:00' in remaining or '00:01' in remaining
# 快进剩余 1 分钟
page.clock.fast_forward('01:00')
# 验证倒计时结束
assert page.locator('.countdown-ended').is_visible()
browser.close()
Clock API 方法总结
| 方法 | 说明 |
|---|---|
| `clock.install(time)` | 安装时钟并设置初始时间 |
| `clock.pause_at(time)` | 跳到指定时刻并暂停时间 |
| `clock.fast_forward(ticks)` | 向前跳过指定时长,到期的定时器最多触发一次 |
| `clock.run_for(ticks)` | 推进指定时长,并依次触发期间的所有定时器 |
| `clock.resume()` | 恢复时间流动 |
| `clock.set_fixed_time(time)` | 将 Date.now() 固定为指定时间,定时器仍正常运行 |
截图和 PDF
截图
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
page.goto('https://example.com')
# 整页截图
page.screenshot(path='full-page.png', full_page=True)
# 视口截图
page.screenshot(path='viewport.png')
# 元素截图
element = page.locator('div.content')
element.screenshot(path='element.png')
# 返回截图数据
screenshot_bytes = page.screenshot()
# 设置截图质量(仅 JPEG)
page.screenshot(path='image.jpg', type='jpeg', quality=80)
browser.close()
生成 PDF
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
page.goto('https://example.com')
# 生成 PDF
page.pdf(
path='document.pdf',
format='A4',
print_background=True,
margin={
'top': '20mm',
'right': '20mm',
'bottom': '20mm',
'left': '20mm'
}
)
# 横向 PDF
page.pdf(path='document-landscape.pdf', format='A4', landscape=True)
browser.close()
录制和调试
代码生成器
Playwright 提供代码生成器,可以录制操作并生成代码:
# 启动代码生成器
playwright codegen
# 录制特定网站
playwright codegen https://example.com
# 生成 Python 代码
playwright codegen --target python https://example.com
# 保存到文件
playwright codegen --target python -o script.py https://example.com
Trace Viewer
Trace Viewer 可以记录完整执行过程,用于调试:
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch()
context = browser.new_context()
# 开始录制
context.tracing.start(screenshots=True, snapshots=True, sources=True)
page = context.new_page()
page.goto('https://example.com')
page.click('button')
# 停止录制并保存
context.tracing.stop(path='trace.zip')
browser.close()
# 查看 trace
# 运行命令: playwright show-trace trace.zip
调试模式
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch(headless=False, slow_mo=1000) # 慢速模式
page = browser.new_page()
# 暂停执行,进入调试模式
page.pause()
page.goto('https://example.com')
browser.close()
完整爬虫示例
from playwright.sync_api import sync_playwright
import json
import time
import random
class PlaywrightSpider:
    """Small scraping helper that owns one Playwright browser, context and page.

    Call start() before any page method and close() when finished.
    """

    def __init__(self, headless=True):
        """Store the launch mode; nothing is launched until start() is called."""
        self.headless = headless
        # Every handle starts as None so close() is safe even when start()
        # never ran or failed part-way through.
        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None

    def start(self):
        """Start Playwright, launch Chromium and open a page."""
        self.playwright = sync_playwright().start()
        self.browser = self.playwright.chromium.launch(
            headless=self.headless,
            args=['--disable-blink-features=AutomationControlled']
        )
        self.context = self.browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        )
        self.page = self.context.new_page()

    def close(self):
        """Close the context and browser, then stop Playwright.

        Safe to call more than once, and safe when start() was never called.
        """
        if self.context:
            self.context.close()
            self.context = None
        if self.browser:
            self.browser.close()
            self.browser = None
        if self.playwright:
            self.playwright.stop()
            self.playwright = None
        self.page = None

    def goto(self, url, wait_until='networkidle'):
        """Navigate to *url*, then pause for a random 0.5-1.5 seconds."""
        self.page.goto(url, wait_until=wait_until)
        time.sleep(random.uniform(0.5, 1.5))

    def scroll_to_bottom(self):
        """Scroll to the bottom of the page and wait one second."""
        self.page.evaluate('''() => {
            window.scrollTo(0, document.body.scrollHeight);
        }''')
        time.sleep(1)

    def infinite_scroll(self, max_scrolls=10):
        """Scroll down repeatedly to trigger lazy loading.

        Stops after *max_scrolls* scrolls, or earlier once the page height
        no longer changes between two scrolls.
        """
        last_height = 0
        scrolls = 0
        while scrolls < max_scrolls:
            self.page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(2)
            new_height = self.page.evaluate('document.body.scrollHeight')
            if new_height == last_height:
                break
            last_height = new_height
            scrolls += 1

    def wait_for_selector(self, selector, timeout=30000):
        """Wait for *selector* on the page; *timeout* is in milliseconds."""
        return self.page.wait_for_selector(selector, timeout=timeout)

    def extract_items(self, selector, fields):
        """Extract one dict per element matching *selector*.

        *fields* maps an output key to a spec dict with a 'selector' (resolved
        relative to the matched element) and a 'type' of 'text' or 'attribute';
        attribute specs also carry 'attr'. A field whose extraction fails is
        stored as an empty string. A spec with any other 'type' produces no key.
        """
        items = []
        elements = self.page.locator(selector)
        for i in range(elements.count()):
            element = elements.nth(i)
            item = {}
            for field, field_selector in fields.items():
                try:
                    field_element = element.locator(field_selector['selector'])
                    if field_selector.get('type') == 'text':
                        item[field] = field_element.text_content()
                    elif field_selector.get('type') == 'attribute':
                        item[field] = field_element.get_attribute(field_selector['attr'])
                except Exception:
                    # Best effort: keep the row and leave the failed field blank.
                    # `except Exception` instead of a bare `except` lets
                    # KeyboardInterrupt and SystemExit propagate.
                    item[field] = ''
            items.append(item)
        return items

    def save_to_json(self, data, filename):
        """Write *data* to *filename* as UTF-8 JSON and print a summary line."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f'已保存 {len(data)} 条数据到 {filename}')
# 使用示例
if __name__ == '__main__':
    # What to pull out of each story row, keyed by output field name.
    story_fields = {
        'title': {'selector': '.titleline > a', 'type': 'text'},
        'link': {'selector': '.titleline > a', 'type': 'attribute', 'attr': 'href'},
        'score': {'selector': '+ tr .score', 'type': 'text'},
    }

    spider = PlaywrightSpider(headless=True)
    try:
        spider.start()
        # Example: scrape Hacker News.
        spider.goto('https://news.ycombinator.com/')
        stories = spider.extract_items(selector='.athing', fields=story_fields)
        spider.save_to_json(stories, 'news.json')
    finally:
        # Runs whether or not the scrape above raised.
        spider.close()
小结
本章我们学习了:
- Playwright 简介 - 相比 Selenium 的优势
- 安装配置 - 安装 Playwright 和浏览器
- 浏览器配置 - launch、context、视口设置
- 元素定位 - CSS、XPath、文本选择器
- 元素操作 - 点击、填写、拖拽等
- 等待机制 - 自动等待和显式等待
- 网络拦截 - 拦截、修改、Mock 请求
- 多页面和框架 - 标签页和 iframe 处理
- 高级功能 - 截图、PDF、录制、调试
Playwright 是新一代浏览器自动化工具,其现代化的 API 和强大的功能使其成为网页爬虫和自动化测试的首选。
练习
- 使用 Playwright 登录一个需要账号的网站
- 编写一个处理无限滚动页面的爬虫
- 使用网络拦截功能 Mock API 响应
- 使用 Trace Viewer 调试你的爬虫脚本