跳到主要内容

Playwright 浏览器自动化

Playwright 是微软开发的现代浏览器自动化库,支持 Chromium、Firefox 和 WebKit 三大浏览器引擎。相比 Selenium,Playwright 更快、更可靠,API 更现代化,是新一代浏览器自动化的首选工具。

官方文档

本教程内容基于 Playwright Python 官方文档

当前版本: v1.49.x | Python 要求: 3.9+ | 系统要求: Windows 11+, macOS 14+, Debian 12/Ubuntu 22.04+

为什么选择 Playwright?

Playwright vs Selenium

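| 特性 | Playwright | Selenium |
| --- | --- | --- |
| 架构 | 直接使用浏览器协议 | WebDriver 协议 |
| 速度 | 更快 | 较慢 |
| 等待机制 | 自动智能等待 | 需要手动配置 |
| 多浏览器 | Chromium、Firefox、WebKit | 各浏览器独立驱动 |
| 选择器 | CSS、XPath、Text、React 等 | CSS、XPath |
| 并行执行 | 内置支持 | 需要额外框架 |
| 网络拦截 | 内置支持 | 需要额外配置 |
| 录制功能 | 内置 Codegen | — |
| 调试工具 | Trace Viewer | — |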

Playwright 的优势

  • 自动等待:元素操作前自动等待元素可操作,减少显式等待
  • 多浏览器支持:一套代码支持 Chromium、Firefox 和 WebKit
  • 强大的选择器:支持文本选择器、React/Vue 组件选择器
  • 网络拦截:可以拦截、修改、模拟网络请求
  • 并发执行:内置并行测试支持
  • 录制回放:Codegen 工具可录制操作生成代码
  • 调试工具:Trace Viewer 提供完整执行回放

安装配置

安装 Playwright

# Install the Playwright Python package
pip install playwright

# Download the browser binaries (must be run once before first use)
playwright install

# Install only specific browsers
playwright install chromium
playwright install firefox
playwright install webkit

验证安装

# Print the installed Playwright version
playwright --version

# Check that the Python package can be imported
python -c "from playwright.sync_api import sync_playwright; print('安装成功')"

快速开始

第一个示例

from playwright.sync_api import sync_playwright

def main():
    """Open example.com in a visible Chromium window, print its title and save a screenshot."""
    with sync_playwright() as playwright:
        # headless=False keeps the browser window on screen.
        browser = playwright.chromium.launch(headless=False)
        page = browser.new_page()

        page.goto('https://example.com')
        print(f'页面标题: {page.title()}')

        # The screenshot is written next to the script.
        page.screenshot(path='screenshot.png')

        browser.close()


if __name__ == '__main__':
    main()

使用不同浏览器

from playwright.sync_api import sync_playwright

def demo_browsers():
    """Load example.com in Chromium, Firefox and WebKit in turn and print each page title."""
    with sync_playwright() as p:
        # One loop replaces three copies of the same launch/visit/close sequence.
        engines = (
            ('Chromium', p.chromium),
            ('Firefox', p.firefox),
            ('WebKit', p.webkit),  # WebKit is the engine behind Safari
        )
        for label, browser_type in engines:
            browser = browser_type.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto('https://example.com')
                print(f'{label}: {page.title()}')
            finally:
                # Close this engine even if navigation fails, before starting the next one.
                browser.close()

异步 API

Playwright 同时提供同步和异步 API。异步版本适合在同一个事件循环中并发驱动多个页面,或与其他 asyncio 代码配合使用;对于单个顺序执行的脚本,两者速度相当:

import asyncio
from playwright.async_api import async_playwright

async def main():
    """Fetch example.com with headless Chromium through the async API and print the title."""
    async with async_playwright() as playwright:
        chromium = playwright.chromium
        browser = await chromium.launch(headless=True)
        page = await browser.new_page()

        # Every Playwright call is a coroutine in the async API and must be awaited.
        await page.goto('https://example.com')
        print(f'页面标题: {await page.title()}')

        await browser.close()


asyncio.run(main())

浏览器配置

Launch 选项

from playwright.sync_api import sync_playwright

with sync_playwright() as playwright:
    # Proxy server and the credentials used to authenticate against it.
    proxy_settings = {
        'server': 'http://proxy.example.com:8080',
        'username': 'user',
        'password': 'pass',
    }
    # Extra command-line switches handed to the browser process.
    chromium_args = [
        '--disable-blink-features=AutomationControlled',
        '--window-size=1920,1080',
    ]

    browser = playwright.chromium.launch(
        headless=False,                # show the browser window
        slow_mo=100,                   # pause between operations, in milliseconds
        devtools=True,                 # open the developer tools automatically
        downloads_path='./downloads',  # directory for downloaded files
        proxy=proxy_settings,
        args=chromium_args,
    )
    browser.close()

Context 配置

Browser Context 是独立的浏览器会话,类似隐身模式:

from playwright.sync_api import sync_playwright

with sync_playwright() as playwright:
    browser = playwright.chromium.launch()

    # Gather the session settings first, then open the context with them.
    context_options = {
        'viewport': {'width': 1920, 'height': 1080},
        'user_agent': 'Mozilla/5.0 ...',
        'locale': 'zh-CN',
        'timezone_id': 'Asia/Shanghai',
        'geolocation': {'latitude': 39.9, 'longitude': 116.4},
        'permissions': ['geolocation'],
        'ignore_https_errors': True,
        'java_script_enabled': True,
        'accept_downloads': True,
    }
    context = browser.new_context(**context_options)

    page = context.new_page()
    page.goto('https://example.com')

    context.close()
    browser.close()

视口和设备模拟

from playwright.sync_api import sync_playwright

with sync_playwright() as playwright:
    browser = playwright.chromium.launch()

    # The device descriptor is a dict of context options, so it can be
    # unpacked straight into new_context().
    iphone_13 = playwright.devices['iPhone 13']
    context = browser.new_context(**iphone_13)

    page = context.new_page()
    page.goto('https://example.com')
    print(f'设备: {iphone_13["user_agent"]}')

    context.close()
    browser.close()

Cookie 和认证状态

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()

    # Step 1: create a context and seed it with a cookie.
    context = browser.new_context()
    context.add_cookies([
        {
            'name': 'session',
            'value': 'abc123',
            'domain': 'example.com',
            'path': '/',
        }
    ])

    # Step 2: save the context's state to disk, then close it.
    # Saving has to happen before any context tries to load state.json.
    context.storage_state(path='state.json')
    context.close()

    # Step 3: a fresh context restores the saved state from the file.
    # It gets its own name so the first context is never silently replaced.
    restored_context = browser.new_context(storage_state='state.json')

    restored_context.close()
    browser.close()

元素定位

基本选择器

Playwright 支持多种选择器语法:

from playwright.sync_api import sync_playwright

with sync_playwright() as playwright:
    browser = playwright.chromium.launch()
    page = browser.new_page()
    page.goto('https://example.com')

    # Plain CSS selector
    by_css = page.query_selector('div.content')

    # XPath selector, marked by the xpath= prefix
    by_xpath = page.query_selector('xpath=//div[@class="content"]')

    # Text selector, marked by the text= prefix; /.../i is a regular expression
    by_text = page.query_selector('text=登录')
    by_text_regex = page.query_selector('text=/登录/i')

    # ">>" chains selectors: the right side is searched inside the left side
    by_chain = page.query_selector('div.content >> text=登录')

    browser.close()

locator 方法

推荐使用 locator 方法,它具有自动等待和重试功能:

from playwright.sync_api import sync_playwright

with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto('https://example.com')

    # Click a button found by its text.
    submit_button = page.locator('button:has-text("提交")')
    submit_button.click()

    # Locators can be chained: the inner ones are searched inside the form.
    login_form = page.locator('form#login')
    username_input = login_form.locator('input[name="username"]')
    password_input = login_form.locator('input[name="password"]')
    login_button = login_form.locator('button[type="submit"]')

    username_input.fill('admin')
    password_input.fill('password')
    login_button.click()

    browser.close()

选择器语法详解

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()

    # Creating a locator does not touch the page; these lines only show syntax.

    # CSS selectors
    page.locator('div')              # tag
    page.locator('#main')            # id
    page.locator('.content')         # class
    page.locator('[data-id="123"]')  # attribute

    # Text selectors
    page.locator('text=Hello')       # unquoted: case-insensitive substring match
    page.locator('text="Hello"')     # quoted: exact, case-sensitive match
    page.locator('text=/hello/i')    # regular expression, ignoring case
    page.locator('text=/^hello/')    # regular expression anchored at the start

    # Combined CSS selectors
    page.locator('div.content')          # tag + class
    page.locator('div#main')             # tag + id
    page.locator('input[type="text"]')   # tag + attribute

    # Pseudo-classes
    page.locator('button:visible')            # visible elements (Playwright extension)
    page.locator('button >> visible=false')   # hidden elements; there is no ":hidden" pseudo-class
    page.locator('button:enabled')            # enabled elements
    page.locator('button:disabled')           # disabled elements
    page.locator('input:focus')               # focused element

    # Several pseudo-classes at once
    page.locator('button:visible:enabled')

    # :has-text() and :has()
    page.locator('article:has-text("Python")')  # contains the text
    page.locator('div:has(h1)')                 # contains a matching descendant

    # nth selector
    page.locator('div >> nth=0')    # first match
    page.locator('div >> nth=-1')   # last match

    # Chained locators; `first` is a property, so it must not be called
    page.locator('div.list').locator('li').first

    browser.close()

多元素处理

from playwright.sync_api import sync_playwright

with sync_playwright() as playwright:
    browser = playwright.chromium.launch()
    page = browser.new_page()
    page.goto('https://example.com')

    # Number of matching elements
    count = page.locator('li').count()
    print(f'共有 {count} 个列表项')

    # Walk the matches by index
    items = page.locator('li')
    total = items.count()
    for index in range(total):
        print(items.nth(index).text_content())

    # Walk the matches through all(), which yields one locator per element
    every_item = page.locator('li').all()
    for item in every_item:
        print(item.text_content())

    # First and last match
    first = page.locator('li').first
    last = page.locator('li').last

    # Keep only the visible matches
    visible_items = page.locator('li >> visible=true')

    browser.close()

元素操作

基本操作

from playwright.sync_api import sync_playwright

with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto('https://example.com/form')

    # Selectors used more than once below.
    submit_selector = 'button#submit'
    item_selector = 'div.item'
    username_selector = 'input[name="username"]'
    country_selector = 'select#country'
    checkbox_selector = 'input[type="checkbox"]'
    file_selector = 'input[type="file"]'

    # Click, through the page shortcut and through a locator
    page.click(submit_selector)
    page.locator(submit_selector).click()

    # Double click
    page.dblclick(item_selector)

    # Right click
    page.click(item_selector, button='right')

    # Fill form fields
    page.fill(username_selector, 'admin')
    page.fill('input[name="password"]', 'password123')

    # Clear a field, then fill it again
    username_field = page.locator(username_selector)
    username_field.clear()
    username_field.fill('new_user')

    # Type character by character, 100 ms apart
    page.type('input[name="search"]', 'Python', delay=100)

    # Choose an option by value, by label and by index
    page.select_option(country_selector, 'china')
    page.select_option(country_selector, label='中国')
    page.select_option(country_selector, index=0)

    # Checkboxes and radio buttons
    page.check(checkbox_selector)
    page.uncheck(checkbox_selector)
    page.is_checked(checkbox_selector)  # returns the checked state

    # Upload one file, then several
    page.set_input_files(file_selector, 'test.pdf')
    page.set_input_files(file_selector, ['file1.pdf', 'file2.pdf'])

    browser.close()

获取元素信息

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto('https://example.com')

    element = page.locator('div.content')

    # Text content
    text = element.text_content()     # node.textContent: all descendant text, hidden text included
    inner_text = element.inner_text()  # element.innerText: the text as rendered on screen

    # Attributes (None when the attribute is absent)
    href = element.get_attribute('href')
    data_id = element.get_attribute('data-id')

    # HTML
    html = element.inner_html()
    outer_html = element.evaluate('el => el.outerHTML')

    # Current value of a form control
    value = page.input_value('input[name="username"]')

    # State checks that work on any element
    is_visible = element.is_visible()
    is_enabled = element.is_enabled()

    # is_editable() is meant for form controls and is_checked() raises on
    # anything that is not a checkbox or radio button, so query real inputs
    # instead of the div above.
    is_editable = page.locator('input[name="username"]').is_editable()
    is_checked = page.locator('input[type="checkbox"]').is_checked()

    # Bounding box (None when the element is not visible)
    box = element.bounding_box()
    if box:
        print(f'位置: ({box["x"]}, {box["y"]})')
        print(f'大小: {box["width"]} x {box["height"]}')

    browser.close()

高级交互

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    # has_touch=True is required for page.touchscreen.tap() further down;
    # without it the call raises.
    page = browser.new_page(has_touch=True)
    page.goto('https://example.com')

    # Hover
    page.hover('div.menu-item')

    # Focus
    page.focus('input[name="search"]')

    # Drag and drop
    source = page.locator('div.source')
    target = page.locator('div.target')
    source.drag_to(target)

    # Keyboard
    page.keyboard.press('Enter')
    page.keyboard.press('Control+a')  # select all
    page.keyboard.press('Control+c')  # copy
    page.keyboard.press('Control+v')  # paste
    page.keyboard.type('Hello World', delay=100)

    # Mouse
    page.mouse.click(100, 200)
    page.mouse.dblclick(100, 200)
    page.mouse.move(300, 400)
    page.mouse.down()
    page.mouse.move(500, 600)
    page.mouse.up()

    # Scrolling
    page.mouse.wheel(0, 500)  # scroll down by 500 pixels
    # Scroll a specific element into view (the original used an undefined name here).
    target.scroll_into_view_if_needed()

    # Touch (mobile)
    page.touchscreen.tap(100, 200)

    browser.close()

等待机制

Playwright 具有自动等待机制,大多数情况下无需手动等待:

自动等待

Playwright 在执行操作前会自动等待:

  • 元素附加到 DOM
  • 元素可见
  • 元素稳定(不在动画中)
  • 元素可接收事件
  • 元素启用
from playwright.sync_api import sync_playwright

with sync_playwright() as playwright:
    browser = playwright.chromium.launch()
    page = browser.new_page()
    page.goto('https://example.com')

    # No explicit wait is written before either action: click() and fill()
    # wait for their target element on their own.
    submit_selector = 'button#submit'
    page.click(submit_selector)

    username_selector = 'input[name="username"]'
    page.fill(username_selector, 'admin')

    browser.close()

显式等待

对于特殊情况,可以使用显式等待:

import re  # needed for the regular-expression URL wait below

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto('https://example.com')

    # Wait for an element to appear
    page.wait_for_selector('div.content')

    # Wait for an element to become visible
    page.wait_for_selector('div.content', state='visible')

    # Wait for an element to become hidden
    page.wait_for_selector('div.loading', state='hidden')

    # Wait for an element to be removed from the DOM
    page.wait_for_selector('div.notification', state='detached')

    # Wait for navigation to settle after a click
    page.click('a.link')
    page.wait_for_load_state('networkidle')

    # Wait for the URL to change: glob pattern, then compiled regular expression
    page.wait_for_url('**/dashboard')
    page.wait_for_url(re.compile(r'.*/dashboard/.*'))

    # Wait for a request; the action that triggers it goes inside the block
    with page.expect_request('**/api/data') as req_info:
        page.click('button#load')
    request = req_info.value

    # Wait for a response
    with page.expect_response('**/api/data') as resp_info:
        page.click('button#load')
    response = resp_info.value

    # Wait for a popup window
    with page.expect_popup() as popup_info:
        page.click('a[target="_blank"]')
    popup = popup_info.value

    # Wait for a file download
    with page.expect_download() as download_info:
        page.click('a.download')
    download = download_info.value

    # Wait until a JavaScript predicate returns a truthy value
    page.wait_for_function('() => document.title.includes("Dashboard")')

    # Fixed delay of 3 seconds (discouraged; for debugging only)
    page.wait_for_timeout(3000)

    browser.close()

设置超时

from playwright.sync_api import sync_playwright

with sync_playwright() as playwright:
    browser = playwright.chromium.launch()
    page = browser.new_page()

    # Timeouts are given in milliseconds.
    default_timeout_ms = 60000   # 60 seconds
    click_timeout_ms = 10000     # 10 seconds
    selector_timeout_ms = 30000  # 30 seconds

    # Default timeout for this page
    page.set_default_timeout(default_timeout_ms)

    # Timeout for one action only
    page.click('button', timeout=click_timeout_ms)

    # Timeout for one explicit wait only
    page.wait_for_selector('div.content', timeout=selector_timeout_ms)

    browser.close()

网络拦截

Playwright 可以拦截和修改网络请求,这是非常强大的功能:

拦截请求

from playwright.sync_api import sync_playwright

with sync_playwright() as playwright:
    browser = playwright.chromium.launch()
    page = browser.new_page()

    def handle_route(route):
        """Log the request URL and let the request go through."""
        print(f'请求: {route.request.url}')
        route.continue_()

    def block_images(route):
        """Abort image requests; let everything else through."""
        if route.request.resource_type != 'image':
            route.continue_()
            return
        route.abort()

    def modify_request(route):
        """Send the request on with one extra header."""
        headers = route.request.headers
        headers['X-Custom-Header'] = 'value'
        route.continue_(headers=headers)

    def mock_api(route):
        """Answer with a canned JSON body."""
        route.fulfill(
            status=200,
            content_type='application/json',
            body='{"status": "success", "data": []}',
        )

    # Register each handler for its URL pattern, in the same order as before.
    page.route('**', handle_route)                             # every request
    page.route('**/*.{png,jpg,jpeg,gif,svg}', block_images)    # image files
    page.route('**/api/*', modify_request)                     # API calls
    page.route('**/api/data', mock_api)                        # one mocked endpoint

    page.goto('https://example.com')

    browser.close()

监听请求和响应

from playwright.sync_api import sync_playwright

with sync_playwright() as playwright:
    browser = playwright.chromium.launch()
    page = browser.new_page()

    def on_request(request):
        """Print the method and URL of each outgoing request."""
        print(f'请求: {request.method} {request.url}')

    def on_response(response):
        """Print the status and URL of each response, plus the body when it is JSON."""
        print(f'响应: {response.status} {response.url}')
        content_type = response.headers.get('content-type', '')
        if 'application/json' in content_type:
            print(response.json())

    # Subscribe before navigating so the handlers are in place for the page load.
    page.on('request', on_request)
    page.on('response', on_response)

    page.goto('https://example.com')

    browser.close()

WebSocket 路由(Playwright 1.48+)

Playwright 1.48 引入了 WebSocket 路由功能,可以拦截、修改和模拟 WebSocket 连接:

基本 WebSocket 拦截

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()

    def handle_websocket(ws):
        """Log traffic on a routed WebSocket while relaying it to the real server.

        `ws` is a WebSocketRoute. It exposes on_message()/on_close()/send(),
        not the generic .on(event, ...) API, and it is not connected to the
        real server unless connect_to_server() is called.
        """
        print(f'WebSocket 连接: {ws.url}')
        server = ws.connect_to_server()

        def on_page_message(message):
            # Page -> server. Registering a handler turns off automatic
            # forwarding, so the message has to be passed on by hand.
            print(f'发送消息: {message}')
            server.send(message)

        def on_server_message(message):
            # Server -> page, relayed by hand for the same reason.
            print(f'收到消息: {message}')
            ws.send(message)

        def on_page_close(code, reason):
            # A close handler also turns off automatic close forwarding.
            print('WebSocket 关闭')
            server.close(code=code, reason=reason)

        ws.on_message(on_page_message)
        server.on_message(on_server_message)
        ws.on_close(on_page_close)

    # Only sockets opened after this call are routed, so register before navigating.
    page.route_web_socket('**/ws', handle_websocket)

    page.goto('https://example.com')

    browser.close()

模拟 WebSocket 响应

from playwright.sync_api import sync_playwright
from typing import Union

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()

    def mock_websocket(ws):
        """Act as the WebSocket server; no real server connection is made."""

        def on_message(message: Union[str, bytes]):
            print(f'收到: {message}')
            # Reply according to the message the page sent.
            if message == 'ping':
                ws.send('pong')
            elif message == 'get_data':
                ws.send('{"data": [1, 2, 3]}')

        # WebSocketRoute registers its message handler through on_message(),
        # not through .on('message', ...).
        ws.on_message(on_message)

    page.route_web_socket('**/ws', mock_websocket)

    page.goto('https://example.com')

    browser.close()

修改 WebSocket 消息

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()

    def modify_websocket(ws):
        """Sit between the page and the real server, rewriting or dropping messages."""
        server = ws.connect_to_server()

        def on_page_message(message):
            # Page -> server: mask the word "secret" before relaying.
            # Binary frames are relayed untouched.
            if isinstance(message, str) and 'secret' in message:
                modified = message.replace('secret', '***')
                print(f'修改消息: {message} -> {modified}')
                message = modified
            server.send(message)

        def on_server_message(message):
            # Server -> page: drop messages carrying sensitive data, relay the rest.
            if 'sensitive_data' in str(message):
                print('过滤敏感数据')
                return
            ws.send(message)

        # With handlers registered, nothing is forwarded automatically;
        # each handler decides what reaches the other side.
        ws.on_message(on_page_message)
        server.on_message(on_server_message)

    page.route_web_socket('**', modify_websocket)

    page.goto('https://example.com')

    browser.close()

认证和请求头

from playwright.sync_api import sync_playwright

with sync_playwright() as playwright:
    browser = playwright.chromium.launch()

    # HTTP credentials are configured on the context.
    credentials = {
        'username': 'user',
        'password': 'pass',
    }
    context = browser.new_context(http_credentials=credentials)

    page = context.new_page()

    # Extra headers are configured on the page.
    extra_headers = {
        'X-API-Key': 'your-api-key',
        'Authorization': 'Bearer token',
    }
    page.set_extra_http_headers(extra_headers)

    page.goto('https://api.example.com/data')

    browser.close()

多页面和框架

多标签页处理

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    context = browser.new_context()
    page = context.new_page()
    page.goto('https://example.com')

    # Clicking the link opens a new tab; expect_page() captures it.
    with context.expect_page() as new_page_info:
        page.click('a[target="_blank"]')
    new_page = new_page_info.value

    print(f'新页面标题: {new_page.title()}')

    # List every page in the context. The loop variable is not called `p`,
    # so it does not overwrite the Playwright handle bound by the `with`.
    for open_page in context.pages:
        print(f'页面: {open_page.url}')

    # Bring the original tab back to the front.
    page.bring_to_front()

    browser.close()

iframe 处理

import re  # needed for the URL pattern below

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto('https://example.com')

    # Point at the iframe by selector. A FrameLocator has no fill()/click()
    # of its own: get a locator inside the frame and act on that.
    content_frame = page.frame_locator('iframe#content')
    content_frame.locator('input[name="username"]').fill('admin')
    content_frame.locator('button[type="submit"]').click()

    # List every frame on the page
    for frame in page.frames:
        print(f'Frame URL: {frame.url}')

    # Look a frame up by name (None when there is no such frame)
    named_frame = page.frame('frame-name')

    # Look a frame up by URL pattern
    embed_frame = page.frame(url=re.compile(r'.*/embed/.*'))

    browser.close()

执行 JavaScript

from playwright.sync_api import sync_playwright

with sync_playwright() as playwright:
    browser = playwright.chromium.launch()
    page = browser.new_page()
    page.goto('https://example.com')

    # Evaluate an expression in the page and get its value back
    result = page.evaluate('() => document.title')
    print(f'标题: {result}')

    # The second argument is handed to the JavaScript function
    heading_script = 'value => document.querySelector("h1").textContent + value'
    result = page.evaluate(heading_script, '!')
    print(f'结果: {result}')

    # A JavaScript object comes back as a Python value
    info = page.evaluate('''() => {
        return {
            title: document.title,
            url: location.href,
            width: window.innerWidth,
            height: window.innerHeight
        }
    }''')
    print(info)

    # Evaluate against one element; it is passed in as `el`
    element = page.locator('div.content')
    result = element.evaluate('el => el.innerText')

    # Run a script for its side effect only
    page.evaluate('''() => {
        document.querySelector("button").click();
    }''')

    # Inject script tags, by URL and by inline content
    page.add_script_tag(url='https://cdn.example.com/script.js')
    page.add_script_tag(content='console.log("Hello");')

    browser.close()

Clock API(Playwright 1.45+)

Clock API 允许在测试中控制和操作时间,非常适合测试倒计时、定时器、日程提醒等时间相关功能:

基本用法

from playwright.sync_api import sync_playwright
import datetime

with sync_playwright() as playwright:
    browser = playwright.chromium.launch()
    page = browser.new_page()

    # Install the fake clock before navigating so the page starts at this moment.
    initial_time = datetime.datetime(2024, 2, 2, 8, 0, 0)
    page.clock.install(time=initial_time)

    page.goto('https://example.com')

    # Inside the page, Date.now() now starts from 2024-02-02 08:00:00.

    browser.close()

暂停和恢复时间

from playwright.sync_api import sync_playwright
import datetime

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()

    # Set the initial time
    page.clock.install(time=datetime.datetime(2024, 2, 2, 8, 0, 0))
    page.goto('https://example.com')

    # Jump to 10:00 and pause the clock there
    page.clock.pause_at(datetime.datetime(2024, 2, 2, 10, 0, 0))

    # Date.now() is now fixed at 10:00, so the time shown by the page can be checked.

    # Fast-forward by a duration (as if the user left and came back later)
    page.clock.fast_forward('30:00')  # 30 minutes, in "mm:ss" form

    # fast_forward() takes only a duration (milliseconds or a time string),
    # never a datetime. To move to an absolute moment use pause_at().
    page.clock.pause_at(datetime.datetime(2024, 2, 2, 11, 0, 0))

    browser.close()

运行定时器

from playwright.sync_api import sync_playwright
import datetime

with sync_playwright() as playwright:
    browser = playwright.chromium.launch()
    page = browser.new_page()

    clock_start = datetime.datetime(2024, 1, 1, 0, 0, 0)
    page.clock.install(time=clock_start)
    page.goto('https://example.com')

    # Advance the clock and run the timers that fall inside the interval.
    # The interval can be a time string...
    page.clock.run_for('05:00')  # 5 minutes

    # ...or a number of milliseconds.
    page.clock.run_for(60000)  # 1 minute

    browser.close()

测试倒计时场景

from playwright.sync_api import sync_playwright
import datetime

with sync_playwright() as playwright:
    browser = playwright.chromium.launch()
    page = browser.new_page()

    # Start the fake clock at a known moment.
    clock_start = datetime.datetime(2024, 1, 1, 0, 0, 0)
    page.clock.install(time=clock_start)

    page.goto('https://example.com/countdown')

    # The page is assumed to show a 10-minute countdown.
    # Skip the first 9 minutes...
    page.clock.fast_forward('09:00')

    # ...and check that one minute is left.
    countdown = page.locator('.countdown')
    remaining = countdown.text_content()
    assert '1:00' in remaining or '00:01' in remaining

    # Skip the final minute.
    page.clock.fast_forward('01:00')

    # The "ended" element should now be shown.
    ended_marker = page.locator('.countdown-ended')
    assert ended_marker.is_visible()

    browser.close()

Clock API 方法总结

| 方法 | 说明 |
| --- | --- |
| clock.install(time) | 安装时钟并设置初始时间 |
| clock.pause_at(time) | 跳到指定时刻并暂停时间 |
| clock.fast_forward(duration) | 快进指定时间(到期的定时器最多只触发一次) |
| clock.run_for(duration) | 快进并运行中间的所有定时器 |
| clock.resume() | 恢复时间流动 |
| clock.set_fixed_time(time) | 固定时间不动 |

截图和 PDF

截图

from playwright.sync_api import sync_playwright

with sync_playwright() as playwright:
    browser = playwright.chromium.launch()
    page = browser.new_page()
    page.goto('https://example.com')

    # Whole page, including the part outside the viewport
    page.screenshot(path='full-page.png', full_page=True)

    # Viewport only
    page.screenshot(path='viewport.png')

    # A single element
    content_block = page.locator('div.content')
    content_block.screenshot(path='element.png')

    # Without a path the image is only returned, as bytes
    screenshot_bytes = page.screenshot()

    # quality applies to JPEG output only
    jpeg_options = {'path': 'image.jpg', 'type': 'jpeg', 'quality': 80}
    page.screenshot(**jpeg_options)

    browser.close()

生成 PDF

from playwright.sync_api import sync_playwright

with sync_playwright() as playwright:
    browser = playwright.chromium.launch()
    page = browser.new_page()
    page.goto('https://example.com')

    # Portrait A4 with backgrounds and a 20 mm margin on every side
    margins = {
        'top': '20mm',
        'right': '20mm',
        'bottom': '20mm',
        'left': '20mm',
    }
    page.pdf(
        path='document.pdf',
        format='A4',
        print_background=True,
        margin=margins,
    )

    # Landscape A4
    page.pdf(path='document-landscape.pdf', format='A4', landscape=True)

    browser.close()

录制和调试

代码生成器

Playwright 提供代码生成器,可以录制操作并生成代码:

# Start the code generator
playwright codegen

# Record against a specific site
playwright codegen https://example.com

# Generate Python code
playwright codegen --target python https://example.com

# Save the generated code to a file
playwright codegen --target python -o script.py https://example.com

Trace Viewer

Trace Viewer 可以记录完整执行过程,用于调试:

from playwright.sync_api import sync_playwright

with sync_playwright() as playwright:
    browser = playwright.chromium.launch()
    context = browser.new_context()

    # Start recording, with screenshots, DOM snapshots and source files.
    trace_options = {'screenshots': True, 'snapshots': True, 'sources': True}
    context.tracing.start(**trace_options)

    page = context.new_page()
    page.goto('https://example.com')
    page.click('button')

    # Stop recording and write the trace archive.
    context.tracing.stop(path='trace.zip')

    browser.close()

# To inspect the trace, run:
#   playwright show-trace trace.zip

调试模式

from playwright.sync_api import sync_playwright

with sync_playwright() as playwright:
    # A visible window plus a 1000 ms pause between operations makes the run easy to follow.
    chromium = playwright.chromium
    browser = chromium.launch(headless=False, slow_mo=1000)
    page = browser.new_page()

    # Pause the script here and hand control to the debugger.
    page.pause()

    page.goto('https://example.com')

    browser.close()

完整爬虫示例

from playwright.sync_api import sync_playwright
import json
import time
import random

class PlaywrightSpider:
    """Small synchronous scraping helper built on Playwright's Chromium driver.

    Call start() before any page operation and close() when finished. The
    instance also works as a context manager that does both.
    """

    def __init__(self, headless=True):
        """Remember the launch mode; nothing is started until start() is called."""
        self.headless = headless
        # Every handle starts as None so close() is safe even when start()
        # never ran or failed part-way through.
        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None

    def __enter__(self):
        """Start the browser and return the spider."""
        try:
            self.start()
        except Exception:
            # Release whatever start() managed to create before it failed.
            self.close()
            raise
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser; exceptions from the with-body are not suppressed."""
        self.close()
        return False

    def start(self):
        """Launch Chromium and open one context with a single page."""
        self.playwright = sync_playwright().start()
        self.browser = self.playwright.chromium.launch(
            headless=self.headless,
            args=['--disable-blink-features=AutomationControlled'],
        )
        self.context = self.browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        )
        self.page = self.context.new_page()

    def close(self):
        """Release the context, the browser and the driver, in that order.

        Safe to call before start() and more than once. Each resource is
        attempted even if closing an earlier one raises.
        """
        context, browser, playwright = self.context, self.browser, self.playwright
        self.page = None
        self.context = None
        self.browser = None
        self.playwright = None
        try:
            if context:
                context.close()
        finally:
            try:
                if browser:
                    browser.close()
            finally:
                if playwright:
                    playwright.stop()

    def goto(self, url, wait_until='networkidle'):
        """Navigate to `url`, then sleep for a random 0.5-1.5 seconds."""
        self.page.goto(url, wait_until=wait_until)
        time.sleep(random.uniform(0.5, 1.5))

    def scroll_to_bottom(self):
        """Scroll to the bottom of the document once, then sleep for one second."""
        self.page.evaluate('''() => {
            window.scrollTo(0, document.body.scrollHeight);
        }''')
        time.sleep(1)

    def infinite_scroll(self, max_scrolls=10):
        """Scroll down repeatedly until the page height stops growing.

        Stops after `max_scrolls` rounds at the latest. Each round sleeps two
        seconds after scrolling before the height is measured.
        """
        last_height = 0
        scrolls = 0

        while scrolls < max_scrolls:
            self.page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(2)

            new_height = self.page.evaluate('document.body.scrollHeight')
            if new_height == last_height:
                break
            last_height = new_height
            scrolls += 1

    def wait_for_selector(self, selector, timeout=30000):
        """Wait up to `timeout` milliseconds for `selector` and return the result."""
        return self.page.wait_for_selector(selector, timeout=timeout)

    def extract_items(self, selector, fields):
        """Extract one dict per element matching `selector`.

        `fields` maps an output key to a spec dict with:
          - 'selector': selector evaluated relative to the matched element
          - 'type': 'text' (text content) or 'attribute'
          - 'attr': attribute name, required when type is 'attribute'

        A field whose element is missing, or whose extraction fails, is set
        to ''. A spec with any other 'type' leaves the key out of the item.
        """
        items = []
        elements = self.page.locator(selector)

        for index in range(elements.count()):
            element = elements.nth(index)
            item = {}

            for field, spec in fields.items():
                try:
                    kind = spec.get('type')
                    if kind not in ('text', 'attribute'):
                        continue
                    target = element.locator(spec['selector'])
                    # Check for presence first: reading from a locator with
                    # no match would block until the action timeout expires.
                    if target.count() == 0:
                        item[field] = ''
                    elif kind == 'text':
                        item[field] = target.text_content()
                    else:
                        item[field] = target.get_attribute(spec['attr'])
                except Exception:
                    # Best effort per field; KeyboardInterrupt and SystemExit
                    # still propagate, unlike with a bare except.
                    item[field] = ''

            items.append(item)

        return items

    def save_to_json(self, data, filename):
        """Write `data` to `filename` as indented UTF-8 JSON and print a summary line."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f'已保存 {len(data)} 条数据到 {filename}')


# Usage example
if __name__ == '__main__':
    # What to read from each Hacker News story row ('.athing').
    story_fields = {
        'title': {
            'selector': '.titleline > a',
            'type': 'text',
        },
        'link': {
            'selector': '.titleline > a',
            'type': 'attribute',
            'attr': 'href',
        },
        # '+ tr' moves to the row right after the story row.
        'score': {
            'selector': '+ tr .score',
            'type': 'text',
        },
    }

    spider = PlaywrightSpider(headless=True)
    try:
        spider.start()
        spider.goto('https://news.ycombinator.com/')
        items = spider.extract_items(selector='.athing', fields=story_fields)
        spider.save_to_json(items, 'news.json')
    finally:
        # Runs on success and on failure alike.
        spider.close()

小结

本章我们学习了:

  1. Playwright 简介 - 相比 Selenium 的优势
  2. 安装配置 - 安装 Playwright 和浏览器
  3. 浏览器配置 - launch、context、视口设置
  4. 元素定位 - CSS、XPath、文本选择器
  5. 元素操作 - 点击、填写、拖拽等
  6. 等待机制 - 自动等待和显式等待
  7. 网络拦截 - 拦截、修改、Mock 请求
  8. 多页面和框架 - 标签页和 iframe 处理
  9. 高级功能 - 截图、PDF、录制、调试

Playwright 是新一代浏览器自动化工具,其现代化的 API 和强大的功能使其成为网页爬虫和自动化测试的首选。

练习

  1. 使用 Playwright 登录一个需要账号的网站
  2. 编写一个处理无限滚动页面的爬虫
  3. 使用网络拦截功能 Mock API 响应
  4. 使用 Trace Viewer 调试你的爬虫脚本