Python Web Scraping Cheat Sheet

This is a quick reference of commonly used code snippets for web scraping in Python.

The requests Library

Sending Requests

import requests

# GET request
response = requests.get(url)
response = requests.get(url, params={'key': 'value'})
response = requests.get(url, headers={'User-Agent': '...'})

# POST request
response = requests.post(url, data={'key': 'value'})
response = requests.post(url, json={'key': 'value'})

# With cookies
response = requests.get(url, cookies={'session': 'xxx'})

# With a proxy
response = requests.get(url, proxies={'http': 'http://ip:port'})

# With a timeout
response = requests.get(url, timeout=10)

Handling Responses

response.status_code           # Status code
response.text                  # Body as text
response.json()                # Parse body as JSON
response.content               # Body as bytes
response.headers               # Response headers
response.cookies               # Cookies
response.raise_for_status()    # Raise on 4xx/5xx errors
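
These pieces are typically combined into a single defensive request. A minimal sketch, assuming the URL is a placeholder and the endpoint returns JSON:

import requests

headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get('https://example.com/api', headers=headers, timeout=10)
response.raise_for_status()   # Stop early on 4xx/5xx
data = response.json()        # Parse the JSON body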

BeautifulSoup

Creating a Soup Object

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')           # From an HTML string
soup = BeautifulSoup(file, 'lxml')           # From an open file object
soup = BeautifulSoup(response.text, 'lxml')  # From a requests response

Finding Elements

soup.find('div')                  # First div
soup.find_all('a')                # All a tags
soup.select('div.content a')      # CSS selector

# Common selectors
soup.select('.class')             # Class selector
soup.select('#id')                # ID selector
soup.select('a[href]')            # Attribute selector
soup.select('div > a')            # Direct-child selector

Extracting Content

element.get_text(strip=True)      # Get text
element['href']                   # Get an attribute (KeyError if missing)
element.get('href', 'default')    # Get an attribute with a default
element.attrs                     # All attributes as a dict
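
Putting finding and extracting together on a tiny, made-up snippet of HTML:

from bs4 import BeautifulSoup

html = '<div class="content"><a href="/a">One</a> <a href="/b">Two</a></div>'
soup = BeautifulSoup(html, 'lxml')  # 'lxml' needs the lxml package; 'html.parser' works without it
for a in soup.select('div.content a'):
    print(a.get_text(strip=True), a.get('href'))  # One /a, then Two /b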

Regular Expressions

Common Patterns

import re

re.findall(r'\d+', text)           # Extract all numbers
re.search(r'\d+', text).group()    # First number (AttributeError if no match)
re.sub(r'\d+', 'NUM', text)        # Replace numbers
re.match(r'^http', url)            # Match at the start of the string
re.split(r'\s+', text)             # Split on whitespace

Common Metacharacters

Pattern   Meaning
\d        Digit
\w        Word character
\s        Whitespace character
.         Any character
*         0 or more
+         1 or more
?         0 or 1
^         Start of string
$         End of string
[]        Character set
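
A quick demonstration of a few of these tokens on a made-up string:

import re

text = 'Order 42 shipped on 2024-01-15 to user_7'
re.findall(r'\d+', text)                        # ['42', '2024', '01', '15', '7']
re.search(r'\d{4}-\d{2}-\d{2}', text).group()   # '2024-01-15'
re.sub(r'\s+', ' ', '  messy   text ').strip()  # 'messy text'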

Dealing with Anti-Scraping Measures

User-Agent

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

Request Delays

import time
import random

time.sleep(random.uniform(1, 3))  # Random delay of 1 to 3 seconds

Proxies

proxies = {
    'http': 'http://ip:port',
    'https': 'http://ip:port'
}

Session

session = requests.Session()
session.headers.update({'User-Agent': '...'})
session.get(url)

Data Storage

Saving as JSON

import json

# Write
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

# Read
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

Saving as CSV

import csv

with open('data.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.DictWriter(f, fieldnames=['title', 'price'])
    writer.writeheader()
    writer.writerows(items)

Saving to SQLite

import sqlite3

conn = sqlite3.connect('data.db')
cursor = conn.cursor()
cursor.execute('CREATE TABLE IF NOT EXISTS items (col1 TEXT, col2 TEXT)')
cursor.execute('INSERT INTO items (col1, col2) VALUES (?, ?)', (val1, val2))
conn.commit()
conn.close()

Async Scraping (aiohttp)

Basic Usage

import aiohttp
import asyncio

async def fetch(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()

html = asyncio.run(fetch(url))

Concurrent Requests

async def fetch_all(urls):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_one(session, url) for url in urls]
        return await asyncio.gather(*tasks)

async def fetch_one(session, url):
    async with session.get(url) as response:
        return await response.text()

Limiting Concurrency

semaphore = asyncio.Semaphore(10)

async def fetch_with_limit(session, url):
    async with semaphore:
        async with session.get(url) as response:
            return await response.text()

Setting Timeouts

from aiohttp import ClientTimeout

timeout = ClientTimeout(total=30)
async with aiohttp.ClientSession(timeout=timeout) as session:
    async with session.get(url) as response:
        html = await response.text()

Common Tasks

Extracting All Links

for a in soup.select('a'):
    print(a.get('href'))

Extracting Table Data

rows = soup.select('table tr')
data = []
for row in rows[1:]:  # Skip the header row
    cols = row.select('td')
    data.append([col.get_text(strip=True) for col in cols])

Paginated Scraping

for page in range(1, 11):
    url = f'https://example.com?page={page}'
    response = requests.get(url)
    # Process the response

Downloading Images

response = requests.get(img_url, stream=True)
with open('image.jpg', 'wb') as f:
    for chunk in response.iter_content(8192):
        f.write(chunk)

Scrapy Commands

# Create a project
scrapy startproject myproject

# Generate a spider
scrapy genspider myspider example.com

# Run a spider
scrapy crawl myspider

# Save scraped items
scrapy crawl myspider -o items.json

# Interactive shell
scrapy shell https://example.com
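
For reference, the spider that genspider scaffolds ends up looking roughly like this once filled in; the selectors here are illustrative, not part of the generated template:

import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['https://example.com']

    def parse(self, response):
        # Yield one item per link on the page
        for a in response.css('a'):
            yield {
                'text': a.css('::text').get(),
                'href': a.attrib.get('href'),
            }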

HTTP Status Codes

Code   Meaning
200    OK
301    Permanent redirect
302    Temporary redirect
400    Bad request
401    Authentication required
403    Forbidden
404    Not found
429    Too many requests
500    Server error
502    Bad gateway
503    Service unavailable
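
429 in particular is a signal to slow down. A minimal retry sketch, assuming the URL is a placeholder; note that the Retry-After header may be absent or may be an HTTP date rather than a number of seconds:

import time
import requests

url = 'https://example.com'
for attempt in range(5):
    response = requests.get(url, timeout=10)
    if response.status_code == 429:
        # Honor Retry-After if present, otherwise back off exponentially
        # (int() assumes the header carries seconds, not a date)
        wait = int(response.headers.get('Retry-After', 2 ** attempt))
        time.sleep(wait)
        continue
    response.raise_for_status()
    break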

Selenium

Initialization

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
options.add_argument('--headless')  # Headless mode
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

Locating Elements

from selenium.webdriver.common.by import By

driver.find_element(By.ID, 'username')
driver.find_element(By.NAME, 'password')
driver.find_element(By.CLASS_NAME, 'btn')
driver.find_element(By.TAG_NAME, 'input')
driver.find_element(By.CSS_SELECTOR, 'div.content a')
driver.find_element(By.XPATH, '//div[@class="content"]//a')
driver.find_element(By.LINK_TEXT, 'Login')
driver.find_element(By.PARTIAL_LINK_TEXT, 'Log')

Waits

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Implicit wait
driver.implicitly_wait(10)

# Explicit waits
wait = WebDriverWait(driver, 10)
element = wait.until(EC.presence_of_element_located((By.ID, 'content')))
element = wait.until(EC.visibility_of_element_located((By.ID, 'content')))
element = wait.until(EC.element_to_be_clickable((By.ID, 'submit')))

Element Actions

element.click()                    # Click
element.send_keys('text')          # Type text
element.clear()                    # Clear input
element.text                       # Get text
element.get_attribute('href')      # Get an attribute

Page Actions

driver.get(url)                    # Navigate to a page
driver.back()                      # Go back
driver.forward()                   # Go forward
driver.refresh()                   # Refresh
driver.title                       # Page title
driver.current_url                 # Current URL
driver.page_source                 # Page source
driver.quit()                      # Quit the browser

Windows and Frames

# Multiple windows
main_window = driver.current_window_handle
driver.switch_to.window(window_handle)

# iframe
driver.switch_to.frame('frame_id')
driver.switch_to.default_content()

Best Practices

  1. Set a User-Agent: mimic a real browser
  2. Add delays: avoid hammering the server
  3. Handle exceptions: networks are unreliable (see the sketch after this list)
  4. Follow the rules: check robots.txt
  5. Save progress: avoid re-scraping pages you already have
  6. Use a Session: keeps cookies across requests
  7. Handle encodings: watch for non-UTF-8 pages (e.g., Chinese encodings such as GBK)
  8. Pick the right tool: requests for static pages, Selenium for dynamic ones
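
A minimal sketch that combines several of these practices; the URL list is a placeholder:

import time
import random
import requests

session = requests.Session()                           # Practice 6: reuse cookies
session.headers.update({'User-Agent': 'Mozilla/5.0'})  # Practice 1

urls = ['https://example.com/page/1', 'https://example.com/page/2']
for url in urls:
    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:             # Practice 3: expect failures
        print(f'Failed {url}: {e}')
        continue
    response.encoding = response.apparent_encoding     # Practice 7: detect encoding
    # ... parse response.text here ...
    time.sleep(random.uniform(1, 3))                   # Practice 2: polite delay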