
The Scrapy Framework

Scrapy is a powerful, production-grade web crawling framework for Python. It provides a complete scraping solution, including request scheduling, data processing, and storage.

Official Documentation

This tutorial is based on the Scrapy official documentation.

Installing Scrapy

pip install scrapy

# Verify the installation
scrapy version

Creating a Project

# Create a new project
scrapy startproject myspider

# Project structure
myspider/
├── myspider/              # Python package
│   ├── __init__.py
│   ├── items.py           # Data structure definitions
│   ├── middlewares.py     # Middlewares
│   ├── pipelines.py       # Item processing pipelines
│   ├── settings.py        # Project settings
│   └── spiders/           # Spiders directory
│       └── __init__.py
└── scrapy.cfg             # Deploy configuration file

Core Components

Scrapy is built from a handful of cooperating components: the Engine drives the data flow, the Scheduler queues requests, the Downloader fetches pages, Spiders parse responses and yield items or new requests, Item Pipelines post-process and store the items, and downloader/spider middlewares hook into the request/response cycle.

Writing Spiders

Defining an Item

# myspider/items.py
import scrapy

class BookItem(scrapy.Item):
    """Book information"""
    title = scrapy.Field()         # Title
    author = scrapy.Field()        # Author
    price = scrapy.Field()         # Price
    rating = scrapy.Field()        # Rating
    publisher = scrapy.Field()     # Publisher
    publish_date = scrapy.Field()  # Publication date
    isbn = scrapy.Field()          # ISBN
    url = scrapy.Field()           # Detail page URL
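
For per-field cleaning, Scrapy's ItemLoader can populate an Item through input/output processors instead of manual assignments. A minimal sketch (BookLoader and the selectors are illustrative, matching the spider below):

# Sketch: populating BookItem with an ItemLoader
from scrapy.loader import ItemLoader
from itemloaders.processors import MapCompose, TakeFirst

from myspider.items import BookItem

class BookLoader(ItemLoader):
    default_output_processor = TakeFirst()  # return single values, like .get()
    title_in = MapCompose(str.strip)        # strip whitespace on input

# Inside a spider callback:
def parse_with_loader(self, response):
    loader = BookLoader(item=BookItem(), response=response)
    loader.add_css('title', 'h3.title::text')
    loader.add_css('author', 'span.author::text')
    loader.add_value('url', response.url)
    yield loader.load_item()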

Writing the Spider

# myspider/spiders/book_spider.py
import scrapy
from myspider.items import BookItem

class BookSpider(scrapy.Spider):
    name = 'book_spider'               # Spider name
    allowed_domains = ['example.com']  # Allowed domains

    # Start URLs
    start_urls = [
        'https://example.com/books?page=1',
    ]

    def parse(self, response):
        """Parse a listing page"""
        # Select every book entry
        for book in response.css('div.book-item'):
            # Create an Item object
            item = BookItem()

            # Extract data
            item['title'] = book.css('h3.title::text').get()
            item['author'] = book.css('span.author::text').get()
            item['price'] = book.css('span.price::text').get()
            item['rating'] = book.css('span.rating::text').get()
            item['url'] = response.urljoin(book.css('a::attr(href)').get())

            # Yield the Item (it will be sent to the pipelines)
            yield item

            # To scrape the detail page instead, follow it with a callback:
            # yield response.follow(item['url'], self.parse_detail)

        # Pagination
        next_page = response.css('a.next-page::attr(href)').get()
        if next_page:
            # Continue with the next page
            yield response.follow(next_page, self.parse)

    def parse_detail(self, response):
        """Parse a detail page (wired up via response.follow, see above)"""
        item = BookItem()
        item['title'] = response.css('h1.title::text').get()
        item['publisher'] = response.css('span.publisher::text').get()
        item['publish_date'] = response.css('span.publish-date::text').get()
        item['isbn'] = response.css('span.isbn::text').get()
        item['url'] = response.url
        yield item

Running the Spider

# Enter the project directory
cd myspider

# Run the spider
scrapy crawl book_spider

# Export to a file (-o appends; use -O to overwrite, Scrapy 2.0+)
scrapy crawl book_spider -o books.json
scrapy crawl book_spider -o books.csv
scrapy crawl book_spider -o books.xml

# Export as JSON Lines (recommended)
scrapy crawl book_spider -o books.jl

# Set the log level
scrapy crawl book_spider --loglevel=INFO

# List available commands
scrapy -h
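
Spiders can also be launched from a plain Python script via CrawlerProcess, which is convenient for debugging in an IDE. A minimal sketch using the BookSpider above:

# run.py - run the spider without the scrapy CLI
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from myspider.spiders.book_spider import BookSpider

process = CrawlerProcess(get_project_settings())
process.crawl(BookSpider)
process.start()  # blocks until the crawl finishes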

Selectors

CSS Selectors

# Extract text
response.css('title::text').get()       # First match
response.css('title::text').getall()    # All matches

# Extract attributes
response.css('img::attr(src)').get()    # Single attribute
response.css('a::attr(href)').getall()  # All attributes

# Select by class and id
response.css('.title')   # class="title"
response.css('#title')   # id="title"

# Hierarchy
response.css('div.content p')    # Descendants
response.css('div.content > p')  # Direct children
response.css('div.content + p')  # Adjacent sibling

# Pseudo-classes
response.css('li:first-child').get()
response.css('li:nth-child(2)').get()
response.css('a:contains("Python")').get()  # Contains text (non-standard, supported by cssselect)

XPath Selectors

# Basics
response.xpath('//title').get()
response.xpath('//title/text()').get()

# Extract attributes
response.xpath('//img/@src').get()

# Conditions
response.xpath('//div[@class="content"]')
response.xpath('//a[contains(@href, "book")]')

# Multiple paths
response.xpath('//div | //span')
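
Both selector flavors also support regular-expression extraction via .re() and .re_first():

# Apply a regex to the matched text
response.css('span.price::text').re_first(r'[\d.]+')           # first match or None
response.xpath('//span[@class="price"]/text()').re(r'[\d.]+')  # list of all matches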

Item Pipeline

Processing Data

# myspider/pipelines.py
import json
import re

from scrapy.exceptions import DropItem


class BookPipeline:
    """Clean up book data"""

    def process_item(self, item, spider):
        """Process each Item"""
        # Strip whitespace from the title
        if item.get('title'):
            item['title'] = item['title'].strip()

        if item.get('price'):
            # Extract the numeric part of the price
            price = re.search(r'[\d.]+', item['price'])
            if price:
                item['price'] = float(price.group())

        return item


class DuplicatesPipeline:
    """Drop duplicate items"""

    def __init__(self):
        self.urls_seen = set()

    def process_item(self, item, spider):
        url = item.get('url')
        if url in self.urls_seen:
            spider.logger.debug(f'Duplicate item found: {url}')
            # Discard the item
            raise DropItem(f'Duplicate: {url}')

        self.urls_seen.add(url)
        return item


class SaveToFilePipeline:
    """Write items to a file"""

    def open_spider(self, spider):
        self.file = open('books.jl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()

Enabling Pipelines

Enable them in settings.py (lower numbers run first):

# myspider/settings.py

ITEM_PIPELINES = {
    'myspider.pipelines.BookPipeline': 100,
    'myspider.pipelines.DuplicatesPipeline': 200,
    'myspider.pipelines.SaveToFilePipeline': 300,
}

Requests and Responses

FormRequest (Form Submission)

# Submit a form
def parse_login(self, response):
    return scrapy.FormRequest(
        'https://example.com/login',
        formdata={
            'username': 'user',
            'password': 'pass'
        },
        callback=self.parse_after_login
    )

# Pre-fill from a form found in the response
def parse_search(self, response):
    return scrapy.FormRequest.from_response(
        response,
        formdata={'keyword': 'Python'},
        callback=self.parse
    )

JSON Requests

# Send a JSON POST request (inside a spider callback)
import json

return scrapy.Request(
    url='https://api.example.com/data',
    method='POST',
    body=json.dumps({'query': 'Python'}),
    headers={'Content-Type': 'application/json'},
    callback=self.parse_json
)
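
Scrapy also ships scrapy.http.JsonRequest, which serializes the body and sets the Content-Type header for you (the method defaults to POST when data is given):

from scrapy.http import JsonRequest

yield JsonRequest(
    url='https://api.example.com/data',
    data={'query': 'Python'},  # serialized to JSON automatically
    callback=self.parse_json
)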

Downloader Middleware

Setting a Proxy

# myspider/middlewares.py
import random


class ProxyMiddleware:
    """Proxy middleware"""

    def __init__(self):
        self.proxy_list = [
            'http://user:pass@ip1:port',
            'http://user:pass@ip2:port',
        ]

    def process_request(self, request, spider):
        # Pick a random proxy
        proxy = random.choice(self.proxy_list)
        request.meta['proxy'] = proxy


class CookieMiddleware:
    """Cookie middleware"""

    def process_request(self, request, spider):
        # Set cookies on the outgoing request
        request.cookies['session_id'] = 'abc123'
        request.cookies['user_token'] = 'xyz789'

Enable them in settings.py:

DOWNLOADER_MIDDLEWARES = {
    'myspider.middlewares.ProxyMiddleware': 100,
    'myspider.middlewares.CookieMiddleware': 101,
}
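
Besides process_request, a downloader middleware can also hook into responses and exceptions. A sketch of the full interface (the logging here is purely illustrative):

class LoggingMiddleware:
    """Illustrates all three downloader-middleware hooks"""

    def process_request(self, request, spider):
        # Return None to let the request continue through the chain
        spider.logger.debug(f'Requesting {request.url}')

    def process_response(self, request, response, spider):
        # Must return a Response, or a Request to reschedule
        if response.status >= 400:
            spider.logger.warning(f'Got {response.status} for {request.url}')
        return response

    def process_exception(self, request, exception, spider):
        # Return None to let other middlewares / errbacks handle it
        spider.logger.error(f'{exception!r} while fetching {request.url}')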

Crawl Configuration

Common settings.py Options

# myspider/settings.py

# Bot name
BOT_NAME = 'myspider'

# Number of concurrent requests
CONCURRENT_REQUESTS = 16

# Download delay (seconds)
DOWNLOAD_DELAY = 1

# Whether to obey robots.txt
ROBOTSTXT_OBEY = True

# Default request headers
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 ...',
    'Accept': 'text/html,application/xhtml+xml',
}

# Enable pipelines
ITEM_PIPELINES = {
    'myspider.pipelines.MyspiderPipeline': 300,
}

# Download timeout
DOWNLOAD_TIMEOUT = 30

# Retries
RETRY_ENABLED = True
RETRY_TIMES = 3
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]

# HTTP proxy middleware (enabled by default)
HTTPPROXY_ENABLED = True
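
Instead of a fixed DOWNLOAD_DELAY, the built-in AutoThrottle extension adapts the delay to observed server latency; it is enabled entirely through settings:

# Adjust the download delay dynamically based on server load
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 10
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0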

Data Storage

Saving to MongoDB

# pipelines.py
import pymongo

class MongoDBPipeline:
    """Save items to MongoDB"""

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        self.db.books.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()

Configure it in settings.py:

MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'books'

ITEM_PIPELINES = {
    'myspider.pipelines.MongoDBPipeline': 400,
}
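
If re-crawls should update existing documents instead of inserting duplicates, process_item can upsert keyed on the URL. A sketch of that variant (assumes every item carries a url field):

    def process_item(self, item, spider):
        # Upsert keyed on url so repeated crawls update rather than duplicate
        self.db.books.update_one(
            {'url': item.get('url')},
            {'$set': dict(item)},
            upsert=True
        )
        return item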

Saving to MySQL

# pipelines.py
import pymysql

class MySQLPipeline:
    """Save items to MySQL (assumes a `books` table with title, author, price, url columns)"""

    def __init__(self, settings):
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def open_spider(self, spider):
        self.connection = pymysql.connect(
            host=self.settings.get('MYSQL_HOST'),
            user=self.settings.get('MYSQL_USER'),
            password=self.settings.get('MYSQL_PASSWORD'),
            database=self.settings.get('MYSQL_DATABASE'),
            charset='utf8mb4'
        )
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        sql = """
            INSERT INTO books (title, author, price, url)
            VALUES (%s, %s, %s, %s)
        """
        self.cursor.execute(sql, (
            item.get('title'),
            item.get('author'),
            item.get('price'),
            item.get('url')
        ))
        self.connection.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connection.close()

Advanced Usage

Async Support

import asyncio

import scrapy

class AsyncSpider(scrapy.Spider):
    name = 'async_spider'

    async def parse(self, response):
        # Callbacks may be coroutines (Scrapy 2.0+)
        await asyncio.sleep(1)

        # Extract data
        yield {'title': response.css('title::text').get()}
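
Note that awaiting asyncio coroutines requires Twisted's asyncio reactor, which is the default in projects generated by Scrapy 2.7+; older projects can enable it in settings.py:

# settings.py - required for awaiting asyncio coroutines
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'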

Distributed Crawling

Use scrapy-redis to distribute a crawl across multiple machines:

pip install scrapy-redis

Configure settings.py:

# Use the Redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Persist the request queue between runs
SCHEDULER_PERSIST = True

# Deduplicate requests via Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Redis connection
REDIS_URL = 'redis://localhost:6379'
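
With the scheduler configured, spiders typically inherit from scrapy-redis's RedisSpider and read their start URLs from a Redis list instead of start_urls. A minimal sketch (spider and key names are illustrative):

# myspider/spiders/distributed_spider.py
from scrapy_redis.spiders import RedisSpider

class DistributedBookSpider(RedisSpider):
    name = 'distributed_books'
    # Pop start URLs from this Redis list, fed with e.g.:
    #   redis-cli lpush distributed_books:start_urls https://example.com/books
    redis_key = 'distributed_books:start_urls'

    def parse(self, response):
        yield {'title': response.css('title::text').get()}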

Command-Line Tools

# Create a project
scrapy startproject projectname

# Generate a spider
scrapy genspider myspider example.com

# List available spiders
scrapy list

# Run a spider
scrapy crawl myspider

# Interactive shell
scrapy shell https://example.com

# Run contract checks on spiders
scrapy check

# Export data
scrapy crawl myspider -o output.json
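
The interactive shell is the quickest way to experiment with selectors: it fetches the page and drops you into a Python session with a ready-made response object, plus helpers such as fetch() and view():

# Inside `scrapy shell https://example.com`
response.css('title::text').get()     # try a CSS selector
response.xpath('//h1/text()').get()   # try an XPath selector
fetch('https://example.com/books')    # fetch another page in place
view(response)                        # open the current response in a browser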

Summary

In this chapter we covered:

  1. Scrapy basics - project structure and core components
  2. Writing spiders - defining Items and crawl logic
  3. Selectors - CSS and XPath
  4. Pipelines - data processing and storage
  5. Middlewares - request/response handling
  6. Configuration - common settings
  7. Storage - MongoDB and MySQL

Exercises

  1. Use Scrapy to crawl a news site
  2. Store the scraped data in MongoDB
  3. Configure a proxy middleware