Scrapy 框架
Scrapy 是 Python 最强大的专业爬虫框架,提供了完整的爬虫解决方案,包括请求调度、数据处理、存储等功能。
官方文档
本教程内容基于 Scrapy 官方文档。
安装 Scrapy
pip install scrapy
# 验证安装
scrapy version
创建项目
# 创建新项目
scrapy startproject myspider
# 项目结构
myspider/
├── myspider/ # Python 包
│ ├── __init__.py
│ ├── items.py # 定义数据结构
│ ├── middlewares.py # 中间件
│ ├── pipelines.py # 数据处理管道
│ ├── settings.py # 配置文件
│ └── spiders/ # 爬虫目录
│ └── __init__.py
└── scrapy.cfg # 部署/入口配置文件(区别于 settings.py 的爬虫运行配置)
核心组件
编写爬虫
定义 Item
# myspider/items.py
import scrapy
class BookItem(scrapy.Item):
    """Container for one scraped book record.

    Listing pages fill title/author/price/rating/url; detail pages fill
    publisher/publish_date/isbn.
    """

    title = scrapy.Field()         # book title
    author = scrapy.Field()        # author name
    price = scrapy.Field()         # price as scraped (string until a pipeline parses it)
    rating = scrapy.Field()        # rating text
    publisher = scrapy.Field()     # publisher name
    publish_date = scrapy.Field()  # publication date
    isbn = scrapy.Field()          # ISBN
    url = scrapy.Field()           # absolute URL of the detail page
编写 Spider
# myspider/spiders/book_spider.py
import scrapy
from myspider.items import BookItem
class BookSpider(scrapy.Spider):
    """Crawl paginated book listings and yield BookItem records.

    ``parse`` handles listing pages; ``parse_detail`` handles individual
    book pages (publisher / ISBN metadata).
    """

    name = 'book_spider'                 # unique spider name used by `scrapy crawl`
    allowed_domains = ['example.com']    # off-site links are filtered out
    start_urls = [
        'https://example.com/books?page=1',
    ]

    def parse(self, response):
        """Parse one listing page: yield a summary item per book, follow
        each detail page, then paginate."""
        for book in response.css('div.book-item'):
            detail_href = book.css('a::attr(href)').get()

            item = BookItem()
            item['title'] = book.css('h3.title::text').get()
            item['author'] = book.css('span.author::text').get()
            item['price'] = book.css('span.price::text').get()
            item['rating'] = book.css('span.rating::text').get()
            # Guard: urljoin(None) would raise when a card has no link.
            item['url'] = response.urljoin(detail_href) if detail_href else None
            yield item

            # Fix: parse_detail was dead code — nothing ever pointed a
            # callback at it, so the publisher/ISBN fields declared on
            # BookItem were never scraped. Follow the detail page here.
            if detail_href:
                yield response.follow(detail_href, callback=self.parse_detail)

        # Pagination: follow the "next" link until it disappears.
        next_page = response.css('a.next-page::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)

    def parse_detail(self, response):
        """Parse a single book's detail page for publisher/ISBN metadata."""
        item = BookItem()
        item['title'] = response.css('h1.title::text').get()
        item['publisher'] = response.css('span.publisher::text').get()
        item['publish_date'] = response.css('span.publish-date::text').get()
        item['isbn'] = response.css('span.isbn::text').get()
        item['url'] = response.url
        yield item
运行爬虫
# 进入项目目录
cd myspider
# 运行爬虫
scrapy crawl book_spider
# 保存到文件
scrapy crawl book_spider -o books.json
scrapy crawl book_spider -o books.csv
scrapy crawl book_spider -o books.xml
# 保存为 JSON Lines 格式(推荐)
scrapy crawl book_spider -o books.jl
# 设置日志级别
scrapy crawl book_spider --loglevel=INFO
# 查看可用命令
scrapy -h
选择器
CSS 选择器
# Extract text
response.css('title::text').get()        # first matching text node
response.css('title::text').getall()     # all matching text nodes
# Extract attributes
response.css('img::attr(src)').get()     # first attribute value
response.css('a::attr(href)').getall()   # all attribute values
# Select by class / id
response.css('.title')                   # class="title"
response.css('#title')                   # id="title"
# Hierarchy
response.css('div.content p')            # any descendant
response.css('div.content > p')          # direct child only
response.css('div.content + p')          # adjacent sibling
# Pseudo-classes
response.css('li:first-child').get()
response.css('li:nth-child(2)').get()
# Fix: ':contains()' is NOT valid CSS — Scrapy's cssselect backend rejects
# it. Match elements by their text with XPath instead:
response.xpath('//a[contains(text(), "Python")]').get()
XPath 选择器
# Basic usage: .get() returns the first match (or None)
response.xpath('//title').get()
response.xpath('//title/text()').get()
# Extract an attribute value with @
response.xpath('//img/@src').get()
# Predicates: filter by exact attribute value or by substring
response.xpath('//div[@class="content"]')
response.xpath('//a[contains(@href, "book")]')
# Union of two node sets
response.xpath('//div | //span')
Item Pipeline
处理数据
# myspider/pipelines.py
class BookPipeline:
    """Normalise scraped book fields before storage."""

    def process_item(self, item, spider):
        """Strip whitespace from the title and coerce price to float."""
        title = item.get('title')
        if title:
            item['title'] = title.strip()

        raw_price = item.get('price')
        if raw_price:
            import re
            # Pull the first numeric run out of e.g. "¥59.90".
            match = re.search(r'[\d.]+', raw_price)
            if match:
                item['price'] = float(match.group())

        return item
class DuplicatesPipeline:
    """Drop items whose URL has already been seen during this crawl."""

    def __init__(self):
        # URLs emitted so far; set membership is O(1).
        self.urls_seen = set()

    def process_item(self, item, spider):
        """Return the item, or raise DropItem if its URL is a duplicate."""
        url = item.get('url')
        if url in self.urls_seen:
            spider.logger.debug(f'Duplicate item found: {url}')
            # Fix: this module never imported `scrapy`, so the original
            # `scrapy.exceptions.DropItem` raised NameError instead of
            # dropping the item. Import the exception explicitly.
            from scrapy.exceptions import DropItem
            raise DropItem(f'Duplicate: {url}')
        self.urls_seen.add(url)
        return item
class SaveToFilePipeline:
    """Append every item to books.jl, one JSON object per line."""

    def open_spider(self, spider):
        # Fix: open with an explicit encoding so non-ASCII titles round-trip
        # regardless of the platform's default encoding.
        self.file = open('books.jl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        import json
        # ensure_ascii=False keeps CJK text human-readable in the file
        # instead of \uXXXX escapes.
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()
启用 Pipeline
在 settings.py 中启用:
# myspider/settings.py
ITEM_PIPELINES = {
'myspider.pipelines.BookPipeline': 100,
'myspider.pipelines.DuplicatesPipeline': 200,
'myspider.pipelines.SaveToFilePipeline': 300,
}
请求与响应
FormRequest(表单请求)
# 提交表单
def parse_login(self, response):
    """Submit the login form, then continue in parse_after_login."""
    credentials = {
        'username': 'user',
        'password': 'pass',
    }
    return scrapy.FormRequest(
        'https://example.com/login',
        formdata=credentials,
        callback=self.parse_after_login,
    )
# 从表单自动提取
def parse_search(self, response):
form = response.form
form['keyword'] = 'Python'
return form.submit()
JSON 请求
# Send a POST request whose body is a JSON document.
def start_api_request(self):
    """Build a JSON POST request; the reply is handled in parse_json."""
    import json
    # NOTE(review): recent Scrapy also ships scrapy.http.JsonRequest,
    # which serialises the body and sets the header for you — confirm the
    # installed version before switching.
    return scrapy.Request(
        url='https://api.example.com/data',
        method='POST',
        body=json.dumps({'query': 'Python'}),
        headers={'Content-Type': 'application/json'},
        callback=self.parse_json,
    )
下载器中间件
设置代理
# myspider/middlewares.py
class ProxyMiddleware:
    """Route each outgoing request through a randomly chosen proxy."""

    def __init__(self):
        # Pool of proxy endpoints (credentials embedded in the URL).
        self.proxy_list = [
            'http://user:pass@ip1:port',
            'http://user:pass@ip2:port',
        ]

    def process_request(self, request, spider):
        """Pick a proxy at random and attach it via request.meta."""
        # Fix: the original used `random` without ever importing it,
        # which raised NameError at runtime.
        import random
        request.meta['proxy'] = random.choice(self.proxy_list)
设置 Cookie
class CookieMiddleware:
    """Attach fixed session cookies to every outgoing request."""

    def process_request(self, request, spider):
        # Merge the static cookies into whatever the request already carries.
        request.cookies.update({
            'session_id': 'abc123',
            'user_token': 'xyz789',
        })
在 settings.py 中启用:
DOWNLOADER_MIDDLEWARES = {
'myspider.middlewares.ProxyMiddleware': 100,
}
爬取配置
settings.py 常用配置
# myspider/settings.py — commonly tuned Scrapy settings
# Bot/project name, used in logs and the default User-Agent.
BOT_NAME = 'myspider'
# Maximum concurrent requests performed by the downloader.
CONCURRENT_REQUESTS = 16
# Delay in seconds between requests — throttles the crawl.
DOWNLOAD_DELAY = 1
# Whether to respect robots.txt rules.
ROBOTSTXT_OBEY = True
# Headers merged into every outgoing request.
# NOTE(review): Scrapy has a dedicated USER_AGENT setting; putting the
# agent here works but USER_AGENT is the conventional place — confirm.
DEFAULT_REQUEST_HEADERS = {
'User-Agent': 'Mozilla/5.0 ...',
'Accept': 'text/html,application/xhtml+xml',
}
# Enabled item pipelines; lower number = runs earlier.
ITEM_PIPELINES = {
'myspider.pipelines.MyspiderPipeline': 300,
}
# Download timeout in seconds.
DOWNLOAD_TIMEOUT = 30
# Retry failed requests.
RETRY_ENABLED = True
RETRY_TIMES = 3
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]
# Enable the HTTP proxy downloader middleware.
HTTPPROXY_ENABLED = True
数据存储
保存到 MongoDB
# pipelines.py
import pymongo
class MongoDBPipeline:
    """Persist every scraped item into the MongoDB 'books' collection."""

    @classmethod
    def from_crawler(cls, crawler):
        # Alternate constructor: read connection parameters from settings.py.
        settings = crawler.settings
        return cls(
            mongo_uri=settings.get('MONGO_URI'),
            mongo_db=settings.get('MONGO_DATABASE'),
        )

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    def open_spider(self, spider):
        # One client per crawl; released in close_spider.
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        self.db.books.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
在 settings.py 中配置:
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'books'
ITEM_PIPELINES = {
'myspider.pipelines.MongoDBPipeline': 400,
}
保存到 MySQL
# pipelines.py
import pymysql
class MySQLPipeline:
    """Insert each scraped book as a row in the MySQL 'books' table."""

    def __init__(self, settings):
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):
        # Alternate constructor wired up by Scrapy.
        return cls(crawler.settings)

    def open_spider(self, spider):
        cfg = self.settings
        self.connection = pymysql.connect(
            host=cfg.get('MYSQL_HOST'),
            user=cfg.get('MYSQL_USER'),
            password=cfg.get('MYSQL_PASSWORD'),
            database=cfg.get('MYSQL_DATABASE'),
            charset='utf8mb4',
        )
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        # Parameterised query: values are escaped by the driver, never
        # interpolated into the SQL string.
        sql = """
INSERT INTO books (title, author, price, url)
VALUES (%s, %s, %s, %s)
"""
        params = (
            item.get('title'),
            item.get('author'),
            item.get('price'),
            item.get('url'),
        )
        self.cursor.execute(sql, params)
        self.connection.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connection.close()
进阶用法
异步处理
import asyncio
class AsyncSpider(scrapy.Spider):
    """Demonstrates a coroutine callback — Scrapy awaits async parse methods."""

    name = 'async_spider'

    async def parse(self, response):
        # Any awaitable can be used inside an async callback.
        await asyncio.sleep(1)
        yield {'title': response.css('title::text').get()}
分布式爬取
使用 scrapy-redis 实现分布式爬取:
pip install scrapy-redis
配置 settings.py:
# Use the Redis-backed scheduler so multiple workers share one queue
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the request queue in Redis between runs (resumable crawls)
SCHEDULER_PERSIST = True
# Deduplicate request fingerprints in Redis instead of in-process
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Redis connection URL
REDIS_URL = 'redis://localhost:6379'
命令行工具
# 创建项目
scrapy startproject projectname
# 创建爬虫
scrapy genspider myspider example.com
# 查看可用爬虫
scrapy list
# 运行爬虫
scrapy crawl myspider
# shell 交互
scrapy shell https://example.com
# 检查爬虫
scrapy check
# 导出数据
scrapy crawl myspider -o output.json
小结
本章我们学习了:
- Scrapy 基础 - 项目结构和核心组件
- 编写 Spider - 定义 Item 和爬取逻辑
- 选择器 - CSS 和 XPath 选择器
- Pipeline - 数据处理和存储
- 中间件 - 请求/响应处理
- 配置 - 常用设置
- 存储 - MongoDB 和 MySQL
练习
- 使用 Scrapy 爬取一个新闻网站
- 实现数据存储到 MongoDB
- 配置代理中间件