requests 库详解
requests 是 Python 最流行的 HTTP 库,以其简洁优雅的 API 著称。本章将详细介绍 requests 库的用法。
官方文档
本教程内容基于 requests 官方文档。
安装 requests
pip install requests
发送请求
GET 请求
GET 请求是最常见的 HTTP 请求,用于获取数据:
import requests
# 最简单的 GET 请求
response = requests.get('https://httpbin.org/get')
print(response.status_code) # 200
print(response.text) # 响应内容
带参数的 GET 请求
import requests
# 方式1:直接在 URL 中添加参数
response = requests.get('https://httpbin.org/get?name=Tom&age=25')
# 方式2:使用 params 参数(推荐,自动编码)
params = {
'name': 'Tom',
'age': 25,
'city': 'Beijing'
}
response = requests.get('https://httpbin.org/get', params=params)
print(response.url) # https://httpbin.org/get?name=Tom&age=25&city=Beijing
POST 请求
POST 请求用于提交数据:
import requests
# 方式1:表单数据
data = {
'username': 'admin',
'password': '123456'
}
response = requests.post('https://httpbin.org/post', data=data)
# 方式2:JSON 数据
import json
response = requests.post('https://httpbin.org/post',
data=json.dumps(data),
headers={'Content-Type': 'application/json'})
# 方式3:直接使用 json 参数(推荐)
response = requests.post('https://httpbin.org/post', json=data)
其他请求方法
import requests
# PUT 请求 - 更新资源
response = requests.put('https://httpbin.org/put', data={'key': 'value'})
# DELETE 请求 - 删除资源
response = requests.delete('https://httpbin.org/delete')
# PATCH 请求 - 部分更新
response = requests.patch('https://httpbin.org/patch', data={'name': 'New'})
# HEAD 请求 - 只获取响应头
response = requests.head('https://httpbin.org/get')
# OPTIONS 请求 - 获取支持的请求方法
response = requests.options('https://httpbin.org/get')
响应处理
响应对象属性
import requests
response = requests.get('https://httpbin.org/get')
# 状态码
print(response.status_code) # 200
print(response.ok) # True (状态码 < 400)
# 响应内容
print(response.text) # 字符串形式
print(response.content) # 字节形式
print(response.json()) # JSON 解析
# 响应头
print(response.headers) # 字典形式的响应头
print(response.headers['Content-Type']) # 'application/json'
# 请求信息
print(response.url) # 最终的 URL(包含参数)
print(response.request) # 请求对象
print(response.request.headers) # 发送的请求头
状态码检查
import requests
from requests.exceptions import HTTPError
response = requests.get('https://httpbin.org/get')
# Option 1: compare the status code directly
if response.status_code == 200:
    print('请求成功')
# Option 2: raise_for_status() raises HTTPError for 4xx/5xx responses
try:
    response.raise_for_status()
except HTTPError as e:
    print(f'请求失败: {e}')
# Option 3: use the ok attribute (True when the status code is below 400)
if response.ok:
    print('请求成功')
响应编码
import requests
response = requests.get('https://httpbin.org/get')
# Encoding that requests derived from the HTTP headers; response.text uses it
print(response.encoding) # 'utf-8'
# Encoding guessed from the body bytes, useful when the headers carry no charset
print(response.apparent_encoding)
# Override the encoding that response.text will use from now on
response.encoding = 'gbk'
# Or decode the raw bytes yourself with an explicit encoding
content = response.content.decode('utf-8')
请求头和响应头
自定义请求头
import requests
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Referer': 'https://www.google.com/',
'Cookie': 'session_id=abc123'
}
response = requests.get('https://example.com', headers=headers)
常见 User-Agent
# PC 浏览器
user_agents = {
'chrome': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'firefox': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
'edge': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
'safari': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
}
# 移动端
user_agents['iphone'] = 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Mobile/15E148 Safari/604.1'
user_agents['android'] = 'Mozilla/5.0 (Linux; Android 13; SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.43 Mobile Safari/537.36'
# 使用示例
headers = {'User-Agent': user_agents['chrome']}
超时设置
import requests
from requests.exceptions import Timeout
# A single number is applied to the connect timeout and to the read timeout
# individually; it is not a limit on the total duration of the request.
try:
    response = requests.get('https://example.com', timeout=5)
except Timeout:
    print('请求超时')
# Set the two timeouts separately with a (connect, read) tuple:
# 3 seconds to establish the connection, 10 seconds to wait for data
response = requests.get('https://example.com', timeout=(3, 10))
Session 会话
Session 对象可以在多次请求之间保持 Cookie 和其他状态:
import requests
# 创建 Session
session = requests.Session()
# 设置默认请求头(所有请求都会使用)
session.headers.update({
'User-Agent': 'Mozilla/5.0 (compatible; MyBot/1.0)'
})
# 登录(Session 会自动保存 Cookie)
login_data = {'username': 'user', 'password': 'pass'}
session.post('https://example.com/login', data=login_data)
# 后续请求自动携带 Cookie
response = session.get('https://example.com/dashboard')
print(response.status_code)
# 查看 Session 的 Cookie
print(session.cookies.get_dict())
文件上传
import requests
# Upload a file; the with-block closes the handle once the request is done
with open('document.pdf', 'rb') as fh:
    files = {
        'file': fh
    }
    response = requests.post('https://httpbin.org/post', files=files)
# Upload with an explicit file name and content type
with open('document.pdf', 'rb') as fh:
    files = {
        'file': ('report.pdf', fh, 'application/pdf')
    }
    response = requests.post('https://httpbin.org/post', files=files)
# Upload several files in one request
with open('foo.txt', 'rb') as foo, open('bar.txt', 'rb') as bar:
    files = [
        ('file1', ('foo.txt', foo)),
        ('file2', ('bar.txt', bar))
    ]
    response = requests.post('https://httpbin.org/post', files=files)
代理设置
import requests
# Route both plain and TLS traffic through a local HTTP proxy
proxies = {
    'http': 'http://127.0.0.1:7890',
    'https': 'http://127.0.0.1:7890'
}
response = requests.get('https://example.com', proxies=proxies)
# Proxy that requires authentication: credentials are embedded in the URL
# in the form user:password@host:port
proxies_with_auth = {
    'http': 'http://user:pass@127.0.0.1:7890',
    'https': 'http://user:pass@127.0.0.1:7890'
}
# Configure the proxies once on a Session so every request uses them
session = requests.Session()
session.proxies = proxies
SSL 证书
import requests
# Verify the server's TLS certificate (this is the default)
response = requests.get('https://example.com', verify=True)
# Skip certificate verification (not recommended)
response = requests.get('https://example.com', verify=False)
# Present a client-side certificate to the server
response = requests.get('https://example.com', cert='/path/to/client.pem')
# Verify the server against a specific CA bundle
response = requests.get('https://example.com', verify='/path/to/ca-bundle.crt')
错误处理
import requests
from requests.exceptions import (
RequestException,
ConnectionError,
Timeout,
HTTPError,
TooManyRedirects
)
try:
    response = requests.get('https://example.com', timeout=5)
    # Turn 4xx/5xx responses into HTTPError
    response.raise_for_status()
except Timeout:
    # Must come before ConnectionError: ConnectTimeout derives from both
    # classes and would otherwise be reported as a connection error
    print('超时错误 - 请求超时')
except ConnectionError:
    print('连接错误 - 无法连接到服务器')
except HTTPError as e:
    print(f'HTTP 错误 - {e.response.status_code}')
except TooManyRedirects:
    print('重定向错误 - 重定向次数过多')
except RequestException as e:
    # Base class of the requests exceptions; kept last as the catch-all
    print(f'请求错误 - {e}')
else:
    print(f'请求成功: {response.status_code}')
高级用法
流式响应
import requests
# Stream a large download to disk in chunks instead of loading it into memory
with requests.get('https://example.com/largefile.zip', stream=True) as r:
    r.raise_for_status()
    with open('largefile.zip', 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)
迭代器模式
import requests
# Iterate over the body chunk by chunk; the with-block releases the
# connection even if processing stops early or raises
with requests.get('https://example.com/largefile', stream=True) as response:
    for chunk in response.iter_content(chunk_size=1024):
        # process() is a placeholder for your own handler; it is not defined here
        process(chunk)
请求重试
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# 创建带有重试机制的 Session
session = requests.Session()
# 配置重试策略
retry = Retry(
total=3, # 最大重试次数
backoff_factor=0.5, # 重试间隔倍数
status_forcelist=[500, 502, 503, 504] # 需要重试的状态码
)
# 挂载到 HTTP 适配器
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)
# 发送请求
response = session.get('https://example.com/api')
准备请求
import requests
from requests import PreparedRequest
# Describe the request first, then let the Session prepare it so it can be
# inspected or modified before it is sent
session = requests.Session()
req = requests.Request(
    'GET',
    'https://example.com',
    params={'key': 'value'},
    headers={'User-Agent': 'MyBot/1.0'},
)
# prepare_request() fills in the method, URL and body and merges the
# session-level headers and cookies; a hand-built PreparedRequest lacks these
prepared = session.prepare_request(req)
# Send the prepared request
response = session.send(prepared)
完整示例
import requests
import time
import random
class Spider:
    """Small HTTP client built on one shared ``requests.Session``.

    Every request is preceded by a random pause, receives a default timeout
    when the caller supplies none, and raises on 4xx/5xx responses.
    """

    def __init__(self, delay_range=(0.5, 2), timeout=10):
        """Create the session, set default headers and mount retrying adapters.

        Args:
            delay_range: ``(low, high)`` bounds, in seconds, of the random
                pause taken before each request.
            timeout: Timeout in seconds used when a call passes no ``timeout``.
        """
        self.delay_range = delay_range
        self.timeout = timeout
        self.session = requests.Session()
        # Default headers sent with every request; the User-Agent is picked
        # once per Spider instance
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
        ]
        self.session.headers.update({
            'User-Agent': random.choice(user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        })
        # Mount one retrying adapter for both URL schemes
        from requests.adapters import HTTPAdapter
        from urllib3.util.retry import Retry
        adapter = HTTPAdapter(max_retries=Retry(total=3, backoff_factor=0.5))
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)

    def _request(self, method, url, **kwargs):
        """Pause, apply the default timeout, send, and raise on HTTP errors."""
        # Random delay before the request
        time.sleep(random.uniform(*self.delay_range))
        # Only fill in the timeout when the caller did not pass one
        kwargs.setdefault('timeout', self.timeout)
        response = self.session.request(method, url, **kwargs)
        response.raise_for_status()
        return response

    def get(self, url, **kwargs):
        """Send a GET request and return the response.

        Extra keyword arguments are forwarded to the session. Raises the
        error from ``raise_for_status()`` on a 4xx/5xx response.
        """
        return self._request('GET', url, **kwargs)

    def post(self, url, **kwargs):
        """Send a POST request and return the response.

        Extra keyword arguments are forwarded to the session. Raises the
        error from ``raise_for_status()`` on a 4xx/5xx response.
        """
        return self._request('POST', url, **kwargs)
# Usage example
if __name__ == '__main__':
    spider = Spider()
    # GET request with query-string parameters
    response = spider.get('https://httpbin.org/get', params={'key': 'value'})
    print(response.json())
    # POST request with a JSON body
    response = spider.post('https://httpbin.org/post', json={'name': 'Tom'})
    print(response.json())
小结
本章我们学习了:
- GET/POST 请求 - 发送各种类型的 HTTP 请求
- 响应处理 - 状态码、响应内容、JSON 解析
- 请求头 - 设置 User-Agent 等请求头
- 超时处理 - 避免请求卡死
- Session 会话 - 保持 Cookie 和状态
- 代理和证书 - 进阶配置
- 错误处理 - 优雅处理各种异常
练习
- 使用 requests 库获取天气数据 API(如心知天气)
- 实现一个带有重试机制的请求函数
- 使用 Session 模拟登录过程