跳到主要内容

requests 库详解

requests 是 Python 最流行的 HTTP 库,以其简洁优雅的 API 著称。本章将详细介绍 requests 库的用法。

官方文档

本教程内容基于 requests 官方文档

安装 requests

pip install requests

发送请求

GET 请求

GET 请求是最常见的 HTTP 请求,用于获取数据:

import requests

# Minimal GET request: fetch a URL, then inspect the status and the body.
url = 'https://httpbin.org/get'
response = requests.get(url)
print(response.status_code)  # 200
print(response.text)  # response body as text

带参数的 GET 请求

import requests

# Option 1: write the query string directly into the URL.
response = requests.get('https://httpbin.org/get?name=Tom&age=25')

# Option 2 (recommended): pass a dict through ``params`` and let requests
# build and URL-encode the query string.
params = dict(name='Tom', age=25, city='Beijing')
response = requests.get('https://httpbin.org/get', params=params)
print(response.url)  # https://httpbin.org/get?name=Tom&age=25&city=Beijing

POST 请求

POST 请求用于提交数据:

import requests

# Option 1: send the dict as a form-encoded body.
data = {'username': 'admin', 'password': '123456'}
response = requests.post('https://httpbin.org/post', data=data)

# Option 2: serialize to JSON by hand and declare the content type yourself.
import json
response = requests.post(
    'https://httpbin.org/post',
    data=json.dumps(data),
    headers={'Content-Type': 'application/json'},
)

# Option 3 (recommended): pass the dict through ``json``.
response = requests.post('https://httpbin.org/post', json=data)

其他请求方法

import requests

# All examples below target the same test service.
base_url = 'https://httpbin.org'

# PUT - replace/update a resource
response = requests.put(f'{base_url}/put', data={'key': 'value'})

# DELETE - remove a resource
response = requests.delete(f'{base_url}/delete')

# PATCH - partial update
response = requests.patch(f'{base_url}/patch', data={'name': 'New'})

# HEAD - fetch only the response headers
response = requests.head(f'{base_url}/get')

# OPTIONS - ask which request methods are supported
response = requests.options(f'{base_url}/get')

响应处理

响应对象属性

import requests

response = requests.get('https://httpbin.org/get')

# Status
print(response.status_code)  # 200
print(response.ok)  # True (status code < 400)

# Body, in three representations
print(response.text)  # str
print(response.content)  # bytes
print(response.json())  # parsed JSON

# Response headers
response_headers = response.headers
print(response_headers)  # dict-like mapping of the response headers
print(response_headers['Content-Type'])  # 'application/json'

# Information about the request that produced this response
print(response.url)  # final URL (including query parameters)
sent_request = response.request
print(sent_request)  # the request object
print(sent_request.headers)  # headers that were sent

状态码检查

import requests
from requests.exceptions import HTTPError

response = requests.get('https://httpbin.org/get')

# Option 1: compare the status code explicitly.
if response.status_code == 200:
    print('请求成功')

# Option 2: raise_for_status() raises HTTPError when the request failed.
try:
    response.raise_for_status()
except HTTPError as e:
    print(f'请求失败: {e}')

# Option 3: use the ``ok`` attribute.
if response.ok:
    print('请求成功')

响应编码

import requests

response = requests.get('https://httpbin.org/get')

# Encoding that requests inferred from the HTTP headers; response.text
# uses it to decode the body.
print(response.encoding)  # 'utf-8'

# Encoding guessed from the body bytes themselves; useful when the headers
# do not declare a charset.
print(response.apparent_encoding)

# Override the encoding that response.text will use (e.g. for a GBK page).
response.encoding = 'gbk'

# Alternatively, bypass response.text and decode the raw bytes yourself.
content = response.content.decode('utf-8')

请求头和响应头

自定义请求头

import requests

# Browser-like request headers, assembled field by field.
headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
headers['Accept-Language'] = 'zh-CN,zh;q=0.9,en;q=0.8'
headers['Referer'] = 'https://www.google.com/'
headers['Cookie'] = 'session_id=abc123'

# Headers passed here apply to this single request.
response = requests.get('https://example.com', headers=headers)

常见 User-Agent

# Common User-Agent strings, keyed by a short browser/device name.
user_agents = {
    # Desktop browsers
    'chrome': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'firefox': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
    'edge': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
    'safari': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
    # Mobile devices
    'iphone': 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Mobile/15E148 Safari/604.1',
    'android': 'Mozilla/5.0 (Linux; Android 13; SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.43 Mobile Safari/537.36',
}

# Usage example: build a headers dict from one of the entries.
headers = {'User-Agent': user_agents['chrome']}

超时设置

import requests
from requests.exceptions import Timeout

# A single number is used for both the connect timeout and the read timeout
# (each phase gets 5 seconds; it is not a limit on the total request time).
try:
    response = requests.get('https://example.com', timeout=5)
except Timeout:
    print('请求超时')

# A (connect, read) tuple sets the two timeouts separately:
# 3 seconds to connect, 10 seconds to read.
response = requests.get('https://example.com', timeout=(3, 10))

Session 会话

Session 对象可以在多次请求之间保持 Cookie 和其他状态:

import requests

# Create the Session as a context manager so that it is closed (and its
# connections released) when the block ends.
with requests.Session() as session:
    # Default headers, sent with every request made through this session.
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (compatible; MyBot/1.0)'
    })

    # Log in; the session stores any cookies the server sets.
    login_data = {'username': 'user', 'password': 'pass'}
    session.post('https://example.com/login', data=login_data)

    # Later requests carry the stored cookies automatically.
    response = session.get('https://example.com/dashboard')
    print(response.status_code)

    # Inspect the cookies currently held by the session.
    print(session.cookies.get_dict())

文件上传

import requests

# Upload one file. The with-block closes the file handle once the request
# has been sent, instead of leaking it.
with open('document.pdf', 'rb') as f:
    files = {'file': f}
    response = requests.post('https://httpbin.org/post', files=files)

# Upload with an explicit file name and content type:
# (filename, file object, content type).
with open('document.pdf', 'rb') as f:
    files = {'file': ('report.pdf', f, 'application/pdf')}
    response = requests.post('https://httpbin.org/post', files=files)

# Upload several files in one request.
with open('foo.txt', 'rb') as foo, open('bar.txt', 'rb') as bar:
    files = [
        ('file1', ('foo.txt', foo)),
        ('file2', ('bar.txt', bar)),
    ]
    response = requests.post('https://httpbin.org/post', files=files)

代理设置

import requests

# Proxy to use for each URL scheme.
proxies = {
    'http': 'http://127.0.0.1:7890',
    'https': 'http://127.0.0.1:7890',
}
response = requests.get('https://example.com', proxies=proxies)

# Proxy that requires authentication: credentials go in the URL as
# user:password@host:port.
proxies_with_auth = {
    'http': 'http://user:password@127.0.0.1:7890',
    'https': 'http://user:password@127.0.0.1:7890',
}

# Configure the proxies once on a Session so every request uses them.
session = requests.Session()
session.proxies = proxies

SSL 证书

import requests

# Verify the server's SSL certificate (this is the default).
response = requests.get('https://example.com', verify=True)

# Skip certificate verification (not recommended).
response = requests.get('https://example.com', verify=False)

# Present a client-side certificate to the server.
response = requests.get('https://example.com', cert='/path/to/client.pem')

# Verify the server against a specific CA bundle.
response = requests.get('https://example.com', verify='/path/to/ca-bundle.crt')

错误处理

import requests
from requests.exceptions import (
RequestException,
ConnectionError,
Timeout,
HTTPError,
TooManyRedirects
)

# Handlers go from most specific to most general. Timeout is listed before
# ConnectionError because requests' ConnectTimeout derives from both, and a
# connect timeout should be reported as a timeout.
try:
    response = requests.get('https://example.com', timeout=5)

    # Turn an unsuccessful status code into an HTTPError.
    response.raise_for_status()

except Timeout:
    print('超时错误 - 请求超时')
except ConnectionError:
    print('连接错误 - 无法连接到服务器')
except HTTPError as e:
    print(f'HTTP 错误 - {e.response.status_code}')
except TooManyRedirects:
    print('重定向错误 - 重定向次数过多')
except RequestException as e:
    # Base class of the exceptions above; catches anything not handled yet.
    print(f'请求错误 - {e}')
else:
    # Runs only when no exception was raised.
    print(f'请求成功: {response.status_code}')

高级用法

流式响应

import requests

# Stream a large file to disk chunk by chunk instead of loading it into
# memory. Using the response as a context manager closes it when done.
with requests.get('https://example.com/largefile.zip', stream=True) as r:
    r.raise_for_status()
    with open('largefile.zip', 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)

迭代器模式

import requests

# Iterate over the response body in 1024-byte chunks. The with-block closes
# the streamed response once iteration ends.
# NOTE: process() stands for your own chunk handler; it is not defined here.
with requests.get('https://example.com/largefile', stream=True) as response:
    for chunk in response.iter_content(chunk_size=1024):
        process(chunk)

请求重试

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Retry policy: at most 3 retries, backoff factor 0.5, and retry on the
# listed server-error status codes.
retry = Retry(total=3, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])

# One adapter carrying that policy, mounted for both URL schemes.
adapter = HTTPAdapter(max_retries=retry)
session = requests.Session()
for scheme_prefix in ('http://', 'https://'):
    session.mount(scheme_prefix, adapter)

# Requests sent through this session now use the retry policy.
response = session.get('https://example.com/api')

准备请求

import requests
from requests import PreparedRequest

# Describe the request first (method, URL, params, headers) ...
req = requests.Request(
    'GET',
    'https://example.com',
    params={'key': 'value'},
    headers={'User-Agent': 'MyBot/1.0'},
)

# ... then let the Session prepare it. Preparing through the session merges
# in session-level state (such as cookies) and yields a PreparedRequest that
# can still be inspected or modified before it is sent.
session = requests.Session()
prepared: PreparedRequest = session.prepare_request(req)

# Send the prepared request.
response = session.send(prepared)

完整示例

import requests
import time
import random

class Spider:
    """Small crawler helper built around a ``requests.Session``.

    Every request sleeps for a random delay first, gets a default timeout
    unless the caller supplies one, and raises on an error status code.
    """

    # Candidate User-Agent strings; one is chosen at random per Spider.
    USER_AGENTS = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    )

    def __init__(self, *, delay_range=(0.5, 2), timeout=10):
        """Create the session, set default headers and configure retries.

        Args:
            delay_range: ``(low, high)`` bounds, in seconds, of the random
                pause taken before each request.
            timeout: Timeout passed to requests when the caller gives none.
        """
        # Imported locally: only this method needs the adapter classes.
        from requests.adapters import HTTPAdapter
        from urllib3.util.retry import Retry

        self.delay_range = delay_range
        self.timeout = timeout
        self.session = requests.Session()

        # Default headers sent with every request of this session.
        self.session.headers.update({
            'User-Agent': random.choice(self.USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        })

        # Retry configuration, applied to both URL schemes.
        adapter = HTTPAdapter(max_retries=Retry(total=3, backoff_factor=0.5))
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)

    def _request(self, method, url, **kwargs):
        """Sleep, apply the default timeout, send, and check the status."""
        time.sleep(random.uniform(*self.delay_range))
        kwargs.setdefault('timeout', self.timeout)

        response = self.session.request(method, url, **kwargs)
        response.raise_for_status()
        return response

    def get(self, url, **kwargs):
        """Send a GET request and return the response.

        Extra keyword arguments are forwarded to the session. Raises
        whatever ``raise_for_status()`` raises for an error status.
        """
        return self._request('GET', url, **kwargs)

    def post(self, url, **kwargs):
        """Send a POST request and return the response.

        Extra keyword arguments are forwarded to the session. Raises
        whatever ``raise_for_status()`` raises for an error status.
        """
        return self._request('POST', url, **kwargs)

# Usage example: runs only when the file is executed as a script.
if __name__ == '__main__':
    spider = Spider()

    # GET request with query parameters
    response = spider.get('https://httpbin.org/get', params={'key': 'value'})
    print(response.json())

    # POST request with a JSON body
    response = spider.post('https://httpbin.org/post', json={'name': 'Tom'})
    print(response.json())

小结

本章我们学习了:

  1. GET/POST 请求 - 发送各种类型的 HTTP 请求
  2. 响应处理 - 状态码、响应内容、JSON 解析
  3. 请求头 - 设置 User-Agent 等请求头
  4. 超时处理 - 避免请求卡死
  5. Session 会话 - 保持 Cookie 和状态
  6. 代理和证书 - 进阶配置
  7. 错误处理 - 优雅处理各种异常

练习

  1. 使用 requests 库调用天气 API(如心知天气)获取天气数据
  2. 实现一个带有重试机制的请求函数
  3. 使用 Session 模拟登录过程