Requests - HTTP 请求库
Requests 是 Python 中最流行的 HTTP 库,用于发送 HTTP/1.1 请求。它简化了与 Web 服务交互的过程,使 HTTP 请求变得简单且人性化。
什么是 Requests?
Requests 是一个优雅而简单的 Python HTTP 库,主要特点包括:
- 简单易用:直观的 API 设计,符合 Python 哲学
- 功能丰富:支持各种 HTTP 方法、认证、Cookie、Session 等
- 自动处理:自动处理编码、JSON、重定向等
- 安全可靠:支持 SSL/TLS,提供超时控制
安装 Requests
pip install requests
基础用法
发送 GET 请求
import requests
# 最简单的 GET 请求
response = requests.get('https://api.github.com')
print(response.status_code) # 200
print(response.text) # 响应内容(字符串)
# 带参数的 GET 请求
params = {
'q': 'python',
'page': 1
}
response = requests.get('https://api.github.com/search/repositories', params=params)
print(response.url) # https://api.github.com/search/repositories?q=python&page=1
# 解析 JSON 响应
data = response.json()
print(data['total_count'])
发送 POST 请求
import requests
# 发送表单数据
payload = {
'username': 'admin',
'password': 'secret'
}
response = requests.post('https://httpbin.org/post', data=payload)
# 发送 JSON 数据
json_data = {
'name': '张三',
'age': 25
}
response = requests.post('https://httpbin.org/post', json=json_data)
# Upload a file — use "with" so the file handle is always closed,
# even if the request raises (the bare open() form leaks the handle)
with open('report.txt', 'rb') as f:
    files = {'file': f}
    response = requests.post('https://httpbin.org/post', files=files)
其他 HTTP 方法
import requests
# PUT 请求
response = requests.put('https://httpbin.org/put', data={'key': 'value'})
# DELETE 请求
response = requests.delete('https://httpbin.org/delete')
# HEAD 请求
response = requests.head('https://httpbin.org/get')
# OPTIONS 请求
response = requests.options('https://httpbin.org/get')
# PATCH 请求
response = requests.patch('https://httpbin.org/patch', data={'key': 'value'})
响应处理
响应属性
import requests
response = requests.get('https://api.github.com')
# 状态码
print(response.status_code) # 200
print(response.status_code == requests.codes.ok) # True
# 响应头
print(response.headers) # 响应头字典
print(response.headers['Content-Type'])
# 响应内容
print(response.text) # 字符串形式
print(response.content) # 字节形式(二进制)
print(response.json()) # JSON 解析
# 编码
print(response.encoding) # 自动检测的编码
response.encoding = 'utf-8' # 手动设置编码
# URL 和请求历史
print(response.url) # 最终 URL(考虑重定向)
print(response.history) # 重定向历史
状态码检查
import requests
response = requests.get('https://api.github.com')
# 检查状态码
if response.status_code == 200:
print('请求成功')
elif response.status_code == 404:
print('资源未找到')
# 使用 raise_for_status() 自动抛出异常
response = requests.get('https://httpbin.org/status/404')
try:
response.raise_for_status()
except requests.exceptions.HTTPError as e:
print(f'HTTP 错误: {e}')
请求头设置
自定义请求头
import requests
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Accept': 'application/json',
'Authorization': 'Bearer your_token_here',
'X-Custom-Header': 'custom_value'
}
response = requests.get('https://httpbin.org/headers', headers=headers)
print(response.json())
模拟浏览器请求
import requests
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
response = requests.get('https://www.example.com', headers=headers)
Cookie 处理
发送和接收 Cookie
import requests
# 发送 Cookie
cookies = {
'session_id': 'abc123',
'user_pref': 'dark_mode'
}
response = requests.get('https://httpbin.org/cookies', cookies=cookies)
# 接收 Cookie
response = requests.get('https://httpbin.org/cookies/set/session_id/abc123')
print(response.cookies) # CookieJar 对象
print(response.cookies['session_id']) # abc123
# 将 Cookie 转换为字典
cookies_dict = requests.utils.dict_from_cookiejar(response.cookies)
使用 Session 保持 Cookie
import requests
# 创建 Session
session = requests.Session()
# 登录(设置 Cookie)
login_data = {
'username': 'admin',
'password': 'secret'
}
session.post('https://httpbin.org/post', data=login_data)
# 后续请求自动携带 Cookie
response = session.get('https://httpbin.org/cookies')
print(response.text)
# 关闭 Session
session.close()
Session 对象
使用 Session 提高效率
import requests
# Session 会保持连接池和 Cookie
with requests.Session() as session:
# 设置默认请求头
session.headers.update({
'User-Agent': 'MyApp/1.0'
})
# 所有请求都会使用这些默认设置
response1 = session.get('https://httpbin.org/get')
response2 = session.post('https://httpbin.org/post', data={'key': 'value'})
response3 = session.get('https://httpbin.org/cookies')
Session 的更多用法
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# 创建带重试策略的 Session
session = requests.Session()
# 配置重试策略
retries = Retry(
total=5, # 总共重试 5 次
backoff_factor=1, # 重试间隔时间
status_forcelist=[500, 502, 503, 504] # 这些状态码触发重试
)
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))
response = session.get('https://httpbin.org/get')
超时设置
设置请求超时
import requests
# 设置超时时间(秒)
try:
response = requests.get('https://httpbin.org/delay/10', timeout=5)
except requests.exceptions.Timeout:
print('请求超时')
# 分别设置连接超时和读取超时
# timeout=(连接超时, 读取超时)
response = requests.get('https://httpbin.org/get', timeout=(3.05, 27))
代理设置
使用代理
import requests
proxies = {
'http': 'http://10.10.1.10:3128',
'https': 'http://10.10.1.10:1080',
}
response = requests.get('https://httpbin.org/ip', proxies=proxies)
# 需要认证的代理
proxies = {
'http': 'http://user:[email protected]:3128',
}
SSL/TLS 设置
验证 SSL 证书
import requests
# 默认验证 SSL 证书
response = requests.get('https://httpbin.org/get', verify=True)
# 禁用 SSL 验证(不推荐用于生产环境)
response = requests.get('https://self-signed.badssl.com/', verify=False)
# 使用自定义 CA 证书
response = requests.get('https://httpbin.org/get', verify='/path/to/ca.crt')
客户端证书
import requests
# 使用客户端证书
cert = ('/path/to/client.crt', '/path/to/client.key')
response = requests.get('https://httpbin.org/get', cert=cert)
文件上传和下载
上传文件
import requests
# 单文件上传
with open('document.txt', 'rb') as f:
files = {'file': f}
response = requests.post('https://httpbin.org/post', files=files)
# 多文件上传
files = [
('file1', ('report.txt', open('report.txt', 'rb'), 'text/plain')),
('file2', ('data.json', open('data.json', 'rb'), 'application/json'))
]
response = requests.post('https://httpbin.org/post', files=files)
# 带元数据的文件上传
files = {
'file': ('custom_name.txt', open('original.txt', 'rb'), 'text/plain', {'Expires': '0'})
}
response = requests.post('https://httpbin.org/post', files=files)
下载文件
import requests
# 流式下载大文件
url = 'https://example.com/large_file.zip'
response = requests.get(url, stream=True)
with open('large_file.zip', 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
# Show download progress while streaming.
# NOTE: Content-Length may be absent, in which case total_size is 0 —
# guard the division or this snippet raises ZeroDivisionError.
import os
url = 'https://example.com/file.zip'
response = requests.get(url, stream=True)
total_size = int(response.headers.get('content-length', 0))
downloaded = 0
with open('file.zip', 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):
        if chunk:
            f.write(chunk)
            downloaded += len(chunk)
            if total_size > 0:  # avoid division by zero when size is unknown
                percent = (downloaded / total_size) * 100
                print(f'下载进度: {percent:.2f}%')
身份认证
基本认证
import requests
from requests.auth import HTTPBasicAuth
# 基本 HTTP 认证
response = requests.get(
'https://httpbin.org/basic-auth/user/passwd',
auth=HTTPBasicAuth('user', 'passwd')
)
# 简写形式
response = requests.get(
'https://httpbin.org/basic-auth/user/passwd',
auth=('user', 'passwd')
)
摘要认证
import requests
from requests.auth import HTTPDigestAuth
response = requests.get(
'https://httpbin.org/digest-auth/auth/user/passwd',
auth=HTTPDigestAuth('user', 'passwd')
)
OAuth 认证
import requests
from requests_oauthlib import OAuth1, OAuth2Session
# OAuth 1.0
auth = OAuth1('client_key', 'client_secret', 'resource_owner_key', 'resource_owner_secret')
response = requests.get('https://api.example.com/protected', auth=auth)
# OAuth 2.0
oauth = OAuth2Session(client_id='client_id')
token = oauth.fetch_token(
token_url='https://api.example.com/oauth/token',
client_secret='client_secret',
authorization_response='https://callback.example.com/?code=xxx'
)
response = oauth.get('https://api.example.com/protected')
异常处理
常见异常
import requests
from requests.exceptions import (
RequestException,
HTTPError,
ConnectionError,
Timeout,
TooManyRedirects,
URLRequired,
MissingSchema,
InvalidSchema,
InvalidURL
)
try:
response = requests.get('https://httpbin.org/get', timeout=5)
response.raise_for_status()
except HTTPError as e:
print(f'HTTP 错误: {e}')
except ConnectionError as e:
print(f'连接错误: {e}')
except Timeout as e:
print(f'超时错误: {e}')
except TooManyRedirects as e:
print(f'重定向过多: {e}')
except RequestException as e:
print(f'请求异常: {e}')
高级用法
流式请求
import requests
# 流式响应
response = requests.get('https://httpbin.org/stream/20', stream=True)
for line in response.iter_lines():
if line:
print(line.decode('utf-8'))
# 流式 JSON
response = requests.get('https://httpbin.org/stream/20', stream=True)
for line in response.iter_lines():
if line:
import json
data = json.loads(line)
print(data)
钩子函数
import requests
import time
def timing_hook(response, *args, **kwargs):
    """Response hook: store the request's elapsed time on the response and print it."""
    response.elapsed_total = response.elapsed.total_seconds()
    print(f'请求耗时: {response.elapsed_total} 秒')
def print_url(response, *args, **kwargs):
    """Response hook: print the final URL the request resolved to."""
    print(f'请求 URL: {response.url}')
response = requests.get(
'https://httpbin.org/get',
hooks={'response': [timing_hook, print_url]}
)
自定义传输适配器
import requests
from requests.adapters import HTTPAdapter
class CustomAdapter(HTTPAdapter):
    """Transport adapter that stamps a fixed custom header onto every outgoing request."""

    def send(self, request, **kwargs):
        # Modify the prepared request just before it goes on the wire,
        # then delegate the actual transmission to HTTPAdapter.
        request.headers['X-Custom-Header'] = 'custom_value'
        return super().send(request, **kwargs)
session = requests.Session()
session.mount('https://', CustomAdapter())
response = session.get('https://httpbin.org/get')
实际应用示例
示例 1:REST API 客户端
import requests
import json
class APIClient:
    """Minimal REST API client built on a persistent requests.Session.

    Args:
        base_url: Root URL of the API; a trailing slash is allowed.
        api_key: Optional bearer token sent with every request.

    All verb methods raise requests.exceptions.HTTPError on 4xx/5xx
    (via raise_for_status) and return the decoded JSON body.
    """

    def __init__(self, base_url, api_key=None):
        self.base_url = base_url
        self.session = requests.Session()
        if api_key:
            self.session.headers.update({
                'Authorization': f'Bearer {api_key}'
            })

    def _url(self, endpoint):
        """Join base_url and endpoint without producing a double slash.

        The original f'{base_url}{endpoint}' yielded e.g.
        'https://api.example.com/v1//users' for the documented usage.
        """
        return f"{self.base_url.rstrip('/')}/{endpoint.lstrip('/')}"

    def get(self, endpoint, params=None):
        """GET *endpoint* with optional query params; return decoded JSON."""
        response = self.session.get(self._url(endpoint), params=params)
        response.raise_for_status()
        return response.json()

    def post(self, endpoint, data=None, json=None):
        """POST form *data* and/or a *json* body; return decoded JSON.

        Note: the `json` parameter mirrors the requests API and shadows the
        stdlib module inside this method only.
        """
        response = self.session.post(self._url(endpoint), data=data, json=json)
        response.raise_for_status()
        return response.json()

    def put(self, endpoint, data=None, json=None):
        """PUT form *data* and/or a *json* body; return decoded JSON."""
        response = self.session.put(self._url(endpoint), data=data, json=json)
        response.raise_for_status()
        return response.json()

    def delete(self, endpoint):
        """DELETE *endpoint*; True when the server answered 204 No Content."""
        response = self.session.delete(self._url(endpoint))
        response.raise_for_status()
        return response.status_code == 204

    def close(self):
        """Release the session's connection pool (the original leaked it)."""
        self.session.close()

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        self.close()
        return False
# 使用示例
client = APIClient('https://api.example.com/v1/', api_key='your_api_key')
users = client.get('/users')
new_user = client.post('/users', json={'name': '张三', 'email': '[email protected]'})
示例 2:网页爬虫
import requests
from bs4 import BeautifulSoup
import time
class WebScraper:
    """Polite HTML scraper: shared session, browser-like UA, fixed pause between requests."""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.delay = 1  # seconds to sleep after every request, success or failure

    def fetch(self, url):
        """Return the decoded HTML of *url*, or None on any request failure."""
        try:
            resp = self.session.get(url, timeout=10)
            resp.raise_for_status()
        except requests.exceptions.RequestException as exc:
            print(f'获取页面失败: {exc}')
            return None
        else:
            # Use charset detection so non-UTF-8 pages decode correctly.
            resp.encoding = resp.apparent_encoding
            return resp.text
        finally:
            time.sleep(self.delay)

    def parse(self, html):
        """Parse an HTML string into a BeautifulSoup tree."""
        return BeautifulSoup(html, 'html.parser')

    def scrape(self, url):
        """Fetch and parse *url*; None when the fetch failed."""
        html = self.fetch(url)
        return self.parse(html) if html else None
# 使用示例
scraper = WebScraper()
soup = scraper.scrape('https://example.com')
if soup:
title = soup.find('title')
print(f'页面标题: {title.text if title else "N/A"}')
示例 3:文件下载器
import requests
import os
from concurrent.futures import ThreadPoolExecutor
class FileDownloader:
    """Concurrent file downloader sharing one requests.Session across worker threads.

    Args:
        max_workers: Size of the thread pool used by download_multiple().
    """

    def __init__(self, max_workers=5):
        self.max_workers = max_workers
        self.session = requests.Session()

    @staticmethod
    def _filename_from_url(url):
        """Derive a local file name from *url*, ignoring any query string.

        (This expression was previously duplicated in download() and
        download_multiple().)
        """
        return os.path.basename(url.split('?')[0])

    def download(self, url, save_path=None, chunk_size=8192):
        """Stream *url* to *save_path* (derived from the URL when omitted).

        Returns True on success, False on any request error. Progress is
        printed only when the server supplies a Content-Length header.
        """
        if save_path is None:
            save_path = self._filename_from_url(url)
        try:
            response = self.session.get(url, stream=True, timeout=30)
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))
            downloaded = 0
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
                        if total_size > 0:
                            percent = (downloaded / total_size) * 100
                            print(f'\r下载 {save_path}: {percent:.1f}%', end='')
            print(f'\n下载完成: {save_path}')
            return True
        except requests.exceptions.RequestException as e:
            print(f'下载失败: {e}')
            return False

    def download_multiple(self, urls, save_dir='downloads'):
        """Download *urls* concurrently into *save_dir*.

        Returns a list of per-URL success flags in input order (the original
        waited on every future but discarded the results).
        """
        os.makedirs(save_dir, exist_ok=True)
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [
                executor.submit(
                    self.download, url,
                    os.path.join(save_dir, self._filename_from_url(url)),
                )
                for url in urls
            ]
            # Block until every download finishes and collect its outcome.
            return [future.result() for future in futures]
# 使用示例
downloader = FileDownloader(max_workers=3)
urls = [
'https://example.com/file1.pdf',
'https://example.com/file2.pdf',
'https://example.com/file3.pdf'
]
downloader.download_multiple(urls, save_dir='downloads')
小结
Requests 是 Python 中处理 HTTP 请求的首选库,它简化了与 Web 服务交互的过程。
核心概念:
- 请求方法:GET、POST、PUT、DELETE 等
- 响应处理:状态码、响应头、响应体
- Session:保持连接和 Cookie
- 异常处理:网络错误、超时、HTTP 错误
常用功能:
- 发送请求:get()、post()、put()、delete()
- 参数传递:params、data、json
- 请求头设置:headers
- Cookie 处理:cookies、Session
- 文件操作:files、流式下载
- 身份认证:auth
- 超时控制:timeout
- 代理设置:proxies
练习
- 编写一个函数,获取 GitHub API 上某个用户的公开仓库列表
- 实现一个简单的网页爬虫,抓取新闻网站的标题
- 创建一个文件下载器,支持断点续传功能
- 编写一个 REST API 客户端类,支持自动重试和错误处理
- 实现一个批量 URL 检查工具,检查多个 URL 的可访问性