跳到主要内容

Requests - HTTP 请求库

Requests 是 Python 中最流行的 HTTP 库,用于发送 HTTP/1.1 请求。它简化了与 Web 服务交互的过程,使 HTTP 请求变得简单且人性化。

什么是 Requests?

Requests 是一个优雅而简单的 Python HTTP 库,主要特点包括:

  • 简单易用:直观的 API 设计,符合 Python 哲学
  • 功能丰富:支持各种 HTTP 方法、认证、Cookie、Session 等
  • 自动处理:自动处理编码、JSON、重定向等
  • 安全可靠:支持 SSL/TLS,提供超时控制

安装 Requests

pip install requests

基础用法

发送 GET 请求

import requests

# Simplest possible GET request
response = requests.get('https://api.github.com')
print(response.status_code) # 200
print(response.text) # response body as a string

# GET request with query-string parameters
params = {
'q': 'python',
'page': 1
}
response = requests.get('https://api.github.com/search/repositories', params=params)
print(response.url) # https://api.github.com/search/repositories?q=python&page=1

# Parse the JSON response body
data = response.json()
print(data['total_count'])

发送 POST 请求

import requests

# Send form-encoded data (Content-Type: application/x-www-form-urlencoded)
payload = {
    'username': 'admin',
    'password': 'secret',
}
response = requests.post('https://httpbin.org/post', data=payload)

# Send a JSON body (Content-Type: application/json)
json_data = {
    'name': '张三',
    'age': 25,
}
response = requests.post('https://httpbin.org/post', json=json_data)

# Upload a file — use a context manager so the handle is always closed
# (the original `open(...)` without `with` leaked the file descriptor)
with open('report.txt', 'rb') as f:
    files = {'file': f}
    response = requests.post('https://httpbin.org/post', files=files)

其他 HTTP 方法

import requests

# PUT request (full resource replacement)
response = requests.put('https://httpbin.org/put', data={'key': 'value'})

# DELETE request
response = requests.delete('https://httpbin.org/delete')

# HEAD request (headers only, no body)
response = requests.head('https://httpbin.org/get')

# OPTIONS request (ask the server which methods are allowed)
response = requests.options('https://httpbin.org/get')

# PATCH request (partial update)
response = requests.patch('https://httpbin.org/patch', data={'key': 'value'})

响应处理

响应属性

import requests

response = requests.get('https://api.github.com')

# Status code
print(response.status_code) # 200
print(response.status_code == requests.codes.ok) # True

# Response headers
print(response.headers) # case-insensitive dict of headers
print(response.headers['Content-Type'])

# Response body
print(response.text) # decoded string
print(response.content) # raw bytes (binary)
print(response.json()) # parsed JSON

# Encoding
print(response.encoding) # encoding detected from the response
response.encoding = 'utf-8' # override the encoding manually

# URL and redirect history
print(response.url) # final URL (after any redirects)
print(response.history) # list of intermediate redirect responses

状态码检查

import requests

response = requests.get('https://api.github.com')

# Check the status code explicitly
if response.status_code == 200:
    print('请求成功')
elif response.status_code == 404:
    print('资源未找到')

# raise_for_status() raises HTTPError automatically for 4xx/5xx responses
response = requests.get('https://httpbin.org/status/404')
try:
    response.raise_for_status()
except requests.exceptions.HTTPError as e:
    print(f'HTTP 错误: {e}')

请求头设置

自定义请求头

import requests

# Custom headers sent along with the request
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Accept': 'application/json',
'Authorization': 'Bearer your_token_here',
'X-Custom-Header': 'custom_value'
}

response = requests.get('https://httpbin.org/headers', headers=headers)
print(response.json())

模拟浏览器请求

import requests

# Headers mimicking a real Chrome browser request
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}

response = requests.get('https://www.example.com', headers=headers)
import requests

# Send cookies with the request
cookies = {
'session_id': 'abc123',
'user_pref': 'dark_mode'
}
response = requests.get('https://httpbin.org/cookies', cookies=cookies)

# Read cookies set by the server
response = requests.get('https://httpbin.org/cookies/set/session_id/abc123')
print(response.cookies) # CookieJar object
print(response.cookies['session_id']) # abc123

# Convert the CookieJar into a plain dict
cookies_dict = requests.utils.dict_from_cookiejar(response.cookies)
import requests

# Create a Session
session = requests.Session()

# Log in (the server's Set-Cookie responses are stored on the session)
login_data = {
'username': 'admin',
'password': 'secret'
}
session.post('https://httpbin.org/post', data=login_data)

# Subsequent requests automatically carry the stored cookies
response = session.get('https://httpbin.org/cookies')
print(response.text)

# Close the Session and release its connections
session.close()

Session 对象

使用 Session 提高效率

import requests

# A Session reuses the connection pool and persists cookies;
# the with-block guarantees it is closed afterwards
with requests.Session() as session:
    # Default headers applied to every request from this session
    session.headers.update({
        'User-Agent': 'MyApp/1.0'
    })

    # All of these requests share the defaults above
    response1 = session.get('https://httpbin.org/get')
    response2 = session.post('https://httpbin.org/post', data={'key': 'value'})
    response3 = session.get('https://httpbin.org/cookies')

Session 的更多用法

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Create a Session with an automatic retry policy
session = requests.Session()

# Configure the retry policy
retries = Retry(
total=5, # retry up to 5 times in total
backoff_factor=1, # backoff factor between retry attempts
status_forcelist=[500, 502, 503, 504] # status codes that trigger a retry
)

session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))

response = session.get('https://httpbin.org/get')

超时设置

设置请求超时

import requests

# Abort the request if the server takes longer than the timeout (seconds)
try:
    response = requests.get('https://httpbin.org/delay/10', timeout=5)
except requests.exceptions.Timeout:
    print('请求超时')

# Separate connect and read timeouts:
# timeout=(connect timeout, read timeout)
response = requests.get('https://httpbin.org/get', timeout=(3.05, 27))

代理设置

使用代理

import requests

# Route requests through HTTP/HTTPS proxies
proxies = {
'http': 'http://10.10.1.10:3128',
'https': 'http://10.10.1.10:1080',
}

response = requests.get('https://httpbin.org/ip', proxies=proxies)

# Proxy that requires authentication (user:password in the URL)
proxies = {
'http': 'http://user:[email protected]:3128',
}

SSL/TLS 设置

验证 SSL 证书

import requests

# SSL certificates are verified by default
response = requests.get('https://httpbin.org/get', verify=True)

# Disable SSL verification (not recommended for production)
response = requests.get('https://self-signed.badssl.com/', verify=False)

# Verify against a custom CA certificate bundle
response = requests.get('https://httpbin.org/get', verify='/path/to/ca.crt')

客户端证书

import requests

# Present a client certificate (cert file, key file) for mutual TLS
cert = ('/path/to/client.crt', '/path/to/client.key')
response = requests.get('https://httpbin.org/get', cert=cert)

文件上传和下载

上传文件

import requests

# Single-file upload — the context manager closes the handle
with open('document.txt', 'rb') as f:
    files = {'file': f}
    response = requests.post('https://httpbin.org/post', files=files)

# Multi-file upload (original left both handles open; close them via `with`)
with open('report.txt', 'rb') as f1, open('data.json', 'rb') as f2:
    files = [
        ('file1', ('report.txt', f1, 'text/plain')),
        ('file2', ('data.json', f2, 'application/json')),
    ]
    response = requests.post('https://httpbin.org/post', files=files)

# Upload with an explicit filename, content type and extra part headers
with open('original.txt', 'rb') as f:
    files = {
        'file': ('custom_name.txt', f, 'text/plain', {'Expires': '0'})
    }
    response = requests.post('https://httpbin.org/post', files=files)

下载文件

import requests

# Stream a large file to disk in chunks instead of loading it into memory
url = 'https://example.com/large_file.zip'
response = requests.get(url, stream=True)

with open('large_file.zip', 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):
        if chunk:
            f.write(chunk)

# Show download progress while streaming
url = 'https://example.com/file.zip'
response = requests.get(url, stream=True)
total_size = int(response.headers.get('content-length', 0))

downloaded = 0
with open('file.zip', 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):
        if chunk:
            f.write(chunk)
            downloaded += len(chunk)
            # Guard: content-length may be absent (total_size == 0),
            # which previously caused a ZeroDivisionError
            if total_size > 0:
                percent = (downloaded / total_size) * 100
                print(f'下载进度: {percent:.2f}%')

身份认证

基本认证

import requests
from requests.auth import HTTPBasicAuth

# HTTP Basic authentication
response = requests.get(
'https://httpbin.org/basic-auth/user/passwd',
auth=HTTPBasicAuth('user', 'passwd')
)

# Shorthand: a (user, password) tuple implies Basic auth
response = requests.get(
'https://httpbin.org/basic-auth/user/passwd',
auth=('user', 'passwd')
)

摘要认证

import requests
from requests.auth import HTTPDigestAuth

# HTTP Digest authentication
response = requests.get(
'https://httpbin.org/digest-auth/auth/user/passwd',
auth=HTTPDigestAuth('user', 'passwd')
)

OAuth 认证

import requests
from requests_oauthlib import OAuth1, OAuth2Session

# OAuth 1.0 — sign requests with consumer and resource-owner credentials
auth = OAuth1('client_key', 'client_secret', 'resource_owner_key', 'resource_owner_secret')
response = requests.get('https://api.example.com/protected', auth=auth)

# OAuth 2.0 — exchange the authorization response for an access token
oauth = OAuth2Session(client_id='client_id')
token = oauth.fetch_token(
token_url='https://api.example.com/oauth/token',
client_secret='client_secret',
authorization_response='https://callback.example.com/?code=xxx'
)
response = oauth.get('https://api.example.com/protected')

异常处理

常见异常

import requests
from requests.exceptions import (
    RequestException,
    HTTPError,
    ConnectionError,
    Timeout,
    TooManyRedirects,
    URLRequired,
    MissingSchema,
    InvalidSchema,
    InvalidURL,
)

# Most specific exceptions first; RequestException is the base class
try:
    response = requests.get('https://httpbin.org/get', timeout=5)
    response.raise_for_status()
except HTTPError as e:
    print(f'HTTP 错误: {e}')
except ConnectionError as e:
    print(f'连接错误: {e}')
except Timeout as e:
    print(f'超时错误: {e}')
except TooManyRedirects as e:
    print(f'重定向过多: {e}')
except RequestException as e:
    # Catch-all for any other requests error
    print(f'请求异常: {e}')

高级用法

流式请求

import requests
import json  # hoisted: the original imported json inside the loop on every iteration

# Stream the response line by line
response = requests.get('https://httpbin.org/stream/20', stream=True)

for line in response.iter_lines():
    if line:
        print(line.decode('utf-8'))

# Parse each streamed line as a JSON document
response = requests.get('https://httpbin.org/stream/20', stream=True)
for line in response.iter_lines():
    if line:
        data = json.loads(line)
        print(data)

钩子函数

import requests


def timing_hook(response, *args, **kwargs):
    """Record the request duration on the response and print it."""
    response.elapsed_total = response.elapsed.total_seconds()
    print(f'请求耗时: {response.elapsed_total} 秒')


def print_url(response, *args, **kwargs):
    """Print the URL of the completed request."""
    print(f'请求 URL: {response.url}')


# Hooks run in order after the response is received
response = requests.get(
    'https://httpbin.org/get',
    hooks={'response': [timing_hook, print_url]}
)

自定义传输适配器

import requests
from requests.adapters import HTTPAdapter


class CustomAdapter(HTTPAdapter):
    """Transport adapter that stamps a custom header on every outgoing request."""

    def send(self, request, **kwargs):
        # Modify the prepared request just before it is sent on the wire
        request.headers['X-Custom-Header'] = 'custom_value'
        return super().send(request, **kwargs)


session = requests.Session()
session.mount('https://', CustomAdapter())

response = session.get('https://httpbin.org/get')

实际应用示例

示例 1:REST API 客户端

import requests


class APIClient:
    """Minimal REST client sharing one Session and optional bearer-token auth."""

    def __init__(self, base_url, api_key=None):
        # Normalize so joining with endpoints never produces double slashes
        # (original f'{base_url}{endpoint}' yielded '.../v1//users')
        self.base_url = base_url.rstrip('/')
        self.session = requests.Session()

        if api_key:
            self.session.headers.update({
                'Authorization': f'Bearer {api_key}'
            })

    def _url(self, endpoint):
        """Join base_url and endpoint with exactly one slash."""
        return f"{self.base_url}/{endpoint.lstrip('/')}"

    def get(self, endpoint, params=None):
        """GET an endpoint; raise on HTTP error; return parsed JSON."""
        response = self.session.get(self._url(endpoint), params=params)
        response.raise_for_status()
        return response.json()

    def post(self, endpoint, data=None, json=None):
        """POST form data or a JSON body; return parsed JSON."""
        response = self.session.post(self._url(endpoint), data=data, json=json)
        response.raise_for_status()
        return response.json()

    def put(self, endpoint, data=None, json=None):
        """PUT form data or a JSON body; return parsed JSON."""
        response = self.session.put(self._url(endpoint), data=data, json=json)
        response.raise_for_status()
        return response.json()

    def delete(self, endpoint):
        """DELETE an endpoint; return True when the server answered 204."""
        response = self.session.delete(self._url(endpoint))
        response.raise_for_status()
        return response.status_code == 204


# Usage example
client = APIClient('https://api.example.com/v1/', api_key='your_api_key')
users = client.get('/users')
new_user = client.post('/users', json={'name': '张三', 'email': '[email protected]'})

示例 2:网页爬虫

import requests
from bs4 import BeautifulSoup
import time


class WebScraper:
    """Polite scraper: shared Session, browser UA, fixed delay between requests."""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.delay = 1  # seconds to wait after each request

    def fetch(self, url):
        """Fetch a page and return its text, or None on any request error."""
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            # Prefer the encoding detected from the body over the header value
            response.encoding = response.apparent_encoding
            return response.text
        except requests.exceptions.RequestException as e:
            print(f'获取页面失败: {e}')
            return None
        finally:
            # Rate-limit regardless of success or failure
            time.sleep(self.delay)

    def parse(self, html):
        """Parse an HTML string into a BeautifulSoup tree."""
        soup = BeautifulSoup(html, 'html.parser')
        return soup

    def scrape(self, url):
        """Fetch and parse a page; return the soup, or None on failure."""
        html = self.fetch(url)
        if html:
            return self.parse(html)
        return None


# Usage example
scraper = WebScraper()
soup = scraper.scrape('https://example.com')
if soup:
    title = soup.find('title')
    print(f'页面标题: {title.text if title else "N/A"}')

示例 3:文件下载器

import requests
import os
from concurrent.futures import ThreadPoolExecutor


class FileDownloader:
    """Streamed file downloader with an optional threaded batch mode."""

    def __init__(self, max_workers=5):
        self.max_workers = max_workers
        self.session = requests.Session()

    def download(self, url, save_path=None, chunk_size=8192):
        """Download one file to save_path; return True on success, False on error."""
        if save_path is None:
            # Derive a filename from the URL path, ignoring the query string
            save_path = os.path.basename(url.split('?')[0])

        try:
            response = self.session.get(url, stream=True, timeout=30)
            response.raise_for_status()

            total_size = int(response.headers.get('content-length', 0))
            downloaded = 0

            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)

                        # Progress only when the server reported a length
                        if total_size > 0:
                            percent = (downloaded / total_size) * 100
                            print(f'\r下载 {save_path}: {percent:.1f}%', end='')

            print(f'\n下载完成: {save_path}')
            return True

        except requests.exceptions.RequestException as e:
            print(f'下载失败: {e}')
            return False

    def download_multiple(self, urls, save_dir='downloads'):
        """Download several URLs concurrently into save_dir."""
        os.makedirs(save_dir, exist_ok=True)

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [
                executor.submit(
                    self.download,
                    url,
                    os.path.join(save_dir, os.path.basename(url.split('?')[0])),
                )
                for url in urls
            ]

            # Wait for every download and propagate any worker exception
            for future in futures:
                future.result()


# Usage example
downloader = FileDownloader(max_workers=3)
urls = [
    'https://example.com/file1.pdf',
    'https://example.com/file2.pdf',
    'https://example.com/file3.pdf'
]
downloader.download_multiple(urls, save_dir='downloads')

小结

Requests 是 Python 中处理 HTTP 请求的首选库,它简化了与 Web 服务交互的过程。

核心概念

  1. 请求方法:GET、POST、PUT、DELETE 等
  2. 响应处理:状态码、响应头、响应体
  3. Session:保持连接和 Cookie
  4. 异常处理:网络错误、超时、HTTP 错误

常用功能

  • 发送请求:get(), post(), put(), delete()
  • 参数传递:params, data, json
  • 请求头设置:headers
  • Cookie 处理:cookies, Session
  • 文件操作:files, 流式下载
  • 身份认证:auth
  • 超时控制:timeout
  • 代理设置:proxies

练习

  1. 编写一个函数,获取 GitHub API 上某个用户的公开仓库列表
  2. 实现一个简单的网页爬虫,抓取新闻网站的标题
  3. 创建一个文件下载器,支持断点续传功能
  4. 编写一个 REST API 客户端类,支持自动重试和错误处理
  5. 实现一个批量 URL 检查工具,检查多个 URL 的可访问性

参考资源