BeautifulSoup 详解

BeautifulSoup 是 Python 最流行的 HTML/XML 解析库之一，能够方便地解析和遍历网页内容。

官方文档

安装: pip install beautifulsoup4 | 推荐解析器: lxml

安装 BeautifulSoup

pip install beautifulsoup4

# 推荐同时安装解析器
pip install lxml
pip install html5lib

基本用法

创建 BeautifulSoup 对象

from bs4 import BeautifulSoup

# Parse from a string
html = """
<html>
    <head>
        <title>页面标题</title>
    </head>
    <body>
        <h1>欢迎</h1>
        <div class="content">
            <p>第一段</p>
            <p>第二段</p>
        </div>
    </body>
</html>
"""

# lxml parser (recommended: fast)
soup = BeautifulSoup(html, 'lxml')

# html.parser (ships with Python, nothing extra to install)
soup = BeautifulSoup(html, 'html.parser')

# html5lib (most tolerant of broken markup)
soup = BeautifulSoup(html, 'html5lib')

# Parse from a file
with open('page.html', 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f, 'lxml')

# Parse a page fetched over the network
import requests
# Always pass a timeout: without one, requests may wait on a stalled server forever.
response = requests.get('https://example.com', timeout=10)
# Raise on 4xx/5xx responses instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

格式化输出

# Pretty-print the parsed document with indentation
print(soup.prettify())

# Inspect the <title> tag (outputs shown are for the sample HTML string parsed earlier)
print(soup.title)        # <title>页面标题</title>
print(soup.title.name)   # title
print(soup.title.text)   # 页面标题

元素定位

BeautifulSoup 提供了多种定位元素的方法：

通过标签名查找

from bs4 import BeautifulSoup

html = """
<body>
    <h1>标题1</h1>
    <h2>标题2</h2>
    <p>段落</p>
    <a>链接</a>
</body>
"""

soup = BeautifulSoup(html, 'lxml')

# Attribute access by tag name returns the first matching tag
print(soup.h1)        # <h1>标题1</h1>

# find_all() returns every matching tag
for p in soup.find_all('p'):
    print(p.text)

find() 和 find_all()

# find() - returns the first matching element (None when nothing matches)
soup.find('div')           # first div
soup.find('div', class_='content')  # first div whose class is "content"

# find_all() - returns a list of all matching elements
soup.find_all('a')         # every link
soup.find_all('a', limit=5)  # stop after 5 matches

# Supported filters
soup.find_all('div', id='main')       # filter by id
soup.find_all('div', class_='content')  # filter by class
soup.find_all('a', attrs={'data-id': '123'})  # filter by an arbitrary attribute

# Filter with a regular expression
import re
soup.find_all(href=re.compile('^http'))  # href starts with http

# Filter with a function that receives each tag
soup.find_all(lambda tag: tag.name == 'div' and tag.has_attr('id'))

通过 CSS 选择器

# select() takes a CSS selector and returns a list of matching tags
soup.select('div')                    # every div
soup.select('#header')                # id=header
soup.select('.content')               # class=content
soup.select('div.content')            # div with class=content
soup.select('div p')                  # every p anywhere inside a div
soup.select('div > p')                # p that is a direct child of a div
soup.select('a[href]')                # a tags that have an href attribute
soup.select('a[href="url"]')           # a tags whose href equals "url"
soup.select('a[href*="partial"]')      # href contains "partial"
soup.select('a[href^="http"]')         # href starts with "http"
soup.select('a[href$=".pdf"]')         # href ends with ".pdf"

获取元素内容

获取文本

soup = BeautifulSoup('<p>Hello <b>World</b></p>', 'lxml')
p = soup.p

# All text, including text inside child tags
print(p.get_text())           # Hello World

# strip=True strips every text fragment before joining, so the space after "Hello" is lost
print(p.get_text(strip=True))  # HelloWorld
# A separator string is inserted between the fragments
print(p.get_text('|'))        # Hello |World

# .string is only set when the tag has exactly one child; <p> here has two (text + <b>)
print(p.string)              # None
# .strings yields every text fragment in document order
print(p.strings)             # generator object

for string in p.strings:
    print(repr(string))

获取属性

html = '<a href="https://example.com" id="link1" class="external">Example</a>'
soup = BeautifulSoup(html, 'lxml')
a = soup.a

# Read a single attribute
print(a['href'])             # https://example.com  (raises KeyError if the attribute is missing)
print(a.get('href'))         # https://example.com  (returns None if the attribute is missing)

# All attributes as a dict
print(a.attrs)               # {'href': 'https://example.com', 'id': 'link1', 'class': ['external']}

# get() with a fallback value
print(a.get('title', '无标题'))  # 无标题
print(a.get('class'))            # ['external']  (class is multi-valued, so it comes back as a list)

获取子元素

html = """
<div>
    <p>第一段</p>
    <p>第二段</p>
</div>
"""
soup = BeautifulSoup(html, 'lxml')
div = soup.div

# Direct children (this includes the whitespace strings between the tags)
print(div.children)        # an iterator, not a list
for child in div.children:
    print(child)

# All descendants, depth-first (tags and the strings inside them)
for descendant in div.descendants:
    print(descendant)

# Parent and ancestors
p = soup.p
print(p.parent)            # the enclosing <div>
print(p.parents)          # generator over every ancestor

# Next/previous sibling nodes.
# NOTE: siblings are nodes, not only tags. In this indented markup the nodes
# adjacent to the first <p> are whitespace strings; use find_next_sibling('p')
# to jump to the next tag.
print(p.next_sibling)     # next sibling node (whitespace string here)
print(p.previous_sibling)  # previous sibling node (whitespace string here)
print(p.next_siblings)    # generator over all following siblings

修改元素

修改文本

soup = BeautifulSoup('<p>旧文本</p>', 'lxml')
p = soup.p
# Assigning to .string replaces the tag's entire contents with the new text
p.string = '新文本'
print(p)                  # <p>新文本</p>

修改属性

soup = BeautifulSoup('<a href="#">链接</a>', 'lxml')
a = soup.a

# Add or overwrite attributes with dict-style assignment
a['href'] = 'https://new-url.com'
a['target'] = '_blank'
print(a)                  # <a href="https://new-url.com" target="_blank">链接</a>

# Remove an attribute
del a['target']

添加新元素

from bs4 import BeautifulSoup, NavigableString, Tag

soup = BeautifulSoup('<div></div>', 'lxml')
div = soup.div

# Create a new tag and append it as the last child
new_p = soup.new_tag('p')
new_p.string = '新段落'
div.append(new_p)

# Alternatively, insert at a given position.
# NOTE: new_p is already in the tree, so this call moves it to index 0
# rather than adding a second <p>; create another tag to add a second one.
div.insert(0, new_p)

# Insert a bare text node
div.insert(0, NavigableString('开头文字'))

# Create a tag with attributes and text in one go
new_link = soup.new_tag('a', href='http://example.com', id='link')
new_link.string = '点击这里'
div.append(new_link)

包装和 unwrap

# Wrap an element in a new parent tag
soup = BeautifulSoup('<p>文本</p>', 'lxml')
p = soup.p
p.wrap(soup.new_tag('div'))
# The lxml parser wraps fragments in <html><body>, so the whole soup prints as:
print(soup)               # <html><body><div><p>文本</p></div></body></html>

# Unwrap: remove the tag itself but keep its contents
soup = BeautifulSoup('<div><p>文本</p></div>', 'lxml')
p = soup.p
p.unwrap()
print(soup)               # <html><body><div>文本</div></body></html>

删除元素

soup = BeautifulSoup('<div><p>删除我</p><p>保留</p></div>', 'lxml')

# Remove an element
p = soup.find('p')
p.decompose()             # removes the tag from the tree and destroys it

# Empty an element (this also removes the remaining <p>)
div = soup.div
div.clear()               # <div></div>

导航树

按标签层级导航

html = """
<html>
    <body>
        <div id="main">
            <div class="article">
                <h1>标题</h1>
                <p>第一段<span>强调</span></p>
            </div>
        </div>
    </body>
</html>
"""
soup = BeautifulSoup(html, 'lxml')

# Start from the <html> root element
root = soup.html
print(root.body)          # <body>...</body>
# This markup has no <head> and lxml does not add one, so this prints None
# (html5lib would insert an empty <head>).
print(root.head)          # None

# Locate a specific element, then search inside it
article = soup.find('div', class_='article')
print(article.h1)         # first <h1> descendant (same as article.find('h1'))
print(article.find('p'))  # first <p> descendant at any depth

常见任务示例

提取所有链接

html = """
<body>
    <a href="http://example1.com">链接1</a>
    <a href="http://example2.com">链接2</a>
</body>
"""
soup = BeautifulSoup(html, 'lxml')

# Build one {'text', 'href'} record per <a> tag
links = [
    {
        'text': anchor.get_text(strip=True),
        'href': anchor.get('href'),
    }
    for anchor in soup.find_all('a')
]

print(links)
# [{'text': '链接1', 'href': 'http://example1.com'}, ...]

提取表格数据

html = """
<table>
    <tr>
        <th>姓名</th>
        <th>年龄</th>
    </tr>
    <tr>
        <td>张三</td>
        <td>25</td>
    </tr>
    <tr>
        <td>李四</td>
        <td>30</td>
    </tr>
</table>
"""
soup = BeautifulSoup(html, 'lxml')

# Header cells
headers = [cell.get_text(strip=True) for cell in soup.find_all('th')]
print(headers)  # ['姓名', '年龄']

# Body rows: every <tr> after the header row, one list of cell texts per row
body_rows = soup.find_all('tr')[1:]
data = [
    [cell.get_text(strip=True) for cell in row.find_all('td')]
    for row in body_rows
]

print(data)  # [['张三', '25'], ['李四', '30']]

提取列表项

html = """
<ul class="products">
    <li class="product">
        <h3 class="name">产品A</h3>
        <span class="price">99元</span>
    </li>
    <li class="product">
        <h3 class="name">产品B</h3>
        <span class="price">199元</span>
    </li>
</ul>
"""
soup = BeautifulSoup(html, 'lxml')

# One {'name', 'price'} record per <li class="product">
products = [
    {
        'name': entry.select_one('.name').get_text(strip=True),
        'price': entry.select_one('.price').get_text(strip=True),
    }
    for entry in soup.select('li.product')
]

print(products)

处理分页

# Assumes paginated URLs of the form: https://example.com?page=1
import requests
from bs4 import BeautifulSoup

base_url = 'https://example.com'
all_data = []

for page in range(1, 11):  # assume at most 10 pages
    url = f'{base_url}?page={page}'
    # A timeout keeps one stalled page from hanging the whole crawl.
    response = requests.get(url, timeout=10)
    # Stop on HTTP errors instead of scraping an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # Extract the items on this page
    items = soup.select('.item')
    if not items:
        # An empty page means we have run past the last page.
        break

    all_data.extend(item.get_text(strip=True) for item in items)

    print(f'第 {page} 页完成')

处理中文编码

import requests
from bs4 import BeautifulSoup

# Let requests guess the encoding from the response body
response = requests.get('https://example.com', timeout=10)
response.encoding = response.apparent_encoding  # detected from the content
soup = BeautifulSoup(response.text, 'lxml')

# Or set the encoding explicitly when you know it
response = requests.get('https://example.com', timeout=10)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'lxml')

# Print the extracted text to check that it is no longer garbled
print(soup.get_text())

性能优化

使用 lxml 解析器

# The lxml parser is considerably faster than the built-in html.parser
soup = BeautifulSoup(html, 'lxml')

使用 CSS 选择器

# A CSS selector is usually more concise than chained find()/find_all() calls
soup.select('div.content p')  # shorter than soup.find('div', class_='content').find_all('p'); note select() searches under every div.content, not only the first

使用正则表达式预编译

import re

# Compile the pattern once so every search reuses the same object.
# It is matched against each individual CSS class value, so anchor on the
# class name itself rather than on raw 'class="..."' markup.
pattern = re.compile(r'^product')

# Pass the precompiled pattern instead of compiling a new one inline
soup.find_all(class_=pattern)

完整示例

import requests
from bs4 import BeautifulSoup
import csv
import time
import random

class BookSpider:
    """Scrape paginated book listings and export them to a CSV file.

    Attributes:
        headers: HTTP headers sent with every request.
        books: Accumulated book records, one dict per listing.
        base_url: Listing URL; the page number is appended as ``?page=N``.
        timeout: Per-request timeout in seconds.
    """

    # Record key -> CSS selector, evaluated relative to each ``div.book-item``.
    FIELD_SELECTORS = {
        'name': 'h3.book-title',
        'author': 'span.author',
        'price': 'span.price',
        'rating': 'span.rating',
    }

    def __init__(self, base_url='https://example.com/books', timeout=10):
        """Set up request headers and an empty result list.

        Args:
            base_url: Listing URL without the ``page`` query parameter.
            timeout: Seconds to wait for each HTTP request.
        """
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        self.books = []
        self.base_url = base_url
        self.timeout = timeout

    def get_page(self, page_num):
        """Download one listing page and return its decoded HTML.

        Args:
            page_num: 1-based page number.

        Returns:
            The response body as text.

        Raises:
            requests.RequestException: On connection errors, timeouts, or a
                4xx/5xx status code.
        """
        url = f'{self.base_url}?page={page_num}'
        # Without a timeout a stalled server would block the crawl forever.
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        # Do not feed error pages into the parser.
        response.raise_for_status()

        # Prefer the encoding detected from the body; fall back to UTF-8.
        response.encoding = response.apparent_encoding or 'utf-8'
        return response.text

    def parse_books(self, html):
        """Extract every book on the page and append it to ``self.books``.

        A field whose element is missing is stored as an empty string.

        Args:
            html: HTML text of one listing page.
        """
        soup = BeautifulSoup(html, 'lxml')

        for item in soup.select('div.book-item'):
            book = {}
            for field, selector in self.FIELD_SELECTORS.items():
                elem = item.select_one(selector)
                book[field] = elem.get_text(strip=True) if elem is not None else ''
            self.books.append(book)

    def save_to_csv(self, filename):
        """Write the collected books to ``filename`` as CSV.

        Prints a notice and writes nothing when no books were collected.

        Args:
            filename: Destination path; an existing file is overwritten.
        """
        if not self.books:
            print('没有数据')
            return

        # utf-8-sig adds a BOM so spreadsheet apps detect UTF-8;
        # newline='' is what the csv module requires for correct line endings.
        with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=self.books[0].keys())
            writer.writeheader()
            writer.writerows(self.books)

        print(f'已保存 {len(self.books)} 条数据到 {filename}')

    def run(self, pages=5):
        """Crawl pages ``1..pages`` and save the results to ``books.csv``.

        A page that fails to download is reported and skipped, so data
        already collected from other pages is still saved.

        Args:
            pages: Number of listing pages to fetch.
        """
        for page in range(1, pages + 1):
            if page > 1:
                # Random delay between requests; no need to wait after the last one.
                time.sleep(random.uniform(1, 3))

            print(f'正在抓取第 {page} 页...')

            try:
                html = self.get_page(page)
            except requests.RequestException as exc:
                print(f'第 {page} 页抓取失败: {exc}')
                continue

            self.parse_books(html)

        self.save_to_csv('books.csv')

if __name__ == '__main__':
    # Script entry point: crawl the first three listing pages.
    BookSpider().run(pages=3)

小结

本章我们学习了：

BeautifulSoup 入门 - 创建解析对象
元素定位 - find、find_all、select 方法
获取内容 - text、属性、子元素
修改元素 - 添加、修改、删除
导航树 - 层级遍历
实战示例 - 提取链接、表格、列表等

练习

爬取一个新闻网站，提取所有文章标题和链接
爬取电商网站的产品列表，提取名称、价格、图片
实现一个图片爬虫，下载指定页面的所有图片

安装 BeautifulSoup

基本用法

创建 BeautifulSoup 对象

格式化输出

元素定位

通过标签名查找

find() 和 find_all()

通过 CSS 选择器

获取元素内容

获取文本

获取属性

获取子元素

修改元素

修改文本

修改属性

添加新元素

包装和 unwrap

删除元素

导航树

按标签层级导航

常见任务示例

提取所有链接

提取表格数据

提取列表项

处理分页

处理中文编码

性能优化

使用 lxml 解析器

使用 CSS 选择器

使用正则表达式预编译

完整示例

小结

练习

安装 BeautifulSoup

基本用法

创建 BeautifulSoup 对象

格式化输出

元素定位

通过标签名查找

find() 和 find_all()

通过 CSS 选择器

获取元素内容

获取文本

获取属性

获取子元素

修改元素

修改文本

修改属性

添加新元素

包装和 unwrap

删除元素

导航树

按标签层级导航

常见任务示例

提取所有链接

提取表格数据

提取列表项

处理分页

处理中文编码

性能优化

使用 lxml 解析器

使用 CSS 选择器

使用正则表达式预编译

完整示例

小结

练习