NumPy 字符串操作

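NumPy 提供了 numpy.char 模块来处理字符串数组。这些函数对数组中的每个元素执行字符串操作，可以用一次调用处理整个数组，省去手写循环。需要注意：部分函数（如 upper、split）内部仍是逐元素调用 Python 的字符串方法，速度未必明显快于列表推导式，实际性能应以测量为准；此外，在 NumPy 2.0 及以上版本中 numpy.char 被标记为遗留（legacy）模块，官方推荐使用接口基本相同的 numpy.strings。本章将详细介绍 NumPy 字符串操作函数的使用方法。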

字符串数组基础

创建字符串数组

NumPy 字符串数组使用固定长度的 Unicode 字符串或字节字符串：

import numpy as np

# Unicode 字符串数组（推荐）
arr = np.array(['hello', 'world', 'numpy'])
print(f"字符串数组: {arr}")
print(f"数据类型: {arr.dtype}")  # <U5 表示最大长度为5的Unicode字符串

# 指定长度的字符串数组
arr_fixed = np.array(['a', 'bb', 'ccc'], dtype='U10')  # 最大长度10
print(f"\n固定长度数组: {arr_fixed}")
print(f"数据类型: {arr_fixed.dtype}")

# 字节字符串数组
arr_bytes = np.array([b'hello', b'world'])
print(f"\n字节字符串数组: {arr_bytes}")
print(f"数据类型: {arr_bytes.dtype}")

字符串数组的特点

import numpy as np

# 字符串长度自动对齐到最长元素
arr = np.array(['hi', 'hello', 'greetings'])
print(f"数组: {arr}")
print(f"dtype: {arr.dtype}")  # <U9（greetings 的长度）

# 超出长度的字符串会被截断
arr_truncate = np.array(['hello world'], dtype='U5')
print(f"\n截断后: {arr_truncate}")  # 'hello'

# 空字符串
arr_empty = np.array(['', 'a', 'ab'])
print(f"\n含空字符串: {arr_empty}")

StringDType：NumPy 2.0 可变长度字符串

NumPy 2.0 引入了 StringDType，这是一种新的可变长度字符串数据类型，解决了固定长度字符串的限制。

固定长度字符串的问题

传统的 NumPy 字符串数组使用固定长度，存在以下问题：

import numpy as np

# 问题1：长度浪费
arr1 = np.array(['a', 'hello', 'very long string'])
print(f"固定长度: {arr1.dtype}")  # <U16，所有元素占用16字符空间
print(f"内存浪费: 短字符串也占用最大长度空间")

# 问题2：截断风险
arr2 = np.array(['initial'], dtype='U5')
print(f"\n指定长度 U5 后赋值长字符串:")
arr2[0] = 'this is a long string'
print(f"结果被截断: '{arr2[0]}'")  # 'this '

# 问题3：无法动态扩展
# 一旦创建，无法存储超过指定长度的字符串

StringDType 的优势

StringDType 存储可变长度字符串，自动调整存储空间：

import numpy as np

# 创建 StringDType 数组
arr = np.array(['short', 'medium string', 'a very long string indeed'], 
               dtype=np.dtypes.StringDType())

print(f"StringDType 数组: {arr}")
print(f"dtype: {arr.dtype}")  # StringDType()

# 可以存储任意长度的字符串
arr[0] = 'x'  # 短字符串
arr[1] = 'this string is much longer than before'  # 长字符串
print(f"\n动态长度:")
for i, s in enumerate(arr):
    print(f"  arr[{i}]: '{s}' (长度 {len(s)})")

创建 StringDType 数组

import numpy as np

# 方法1：显式指定 dtype
arr1 = np.array(['hello', 'world'], dtype=np.dtypes.StringDType())
print(f"方法1 - 显式指定: {arr1.dtype}")

# 方法2：使用字符串 'T' 作为简写（NumPy 2.0+）
arr2 = np.array(['hello', 'world'], dtype='T')
print(f"方法2 - 'T' 简写: {arr2.dtype}")

# 方法3：从其他类型转换
arr3 = np.array([1, 2, 3]).astype(np.dtypes.StringDType())
print(f"\n从整数转换: {arr3}")

# 创建空数组
empty = np.empty(5, dtype=np.dtypes.StringDType())
print(f"空数组默认值: {empty}")  # 空字符串

StringDType 与固定长度字符串的对比

import numpy as np

# 测试数据
words = ['hi', 'hello', 'greetings', 'a very very long string']

# 固定长度字符串
fixed = np.array(words)
print(f"固定长度 dtype: {fixed.dtype}")
print(f"每个元素大小: {fixed.itemsize} 字节")

# StringDType
variable = np.array(words, dtype=np.dtypes.StringDType())
print(f"\nStringDType dtype: {variable.dtype}")
print(f"数组总大小: {variable.nbytes} 字节")

# 内存效率对比
import sys
fixed_total = fixed.nbytes
variable_total = variable.nbytes + sys.getsizeof(variable)
print(f"\n固定长度总内存: {fixed_total} 字节")
print(f"StringDType 更灵活，按需分配内存")

StringDType 的操作

import numpy as np

arr = np.array(['apple', 'banana', 'cherry'], dtype=np.dtypes.StringDType())

# 字符串操作（使用 numpy.char 模块）
upper = np.char.upper(arr)
print(f"大写: {upper}")

# 注意：操作结果可能返回固定长度字符串类型
print(f"操作后 dtype: {upper.dtype}")

# 保持 StringDType 类型
upper_sd = np.char.upper(arr).astype(np.dtypes.StringDType())
print(f"保持类型: {upper_sd.dtype}")

# 字符串连接
arr2 = np.array([' pie', ' split', ' tart'], dtype=np.dtypes.StringDType())
combined = np.char.add(arr, arr2)
print(f"\n连接结果: {combined}")

StringDType 的性能考虑

import numpy as np
import time

# 大量短字符串测试
n = 100000
short_words = ['word'] * n

# 固定长度字符串
start = time.time()
fixed = np.array(short_words, dtype='U10')
_ = np.char.upper(fixed)
time_fixed = time.time() - start

# StringDType
start = time.time()
variable = np.array(short_words, dtype=np.dtypes.StringDType())
_ = np.char.upper(variable)
time_variable = time.time() - start

print("短字符串场景:")
print(f"  固定长度: {time_fixed:.4f}s")
print(f"  StringDType: {time_variable:.4f}s")

# 对于长度差异大的数据，StringDType 更节省内存
mixed_words = ['a' * i for i in range(1, 1001)]  # 长度从1到1000
fixed_mixed = np.array(mixed_words)
variable_mixed = np.array(mixed_words, dtype=np.dtypes.StringDType())

print(f"\n混合长度场景:")
print(f"  固定长度数组大小: {fixed_mixed.nbytes / 1024:.1f} KB")
print(f"  StringDType 更节省内存（按需分配）")

StringDType 与缺失值

import numpy as np

# StringDType 支持 NA（缺失值）
arr = np.array(['hello', 'world', None, 'numpy'], dtype=np.dtypes.StringDType())
print(f"含缺失值的数组: {arr}")
print(f"缺失值表示: {arr[2]}")  # <NA>

# 检查缺失值
import numpy.ma as ma
is_missing = arr != arr  # NaN != NaN
print(f"\n缺失值位置: {is_missing}")

# 使用 pd.NA 或 None
arr2 = np.array(['a', 'b'], dtype=np.dtypes.StringDType())
arr2[0] = None
print(f"\n赋值 None 后: {arr2}")

StringDType 使用建议

import numpy as np

# 推荐使用场景：
# 1. 字符串长度差异大
# 2. 需要存储任意长度字符串
# 3. 处理包含缺失值的字符串数据

# 示例：处理文本数据
texts = [
    'Title',
    'This is a paragraph with moderate length.',
    'A very long article content that spans multiple sentences and contains extensive information about the topic.',
    'Short.'
]

# 使用 StringDType 存储
text_array = np.array(texts, dtype=np.dtypes.StringDType())
print("文本数据存储:")
for i, text in enumerate(text_array):
    print(f"  文本{i+1}: 长度 {len(text)}")

# 字符串操作后注意类型
lengths = np.char.str_len(text_array)
print(f"\n字符串长度: {lengths}")

兼容性说明

import numpy as np

# StringDType 是 NumPy 2.0+ 的新特性
# 检查 NumPy 版本
print(f"NumPy 版本: {np.__version__}")

# 如果使用 NumPy 1.x，需要使用固定长度字符串或 object 类型
# object 类型可以存储任意长度字符串，但效率较低
arr_object = np.array(['short', 'very long string'], dtype=object)
print(f"\nNumPy 1.x 替代方案 (object 类型): {arr_object.dtype}")
print(f"优点: 可存储任意长度")
print(f"缺点: 内存效率低，操作慢")

字符串转换

大小写转换

import numpy as np

arr = np.array(['Hello', 'WORLD', 'NumPy', 'python'])

# 转换为大写
upper = np.char.upper(arr)
print(f"大写: {upper}")

# 转换为小写
lower = np.char.lower(arr)
print(f"小写: {lower}")

# 首字母大写
capitalize = np.char.capitalize(arr)
print(f"首字母大写: {capitalize}")

# 每个单词首字母大写
title = np.char.title(np.array(['hello world', 'numpy tutorial']))
print(f"标题格式: {title}")

# 大小写互换
swapcase = np.char.swapcase(arr)
print(f"大小写互换: {swapcase}")

类型转换

import numpy as np

# 字符串与数值转换
numbers = np.array([1, 2, 3, 4, 5])
str_numbers = np.char.mod('%d', numbers)
print(f"数值转字符串: {str_numbers}")

# 格式化字符串
formatted = np.char.mod('Value: %.2f', np.array([1.234, 5.678, 9.012]))
print(f"格式化: {formatted}")

# 字符串转数值（使用 astype）
str_arr = np.array(['1', '2', '3'])
num_arr = str_arr.astype(int)
print(f"\n字符串转整数: {str_arr} -> {num_arr}")

字符串拼接与分割

字符串拼接

import numpy as np

arr1 = np.array(['Hello', 'Good'])
arr2 = np.array(['World', 'Morning'])

# Element-wise concatenation of two arrays
concat = np.char.add(arr1, arr2)
print(f"直接连接: {concat}")

# NOTE: np.char.join applies str.join to each element separately, so the
# separator is inserted between the characters of every string
# ('hello' -> 'h-e-l-l-o'); it does not join the array's elements together.
join = np.char.join('-', np.array(['hello', 'world']))
print(f"用-连接: {join}")

# Concatenate two arrays with a separator by chaining two add() calls
concat_sep = np.char.add(np.char.add(arr1, ' '), arr2)
print(f"用空格连接: {concat_sep}")

# Repeat each string
repeat = np.char.multiply(np.array(['ab', 'cd']), 3)
print(f"重复3次: {repeat}")

字符串分割

import numpy as np

arr = np.array(['hello world', 'numpy tutorial', 'python programming'])

# 按空格分割
split = np.char.split(arr)
print(f"分割结果:")
for i, s in enumerate(split):
    print(f"  '{arr[i]}' -> {s}")

# 指定分隔符
split_sep = np.char.split(np.array(['a,b,c', 'x-y-z']), sep=',')
print(f"\n按逗号分割: {split_sep}")

# 分割为多行（返回数组列表）
splitlines = np.char.splitlines(np.array(['hello\nworld', 'foo\r\nbar']))
print(f"\n按行分割: {splitlines}")

字符串查找与替换

查找子字符串

import numpy as np

arr = np.array(['hello world', 'numpy array', 'python code'])

# 查找子字符串位置
find = np.char.find(arr, 'o')
print(f"查找 'o' 的位置: {find}")  # 返回第一个匹配的位置，-1表示未找到

# 查找多个子字符串
find2 = np.char.find(arr, 'py')
print(f"查找 'py' 的位置: {find2}")

# 从右边开始查找
rfind = np.char.rfind(arr, 'o')
print(f"从右查找 'o': {rfind}")

# 检查是否包含子字符串（返回布尔数组）
contains = np.char.find(arr, 'python') >= 0
print(f"\n是否包含 'python': {contains}")

字符串替换

import numpy as np

arr = np.array(['hello world', 'hello numpy', 'hello python'])

# 替换子字符串
replace = np.char.replace(arr, 'hello', 'hi')
print(f"替换后: {replace}")

# 替换多个
arr2 = np.array(['apple-banana', 'cherry-date'])
replace2 = np.char.replace(arr2, '-', ', ')
print(f"替换分隔符: {replace2}")

# 使用正则表达式替换（需要使用 Python re 模块配合）
import re
arr3 = np.array(['abc123', 'def456'])
# 对每个元素应用正则替换
regex_replace = np.array([re.sub(r'\d+', 'NUM', s) for s in arr3])
print(f"正则替换: {regex_replace}")

计数

import numpy as np

arr = np.array(['hello world', 'numpy tutorial', 'python programming'])

# 计算子字符串出现次数
count = np.char.count(arr, 'o')
print(f"'o' 出现次数: {count}")

count2 = np.char.count(arr, 'py')
print(f"'py' 出现次数: {count2}")

字符串判断

前缀和后缀判断

import numpy as np

arr = np.array(['hello.txt', 'world.py', 'numpy.cpp', 'test.txt'])

# 判断是否以某字符串开头
startswith = np.char.startswith(arr, 'hello')
print(f"以 'hello' 开头: {startswith}")

# 判断是否以某字符串结尾
endswith_py = np.char.endswith(arr, '.py')
print(f"以 '.py' 结尾: {endswith_py}")

endswith_txt = np.char.endswith(arr, '.txt')
print(f"以 '.txt' 结尾: {endswith_txt}")

# 应用：筛选文件
python_files = arr[endswith_py]
print(f"\nPython 文件: {python_files}")

字符类型判断

import numpy as np

# 判断是否只包含字母
arr_alpha = np.array(['hello', 'world123', 'numpy'])
is_alpha = np.char.isalpha(arr_alpha)
print(f"只含字母: {is_alpha}")

# 判断是否只包含字母和数字
arr_alnum = np.array(['hello', 'world123', 'test!'])
is_alnum = np.char.isalnum(arr_alnum)
print(f"只含字母数字: {is_alnum}")

# 判断是否只包含数字
arr_digit = np.array(['123', '45.6', '789'])
is_digit = np.char.isdigit(arr_digit)
print(f"只含数字: {is_digit}")

# 判断是否只包含小写字母
arr_lower = np.array(['hello', 'World', 'numpy'])
is_lower = np.char.islower(arr_lower)
print(f"只含小写: {is_lower}")

# 判断是否只包含大写字母
arr_upper = np.array(['HELLO', 'World', 'NUMPY'])
is_upper = np.char.isupper(arr_upper)
print(f"只含大写: {is_upper}")

# 判断是否为空白字符
arr_space = np.array([' ', '\t', '\n', 'hello'])
is_space = np.char.isspace(arr_space)
print(f"空白字符: {is_space}")

其他判断函数

import numpy as np

# Valid Python identifier check. numpy.char has no isidentifier function,
# so str.isidentifier is applied to each element instead.
arr_identifier = np.array(['valid_name', '123invalid', '_private'])
is_identifier = np.array([s.isidentifier() for s in arr_identifier], dtype=bool)
print(f"有效标识符: {is_identifier}")

# Title-case check
arr_title = np.array(['Hello World', 'hello world', 'HelloWorld'])
is_title = np.char.istitle(arr_title)
print(f"标题格式: {is_title}")

# Numeric check (also accepts characters such as vulgar fractions and
# CJK numerals)
arr_numeric = np.array(['123', '½', 'abc'])
is_numeric = np.char.isnumeric(arr_numeric)
print(f"数字字符串: {is_numeric}")

# Decimal-digit check
arr_decimal = np.array(['123', '45.6', '789'])
is_decimal = np.char.isdecimal(arr_decimal)
print(f"十进制数字: {is_decimal}")

字符串修剪与填充

去除空白

import numpy as np

arr = np.array(['  hello  ', '\tworld\t', '\nnumpy\n'])

# 去除两端空白
strip = np.char.strip(arr)
print(f"去除两端空白: {strip}")

# 去除左侧空白
lstrip = np.char.lstrip(arr)
print(f"去除左侧空白: {lstrip}")

# 去除右侧空白
rstrip = np.char.rstrip(arr)
print(f"去除右侧空白: {rstrip}")

# 去除指定字符
arr2 = np.array(['xxhelloxx', 'yyworldyy'])
strip_chars = np.char.strip(arr2, 'xy')
print(f"\n去除指定字符: {strip_chars}")

填充

import numpy as np

arr = np.array(['1', '22', '333', '4444'])

# 左侧填充（右对齐）
just_right = np.char.rjust(arr, width=5)
print(f"右对齐:\n{just_right}")

# 右侧填充（左对齐）
just_left = np.char.ljust(arr, width=5)
print(f"\n左对齐:\n{just_left}")

# 居中填充
center = np.char.center(arr, width=5)
print(f"\n居中:\n{center}")

# 使用自定义填充字符
center_star = np.char.center(arr, width=5, fillchar='*')
print(f"\n用*居中: {center_star}")

# 数字补零（常用于编号）
numbers = np.array(['1', '25', '100', '1000'])
zfill = np.char.zfill(numbers, width=4)
print(f"\n补零: {zfill}")

字符串编码与解码

import numpy as np

# Encode Unicode strings to bytes
arr = np.array(['hello', 'world', '你好'])
encoded = np.char.encode(arr, encoding='utf-8')
print(f"编码后类型: {encoded.dtype}")
print(f"编码后: {encoded}")

# Decode bytes back to strings
decoded = np.char.decode(encoded, encoding='utf-8')
print(f"解码后: {decoded}")

# Choosing an error handler. 'ö' has no ASCII representation, so this input
# actually exercises the handler.
arr_invalid = np.array(['hello', 'wörld'])
try:
    # 'ignore' drops characters that cannot be encoded
    encoded_ignore = np.char.encode(arr_invalid, encoding='ascii', errors='ignore')
    print(f"\n忽略错误编码: {encoded_ignore}")
except UnicodeEncodeError:
    # Only encoding failures are expected here; anything else should surface
    print("编码错误")

字符串比较

import numpy as np

arr1 = np.array(['apple', 'banana', 'cherry'])
arr2 = np.array(['apple', 'blueberry', 'cherry'])

# 元素级比较
equal = np.char.equal(arr1, arr2)
print(f"相等: {equal}")

not_equal = np.char.not_equal(arr1, arr2)
print(f"不相等: {not_equal}")

# 字符串比较（按字典序）
arr3 = np.array(['apple', 'banana', 'cherry'])
arr4 = np.array(['apricot', 'apple', 'date'])

greater = np.char.greater(arr3, arr4)
print(f"大于: {greater}")

less = np.char.less(arr3, arr4)
print(f"小于: {less}")

greater_equal = np.char.greater_equal(arr3, arr4)
print(f"大于等于: {greater_equal}")

实际应用示例

示例1：数据清洗

import numpy as np

# 原始数据（包含不一致的格式）
raw_names = np.array(['  John  ', 'JANE', 'bob', '  ALICE'])

# 清洗：去除空白、统一大小写
cleaned = np.char.capitalize(np.char.strip(raw_names))
print(f"原始数据: {raw_names}")
print(f"清洗后: {cleaned}")

示例2：文件名处理

import numpy as np

# 文件名列表
files = np.array([
    'data_2023_01.csv',
    'data_2023_02.csv',
    'data_2023_03.csv',
    'summary_2023.txt'
])

# 提取文件扩展名
extensions = np.char.split(files, '.')
ext_list = [parts[-1] if len(parts) > 1 else '' for parts in extensions]
print(f"扩展名: {ext_list}")

# 筛选 CSV 文件
is_csv = np.char.endswith(files, '.csv')
csv_files = files[is_csv]
print(f"\nCSV 文件: {csv_files}")

# 提取日期部分
dates = np.char.replace(np.char.replace(files, 'data_', ''), '.csv', '')
print(f"日期: {dates[is_csv]}")

示例3：格式化输出

import numpy as np

# 学生成绩数据
names = np.array(['Alice', 'Bob', 'Charlie', 'David'])
scores = np.array([95.5, 87.3, 92.1, 78.9])

# 创建格式化报告
report = np.char.add(
    np.char.add(np.char.rjust(names, 8), ': '),
    np.char.mod('%.1f', scores)
)
print("成绩报告:")
for line in report:
    print(f"  {line}")

# 创建排名格式
ranks = np.arange(1, len(names) + 1)
ranked = np.char.add(
    np.char.add(np.char.zfill(np.char.mod('%d', ranks), 2), '. '),
    report
)
print("\n排名:")
for line in ranked:
    print(f"  {line}")

示例4：批量字符串处理

import numpy as np

import re

# URLs to process
urls = np.array([
    'https://example.com/page1',
    'http://test.org/page2',
    'https://demo.net/page3'
])

# Extract the domain: the third '/'-separated field of each URL
domains = np.array([url.split('/')[2] for url in urls])
print(f"域名: {domains}")

# Protocol check
is_https = np.char.startswith(urls, 'https://')
print(f"HTTPS: {is_https}")

# Extract the path. np.char.replace only substitutes literal substrings, so
# the scheme-and-host pattern is removed with re.sub element by element.
paths = np.array([re.sub(r'https?://[^/]+', '', url) for url in urls])
# Equivalent approach without regular expressions
paths_simple = np.array(['/' + '/'.join(url.split('/')[3:]) for url in urls])
print(f"路径: {paths_simple}")

性能考虑

向量化 vs 循环

import numpy as np
import time

# A large batch of identical strings
words = np.array(['hello'] * 100000)

# Vectorized operation. perf_counter is a monotonic, high-resolution clock,
# so the measured interval is not rounded down to zero on platforms where
# time.time() is coarse (which would break the ratio below).
start = time.perf_counter()
upper = np.char.upper(words)
time_vectorized = time.perf_counter() - start

# Plain Python loop over the same data
start = time.perf_counter()
upper_loop = np.array([w.upper() for w in words])
time_loop = time.perf_counter() - start

print(f"向量化: {time_vectorized:.4f}s")
print(f"循环: {time_loop:.4f}s")
print(f"加速比: {time_loop / time_vectorized:.2f}x")

注意事项

import numpy as np

# 固定长度字符串可能截断
arr = np.array(['a'], dtype='U2')
arr[0] = 'hello'  # 只有 'he' 被存储
print(f"截断问题: '{arr[0]}'")

# 解决方案：使用足够大的长度或不指定
arr2 = np.array(['a'], dtype='U10')
arr2[0] = 'hello'
print(f"正确存储: '{arr2[0]}'")

# 或使用 object 类型存储任意长度字符串
arr3 = np.array(['a'], dtype=object)
arr3[0] = 'a very long string that would be truncated'
print(f"object 类型: '{arr3[0]}'")

字符串函数速查

函数	说明
`char.add(a, b)`	字符串连接
`char.multiply(a, i)`	字符串重复
`char.capitalize(a)`	首字母大写
`char.title(a)`	每个单词首字母大写
`char.lower(a)`	转小写
`char.upper(a)`	转大写
`char.swapcase(a)`	大小写互换
`char.strip(a)`	去除两端空白
`char.lstrip(a)`	去除左侧空白
`char.rstrip(a)`	去除右侧空白
`char.split(a)`	分割字符串
`char.replace(a, old, new)`	替换字符串
`char.find(a, sub)`	查找子字符串
`char.count(a, sub)`	计数子字符串
`char.startswith(a, prefix)`	判断前缀
`char.endswith(a, suffix)`	判断后缀
`char.isalpha(a)`	判断是否字母
`char.isdigit(a)`	判断是否数字
`char.isalnum(a)`	判断是否字母数字
`char.islower(a)`	判断是否小写
`char.isupper(a)`	判断是否大写
`char.center(a, width)`	居中填充
`char.ljust(a, width)`	左对齐填充
`char.rjust(a, width)`	右对齐填充
`char.zfill(a, width)`	补零
`char.encode(a)`	编码
`char.decode(a)`	解码
`char.equal(a, b)`	比较

小结

本章介绍了 NumPy 字符串操作的主要功能：

基础操作：大小写转换、类型转换
拼接分割：字符串连接、分割、重复
查找替换：子字符串查找、替换、计数
判断函数：前缀后缀、字符类型判断
修剪填充：去空白、对齐、补零
编码解码：Unicode 和字节转换
实际应用：数据清洗、文件名处理、格式化输出

NumPy 字符串操作虽然不如 Python 原生字符串功能丰富，但可以对整个数组一次性完成操作，代码更简洁，适合数据分析场景；具体性能因函数和 NumPy 版本而异，建议在实际数据上测量后再做选择。

练习

将一组姓名数据统一为首字母大写格式
从一组文件路径中提取文件名和扩展名
实现简单的字符串模板替换功能
处理 CSV 格式的字符串数据，提取各列值
实现字符串相似度比较（使用 NumPy 字符串函数）

字符串数组基础​

创建字符串数组​

字符串数组的特点​

StringDType：NumPy 2.0 可变长度字符串​

固定长度字符串的问题​

StringDType 的优势​

创建 StringDType 数组​

StringDType 与固定长度字符串的对比​

StringDType 的操作​

StringDType 的性能考虑​

StringDType 与缺失值​

StringDType 使用建议​

兼容性说明​

字符串转换​

大小写转换​

类型转换​

字符串拼接与分割​

字符串拼接​

字符串分割​

字符串查找与替换​

查找子字符串​

字符串替换​

计数​

字符串判断​

前缀和后缀判断​

字符类型判断​

其他判断函数​

字符串修剪与填充​

去除空白​

填充​

字符串编码与解码​

字符串比较​

实际应用示例​

示例1：数据清洗​

示例2：文件名处理​

示例3：格式化输出​

示例4：批量字符串处理​

性能考虑​

向量化 vs 循环​

注意事项​

字符串函数速查​

小结​

练习​