
Multimodal Models

Multimodal models can process several types of data at once (such as images, text, and audio), enabling cross-modal understanding and generation. This chapter introduces how to use the multimodal models in the Transformers library, covering vision-language models, image captioning, visual question answering, and related tasks.

What Is a Multimodal Model?

Traditional NLP models handle only text; multimodal models remove this limitation. They can:

  • Understand image content: recognize objects, scenes, and actions in an image
  • Relate images and text: capture the semantic relationship between an image and a piece of text
  • Generate multimodal content: produce a text description from an image, or generate an image from a text prompt
Multimodal model architecture (schematic):

  image input
      │
      ▼
  vision encoder (ViT / CLIP / SigLIP)
      │
      ▼
  visual feature vectors ──────────┐
                                   ▼
  text input ──────────▶ language model (LLaMA / GPT)
                                   │
                                   ▼
                          text output / generation
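
For a quick end-to-end taste of this architecture, the high-level pipeline API wraps the whole image-to-text flow in a few lines. The sketch below is illustrative only; the BLIP checkpoint and the local file name are example choices.

from transformers import pipeline

# Minimal image-to-text sketch using the pipeline API
# (the checkpoint is an example; any image-captioning model works here)
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

result = captioner("cat.jpg")  # a local path, URL, or PIL image
print(result[0]["generated_text"])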

Supported Multimodal Tasks

Transformers supports a wide range of multimodal tasks:

Task                         Description                                                 Representative models
Image captioning             Generate a text description of an image                    ViT-GPT2, BLIP
Visual question answering    Answer questions about an image                             LLaVA, Qwen-VL
Image-text matching          Decide whether an image and a text match                    CLIP
Document understanding       Read the text and layout in document images                 LayoutLM, Donut
Visual grounding             Locate the object described by a text phrase in an image    Grounding DINO
Image generation             Generate an image from a text description                   Stable Diffusion
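
Several of these tasks are also exposed directly through the pipeline API. As an illustration, the sketch below runs zero-shot image classification; the CLIP checkpoint and candidate labels are examples, and the manual CLIP workflow is covered in the next section.

from transformers import pipeline

# Zero-shot image classification via the pipeline API
# (checkpoint and candidate labels below are only examples)
classifier = pipeline("zero-shot-image-classification", model="openai/clip-vit-base-patch32")

preds = classifier("cat.jpg", candidate_labels=["cat", "dog", "bird"])
for pred in preds:
    print(pred["label"], round(pred["score"], 4))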

CLIP: Image-Text Matching and Zero-Shot Classification

CLIP (Contrastive Language-Image Pre-training) is a vision-language model from OpenAI that uses contrastive learning to map images and text into a shared embedding space.

How CLIP Works

CLIP consists of two encoders:

  • Image encoder: typically a ViT (Vision Transformer) that converts an image into a feature vector
  • Text encoder: typically a Transformer that converts text into a feature vector

The two encoders are trained jointly with a contrastive objective, so that matching image-text pairs end up close together in the shared embedding space.
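
In code, this shared embedding space can be probed directly. The following is a minimal sketch (using the same base checkpoint as the examples below): it extracts both embeddings with get_text_features and get_image_features, normalizes them, and uses their dot product as the similarity score.

from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("cat.jpg")
text_inputs = processor(text=["a photo of a cat"], return_tensors="pt", padding=True)
image_inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    text_emb = model.get_text_features(**text_inputs)     # shape [1, 512]
    image_emb = model.get_image_features(**image_inputs)  # shape [1, 512]

# Normalize, then a dot product gives the cosine similarity that the contrastive loss optimizes
text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)
image_emb = image_emb / image_emb.norm(dim=-1, keepdim=True)
similarity = (image_emb @ text_emb.T).item()
print(f"similarity: {similarity:.4f}")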

Zero-Shot Classification with CLIP

from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Load the model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Load the image
image = Image.open("cat.jpg")

# Define candidate labels (zero-shot classification)
candidate_labels = ["a photo of a cat", "a photo of a dog", "a photo of a bird"]

# Prepare the inputs
inputs = processor(
    text=candidate_labels,
    images=image,
    return_tensors="pt",
    padding=True
)

# Inference
with torch.no_grad():
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image  # image-text similarity scores
    probs = logits_per_image.softmax(dim=1)      # convert to probabilities

# Print the results
for label, prob in zip(candidate_labels, probs[0]):
    print(f"{label}: {prob.item():.4f}")
# Example output:
# a photo of a cat: 0.8923
# a photo of a dog: 0.0756
# a photo of a bird: 0.0321

Computing Image-Text Similarity

from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Multiple images and multiple text snippets
images = [
    Image.open("cat.jpg"),
    Image.open("dog.jpg"),
    Image.open("car.jpg")
]

texts = [
    "a fluffy cat sitting on a couch",
    "a dog playing in the park",
    "a red sports car on the road"
]

inputs = processor(text=texts, images=images, return_tensors="pt", padding=True)

with torch.no_grad():
    outputs = model(**inputs)
    # Image-text similarity matrix of shape [num_images, num_texts]
    logits_per_image = outputs.logits_per_image

print("Image-text similarity matrix:")
print(logits_per_image)
# The diagonal entries should be the highest (matching image-text pairs)

Extracting Image and Text Features

from transformers import CLIPModel, CLIPProcessor
from PIL import Image
import torch

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("example.jpg")
text = "a beautiful landscape"

inputs = processor(text=[text], images=[image], return_tensors="pt", padding=True)

with torch.no_grad():
    outputs = model(**inputs)

# Image feature vector of shape [1, 512]
image_features = outputs.image_embeds

# Text feature vector of shape [1, 512]
text_features = outputs.text_embeds

print(f"Image feature shape: {image_features.shape}")
print(f"Text feature shape: {text_features.shape}")

# Compute cosine similarity
cosine_similarity = torch.nn.functional.cosine_similarity(
    image_features, text_features
)
print(f"Image-text similarity: {cosine_similarity.item():.4f}")

LLaVA: Visual Dialogue and Question Answering

LLaVA (Large Language and Vision Assistant) is a powerful vision-language model that can understand images and hold multi-turn conversations. It connects a vision encoder (such as CLIP) with a large language model (such as Vicuna/LLaMA).

Basic Usage

from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
from PIL import Image
import torch

# Load the model and processor
model_id = "llava-hf/llava-v1.6-mistral-7b-hf"
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Load the image
image = Image.open("example.jpg")

# Build the conversation prompt
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Please describe this image in detail."}
        ]
    }
]

# Prepare the inputs
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

# Generate the answer
output = model.generate(**inputs, max_new_tokens=512)
response = processor.decode(output[0], skip_special_tokens=True)

print(response)

Multi-Turn Conversation

from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
from PIL import Image
import torch

model_id = "llava-hf/llava-v1.6-mistral-7b-hf"
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

image = Image.open("city.jpg")

# Multi-turn conversation
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What place is this?"}
        ]
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "This is a photo of a city street scene, showing high-rise buildings and busy roads."}
        ]
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Roughly how many cars are in the picture?"}
        ]
    }
]

prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

output = model.generate(**inputs, max_new_tokens=256)
response = processor.decode(output[0], skip_special_tokens=True)

print(response)

Visual Question Answering Task

from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
from PIL import Image
import torch

def visual_qa(image_path, question):
    """Answer a question about an image."""
    model_id = "llava-hf/llava-v1.6-mistral-7b-hf"
    processor = LlavaNextProcessor.from_pretrained(model_id)
    model = LlavaNextForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto"
    )

    image = Image.open(image_path)

    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": question}
            ]
        }
    ]

    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

    output = model.generate(**inputs, max_new_tokens=256)
    response = processor.decode(output[0], skip_special_tokens=True)

    # Keep only the assistant's reply
    if "[/INST]" in response:
        response = response.split("[/INST]")[-1].strip()

    return response

# Example usage
answer = visual_qa("product.jpg", "What are the key features of this product, and who is it suitable for?")
print(answer)

BLIP: Image Captioning and Understanding

BLIP (Bootstrapping Language-Image Pre-training) is a unified vision-language pre-training framework that supports image captioning, visual question answering, image-text retrieval, and other tasks.

Image Captioning

from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

# Load the model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base",
    torch_dtype=torch.float16
).to("cuda")

# Load the image
image = Image.open("beach.jpg")

# Generate a caption
inputs = processor(images=image, return_tensors="pt").to("cuda", torch.float16)

with torch.no_grad():
    output = model.generate(**inputs, max_length=50)

caption = processor.decode(output[0], skip_special_tokens=True)
print(f"Caption: {caption}")
# Example output: a sandy beach with a blue ocean in the background

Conditional Image Captioning

from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base",
    torch_dtype=torch.float16
).to("cuda")

image = Image.open("park.jpg")

# Provide a text prompt to steer the caption
text_prompt = "a photography of"

inputs = processor(images=image, text=text_prompt, return_tensors="pt").to("cuda", torch.float16)

with torch.no_grad():
    output = model.generate(**inputs, max_length=50)

caption = processor.decode(output[0], skip_special_tokens=True)
print(f"Conditional caption: {caption}")

Visual Question Answering (BLIP-2)

from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
import torch

# Load a BLIP-2 model
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    torch_dtype=torch.float16,
    device_map="auto"
)

image = Image.open("kitchen.jpg")
question = "What objects are on the table?"

# Cast the inputs to float16 to match the model dtype
inputs = processor(images=image, text=question, return_tensors="pt").to(model.device, torch.float16)

with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=100)

answer = processor.decode(output[0], skip_special_tokens=True)
print(f"Question: {question}")
print(f"Answer: {answer}")

VisionEncoderDecoder: Custom Vision-Language Models

Transformers provides the VisionEncoderDecoderModel class, which lets you flexibly combine a vision encoder with a language decoder.

Image Captioning

from transformers import (
    VisionEncoderDecoderModel,
    ViTImageProcessor,
    GPT2TokenizerFast
)
from PIL import Image
import torch

# Load a pretrained vision encoder + language decoder pair
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
image_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = GPT2TokenizerFast.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Configure decoding-related token ids
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id

# Load the image
image = Image.open("sunset.jpg")

# Preprocess
pixel_values = image_processor(images=image, return_tensors="pt").pixel_values

# Generate a caption
with torch.no_grad():
    output_ids = model.generate(pixel_values, max_length=50)

caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(f"Caption: {caption}")

Building a Vision-Language Model from Scratch

from transformers import (
    VisionEncoderDecoderConfig,
    VisionEncoderDecoderModel,
    ViTImageProcessor,
    AutoTokenizer,
    ViTConfig,
    GPT2Config
)
import torch

# Option 1: combine two pretrained models
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_pretrained_model_name_or_path="google/vit-base-patch16-224-in21k",
    decoder_pretrained_model_name_or_path="gpt2"
)

# Option 2: build from custom configurations
encoder_config = ViTConfig(
    image_size=224,
    patch_size=16,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12
)

decoder_config = GPT2Config(
    vocab_size=50257,
    n_positions=1024,
    n_embd=768,
    n_layer=12,
    n_head=12
)

config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(
    encoder_config,
    decoder_config
)
model = VisionEncoderDecoderModel(config=config)

# Usage after fine-tuning
image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

# Configure generation parameters
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.max_length = 50
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 1.0
model.config.num_beams = 4
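
To fine-tune such a composed model, the usual pattern is to pass pixel_values together with the tokenized caption as labels; the model then returns a cross-entropy loss over the caption tokens. The following minimal sketch reuses the model, image_processor, and tokenizer defined above; the image path and caption string are placeholders.

from PIL import Image
import torch

# One illustrative training step for the composed encoder-decoder model above
image = Image.open("example.jpg").convert("RGB")
pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
labels = tokenizer("a caption describing this image", return_tensors="pt").input_ids

model.train()
outputs = model(pixel_values=pixel_values, labels=labels)  # loss computed against the shifted labels
loss = outputs.loss
loss.backward()
print(f"loss: {loss.item():.4f}")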

Qwen-VL: Multi-Image Understanding

Qwen-VL is an open-source vision-language model from Alibaba, with support for multi-image input and fine-grained visual grounding.

Basic Usage

from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from PIL import Image
import torch

# Load the model
model_id = "Qwen/Qwen-VL-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto"
)
model.generation_config = GenerationConfig.from_pretrained(model_id, trust_remote_code=True)

# Build the input (text and image entries can be interleaved)
query = tokenizer.from_list_format([
    {'image': 'image1.jpg'},
    {'text': 'Describe the differences between these two images'},
    {'image': 'image2.jpg'}
])

response, history = model.chat(tokenizer, query=query, history=None)
print(response)

Visual Grounding

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "Qwen/Qwen-VL-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Ask for the locations of objects in the image
query = tokenizer.from_list_format([
    {'image': 'street.jpg'},
    {'text': 'Mark all the cars in the image with bounding boxes'}
])

response, history = model.chat(tokenizer, query=query, history=None)
print(response)

# Draw the bounding boxes
image = tokenizer.draw_bbox_on_latest_picture(response, history)
if image is not None:
    image.save("output_with_boxes.jpg")

Practical Application Scenarios

Product Image Caption Generation

from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

class ProductCaptionGenerator:
    def __init__(self):
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
        self.model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-large",
            torch_dtype=torch.float16
        ).to("cuda")

    def generate(self, image_path: str, style: str = "detailed") -> str:
        """
        Generate a product description.

        Args:
            image_path: path to the product image
            style: caption style ("brief" or "detailed")
        """
        image = Image.open(image_path).convert("RGB")

        if style == "brief":
            inputs = self.processor(images=image, return_tensors="pt").to("cuda", torch.float16)
            max_length = 30
        else:
            # Use a prompt to steer the model toward a detailed description
            prompt = "a detailed product photo showing"
            inputs = self.processor(
                images=image,
                text=prompt,
                return_tensors="pt"
            ).to("cuda", torch.float16)
            max_length = 100

        with torch.no_grad():
            output = self.model.generate(**inputs, max_length=max_length)

        return self.processor.decode(output[0], skip_special_tokens=True)

# Usage
generator = ProductCaptionGenerator()
caption = generator.generate("product.jpg", style="detailed")
print(f"Product caption: {caption}")

Document Understanding

from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch

class DocumentParser:
    """Document parser for extracting structured information."""

    def __init__(self, model_id="naver-clova-ix/donut-base-finetuned-cord-v2"):
        self.processor = DonutProcessor.from_pretrained(model_id)
        self.model = VisionEncoderDecoderModel.from_pretrained(model_id)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def parse_receipt(self, image_path: str) -> dict:
        """Parse a receipt image."""
        image = Image.open(image_path).convert("RGB")

        # Prepare the decoder input (task prompt)
        task_prompt = "<s_cord-v2>"
        decoder_input_ids = self.processor.tokenizer(
            task_prompt,
            add_special_tokens=False,
            return_tensors="pt"
        ).input_ids.to(self.device)

        # Process the image
        pixel_values = self.processor(image, return_tensors="pt").pixel_values.to(self.device)

        # Generate
        with torch.no_grad():
            outputs = self.model.generate(
                pixel_values,
                decoder_input_ids=decoder_input_ids,
                max_length=self.model.decoder.config.max_position_embeddings,
                early_stopping=True,
                pad_token_id=self.processor.tokenizer.pad_token_id,
                eos_token_id=self.processor.tokenizer.eos_token_id,
                use_cache=True,
                num_beams=1,
                bad_words_ids=[[self.processor.tokenizer.unk_token_id]],
                return_dict_in_generate=True,
            )

        # Decode the result
        sequence = self.processor.batch_decode(outputs.sequences)[0]
        sequence = sequence.replace(self.processor.tokenizer.eos_token, "").replace(
            self.processor.tokenizer.pad_token, ""
        )

        # Convert to structured JSON
        result = self.processor.token2json(sequence)
        return result

# Usage
parser = DocumentParser()
receipt_data = parser.parse_receipt("receipt.jpg")
print(receipt_data)

Content Moderation

from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

class ContentModerator:
    """Image content moderation with CLIP."""

    def __init__(self):
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

        # Moderation categories
        self.categories = [
            "safe content",
            "violent content",
            "adult content",
            "hate speech imagery",
            "dangerous activities",
            "gambling content"
        ]

    def moderate(self, image_path: str, threshold: float = 0.3) -> dict:
        """
        Moderate an image.

        Returns:
            dict: per-category probabilities and whether each category is flagged
        """
        image = Image.open(image_path).convert("RGB")

        inputs = self.processor(
            text=self.categories,
            images=image,
            return_tensors="pt",
            padding=True
        )

        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = outputs.logits_per_image.softmax(dim=1)[0]

        results = {}
        for category, prob in zip(self.categories, probs.tolist()):
            results[category] = {
                "probability": prob,
                "flagged": prob > threshold
            }

        # Overall safety decision
        is_safe = results["safe content"]["probability"] > 1 - threshold

        return {
            "is_safe": is_safe,
            "details": results
        }

# Usage
moderator = ContentModerator()
result = moderator.moderate("user_upload.jpg")
print(f"Content safe: {result['is_safe']}")
for category, info in result['details'].items():
    if info['flagged']:
        print(f"Warning - {category}: {info['probability']:.2%}")

Performance Optimization Tips

Quantized Inference

from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor, BitsAndBytesConfig
import torch

# Load with 4-bit quantization (requires bitsandbytes)
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = LlavaNextForConditionalGeneration.from_pretrained(
    "llava-hf/llava-v1.6-mistral-7b-hf",
    quantization_config=quantization_config,
    device_map="auto"
)
processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")

# This cuts GPU memory usage from roughly 14 GB to roughly 4 GB

Batched Inference

from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to("cuda")

# Process multiple images at once
image_paths = ["img1.jpg", "img2.jpg", "img3.jpg", "img4.jpg"]
images = [Image.open(path).convert("RGB") for path in image_paths]

# Batched preprocessing
inputs = processor(images=images, return_tensors="pt", padding=True).to("cuda")

# Batched generation
with torch.no_grad():
    outputs = model.generate(**inputs, max_length=50)

# Batched decoding
captions = processor.batch_decode(outputs, skip_special_tokens=True)

for path, caption in zip(image_paths, captions):
    print(f"{path}: {caption}")

Using Flash Attention

from transformers import LlavaNextForConditionalGeneration
import torch

model = LlavaNextForConditionalGeneration.from_pretrained(
    "llava-hf/llava-v1.6-mistral-7b-hf",
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2",  # enable Flash Attention 2 (requires flash-attn)
    device_map="auto"
)

Summary

Multimodal models extend the reach of Transformers to combinations of images and text. Key takeaways:

  1. CLIP: image-text matching and zero-shot classification via contrastive learning
  2. LLaVA: strong visual dialogue and question-answering capabilities
  3. BLIP: a unified framework for image captioning and visual question answering
  4. VisionEncoderDecoder: flexible pairing of a vision encoder with a language decoder

In practice, choose the model that fits your task, and pay attention to GPU memory management and inference optimization.

References