详细复现 BERT

实现自注意力机制

BertSelfAttention 是 BERT 中的自注意力机制实现，它可以让模型在处理每个单词时考虑到句子中的其他单词，这有助于模型理解单词上下文中的依赖关系。注意：下文所有复现代码都假定已经执行了 `import math`、`import torch` 和 `import torch.nn as nn`。

class BertSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        hidden_size: Size of the last dimension of the input (and output).
        num_attention_heads: Number of attention heads; must be positive and
            divide ``hidden_size`` evenly.
        dropout_rate: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``hidden_size`` is not a positive multiple of
            ``num_attention_heads``.
    """

    def __init__(self, hidden_size, num_attention_heads, dropout_rate):
        super().__init__()
        # Without this check the per-head size is silently truncated and the
        # output ends up narrower than hidden_size.
        if num_attention_heads <= 0 or hidden_size % num_attention_heads != 0:
            raise ValueError(
                f"hidden_size ({hidden_size}) must be a positive multiple of "
                f"num_attention_heads ({num_attention_heads})"
            )
        self.num_attention_heads = num_attention_heads
        self.attention_head_size = hidden_size // num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(hidden_size, self.all_head_size)
        self.key = nn.Linear(hidden_size, self.all_head_size)
        self.value = nn.Linear(hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(dropout_rate)

    def transpose_for_scores(self, x):
        """Split the last dimension into heads and move the head axis forward.

        Reshapes ``(..., all_head_size)`` to ``(..., heads, head_size)`` and
        then swaps axes 1 and 2, so a 3-D input comes back as
        ``(dim0, heads, dim1, head_size)``.
        """
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask=None):
        """Apply self-attention to ``hidden_states``.

        Args:
            hidden_states: Input tensor whose last dimension is ``hidden_size``.
            attention_mask: Optional additive mask that is added to the raw
                attention scores (large negative values suppress a position).
                It must broadcast against the score tensor. ``None`` means no
                masking.

        Returns:
            Tensor with the same leading dimensions as ``hidden_states`` and a
            last dimension of ``all_head_size``.
        """
        query_layer = self.transpose_for_scores(self.query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        # Scaled dot-product scores between every pair of positions.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask

        # Normalize over the key axis, then drop whole attention weights.
        attention_probs = torch.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)

        # Weighted sum of values, then merge the heads back into one axis.
        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        return context_layer.view(*new_context_layer_shape)

实现层归一化

层归一化通过计算隐藏层输出的均值和标准差，然后使用这些统计量来归一化隐藏层的激活值。这有助于确保网络中的激活值分布保持相对稳定，减少训练过程中的内部协变量偏移（Internal Covariate Shift）。这个类也不是必须要实现的，可以直接用torch.nn.LayerNorm，只是在某些情况下，自定义实现可能会进行特定的性能优化，以适应模型的特定方面。

class BertLayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learned scale and shift.

    Args:
        hidden_size: Size of the last dimension being normalized.
        eps: Constant added to the variance before the square root.
    """

    def __init__(self, hidden_size, eps=1e-12):
        super().__init__()
        self.variance_epsilon = eps
        # Scale starts at one and shift at zero, so the affine step is
        # initially the identity.
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))

    def forward(self, x):
        """Return ``x`` normalized along its last axis, then scaled and shifted."""
        mean = x.mean(-1, keepdim=True)
        centered = x - mean
        # Biased variance: the mean of the squared deviations.
        variance = centered.pow(2).mean(-1, keepdim=True)
        normalized = centered / torch.sqrt(variance + self.variance_epsilon)
        return self.weight * normalized + self.bias

自注意力输出处理

BertSelfOutput 包含了自注意力机制的输出处理，包括残差连接和层归一化。这有助于防止训练过程中出现梯度消失或爆炸的问题，并能让模型从不同层中学习特征。

class BertSelfOutput(nn.Module):
    """Post-attention step: dense projection, dropout, residual add, layer norm.

    Args:
        hidden_size: Size of the last dimension of both inputs and the output.
        dropout_rate: Dropout probability applied after the dense projection.
    """

    def __init__(self, hidden_size, dropout_rate):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, hidden_states, input_tensor):
        """Project ``hidden_states``, add ``input_tensor`` as the residual, and normalize."""
        projected = self.dropout(self.dense(hidden_states))
        # Residual connection comes before the normalization.
        return self.LayerNorm(projected + input_tensor)

实现前馈神经网络

BertIntermediate 是 BERT 层中的前馈网络的一部分，它对自注意力层的输出进行进一步的处理，并通过一个激活函数（如 GELU）引入非线性。

class BertIntermediate(nn.Module):
    """First half of the feed-forward sub-layer: linear expansion followed by GELU.

    Args:
        hidden_size: Size of the last dimension of the input.
        intermediate_size: Size of the last dimension of the output.
    """

    def __init__(self, hidden_size, intermediate_size):
        super().__init__()
        self.dense = nn.Linear(hidden_size, intermediate_size)
        self.intermediate_act_fn = nn.GELU()

    def forward(self, hidden_states):
        """Return ``GELU(dense(hidden_states))``."""
        expanded = self.dense(hidden_states)
        return self.intermediate_act_fn(expanded)

BERT 块输出处理

BertOutput 用来处理 BertIntermediate 的输出，它也包含了残差连接和层归一化，确保了信息可以顺畅地流过整个网络。

class BertOutput(nn.Module):
    """Second half of the feed-forward sub-layer.

    Projects from ``intermediate_size`` back to ``hidden_size``, applies
    dropout, adds the residual input and layer-normalizes the sum.

    Args:
        intermediate_size: Size of the last dimension of the incoming activations.
        hidden_size: Size of the last dimension of the residual input and the output.
        dropout_rate: Dropout probability applied after the dense projection.
    """

    def __init__(self, intermediate_size, hidden_size, dropout_rate):
        super().__init__()
        self.dense = nn.Linear(intermediate_size, hidden_size)
        self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, hidden_states, input_tensor):
        """Project ``hidden_states``, add ``input_tensor`` as the residual, and normalize."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)

实现 BERT 层

BertLayer 是BERT编码器的单个层，它包含了自注意力机制和前馈网络。这个类的实例化会创建一个Transformer层，这是BERT模型的基础构建块。

class BertLayer(nn.Module):
    """One encoder layer: a self-attention sub-layer followed by a feed-forward sub-layer.

    Args:
        hidden_size: Size of the last dimension of the layer input and output.
        num_attention_heads: Number of attention heads.
        intermediate_size: Inner width of the feed-forward sub-layer.
        dropout_rate: Dropout probability shared by all sub-modules.
    """

    def __init__(self, hidden_size, num_attention_heads, intermediate_size, dropout_rate):
        super().__init__()
        self.attention = BertSelfAttention(hidden_size, num_attention_heads, dropout_rate)
        self.attention_output = BertSelfOutput(hidden_size, dropout_rate)
        self.intermediate = BertIntermediate(hidden_size, intermediate_size)
        self.output = BertOutput(intermediate_size, hidden_size, dropout_rate)

    def forward(self, hidden_states, attention_mask):
        """Run attention and feed-forward, each followed by its residual/norm block."""
        # Attention sub-layer; the layer input is the residual.
        attended = self.attention(hidden_states, attention_mask)
        attended = self.attention_output(attended, hidden_states)
        # Feed-forward sub-layer; the attention result is the residual.
        expanded = self.intermediate(attended)
        return self.output(expanded, attended)

实现 BERT 编码器

BertEncoder 是由多个 BertLayer 层堆叠而成的编码器。它负责处理嵌入向量，并通过一系列的编码层来生成高级特征表示。每个 BertLayer 都接收到来自前一个层的输出，并输出给下一个层。

class BertEncoder(nn.Module):
    """A stack of ``num_hidden_layers`` identical ``BertLayer`` modules applied in order.

    Args:
        hidden_size: Size of the last dimension of the hidden states.
        num_attention_heads: Number of attention heads in every layer.
        intermediate_size: Inner width of every feed-forward sub-layer.
        num_hidden_layers: Number of layers in the stack.
        dropout_rate: Dropout probability used in every layer.
    """

    def __init__(self, hidden_size, num_attention_heads, intermediate_size, num_hidden_layers, dropout_rate):
        super().__init__()
        self.layer = nn.ModuleList()
        for _ in range(num_hidden_layers):
            self.layer.append(
                BertLayer(hidden_size, num_attention_heads, intermediate_size, dropout_rate)
            )

    def forward(self, hidden_states, attention_mask):
        """Feed ``hidden_states`` through every layer and return the last layer's output."""
        current = hidden_states
        for block in self.layer:
            current = block(current, attention_mask)
        return current

实现 BERT 词编码

BertEmbeddings 类负责将输入的单词ID（通常是单词的tokenized和索引化形式）转换为固定大小的向量。它包含了词嵌入、位置嵌入和分割嵌入，这些嵌入会被相加，以提供能够捕捉单词含义、在序列中的位置以及句子界限信息的综合向量表示。

class BertEmbeddings(nn.Module):
    """Sum word, position and token-type embeddings, then layer-norm and dropout.

    Args:
        vocab_size: Number of rows in the word embedding table.
        hidden_size: Embedding width.
        max_position_embeddings: Number of rows in the position embedding
            table; longer sequences cannot be looked up.
        dropout_rate: Dropout probability applied to the summed embeddings.
        type_vocab_size: Number of distinct token-type (segment) ids.
            Defaults to 2, the value that used to be hard-coded.
    """

    def __init__(self, vocab_size, hidden_size, max_position_embeddings, dropout_rate, type_vocab_size=2):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
        self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)

        self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, input_ids, token_type_ids=None):
        """Embed ``input_ids``.

        Args:
            input_ids: Integer tensor with at least two dimensions; axis 1 is
                used as the sequence length.
            token_type_ids: Optional integer tensor of segment ids; all zeros
                when omitted.

        Returns:
            The summed, normalized embeddings with a trailing ``hidden_size`` axis.
        """
        # Positions 0..seq_length-1, repeated to match the shape of input_ids.
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = words_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        return self.dropout(embeddings)

获取全局表示

BertPooler 负责从编码器的输出中提取一个固定大小的向量，通常是处理 CLS 标记所对应的隐藏状态。这个池化后的向量可以用于分类任务，它经过一个线性层和一个激活函数（通常是tanh）处理后，用于下游任务的输入。

class BertPooler(nn.Module):
    """Pool a sequence by passing its first position through a dense layer and tanh.

    Args:
        hidden_size: Size of the last dimension of the input and of the pooled output.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        """Return ``tanh(dense(hidden_states[:, 0]))``."""
        # Index 0 along axis 1 is the first token (CLS).
        cls_state = hidden_states[:, 0]
        return self.activation(self.dense(cls_state))

搭建完整的 BERT 模型

BertModel 是BERT模型的核心类，它整合了BERT的所有组件，并提供了模型的前向传播机制。当你调用这个类的实例时，它会处理输入数据，通过嵌入层、编码器层，并提供序列输出和池化输出。

class BertModel(nn.Module):
    """Embeddings, encoder stack and pooler wired together.

    Args:
        vocab_size: Size of the word embedding table.
        hidden_size: Width of the hidden states.
        num_attention_heads: Number of attention heads per encoder layer.
        intermediate_size: Inner width of the feed-forward sub-layers.
        num_hidden_layers: Number of encoder layers.
        max_position_embeddings: Size of the position embedding table.
        dropout_rate: Dropout probability used throughout the model.
    """

    def __init__(self, vocab_size, hidden_size, num_attention_heads, intermediate_size, num_hidden_layers, max_position_embeddings, dropout_rate):
        super().__init__()
        self.embeddings = BertEmbeddings(vocab_size, hidden_size, max_position_embeddings, dropout_rate)
        self.encoder = BertEncoder(hidden_size, num_attention_heads, intermediate_size, num_hidden_layers, dropout_rate)
        self.pooler = BertPooler(hidden_size)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Encode a batch of token ids.

        Args:
            input_ids: Integer tensor of token ids.
            attention_mask: Optional tensor shaped like ``input_ids`` with 1
                for positions to attend to and 0 for positions to ignore.
                Defaults to all ones.
            token_type_ids: Optional tensor of segment ids shaped like
                ``input_ids``. Defaults to all zeros.

        Returns:
            A ``(sequence_output, pooled_output)`` tuple: the encoder output
            for every position, and the pooler's output computed from the
            first position.
        """
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        # Embedding layer
        embedding_output = self.embeddings(input_ids, token_type_ids)

        # Turn the 0/1 mask into an additive mask with two singleton axes
        # inserted after the batch axis so it broadcasts over heads and query
        # positions: 0 where attention is allowed, -10000 where it is not.
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(dtype=embedding_output.dtype)
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        # The encoder returns the final hidden-state tensor itself, not a
        # list of per-layer outputs, so it must not be indexed with [-1]
        # (that would select the last sample of the batch).
        sequence_output = self.encoder(embedding_output, extended_attention_mask)

        # Pooling layer
        pooled_output = self.pooler(sequence_output)

        return sequence_output, pooled_output

测试 BERT

在这个例子中，input_ids 是一个整数张量，表示输入词的 ID。attention_mask 用于指示哪些令牌是实际的输入，哪些是填充的。token_type_ids 用于区分两个句子，这在句子对任务中很有用。

sequence_output 是模型的最后一层隐藏状态，它可以用于诸如序列标注或问答任务。pooled_output 是一个经过池化的表示，它通常用于分类任务。

在实际应用中，批次大小和序列长度可以根据需要进行调整，而hidden_size通常由预训练模型的配置决定。在BERT Base模型中，hidden_size通常是768，而在BERT Large模型中，hidden_size是1024。

# Example configuration values
vocab_size = 30522  # vocabulary size
hidden_size = 768  # hidden-state width
num_attention_heads = 12  # number of attention heads
intermediate_size = 3072  # feed-forward inner width
num_hidden_layers = 12  # number of encoder layers
max_position_embeddings = 512  # maximum sequence length
dropout_rate = 0.1  # dropout probability

# Build the BERT model
model = BertModel(vocab_size, hidden_size, num_attention_heads, intermediate_size, num_hidden_layers, max_position_embeddings, dropout_rate)

# Switch to evaluation mode so dropout is disabled and the outputs are deterministic
model.eval()

# Example inputs
input_ids = torch.tensor([[101, 2003, 2023, 1037, 2742, 102, 0, 0, 0]])  # token ids, normally produced by a tokenizer
attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0]])  # 1 = real token, 0 = padding
# NOTE(review): these segment ids are illustrative only; presumably a real
# tokenizer would emit all zeros for a single-sentence input -- confirm.
token_type_ids = torch.tensor([[0, 0, 0, 0, 0, 1, 1, 1, 1]])  # segment ids

# Forward pass; no gradients are needed for inference
with torch.no_grad():
    sequence_output, pooled_output = model(input_ids, attention_mask, token_type_ids)
# sequence_output.shape: [batch_size, sequence_length, hidden_size]
# pooled_output.shape: [batch_size, hidden_size]

直接使用 BERT

在实际使用时，直接使用 Hugging Face 的 transformers 库中的 BERT 就好了，上面的复现只是帮助理解 BERT。

from transformers import BertModel, BertTokenizer  
  
# Load the pretrained tokenizer and model (any other checkpoint name works too)
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Put the model in evaluation mode
model.eval()

# Text to encode
text = "Here is some text to encode"

# Tokenize into tensors; add_special_tokens=True adds the [CLS] and [SEP]
# tokens at the start and end of the sentence
inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True)
input_ids, attention_mask = inputs["input_ids"], inputs["attention_mask"]

# Run the model without tracking gradients
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# last_hidden_state holds the final-layer hidden state of every input token
last_hidden_states = outputs.last_hidden_state

如果想要对 BERT 进行微调以适应特定的下游任务（例如分类），通常需要添加一个适合任务的头部到 BERT 模型，并在数据集上训练整个模型或者分类头

# A two-class classification head on top of a BERT encoder
class BertForBinaryClassification(nn.Module):
    """Wrap ``bert_model`` and map its pooled output to two logits.

    Args:
        bert_model: Callable encoder that exposes ``config.hidden_size`` and
            whose output has a ``pooler_output`` attribute.
    """

    def __init__(self, bert_model):
        super().__init__()
        feature_dim = bert_model.config.hidden_size
        self.bert = bert_model
        self.classifier = nn.Linear(feature_dim, 2)

    def forward(self, input_ids, attention_mask):
        """Return the two-class logits for the given batch."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # pooler_output is the encoder's pooled sequence representation.
        return self.classifier(encoded.pooler_output)

要训练BERT模型，你需要经历一系列复杂的步骤，包括准备数据集、定义模型、设置训练循环等。以下是一个简化版的例子，展示如何在PyTorch中使用Hugging Face的transformers库对BERT进行微调，假设我们的任务是文本分类。

请注意，这个例子是基于已经预训练好的BERT模型进行微调的。从头开始训练一个BERT模型需要大量的数据和计算资源，通常超出了个人或小型研究团队的能力范围。

1. 安装和导入库

确保你安装了 transformers、torch、datasets 和 scikit-learn 库（下面的代码会用到它们）。

pip install transformers torch datasets scikit-learn

在Python脚本中导入必要的模块：

import numpy as np
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# AdamW comes from PyTorch: the copy in `transformers` is deprecated and no
# longer exported by recent releases. Note that torch's AdamW defaults to
# weight_decay=0.01.
from torch.optim import AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import get_linear_schedule_with_warmup

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

2. 准备数据集

加载并预处理数据集。这里我们使用 datasets 库加载数据集，它可以与 transformers 库无缝协作。

# Load the dataset (the GLUE MRPC task is used as the example)
dataset = load_dataset("glue", "mrpc")
train_dataset = dataset["train"]
val_dataset = dataset["validation"]

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize each sentence pair, truncating and padding to the maximum length
def encode(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length')

train_dataset = train_dataset.map(encode, batched=True)
val_dataset = val_dataset.map(encode, batched=True)

# Expose the tensors the model consumes. token_type_ids is included because
# the inputs are sentence pairs and the segment ids tell BERT which tokens
# belong to which sentence.
tensor_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']
train_dataset = train_dataset.rename_column("label", "labels")
train_dataset.set_format(type='torch', columns=tensor_columns)
val_dataset = val_dataset.rename_column("label", "labels")
val_dataset.set_format(type='torch', columns=tensor_columns)

# Build the data loaders: shuffled for training, in order for validation
batch_size = 32
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

3. 定义BERT模型和优化器

加载预训练的BERT模型并准备优化器。

# Load the pretrained model with a two-class classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

# Number of passes over the training set. It has to be defined here because
# total_steps below depends on it.
epochs = 3

# Optimizer plus a linear learning-rate decay over all training steps (no warmup)
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

4. 训练循环

设置训练循环，训练和评估模型。

epochs = 3

for epoch in range(epochs):
    # ---- Training ----
    model.train()

    # Running sum of the per-batch training losses
    total_train_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Segment ids are passed only when the dataset exposes them; None
        # lets the model fall back to its own default.
        token_type_ids = batch.get('token_type_ids')
        if token_type_ids is not None:
            token_type_ids = token_type_ids.to(device)

        # Clear gradients left over from the previous step
        model.zero_grad()

        # Forward pass; the model computes the loss because labels are given
        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)

        loss = outputs.loss
        total_train_loss += loss.item()

        # Backward pass
        loss.backward()

        # Clip the gradient norm to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Update the parameters, then the learning rate
        optimizer.step()
        scheduler.step()

    # Average training loss for this epoch
    avg_train_loss = total_train_loss / len(train_loader)

    print(f"Epoch {epoch + 1} / {epochs}")
    print(f"Average training loss: {avg_train_loss:.2f}")

    # ---- Validation ----
    model.eval()

    total_eval_loss = 0
    # Count correct predictions and examples across the whole set: averaging
    # per-batch accuracies would over-weight a smaller final batch.
    total_correct = 0
    total_examples = 0

    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        token_type_ids = batch.get('token_type_ids')
        if token_type_ids is not None:
            token_type_ids = token_type_ids.to(device)

        # No gradients are needed for evaluation
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)

        total_eval_loss += outputs.loss.item()

        # Predicted class = index of the largest logit
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_correct += (predictions == labels).sum().item()
        total_examples += labels.size(0)

    # Example-weighted accuracy and batch-averaged loss
    avg_val_accuracy = total_correct / total_examples
    avg_val_loss = total_eval_loss / len(val_loader)

    print(f"Validation Loss: {avg_val_loss:.2f}")
    print(f"Validation Accuracy: {avg_val_accuracy:.2f}")

在上面的代码中，我们在每个epoch结束时都会计算并打印训练损失和验证损失，以及验证集上的准确率。这有助于我们了解模型训练的进展，并且可以根据这些信息对训练过程进行调整。

5. 保存模型

训练完成后，你可能想要保存模型，以便将来可以重新加载它而无需重新训练。

# Save the model first, then the tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained('path_to_save_model')