I am trying to fine-tune CamemBERT (the French version of RoBERTa) for question answering.
First, I use the CamemBERT model to produce contextual embeddings for the question and text inputs, then an output linear layer maps each token's embedding to start and end logits, corresponding to the beginning and end of the answer span.
The official results in the paper report a question-answering performance of (88%, 77%) as (F1 score, EM), but the results I get are (71%, 46%).
My question is: why are my results not closer to the official ones?
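For reference, FQuAD is scored with the SQuAD-style metrics: EM is the fraction of predictions that exactly match a gold answer, and F1 is the harmonic mean of token-level precision and recall between prediction and gold answer. A simplified sketch of both (the official evaluation script additionally normalizes case, punctuation, and articles):

from collections import Counter

def exact_match(prediction, ground_truth):
    # 1.0 when the answers are string-identical after trivial normalization
    return float(prediction.strip().lower() == ground_truth.strip().lower())

def f1_score(prediction, ground_truth):
    pred_tokens = prediction.lower().split()
    gold_tokens = ground_truth.lower().split()
    common = Counter(pred_tokens) & Counter(gold_tokens)  # per-token min counts
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)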
Here is the part of the script I am using to train and evaluate the model on the FQuAD dataset, with the same hyperparameters as the official model.
MAX_SEQ_LENGTH = 384
TRAIN_BATCH_SIZE = 12
n_epochs = 3
learning_rate = 3e-5
EVAL_BATCH_SIZE = 12
dropout = 0
BERT_TYPE = "fmikaelian/camembert-base-fquad"
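train_dataloader and eval_dataloader are built outside this excerpt; each batch they yield is a dict with keys 'ids' (token ids padded to MAX_SEQ_LENGTH), 'start_pos', and 'end_pos' (answer-span token indices). A minimal sketch of a Dataset with that interface (FQuADFeatures and its field names are illustrative, not my actual code):

import torch
from torch.utils.data import Dataset

class FQuADFeatures(Dataset):
    """Pre-tokenized examples: token ids plus answer-span token indices."""
    def __init__(self, input_ids, start_positions, end_positions):
        self.input_ids = input_ids          # each entry padded to MAX_SEQ_LENGTH
        self.start_positions = start_positions
        self.end_positions = end_positions

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i):
        return {'ids': torch.tensor(self.input_ids[i]),
                'start_pos': torch.tensor(self.start_positions[i]),
                'end_pos': torch.tensor(self.end_positions[i])}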
import numpy as np
import torch
import torch.nn as nn
from tqdm import trange, tqdm_notebook
from transformers import AutoModel

class CamemBERTQA(nn.Module):
    def __init__(self, bert_type, hidden_size, num_labels):
        super(CamemBERTQA, self).__init__()
        self.bert_type = bert_type
        self.hidden_size = hidden_size
        self.num_labels = num_labels
        self.camembert = AutoModel.from_pretrained(self.bert_type)
        # Linear head mapping each token's hidden state to start/end logits
        self.qa_outputs = nn.Linear(self.hidden_size, self.num_labels)

    def forward(self, input_ids):
        # [0] is the last hidden state: (batch, seq_len, hidden_size)
        output = self.camembert(input_ids=input_ids)[0]
        logits = self.qa_outputs(output)            # (batch, seq_len, 2)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)     # (batch, seq_len)
        end_logits = end_logits.squeeze(-1)         # (batch, seq_len)
        outputs = (start_logits, end_logits,)
        return outputs
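The model, optimizer, and scheduler that train_eval_model uses are also set up outside this excerpt. A plausible setup consistent with the hyperparameters above (hidden_size=768 for camembert-base, num_labels=2 for start/end, and a linear schedule without warmup are my assumptions here):

import torch
from transformers import get_linear_schedule_with_warmup

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CamemBERTQA(BERT_TYPE, hidden_size=768, num_labels=2).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # assumption: no warmup
    num_training_steps=len(train_dataloader) * n_epochs)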
def train_eval_model(model, n_epochs, scheduler=None):
    train_losses = []
    valid_losses = []
    avg_train_losses = []
    avg_valid_losses = []
    res = []

    for epoch in trange(n_epochs):
        ################################### train the model ###################################
        model.train()
        for batch, d in enumerate(tqdm_notebook(train_dataloader, desc="Iteration")):
            ids = d['ids'].to(device, dtype=torch.long)
            start_pos = d['start_pos'].to(device, dtype=torch.long)
            end_pos = d['end_pos'].to(device, dtype=torch.long)

            optimizer.zero_grad()
            # Forward pass returns (start_logits, end_logits)
            start_and_end_scores = model(ids)
            loss = loss_func(start_and_end_scores, start_pos, end_pos)
            loss.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step()
            train_losses.append(loss.item())

        ################################### validate the model ###################################
        model.eval()
        pred_s = None
        pred_e = None
        for batch, d in enumerate(eval_dataloader):
            ids = d['ids'].to(device, dtype=torch.long)
            start_pos = d['start_pos'].to(device, dtype=torch.long)
            end_pos = d['end_pos'].to(device, dtype=torch.long)

            with torch.no_grad():
                start_and_end_scores = model(ids)
                loss = loss_func(start_and_end_scores, start_pos, end_pos)
                valid_losses.append(loss.item())

            # Accumulate logits over the whole validation set
            if pred_s is None:
                pred_s = start_and_end_scores[0].detach().cpu().numpy()
                pred_e = start_and_end_scores[1].detach().cpu().numpy()
            else:
                pred_s = np.append(pred_s, start_and_end_scores[0].detach().cpu().numpy(), axis=0)
                pred_e = np.append(pred_e, start_and_end_scores[1].detach().cpu().numpy(), axis=0)

        # Predicted answer span = argmax over the sequence dimension
        pred_start = np.argmax(pred_s, axis=1)
        pred_end = np.argmax(pred_e, axis=1)
        res.append([pred_start, pred_end])

        train_loss = np.average(train_losses)
        valid_loss = np.average(valid_losses)
        avg_train_losses.append(train_loss)
        avg_valid_losses.append(valid_loss)

        epoch_len = len(str(n_epochs))
        print_msg = (f'[{epoch:>{epoch_len}}/{n_epochs:>{epoch_len}}] ' +
                     f'train_loss: {train_loss:.5f} ' +
                     f'valid_loss: {valid_loss:.5f}')
        print(print_msg)

        # Reset per-epoch loss accumulators
        train_losses = []
        valid_losses = []

    return model, avg_train_losses, avg_valid_losses, res
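loss_func is likewise defined elsewhere in my script; I assume the standard span-extraction loss used by BertForQuestionAnswering, i.e. the mean of the cross-entropy over the start and end logits. A sketch under that assumption:

import torch.nn as nn

def loss_func(start_and_end_scores, start_pos, end_pos):
    start_logits, end_logits = start_and_end_scores
    # Clamp targets that fall outside the model's sequence length
    ignored_index = start_logits.size(1)
    start_pos = start_pos.clamp(0, ignored_index)
    end_pos = end_pos.clamp(0, ignored_index)
    ce = nn.CrossEntropyLoss(ignore_index=ignored_index)
    # Average the start- and end-position losses, as in BertForQuestionAnswering
    return (ce(start_logits, start_pos) + ce(end_logits, end_pos)) / 2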