PyTorch: poor ROUGE metrics for a pretrained T5 model fine-tuned on the CNN/DailyMail dataset

Asked by shyt4zoc on 2023-05-07

A bounty on this question expires tomorrow; answering it is worth +100 reputation. Robur_131 wants to draw more attention to this question: demonstrate the performance improvement produced by the code changes.

I am trying to fine-tune a pretrained T5 model on the CNN/DailyMail dataset with the following code:

import torch
import torch.nn as nn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

from datasets import load_dataset
from transformers import DefaultDataCollator
from transformers import TrainingArguments, Trainer
from transformers import T5Tokenizer, T5ForConditionalGeneration

import os
import evaluate

tokenizer = T5Tokenizer.from_pretrained("t5-small")
rouge = evaluate.load('rouge')

def process_data_to_model_inputs(batch):
    encoder_max_length = 512
    decoder_max_length = 128

    # tokenize the inputs and labels
    inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=encoder_max_length)
    outputs = tokenizer(batch["highlights"], padding="max_length", truncation=True, max_length=decoder_max_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["decoder_input_ids"] = outputs.input_ids
    batch["decoder_attention_mask"] = outputs.attention_mask
    batch["labels"] = outputs.input_ids.copy()

    batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

    return batch

def setup_distributed_environment():
    dist.init_process_group(backend='nccl')

    torch.manual_seed(42)

def generate_summary(batch, model): 
  inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=512, return_tensors="pt") 
  inputs = inputs.to(model.device) # Ensure that tensors are on the same device as the model 
  summary_ids = model.generate(inputs.input_ids, num_beams=4, max_length=128, early_stopping=True) 
  batch["predicted_highlights"] = tokenizer.batch_decode(summary_ids, skip_special_tokens=True) 
  return batch

def train():
    setup_distributed_environment()
    
    cnndm = load_dataset("cnn_dailymail", "3.0.0")

    tokenized_cnndm = cnndm.map(
        process_data_to_model_inputs, 
        batched=True, 
        remove_columns=cnndm["train"].column_names
    )
    
    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    local_rank = int(os.environ["LOCAL_RANK"])
    global_rank = int(os.environ["RANK"])
    model = nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])

    training_args = TrainingArguments(
        output_dir="./updated_squad_fine_tuned_model",
        evaluation_strategy="epoch",
        learning_rate=5.6e-05,
        lr_scheduler_type="linear",
        warmup_ratio=0.1,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        local_rank=local_rank,
        fp16=True,
        remove_unused_columns=False
    )

    data_collator = DefaultDataCollator()

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_cnndm["train"].select(range(50000)),
        eval_dataset=tokenized_cnndm["validation"].select(range(10000)),
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()

    if local_rank == 0:
        model.module.save_pretrained("fine_tuned_squad_model")
        tokenizer.save_pretrained("fine_tuned_squad_model")

    results = cnndm["test"].select(range(5000)).map(lambda batch: generate_summary(batch, model.module), batched=True, remove_columns=["article"], batch_size=16)

    # Compute the metric using the generated summaries and the reference summaries
    rouge_score = rouge.compute(predictions=results["predicted_highlights"], references=results["highlights"])

    print(rouge_score)

def main():
    torch.cuda.empty_cache()
    
    train()

if __name__ == '__main__':
    main()

Instead of running on the whole dataset, I used the first 50k training examples and 10k validation examples. After training, I used the first 10k test examples to compute the ROUGE metric.
I am using the t5-small variant from the Hugging Face transformers library. I am also using a distributed setup, running the program across 4 nodes with the following command:

torchrun --nproc_per_node=gpu --nnodes=4 --node_rank=0 --rdzv_id=456 --rdzv_backend=c10d --rdzv_endpoint=129.82.44.119:30375 cnn_hf_test.py

After training, I get the following output:

{'loss': 2.1367, 'learning_rate': 4.258706467661692e-05, 'epoch': 0.64}                                                                                                                                            
{'eval_runtime': 8.023, 'eval_samples_per_second': 1246.419, 'eval_steps_per_second': 19.569, 'epoch': 1.0}                                                                                                        
{'loss': 0.0305, 'learning_rate': 2.2686567164179102e-05, 'epoch': 1.28}                                                                                                                                           
{'loss': 0.0172, 'learning_rate': 2.7860696517412936e-06, 'epoch': 1.92}                                                                                                                                           
{'eval_runtime': 8.0265, 'eval_samples_per_second': 1245.871, 'eval_steps_per_second': 19.56, 'epoch': 2.0}                                                                                                        
{'train_runtime': 5110.103, 'train_samples_per_second': 19.569, 'train_steps_per_second': 0.306, 'train_loss': 0.6989707885800726, 'epoch': 2.0}                                                                   
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1564/1564 [1:25:08<00:00,  3.27s/it]
{'rouge1': 0.008768024824095142, 'rouge2': 0.000294696538416436, 'rougeL': 0.008527464153847374, 'rougeLsum': 0.00875863140146953}                                                                                 
WARNING:torch.distributed.elastic.rendezvous.dynamic_rendezvous:The node 'jupiter.cs.colostate.edu_805773_0' has failed to send a keep-alive heartbeat to the rendezvous '456' due to an error of type RendezvousTimeoutError.

From my understanding, these ROUGE scores are very poor; ROUGE-1 should be at least above 0.2, yet I am getting 0.008.
My cluster setup does not allow me to load larger models such as t5-base or t5-large.
Can you give me some suggestions for improving the ROUGE scores? Or is this performance expected for this setup and model? Any insight is much appreciated.

rsaldnfx (answer 1)

To better answer your question, here is my initial understanding: given your limited compute (the boundary condition), you want to improve the ROUGE metric, and with it model performance, without scaling T5 up to base or large. Under that constraint it will be hard to push ROUGE-1 above 0.2.
The suggested changes are:
1. Lower the learning rate to 3e-5 to improve convergence, and raise the number of training epochs to 4 to allow more training.
2. Use gradient accumulation with 4 steps to effectively increase the batch size while keeping memory requirements unchanged (see the quick arithmetic after this list).
3. Increase the training set to 100,000 examples and the validation set to 15,000 examples for better training and evaluation.
4. Update the ROUGE computation to use the rouge_score package, which aggregates the results more reliably.
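As a rough illustration of suggestion 2, the effective batch size works out as sketched below (the total of 4 worker processes is an assumption for illustration only; with torchrun it equals the number of nodes times the GPUs per node):

# Sketch of the effective batch size under the suggested settings.
# NOTE: num_processes = 4 is an assumption for illustration only.
per_device_train_batch_size = 8
gradient_accumulation_steps = 4
num_processes = 4
effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_processes
print(effective_batch_size)  # 128 samples per optimizer step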
I hope this helps!
With that in mind, I wrote the code below and ran it in Colab (the GPU there is a T4, so it may not work without a Pro+ subscription): https://colab.research.google.com/drive/1J7hKzX8VSy5MVcbXEGFwVyjJnQvHjPYa#scrollTo=_kOaaGoh3G04. It improves the ROUGE performance:

import torch
import torch.nn as nn
import torch.distributed as dist

from datasets import load_dataset
from transformers import DefaultDataCollator
from transformers import TrainingArguments, Trainer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from rouge_score import rouge_scorer, scoring

import os

tokenizer = T5Tokenizer.from_pretrained("t5-small")
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def process_data_to_model_inputs(batch):
    encoder_max_length = 512
    decoder_max_length = 128

    # tokenize the inputs and labels
    inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=encoder_max_length)
    outputs = tokenizer(batch["highlights"], padding="max_length", truncation=True, max_length=decoder_max_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["decoder_input_ids"] = outputs.input_ids
    batch["decoder_attention_mask"] = outputs.attention_mask
    batch["labels"] = outputs.input_ids.copy()

    batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

    return batch

def setup_distributed_environment():
    dist.init_process_group(backend='nccl')

    torch.manual_seed(42)

def generate_summary(batch, model):
    inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    inputs = inputs.to(model.device)
    summary_ids = model.generate(inputs.input_ids, num_beams=4, max_length=128, early_stopping=True)
    batch["predicted_highlights"] = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    return batch

def train():
    setup_distributed_environment()
    
    cnndm = load_dataset("cnn_dailymail", "3.0.0")

    tokenized_cnndm = cnndm.map(
        process_data_to_model_inputs, 
        batched=True, 
        remove_columns=cnndm["train"].column_names
    )
    
    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    local_rank = int(os.environ["LOCAL_RANK"])
    global_rank = int(os.environ["RANK"])
    model = nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])

    # Suggestion 1: Adjust learning rate and increase the number of training epochs
    # Lower learning rate (3e-5) to improve convergence, and increase number of epochs (4) to allow more training
    training_args = TrainingArguments(
        output_dir="./updated_squad_fine_tuned_model",
        evaluation_strategy="epoch",
        learning_rate=3e-05,
        lr_scheduler_type="linear",
        warmup_ratio=0.1,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=4,
        weight_decay=0.01,
        local_rank=local_rank,
        fp16=True,
        remove_unused_columns=False,
        gradient_accumulation_steps=4  # Suggestion 2: Use gradient accumulation to effectively increase batch size
    )

    data_collator = DefaultDataCollator()

    # Build the Trainer for fine-tuning
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_cnndm["train"].select(range(100000)),  # Increased train examples to 100000
        eval_dataset=tokenized_cnndm["validation"].select(range(15000)),  # Increased validation examples to 15000
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()

    if local_rank == 0:
        model.module.save_pretrained("fine_tuned_squad_model")
        tokenizer.save_pretrained("fine_tuned_squad_model")

    results = cnndm["test"].select(range(10000)).map(lambda batch: generate_summary(batch, model.module), batched=True, remove_columns=["article"], batch_size=16)

    # Compute the metric using the generated summaries and the reference summaries
    aggregator = scoring.BootstrapAggregator()
    for ref, pred in zip(results["highlights"], results["predicted_highlights"]):
        scores = rouge.score(ref, pred)
        aggregator.add_scores(scores)

    rouge_score = aggregator.aggregate()
    print(rouge_score)

def main():
    torch.cuda.empty_cache()
    
    train()

if __name__ == '__main__':
    main()
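For reference, BootstrapAggregator.aggregate() returns a dict that maps each ROUGE type to an AggregateScore(low, mid, high) namedtuple, where each element is a Score(precision, recall, fmeasure), so printing it directly is hard to read. Here is a minimal sketch for pulling out the mid (point) estimates, assuming the aggregator object from the code above:

# Unpack the bootstrap-aggregated ROUGE scores into plain numbers.
aggregated = aggregator.aggregate()
for rouge_type, aggregate_score in aggregated.items():
    mid = aggregate_score.mid  # bootstrap point estimate
    print(f"{rouge_type}: precision={mid.precision:.4f}, "
          f"recall={mid.recall:.4f}, fmeasure={mid.fmeasure:.4f}")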
