pytorch - How to set up TPU parallelism/FSDP with HuggingFace Transformers

lx0bsm1f · posted 2023-11-19 in Other

I have been trying to fine-tune a model with TPU parallelism/FSDP in a Kaggle TPU notebook. I need FSDP because the model I'm using is very large (OpenLM's open_llama_3b_v2); when I try to fine-tune it, I quickly run out of memory on the TPU.
Here is my code:
Link to Colab Notebook

!pip install sentencepiece
!pip install -U accelerate
!pip install -U transformers
!pip install cloud-tpu-client
!pip install torch-xla
!pip install pyarrow

import torch
import torch_xla
import torch_xla.core.xla_model as xm
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import pandas as pd
import accelerate
from accelerate import Accelerator, DistributedType
import numpy as np
import pyarrow
import os
import logging
from transformers import logging as hf_logging

# Set the environment variables for TPU and the number of processes
os.environ["ACCELERATE_DEVICE_PLACEMENT"] = "TPU"
os.environ["ACCELERATE_NUM_PROCESSES"] = "8"

# Set the device to TPU
device = xm.xla_device()

MODEL_PATH = "openlm-research/open_llama_3b_v2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH).to(device)

# Set configurable variables
output_dir = "/kaggle/working/"  # Directory to save the fine-tuned model
num_train_epochs = 25  # Number of training epochs
per_device_train_batch_size = 32  # Batch size for training
per_device_eval_batch_size = 32  # Batch size for evaluation
warmup_ratio = 0.1  # Ratio of total training steps for warmup steps
logging_steps = 50  # Log every 50 training steps
save_ratio = 0.5  # Ratio of total training steps for save steps
evaluation_strategy = "epoch"  # Evaluation strategy (e.g., "steps", "epoch", "no")
weight_decay = 0.01  # Weight decay for the optimizer
max_len = 650

# Load data from parquet file
data = pd.read_parquet("/kaggle/input/train-input/ready_to_train.parquet")

data = data.dropna(subset=["combined"])
data = data.drop_duplicates(subset=["combined"])

# Split data into train and test sets
train_data = data.sample(frac=0.8, random_state=1)
test_data = data.drop(train_data.index)

tokenizer.pad_token = tokenizer.eos_token

# Tokenize train and test data
train_encodings = tokenizer(train_data["combined"].tolist(), truncation=True, padding=True, max_length=max_len)
test_encodings = tokenizer(test_data["combined"].tolist(), truncation=True, padding=True, max_length=max_len)

# Tokenize labels
train_labels_encodings = tokenizer(train_data["answer"].tolist(), truncation=True, padding=True, max_length=max_len)
test_labels_encodings = tokenizer(test_data["answer"].tolist(), truncation=True, padding=True, max_length=max_len)

# Create dataset class
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, label_encodings):
        self.encodings = encodings
        self.label_encodings = label_encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        labels = torch.full_like(item["input_ids"], -100)  # Fill the tensor with -100 (ignore index)
        labels[0] = self.label_encodings["input_ids"][idx][0]  # Set the first position to the correct label
        item["labels"] = labels
        return item

    def __len__(self):
        return len(self.label_encodings["input_ids"])

# Create train and test datasets
train_dataset = Dataset(train_encodings, train_labels_encodings)
test_dataset = Dataset(test_encodings, test_labels_encodings)

# Initialize the accelerator with the desired configuration
accelerator = Accelerator()

# Calculate total training steps
total_train_steps = len(train_dataset) // per_device_train_batch_size * num_train_epochs

# Calculate warmup_steps, logging_steps, save_steps, and eval_steps based on epoch
warmup_steps = int(total_train_steps * warmup_ratio)
eval_steps = len(train_dataset) // per_device_train_batch_size

# Set training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    logging_dir=output_dir,
    logging_steps=logging_steps,
    evaluation_strategy=evaluation_strategy,
    eval_steps=eval_steps,
    load_best_model_at_end=True,
    save_strategy="epoch",
    report_to="none",  # Set report_to to "none"
)

# Set the log level to logging.INFO
hf_logging.set_verbosity(logging.INFO)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model(output_dir)

I have already tried a number of things, such as lowering the batch size (close, but not enough: it would probably run with a batch size of 1, but 2 is already too much, and I would rather use parallelism anyway).
I found this link in the HuggingFace Transformers documentation, but I don't understand how it works.
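From what I can tell, that page boils down to passing `fsdp` and `fsdp_config` to `TrainingArguments` instead of setting environment variables. Something roughly like this (untested sketch; the accepted keys seem to depend on the transformers/torch_xla versions):

# Untested sketch: enable the torch_xla FSDP wrapper through TrainingArguments.
# The exact fsdp_config keys depend on the installed transformers version.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="/kaggle/working/",
    num_train_epochs=25,
    per_device_train_batch_size=1,   # start small; FSDP shards parameters, not activations
    fsdp="full_shard",               # shard parameters, gradients and optimizer state
    fsdp_config={
        "xla": True,                 # use torch_xla's FSDP instead of native PyTorch FSDP
        "xla_fsdp_grad_ckpt": True,  # gradient checkpointing for extra memory savings
        # "xla_fsdp_v2": True,       # SPMD-based FSDPv2, only in recent transformers/torch_xla
    },
    report_to="none",
)

# This also seems to require launching the training on all 8 TPU cores
# (e.g. with torch_xla's xmp.spawn or `accelerate launch`) rather than
# running everything in a single notebook process.

But I haven't been able to get this working on the Kaggle TPU.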


ef1yzkbh 1#

Here is my working notebook: https://www.kaggle.com/code/defdet/llama2-ru-casual-xla. It uses SPMD, which works better than fully sharded data parallelism (FSDP).
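For reference, the core of the SPMD setup in torch_xla looks roughly like this (minimal sketch, assuming torch_xla >= 2.1; API names may differ between releases, and the notebook above is the actual working example):

# Minimal SPMD sketch (assumes torch_xla >= 2.1).
import numpy as np
import torch
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr
import torch_xla.distributed.spmd as xs

xr.use_spmd()  # switch the runtime to SPMD mode: one process drives all TPU cores

num_devices = xr.global_runtime_device_count()  # 8 on a Kaggle TPU v3-8
device_ids = np.arange(num_devices)

# 1D device mesh; weights get sharded along the "fsdp" axis.
mesh = xs.Mesh(device_ids, mesh_shape=(num_devices, 1), axis_names=("fsdp", "model"))

# Example: shard a large weight matrix row-wise across all devices.
weight = torch.randn(4096, 4096, device=xm.xla_device())
xs.mark_sharding(weight, mesh, ("fsdp", "model"))

Transformers' newer FSDPv2 integration (the `xla_fsdp_v2` option) is built on this kind of SPMD sharding, so the Trainer can do the per-layer sharding for you.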
