pandas: AttributeError when loading a CustomDataset into the HuggingFace Trainer

Asked by o7jaxewo on 2023-11-15

I am following this guide and trying to load a pandas DataFrame into a PyTorch Dataset so that I can use the Trainer API: https://huggingface.co/docs/transformers/training#train-with-pytorch-trainer. My script looks like this:

from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

modelName='bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(modelName)
model = BertForSequenceClassification.from_pretrained(modelName, num_labels=6)

def tokenizeFunction(sentence):
    return tokenizer(sentence, padding="max_length", truncation=True)

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.data = dataframe
        self.tokenizer=tokenizer
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.loc[idx, "text"]  # 'text' column
        emotions = self.data.loc[idx, "emotions"]  # 'emotions' column
        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            'input_ids': inputs["input_ids"].squeeze(),
            'attention_mask': inputs["attention_mask"].squeeze(),
            'labels': int(emotions),
        }

# Wrapping the training and test DataFrames ('text' and 'emotions' columns, created earlier) in the custom Dataset
trainDataset=CustomDataset(trainDF, tokenizer)
testDataset=CustomDataset(testDF, tokenizer)
trainLoader=DataLoader(trainDataset, batch_size=16, shuffle=True)
testLoader=DataLoader(testDataset, batch_size=16, shuffle=True)

trainingArgs= TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    evaluation_strategy="steps",
    save_steps=500,
    eval_steps=500,
)

trainer = Trainer(
    model=model,
    args=trainingArgs,
    train_dataset=trainLoader,
    data_collator=lambda data: {"input_ids": data[0], "attention_mask": data[1], "labels": data[2]},
    eval_dataset=testLoader,
)

trainer.train()

With this code I keep getting "AttributeError: 'dict' object has no attribute 'size'". How can I fix this? Thanks.

Answer 1 (7uhlpewt):

I fixed your code. The dataset from load_dataset is not a pandas DataFrame; it is backed by a PyArrow table, so the column names differ (the label column is called label, not emotions) and there is no .loc method; you index it like a dict of columns. Also, Trainer takes the Dataset itself as its train_dataset/eval_dataset arguments, not a DataLoader; it builds its own loaders internally. The AttributeError itself comes from your lambda data_collator: data is a list of example dicts, so data[0] is a dict rather than a tensor, and the model fails when it calls .size() on it. With these changes the model starts training.
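A quick way to see the difference (a minimal sketch; assumes the yelp_review_full dataset from the linked tutorial):

from datasets import load_dataset

dataset = load_dataset("yelp_review_full")
print(dataset["train"].column_names)  # ['label', 'text'] -- no 'emotions' column
print(dataset["train"][0])            # a plain dict: {'label': ..., 'text': ...}
# dataset["train"].loc[0, "text"]     # AttributeError: datasets have no pandas-style .loc

Here is the fixed script: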

from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

dataset = load_dataset("yelp_review_full")

modelName='bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(modelName)
model = BertForSequenceClassification.from_pretrained(modelName, num_labels=6)  # note: yelp_review_full has 5 classes (labels 0-4)

trainDF = dataset["train"]
testDF = dataset["test"]

def tokenizeFunction(sentence):
    return tokenizer(sentence, padding="max_length", truncation=True)

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.data = dataframe
        self.tokenizer=tokenizer
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):

        text = self.data["text"][idx]
        emotions = self.data["label"][idx]
        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            'input_ids': inputs["input_ids"].squeeze(),
            'attention_mask': inputs["attention_mask"].squeeze(),
            'labels': int(emotions),
        }

# Wrapping the training and test splits in the custom Dataset
trainDataset=CustomDataset(trainDF, tokenizer)
testDataset=CustomDataset(testDF, tokenizer)
# These DataLoaders are unused: Trainer builds its own loaders internally
trainLoader=DataLoader(trainDataset, batch_size=16, shuffle=True)
testLoader=DataLoader(testDataset, batch_size=16, shuffle=True)

trainingArgs= TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    evaluation_strategy="steps",
    save_steps=500,
    eval_steps=500,
)

trainer = Trainer(
    model=model,
    args=trainingArgs,
    train_dataset=trainDataset,
    # data_collator=lambda data: {"input_ids": data[0], "attention_mask": data[1], "labels": data[2]},
    eval_dataset=testDataset,
)

trainer.train()

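For reference, the linked tutorial skips the custom Dataset class entirely and tokenizes with datasets.map. A minimal sketch of that variant, reusing the tokenizeFunction, model, and trainingArgs defined above (those names are assumptions carried over from this answer):

# Tokenize whole batches of the raw dataset; tokenizer accepts a list of strings
tokenizedDatasets = dataset.map(lambda batch: tokenizeFunction(batch["text"]), batched=True)

trainer = Trainer(
    model=model,
    args=trainingArgs,
    train_dataset=tokenizedDatasets["train"],
    eval_dataset=tokenizedDatasets["test"],
)
trainer.train()

No custom data_collator is needed here: Trainer's default collator turns the token lists produced by map into tensors and maps the label column to labels.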
