使用PyTorch和PyTorchLightning进行光线训练会引发ValueError(“预期父对象”)

fsi0uk1n  于 2023-03-18  发布在  其他
关注(0)|答案(1)|浏览(394)

我有一个代码,有一个数据模块和一个模型,我训练我的模型与射线教练,这里是我的代码:

class CSIDataset(pl.LightningDataModule):
        def __init__(self, pkl_dir):
            super().__init__()
            self.samples = []
            for file in os.listdir(pkl_dir):
                sample = pickle.load(open(pkl_dir + file, 'rb'))
                if sample['x_score'].shape[0] > 0:
                    self.samples += [sample]
            
        def __getitem__(self, idx):
            data = self.samples[idx]
            return data['x_capture'], data['x_score'], data['label']
        
        def __len__(self):
            return len(self.samples)
    
    class CSIModel(pl.LightningModule):
        def __init__(self, config):
            super().__init__()
            self.config = config
            self.criterion = nn.BCELoss()
            self.capture_rnn = nn.Sequential(
                nn.Linear(config['capture_input_dim'], config['d_Wa']),
                nn.Tanh(),
                nn.Dropout(config['dropout']),
                nn.LSTM(
                    input_size=config['d_Wa'], 
                    hidden_size=config['d_lstm'],
                    num_layers=1, 
                    batch_first=True)
            )
            self.capture_proj = nn.Sequential(
                nn.Linear(config['d_lstm'], config['d_Wr']),
                nn.Tanh(),
                nn.Dropout(config['dropout'])
            )
            self.score = nn.Sequential(
                nn.Linear(config['score_input_dim'], config['d_Wu']),
                nn.Tanh(),
                nn.Linear(config['d_Wu'], config['d_Ws']),
                nn.Sigmoid()
            )
            self.cls = nn.Sequential(
                nn.Linear(config['d_Ws'] + config['d_Wr'], 1),
                nn.Sigmoid()
            )
            
        def configure_optimizers(self):
            all_params = dict(self.named_parameters())
            wd_name = 'score.0.weight'
            wd_params = all_params[wd_name]
            del all_params[wd_name]
            return torch.optim.Adam(
                [
                    {'params':  wd_params, 'weight_decay': self.config['weight_decay']}, 
                    {'params': list(all_params.values())},
                ], 
                lr=self.config['lr']
            )
            
    
        def count_parameters(self):
            return sum(p.numel() for p in self.parameters() if p.requires_grad)    
            
        def forward(self, x_capture, x_score):
            hc, (_, _) = self.capture_rnn(x_capture.float())
            hc = self.capture_proj(hc[:, -1])
            hs = self.score(x_score.float()).mean(dim=1)
            h = torch.cat([hc, hs], dim=1)
            return self.cls(h)
        
        def step(self, batch, mode='train'):
            x_capture, x_score, labels = batch
            labels = labels[:, None].float()
            logits = self.forward(x_capture, x_score)
            loss = self.criterion(logits, labels)
            
            preds = logits.clone()
            preds[preds >=0.5] = 1
            preds[preds < 0.5] = 0
            acc = (preds == labels).sum() / labels.shape[0]
            tn, fn, fp, tp = confusion_matrix(logits, labels.int(), num_classes=2, threshold=0.5).flatten()
    
            self.log(f'{mode}_loss', loss.item())
            self.log(f'{mode}_acc', acc.item())
            self.log(f'{mode}_tn', tn.item())
            self.log(f'{mode}_fn', fn.item())
            self.log(f'{mode}_fp', fp.item())
            self.log(f'{mode}_tp', tp.item())
            return {
                'loss':loss, 
                'acc':acc, 
                'tn':tn, 
                'fn':fn, 
                'fp':fp, 
                'tp':tp
            }
        
        def training_step(self, batch, batch_idx):
            return self.step(batch)
        
        def test_step(self, batch, batch_idx):
            return self.step(batch, mode='test')
        
        def validation_step(self, batch, batch_idx):
            return self.step(batch, mode='val')
    
    
    
    def experiment(args):
        dataset = 'weibo'
        path = f'assets/{dataset}/'
        
        train_set = CSIDataset(pkl_dir=path + 'train/pkls/')
        val_set = CSIDataset(pkl_dir=path + 'validation/pkls/')
        train_loader = DataLoader(train_set, batch_size=1, shuffle=False, num_workers=1)
        val_loader = DataLoader(val_set, batch_size=1, shuffle=False, num_workers=1)
        
        
        conf = {
            'capture_input_dim' : 112,
            'score_input_dim' : 50,
            'd_Wa': 100,
            'd_lstm' : 50,
            'd_Wr' : 100,
            'd_Wu' : 100,
            'd_Ws' : 1,
            'lr': args['lr'],
            'dropout' : args['dropout'],
            'weight_decay' : args['weight_decay']
        }
        model = CSIModel(conf)
    
        name = f"dataset={dataset}-do={args['dropout']}-lr={args['lr']}-wd={args['weight_decay']}"
        save_dir = f'weights/{name}/'
        logger = TensorBoardLogger(save_dir='logs/', name=name)
        checkpoint = ModelCheckpoint(
            dirpath=save_dir, 
            filename='{epoch}-{val_loss:.2f}', 
            monitor='val_loss',
            mode='min',
            save_top_k=10, 
            every_n_epochs = 5
        )
    #     reporter = TuneReportCallback(
    #         {
    #             "loss": "ptl/val_loss",
    #             "mean_accuracy": "ptl/val_acc"
    #         },
    #         on="validation_end"
    #     )
        os.makedirs(save_dir, exist_ok=True)
        json.dump(conf, open(save_dir + 'config.json', 'w'))
    
        trainer = Trainer(
            benchmark=True, 
            gpus=[1], 
            accumulate_grad_batches=64,
            logger=logger, 
            enable_progress_bar=False,
            max_epochs=10,
            callbacks=[checkpoint]
        )
        trainer.fit(model, train_loader, val_loader)
        res = trainer.validate(val_loader)[0]
        tune.report(**res)
    
    analysis = tune.run(
        experiment,
        num_samples=4,
        resources_per_trial={"cpu": 1, "gpu": 2},
        verbose=1,
        config={
            "weight_decay": tune.grid_search([0., 0.1, 0.01, 0.001]),
            "lr": tune.loguniform(1e-5, 1e-1),
            "dropout": tune.uniform(0., 0.3)
        }
    )

但在运行代码时,我收到了以下错误:

File "/tmp/ipykernel_2468582/2857088609.py", line 62, in experiment
 File "/home/user/venv37/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 821, in validate
   return self._call_and_handle_interrupt(self._validate_impl, model, dataloaders, ckpt_path, verbose, datamodule)
 File "/home/user/venv37/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 685, in _call_and_handle_interrupt
   return trainer_fn(*args, **kwargs)
 File "/home/user/venv37/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 864, in _validate_impl
   results = self._run(model, ckpt_path=self.validated_ckpt_path)
 File "/home/user/venv37/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1128, in _run
   verify_loop_configurations(self)
 File "/home/user/venv37/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py", line 40, in verify_loop_configurations
   __verify_eval_loop_configuration(trainer, model, "val")
 File "/home/user/venv37/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py", line 170, in __verify_eval_loop_configuration
   has_step = is_overridden(step_name, model)
 File "/home/user/venv37/lib/python3.7/site-packages/pytorch_lightning/utilities/model_helpers.py", line 47, in is_overridden
   raise ValueError("Expected a parent")
ValueError: Expected a parent

尽管我的数据模块和模型都继承了pl.LightningModule!我试过Python 3.8,torch==1.9.0和pytorch_lightning==1.5.5以及Python 3.7,但是错误仍然存在。任何解决方案都是值得赞赏的!我也试过this solution,但是没有成功!

46scxncf

46scxncf1#

这可能涉及版本兼容性。
升级lightning并使用导入的lightning.pytorch作为pl
而不是将pytorch_lightning作为pl导入
对我有用

相关问题