Paddle 用paddle重写pytorch的代码后,loss一直在上升

h22fl7wq  于 2021-11-30  发布在  Java
关注(0)|答案(2)|浏览(329)

PaddlePaddle版本:2.1.2
pytorch实现:https://github.com/hujinsen/pytorch-StarGAN-VC
然后我对该项目进行了一些修改,loss可以正常下降
用paddle重写后,使用之前训练的数据,loss一直在上升
模型经测试后没有问题,可能哪些问题让loss上升呢

修改后的模型
model.py

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

class Down2d(nn.Layer):
    """Gated down-sampling block.

    Runs the input through two parallel conv + instance-norm branches with
    identical geometry and combines them GLU-style: content * sigmoid(gate).
    """

    def __init__(self, in_channel, out_channel, kernel, stride, padding):
        super(Down2d, self).__init__()
        # Content branch.
        self.c1 = nn.Conv2D(in_channel, out_channel, kernel_size=kernel, stride=stride, padding=padding)
        self.n1 = nn.InstanceNorm2D(out_channel)
        # Gate branch (same shape as the content branch).
        self.c2 = nn.Conv2D(in_channel, out_channel, kernel_size=kernel, stride=stride, padding=padding)
        self.n2 = nn.InstanceNorm2D(out_channel)

    def forward(self, x):
        content = self.n1(self.c1(x))
        gate = self.n2(self.c2(x))
        # Gated linear unit: the second branch modulates the first.
        return content * F.sigmoid(gate)

class Up2d(nn.Layer):
    """Gated up-sampling block.

    Mirror of Down2d with transposed convolutions: two parallel
    deconv + instance-norm branches combined as content * sigmoid(gate).
    """

    def __init__(self, in_channel, out_channel, kernel, stride, padding):
        super(Up2d, self).__init__()
        # Content branch.
        self.c1 = nn.Conv2DTranspose(in_channel, out_channel, kernel_size=kernel, stride=stride, padding=padding)
        self.n1 = nn.InstanceNorm2D(out_channel)
        # Gate branch (same shape as the content branch).
        self.c2 = nn.Conv2DTranspose(in_channel, out_channel, kernel_size=kernel, stride=stride, padding=padding)
        self.n2 = nn.InstanceNorm2D(out_channel)

    def forward(self, x):
        content = self.n1(self.c1(x))
        gate = self.n2(self.c2(x))
        # Gated linear unit: the second branch modulates the first.
        return content * F.sigmoid(gate)

class Generator(nn.Layer):
    """StarGAN-VC generator.

    Down-samples the input spectrogram, then up-samples it back while
    re-injecting the target-speaker one-hot code `c` (tiled spatially and
    concatenated on the channel axis) before every up-sampling stage.
    `num` is the number of speakers, i.e. the length of `c`.
    """

    def __init__(self, num):
        super(Generator, self).__init__()
        self.downsample = nn.Sequential(
            Down2d(1, 32, (3, 9), (1, 1), (1, 4)),
            Down2d(32, 64, (4, 8), (2, 2), (1, 3)),
            Down2d(64, 128, (4, 8), (2, 2), (1, 3)),
            Down2d(128, 64, (3, 5), (1, 1), (1, 2)),
            Down2d(64, 5, (9, 5), (9, 1), (1, 2)),
        )
        # Each up block consumes its input channels plus the `num` condition
        # channels concatenated by _inject().
        self.up1 = Up2d(num + 5, 64, (9, 5), (9, 1), (0, 2))
        self.up2 = Up2d(num + 64, 128, (3, 5), (1, 1), (1, 2))
        self.up3 = Up2d(num + 128, 64, (4, 8), (2, 2), (1, 3))
        self.up4 = Up2d(num + 64, 32, (4, 8), (2, 2), (1, 3))

        self.deconv = nn.Conv2DTranspose(num + 32, 1, (3, 9), (1, 1), (1, 4))

    def _inject(self, x, c):
        """Tile the (B, num, 1, 1) condition to x's spatial size and concat on channels."""
        tiled = paddle.tile(c, repeat_times=[1, 1, x.shape[2], x.shape[3]])
        return paddle.concat(x=[x, tiled], axis=1)

    def forward(self, x, c):
        x = self.downsample(x)
        # Reshape the one-hot code to broadcastable (B, num, 1, 1).
        c = paddle.reshape(c, [c.shape[0], c.shape[1], 1, 1])
        # Condition is re-injected in front of every remaining stage.
        for stage in (self.up1, self.up2, self.up3, self.up4, self.deconv):
            x = stage(self._inject(x, c))
        return x

class Discriminator(nn.Layer):
    """Conditional discriminator.

    The speaker one-hot code `c` is tiled and concatenated onto the input of
    every stage; the final score is average-pooled, squeezed and passed
    through tanh. `num` is the number of speakers.
    """

    def __init__(self, num):
        super(Discriminator, self).__init__()
        # Each stage consumes its feature channels plus `num` condition channels.
        self.d1 = Down2d(num + 1, 32, (3, 9), (1, 1), (1, 4))
        self.d2 = Down2d(num + 32, 32, (3, 8), (1, 2), (1, 3))
        self.d3 = Down2d(num + 32, 32, (3, 8), (1, 2), (1, 3))
        self.d4 = Down2d(num + 32, 32, (3, 6), (1, 2), (1, 2))

        self.conv = nn.Conv2D(num + 32, 1, (36, 5), (36, 1), (0, 2))
        self.pool = nn.AvgPool2D((1, 64))

    def _inject(self, x, c):
        """Tile the (B, num, 1, 1) condition to x's spatial size and concat on channels."""
        tiled = paddle.tile(c, repeat_times=[1, 1, x.shape[2], x.shape[3]])
        return paddle.concat(x=[x, tiled], axis=1)

    def forward(self, x, c):
        # Reshape the one-hot code to broadcastable (B, num, 1, 1).
        c = paddle.reshape(c, [c.shape[0], c.shape[1], 1, 1])
        for stage in (self.d1, self.d2, self.d3, self.d4, self.conv):
            x = stage(self._inject(x, c))

        x = self.pool(x)
        x = paddle.squeeze(x)
        # Output is squashed to (-1, 1) by tanh.
        return paddle.tanh(x)

class DomainClassifier(nn.Layer):
    """Speaker (domain) classifier over the low-frequency band of the input.

    Emits raw per-speaker logits (no LogSoftmax here); the caller is expected
    to apply a softmax-based loss. `num` is the number of speakers.
    """

    def __init__(self, num):
        super(DomainClassifier, self).__init__()
        self.main = nn.Sequential(
            Down2d(1, 8, (4, 4), (2, 2), (5, 1)),
            Down2d(8, 16, (4, 4), (2, 2), (1, 1)),
            Down2d(16, 32, (4, 4), (2, 2), (0, 1)),
            Down2d(32, 16, (3, 4), (1, 2), (1, 1)),
            nn.Conv2D(16, num, (1, 4), (1, 2), (0, 1)),
            nn.AvgPool2D((1, 16)),
        )

    def forward(self, x):
        # Only the first 8 frequency bins are classified.
        low_band = x[:, :, 0:8, :]
        out = self.main(low_band)
        # Collapse the 1x1 spatial dims -> (B, num) logits.
        return paddle.reshape(out, [out.shape[0], out.shape[1]])

model_loader.py

import paddle

def save(model, g_lr, d_lr, c_lr, speakers, path, go, do, co):
    """Write a checkpoint: a model state dict plus learning rates, the
    speaker list, and the three optimizer state dicts (go/do/co)."""
    checkpoint = {
        'state_dict': model,
        'g_lr': g_lr,
        'd_lr': d_lr,
        'c_lr': c_lr,
        'speakers': speakers,
        'go': go,
        'do': do,
        'co': co,
    }
    paddle.save(checkpoint, path)

def load(path):
    """Read back a checkpoint dict previously written by save()."""
    checkpoint = paddle.load(path)
    return checkpoint

solver.py

import sys
sys.path.append('/home/aistudio/external-libraries')
sys.path.append('/home/aistudio/vc')
import os
import time
from datetime import datetime, timedelta

import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.static import Variable

from data_loader import TestSet
from model import Discriminator, DomainClassifier, Generator
from utility import Normalizer, speakers, load_step
from preprocess import FRAMES, SAMPLE_RATE, FFTSIZE
import random
from sklearn.preprocessing import LabelBinarizer
from pyworld import decode_spectral_envelope, synthesize
import librosa
import ast
import model_loader
from tensorboardX import SummaryWriter

class Solver(object):
    """Training and conversion driver for the Paddle StarGAN-VC port.

    Owns the Generator (G), Discriminator (D) and DomainClassifier (C),
    one Adam optimizer each, checkpointing via model_loader, TensorBoard
    logging, and the voice-conversion path in test().
    """
    def __init__(self, data_loader, config,spes):
        # Rebind the module-level `speakers` list to the caller-supplied one
        # so label encoding and checkpoints all use the same speaker set.
        global speakers
        speakers=spes
        self.config = config
        self.data_loader = data_loader
        # Model configurations.

        self.lambda_cycle = config.lambda_cycle
        self.lambda_cls = config.lambda_cls
        self.lambda_identity = config.lambda_identity

        # Training configurations.
        self.data_dir = config.data_dir
        self.test_dir = config.test_dir
        self.batch_size = config.batch_size
        self.num_iters = config.num_iters
        self.num_iters_decay = config.num_iters_decay
        self.g_lr = config.g_lr
        self.d_lr = config.d_lr
        self.c_lr = config.c_lr
        self.n_critic = config.n_critic
        self.beta1 = config.beta1
        self.beta2 = config.beta2
        self.resume_iters = config.resume_iters

        # Test configurations.
        self.test_iters = config.test_iters
        self.trg_speaker = ast.literal_eval(config.trg_speaker)
        self.src_speaker = config.src_speaker

        # Miscellaneous.
        self.spk_enc = LabelBinarizer().fit(speakers)
        # Directories.
        self.log_dir = config.log_dir
        self.model_save_dir = config.model_save_dir
        self.result_dir = config.result_dir

        # Step size.
        self.log_step = config.log_step
        self.model_save_step = config.model_save_step
        self.lr_update_step = config.lr_update_step

        self.auto_load = config.auto_load
        self.load = config.load
        # 'g'/'d' restricts which network gets adversarial updates; threshold
        # gates loss backprop (see train()).
        self.target = config.target
        self.threshold = config.threshold

        # Shadow copies of the current lrs / iteration index, consumed by
        # safe_train() to checkpoint after an unexpected exception.
        self.temp_g_lr=config.g_lr
        self.temp_d_lr=config.d_lr
        self.temp_c_lr=config.c_lr
        self.temp_i=0

        self.writer = SummaryWriter(log_dir=config.log_dir)

        # Build the model and tensorboard.
        self.build_model()

    def build_model(self):
        """Instantiate G/D/C sized to the speaker set and one Adam optimizer each."""
        self.G = Generator(len(speakers))
        self.D = Discriminator(len(speakers))
        self.C = DomainClassifier(len(speakers))

        self.g_optimizer = paddle.optimizer.Adam(learning_rate=self.g_lr,parameters=self.G.parameters(),beta1=self.beta1,beta2=self.beta2)
        self.d_optimizer = paddle.optimizer.Adam(learning_rate=self.d_lr,parameters=self.D.parameters(),beta1=self.beta1,beta2=self.beta2)
        self.c_optimizer = paddle.optimizer.Adam(learning_rate=self.c_lr,parameters=self.C.parameters(),beta1=self.beta1,beta2=self.beta2)

        #self.G.to(self.device)
        #self.D.to(self.device)
        #self.C.to(self.device)

    def update_lr(self, g_lr, d_lr, c_lr):
        # NOTE(review): this is a no-op. The decayed rates computed in train()
        # are never pushed into the optimizers (the PyTorch original rewrites
        # param_groups['lr']; Paddle would need optimizer.set_lr()), so the
        # effective learning rates never actually decay — worth verifying as a
        # contributor to the reported divergence.
        pass

    def save_lr(self,g_lr,d_lr,c_lr):
        """Record the latest learning rates for crash-time checkpointing."""
        self.temp_g_lr=g_lr
        self.temp_d_lr=d_lr
        self.temp_c_lr=c_lr

    def train(self):
        """Run the adversarial training loop for self.num_iters iterations."""
        start_iters = 0

        # Learning rate cache for decaying.
        g_lr = self.g_lr
        d_lr = self.d_lr
        c_lr = self.c_lr

        # Optionally resume from the latest (or explicitly chosen) checkpoint.
        if self.auto_load:
            step=load_step(self.model_save_dir)
            if self.load > 0:
                step=self.load
            if not step==0:
                print('Load step:'+str(step))
                G_path = os.path.join(self.model_save_dir, '{}-G.pad'.format(step))
                D_path = os.path.join(self.model_save_dir, '{}-D.pad'.format(step))
                C_path = os.path.join(self.model_save_dir, '{}-C.pad'.format(step))
                modG=model_loader.load(G_path)
                g_lr=modG['g_lr']
                d_lr=modG['d_lr']
                c_lr=modG['c_lr']
                # NOTE(review): bare except silently ignores any failure here,
                # not just missing optimizer data — consider narrowing.
                try:
                    self.g_optimizer.set_state_dict(modG['go'])
                    self.d_optimizer.set_state_dict(modG['do'])
                    self.c_optimizer.set_state_dict(modG['co'])
                except:
                    print('no optimizer data')
                self.save_lr(g_lr,d_lr,c_lr)
                self.G.set_state_dict(modG['state_dict'])
                self.D.set_state_dict(model_loader.load(D_path)['state_dict'])
                self.C.set_state_dict(model_loader.load(C_path)['state_dict'])
                start_iters=step

        if self.resume_iters:
            pass

        norm = Normalizer()
        data_iter = iter(self.data_loader)

        print('Start training......')
        start_time = datetime.now()

        for i in range(start_iters, self.num_iters):
            self.temp_i=i
            # =================================================================================== #
            #                             1. Preprocess input data                                #
            # =================================================================================== #
             # Fetch real images and labels.
            # Restart the data iterator when it is exhausted (bare except also
            # hides genuine loader errors — NOTE(review): catch StopIteration).
            try:
                x_real, speaker_idx_org, label_org = next(data_iter)
                speaker_idx_org = paddle.reshape(speaker_idx_org,[-1])
            except:
                data_iter = iter(self.data_loader)
                x_real, speaker_idx_org, label_org = next(data_iter)
                speaker_idx_org = paddle.reshape(speaker_idx_org,[-1])

            # Generate target domain labels randomly.
            rand_idx = paddle.randperm(label_org.shape[0])
            # NOTE(review): the next line is dead — label_trg is immediately
            # reassigned below without the [1,-1] reshape.
            label_trg = paddle.reshape(label_org[rand_idx],[1,-1])
            label_trg = label_org[rand_idx]
            speaker_idx_trg = speaker_idx_org[rand_idx]

            #x_real = x_real.to(self.device)           # Input images.
            #label_org = label_org.to(self.device)     # Original domain one-hot labels.
            #label_trg = label_trg.to(self.device)     # Target domain one-hot labels.
            #speaker_idx_org = speaker_idx_org.to(self.device) # Original domain labels
            #speaker_idx_trg = speaker_idx_trg.to(self.device) #Target domain labels

            # =================================================================================== #
            #                             2. Train the discriminator                              #
            # =================================================================================== #
            # Compute loss with real audio frame.
            # (Loss object is re-created every iteration; it is stateless, so
            # this is only a minor inefficiency.)
            CELoss = nn.CrossEntropyLoss()
            cls_real = self.C(x_real)
            cls_loss_real = CELoss(input=cls_real, label=speaker_idx_org)

            self.reset_grad()
            cls_loss_real.backward()
            self.c_optimizer.step()
             # Logging.
            loss = {}
            loss['C/C_loss'] = cls_loss_real.item()

            self.writer.add_scalar('C_Loss', loss['C/C_loss'], i)

            if not self.target =='g':
                out_r = self.D(x_real, label_org)
                # Compute loss with fake audio frame.
                x_fake = self.G(x_real, label_trg)
                out_f = self.D(x_fake.detach(), label_trg)
                # NOTE(review): D's forward already ends in tanh (model.py) but
                # BCE-with-logits applies its own sigmoid here — double-check
                # this pairing against the PyTorch original's loss.
                d_loss_t = F.binary_cross_entropy_with_logits(logit=out_f,label=paddle.zeros_like(out_f, dtype='float32')) + \
                    F.binary_cross_entropy_with_logits(logit=out_r, label=paddle.ones_like(out_r, dtype='float32'))

                out_cls = self.C(x_fake)
                d_loss_cls = CELoss(input=out_cls, label=speaker_idx_trg)

                # Compute loss for gradient penalty.
                # NOTE(review): paddle.to_tensor copies the interpolate into a
                # fresh leaf tensor (stop_gradient=False) — gradient_penalty
                # differentiates w.r.t. this leaf; confirm the D weights still
                # receive the intended penalty gradient (see gradient_penalty).
                alpha = paddle.rand([x_real.shape[0], 1, 1, 1])
                x_hat = paddle.to_tensor((alpha * x_real + (1 - alpha) * x_fake),stop_gradient=False)
                out_src = self.D(x_hat, label_trg)
                d_loss_gp = self.gradient_penalty(out_src, x_hat)

                # Hard-coded gradient-penalty weight of 5.
                d_loss = d_loss_t + self.lambda_cls * d_loss_cls + 5*d_loss_gp

                # Only update D while its loss is above the threshold
                # (keeps D from overpowering G).
                if self.threshold < d_loss:
                    self.reset_grad()
                    d_loss.backward()
                    self.d_optimizer.step()

                loss['D/D_loss'] = d_loss.item()

                self.writer.add_scalar('D_Loss', loss['D/D_loss'], i)

            # =================================================================================== #
            #                               3. Train the generator                                #
            # =================================================================================== #
            if (i+1) % self.n_critic == 0 and not self.target == 'd':
                # Original-to-target domain.
                x_fake = self.G(x_real, label_trg)
                g_out_src = self.D(x_fake, label_trg)
                g_loss_fake = F.binary_cross_entropy_with_logits(logit=g_out_src, label=paddle.ones_like(g_out_src, dtype='float32'))

                out_cls = self.C(x_real)
                g_loss_cls = CELoss(input=out_cls, label=speaker_idx_org)

                # Target-to-original domain.
                x_reconst = self.G(x_fake, label_org)
                g_loss_rec = F.l1_loss(x_reconst, x_real )

                # Original-to-Original domain(identity).
                x_fake_iden = self.G(x_real, label_org)
                id_loss = F.l1_loss(x_fake_iden, x_real )

                # Backward and optimize.
                g_loss = g_loss_fake + self.lambda_cycle * g_loss_rec +\
                 self.lambda_cls * g_loss_cls + self.lambda_identity * id_loss

                # NOTE(review): this gate tests d_loss, not g_loss — when
                # self.target == 'g' d_loss is never assigned and this raises
                # NameError; presumably g_loss was intended. Verify.
                if self.threshold < d_loss:
                    self.reset_grad()
                    g_loss.backward()
                    self.g_optimizer.step()

                # Logging.
                loss['G/loss_fake'] = g_loss_fake.item()
                loss['G/loss_rec'] = g_loss_rec.item()
                loss['G/loss_cls'] = g_loss_cls.item()
                loss['G/loss_id'] = id_loss.item()
                loss['G/g_loss'] = g_loss.item()

                self.writer.add_scalar('G_Loss', loss['G/g_loss'], i)
            # =================================================================================== #
            #                                 4. Miscellaneous                                    #
            # =================================================================================== #
            # Print out training information.
            if (i+1) % self.log_step == 0:
                et = datetime.now() - start_time
                et = str(et)[:-7]
                log = "Elapsed [{}], Iteration [{}/{}]".format(et, i+1, self.num_iters)
                for tag, value in loss.items():
                    log += ", {}: {:.4f}".format(tag, value)
                print(log)

            # Save model checkpoints.
            if (i+1) % self.model_save_step == 0:
                G_path = os.path.join(self.model_save_dir, '{}-G.pad'.format(i+1))
                D_path = os.path.join(self.model_save_dir, '{}-D.pad'.format(i+1))
                C_path = os.path.join(self.model_save_dir, '{}-C.pad'.format(i+1))
                model_loader.save(self.G.state_dict(),g_lr,d_lr,c_lr,speakers,G_path,self.g_optimizer.state_dict(),self.d_optimizer.state_dict(),self.c_optimizer.state_dict())
                model_loader.save(self.D.state_dict(),g_lr,d_lr,c_lr,speakers,D_path,self.g_optimizer.state_dict(),self.d_optimizer.state_dict(),self.c_optimizer.state_dict())
                model_loader.save(self.C.state_dict(),g_lr,d_lr,c_lr,speakers,C_path,self.g_optimizer.state_dict(),self.d_optimizer.state_dict(),self.c_optimizer.state_dict())
                print('Saved model checkpoints into {}...'.format(self.model_save_dir))

            # Decay learning rates.
            # NOTE(review): update_lr() is a no-op (see above), so only the
            # checkpointed values decay, never the optimizers' actual lrs.
            if (i+1) % self.lr_update_step == 0 and (i+1) > (self.num_iters - self.num_iters_decay):
                g_lr -= (self.g_lr / float(self.num_iters_decay))
                d_lr -= (self.d_lr / float(self.num_iters_decay))
                c_lr -= (self.c_lr / float(self.num_iters_decay))
                self.update_lr(g_lr, d_lr, c_lr)
                self.save_lr(g_lr,d_lr,c_lr)
                print ('Decayed learning rates, g_lr: {}, d_lr: {}.'.format(g_lr, d_lr))

    def gradient_penalty(self, y, x):
        """Compute gradient penalty: (L2_norm(dy/dx) - 1)**2."""
        y = paddle.reshape(y,[-1])
        weight = paddle.ones(y.shape)
        # NOTE(review): create_graph=False means the penalty is a constant
        # w.r.t. D's parameters when d_loss.backward() later runs — the
        # PyTorch original passes create_graph=True so the penalty can be
        # optimized. This is a prime suspect for the reported rising loss.
        dydx = paddle.grad(outputs=y,inputs=x,grad_outputs=weight,retain_graph=True,create_graph=False,only_inputs=True)[0]
        dydx.stop_gradient = False

        dydx = paddle.reshape(dydx,[dydx.shape[0],-1])
        dydx_l2norm = paddle.sqrt(paddle.sum(dydx**2, axis=1))
        return paddle.mean((dydx_l2norm-1)**2)

    def reset_grad(self):
        """Reset the gradient buffers."""
        self.g_optimizer.clear_grad()
        self.d_optimizer.clear_grad()
        self.c_optimizer.clear_grad()

    def restore_model(self, resume_iters):
        """Restore the trained generator and discriminator."""
        print('Loading the trained models from step {}...'.format(resume_iters))
        G_path = os.path.join(self.model_save_dir, '{}-G.pad'.format(resume_iters))
        D_path = os.path.join(self.model_save_dir, '{}-D.pad'.format(resume_iters))
        C_path = os.path.join(self.model_save_dir, '{}-C.pad'.format(resume_iters))
        self.G.set_state_dict(model_loader.load(G_path)['state_dict'])
        self.D.set_state_dict(model_loader.load(D_path)['state_dict'])
        self.C.set_state_dict(model_loader.load(C_path)['state_dict'])

    @staticmethod
    def pad_coded_sp(coded_sp_norm):
        """Right-pad the (features, frames) matrix with zeros so the frame
        count becomes a multiple of FRAMES.

        NOTE(review): when f_len is already an exact multiple of FRAMES this
        appends one full extra FRAMES-wide block of zeros — confirm intended.
        """
        f_len = coded_sp_norm.shape[1]
        if  f_len >= FRAMES:
            pad_length = FRAMES-(f_len - (f_len//FRAMES) * FRAMES)
        elif f_len < FRAMES:
            pad_length = FRAMES - f_len

        sp_norm_pad = np.hstack((coded_sp_norm, np.zeros((coded_sp_norm.shape[0], pad_length))))
        return sp_norm_pad

    def test(self):
        """Translate speech using StarGAN ."""
        # Load the trained generator.
        self.restore_model(self.test_iters)
        norm = Normalizer()

        # Set data loader.
        d, speaker = TestSet(self.test_dir).test_data(self.src_speaker)
        targets = self.trg_speaker

        for target in targets:
            print(target)
            assert target in speakers
            # One-hot code for the conversion target speaker.
            label_t = self.spk_enc.transform([target])[0]
            label_t = np.asarray([label_t])

            with paddle.no_grad():

                for filename, content in d.items():
                    f0 = content['f0']
                    ap = content['ap']
                    sp_norm_pad = self.pad_coded_sp(content['coded_sp_norm'])

                    # Convert FRAMES-wide segments one at a time.
                    convert_result = []
                    for start_idx in range(0, sp_norm_pad.shape[1] - FRAMES + 1, FRAMES):
                        one_seg = sp_norm_pad[:, start_idx : start_idx+FRAMES]

                        one_seg = paddle.to_tensor(one_seg,dtype='float32')
                        one_seg = paddle.reshape(one_seg,[1,1,one_seg.shape[0],one_seg.shape[1]])
                        l = paddle.to_tensor(label_t,dtype='float32')
                        #one_seg = one_seg.to(self.device)
                        #l = l.to(self.device)
                        one_set_return = self.G(one_seg, l).cpu().numpy()
                        one_set_return = np.squeeze(one_set_return)
                        one_set_return = norm.backward_process(one_set_return, target)
                        convert_result.append(one_set_return)

                    # Re-assemble, trim the padding, and resynthesize audio
                    # with WORLD using the pitch-converted f0.
                    convert_con = np.concatenate(convert_result, axis=1)
                    convert_con = convert_con[:, 0:content['coded_sp_norm'].shape[1]]
                    contigu = np.ascontiguousarray(convert_con.T, dtype=np.float64)
                    decoded_sp = decode_spectral_envelope(contigu, SAMPLE_RATE, fft_size=FFTSIZE)
                    f0_converted = norm.pitch_conversion(f0, speaker, target)
                    wav = synthesize(f0_converted, decoded_sp, ap, SAMPLE_RATE)

                    name = f'{speaker}-{target}_iter{self.test_iters}_(unknown)'
                    path = os.path.join(self.result_dir, name)
                    print(f'[save]:{path}')
                    librosa.output.write_wav(path, wav, SAMPLE_RATE)

    def safe_train(self,throw=False):
        """Run train(); on ANY exception (BaseException is deliberate, so
        KeyboardInterrupt also checkpoints), save the current models/optimizer
        state at iteration temp_i+1, then optionally re-raise."""
        try:
            self.train()
            self.writer.close()
        except BaseException as e:
            print('Warning! An exception has been thrown')
            print('Warning! Saving model')
            self.writer.close()
            G_path = os.path.join(self.model_save_dir, '{}-G.pad'.format(self.temp_i+1))
            D_path = os.path.join(self.model_save_dir, '{}-D.pad'.format(self.temp_i+1))
            C_path = os.path.join(self.model_save_dir, '{}-C.pad'.format(self.temp_i+1))
            model_loader.save(self.G.state_dict(),self.temp_g_lr,self.temp_d_lr,self.temp_c_lr,speakers,G_path,self.g_optimizer.state_dict(),self.d_optimizer.state_dict(),self.c_optimizer.state_dict())
            model_loader.save(self.D.state_dict(),self.temp_g_lr,self.temp_d_lr,self.temp_c_lr,speakers,D_path,self.g_optimizer.state_dict(),self.d_optimizer.state_dict(),self.c_optimizer.state_dict())
            model_loader.save(self.C.state_dict(),self.temp_g_lr,self.temp_d_lr,self.temp_c_lr,speakers,C_path,self.g_optimizer.state_dict(),self.d_optimizer.state_dict(),self.c_optimizer.state_dict())
            print('Saved model checkpoints into {}...'.format(self.model_save_dir))
            print('Training termination...')
            if throw:
                raise e

if __name__ == '__main__':
    # This module is meant to be imported; nothing runs standalone.
    pass

data_loader.py

import sys
sys.path.append('/home/aistudio/external-libraries')
sys.path.append('/home/aistudio/vc')
import os

import librosa
import numpy as np
import paddle
from sklearn.preprocessing import LabelBinarizer
from paddle.io import DataLoader,Dataset

from preprocess import (FEATURE_DIM, FFTSIZE, FRAMES, SAMPLE_RATE,
                        world_features)
from utility import Normalizer, speakers
import random

class AudioDataset(Dataset):
    """Dataset of pre-extracted MCEP feature files (.npy), one per utterance.

    A file is included when the part of its basename before the first '-' or
    '_' matches a known speaker and its extension is '.npy'. Each item is
    (mcep tensor with a leading channel dim, speaker index, one-hot label).
    """

    def __init__(self, datadir: str):
        super(AudioDataset, self).__init__()
        self.datadir = datadir
        self.files = []
        for root, dirs, files in os.walk(datadir, followlinks=True):
            for f in files:
                if self._speaker_of(f) in speakers and os.path.splitext(f)[1].lower() == '.npy':
                    self.files.append(os.path.join(root, f))
        self.encoder = LabelBinarizer().fit(speakers)

    @staticmethod
    def _speaker_of(filename):
        """Speaker id = basename (extension stripped) up to the first '-' or '_'."""
        return os.path.splitext(filename)[0].split('-', 1)[0].split('_', 1)[0]

    def __getitem__(self, idx):
        p = self.files[idx]
        filename = os.path.basename(p)
        # BUG FIX: the original split only on '_', so a file named like
        # "SPK-001.npy" passed the __init__ filter (which also splits on '-')
        # but produced an unknown speaker here, crashing the label lookup.
        # Use the exact same parsing rule as the filter.
        speaker = self._speaker_of(filename)
        label = self.encoder.transform([speaker])[0]
        mcep = np.load(p)
        mcep = paddle.to_tensor(mcep, dtype='float32')
        mcep = paddle.unsqueeze(mcep, 0)  # add channel dim -> (1, H, W)
        return mcep, paddle.to_tensor(speakers.index(speaker), dtype='int64'), paddle.to_tensor(label, dtype='float32')

    def speaker_encoder(self):
        """Return the fitted one-hot label encoder."""
        return self.encoder

    def __len__(self):
        return len(self.files)

def data_loader(datadir: str, batch_size=4, shuffle=True, mode='train', num_workers=2, spes=speakers):
    """Build a paddle DataLoader over AudioDataset.

    If mode is 'train', datadir should contain the training set as .npy
    files; if mode is 'test', datadir should contain only wav files.
    """
    # Rebind the module-level speaker list so AudioDataset filters on it.
    global speakers
    speakers = spes
    loader = DataLoader(AudioDataset(datadir), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
    return loader

class TestSet(object):
    """Loader of test wav files for one source speaker.

    test_data() extracts WORLD features for every wav of the chosen speaker
    and returns them keyed by filename.
    """

    def __init__(self, datadir: str):
        super(TestSet, self).__init__()
        self.datadir = datadir
        self.norm = Normalizer()

    def choose(self):
        """Choose one speaker at random for testing."""
        return random.choice(speakers)

    def test_data(self, src_speaker=None):
        """Extract features for `src_speaker` (random speaker if None).

        Returns:
            (res, speaker) where res maps filename -> dict with keys
            'coded_sp_norm' (normalized coded spectral envelope, transposed),
            'f0' and 'ap', all as numpy arrays.
        """
        r_s = src_speaker if src_speaker else self.choose()
        p = os.path.join(self.datadir, r_s)
        wavfiles = librosa.util.find_files(p, ext='wav')

        res = {}
        for f in wavfiles:
            filename = os.path.basename(f)
            wav, _ = librosa.load(f, sr=SAMPLE_RATE, dtype=np.float64)
            f0, timeaxis, sp, ap, coded_sp = world_features(wav, SAMPLE_RATE, FFTSIZE, FEATURE_DIM)
            coded_sp_norm = self.norm.forward_process(coded_sp.T, r_s)

            # Idiom fix: setdefault replaces the awkward res.__contains__ check.
            entry = res.setdefault(filename, {})
            entry['coded_sp_norm'] = np.asarray(coded_sp_norm)
            entry['f0'] = np.asarray(f0)
            entry['ap'] = np.asarray(ap)
        return res, r_s
7uhlpewt

7uhlpewt1#

您好,我们已经收到了您的问题,会安排技术人员尽快解答您的问题,请耐心等待。请您再次检查是否提供了清晰的问题描述、复现代码、环境&版本、报错信息等。同时,您也可以通过查看官网 API 文档、常见问题、历史 Issue、AI 社区来寻求解答。祝您生活愉快~

Hi! We've received your issue and please be patient to get responded. We will arrange technicians to answer your questions as soon as possible. Please make sure that you have posted enough message to demo your request. You may also check out the API, FAQ, Github Issue and AI community to get the answer. Have a nice day!

g6ll5ycj

g6ll5ycj2#

这边提供一些建议,希望能对您有帮助:
1.建议检查一下pytorch和paddle的op对应和使用方法问题,有些op的参数和使用方法是不相同的
2.尝试将数据更换成全1的人造数据,测试是否可以正常收敛拟合
3.如果有一些随机量,您可以通过paddle.seed()以及FLAGS_cudnn_deterministic,使用确定的初始化方式来固定,这样方便您查找
4.您可以逐层打印op的输出,查看真实输出是否符合预期输出,包括反向的grad也可以通过tensor.grad查看
5.可以检查一下训练和infer的时候是否处于相应分支的代码逻辑中,使用model.train()来切换为训练逻辑

相关问题