PaddlePaddle版本:2.1.2
pytorch实现:https://github.com/hujinsen/pytorch-StarGAN-VC
然后我对该项目进行了一些修改,loss可以正常下降
用paddle重写后,使用之前训练的数据,loss一直在上升
模型经测试后没有问题,可能哪些问题让loss上升呢
修改后的模型model.py
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
class Down2d(nn.Layer):
    """Downsampling block with a gated linear unit (GLU) activation.

    Two parallel conv + instance-norm branches are computed from the same
    input; the sigmoid of the second branch gates the first elementwise.

    NOTE(review): paddle.nn.InstanceNorm2D creates learnable scale/bias
    parameters by default, whereas torch.nn.InstanceNorm2d defaults to
    affine=False -- confirm this matches the referenced PyTorch port.
    """

    def __init__(self, in_channel, out_channel, kernel, stride, padding):
        super(Down2d, self).__init__()
        self.c1 = nn.Conv2D(in_channel, out_channel, kernel_size=kernel, stride=stride, padding=padding)
        self.n1 = nn.InstanceNorm2D(out_channel)
        self.c2 = nn.Conv2D(in_channel, out_channel, kernel_size=kernel, stride=stride, padding=padding)
        self.n2 = nn.InstanceNorm2D(out_channel)

    def forward(self, x):
        # value branch gated by the sigmoid of the gate branch (GLU)
        value = self.n1(self.c1(x))
        gate = F.sigmoid(self.n2(self.c2(x)))
        return value * gate
class Up2d(nn.Layer):
    """Upsampling block: two parallel transposed-conv + instance-norm
    branches combined with a gated linear unit (GLU) activation."""

    def __init__(self, in_channel, out_channel, kernel, stride, padding):
        super(Up2d, self).__init__()
        self.c1 = nn.Conv2DTranspose(in_channel, out_channel, kernel_size=kernel, stride=stride, padding=padding)
        self.n1 = nn.InstanceNorm2D(out_channel)
        self.c2 = nn.Conv2DTranspose(in_channel, out_channel, kernel_size=kernel, stride=stride, padding=padding)
        self.n2 = nn.InstanceNorm2D(out_channel)

    def forward(self, x):
        # value branch gated by the sigmoid of the gate branch (GLU)
        value = self.n1(self.c1(x))
        gate = F.sigmoid(self.n2(self.c2(x)))
        return value * gate
class Generator(nn.Layer):
    """StarGAN-VC generator.

    Downsamples the input spectrogram, then runs four upsampling stages
    and a final transposed conv, re-concatenating the target-speaker
    one-hot condition along the channel axis before every stage.
    """

    def __init__(self, num):
        super(Generator, self).__init__()
        self.downsample = nn.Sequential(
            Down2d(1, 32, (3, 9), (1, 1), (1, 4)),
            Down2d(32, 64, (4, 8), (2, 2), (1, 3)),
            Down2d(64, 128, (4, 8), (2, 2), (1, 3)),
            Down2d(128, 64, (3, 5), (1, 1), (1, 2)),
            Down2d(64, 5, (9, 5), (9, 1), (1, 2)),
        )
        # each stage consumes the previous output plus `num` condition channels
        self.up1 = Up2d(num + 5, 64, (9, 5), (9, 1), (0, 2))
        self.up2 = Up2d(num + 64, 128, (3, 5), (1, 1), (1, 2))
        self.up3 = Up2d(num + 128, 64, (4, 8), (2, 2), (1, 3))
        self.up4 = Up2d(num + 64, 32, (4, 8), (2, 2), (1, 3))
        self.deconv = nn.Conv2DTranspose(num + 32, 1, (3, 9), (1, 1), (1, 4))

    @staticmethod
    def _with_condition(x, c):
        # Tile the (N, num, 1, 1) condition over x's spatial dims and
        # concatenate it on the channel axis.
        tiled = paddle.tile(c, repeat_times=[1, 1, x.shape[2], x.shape[3]])
        return paddle.concat(x=[x, tiled], axis=1)

    def forward(self, x, c):
        x = self.downsample(x)
        c = paddle.reshape(c, [c.shape[0], c.shape[1], 1, 1])
        for stage in (self.up1, self.up2, self.up3, self.up4, self.deconv):
            x = stage(self._with_condition(x, c))
        return x
class Discriminator(nn.Layer):
    """Speaker-conditioned discriminator.

    The speaker one-hot condition is tiled and concatenated on the channel
    axis before every layer; the output is pooled, squeezed and passed
    through tanh (the training code feeds it to
    binary_cross_entropy_with_logits).
    """

    def __init__(self, num):
        super(Discriminator, self).__init__()
        self.d1 = Down2d(num + 1, 32, (3, 9), (1, 1), (1, 4))
        self.d2 = Down2d(num + 32, 32, (3, 8), (1, 2), (1, 3))
        self.d3 = Down2d(num + 32, 32, (3, 8), (1, 2), (1, 3))
        self.d4 = Down2d(num + 32, 32, (3, 6), (1, 2), (1, 2))
        self.conv = nn.Conv2D(num + 32, 1, (36, 5), (36, 1), (0, 2))
        self.pool = nn.AvgPool2D((1, 64))

    def forward(self, x, c):
        c = paddle.reshape(c, [c.shape[0], c.shape[1], 1, 1])
        for layer in (self.d1, self.d2, self.d3, self.d4, self.conv):
            cond = paddle.tile(c, repeat_times=[1, 1, x.shape[2], x.shape[3]])
            x = layer(paddle.concat(x=[x, cond], axis=1))
        x = self.pool(x)
        x = paddle.squeeze(x)
        return paddle.tanh(x)
class DomainClassifier(nn.Layer):
    """Predicts the speaker domain from the lowest 8 rows of the input.

    Returns raw (N, num) logits; the caller applies CrossEntropyLoss,
    which is why the trailing LogSoftmax stays disabled.
    """

    def __init__(self, num):
        super(DomainClassifier, self).__init__()
        self.main = nn.Sequential(
            Down2d(1, 8, (4, 4), (2, 2), (5, 1)),
            Down2d(8, 16, (4, 4), (2, 2), (1, 1)),
            Down2d(16, 32, (4, 4), (2, 2), (0, 1)),
            Down2d(32, 16, (3, 4), (1, 2), (1, 1)),
            nn.Conv2D(16, num, (1, 4), (1, 2), (0, 1)),
            nn.AvgPool2D((1, 16)),
            # nn.LogSoftmax()  # intentionally disabled: raw logits are returned
        )

    def forward(self, x):
        # Only the first 8 frequency rows are fed to the classifier.
        low_band = x[:, :, 0:8, :]
        feats = self.main(low_band)
        return paddle.reshape(feats, [feats.shape[0], feats.shape[1]])
model_loader.py
import paddle
def save(model, g_lr, d_lr, c_lr, speakers, path, go, do, co):
    """Persist a model state dict plus training metadata to `path`.

    The checkpoint bundles the model weights, the three current learning
    rates, the speaker list, and the G/D/C optimizer state dicts.
    """
    payload = {
        'state_dict': model,
        'g_lr': g_lr,
        'd_lr': d_lr,
        'c_lr': c_lr,
        'speakers': speakers,
        'go': go,
        'do': do,
        'co': co,
    }
    paddle.save(payload, path)
def load(path):
    """Load and return a checkpoint dict previously written by save()."""
    return paddle.load(path)
solver.py
import sys
sys.path.append('/home/aistudio/external-libraries')
sys.path.append('/home/aistudio/vc')
import os
import time
from datetime import datetime, timedelta
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.static import Variable
from data_loader import TestSet
from model import Discriminator, DomainClassifier, Generator
from utility import Normalizer, speakers, load_step
from preprocess import FRAMES, SAMPLE_RATE, FFTSIZE
import random
from sklearn.preprocessing import LabelBinarizer
from pyworld import decode_spectral_envelope, synthesize
import librosa
import ast
import model_loader
from tensorboardX import SummaryWriter
class Solver(object):
    """Training/testing driver for the StarGAN-VC Paddle port.

    Owns the Generator, Discriminator and DomainClassifier plus one Adam
    optimizer each, and implements the training loop, checkpointing,
    learning-rate decay and the conversion/test path.
    """

    def __init__(self, data_loader, config, spes):
        # The speaker list is shared module-wide so the label encoder and
        # the data pipeline agree on the label ordering.
        global speakers
        speakers = spes
        self.config = config
        self.data_loader = data_loader
        # Model configurations (loss weights).
        self.lambda_cycle = config.lambda_cycle
        self.lambda_cls = config.lambda_cls
        self.lambda_identity = config.lambda_identity
        # Training configurations.
        self.data_dir = config.data_dir
        self.test_dir = config.test_dir
        self.batch_size = config.batch_size
        self.num_iters = config.num_iters
        self.num_iters_decay = config.num_iters_decay
        self.g_lr = config.g_lr
        self.d_lr = config.d_lr
        self.c_lr = config.c_lr
        self.n_critic = config.n_critic
        self.beta1 = config.beta1
        self.beta2 = config.beta2
        self.resume_iters = config.resume_iters
        # Test configurations.
        self.test_iters = config.test_iters
        self.trg_speaker = ast.literal_eval(config.trg_speaker)
        self.src_speaker = config.src_speaker
        # Miscellaneous.
        self.spk_enc = LabelBinarizer().fit(speakers)
        # Directories.
        self.log_dir = config.log_dir
        self.model_save_dir = config.model_save_dir
        self.result_dir = config.result_dir
        # Step sizes and run-control flags.
        self.log_step = config.log_step
        self.model_save_step = config.model_save_step
        self.lr_update_step = config.lr_update_step
        self.auto_load = config.auto_load
        self.load = config.load
        self.target = config.target  # 'g'/'d' restricts which nets train
        self.threshold = config.threshold  # D-loss gate for G/D updates
        # Cached learning rates / iteration index used by safe_train()
        # for an emergency checkpoint after an exception.
        self.temp_g_lr = config.g_lr
        self.temp_d_lr = config.d_lr
        self.temp_c_lr = config.c_lr
        self.temp_i = 0
        self.writer = SummaryWriter(log_dir=config.log_dir)
        # Build the model and tensorboard.
        self.build_model()

    def build_model(self):
        """Instantiate G/D/C and one Adam optimizer per network."""
        self.G = Generator(len(speakers))
        self.D = Discriminator(len(speakers))
        self.C = DomainClassifier(len(speakers))
        self.g_optimizer = paddle.optimizer.Adam(learning_rate=self.g_lr, parameters=self.G.parameters(), beta1=self.beta1, beta2=self.beta2)
        self.d_optimizer = paddle.optimizer.Adam(learning_rate=self.d_lr, parameters=self.D.parameters(), beta1=self.beta1, beta2=self.beta2)
        self.c_optimizer = paddle.optimizer.Adam(learning_rate=self.c_lr, parameters=self.C.parameters(), beta1=self.beta1, beta2=self.beta2)

    def update_lr(self, g_lr, d_lr, c_lr):
        """Push new learning rates into the three optimizers.

        BUGFIX: this was a no-op (`pass`), so the decay computed in
        train() was printed and checkpointed but never applied.
        """
        self.g_optimizer.set_lr(g_lr)
        self.d_optimizer.set_lr(d_lr)
        self.c_optimizer.set_lr(c_lr)

    def save_lr(self, g_lr, d_lr, c_lr):
        """Cache the current learning rates for emergency checkpoints."""
        self.temp_g_lr = g_lr
        self.temp_d_lr = d_lr
        self.temp_c_lr = c_lr

    def train(self):
        """Run the main training loop from the last checkpoint (if any)."""
        start_iters = 0
        # Learning rate cache for decaying.
        g_lr = self.g_lr
        d_lr = self.d_lr
        c_lr = self.c_lr
        # BUGFIX: `step` was undefined when neither auto_load nor an
        # explicit load step was configured, raising NameError below.
        step = 0
        if self.auto_load:
            step = load_step(self.model_save_dir)
        if self.load > 0:
            step = self.load
        if not step == 0:
            print('Load step:' + str(step))
            G_path = os.path.join(self.model_save_dir, '{}-G.pad'.format(step))
            D_path = os.path.join(self.model_save_dir, '{}-D.pad'.format(step))
            C_path = os.path.join(self.model_save_dir, '{}-C.pad'.format(step))
            modG = model_loader.load(G_path)
            g_lr = modG['g_lr']
            d_lr = modG['d_lr']
            c_lr = modG['c_lr']
            try:
                self.g_optimizer.set_state_dict(modG['go'])
                self.d_optimizer.set_state_dict(modG['do'])
                self.c_optimizer.set_state_dict(modG['co'])
            except (KeyError, ValueError, RuntimeError):
                # Older checkpoints may not carry optimizer state.
                print('no optimizer data')
            # BUGFIX: also apply the restored learning rates to the
            # optimizers; previously they were only cached.
            self.update_lr(g_lr, d_lr, c_lr)
            self.save_lr(g_lr, d_lr, c_lr)
            self.G.set_state_dict(modG['state_dict'])
            self.D.set_state_dict(model_loader.load(D_path)['state_dict'])
            self.C.set_state_dict(model_loader.load(C_path)['state_dict'])
            start_iters = step
        if self.resume_iters:
            pass
        norm = Normalizer()
        data_iter = iter(self.data_loader)
        # Hoisted out of the loop: the loss module is stateless.
        CELoss = nn.CrossEntropyLoss()
        print('Start training......')
        start_time = datetime.now()
        for i in range(start_iters, self.num_iters):
            self.temp_i = i
            # ----------------------------------------------------------- #
            # 1. Preprocess input data: fetch real frames and labels,     #
            #    restarting the iterator when the epoch is exhausted.     #
            # ----------------------------------------------------------- #
            try:
                x_real, speaker_idx_org, label_org = next(data_iter)
            except StopIteration:
                data_iter = iter(self.data_loader)
                x_real, speaker_idx_org, label_org = next(data_iter)
            speaker_idx_org = paddle.reshape(speaker_idx_org, [-1])
            # Generate target domain labels by permuting the batch.
            rand_idx = paddle.randperm(label_org.shape[0])
            label_trg = label_org[rand_idx]
            speaker_idx_trg = speaker_idx_org[rand_idx]
            # ----------------------------------------------------------- #
            # 2. Train the domain classifier on real frames.              #
            # ----------------------------------------------------------- #
            cls_real = self.C(x_real)
            cls_loss_real = CELoss(input=cls_real, label=speaker_idx_org)
            self.reset_grad()
            cls_loss_real.backward()
            self.c_optimizer.step()
            # Logging.
            loss = {}
            loss['C/C_loss'] = cls_loss_real.item()
            self.writer.add_scalar('C_Loss', loss['C/C_loss'], i)
            # BUGFIX: d_loss was undefined when target == 'g', crashing
            # the generator gate below; default it explicitly.
            d_loss = None
            if not self.target == 'g':
                # ------------------------------------------------------- #
                # Train the discriminator.                                #
                # ------------------------------------------------------- #
                out_r = self.D(x_real, label_org)
                # Loss on fake frames (fake detached from G's graph).
                x_fake = self.G(x_real, label_trg)
                out_f = self.D(x_fake.detach(), label_trg)
                d_loss_t = F.binary_cross_entropy_with_logits(logit=out_f, label=paddle.zeros_like(out_f, dtype='float32')) + \
                    F.binary_cross_entropy_with_logits(logit=out_r, label=paddle.ones_like(out_r, dtype='float32'))
                out_cls = self.C(x_fake)
                d_loss_cls = CELoss(input=out_cls, label=speaker_idx_trg)
                # Gradient penalty on a random interpolate of real/fake.
                alpha = paddle.rand([x_real.shape[0], 1, 1, 1])
                x_hat = paddle.to_tensor((alpha * x_real + (1 - alpha) * x_fake), stop_gradient=False)
                out_src = self.D(x_hat, label_trg)
                d_loss_gp = self.gradient_penalty(out_src, x_hat)
                d_loss = d_loss_t + self.lambda_cls * d_loss_cls + 5 * d_loss_gp
                # Only update D while its loss exceeds the threshold.
                if self.threshold < d_loss:
                    self.reset_grad()
                    d_loss.backward()
                    self.d_optimizer.step()
                loss['D/D_loss'] = d_loss.item()
                self.writer.add_scalar('D_Loss', loss['D/D_loss'], i)
            # ----------------------------------------------------------- #
            # 3. Train the generator every n_critic iterations.           #
            # ----------------------------------------------------------- #
            if (i + 1) % self.n_critic == 0 and not self.target == 'd':
                # Original-to-target domain.
                x_fake = self.G(x_real, label_trg)
                g_out_src = self.D(x_fake, label_trg)
                g_loss_fake = F.binary_cross_entropy_with_logits(logit=g_out_src, label=paddle.ones_like(g_out_src, dtype='float32'))
                # NOTE(review): the classification term is computed on
                # x_real with the source labels; vanilla StarGAN uses
                # C(x_fake) with the target labels -- confirm intended.
                out_cls = self.C(x_real)
                g_loss_cls = CELoss(input=out_cls, label=speaker_idx_org)
                # Target-to-original domain (cycle consistency).
                x_reconst = self.G(x_fake, label_org)
                g_loss_rec = F.l1_loss(x_reconst, x_real)
                # Original-to-original domain (identity mapping).
                x_fake_iden = self.G(x_real, label_org)
                id_loss = F.l1_loss(x_fake_iden, x_real)
                # Backward and optimize.
                g_loss = g_loss_fake + self.lambda_cycle * g_loss_rec + \
                    self.lambda_cls * g_loss_cls + self.lambda_identity * id_loss
                # Skip the gate entirely when D was not trained this step.
                if d_loss is None or self.threshold < d_loss:
                    self.reset_grad()
                    g_loss.backward()
                    self.g_optimizer.step()
                # Logging.
                loss['G/loss_fake'] = g_loss_fake.item()
                loss['G/loss_rec'] = g_loss_rec.item()
                loss['G/loss_cls'] = g_loss_cls.item()
                loss['G/loss_id'] = id_loss.item()
                loss['G/g_loss'] = g_loss.item()
                self.writer.add_scalar('G_Loss', loss['G/g_loss'], i)
            # ----------------------------------------------------------- #
            # 4. Miscellaneous: console log, checkpointing, lr decay.     #
            # ----------------------------------------------------------- #
            if (i + 1) % self.log_step == 0:
                et = datetime.now() - start_time
                et = str(et)[:-7]  # drop microseconds from the elapsed time
                log = "Elapsed [{}], Iteration [{}/{}]".format(et, i + 1, self.num_iters)
                for tag, value in loss.items():
                    log += ", {}: {:.4f}".format(tag, value)
                print(log)
            # Save model checkpoints.
            if (i + 1) % self.model_save_step == 0:
                G_path = os.path.join(self.model_save_dir, '{}-G.pad'.format(i + 1))
                D_path = os.path.join(self.model_save_dir, '{}-D.pad'.format(i + 1))
                C_path = os.path.join(self.model_save_dir, '{}-C.pad'.format(i + 1))
                model_loader.save(self.G.state_dict(), g_lr, d_lr, c_lr, speakers, G_path, self.g_optimizer.state_dict(), self.d_optimizer.state_dict(), self.c_optimizer.state_dict())
                model_loader.save(self.D.state_dict(), g_lr, d_lr, c_lr, speakers, D_path, self.g_optimizer.state_dict(), self.d_optimizer.state_dict(), self.c_optimizer.state_dict())
                model_loader.save(self.C.state_dict(), g_lr, d_lr, c_lr, speakers, C_path, self.g_optimizer.state_dict(), self.d_optimizer.state_dict(), self.c_optimizer.state_dict())
                print('Saved model checkpoints into {}...'.format(self.model_save_dir))
            # Decay learning rates linearly over the decay window.
            if (i + 1) % self.lr_update_step == 0 and (i + 1) > (self.num_iters - self.num_iters_decay):
                g_lr -= (self.g_lr / float(self.num_iters_decay))
                d_lr -= (self.d_lr / float(self.num_iters_decay))
                c_lr -= (self.c_lr / float(self.num_iters_decay))
                self.update_lr(g_lr, d_lr, c_lr)
                self.save_lr(g_lr, d_lr, c_lr)
                print('Decayed learning rates, g_lr: {}, d_lr: {}.'.format(g_lr, d_lr))

    def gradient_penalty(self, y, x):
        """Compute gradient penalty: (L2_norm(dy/dx) - 1)**2.

        BUGFIX: create_graph must be True so the penalty term itself is
        differentiable w.r.t. the discriminator parameters; with
        create_graph=False the GP contributed nothing to the D update
        (matching the PyTorch original, which uses create_graph=True).
        """
        y = paddle.reshape(y, [-1])
        weight = paddle.ones(y.shape)
        dydx = paddle.grad(outputs=y, inputs=x, grad_outputs=weight, retain_graph=True, create_graph=True, only_inputs=True)[0]
        dydx = paddle.reshape(dydx, [dydx.shape[0], -1])
        dydx_l2norm = paddle.sqrt(paddle.sum(dydx ** 2, axis=1))
        return paddle.mean((dydx_l2norm - 1) ** 2)

    def reset_grad(self):
        """Reset the gradient buffers of all three optimizers."""
        self.g_optimizer.clear_grad()
        self.d_optimizer.clear_grad()
        self.c_optimizer.clear_grad()

    def restore_model(self, resume_iters):
        """Restore G/D/C weights from the checkpoints at `resume_iters`."""
        print('Loading the trained models from step {}...'.format(resume_iters))
        G_path = os.path.join(self.model_save_dir, '{}-G.pad'.format(resume_iters))
        D_path = os.path.join(self.model_save_dir, '{}-D.pad'.format(resume_iters))
        C_path = os.path.join(self.model_save_dir, '{}-C.pad'.format(resume_iters))
        self.G.set_state_dict(model_loader.load(G_path)['state_dict'])
        self.D.set_state_dict(model_loader.load(D_path)['state_dict'])
        self.C.set_state_dict(model_loader.load(C_path)['state_dict'])

    @staticmethod
    def pad_coded_sp(coded_sp_norm):
        """Right-pad a (dim, T) feature matrix with zeros to a multiple of FRAMES.

        NOTE(review): when T is already an exact multiple of FRAMES this
        appends one extra full block of padding -- confirm that is intended.
        """
        f_len = coded_sp_norm.shape[1]
        if f_len >= FRAMES:
            pad_length = FRAMES - (f_len - (f_len // FRAMES) * FRAMES)
        elif f_len < FRAMES:
            pad_length = FRAMES - f_len
        sp_norm_pad = np.hstack((coded_sp_norm, np.zeros((coded_sp_norm.shape[0], pad_length))))
        return sp_norm_pad

    def test(self):
        """Translate speech using StarGAN ."""
        # Load the trained generator.
        self.restore_model(self.test_iters)
        norm = Normalizer()
        # Set data loader.
        d, speaker = TestSet(self.test_dir).test_data(self.src_speaker)
        targets = self.trg_speaker
        for target in targets:
            print(target)
            assert target in speakers
            label_t = self.spk_enc.transform([target])[0]
            label_t = np.asarray([label_t])
            with paddle.no_grad():
                for filename, content in d.items():
                    f0 = content['f0']
                    ap = content['ap']
                    sp_norm_pad = self.pad_coded_sp(content['coded_sp_norm'])
                    convert_result = []
                    # Convert the utterance one FRAMES-wide segment at a time.
                    for start_idx in range(0, sp_norm_pad.shape[1] - FRAMES + 1, FRAMES):
                        one_seg = sp_norm_pad[:, start_idx: start_idx + FRAMES]
                        one_seg = paddle.to_tensor(one_seg, dtype='float32')
                        one_seg = paddle.reshape(one_seg, [1, 1, one_seg.shape[0], one_seg.shape[1]])
                        l = paddle.to_tensor(label_t, dtype='float32')
                        one_set_return = self.G(one_seg, l).cpu().numpy()
                        one_set_return = np.squeeze(one_set_return)
                        one_set_return = norm.backward_process(one_set_return, target)
                        convert_result.append(one_set_return)
                    convert_con = np.concatenate(convert_result, axis=1)
                    # Trim the padding back to the original length.
                    convert_con = convert_con[:, 0:content['coded_sp_norm'].shape[1]]
                    contigu = np.ascontiguousarray(convert_con.T, dtype=np.float64)
                    decoded_sp = decode_spectral_envelope(contigu, SAMPLE_RATE, fft_size=FFTSIZE)
                    f0_converted = norm.pitch_conversion(f0, speaker, target)
                    wav = synthesize(f0_converted, decoded_sp, ap, SAMPLE_RATE)
                    name = f'{speaker}-{target}_iter{self.test_iters}_(unknown)'
                    path = os.path.join(self.result_dir, name)
                    print(f'[save]:{path}')
                    # NOTE(review): librosa.output.write_wav was removed in
                    # librosa >= 0.8; use soundfile.write when upgrading.
                    librosa.output.write_wav(path, wav, SAMPLE_RATE)

    def safe_train(self, throw=False):
        """Run train() and write an emergency checkpoint on any exception.

        BaseException is caught deliberately so that KeyboardInterrupt
        (and similar) still triggers the save before termination.
        """
        try:
            self.train()
            self.writer.close()
        except BaseException as e:
            print('Warning! An exception has been thrown')
            print('Warning! Saving model')
            self.writer.close()
            G_path = os.path.join(self.model_save_dir, '{}-G.pad'.format(self.temp_i + 1))
            D_path = os.path.join(self.model_save_dir, '{}-D.pad'.format(self.temp_i + 1))
            C_path = os.path.join(self.model_save_dir, '{}-C.pad'.format(self.temp_i + 1))
            model_loader.save(self.G.state_dict(), self.temp_g_lr, self.temp_d_lr, self.temp_c_lr, speakers, G_path, self.g_optimizer.state_dict(), self.d_optimizer.state_dict(), self.c_optimizer.state_dict())
            model_loader.save(self.D.state_dict(), self.temp_g_lr, self.temp_d_lr, self.temp_c_lr, speakers, D_path, self.g_optimizer.state_dict(), self.d_optimizer.state_dict(), self.c_optimizer.state_dict())
            model_loader.save(self.C.state_dict(), self.temp_g_lr, self.temp_d_lr, self.temp_c_lr, speakers, C_path, self.g_optimizer.state_dict(), self.d_optimizer.state_dict(), self.c_optimizer.state_dict())
            print('Saved model checkpoints into {}...'.format(self.model_save_dir))
            print('Training termination...')
            if throw:
                raise e
if __name__ == '__main__':
    # No standalone entry point; this module is driven by an external script.
    pass
data_loader.py
import sys
sys.path.append('/home/aistudio/external-libraries')
sys.path.append('/home/aistudio/vc')
import os
import librosa
import numpy as np
import paddle
from sklearn.preprocessing import LabelBinarizer
from paddle.io import DataLoader,Dataset
from preprocess import (FEATURE_DIM, FFTSIZE, FRAMES, SAMPLE_RATE,
world_features)
from utility import Normalizer, speakers
import random
class AudioDataset(Dataset):
    """Dataset of pre-extracted MCEP features stored as .npy files.

    A file is accepted when its name (before the first '-' or '_') is a
    known speaker; __getitem__ returns the feature tensor, the speaker
    index (int64) and the one-hot speaker label (float32).

    NOTE(review): the filter accepts both '-' and '_' as separators but
    __getitem__ parses the speaker using '_' only -- confirm all files
    use the '_' convention.
    """

    def __init__(self, datadir: str):
        super(AudioDataset, self).__init__()
        self.datadir = datadir
        self.files = []
        for root, dirs, files in os.walk(datadir, followlinks=True):
            for f in files:
                stem, ext = os.path.splitext(f)
                spk = stem.split('-', 1)[0].split('_', 1)[0]
                if spk in speakers and ext.lower() == '.npy':
                    self.files.append(os.path.join(root, f))
        self.encoder = LabelBinarizer().fit(speakers)

    def __getitem__(self, idx):
        path = self.files[idx]
        filename = os.path.basename(path)
        speaker = filename.split(sep='_', maxsplit=1)[0]
        label = self.encoder.transform([speaker])[0]
        mcep = paddle.unsqueeze(paddle.to_tensor(np.load(path), dtype='float32'), 0)
        return (
            mcep,
            paddle.to_tensor(speakers.index(speaker), dtype='int64'),
            paddle.to_tensor(label, dtype='float32'),
        )

    def speaker_encoder(self):
        return self.encoder

    def __len__(self):
        return len(self.files)
def data_loader(datadir: str, batch_size=4, shuffle=True, mode='train', num_workers=2, spes=speakers):
    """Build a DataLoader over the .npy feature files under `datadir`.

    In 'train' mode the directory holds npy feature files; in 'test' mode
    it should contain only wav files. `spes` replaces the module-wide
    speaker list so the dataset and encoders stay consistent.
    """
    global speakers
    speakers = spes
    return DataLoader(AudioDataset(datadir), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
class TestSet(object):
    """Loads one speaker's wav files and extracts WORLD features for conversion."""

    def __init__(self, datadir: str):
        super(TestSet, self).__init__()
        self.datadir = datadir
        self.norm = Normalizer()

    def choose(self):
        """Pick a random speaker for testing."""
        return random.choice(speakers)

    def test_data(self, src_speaker=None):
        """Return {filename: features} for `src_speaker` (random if None)."""
        speaker = src_speaker if src_speaker else self.choose()
        wavfiles = librosa.util.find_files(os.path.join(self.datadir, speaker), ext='wav')
        res = {}
        for f in wavfiles:
            filename = os.path.basename(f)
            wav, _ = librosa.load(f, sr=SAMPLE_RATE, dtype=np.float64)
            f0, timeaxis, sp, ap, coded_sp = world_features(wav, SAMPLE_RATE, FFTSIZE, FEATURE_DIM)
            coded_sp_norm = self.norm.forward_process(coded_sp.T, speaker)
            res[filename] = {
                'coded_sp_norm': np.asarray(coded_sp_norm),
                'f0': np.asarray(f0),
                'ap': np.asarray(ap),
            }
        return res, speaker
2条答案
按热度按时间7uhlpewt1#
您好,我们已经收到了您的问题,会安排技术人员尽快解答您的问题,请耐心等待。请您再次检查是否提供了清晰的问题描述、复现代码、环境&版本、报错信息等。同时,您也可以通过查看官网API文档、常见问题、历史Issue、AI社区来寻求解答。祝您生活愉快~
Hi! We've received your issue and please be patient to get responded. We will arrange technicians to answer your questions as soon as possible. Please make sure that you have posted enough message to demo your request. You may also check out the API,FAQ,Github Issue and AI community to get the answer.Have a nice day!
g6ll5ycj2#
这边提供一些建议,希望能对您有帮助:
1.建议检查一下pytorch和paddle的op对应和使用方法问题,有些op的参数和使用方法是不相同的
2.尝试将数据更换成全1的人造数据,测试是否可以正常收敛拟合
3.如果有一些随机量,您可以通过paddle.seed()以及FLAGS_cudnn_deterministic,使用确定的初始化方式来固定,这样方便您查找
4.您可以逐层打印op的输出,查看真实输出是否符合预期输出,包括反向的grad也可以通过tensor.grad查看
5.可以检查一下训练和infer的时候是否处于相应分支的代码逻辑中,使用model.train()来切换为训练逻辑