Please ask your question
When running a Paddle job across multiple machines, it fails with the errors below. The same job runs fine when debugged on a single machine.
workerlog.0
--------------------------------------
C++ Traceback (most recent call last):
--------------------------------------
0 paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, std::vector<std::string, std::allocator<std::string > > const&, bool, bool)
1 paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool, bool)
2 paddle::framework::Executor::RunPartialPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, long, long, bool, bool, bool)
3 paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, paddle::platform::Place const&)
4 paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&) const
5 paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&, paddle::framework::RuntimeContext*) const
6 std::_Function_handler<void (paddle::framework::ExecutionContext const&), paddle::framework::OpKernelRegistrarFunctor<paddle::platform::CUDAPlace, false, 0ul, paddle::operators::LoadOpKernel<paddle::platform::CUDADeviceContext, float>, paddle::operators::LoadOpKernel<paddle::platform::CUDADeviceContext, double>, paddle::operators::LoadOpKernel<paddle::platform::CUDADeviceContext, int>, paddle::operators::LoadOpKernel<paddle::platform::CUDADeviceContext, signed char>, paddle::operators::LoadOpKernel<paddle::platform::CUDADeviceContext, long> >::operator()(char const*, char const*, int) const::{lambda(paddle::framework::ExecutionContext const&)#1}>::_M_invoke(std::_Any_data const&, paddle::framework::ExecutionContext const&)
7 paddle::operators::LoadOpKernel<paddle::platform::CUDADeviceContext, float>::Compute(paddle::framework::ExecutionContext const&) const
8 paddle::operators::LoadOpKernel<paddle::platform::CUDADeviceContext, float>::LoadLodTensor(std::istream&, paddle::platform::Place const&, paddle::framework::Variable*, paddle::framework::ExecutionContext const&) const
9 paddle::framework::DeserializeFromStream(std::istream&, paddle::framework::LoDTensor*, paddle::platform::DeviceContext const&)
10 paddle::framework::TensorFromStream(std::istream&, paddle::framework::Tensor*, paddle::platform::DeviceContext const&)
11 paddle::framework::TensorCopy(paddle::framework::Tensor const&, paddle::platform::Place const&, paddle::platform::DeviceContext const&, paddle::framework::Tensor*)
12 void paddle::memory::Copy<paddle::platform::CUDAPlace, paddle::platform::CPUPlace>(paddle::platform::CUDAPlace, void*, paddle::platform::CPUPlace, void const*, unsigned long, CUstream_st*)
13 paddle::platform::GpuMemcpyAsync(void*, void const*, unsigned long, cudaMemcpyKind, CUstream_st*)
----------------------
Error Message Summary:
----------------------
FatalError: `Termination signal` is detected by the operating system.
[TimeInfo: *** Aborted at 1657889243 (unix time) try "date -d @1657889243" if you are using GNU date ***]
[SignalInfo: *** SIGTERM (@0x1f95) received by PID 8166 (TID 0x7f3cd74cacc0) from PID 8085 ***]
workerlog.7
--------------------------------------
C++ Traceback (most recent call last):
--------------------------------------
0 paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, std::vector<std::string, std::allocator<std::string > > const&, bool, bool)
1 paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool, bool)
2 paddle::framework::Executor::RunPartialPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, long, long, bool, bool, bool)
3 paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, paddle::platform::Place const&)
4 paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&) const
5 paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&, paddle::framework::RuntimeContext*) const
6 std::_Function_handler<void (paddle::framework::ExecutionContext const&), paddle::framework::OpKernelRegistrarFunctor<paddle::platform::CUDAPlace, false, 0ul, paddle::operators::LoadOpKernel<paddle::platform::CUDADeviceContext, float>, paddle::operators::LoadOpKernel<paddle::platform::CUDADeviceContext, double>, paddle::operators::LoadOpKernel<paddle::platform::CUDADeviceContext, int>, paddle::operators::LoadOpKernel<paddle::platform::CUDADeviceContext, signed char>, paddle::operators::LoadOpKernel<paddle::platform::CUDADeviceContext, long> >::operator()(char const*, char const*, int) const::{lambda(paddle::framework::ExecutionContext const&)#1}>::_M_invoke(std::_Any_data const&, paddle::framework::ExecutionContext const&)
7 paddle::operators::LoadOpKernel<paddle::platform::CUDADeviceContext, float>::Compute(paddle::framework::ExecutionContext const&) const
8 paddle::operators::LoadOpKernel<paddle::platform::CUDADeviceContext, float>::LoadLodTensor(std::istream&, paddle::platform::Place const&, paddle::framework::Variable*, paddle::framework::ExecutionContext const&) const
9 paddle::framework::DeserializeFromStream(std::istream&, paddle::framework::LoDTensor*, paddle::platform::DeviceContext const&)
10 paddle::framework::TensorFromStream(std::istream&, paddle::framework::Tensor*, paddle::platform::DeviceContext const&)
11 void paddle::framework::VisitDataType<paddle::framework::DeserializedDataFunctor>(paddle::framework::proto::VarType_Type, paddle::framework::DeserializedDataFunctor)
12 paddle::framework::Tensor::mutable_data(paddle::platform::Place const&, paddle::framework::proto::VarType_Type, unsigned long)
13 paddle::memory::AllocShared(paddle::platform::Place const&, unsigned long)
14 paddle::memory::allocation::AllocatorFacade::AllocShared(paddle::platform::Place const&, unsigned long)
15 paddle::memory::allocation::AllocatorFacade::Alloc(paddle::platform::Place const&, unsigned long)
16 paddle::memory::allocation::NaiveBestFitAllocator::AllocateImpl(unsigned long)
17 void* paddle::memory::legacy::Alloc<paddle::platform::CPUPlace>(paddle::platform::CPUPlace const&, unsigned long)
18 paddle::memory::detail::BuddyAllocator::Alloc(unsigned long)
19 paddle::memory::detail::BuddyAllocator::SystemAlloc(unsigned long)
20 paddle::memory::detail::CPUAllocator::Alloc(unsigned long*, unsigned long)
----------------------
Error Message Summary:
----------------------
FatalError: `Termination signal` is detected by the operating system.
[TimeInfo: *** Aborted at 1657889243 (unix time) try "date -d @1657889243" if you are using GNU date ***]
[SignalInfo: *** SIGTERM (@0x1f95) received by PID 8215 (TID 0x7fb35a09fcc0) from PID 8085 ***]
workerlog.1
I0715 20:47:18.208267 8169 nccl_by_eccl.cc:62] Eccl init global comm successfully!
I0715 20:47:18.301676 8169 converter_helper.cc:27] get a env var from [ECCL_SPLIT_INDEX], its value is 0 !
I0715 20:47:18.301740 8169 nccl_by_eccl.cc:48] Parameter for ncclCommInitRank is ready: endpoint = 10.67.193.140:3685; group_id = ugfrukzJ1Yi9kpw47v12_0; rank = 0; nranks = 3; my_device_id = 1; split_index = 0!
I0715 20:47:18.301754 8169 eccl.cc:284] Begin gen unique id in group [ugfrukzJ1Yi9kpw47v12_0]!
I0715 20:47:18.301775 8169 eccl.cc:88] Begin started a new ECCL RPC server!
I0715 20:47:18.302381 8169 eccl.cc:107] Add rpc server successfully!
E0715 20:47:18.343429 8169 /home/users/kanghui/nccl_hook/cuda11_test/baidu/hac-aiacc/eccl/build/third_party/brpc/src/extern_brpc/src/brpc/server.cpp:955] Fail to listen 0.0.0.0:3685
F0715 20:47:18.343511 8169 eccl.cc:109] Check failed: eccl_rpc_server->Start(std::stoi(parts[1]), &options) == 0 Start rpc server failed!
*** Check failure stack trace: ***
--------------------------------------
C++ Traceback (most recent call last):
--------------------------------------
0 paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, std::vector<std::string, std::allocator<std::string > > const&, bool, bool)
1 paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool, bool)
2 paddle::framework::Executor::RunPartialPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, long, long, bool, bool, bool)
3 paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, paddle::platform::Place const&)
4 paddle::operators::CCommInitOp::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&) const
5 paddle::platform::NCCLCommContext::CreateComm(ncclUniqueId*, int, int, int, int)
6 ncclCommInitRank
7 google::LogMessageFatal::~LogMessageFatal()
8 google::LogMessage::Flush()
9 google::LogMessage::SendToLog()
10 google::LogMessage::SendToSink()
11 google::InstallFailureFunction(void (*)())
----------------------
Error Message Summary:
----------------------
FatalError: `Process abort signal` is detected by the operating system.
[TimeInfo: *** Aborted at 1657889238 (unix time) try "date -d @1657889238" if you are using GNU date ***]
[SignalInfo: *** SIGABRT (@0x1fe9) received by PID 8169 (TID 0x7f956b3bdcc0) from PID 8169 ***]
6 replies
cgvd09ve 1#
Hi! We've received your issue; please be patient while we get back to you. We will arrange for technicians to answer your question as soon as possible. Please double-check that you have provided a clear problem description, reproduction code, environment & version, and the error messages. You may also consult the official API documentation, FAQ, historical GitHub Issues, and the AI community for an answer. Have a nice day!
uajslkp6 2#
The error messages are completely different on each card, so I can't tell where the problem is. Could you please help me resolve this?
ebdffaop 3#
Please post the error logs from all of the cards.
qlckcl4x 4#
Please provide a minimal reproducible code sample, along with information such as your runtime environment.
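For reference, a minimal sketch of how that environment information could be collected (this snippet is illustrative and not from the original thread; it only uses standard Paddle 2.x utilities):

```python
# Sketch: gather basic environment details to attach to the issue report.
import paddle

print("Paddle version:", paddle.__version__)
print("Compiled with CUDA:", paddle.device.is_compiled_with_cuda())
print("Visible GPUs:", paddle.device.cuda.device_count())

# Runs Paddle's built-in installation sanity check (single- and multi-GPU).
paddle.utils.run_check()
```

Running this on every node, together with the driver/CUDA/NCCL versions, usually helps narrow down environment mismatches.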
xcitsw88 5#
Please post the error logs from all of the cards.
The error messages on the other cards are the same as workerlog.1.
db2dz4w8 6#
It looks like inter-card communication failed to be established. Please check the machine environment.
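For context, the fatal line in workerlog.1 is the ECCL RPC server failing to bind its port ("Fail to listen 0.0.0.0:3685"), which typically means that port is already occupied on that node. Below is a minimal sketch of a pre-launch check that could be run on each machine; the port number comes from the log, everything else is an assumption rather than part of the original thread:

```python
# Sketch: check whether the ECCL RPC port reported in workerlog.1 is free.
# Run this on every node before launching the multi-node job.
import socket

PORT = 3685  # taken from "Fail to listen 0.0.0.0:3685" in workerlog.1

def port_is_free(port: int) -> bool:
    """Try to bind the port on all interfaces, mirroring the failing listen."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        try:
            s.bind(("0.0.0.0", port))
            return True
        except OSError:
            return False

if __name__ == "__main__":
    if port_is_free(PORT):
        print(f"Port {PORT} is free on this node.")
    else:
        print(f"Port {PORT} is already in use; inspect it with "
              f"`lsof -i :{PORT}` or `netstat -tlnp | grep {PORT}`.")
```

If the bind fails on any node, stop the stale process holding the port (or configure the launcher to use a different endpoint) before retrying the multi-node run.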