我试图根据同一个计算的查找表对一个计算进行基准测试,但我怀疑在发布时,编译器报告的数据不准确。
我将基准测试作为调试,然后作为发布
- 单个随机点的查找表
- for循环中的整个查找表
- 直接计算
- 直接计算整个表。
'''
Lookup table took 0.1099 seconds to execute
Lookup entire table took 663.521 seconds to execute
Calculation took 0.2698 seconds to execute
Calculation of entire table took 1804.6 seconds to execute
字符串
'''
然后我再次对所有版本进行基准测试,并使用-O2优化
Lookup table took 0.0005 seconds to execute
Lookup entire table took 0.0003 seconds to execute
Calculation took 0.098 seconds to execute
Calculation of entire table took 422.066 seconds to execute
型
在调试时,查找表的速度快了4/5倍,但在发布时,快了数百倍,所以我怀疑我对直接计算进行了错误的基准测试。这是直接计算:
igc__blade1 = blade1 >> 1u;
igc__blade1 ^= igc__blade1 >> 1u;
igc__blade1 ^= igc__blade1 >> 2u;
igc__blade1 ^= igc__blade1 >> 4u;
blade2__masked = igc__blade1 & blade2;
//__builtin_popcount replaces commented text
parity__2 = __builtin_popcount(blade2__masked & 0b11111110) & 1u;
/*parity__2 = blade2__masked;
parity__2 ^= parity__2 >> 1u;
parity__2 ^= parity__2 >> 2u;
parity__2 ^= parity__2 >> 4u;
parity__2 = parity__2 & 2u;*/
sign_of_product = 1 - 2 * parity__2;
//sign_of_product = 1 - parity__2;
型
这是查找表调用
sign_of_product = lookup_table_sign[row][col];
型
**据我所知,从L1缓存中获取查找表需要10个时钟周期,我的整个代码使用15/20指令(XOR
,AND
,算术),(我用popcount
指令替换了一半)。
因此,直接计算,特别是在使用-O2优化编译之后,最多应该花费两倍的查表时间,而不是数百次。
我做错事了,我不知道是什么。
我怀疑编译器理解for循环只以最终计算结束,并切断整个嵌套循环,只获取最后一个值,但我无法知道发生了什么**
这是完整的代码
#include <algorithm>
#include <bitset>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <stdint.h>
#ifdef _MSC_VER
#include <intrin.h>
#define __builtin_popcount __popcnt
#endif
using namespace std;
long long REPETITIONS = 10000;
int64_t inverseGrayCode(int64_t n)
{
n ^= n >> 1;
n ^= n >> 2;
n ^= n >> 4;
n ^= n >> 8; // unnecessary for 8 bit unsigned integers
n ^= n >> 16; // unnecessary for 16 bit unsigned integers
n ^= n >> 32; // unnecessary for 32 bit unsigned integers
return n;
}
const uint8_t BITS = 6;
const uint8_t SIZE = 0b1 << (BITS);
uint8_t blades[SIZE];
uint8_t igc_blades[SIZE];
uint8_t abs_blades_product[SIZE][SIZE];
uint8_t blades_masked[SIZE][SIZE];
uint8_t parity_2[SIZE][SIZE];
int8_t sign[SIZE][SIZE];
int8_t (*lookup_table_sign)[SIZE];
#pragma region lexicographical
uint64_t _64INDIVIDUAL_BIT[SIZE];
double WEIGHT[SIZE];
void initialize_constants()
{
uint64_t one = 1;
for (uint64_t x = 0; x < 64; x++)
{
_64INDIVIDUAL_BIT[x] = one << x;
WEIGHT[x] = pow(2.0, -(1 + (double)x));
}
}
// Function to calculate the lexicographical weight of an integer
double lexicographical_weight(uint64_t x)
{
double weight = 0.0;
for (int i = 0; i < 64; i++)
{
weight += ((x & _64INDIVIDUAL_BIT[i]) > 0) * WEIGHT[i];
}
return __builtin_popcount(x) + 1 - weight;
}
bool compareByWeight(int element1, int element2)
{
return lexicographical_weight(element1) < lexicographical_weight(element2);
}
#pragma endregion lexicographical
int8_t (*calc_blade_products(uint8_t blades[]))[SIZE]
{
for (int i = 0; i < SIZE; i++)
{
igc_blades[i] = inverseGrayCode(blades[i] >> 1);
}
for (int row = 0; row < SIZE; row++)
{
for (int col = 0; col < SIZE; col++)
{
abs_blades_product[row][col] = blades[row] ^ blades[col];
blades_masked[row][col] = igc_blades[row] & blades[col];
// aux[i][j] = inverseGrayCode(blades_masked[i][j]);
parity_2[row][col] = inverseGrayCode(blades_masked[row][col]) & 2u;
sign[row][col] = 1 - parity_2[row][col];
}
}
return sign;
}
uint8_t igc__blade1;
uint8_t blade2__masked;
uint8_t parity__2;
int8_t sign_of_product;
uint8_t blade1;
uint8_t blade2;
int main()
{
initialize_constants();
#pragma region generate_blades_in_lexicographical_order
for (unsigned int i = 0; i < SIZE; i++)
{
blades[i] = i << 1;
// fprintf(stdout, "blade %d: %8sb\n", i, bitset<8>(blades[i]).to_string().c_str());
}
blades[0] = 1;
int arraySize = sizeof(blades) / sizeof(blades[0]);
sort(blades, blades + arraySize, compareByWeight);
#pragma endregion generate_blades_in_lexicographical_order
// Create lookup_table
lookup_table_sign = calc_blade_products(blades);
#pragma region benchmark
uint8_t blade1 = rand() % 256;
uint8_t blade2 = rand() % 256;
// benchmark single point with table lookup
auto start = std::chrono::high_resolution_clock::now();
for (int rept = 0; rept < REPETITIONS; rept++)
{
sign_of_product = lookup_table_sign[blade1][blade2];
}
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> elapsed_miliseconds = end - start;
cout << endl;
cout << "Lookup table took " << elapsed_miliseconds.count() << " seconds to execute" << endl;
start = std::chrono::high_resolution_clock::now();
// benchmark entire table with table lookup
for (int rept = 0; rept < REPETITIONS; rept++)
{
for (int row = 0; row < SIZE; row++)
{
for (int col = 0; col < SIZE; col++)
{
sign_of_product = lookup_table_sign[row][col];
}
}
}
end = std::chrono::high_resolution_clock::now();
elapsed_miliseconds = end - start;
cout << endl;
cout << "Lookup entire table took " << elapsed_miliseconds.count() << " seconds to execute" << endl;
// benchmark single point with direct calculation
start = std::chrono::high_resolution_clock::now();
for (int rept = 0; rept < REPETITIONS; rept++)
{
igc__blade1 = blade1 >> 1u;
igc__blade1 ^= igc__blade1 >> 1u;
igc__blade1 ^= igc__blade1 >> 2u;
igc__blade1 ^= igc__blade1 >> 4u;
blade2__masked = igc__blade1 & blade2;
//__builtin_popcount replaces commented text
parity__2 = __builtin_popcount(blade2__masked & 0b11111110) & 1u;
/*parity__2 = blade2__masked;
parity__2 ^= parity__2 >> 1u;
parity__2 ^= parity__2 >> 2u;
parity__2 ^= parity__2 >> 4u;
parity__2 = parity__2 & 2u;*/
sign_of_product = 1 - 2 * parity__2;
// sign_of_product = 1 - parity__2;
}
end = std::chrono::high_resolution_clock::now();
elapsed_miliseconds = end - start;
cout << endl;
cout << "Calculation took " << elapsed_miliseconds.count() << " seconds to execute" << endl;
// benchmark entire table with direct calculation
start = std::chrono::high_resolution_clock::now();
for (int rept = 0; rept < REPETITIONS; rept++)
{
for (int blade1 = 0; blade1 < SIZE; blade1++)
{
for (int blade2 = 0; blade2 < SIZE; blade2++)
{
igc__blade1 = blade1 >> 1u;
igc__blade1 ^= igc__blade1 >> 1u;
igc__blade1 ^= igc__blade1 >> 2u;
igc__blade1 ^= igc__blade1 >> 4u;
blade2__masked = igc__blade1 & blade2;
//__builtin_popcount replaces commented text
parity__2 = __builtin_popcount(blade2__masked & 0b11111110) & 1u;
/*parity__2 = blade2__masked;
parity__2 ^= parity__2 >> 1u;
parity__2 ^= parity__2 >> 2u;
parity__2 ^= parity__2 >> 4u;
parity__2 = parity__2 & 2u;*/
sign_of_product = 1 - 2 * parity__2;
// sign_of_product = 1 - parity__2;
}
}
}
end = std::chrono::high_resolution_clock::now();
elapsed_miliseconds = end - start;
cout << endl;
cout << "Calculation of entire table took " << elapsed_miliseconds.count() << " seconds to execute" << endl;
#pragma endregion benchmark
return 0;
}
型
1条答案
按热度按时间isr3a4wc1#
在“Peter Cordes”的注解之后,我将获取计算结果的变量标记为
volatile
,以阻止编译器忽略它们并跳转到循环的最后一个值。它得到了更合理的结果,其中计算成本大约是表查找的两倍,虽然我没有办法知道引擎盖下发生了什么,以及两种选择之间的CPU时间的真实的差异是什么。
字符串
'''(使用ore重复发布构建输出)
型
'''