Ive developed a Sieve of Erastosthenes in C++20 with an unbeaten
performance. With this code I get all primes <= 2 ^ 32 in 140ms
on my AMD 7950X 16-core Zen4-system.
It calculates first only the primes up to the square root of the
maximum. The remaining bits are partitioned according to the num-
ber of CPU-cores. Within each thread the bitmap is partitioned in
parts that fit into the L2-cache (queried via CPUID).
All operations are aligned on a cacheline-boundary to avoid false
sharing and locking of shared words.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
#include
#elif (defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) ||
defined(__x86_64__))
#include
#endif
#if defined(_MSC_VER)
#pragma warning(disable: 26495) // uninitialized member
#endif
using namespace std;
#if defined(__cpp_lib_hardware_interference_size)
constexpr ptrdiff_t CL_SIZE = hardware_destructive_interference_size;
#else
constexpr ptrdiff_t CL_SIZE = 64;
#endif
size_t getL2Size();
bool smt();
int main( int argc, char **argv )
{
try
{
if( argc < 2 )
return EXIT_FAILURE;
auto parse = []( char const *str, auto &value, bool bytes )
{
bool hex = str[0] == '0' && tolower( str[1] ) == 'x';
str += 2 * hex;
auto [ptr, ec] = from_chars( str, str + strlen( str ), value, !hex ?
10 : 16 );
if( ec != errc() )
return false;
if( bytes && tolower( *ptr ) == 'x' && !ptr[1] )
{
value *= 8;
--value;
++ptr;
}
return !*ptr;
};
size_t end;
if( !parse( argv[1], end, true ) )
return EXIT_FAILURE;
if( end < 2 )
return EXIT_FAILURE;
if( (ptrdiff_t)end++ < 0 )
throw bad_alloc();
unsigned
hc = jthread::hardware_concurrency(),
nThreads;
if( argc < 4 || !parse( argv[3], nThreads, false ) )
nThreads = hc;
if( !nThreads )
return EXIT_FAILURE;
using word_t = size_t;
constexpr size_t
BITS_PER_CL = CL_SIZE * 8,
BITS = sizeof(word_t) * 8;
size_t nBits = end / 2;
union alignas(CL_SIZE) ndi_words_cl { word_t words[CL_SIZE /
sizeof(word_t)]; ndi_words_cl() {} };
vector sieveCachelines( (nBits + BITS_PER_CL - 1) /
BITS_PER_CL );
span sieve( &sieveCachelines[0].words[0], (nBits + BITS - 1) /
BITS );
assert(to_address( sieve.end() ) <= &to_address( sieveCachelines.end()
)->words[0]);
fill( sieve.begin(), sieve.end(), -1 );
auto punch = [&]( size_t start, size_t end, size_t prime, auto )
{
size_t
bit = start / 2,
bitEnd = end / 2;
if( bit >= bitEnd ) [[unlikely]]
return;
if( prime >= BITS ) [[likely]]
do [[likely]]
sieve[bit / BITS] &= rotl( (word_t)-2, bit % BITS );
while( (bit += prime) < bitEnd );
else
{
auto pSieve = sieve.begin() + bit / BITS;
do [[likely]]
{
word_t
word = *pSieve,
mask = rotl( (word_t)-2, bit % BITS ),
oldMask;
do
{
word &= mask;
oldMask = mask;
mask = rotl( mask, prime % BITS );
bit += prime;
} while( mask < oldMask );
*pSieve++ = word;
} while( bit < bitEnd );
}
};
size_t sqrt = (ptrdiff_t)ceil( ::sqrt( (ptrdiff_t)end ) );
[&]
{
for( size_t bSqrt = sqrt / 2, bit = 1; bit < bSqrt; ++bit ) [[likely]]
{
auto pSieve = sieve.begin () + bit / BITS;
for( ; ; )
{
if( word_t w = *pSieve >> bit % BITS; w ) [[likely]]
{
bit += countr_zero ( w );
break;
}
++pSieve;
bit = bit + BITS & -(ptrdiff_t)BITS;
if( bit >= bSqrt ) [[unlikely]]
return;
}
size_t prime = 2 * bit + 1;
punch( prime * prime, sqrt, prime, false_type () );
}
}();
auto scan = [&]( size_t start, size_t end, auto fn )
{
size_t
bit = start / 2,
bitEnd = end / 2;
if( bit >= bitEnd ) [[unlikely]]
return;
auto pSieve = sieve.begin() + bit / BITS;
word_t word = *pSieve++ >> bit % BITS;
for( ; ; )
{
size_t nextWordBit = bit + BITS & -(ptrdiff_t)BITS;
for( unsigned gap; word; word >>= gap, word >>= 1, ++bit )
{
gap = countr_zero( word );
bit += gap;
if( bit >= bitEnd ) [[unlikely]]
return;
fn( bit * 2 + 1, true_type() );
}
bit = nextWordBit;
if( bit >= bitEnd ) [[unlikely]]
return;
word = *pSieve++;
}
};
vector threads;
threads.reserve( nThreads );
static auto dbl = []( ptrdiff_t x ) { return (double)x; };
auto initiate = [&]( size_t start, auto fn )
{
double threadRange = dbl( end - start ) / nThreads;
ptrdiff_t t = nThreads;
size_t trEnd = end;
size_t prevStart, prevEnd;
bool prevValid = false;
do [[likely]]
{
size_t trStart = start + (ptrdiff_t)(--t * threadRange + 0.5);
trStart &= -(2 * CL_SIZE * 8);
trStart = trStart >= start ? trStart : start;
if( trStart < trEnd ) [[likely]]
{
if( prevValid ) [[likely]]
threads.emplace_back( fn, prevStart, prevEnd );
prevStart = trStart;
prevEnd = trEnd;
prevValid = true;
}
trEnd = trStart;
} while( t );
if( prevValid ) [[likely]]
fn( prevStart, prevEnd );
threads.resize( 0 );
};
double maxCacheRange = dbl( getL2Size() * 8 * 2 ) / 3 * (smt() &&
nThreads > hc / 2 ? 1 : 2);
initiate( sqrt,
[&]( size_t rStart, size_t rEnd )
{
double
thisThreadRange = dbl( rEnd - rStart ),
nCachePartitions = ceil( thisThreadRange / maxCacheRange ),
cacheRange = thisThreadRange / nCachePartitions;
for( size_t p = (ptrdiff_t)nCachePartitions, cacheEnd = rEnd,
cacheStart; p--; cacheEnd = cacheStart ) [[likely]]
{
cacheStart = rStart + (ptrdiff_t)((double)(ptrdiff_t)p * cacheRange
+ 0.5);
cacheStart &= -(2 * CL_SIZE * 8);
cacheStart = cacheStart >= sqrt ? cacheStart : sqrt;
scan( 3, sqrt,
[&]( size_t prime, auto )
{
size_t start = (cacheStart + prime - 1) / prime * prime;
start += start % 2 ? 0 : prime;
punch( start, cacheEnd, prime, true_type() );
} );
}
} );
if( bool count = false; argc >= 3 && (!*argv[2] || (count = strcmp(
argv[2], "*" ) == 0)) )
{
if( !count )
return EXIT_SUCCESS;
atomic aNPrimes( 1 );
initiate( 3,
[&]( size_t rStart, size_t rEnd )
{
size_t nPrimes = 0;
scan( rStart, rEnd, [&]( ... ) { ++nPrimes; } );
aNPrimes.fetch_add( nPrimes, memory_order::relaxed );
} );
cout << aNPrimes.load( memory_order::relaxed ) << " primes" << endl;
return EXIT_SUCCESS;
}
ofstream ofs;
ofs.exceptions( ofstream::failbit | ofstream::badbit );
ofs.open( argc >= 3 ? argv[2] : "primes.txt", ofstream::binary |
ofstream::trunc );
constexpr size_t
BUF_SIZE = 0x100000,
AHEAD = 32;
union ndi_char { char c; ndi_char() {} };
vector rawPrintBuf( BUF_SIZE + AHEAD - 1 );
span printBuf( &to_address( rawPrintBuf.begin() )->c, &to_address(
rawPrintBuf.end() )->c );
auto
bufBegin = printBuf.begin(),
bufEnd = bufBegin,
bufFlush = bufBegin + BUF_SIZE;
auto print = [&]() { ofs << string_view( bufBegin, bufEnd ); };
auto printPrime = [&]( size_t prime, auto )
{
auto [ptr, ec] = to_chars( &*bufEnd, &bufEnd[AHEAD - 1], prime );
if( (bool)ec ) [[unlikely]]
throw system_error( (int)ec, generic_category(), "converson failed" );
bufEnd = ptr - &*bufBegin + bufBegin; // pointer to iterator - NOP
*bufEnd++ = '\n';
if( bufEnd >= bufFlush ) [[unlikely]]
print(), bufEnd = bufBegin;
};
printPrime( 2, false_type() );
scan( 3, end, printPrime );
print();
}
catch( exception const &exc )
{
cout << exc.what() << endl;
}
}
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) ||
defined(__x86_64__)) || defined(_MSC_VER) && (defined(_M_IX86) ||
defined(_M_X64))
array cpuId( unsigned code )
{
int regs[4];
#if defined(_MSC_VER)
__cpuid( regs, code );
#elif defined(__GNUC__) || defined(__clang__)
__cpuid(code, regs[0], regs[1], regs[2], regs[3]);
#endif
using u = unsigned;
return array { (u)regs[0], (u)regs[1], (u)regs[2], (u)regs[3] };
}
bool smt()
{
if( cpuId( 0 )[0] < 1 )
return false;
return cpuId( 1 )[3] >> 28 & 1;
}
size_t getL2Size()
{
if( cpuId( 0x80000000u )[0] < 0x80000006u )
return 512ull * 1024;
return (cpuId( 0x80000006u )[2] >> 16) * 1024;
}
#else
size_t getL2Size()
{
return 512ull * 1024;
}
bool smt()
{
return false;
}
#endif
|
|