From:  Bonita Montero <Bonita.Montero@gmail.com>
Date:  02 Oct 2024 20:44:04 Hong Kong Time
Newsgroup:  news.alt119.net/comp.lang.c
Subject:  

Re: A very slow program

NNTP-Posting-Host:  null

Ive developed a Sieve of Erastosthenes in C++20 with an unbeaten
performance. With this code I get all primes <= 2 ^ 32 in 140ms
on my AMD 7950X 16-core Zen4-system.
It calculates first only the primes up to the square root of the
maximum. The remaining bits are partitioned according to the num-
ber of CPU-cores. Within each thread the bitmap is partitioned in
parts that fit into the L2-cache (queried via CPUID).
All operations are aligned on a cacheline-boundary to avoid false
sharing and locking of shared words.

#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
	#include 
#elif (defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || 
defined(__x86_64__))
	#include 
#endif

#if defined(_MSC_VER)
	#pragma warning(disable: 26495) // uninitialized member
#endif

using namespace std;

#if defined(__cpp_lib_hardware_interference_size)
constexpr ptrdiff_t CL_SIZE = hardware_destructive_interference_size;
#else
constexpr ptrdiff_t CL_SIZE = 64;
#endif

size_t getL2Size();
bool smt();

int main( int argc, char **argv )
{
	try
	{
		if( argc < 2 )
			return EXIT_FAILURE;
		auto parse = []( char const *str, auto &value, bool bytes )
		{
			bool hex = str[0] == '0' && tolower( str[1] ) == 'x';
			str += 2 * hex;
			auto [ptr, ec] = from_chars( str, str + strlen( str ), value, !hex ? 
10 : 16 );
			if( ec != errc() )
				return false;
			if( bytes && tolower( *ptr ) == 'x' && !ptr[1] )
			{
				value *= 8;
				--value;
				++ptr;
			}
			return !*ptr;
		};
		size_t end;
		if( !parse( argv[1], end, true ) )
			return EXIT_FAILURE;
		if( end < 2 )
			return EXIT_FAILURE;
		if( (ptrdiff_t)end++ < 0 )
			throw bad_alloc();
		unsigned
			hc = jthread::hardware_concurrency(),
			nThreads;
		if( argc < 4 || !parse( argv[3], nThreads, false ) )
			nThreads = hc;
		if( !nThreads )
			return EXIT_FAILURE;
		using word_t = size_t;
		constexpr size_t
			BITS_PER_CL = CL_SIZE * 8,
			BITS = sizeof(word_t) * 8;
		size_t nBits = end / 2;
		union alignas(CL_SIZE) ndi_words_cl { word_t words[CL_SIZE / 
sizeof(word_t)]; ndi_words_cl() {} };
		vector sieveCachelines( (nBits + BITS_PER_CL - 1) / 
BITS_PER_CL );
		span sieve( &sieveCachelines[0].words[0], (nBits + BITS - 1) / 
BITS );
		assert(to_address( sieve.end() ) <= &to_address( sieveCachelines.end() 
)->words[0]);
		fill( sieve.begin(), sieve.end(), -1 );
		auto punch = [&]( size_t start, size_t end, size_t prime, auto )
		{
			size_t
				bit = start / 2,
				bitEnd = end / 2;
			if( bit >= bitEnd ) [[unlikely]]
				return;
			if( prime >= BITS ) [[likely]]
				do [[likely]]
					sieve[bit / BITS] &= rotl( (word_t)-2, bit % BITS );
				while( (bit += prime) < bitEnd );
			else
			{
				auto pSieve = sieve.begin() + bit / BITS;
				do [[likely]]
				{
					word_t
						word = *pSieve,
						mask = rotl( (word_t)-2, bit % BITS ),
						oldMask;
					do
					{
						word &= mask;
						oldMask = mask;
						mask = rotl( mask, prime % BITS );
						bit += prime;
					} while( mask < oldMask );
					*pSieve++ = word;
				} while( bit < bitEnd );
			}
		};
		size_t sqrt = (ptrdiff_t)ceil( ::sqrt( (ptrdiff_t)end ) );
		[&]
		{
			for( size_t bSqrt = sqrt / 2, bit = 1; bit < bSqrt; ++bit ) [[likely]]
			{
				auto pSieve = sieve.begin () + bit / BITS;
				for( ; ; )
				{
					if( word_t w = *pSieve >> bit % BITS; w ) [[likely]]
					{
						bit += countr_zero ( w );
						break;
					}
					++pSieve;
					bit = bit + BITS & -(ptrdiff_t)BITS;
					if( bit >= bSqrt ) [[unlikely]]
						return;
				}
				size_t prime = 2 * bit + 1;
				punch( prime * prime, sqrt, prime, false_type () );
			}
		}();
		auto scan = [&]( size_t start, size_t end, auto fn )
		{
			size_t
				bit = start / 2,
				bitEnd = end / 2;
			if( bit >= bitEnd ) [[unlikely]]
				return;
			auto pSieve = sieve.begin() + bit / BITS;
			word_t word = *pSieve++ >> bit % BITS;
			for( ; ; )
			{
				size_t nextWordBit = bit + BITS & -(ptrdiff_t)BITS;
				for( unsigned gap; word; word >>= gap, word >>= 1, ++bit )
				{
					gap = countr_zero( word );
					bit += gap;
					if( bit >= bitEnd ) [[unlikely]]
						return;
					fn( bit * 2 + 1, true_type() );
				}
				bit = nextWordBit;
				if( bit >= bitEnd ) [[unlikely]]
					return;
				word = *pSieve++;
			}
		};
		vector threads;
		threads.reserve( nThreads );
		static auto dbl = []( ptrdiff_t x ) { return (double)x; };
		auto initiate = [&]( size_t start, auto fn )
		{
			double threadRange = dbl( end - start ) / nThreads;
			ptrdiff_t t = nThreads;
			size_t trEnd = end;
			size_t prevStart, prevEnd;
			bool prevValid = false;
			do [[likely]]
			{
				size_t trStart = start + (ptrdiff_t)(--t * threadRange + 0.5);
				trStart &= -(2 * CL_SIZE * 8);
				trStart = trStart >= start ? trStart : start;
				if( trStart < trEnd ) [[likely]]
				{
					if( prevValid ) [[likely]]
						threads.emplace_back( fn, prevStart, prevEnd );
					prevStart = trStart;
					prevEnd = trEnd;
					prevValid = true;
				}
				trEnd = trStart;
			} while( t );
			if( prevValid ) [[likely]]
				fn( prevStart, prevEnd );
			threads.resize( 0 );
		};
		double maxCacheRange = dbl( getL2Size() * 8 * 2 ) / 3 * (smt() && 
nThreads > hc / 2 ? 1 : 2);
		initiate( sqrt,
			[&]( size_t rStart, size_t rEnd )
			{
				double
					thisThreadRange = dbl( rEnd - rStart ),
					nCachePartitions = ceil( thisThreadRange / maxCacheRange ),
					cacheRange = thisThreadRange / nCachePartitions;
				for( size_t p = (ptrdiff_t)nCachePartitions, cacheEnd = rEnd, 
cacheStart; p--; cacheEnd = cacheStart ) [[likely]]
				{
					cacheStart = rStart + (ptrdiff_t)((double)(ptrdiff_t)p * cacheRange 
+ 0.5);
					cacheStart &= -(2 * CL_SIZE * 8);
					cacheStart = cacheStart >= sqrt ? cacheStart : sqrt;
					scan( 3, sqrt,
						[&]( size_t prime, auto )
						{
							size_t start = (cacheStart + prime - 1) / prime * prime;
							start += start % 2 ? 0 : prime;
							punch( start, cacheEnd, prime, true_type() );
						} );
				}
			} );
		if( bool count = false; argc >= 3 && (!*argv[2] || (count = strcmp( 
argv[2], "*" ) == 0)) )
		{
			if( !count )
				return EXIT_SUCCESS;
			atomic aNPrimes( 1 );
			initiate( 3,
				[&]( size_t rStart, size_t rEnd )
				{
					size_t nPrimes = 0;
					scan( rStart, rEnd, [&]( ... ) { ++nPrimes; } );
					aNPrimes.fetch_add( nPrimes, memory_order::relaxed );
				} );
			cout << aNPrimes.load( memory_order::relaxed ) << " primes" << endl;
			return EXIT_SUCCESS;
		}
		ofstream ofs;
		ofs.exceptions( ofstream::failbit | ofstream::badbit );
		ofs.open( argc >= 3 ? argv[2] : "primes.txt", ofstream::binary | 
ofstream::trunc );
		constexpr size_t
			BUF_SIZE = 0x100000,
			AHEAD = 32;
		union ndi_char { char c; ndi_char() {} };
		vector rawPrintBuf( BUF_SIZE + AHEAD - 1 );
		span printBuf( &to_address( rawPrintBuf.begin() )->c, &to_address( 
rawPrintBuf.end() )->c );
		auto
			bufBegin = printBuf.begin(),
			bufEnd = bufBegin,
			bufFlush = bufBegin + BUF_SIZE;
		auto print = [&]() { ofs << string_view( bufBegin, bufEnd ); };
		auto printPrime = [&]( size_t prime, auto )
		{
			auto [ptr, ec] = to_chars( &*bufEnd, &bufEnd[AHEAD - 1], prime );
			if( (bool)ec ) [[unlikely]]
				throw system_error( (int)ec, generic_category(), "converson failed" );
			bufEnd = ptr - &*bufBegin + bufBegin; // pointer to iterator - NOP
			*bufEnd++ = '\n';
			if( bufEnd >= bufFlush ) [[unlikely]]
				print(), bufEnd = bufBegin;
		};
		printPrime( 2, false_type() );
		scan( 3, end, printPrime );
		print();
	}
	catch( exception const &exc )
	{
		cout << exc.what() << endl;
	}
}

#if (defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || 
defined(__x86_64__)) || defined(_MSC_VER) && (defined(_M_IX86) || 
defined(_M_X64))
array cpuId( unsigned code )
{
	int regs[4];
#if defined(_MSC_VER)
	__cpuid( regs, code );
#elif defined(__GNUC__) || defined(__clang__)
	__cpuid(code, regs[0], regs[1], regs[2], regs[3]);
#endif
	using u = unsigned;
	return array { (u)regs[0], (u)regs[1], (u)regs[2], (u)regs[3] };
}

bool smt()
{
	if( cpuId( 0 )[0] < 1 )
		return false;
	return cpuId( 1 )[3] >> 28 & 1;
}

size_t getL2Size()
{
	if( cpuId( 0x80000000u )[0] < 0x80000006u )
		return 512ull * 1024;
	return (cpuId( 0x80000006u )[2] >> 16) * 1024;
}
#else
size_t getL2Size()
{
	return 512ull * 1024;
}

bool smt()
{
	return false;
}
#endif