125 lines
3.3 KiB
C++

/*
Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef CHECKSUM_HPP
#define CHECKSUM_HPP
/**
Optimized XOR checksum calculation. Loop unrolling will
reduce relative loop overhead and encourace usage of parallel
arithmetic adders which are common on most modern CPUs.
*/
inline
Uint32
computeXorChecksumShort(const Uint32 *buf, Uint32 words, Uint32 sum = 0)
{
const Uint32 *end_unroll = buf + (words & ~3);
const Uint32 *end = buf + words;
/**
* Aggregate as chunks of 4*Uint32 words:
* Take care if rewriting this part, code has intentionally
* been unrolled in order to take advantage of HW parallelism
* where there are multiple adders in the CPU core.
*/
while (buf < end_unroll)
{
sum ^= buf[0] ^ buf[1] ^ buf[2] ^ buf[3];
buf += 4;
}
// Wrap up remaining part
while (buf < end)
{
sum ^= buf[0];
buf++;
}
return sum;
}
/**
Optimized XOR checksum calculation intended for longer strings.
Temporary aggregate XOR-sums into Uint64 which are folded into
Uint32 in the final stage.
Also unrool loop as above to take advantage of HW parallelism.
Callee is responsible for checking that there are sufficient 'words'
to be checksumed to complete at least a chunk of 4*Uint64 words.
*/
inline
Uint32
computeXorChecksumLong(const Uint32 *buf, Uint32 words, Uint32 sum = 0)
{
// Align to Uint64 boundary to optimize mem. access below
if (((size_t)(buf) % 8) != 0)
{
sum ^= buf[0];
buf++;
words--;
}
const Uint64 *p = reinterpret_cast<const Uint64*>(buf);
Uint64 sum64 = *p++;
const Uint32 words64 = (words/2) - 1; // Rem. after init of sum64
const Uint64 *end = p + (words64 & ~3);
/**
* Aggregate as chunks of 4*Uint64 words:
* Take care if rewriting this part: code has intentionally
* been unrolled in order to take advantage of HW parallelism
* where there are multiple adders in the CPU core.
*/
do
{
sum64 ^= p[0] ^ p[1] ^ p[2] ^ p[3];
p+=4;
} while (p < end);
// Wrap up last part which didn't fit in a 4*Uint64 chunk
end += (words64 % 4);
while (p < end)
{
sum64 ^= p[0];
p++;
}
// Fold temp Uint64 sum into a final Uint32 sum
sum ^= (Uint32)(sum64 & 0xffffffff) ^
(Uint32)(sum64 >> 32);
// Append last odd Uint32 word
if ((words%2) != 0)
sum ^= buf[words-1];
return sum;
}
inline
Uint32
computeXorChecksum(const Uint32 *buf, Uint32 words, Uint32 sum = 0)
{
if (words < 16) // Decided by empirical experiments
return computeXorChecksumShort(buf,words,sum);
else
return computeXorChecksumLong(buf,words,sum);
}
#endif // CHECKSUM_HPP