/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */

#include <commonlib/bsd/ipchksum.h>

/* See RFC 1071 for mathematical explanations of why we can first sum in a larger register and
   then narrow down, why we don't need to worry about endianness, etc. */
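/*
 * For example, adding 0xFFFF + 0x0002 in a 32-bit register gives 0x10001;
 * folding that as (0x10001 & 0xFFFF) + (0x10001 >> 16) yields 0x0002, the
 * same result as a 16-bit one's-complement add with end-around carry.
 */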
uint16_t ipchksum(const void *data, size_t size)
{
	const uint8_t *p1 = data;
	unsigned long wide_sum = 0;
	uint32_t sum = 0;
	size_t i = 0;

#if defined(__aarch64__)
	size_t size16 = size / 16;
	const uint64_t *p8 = data;

	if (size16) {
		unsigned long tmp1, tmp2;
		i = size16 * 16;
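		/*
		 * Sum 16 bytes per iteration: ldp loads two 64-bit words and
		 * adcs accumulates them with carry, which stays live across
		 * iterations because sub and cbnz do not modify the flags.
		 */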
		asm (
			"adds %[wsum], %[wsum], xzr\n\t" /* init carry flag for addition */
			"1:\n\t"
			"ldp %[v1], %[v2], [%[p8]], #16\n\t"
			"adcs %[wsum], %[wsum], %[v1]\n\t"
			"adcs %[wsum], %[wsum], %[v2]\n\t"
			"sub %[size16], %[size16], #1\n\t"
			"cbnz %[size16], 1b\n\t"
			"adcs %[wsum], %[wsum], xzr\n\t" /* use up last carry */
			: [v1] "=r" (tmp1),
			  [v2] "=r" (tmp2),
			  [wsum] "+r" (wide_sum),
			  [p8] "+r" (p8),
			  [size16] "+r" (size16)
			:: "cc", "memory" /* reads the buffer behind p8 */
		);
	}
#endif

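	/* Fold the register-sized sum down to a 16-bit one, 16 bits at a time. */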
	while (wide_sum) {
		sum += wide_sum & 0xFFFF;
		wide_sum >>= 16;
	}
	sum = (sum & 0xFFFF) + (sum >> 16);

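	/*
	 * Add the remaining bytes (all of them, on architectures without the
	 * assembly path above). A byte at an odd offset forms the high half of
	 * its 16-bit chunk, hence the shift.
	 */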
	for (; i < size; i++) {
		uint32_t v = p1[i];
		if (i % 2)
			v <<= 8;
		sum += v;

		/* Doing this unconditionally seems to be faster. */
		sum = (sum & 0xFFFF) + (sum >> 16);
	}

	return (uint16_t)~sum;
}

uint16_t ipchksum_add(size_t offset, uint16_t first, uint16_t second)
{
	first = ~first;
	second = ~second;

	/*
	 * Since the checksum is calculated in 16-bit chunks, if the offset at which
	 * the data covered by the second checksum would start (if both data streams
	 * came one after the other) is odd, that means the second stream starts in
	 * the middle of a 16-bit chunk. This means the second checksum is byte
	 * swapped compared to what we need it to be, and we must swap it back.
	 */
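	/*
	 * For example, with offset == 3, byte 0 of the second stream was summed
	 * into `second` as the low half of a 16-bit chunk, but in the combined
	 * stream it sits at offset 3, the high half of a chunk. Swapping the
	 * one's-complement sum moves every byte to the half it belongs in.
	 */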
	if (offset % 2)
		second = (second >> 8) | (second << 8);

	uint32_t sum = first + second;
	sum = (sum & 0xFFFF) + (sum >> 16);
	return (uint16_t)~sum;
}
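
/*
 * Illustrative usage sketch: a hypothetical self-test (the function name and
 * the <assert.h> dependency are editorial assumptions, not part of this file)
 * showing that combining two partial checksums with ipchksum_add() matches
 * checksumming the whole buffer at once, even across an odd split offset.
 */
#include <assert.h>

static inline void ipchksum_selftest(void)
{
	const uint8_t buf[6] = { 0x45, 0x00, 0x00, 0x1c, 0xab, 0xcd };
	uint16_t whole = ipchksum(buf, sizeof(buf));
	uint16_t first = ipchksum(buf, 3);	/* bytes 0..2 */
	uint16_t second = ipchksum(buf + 3, 3);	/* bytes 3..5, odd offset */

	assert(ipchksum_add(3, first, second) == whole);
}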