/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */

#include <commonlib/bsd/ipchksum.h>

/* See RFC 1071 for mathematical explanations of why we can first sum in a larger register and
   then narrow down, why we don't need to worry about endianness, etc. */
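/*
 * For example, adding 0xFFFF + 0x0002 in a 32-bit register gives 0x10001;
 * folding that as (0x10001 & 0xFFFF) + (0x10001 >> 16) yields 0x0002, the
 * same result as a 16-bit one's-complement add with end-around carry.
 */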
uint16_t ipchksum(const void *data, size_t size)
{
	const uint8_t *p1 = data;
	unsigned long wide_sum = 0;
	uint32_t sum = 0;
	size_t i = 0;

#if defined(__aarch64__)
	size_t size16 = size / 16;
	const uint64_t *p8 = data;

	if (size16) {
		unsigned long tmp1, tmp2;
		i = size16 * 16;
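		/*
		 * Sum 16 bytes per iteration: ldp loads two 64-bit words and
		 * adcs accumulates them with carry, which stays live across
		 * iterations because sub and cbnz do not modify the flags.
		 */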
		asm (
			"adds %[wsum], %[wsum], xzr\n\t" /* init carry flag for addition */
			"1:\n\t"
			"ldp %[v1], %[v2], [%[p8]], #16\n\t"
			"adcs %[wsum], %[wsum], %[v1]\n\t"
			"adcs %[wsum], %[wsum], %[v2]\n\t"
			"sub %[size16], %[size16], #1\n\t"
			"cbnz %[size16], 1b\n\t"
			"adcs %[wsum], %[wsum], xzr\n\t" /* use up last carry */
			: [v1] "=r" (tmp1),
			  [v2] "=r" (tmp2),
			  [wsum] "+r" (wide_sum),
			  [p8] "+r" (p8),
			  [size16] "+r" (size16)
			:: "cc", "memory" /* reads the buffer behind p8 */
		);
	}
#endif

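	/* Fold the register-sized sum down to a 16-bit one, 16 bits at a time. */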
	while (wide_sum) {
		sum += wide_sum & 0xFFFF;
		wide_sum >>= 16;
	}
	sum = (sum & 0xFFFF) + (sum >> 16);

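	/*
	 * Add the remaining bytes (all of them, on architectures without the
	 * assembly path above). A byte at an odd offset forms the high half of
	 * its 16-bit chunk, hence the shift.
	 */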
	for (; i < size; i++) {
		uint32_t v = p1[i];
		if (i % 2)
			v <<= 8;
		sum += v;

		/* Doing this unconditionally seems to be faster. */
		sum = (sum & 0xFFFF) + (sum >> 16);
	}

	return (uint16_t)~sum;
}

uint16_t ipchksum_add(size_t offset, uint16_t first, uint16_t second)
{
	first = ~first;
	second = ~second;

	/*
	 * Since the checksum is calculated in 16-bit chunks, if the offset at which
	 * the data covered by the second checksum would start (if both data streams
	 * came one after the other) is odd, that means the second stream starts in
	 * the middle of a 16-bit chunk. This means the second checksum is byte
	 * swapped compared to what we need it to be, and we must swap it back.
	 */
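	/*
	 * For example, with offset == 3, byte 0 of the second stream was summed
	 * into `second` as the low half of a 16-bit chunk, but in the combined
	 * stream it sits at offset 3, the high half of a chunk. Swapping the
	 * one's-complement sum moves every byte to the half it belongs in.
	 */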
	if (offset % 2)
		second = (second >> 8) | (second << 8);

	uint32_t sum = first + second;
	sum = (sum & 0xFFFF) + (sum >> 16);
	return (uint16_t)~sum;
}
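
/*
 * Illustrative usage sketch: a hypothetical self-test (the function name and
 * the <assert.h> dependency are editorial assumptions, not part of this file)
 * showing that combining two partial checksums with ipchksum_add() matches
 * checksumming the whole buffer at once, even across an odd split offset.
 */
#include <assert.h>

static inline void ipchksum_selftest(void)
{
	const uint8_t buf[6] = { 0x45, 0x00, 0x00, 0x1c, 0xab, 0xcd };
	uint16_t whole = ipchksum(buf, sizeof(buf));
	uint16_t first = ipchksum(buf, 3);	/* bytes 0..2 */
	uint16_t second = ipchksum(buf + 3, 3);	/* bytes 3..5, odd offset */

	assert(ipchksum_add(3, first, second) == whole);
}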