blob: b7434e5fd76371fa1391ab6e94336afd43a77883 [file] [log] [blame]
/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */
2
3#include <commonlib/bsd/ipchksum.h>
4
/*
 * Compute the 16-bit one's complement checksum over `size` bytes starting at `data`.
 *
 * See RFC 1071 for mathematical explanations of why we can first sum in a larger register and
 * then narrow down, why we don't need to worry about endianness, etc.
 *
 * @param data	Start of the buffer to checksum.
 * @param size	Number of bytes to checksum; may be odd or zero.
 * @return	The bitwise complement of the folded sum. Bytes at even offsets are treated as
 *		the low byte of each 16-bit word, bytes at odd offsets as the high byte.
 */
uint16_t ipchksum(const void *data, size_t size)
{
	const uint8_t *p1 = data;
	unsigned long wide_sum = 0;
	uint32_t sum = 0;
	size_t i = 0;	/* index of the first byte not consumed by an assembly fast path */

	/*
	 * The fast paths below use the `__asm__` spelling so that they also build in strict
	 * ISO modes. They carry a "memory" clobber because they read the buffer through a
	 * pointer register that is not expressed as a memory operand; without it the compiler
	 * would not know that the statement depends on the buffer contents.
	 */
#if defined(__aarch64__)
	size_t size16 = size / 16;
	const uint64_t *p8 = data;
	if (size16) {
		unsigned long tmp1, tmp2;
		i = size16 * 16;
		/* SUB and CBNZ don't touch the flags, so the carry chain survives the loop. */
		__asm__ (
			"adds	xzr, xzr, xzr\n\t"	/* init carry flag for addition */
			"1:\n\t"
			"ldp	%[v1], %[v2], [%[p8]], #16\n\t"
			"adcs	%[wsum], %[wsum], %[v1]\n\t"
			"adcs	%[wsum], %[wsum], %[v2]\n\t"
			"sub	%[size16], %[size16], #1\n\t"
			"cbnz	%[size16], 1b\n\t"
			"adcs	%[wsum], %[wsum], xzr\n\t"	/* use up last carry */
			: [v1] "=r" (tmp1),
			  [v2] "=r" (tmp2),
			  [wsum] "+r" (wide_sum),
			  [p8] "+r" (p8),
			  [size16] "+r" (size16)
			:: "cc", "memory"
		);
	}
#elif defined(__i386__) || defined(__x86_64__)
	size_t size8 = size / 8;
	const uint64_t *p8 = data;
	i = size8 * 8;
	__asm__ (
		"clc\n\t"
		"1:\n\t"
		"jecxz	2f\n\t"		/* technically RCX on 64, but not gonna be that big */
		"adc	(%[p8]), %[wsum]\n\t"
#if defined(__i386__)
		"adc	4(%[p8]), %[wsum]\n\t"
#endif	/* __i386__ */
		"lea	-1(%[size8]), %[size8]\n\t"	/* Use LEA as a makeshift ADD that */
		"lea	8(%[p8]), %[p8]\n\t"		/* doesn't modify the carry flag. */
		"jmp	1b\n\t"
		"2:\n\t"
		"setc	%b[size8]\n\t"	/* reuse size register to save last carry */
		"add	%[size8], %[wsum]\n\t"
		: [wsum] "+r" (wide_sum),
		  [p8] "+r" (p8),
		  [size8] "+c" (size8)	/* put size in ECX so we can JECXZ */
		:: "cc", "memory"
	);
#endif	/* __i386__ || __x86_64__ */

	/* Narrow the wide accumulator by adding up its 16-bit chunks. */
	while (wide_sum) {
		sum += wide_sum & 0xFFFF;
		wide_sum >>= 16;
	}

	/*
	 * With a 64-bit accumulator, up to four chunks were added, so `sum` can be as large
	 * as 0x3FFFC. A single fold may then still leave a carry in bit 16 (for example
	 * 0x1FFFF folds to 0x10000), which would be lost in the final truncation whenever
	 * the tail loop below has nothing to do. Fold twice to be sure it fits in 16 bits.
	 */
	sum = (sum & 0xFFFF) + (sum >> 16);
	sum = (sum & 0xFFFF) + (sum >> 16);

	/* Handle whatever the fast path left over (or everything, if there is none). */
	for (; i < size; i++) {
		uint32_t v = p1[i];
		if (i % 2)
			v <<= 8;	/* odd offsets are the high byte of their 16-bit word */
		sum += v;

		/* Doing this unconditionally seems to be faster. */
		sum = (sum & 0xFFFF) + (sum >> 16);
	}

	return (uint16_t)~sum;
}
80
/*
 * Combine two checksums into the checksum of the concatenated data.
 *
 * @param offset	Offset at which the data covered by `second` would start if both data
 *			streams came one after the other.
 * @param first		Checksum of the leading data.
 * @param second	Checksum of the trailing data.
 * @return		The combined checksum, complemented like the inputs.
 */
uint16_t ipchksum_add(size_t offset, uint16_t first, uint16_t second)
{
	/* Undo the final complement to get back the raw one's complement sums. */
	uint16_t lead = (uint16_t)~first;
	uint16_t trail = (uint16_t)~second;

	/*
	 * Checksums are accumulated in 16-bit chunks. When the trailing data would begin at
	 * an odd offset, it starts in the middle of such a chunk, so its sum is byte swapped
	 * relative to what is needed here and has to be swapped back.
	 */
	if (offset & 1)
		trail = (uint16_t)((trail << 8) | (trail >> 8));

	/* Add the two sums and wrap the carry back around into the low 16 bits. */
	uint32_t total = (uint32_t)lead + trail;
	total = (total >> 16) + (total & 0xFFFF);

	return (uint16_t)~total;
}