#include "arm_arch.h"
.text
.arch armv8-a+crypto
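// gcm_init_v8 -- assumed C prototype, following the OpenSSL convention:
//	void gcm_init_v8(u128 Htable[16], const u64 H[2]);
// the type and parameter names are illustrative, not taken from this file.
// It converts the raw hash key H into the "twisted" form used by the
// pmull-based multiply below: H is rotated, shifted left by one bit and
// conditionally xored with the 0xc2...01 constant (i.e. reduced modulo
// the GHASH polynomial), then stored to the per-key table at x0.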
.global	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	ld1	{v17.2d},[x1]			//load H
	movi	v16.16b,#0xe1
	ext	v3.16b,v17.16b,v17.16b,#8
	shl	v16.2d,v16.2d,#57
	ushr	v18.2d,v16.2d,#63
	ext	v16.16b,v18.16b,v16.16b,#8	//t0=0xc2....01
	dup	v17.4s,v17.s[1]
	ushr	v19.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v19.16b,v19.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v19.16b,v19.16b,v19.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v19.16b		//H<<<=1
	eor	v3.16b,v3.16b,v16.16b		//twisted H
	st1	{v3.2d},[x0]
	ret
.size	gcm_init_v8,.-gcm_init_v8
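// gcm_gmult_v8 -- assumed C prototype (OpenSSL convention):
//	void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
// Multiplies the current hash value Xi by the twisted H in GF(2^128).
// It byte-reverses Xi on little-endian, zeroes the block counter in x3
// and the load increment in x12, points x2 at Xi so the pipelined input
// load is harmless, and branches to .Lgmult_v8 so the shared tail below
// performs exactly one multiply-and-reduce.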
.global	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	ld1	{v17.2d},[x0]			//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d},[x1]			//load twisted H
	shl	v19.2d,v19.2d,#57
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v21.16b,v20.16b,v20.16b,#8
	mov	x3,#0
	ext	v3.16b,v17.16b,v17.16b,#8
	mov	x12,#0
	eor	v21.16b,v21.16b,v20.16b		//Karatsuba pre-processing
	mov	x2,x0
	b	.Lgmult_v8
.size	gcm_gmult_v8,.-gcm_gmult_v8
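// gcm_ghash_v8 -- assumed C prototype (OpenSSL convention):
//	void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16],
//			  const u8 *inp, size_t len);
// with len a multiple of 16. Folds len bytes of inp into Xi, one
// 16-byte block per iteration of .Loop_v8. x12 holds the post-increment
// for the pipelined input load and is cleared on the final block, so
// the extra load re-reads the last block instead of running past the
// end of inp.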
.global	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	ld1	{v0.2d},[x0]			//load [rotated] Xi
	subs	x3,x3,#16
	movi	v19.16b,#0xe1
	mov	x12,#16
	ld1	{v20.2d},[x1]			//load twisted H
	csel	x12,xzr,x12,eq
	ext	v0.16b,v0.16b,v0.16b,#8
	shl	v19.2d,v19.2d,#57
	ld1	{v17.2d},[x2],x12		//load [rotated] inp
	ext	v21.16b,v20.16b,v20.16b,#8
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
	rev64	v17.16b,v17.16b
#endif
	eor	v21.16b,v21.16b,v20.16b		//Karatsuba pre-processing
	ext	v3.16b,v17.16b,v17.16b,#8
	b	.Loop_v8
.align	4
.Loop_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v17.16b,v18.16b		//v17.16b is rotated inp^Xi
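// Shared multiply-and-reduce core: one 128x128-bit carry-less multiply
// is assembled from three 64-bit pmull/pmull2 products via Karatsuba
// (H.lo·Xi.lo, H.hi·Xi.hi, (H.lo+H.hi)·(Xi.lo+Xi.hi)), and the 256-bit
// result is then folded back to 128 bits in two phases using the
// 0xc2...01 constant in v19, i.e. reduction modulo the GHASH polynomial
// x^128+x^7+x^2+x+1.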
.Lgmult_v8:
	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	subs	x3,x3,#16
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
	csel	x12,xzr,x12,eq
	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12		//load [rotated] inp
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	eor	v0.16b,v1.16b,v18.16b
	ext	v3.16b,v17.16b,v17.16b,#8
	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	b.hs	.Loop_v8
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]			//write out Xi
	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
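// Typical call sequence (a sketch under the assumed prototypes above,
// not taken from this file): gcm_init_v8 runs once per key to derive
// the twisted H; gcm_ghash_v8 is then called on full 16-byte blocks of
// AAD/ciphertext, and gcm_gmult_v8 whenever a single multiplication of
// Xi by H is needed, e.g. after the final length block has been xored
// into Xi.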
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align 2