1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
|
#!/usr/bin/env perl
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open STDOUT,"| $^X ${dir}perlasm/x86_64-xlate.pl $flavour $output";
if ($win64) { $arg1="%rcx"; $arg2="%rdx"; }
else { $arg1="%rdi"; $arg2="%rsi"; }
print<<___;
.extern OPENSSL_cpuid_setup
.section .init
call OPENSSL_cpuid_setup
.text
.globl OPENSSL_atomic_add
.type OPENSSL_atomic_add,\@abi-omnipotent
.align 16
OPENSSL_atomic_add:
movl ($arg1),%eax
.Lspin: leaq ($arg2,%rax),%r8
.byte 0xf0 # lock
cmpxchgl %r8d,($arg1)
jne .Lspin
movl %r8d,%eax
.byte 0x48,0x98 # cltq/cdqe
ret
.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
.globl OPENSSL_rdtsc
.type OPENSSL_rdtsc,\@abi-omnipotent
.align 16
OPENSSL_rdtsc:
rdtsc
shl \$32,%rdx
or %rdx,%rax
ret
.size OPENSSL_rdtsc,.-OPENSSL_rdtsc
.globl OPENSSL_ia32_cpuid
.type OPENSSL_ia32_cpuid,\@abi-omnipotent
.align 16
OPENSSL_ia32_cpuid:
mov %rbx,%r8
xor %eax,%eax
cpuid
mov %eax,%r11d # max value for standard query level
xor %eax,%eax
cmp \$0x756e6547,%ebx # "Genu"
setne %al
mov %eax,%r9d
cmp \$0x49656e69,%edx # "ineI"
setne %al
or %eax,%r9d
cmp \$0x6c65746e,%ecx # "ntel"
setne %al
or %eax,%r9d # 0 indicates Intel CPU
jz .Lintel
cmp \$0x68747541,%ebx # "Auth"
setne %al
mov %eax,%r10d
cmp \$0x69746E65,%edx # "enti"
setne %al
or %eax,%r10d
cmp \$0x444D4163,%ecx # "cAMD"
setne %al
or %eax,%r10d # 0 indicates AMD CPU
jnz .Lintel
# AMD specific
mov \$0x80000000,%eax
cpuid
cmp \$0x80000008,%eax
jb .Lintel
mov \$0x80000008,%eax
cpuid
movzb %cl,%r10 # number of cores - 1
inc %r10 # number of cores
mov \$1,%eax
cpuid
bt \$28,%edx # test hyper-threading bit
jnc .Ldone
shr \$16,%ebx # number of logical processors
cmp %r10b,%bl
ja .Ldone
and \$0xefffffff,%edx # ~(1<<28)
jmp .Ldone
.Lintel:
cmp \$4,%r11d
mov \$-1,%r10d
jb .Lnocacheinfo
mov \$4,%eax
mov \$0,%ecx # query L1D
cpuid
mov %eax,%r10d
shr \$14,%r10d
and \$0xfff,%r10d # number of cores -1 per L1D
.Lnocacheinfo:
mov \$1,%eax
cpuid
cmp \$0,%r9d
jne .Lnotintel
or \$0x00100000,%edx # use reserved 20th bit to engage RC4_CHAR
and \$15,%ah
cmp \$15,%ah # examine Family ID
je .Lnotintel
or \$0x40000000,%edx # use reserved bit to skip unrolled loop
.Lnotintel:
bt \$28,%edx # test hyper-threading bit
jnc .Ldone
and \$0xefffffff,%edx # ~(1<<28)
cmp \$0,%r10d
je .Ldone
or \$0x10000000,%edx # 1<<28
shr \$16,%ebx
cmp \$1,%bl # see if cache is shared
ja .Ldone
and \$0xefffffff,%edx # ~(1<<28)
.Ldone:
shl \$32,%rcx
mov %edx,%eax
mov %r8,%rbx
or %rcx,%rax
ret
.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
.globl OPENSSL_cleanse
.type OPENSSL_cleanse,\@abi-omnipotent
.align 16
OPENSSL_cleanse:
xor %rax,%rax
cmp \$15,$arg2
jae .Lot
cmp \$0,$arg2
je .Lret
.Little:
mov %al,($arg1)
sub \$1,$arg2
lea 1($arg1),$arg1
jnz .Little
.Lret:
ret
.align 16
.Lot:
test \$7,$arg1
jz .Laligned
mov %al,($arg1)
lea -1($arg2),$arg2
lea 1($arg1),$arg1
jmp .Lot
.Laligned:
mov %rax,($arg1)
lea -8($arg2),$arg2
test \$-8,$arg2
lea 8($arg1),$arg1
jnz .Laligned
cmp \$0,$arg2
jne .Little
ret
.size OPENSSL_cleanse,.-OPENSSL_cleanse
___
print<<___ if (!$win64);
.globl OPENSSL_wipe_cpu
.type OPENSSL_wipe_cpu,\@abi-omnipotent
.align 16
OPENSSL_wipe_cpu:
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
pxor %xmm8,%xmm8
pxor %xmm9,%xmm9
pxor %xmm10,%xmm10
pxor %xmm11,%xmm11
pxor %xmm12,%xmm12
pxor %xmm13,%xmm13
pxor %xmm14,%xmm14
pxor %xmm15,%xmm15
xorq %rcx,%rcx
xorq %rdx,%rdx
xorq %rsi,%rsi
xorq %rdi,%rdi
xorq %r8,%r8
xorq %r9,%r9
xorq %r10,%r10
xorq %r11,%r11
leaq 8(%rsp),%rax
ret
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
___
print<<___ if ($win64);
.globl OPENSSL_wipe_cpu
.type OPENSSL_wipe_cpu,\@abi-omnipotent
.align 16
OPENSSL_wipe_cpu:
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
xorq %rcx,%rcx
xorq %rdx,%rdx
xorq %r8,%r8
xorq %r9,%r9
xorq %r10,%r10
xorq %r11,%r11
leaq 8(%rsp),%rax
ret
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
___
close STDOUT; # flush
|