Print this page
3882 remove xmod & friends
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/common/crypto/arcfour/amd64/arcfour-x86_64.pl
+++ new/usr/src/common/crypto/arcfour/amd64/arcfour-x86_64.pl
1 1 #!/usr/bin/env perl
2 2 #
3 3 # ====================================================================
4 4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 5 # project. The module is, however, dual licensed under OpenSSL and
6 6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 7 # details see http://www.openssl.org/~appro/cryptogams/.
8 8 # ====================================================================
9 9 #
10 10 # 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
11 11 # "hand-coded assembler"] doesn't stand for the whole improvement
12 12 # coefficient. It turned out that eliminating RC4_CHAR from config
13 13 # line results in ~40% improvement (yes, even for C implementation).
14 14 # Presumably it has everything to do with AMD cache architecture and
15 15 # RAW or whatever penalties. Once again! The module *requires* config
16 16 # line *without* RC4_CHAR! As for coding "secret," I bet on partial
17 17 # register arithmetics. For example instead of 'inc %r8; and $255,%r8'
18 18 # I simply 'inc %r8b'. Even though optimization manual discourages
19 19 # to operate on partial registers, it turned out to be the best bet.
20 20 # At least for AMD... How IA32E would perform remains to be seen...
21 21
22 22 # As was shown by Marc Bevand reordering of couple of load operations
23 23 # results in even higher performance gain of 3.3x:-) At least on
24 24 # Opteron... For reference, 1x in this case is RC4_CHAR C-code
25 25 # compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
26 26 # Latter means that if you want to *estimate* what to expect from
27 27 # *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz.
28 28
29 29 # Intel P4 EM64T core was found to run the AMD64 code really slow...
30 30 # The only way to achieve comparable performance on P4 was to keep
31 31 # RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
32 32 # compose blended code, which would perform even within 30% marginal
33 33 # on either AMD and Intel platforms, I implement both cases. See
34 34 # rc4_skey.c for further details...
35 35
36 36 # P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
37 37 # those with add/sub results in 50% performance improvement of folded
38 38 # loop...
39 39
40 40 # As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
41 41 # performance by >30% [unlike P4 32-bit case that is]. But this is
42 42 # provided that loads are reordered even more aggressively! Both code
43 43 # paths, AMD64 and EM64T, reorder loads in essentially same manner
44 44 # as my IA-64 implementation. On Opteron this resulted in modest 5%
45 45 # improvement [I had to test it], while final Intel P4 performance
46 46 # achieves respectful 432MBps on 2.8GHz processor now. For reference.
47 47 # If executed on Xeon, current RC4_CHAR code-path is 2.7x faster than
48 48 # RC4_INT code-path. While if executed on Opteron, it's only 25%
49 49 # slower than the RC4_INT one [meaning that if CPU µ-arch detection
50 50 # is not implemented, then this final RC4_CHAR code-path should be
51 51 # preferred, as it provides better *all-round* performance].
52 52
53 53 # Intel Core2 was observed to perform poorly on both code paths:-( It
54 54 # apparently suffers from some kind of partial register stall, which
55 55 # occurs in 64-bit mode only [as virtually identical 32-bit loop was
56 56 # observed to outperform 64-bit one by almost 50%]. Adding two movzb to
57 57 # cloop1 boosts its performance by 80%! This loop appears to be optimal
58 58 # fit for Core2 and therefore the code was modified to skip cloop8 on
59 59 # this CPU.
60 60
61 61 #
62 62 # OpenSolaris OS modifications
63 63 #
64 64 # Sun elects to use this software under the BSD license.
65 65 #
66 66 # This source originates from OpenSSL file rc4-x86_64.pl at
67 67 # ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
68 68 # (presumably for future OpenSSL release 0.9.8h), with these changes:
69 69 #
70 70 # 1. Added some comments, "use strict", and declared all variables.
71 71 #
72 72 # 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
73 73 # /usr/include/sys/asm_linkage.h.
74 74 #
75 75 # 3. Changed function name from RC4() to arcfour_crypt_asm() and RC4_set_key()
76 76 # to arcfour_key_init(), and changed the parameter order for both to that
77 77 # used by OpenSolaris.
78 78 #
79 79 # 4. The current method of using cpuid feature bits 20 (NX) or 28 (HTT) from
80 80 # function OPENSSL_ia32_cpuid() to distinguish Intel/AMD does not work for
81 81 # some newer AMD64 processors, as these bits are set on both Intel EM64T
82 82 # processors and newer AMD64 processors. I replaced this with C code
83 83 # (function arcfour_crypt_on_intel()) to call cpuid_getvendor()
84 84 # when executing in the kernel and getisax() when executing in userland.
85 85 #
86 86 # 5. Set a new field in the key structure, key->flag to 0 for AMD AMD64
87 87 # and 1 for Intel EM64T. This is to select the most-efficient arcfour_crypt()
88 88 # function to use.
89 89 #
90 90 # 6. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) assemblers).
91 91 #
92 92 # 7. Removed unused RC4_CHAR, Lcloop1, and Lcloop8 code.
93 93 #
94 94 # 8. Added C function definitions for use by lint(1B).
95 95 #
96 96
use strict;
use warnings;

# File-scoped state shared by the code-emitting sections below: the
# accumulated assembly text ($code), the register-name strings bound to the
# function arguments, and the temporaries used while unrolling the main loop.
my ($code, $dat, $inp, $out, $len, $idx, $ido, $i, @XX, @TX, $YY, $TY);

# First command-line argument names the output file; all generated assembly
# is emitted by redirecting STDOUT onto it.  Use the three-argument form of
# open and check the result — the original two-argument, unchecked
# open STDOUT,">$output" silently ignored failures and allowed mode
# injection through the filename.
my $output = shift;
open STDOUT, '>', $output or die "Cannot open $output: $!";

#
# Parameters
#

# OpenSSL:
# void RC4(RC4_KEY *key, unsigned long len, const unsigned char *indata,
#     unsigned char *outdata);
#$dat="%rdi";    # arg1
#$len="%rsi";    # arg2
#$inp="%rdx";    # arg3
#$out="%rcx";    # arg4

# OpenSolaris (note the different argument order from OpenSSL's RC4()):
# void arcfour_crypt_asm(ARCFour_key *key, uchar_t *in, uchar_t *out,
#     size_t len);
$dat="%rdi";    # arg1
$inp="%rsi";    # arg2
$out="%rdx";    # arg3
$len="%rcx";    # arg4

#
# Register variables
#
# $XX[0] is key->i (aka key->x), $XX[1] is a temporary.
# $TX[0] and $TX[1] are temporaries.
# $YY is key->j (aka key->y).
# $TY is a temporary.
#
@XX=("%r8","%r10");
@TX=("%r9","%r11");
$YY="%r12";
$TY="%r13";

# Emit arcfour_crypt_asm().  The first part of the template is a C stub for
# lint(1B); the real hand-scheduled x86_64 body follows the #else (an
# 8x-unrolled .Lloop8 fast path plus a byte-at-a-time .Lloop1 tail).
135 135 $code=<<___;
136 136 #if defined(lint) || defined(__lint)
137 137 
138 138 #include "arcfour.h"
139 139 
140 140 /* ARGSUSED */
141 141 void
142 142 arcfour_crypt_asm(ARCFour_key *key, uchar_t *in, uchar_t *out, size_t len)
143 143 {}
↓ open down ↓ |
143 lines elided |
↑ open up ↑ |
144 144 
145 145 /* ARGSUSED */
146 146 void
147 147 arcfour_key_init(ARCFour_key *key, uchar_t *keyval, int keyvallen)
148 148 {}
149 149 
150 150 #else
151 151 #include <sys/asm_linkage.h>
152 152 
153 153 ENTRY_NP(arcfour_crypt_asm)
154 - /* EXPORT DELETE START */
155 -
156 154 or $len,$len # If (len == 0) return
157 155 jne .Lentry
158 156 ret
159 157 .Lentry:
160 158 push %r12
161 159 push %r13
162 160 
163 161 / Set $dat to beginning of array, key->arr[0]
164 162 add \$8,$dat
165 163 / Get key->j
166 164 movl -8($dat),$XX[0]#d
167 165 / Get key->i
168 166 movl -4($dat),$YY#d
169 167 
170 168 /
171 169 / Use a 4-byte key schedule element array
172 170 /
173 171 inc $XX[0]#b
174 172 movl ($dat,$XX[0],4),$TX[0]#d
175 173 test \$-8,$len
176 174 jz .Lloop1
177 175 jmp .Lloop8
178 176 
179 177 .align 16
180 178 .Lloop8:
181 179 ___
# Unroll one RC4 byte step eight times.  Each pass appends the template with
# the current @XX/@TX bindings and then "rotates" those register-name arrays,
# so consecutive iterations alternate registers; the per-iteration ror
# accumulates the eight result bytes into %rax for a single 8-byte store.
182 180 for ($i=0;$i<8;$i++) {
183 181 $code.=<<___;
184 182 add $TX[0]#b,$YY#b
185 183 mov $XX[0],$XX[1]
186 184 movl ($dat,$YY,4),$TY#d
187 185 ror \$8,%rax # ror is redundant when $i=0
188 186 inc $XX[1]#b
189 187 movl ($dat,$XX[1],4),$TX[1]#d
190 188 cmp $XX[1],$YY
191 189 movl $TX[0]#d,($dat,$YY,4)
192 190 cmove $TX[0],$TX[1]
193 191 movl $TY#d,($dat,$XX[0],4)
194 192 add $TX[0]#b,$TY#b
195 193 movb ($dat,$TY,4),%al
196 194 ___
197 195 push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
198 196 }
# Tail of the unrolled loop: finish rotating the 8 gathered bytes, XOR them
# with the input qword, store to the output, then either repeat .Lloop8,
# fall into .Lloop1 for a sub-8-byte remainder, or exit.
199 197 $code.=<<___;
200 198 ror \$8,%rax
201 199 sub \$8,$len
202 200 
203 201 xor ($inp),%rax
204 202 add \$8,$inp
205 203 mov %rax,($out)
206 204 add \$8,$out
207 205 
208 206 test \$-8,$len
209 207 jnz .Lloop8
210 208 cmp \$0,$len
211 209 jne .Lloop1
212 210 
213 211 .Lexit:
214 212 /
215 213 / Cleanup and exit code
216 214 /
217 215 / --i to undo ++i done at entry
218 216 sub \$1,$XX[0]#b
219 217 / set key->i
220 218 movl $XX[0]#d,-8($dat)
221 219 / set key->j
222 220 movl $YY#d,-4($dat)
223 221 
224 222 pop %r13
225 223 pop %r12
226 224 ret
227 225 
228 226 .align 16
229 227 .Lloop1:
230 228 add $TX[0]#b,$YY#b
231 229 movl ($dat,$YY,4),$TY#d
232 230 movl $TX[0]#d,($dat,$YY,4)
233 231 movl $TY#d,($dat,$XX[0],4)
234 232 add $TY#b,$TX[0]#b
235 233 inc $XX[0]#b
↓ open down ↓ |
70 lines elided |
↑ open up ↑ |
236 234 movl ($dat,$TX[0],4),$TY#d
237 235 movl ($dat,$XX[0],4),$TX[0]#d
238 236 xorb ($inp),$TY#b
239 237 inc $inp
240 238 movb $TY#b,($out)
241 239 inc $out
242 240 dec $len
243 241 jnz .Lloop1
244 242 jmp .Lexit
245 243 
246 - /* EXPORT DELETE END */
247 244 ret
248 245 SET_SIZE(arcfour_crypt_asm)
249 246 ___
# Second generated function: register assignments and assembly template for
# arcfour_key_init() (the RC4 key-schedule setup).
250 247 
251 248 
252 249 #
253 250 # Parameters
254 251 #
255 252 
256 253 # OpenSSL:
257 254 # void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
258 255 #$dat="%rdi"; # arg1
259 256 #$len="%rsi"; # arg2
260 257 #$inp="%rdx"; # arg3
261 258 
262 259 # OpenSolaris:
263 260 # void arcfour_key_init(ARCFour_key *key, uchar_t *keyval, int keyvallen);
264 261 $dat="%rdi"; # arg1
265 262 $inp="%rsi"; # arg2
266 263 $len="%rdx"; # arg3
↓ open down ↓ |
10 lines elided |
↑ open up ↑ |
267 264 
268 265 # Temporaries
269 266 $idx="%r8";
270 267 $ido="%r9";
271 268 
# Emitted body: call arcfour_crypt_on_intel() (C helper) to detect the CPU
# vendor, record the result in key->flag (offset 1032), then run the
# standard RC4 key-schedule over a 4-byte-per-element array (.Lw1stloop
# initializes the identity permutation, .Lw2ndloop mixes in the key bytes).
272 269 $code.=<<___;
273 270 / int arcfour_crypt_on_intel(void);
274 271 .extern arcfour_crypt_on_intel
275 272 
276 273 ENTRY_NP(arcfour_key_init)
277 - /* EXPORT DELETE START */
278 -
279 274 / Find out if we're running on Intel or something else (e.g., AMD64).
280 275 / This sets %eax to 1 for Intel, otherwise 0.
281 276 push %rdi / Save arg1
282 277 push %rsi / Save arg2
283 278 push %rdx / Save arg3
284 279 call arcfour_crypt_on_intel
285 280 pop %rdx / Restore arg3
286 281 pop %rsi / Restore arg2
287 282 pop %rdi / Restore arg1
288 283 / Save return value in key->flag (1=Intel, 0=AMD)
289 284 movl %eax,1032($dat)
290 285 
291 286 / Set $dat to beginning of array, key->arr[0]
292 287 lea 8($dat),$dat
293 288 lea ($inp,$len),$inp
294 289 neg $len
295 290 mov $len,%rcx
296 291 
297 292 xor %eax,%eax
298 293 xor $ido,$ido
299 294 xor %r10,%r10
300 295 xor %r11,%r11
301 296 
302 297 / Use a 4-byte data array
303 298 jmp .Lw1stloop
304 299 
305 300 .align 16
306 301 .Lw1stloop:
307 302 / AMD64 (4-byte array)
308 303 mov %eax,($dat,%rax,4)
309 304 add \$1,%al
310 305 jnc .Lw1stloop
311 306 
312 307 xor $ido,$ido
313 308 xor $idx,$idx
314 309 
315 310 .align 16
316 311 .Lw2ndloop:
317 312 mov ($dat,$ido,4),%r10d
318 313 add ($inp,$len,1),$idx#b
319 314 add %r10b,$idx#b
320 315 add \$1,$len
321 316 mov ($dat,$idx,4),%r11d
322 317 cmovz %rcx,$len
↓ open down ↓ |
34 lines elided |
↑ open up ↑ |
323 318 mov %r10d,($dat,$idx,4)
324 319 mov %r11d,($dat,$ido,4)
325 320 add \$1,$ido#b
326 321 jnc .Lw2ndloop
327 322 
328 323 / Exit code
329 324 xor %eax,%eax
330 325 mov %eax,-8($dat)
331 326 mov %eax,-4($dat)
332 327 
333 - /* EXPORT DELETE END */
334 328 ret
335 329 SET_SIZE(arcfour_key_init)
336 330 .asciz "RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
337 331 #endif /* !lint && !__lint */
338 332 ___
339 333
340 334 $code =~ s/#([bwd])/$1/gm;
341 335
342 336 print $code;
343 337
344 338 close STDOUT;
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX