/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

/*
 * Syntax:  GNU as, AT&T operand order (op src, dst), run through the C
 *          preprocessor first.
 * Calling: every entry point takes its arguments in %rdi, %rsi, %rdx
 *          (and %rcx), as listed in the per-function headers, and
 *          restores %rbx, %rbp and %r12 before returning.
 */

.file "blowfish-x86_64-asm.S"
.text

/*
 * structure of crypto context
 *
 * p is an array of (16 + 2) 32-bit round keys at offset 0; s0..s3 are four
 * consecutive tables of 256 32-bit words each that follow it.
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	((16 + 2 + (1 * 256)) * 4)
#define s2	((16 + 2 + (2 * 256)) * 4)
#define s3	((16 + 2 + (3 * 256)) * 4)

/*
 * register macros
 *
 * RX0..RX3 hold one 64-bit block each.  They must be registers that have
 * both a low-byte (bl) and a high-byte (bh) alias, because F() pastes those
 * suffixes onto the register macro name.
 */
#define CTX	%rdi
#define RIO	%rsi

#define RX0	%rax
#define RX1	%rbx
#define RX2	%rcx
#define RX3	%rdx

#define RX0d	%eax
#define RX1d	%ebx
#define RX2d	%ecx
#define RX3d	%edx

#define RX0bl	%al
#define RX1bl	%bl
#define RX2bl	%cl
#define RX3bl	%dl

#define RX0bh	%ah
#define RX1bh	%bh
#define RX2bh	%ch
#define RX3bh	%dh

/*
 * Table-index temporaries used by F().  RT1 is the same register as RIO
 * (%rsi), so RIO is destroyed by every F(); the functions below keep the
 * dst pointer on the stack while the rounds run.
 */
#define RT0	%rbp
#define RT1	%rsi

#define RT0d	%ebp
#define RT1d	%esi

/* per-block F() results */
#define RK0	%r8
#define RK1	%r9
#define RK2	%r10
#define RK3	%r11

#define RK0d	%r8d
#define RK1d	%r9d
#define RK2d	%r10d
#define RK3d	%r11d

/* preloaded 64-bit round-key pair for the 4-way code */
#define RKEY	%r12

/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/

/*
 * F(x, k): one Feistel step on the 64-bit block register x.
 *
 * With a, b, c, d being the four bytes of the low 32 bits of x (a most
 * significant), computes
 *	k = ((s0[a] + s1[b]) ^ s2[c]) + s3[d]		(32-bit arithmetic)
 * then swaps the two 32-bit halves of x and xors k into the new low half.
 * The 32-bit writes to k leave its upper half zero, so the 64-bit xor only
 * touches the low half of x.
 *
 * Clobbers: RT0, RT1 (== RIO), k, flags.
 */
#define F(x, k) \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT1d; \
	rolq $16,		x; \
	movl s0(CTX,RT0,4),	k ## d; \
	addl s1(CTX,RT1,4),	k ## d; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT1d; \
	rolq $32,		x; \
	xorl s2(CTX,RT0,4),	k ## d; \
	addl s3(CTX,RT1,4),	k ## d; \
	xorq k,			x;

/* xor round keys p[n] (low half) and p[n + 1] (high half) into RX0 */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX), 	RX0;

/* two Feistel steps using round keys p[n] and p[n + 1] */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(RX0, RK0); \
	F(RX0, RK0);

/* not referenced in this file; kept for any external user of the macro */
#define round_final_enc(n) \
	xorq p+4*(n)(CTX), 	RX0;

/*
 * xor round keys p[n] (low half) and p[n - 1] (high half) into RX0; the
 * 64-bit load at p[n - 1] is rotated so the key order is reversed.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/* two Feistel steps using round keys p[n] and p[n - 1] */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(RX0, RK0); \
	F(RX0, RK0);

/* load 8 bytes from (RIO): low half of RX0 = bytes 0..3 read big-endian */
#define read_block() \
	movq (RIO), 		RX0; \
	rorq $32, 		RX0; \
	bswapq 			RX0;

/* store RX0 to (RIO), high half first, each half big-endian */
#define write_block() \
	bswapq 			RX0; \
	movq RX0, 		(RIO);

/* same byte order as write_block(), but xored into the 8 bytes at (RIO) */
#define xor_block() \
	bswapq 			RX0; \
	xorq RX0, 		(RIO);

/*
 * void __blowfish_enc_blk(ctx, dst, src, bool xor)
 *
 * Encrypts the 8-byte block at src with the context at ctx.  The result is
 * stored to dst when the low byte of the xor argument is zero, otherwise it
 * is xored into the 8 bytes already at dst.
 *
 * Clobbers: %rax, %rsi, %r8, flags.
 */
.align 8
.global __blowfish_enc_blk
.type   __blowfish_enc_blk,@function;

__blowfish_enc_blk:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	pushq %rbp;
	pushq %rbx;

	/* dst and the xor flag must survive F(), which clobbers %rsi/%rbp */
	pushq %rsi;
	pushq %rcx;
	movq %rdx, RIO;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	popq %rbp;		/* xor flag */
	popq RIO;		/* dst */

	test %bpl, %bpl;
	jnz .L__enc_xor;

	write_block();

.L__enc_ret:
	popq %rbx;
	popq %rbp;

	ret;

.L__enc_xor:
	xor_block();

	jmp .L__enc_ret;

.size __blowfish_enc_blk, . - __blowfish_enc_blk

/*
 * void blowfish_dec_blk(ctx, dst, src)
 *
 * Decrypts the 8-byte block at src with the context at ctx and stores the
 * result to dst.
 *
 * Clobbers: %rax, %rsi, %r8, flags.
 */
.align 8
.global blowfish_dec_blk
.type   blowfish_dec_blk,@function;

blowfish_dec_blk:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	pushq %rbp;
	pushq %rbx;

	/* dst must survive F(), which clobbers %rsi */
	pushq %rsi;
	movq %rdx, RIO;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	popq RIO;		/* dst */
	write_block();

	popq %rbx;
	popq %rbp;

	ret;

.size blowfish_dec_blk, . - blowfish_dec_blk

/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/* xor the round-key pair held in RKEY into all four blocks */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/* RKEY = p[n] (low half), p[n + 1] (high half) */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/* apply the already-loaded key pair, then fetch the pair for round n + 2 */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc(n + 2);

/* two Feistel steps on each of the four blocks */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F(RX0, RK0); \
	F(RX1, RK1); \
	F(RX2, RK2); \
	F(RX3, RK3); \
	\
	F(RX0, RK0); \
	F(RX1, RK1); \
	F(RX2, RK2); \
	F(RX3, RK3);

/* RKEY = p[n] (low half), p[n - 1] (high half) */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/* apply the already-loaded key pair, then fetch the pair for round n - 2 */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec(n - 2);

/* two Feistel steps on each of the four blocks */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F(RX0, RK0); \
	F(RX1, RK1); \
	F(RX2, RK2); \
	F(RX3, RK3); \
	\
	F(RX0, RK0); \
	F(RX1, RK1); \
	F(RX2, RK2); \
	F(RX3, RK3);

/* read_block() for four consecutive 8-byte blocks starting at (RIO) */
#define read_block4() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq 			RX0; \
	\
	movq 8(RIO),		RX1; \
	rorq $32,		RX1; \
	bswapq 			RX1; \
	\
	movq 16(RIO),		RX2; \
	rorq $32,		RX2; \
	bswapq 			RX2; \
	\
	movq 24(RIO),		RX3; \
	rorq $32,		RX3; \
	bswapq 			RX3;

/* write_block() for four consecutive 8-byte blocks starting at (RIO) */
#define write_block4() \
	bswapq 			RX0; \
	movq RX0,		(RIO); \
	\
	bswapq 			RX1; \
	movq RX1,		8(RIO); \
	\
	bswapq 			RX2; \
	movq RX2,		16(RIO); \
	\
	bswapq 			RX3; \
	movq RX3,		24(RIO);

/* xor_block() for four consecutive 8-byte blocks starting at (RIO) */
#define xor_block4() \
	bswapq 			RX0; \
	xorq RX0,		(RIO); \
	\
	bswapq 			RX1; \
	xorq RX1,		8(RIO); \
	\
	bswapq 			RX2; \
	xorq RX2,		16(RIO); \
	\
	bswapq 			RX3; \
	xorq RX3,		24(RIO);

/*
 * void __blowfish_enc_blk_4way(ctx, dst, src, bool xor)
 *
 * Encrypts the four consecutive 8-byte blocks (32 bytes) at src with the
 * context at ctx.  The results are stored to dst when the low byte of the
 * xor argument is zero, otherwise they are xored into the 32 bytes already
 * at dst.
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.
 */
.align 8
.global __blowfish_enc_blk_4way
.type   __blowfish_enc_blk_4way,@function;

__blowfish_enc_blk_4way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	pushq %rbp;
	pushq %rbx;
	pushq RKEY;
	preload_roundkey_enc(0);

	/*
	 * dst and the xor flag must survive the rounds: F() clobbers
	 * %rsi/%rbp and %rcx becomes block register RX2.
	 */
	pushq %rsi;
	pushq %rcx;
	movq %rdx, RIO;

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();

	popq %rbp;		/* xor flag */
	popq RIO;		/* dst */

	test %bpl, %bpl;
	jnz .L__enc_xor4;

	write_block4();

.L__enc_ret4:
	popq RKEY;
	popq %rbx;
	popq %rbp;

	ret;

.L__enc_xor4:
	xor_block4();

	jmp .L__enc_ret4;

.size __blowfish_enc_blk_4way, . - __blowfish_enc_blk_4way

/*
 * void blowfish_dec_blk_4way(ctx, dst, src)
 *
 * Decrypts the four consecutive 8-byte blocks (32 bytes) at src with the
 * context at ctx and stores the results to dst.
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.
 */
.align 8
.global blowfish_dec_blk_4way
.type   blowfish_dec_blk_4way,@function;

blowfish_dec_blk_4way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	pushq %rbp;
	pushq %rbx;
	pushq RKEY;
	preload_roundkey_dec(17);

	/* dst must survive F(), which clobbers %rsi */
	pushq %rsi;
	movq %rdx, RIO;

	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();

	popq RIO;		/* dst */
	write_block4();

	popq RKEY;
	popq %rbx;
	popq %rbp;

	ret;

.size blowfish_dec_blk_4way, . - blowfish_dec_blk_4way