mirror of
https://github.com/caddyserver/caddy.git
synced 2026-05-30 18:45:18 -04:00
update to quic-go v0.10.0 (#2288)
quic-go now vendors all of its dependencies, so we don't need to vendor them here. Created by running: gvt delete github.com/lucas-clemente/quic-go gvt delete github.com/bifurcation/mint gvt delete github.com/lucas-clemente/aes12 gvt delete github.com/lucas-clemente/fnv128a gvt delete github.com/lucas-clemente/quic-go-certificates gvt delete github.com/aead/chacha20 gvt delete github.com/hashicorp/golang-lru gvt fetch -tag v0.10.0-no-integrationtests github.com/lucas-clemente/quic-go
This commit is contained in:
committed by
Matt Holt
parent
9edc16e4d6
commit
dfbc2e81e3
+22
-1
@@ -9,6 +9,7 @@ package chacha // import "github.com/aead/chacha20/chacha"
|
||||
import (
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"math"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -28,6 +29,7 @@ const (
|
||||
var (
|
||||
useSSE2 bool
|
||||
useSSSE3 bool
|
||||
useAVX bool
|
||||
useAVX2 bool
|
||||
)
|
||||
|
||||
@@ -55,7 +57,7 @@ func setup(state *[64]byte, nonce, key []byte) (err error) {
|
||||
|
||||
copy(hNonce[:], nonce[:16])
|
||||
copy(tmpKey[:], key)
|
||||
hChaCha20(&tmpKey, &hNonce, &tmpKey)
|
||||
HChaCha20(&tmpKey, &hNonce, &tmpKey)
|
||||
copy(Nonce[8:], nonce[16:])
|
||||
initialize(state, tmpKey[:], &Nonce)
|
||||
|
||||
@@ -161,6 +163,21 @@ func (c *Cipher) XORKeyStream(dst, src []byte) {
|
||||
c.off = 0
|
||||
}
|
||||
|
||||
// check for counter overflow
|
||||
blocksToXOR := len(src) / 64
|
||||
if len(src)%64 != 0 {
|
||||
blocksToXOR++
|
||||
}
|
||||
var overflow bool
|
||||
if c.noncesize == INonceSize {
|
||||
overflow = binary.LittleEndian.Uint32(c.state[48:]) > math.MaxUint32-uint32(blocksToXOR)
|
||||
} else {
|
||||
overflow = binary.LittleEndian.Uint64(c.state[48:]) > math.MaxUint64-uint64(blocksToXOR)
|
||||
}
|
||||
if overflow {
|
||||
panic("chacha20/chacha: counter overflow")
|
||||
}
|
||||
|
||||
c.off += xorKeyStream(dst, src, &(c.block), &(c.state), c.rounds)
|
||||
}
|
||||
|
||||
@@ -174,3 +191,7 @@ func (c *Cipher) SetCounter(ctr uint64) {
|
||||
}
|
||||
c.off = 0
|
||||
}
|
||||
|
||||
// HChaCha20 generates 32 pseudo-random bytes from a 128 bit nonce and a 256 bit secret key.
|
||||
// It can be used as a key-derivation-function (KDF).
|
||||
func HChaCha20(out *[32]byte, nonce *[16]byte, key *[32]byte) { hChaCha20(out, nonce, key) }
|
||||
|
||||
+49
-185
@@ -2,111 +2,10 @@
|
||||
// Use of this source code is governed by a license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
// +build go1.7,amd64,!gccgo,!appengine,!nacl
|
||||
// +build amd64,!gccgo,!appengine,!nacl
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
DATA ·sigma_AVX<>+0x00(SB)/4, $0x61707865
|
||||
DATA ·sigma_AVX<>+0x04(SB)/4, $0x3320646e
|
||||
DATA ·sigma_AVX<>+0x08(SB)/4, $0x79622d32
|
||||
DATA ·sigma_AVX<>+0x0C(SB)/4, $0x6b206574
|
||||
GLOBL ·sigma_AVX<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
DATA ·one_AVX<>+0x00(SB)/8, $1
|
||||
DATA ·one_AVX<>+0x08(SB)/8, $0
|
||||
GLOBL ·one_AVX<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
DATA ·one_AVX2<>+0x00(SB)/8, $0
|
||||
DATA ·one_AVX2<>+0x08(SB)/8, $0
|
||||
DATA ·one_AVX2<>+0x10(SB)/8, $1
|
||||
DATA ·one_AVX2<>+0x18(SB)/8, $0
|
||||
GLOBL ·one_AVX2<>(SB), (NOPTR+RODATA), $32
|
||||
|
||||
DATA ·two_AVX2<>+0x00(SB)/8, $2
|
||||
DATA ·two_AVX2<>+0x08(SB)/8, $0
|
||||
DATA ·two_AVX2<>+0x10(SB)/8, $2
|
||||
DATA ·two_AVX2<>+0x18(SB)/8, $0
|
||||
GLOBL ·two_AVX2<>(SB), (NOPTR+RODATA), $32
|
||||
|
||||
DATA ·rol16_AVX2<>+0x00(SB)/8, $0x0504070601000302
|
||||
DATA ·rol16_AVX2<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
|
||||
DATA ·rol16_AVX2<>+0x10(SB)/8, $0x0504070601000302
|
||||
DATA ·rol16_AVX2<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
|
||||
GLOBL ·rol16_AVX2<>(SB), (NOPTR+RODATA), $32
|
||||
|
||||
DATA ·rol8_AVX2<>+0x00(SB)/8, $0x0605040702010003
|
||||
DATA ·rol8_AVX2<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
|
||||
DATA ·rol8_AVX2<>+0x10(SB)/8, $0x0605040702010003
|
||||
DATA ·rol8_AVX2<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
|
||||
GLOBL ·rol8_AVX2<>(SB), (NOPTR+RODATA), $32
|
||||
|
||||
#define ROTL(n, t, v) \
|
||||
VPSLLD $n, v, t; \
|
||||
VPSRLD $(32-n), v, v; \
|
||||
VPXOR v, t, v
|
||||
|
||||
#define CHACHA_QROUND(v0, v1, v2, v3, t, c16, c8) \
|
||||
VPADDD v0, v1, v0; \
|
||||
VPXOR v3, v0, v3; \
|
||||
VPSHUFB c16, v3, v3; \
|
||||
VPADDD v2, v3, v2; \
|
||||
VPXOR v1, v2, v1; \
|
||||
ROTL(12, t, v1); \
|
||||
VPADDD v0, v1, v0; \
|
||||
VPXOR v3, v0, v3; \
|
||||
VPSHUFB c8, v3, v3; \
|
||||
VPADDD v2, v3, v2; \
|
||||
VPXOR v1, v2, v1; \
|
||||
ROTL(7, t, v1)
|
||||
|
||||
#define CHACHA_SHUFFLE(v1, v2, v3) \
|
||||
VPSHUFD $0x39, v1, v1; \
|
||||
VPSHUFD $0x4E, v2, v2; \
|
||||
VPSHUFD $-109, v3, v3
|
||||
|
||||
#define XOR_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \
|
||||
VMOVDQU (0+off)(src), t0; \
|
||||
VPERM2I128 $32, v1, v0, t1; \
|
||||
VPXOR t0, t1, t0; \
|
||||
VMOVDQU t0, (0+off)(dst); \
|
||||
VMOVDQU (32+off)(src), t0; \
|
||||
VPERM2I128 $32, v3, v2, t1; \
|
||||
VPXOR t0, t1, t0; \
|
||||
VMOVDQU t0, (32+off)(dst); \
|
||||
VMOVDQU (64+off)(src), t0; \
|
||||
VPERM2I128 $49, v1, v0, t1; \
|
||||
VPXOR t0, t1, t0; \
|
||||
VMOVDQU t0, (64+off)(dst); \
|
||||
VMOVDQU (96+off)(src), t0; \
|
||||
VPERM2I128 $49, v3, v2, t1; \
|
||||
VPXOR t0, t1, t0; \
|
||||
VMOVDQU t0, (96+off)(dst)
|
||||
|
||||
#define XOR_UPPER_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \
|
||||
VMOVDQU (0+off)(src), t0; \
|
||||
VPERM2I128 $32, v1, v0, t1; \
|
||||
VPXOR t0, t1, t0; \
|
||||
VMOVDQU t0, (0+off)(dst); \
|
||||
VMOVDQU (32+off)(src), t0; \
|
||||
VPERM2I128 $32, v3, v2, t1; \
|
||||
VPXOR t0, t1, t0; \
|
||||
VMOVDQU t0, (32+off)(dst); \
|
||||
|
||||
#define EXTRACT_LOWER(dst, v0, v1, v2, v3, t0) \
|
||||
VPERM2I128 $49, v1, v0, t0; \
|
||||
VMOVDQU t0, 0(dst); \
|
||||
VPERM2I128 $49, v3, v2, t0; \
|
||||
VMOVDQU t0, 32(dst)
|
||||
|
||||
#define XOR_AVX(dst, src, off, v0, v1, v2, v3, t0) \
|
||||
VPXOR 0+off(src), v0, t0; \
|
||||
VMOVDQU t0, 0+off(dst); \
|
||||
VPXOR 16+off(src), v1, t0; \
|
||||
VMOVDQU t0, 16+off(dst); \
|
||||
VPXOR 32+off(src), v2, t0; \
|
||||
VMOVDQU t0, 32+off(dst); \
|
||||
VPXOR 48+off(src), v3, t0; \
|
||||
VMOVDQU t0, 48+off(dst)
|
||||
#include "const.s"
|
||||
#include "macro.s"
|
||||
|
||||
#define TWO 0(SP)
|
||||
#define C16 32(SP)
|
||||
@@ -122,10 +21,10 @@ GLOBL ·rol8_AVX2<>(SB), (NOPTR+RODATA), $32
|
||||
TEXT ·xorKeyStreamAVX2(SB), 4, $320-80
|
||||
MOVQ dst_base+0(FP), DI
|
||||
MOVQ src_base+24(FP), SI
|
||||
MOVQ src_len+32(FP), CX
|
||||
MOVQ block+48(FP), BX
|
||||
MOVQ state+56(FP), AX
|
||||
MOVQ rounds+64(FP), DX
|
||||
MOVQ src_len+32(FP), CX
|
||||
|
||||
MOVQ SP, R8
|
||||
ADDQ $32, SP
|
||||
@@ -185,28 +84,28 @@ at_least_512:
|
||||
|
||||
chacha_loop_512:
|
||||
VMOVDQA Y8, TMP_0
|
||||
CHACHA_QROUND(Y0, Y1, Y2, Y3, Y8, C16, C8)
|
||||
CHACHA_QROUND(Y4, Y5, Y6, Y7, Y8, C16, C8)
|
||||
CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y8, C16, C8)
|
||||
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y8, C16, C8)
|
||||
VMOVDQA TMP_0, Y8
|
||||
VMOVDQA Y0, TMP_0
|
||||
CHACHA_QROUND(Y8, Y9, Y10, Y11, Y0, C16, C8)
|
||||
CHACHA_QROUND(Y12, Y13, Y14, Y15, Y0, C16, C8)
|
||||
CHACHA_SHUFFLE(Y1, Y2, Y3)
|
||||
CHACHA_SHUFFLE(Y5, Y6, Y7)
|
||||
CHACHA_SHUFFLE(Y9, Y10, Y11)
|
||||
CHACHA_SHUFFLE(Y13, Y14, Y15)
|
||||
CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y0, C16, C8)
|
||||
CHACHA_QROUND_AVX(Y12, Y13, Y14, Y15, Y0, C16, C8)
|
||||
CHACHA_SHUFFLE_AVX(Y1, Y2, Y3)
|
||||
CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
|
||||
CHACHA_SHUFFLE_AVX(Y9, Y10, Y11)
|
||||
CHACHA_SHUFFLE_AVX(Y13, Y14, Y15)
|
||||
|
||||
CHACHA_QROUND(Y12, Y13, Y14, Y15, Y0, C16, C8)
|
||||
CHACHA_QROUND(Y8, Y9, Y10, Y11, Y0, C16, C8)
|
||||
CHACHA_QROUND_AVX(Y12, Y13, Y14, Y15, Y0, C16, C8)
|
||||
CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y0, C16, C8)
|
||||
VMOVDQA TMP_0, Y0
|
||||
VMOVDQA Y8, TMP_0
|
||||
CHACHA_QROUND(Y4, Y5, Y6, Y7, Y8, C16, C8)
|
||||
CHACHA_QROUND(Y0, Y1, Y2, Y3, Y8, C16, C8)
|
||||
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y8, C16, C8)
|
||||
CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y8, C16, C8)
|
||||
VMOVDQA TMP_0, Y8
|
||||
CHACHA_SHUFFLE(Y3, Y2, Y1)
|
||||
CHACHA_SHUFFLE(Y7, Y6, Y5)
|
||||
CHACHA_SHUFFLE(Y11, Y10, Y9)
|
||||
CHACHA_SHUFFLE(Y15, Y14, Y13)
|
||||
CHACHA_SHUFFLE_AVX(Y3, Y2, Y1)
|
||||
CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
|
||||
CHACHA_SHUFFLE_AVX(Y11, Y10, Y9)
|
||||
CHACHA_SHUFFLE_AVX(Y15, Y14, Y13)
|
||||
SUBQ $2, R9
|
||||
JA chacha_loop_512
|
||||
|
||||
@@ -289,18 +188,18 @@ between_320_and_448:
|
||||
MOVQ DX, R9
|
||||
|
||||
chacha_loop_384:
|
||||
CHACHA_QROUND(Y0, Y1, Y2, Y3, Y13, Y14, Y15)
|
||||
CHACHA_QROUND(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
|
||||
CHACHA_QROUND(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
|
||||
CHACHA_SHUFFLE(Y1, Y2, Y3)
|
||||
CHACHA_SHUFFLE(Y5, Y6, Y7)
|
||||
CHACHA_SHUFFLE(Y9, Y10, Y11)
|
||||
CHACHA_QROUND(Y0, Y1, Y2, Y3, Y13, Y14, Y15)
|
||||
CHACHA_QROUND(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
|
||||
CHACHA_QROUND(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
|
||||
CHACHA_SHUFFLE(Y3, Y2, Y1)
|
||||
CHACHA_SHUFFLE(Y7, Y6, Y5)
|
||||
CHACHA_SHUFFLE(Y11, Y10, Y9)
|
||||
CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y13, Y14, Y15)
|
||||
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
|
||||
CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
|
||||
CHACHA_SHUFFLE_AVX(Y1, Y2, Y3)
|
||||
CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
|
||||
CHACHA_SHUFFLE_AVX(Y9, Y10, Y11)
|
||||
CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y13, Y14, Y15)
|
||||
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
|
||||
CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
|
||||
CHACHA_SHUFFLE_AVX(Y3, Y2, Y1)
|
||||
CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
|
||||
CHACHA_SHUFFLE_AVX(Y11, Y10, Y9)
|
||||
SUBQ $2, R9
|
||||
JA chacha_loop_384
|
||||
|
||||
@@ -361,14 +260,14 @@ between_192_and_320:
|
||||
MOVQ DX, R9
|
||||
|
||||
chacha_loop_256:
|
||||
CHACHA_QROUND(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
|
||||
CHACHA_QROUND(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
|
||||
CHACHA_SHUFFLE(Y5, Y6, Y7)
|
||||
CHACHA_SHUFFLE(Y9, Y10, Y11)
|
||||
CHACHA_QROUND(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
|
||||
CHACHA_QROUND(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
|
||||
CHACHA_SHUFFLE(Y7, Y6, Y5)
|
||||
CHACHA_SHUFFLE(Y11, Y10, Y9)
|
||||
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
|
||||
CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
|
||||
CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
|
||||
CHACHA_SHUFFLE_AVX(Y9, Y10, Y11)
|
||||
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
|
||||
CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
|
||||
CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
|
||||
CHACHA_SHUFFLE_AVX(Y11, Y10, Y9)
|
||||
SUBQ $2, R9
|
||||
JA chacha_loop_256
|
||||
|
||||
@@ -413,10 +312,10 @@ between_64_and_192:
|
||||
MOVQ DX, R9
|
||||
|
||||
chacha_loop_128:
|
||||
CHACHA_QROUND(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
|
||||
CHACHA_SHUFFLE(Y5, Y6, Y7)
|
||||
CHACHA_QROUND(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
|
||||
CHACHA_SHUFFLE(Y7, Y6, Y5)
|
||||
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
|
||||
CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
|
||||
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
|
||||
CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
|
||||
SUBQ $2, R9
|
||||
JA chacha_loop_128
|
||||
|
||||
@@ -455,10 +354,10 @@ between_0_and_64:
|
||||
MOVQ DX, R9
|
||||
|
||||
chacha_loop_64:
|
||||
CHACHA_QROUND(X4, X5, X6, X7, X13, X14, X15)
|
||||
CHACHA_SHUFFLE(X5, X6, X7)
|
||||
CHACHA_QROUND(X4, X5, X6, X7, X13, X14, X15)
|
||||
CHACHA_SHUFFLE(X7, X6, X5)
|
||||
CHACHA_QROUND_AVX(X4, X5, X6, X7, X13, X14, X15)
|
||||
CHACHA_SHUFFLE_AVX(X5, X6, X7)
|
||||
CHACHA_QROUND_AVX(X4, X5, X6, X7, X13, X14, X15)
|
||||
CHACHA_SHUFFLE_AVX(X7, X6, X5)
|
||||
SUBQ $2, R9
|
||||
JA chacha_loop_64
|
||||
|
||||
@@ -466,7 +365,7 @@ chacha_loop_64:
|
||||
VPADDD X1, X5, X5
|
||||
VPADDD X2, X6, X6
|
||||
VPADDD X3, X7, X7
|
||||
VMOVDQU ·one_AVX<>(SB), X0
|
||||
VMOVDQU ·one<>(SB), X0
|
||||
VPADDQ X0, X3, X3
|
||||
|
||||
CMPQ CX, $64
|
||||
@@ -505,38 +404,3 @@ done:
|
||||
MOVQ CX, ret+72(FP)
|
||||
RET
|
||||
|
||||
// func hChaCha20AVX(out *[32]byte, nonce *[16]byte, key *[32]byte)
|
||||
TEXT ·hChaCha20AVX(SB), 4, $0-24
|
||||
MOVQ out+0(FP), DI
|
||||
MOVQ nonce+8(FP), AX
|
||||
MOVQ key+16(FP), BX
|
||||
|
||||
VMOVDQU ·sigma_AVX<>(SB), X0
|
||||
VMOVDQU 0(BX), X1
|
||||
VMOVDQU 16(BX), X2
|
||||
VMOVDQU 0(AX), X3
|
||||
VMOVDQU ·rol16_AVX2<>(SB), X5
|
||||
VMOVDQU ·rol8_AVX2<>(SB), X6
|
||||
|
||||
MOVQ $20, CX
|
||||
|
||||
chacha_loop:
|
||||
CHACHA_QROUND(X0, X1, X2, X3, X4, X5, X6)
|
||||
CHACHA_SHUFFLE(X1, X2, X3)
|
||||
CHACHA_QROUND(X0, X1, X2, X3, X4, X5, X6)
|
||||
CHACHA_SHUFFLE(X3, X2, X1)
|
||||
SUBQ $2, CX
|
||||
JNZ chacha_loop
|
||||
|
||||
VMOVDQU X0, 0(DI)
|
||||
VMOVDQU X3, 16(DI)
|
||||
VZEROUPPER
|
||||
RET
|
||||
|
||||
// func supportsAVX2() bool
|
||||
TEXT ·supportsAVX2(SB), 4, $0-1
|
||||
MOVQ runtime·support_avx(SB), AX
|
||||
MOVQ runtime·support_avx2(SB), BX
|
||||
ANDQ AX, BX
|
||||
MOVB BX, ret+0(FP)
|
||||
RET
|
||||
|
||||
+15
-22
@@ -6,11 +6,16 @@
|
||||
|
||||
package chacha
|
||||
|
||||
import "encoding/binary"
|
||||
import (
|
||||
"encoding/binary"
|
||||
|
||||
"golang.org/x/sys/cpu"
|
||||
)
|
||||
|
||||
func init() {
|
||||
useSSE2 = supportsSSE2()
|
||||
useSSSE3 = supportsSSSE3()
|
||||
useSSE2 = cpu.X86.HasSSE2
|
||||
useSSSE3 = cpu.X86.HasSSSE3
|
||||
useAVX = false
|
||||
useAVX2 = false
|
||||
}
|
||||
|
||||
@@ -23,14 +28,6 @@ func initialize(state *[64]byte, key []byte, nonce *[16]byte) {
|
||||
copy(state[48:], nonce[:])
|
||||
}
|
||||
|
||||
// This function is implemented in chacha_386.s
|
||||
//go:noescape
|
||||
func supportsSSE2() bool
|
||||
|
||||
// This function is implemented in chacha_386.s
|
||||
//go:noescape
|
||||
func supportsSSSE3() bool
|
||||
|
||||
// This function is implemented in chacha_386.s
|
||||
//go:noescape
|
||||
func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte)
|
||||
@@ -43,25 +40,21 @@ func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte)
|
||||
//go:noescape
|
||||
func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int
|
||||
|
||||
// This function is implemented in chacha_386.s
|
||||
//go:noescape
|
||||
func xorKeyStreamSSSE3(dst, src []byte, block, state *[64]byte, rounds int) int
|
||||
|
||||
func hChaCha20(out *[32]byte, nonce *[16]byte, key *[32]byte) {
|
||||
if useSSSE3 {
|
||||
switch {
|
||||
case useSSSE3:
|
||||
hChaCha20SSSE3(out, nonce, key)
|
||||
} else if useSSE2 {
|
||||
case useSSE2:
|
||||
hChaCha20SSE2(out, nonce, key)
|
||||
} else {
|
||||
default:
|
||||
hChaCha20Generic(out, nonce, key)
|
||||
}
|
||||
}
|
||||
|
||||
func xorKeyStream(dst, src []byte, block, state *[64]byte, rounds int) int {
|
||||
if useSSSE3 {
|
||||
return xorKeyStreamSSSE3(dst, src, block, state, rounds)
|
||||
} else if useSSE2 {
|
||||
if useSSE2 {
|
||||
return xorKeyStreamSSE2(dst, src, block, state, rounds)
|
||||
} else {
|
||||
return xorKeyStreamGeneric(dst, src, block, state, rounds)
|
||||
}
|
||||
return xorKeyStreamGeneric(dst, src, block, state, rounds)
|
||||
}
|
||||
|
||||
+130
-278
@@ -4,126 +4,125 @@
|
||||
|
||||
// +build 386,!gccgo,!appengine,!nacl
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
DATA ·sigma<>+0x00(SB)/4, $0x61707865
|
||||
DATA ·sigma<>+0x04(SB)/4, $0x3320646e
|
||||
DATA ·sigma<>+0x08(SB)/4, $0x79622d32
|
||||
DATA ·sigma<>+0x0C(SB)/4, $0x6b206574
|
||||
GLOBL ·sigma<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
DATA ·one<>+0x00(SB)/8, $1
|
||||
DATA ·one<>+0x08(SB)/8, $0
|
||||
GLOBL ·one<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
|
||||
DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
|
||||
GLOBL ·rol16<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
|
||||
DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
|
||||
GLOBL ·rol8<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
#define ROTL_SSE2(n, t, v) \
|
||||
MOVO v, t; \
|
||||
PSLLL $n, t; \
|
||||
PSRLL $(32-n), v; \
|
||||
PXOR t, v
|
||||
|
||||
#define CHACHA_QROUND_SSE2(v0, v1, v2, v3, t0) \
|
||||
PADDL v1, v0; \
|
||||
PXOR v0, v3; \
|
||||
ROTL_SSE2(16, t0, v3); \
|
||||
PADDL v3, v2; \
|
||||
PXOR v2, v1; \
|
||||
ROTL_SSE2(12, t0, v1); \
|
||||
PADDL v1, v0; \
|
||||
PXOR v0, v3; \
|
||||
ROTL_SSE2(8, t0, v3); \
|
||||
PADDL v3, v2; \
|
||||
PXOR v2, v1; \
|
||||
ROTL_SSE2(7, t0, v1)
|
||||
|
||||
#define CHACHA_QROUND_SSSE3(v0, v1, v2, v3, t0, r16, r8) \
|
||||
PADDL v1, v0; \
|
||||
PXOR v0, v3; \
|
||||
PSHUFB r16, v3; \
|
||||
PADDL v3, v2; \
|
||||
PXOR v2, v1; \
|
||||
ROTL_SSE2(12, t0, v1); \
|
||||
PADDL v1, v0; \
|
||||
PXOR v0, v3; \
|
||||
PSHUFB r8, v3; \
|
||||
PADDL v3, v2; \
|
||||
PXOR v2, v1; \
|
||||
ROTL_SSE2(7, t0, v1)
|
||||
|
||||
#define CHACHA_SHUFFLE(v1, v2, v3) \
|
||||
PSHUFL $0x39, v1, v1; \
|
||||
PSHUFL $0x4E, v2, v2; \
|
||||
PSHUFL $0x93, v3, v3
|
||||
|
||||
#define XOR(dst, src, off, v0, v1, v2, v3, t0) \
|
||||
MOVOU 0+off(src), t0; \
|
||||
PXOR v0, t0; \
|
||||
MOVOU t0, 0+off(dst); \
|
||||
MOVOU 16+off(src), t0; \
|
||||
PXOR v1, t0; \
|
||||
MOVOU t0, 16+off(dst); \
|
||||
MOVOU 32+off(src), t0; \
|
||||
PXOR v2, t0; \
|
||||
MOVOU t0, 32+off(dst); \
|
||||
MOVOU 48+off(src), t0; \
|
||||
PXOR v3, t0; \
|
||||
MOVOU t0, 48+off(dst)
|
||||
#include "const.s"
|
||||
#include "macro.s"
|
||||
|
||||
// FINALIZE xors len bytes from src and block using
|
||||
// the temp. registers t0 and t1 and writes the result
|
||||
// to dst.
|
||||
#define FINALIZE(dst, src, block, len, t0, t1) \
|
||||
XORL t0, t0; \
|
||||
XORL t1, t1; \
|
||||
finalize: \
|
||||
MOVB 0(src), t0; \
|
||||
MOVB 0(block), t1; \
|
||||
XORL t0, t1; \
|
||||
MOVB t1, 0(dst); \
|
||||
INCL src; \
|
||||
INCL block; \
|
||||
INCL dst; \
|
||||
DECL len; \
|
||||
JA finalize \
|
||||
XORL t0, t0; \
|
||||
XORL t1, t1; \
|
||||
FINALIZE_LOOP:; \
|
||||
MOVB 0(src), t0; \
|
||||
MOVB 0(block), t1; \
|
||||
XORL t0, t1; \
|
||||
MOVB t1, 0(dst); \
|
||||
INCL src; \
|
||||
INCL block; \
|
||||
INCL dst; \
|
||||
DECL len; \
|
||||
JG FINALIZE_LOOP \
|
||||
|
||||
#define Dst DI
|
||||
#define Nonce AX
|
||||
#define Key BX
|
||||
#define Rounds DX
|
||||
|
||||
// func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte)
|
||||
TEXT ·hChaCha20SSE2(SB), 4, $0-12
|
||||
MOVL out+0(FP), Dst
|
||||
MOVL nonce+4(FP), Nonce
|
||||
MOVL key+8(FP), Key
|
||||
|
||||
MOVOU ·sigma<>(SB), X0
|
||||
MOVOU 0*16(Key), X1
|
||||
MOVOU 1*16(Key), X2
|
||||
MOVOU 0*16(Nonce), X3
|
||||
MOVL $20, Rounds
|
||||
|
||||
chacha_loop:
|
||||
CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
|
||||
CHACHA_SHUFFLE_SSE(X1, X2, X3)
|
||||
CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
|
||||
CHACHA_SHUFFLE_SSE(X3, X2, X1)
|
||||
SUBL $2, Rounds
|
||||
JNZ chacha_loop
|
||||
|
||||
MOVOU X0, 0*16(Dst)
|
||||
MOVOU X3, 1*16(Dst)
|
||||
RET
|
||||
|
||||
// func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte)
|
||||
TEXT ·hChaCha20SSSE3(SB), 4, $0-12
|
||||
MOVL out+0(FP), Dst
|
||||
MOVL nonce+4(FP), Nonce
|
||||
MOVL key+8(FP), Key
|
||||
|
||||
MOVOU ·sigma<>(SB), X0
|
||||
MOVOU 0*16(Key), X1
|
||||
MOVOU 1*16(Key), X2
|
||||
MOVOU 0*16(Nonce), X3
|
||||
MOVL $20, Rounds
|
||||
|
||||
MOVOU ·rol16<>(SB), X5
|
||||
MOVOU ·rol8<>(SB), X6
|
||||
|
||||
chacha_loop:
|
||||
CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
|
||||
CHACHA_SHUFFLE_SSE(X1, X2, X3)
|
||||
CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
|
||||
CHACHA_SHUFFLE_SSE(X3, X2, X1)
|
||||
SUBL $2, Rounds
|
||||
JNZ chacha_loop
|
||||
|
||||
MOVOU X0, 0*16(Dst)
|
||||
MOVOU X3, 1*16(Dst)
|
||||
RET
|
||||
|
||||
#undef Dst
|
||||
#undef Nonce
|
||||
#undef Key
|
||||
#undef Rounds
|
||||
|
||||
#define State AX
|
||||
#define Dst DI
|
||||
#define Src SI
|
||||
#define Len DX
|
||||
#define Tmp0 BX
|
||||
#define Tmp1 BP
|
||||
|
||||
// func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int
|
||||
TEXT ·xorKeyStreamSSE2(SB), 4, $0-40
|
||||
MOVL dst_base+0(FP), DI
|
||||
MOVL src_base+12(FP), SI
|
||||
MOVL src_len+16(FP), CX
|
||||
MOVL state+28(FP), AX
|
||||
MOVL rounds+32(FP), DX
|
||||
MOVL dst_base+0(FP), Dst
|
||||
MOVL src_base+12(FP), Src
|
||||
MOVL state+28(FP), State
|
||||
MOVL src_len+16(FP), Len
|
||||
MOVL $0, ret+36(FP) // Number of bytes written to the keystream buffer - 0 iff len mod 64 == 0
|
||||
|
||||
MOVOU 0(AX), X0
|
||||
MOVOU 16(AX), X1
|
||||
MOVOU 32(AX), X2
|
||||
MOVOU 48(AX), X3
|
||||
MOVOU 0*16(State), X0
|
||||
MOVOU 1*16(State), X1
|
||||
MOVOU 2*16(State), X2
|
||||
MOVOU 3*16(State), X3
|
||||
TESTL Len, Len
|
||||
JZ DONE
|
||||
|
||||
TESTL CX, CX
|
||||
JZ done
|
||||
|
||||
at_least_64:
|
||||
GENERATE_KEYSTREAM:
|
||||
MOVO X0, X4
|
||||
MOVO X1, X5
|
||||
MOVO X2, X6
|
||||
MOVO X3, X7
|
||||
MOVL rounds+32(FP), Tmp0
|
||||
|
||||
MOVL DX, BX
|
||||
|
||||
chacha_loop:
|
||||
CHACHA_LOOP:
|
||||
CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
|
||||
CHACHA_SHUFFLE(X5, X6, X7)
|
||||
CHACHA_SHUFFLE_SSE(X5, X6, X7)
|
||||
CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
|
||||
CHACHA_SHUFFLE(X7, X6, X5)
|
||||
SUBL $2, BX
|
||||
JA chacha_loop
|
||||
CHACHA_SHUFFLE_SSE(X7, X6, X5)
|
||||
SUBL $2, Tmp0
|
||||
JA CHACHA_LOOP
|
||||
|
||||
MOVOU 0(AX), X0
|
||||
MOVOU 0*16(State), X0 // Restore X0 from state
|
||||
PADDL X0, X4
|
||||
PADDL X1, X5
|
||||
PADDL X2, X6
|
||||
@@ -131,181 +130,34 @@ chacha_loop:
|
||||
MOVOU ·one<>(SB), X0
|
||||
PADDQ X0, X3
|
||||
|
||||
CMPL CX, $64
|
||||
JB less_than_64
|
||||
CMPL Len, $64
|
||||
JL BUFFER_KEYSTREAM
|
||||
|
||||
XOR(DI, SI, 0, X4, X5, X6, X7, X0)
|
||||
MOVOU 0(AX), X0
|
||||
ADDL $64, SI
|
||||
ADDL $64, DI
|
||||
SUBL $64, CX
|
||||
JNZ at_least_64
|
||||
XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X0)
|
||||
MOVOU 0*16(State), X0 // Restore X0 from state
|
||||
ADDL $64, Src
|
||||
ADDL $64, Dst
|
||||
SUBL $64, Len
|
||||
JZ DONE
|
||||
JMP GENERATE_KEYSTREAM // There is at least one more plaintext byte
|
||||
|
||||
less_than_64:
|
||||
MOVL CX, BP
|
||||
TESTL BP, BP
|
||||
JZ done
|
||||
BUFFER_KEYSTREAM:
|
||||
MOVL block+24(FP), State
|
||||
MOVOU X4, 0(State)
|
||||
MOVOU X5, 16(State)
|
||||
MOVOU X6, 32(State)
|
||||
MOVOU X7, 48(State)
|
||||
MOVL Len, ret+36(FP) // Number of bytes written to the keystream buffer - 0 < Len < 64
|
||||
FINALIZE(Dst, Src, State, Len, Tmp0, Tmp1)
|
||||
|
||||
MOVL block+24(FP), BX
|
||||
MOVOU X4, 0(BX)
|
||||
MOVOU X5, 16(BX)
|
||||
MOVOU X6, 32(BX)
|
||||
MOVOU X7, 48(BX)
|
||||
FINALIZE(DI, SI, BX, BP, AX, DX)
|
||||
|
||||
done:
|
||||
MOVL state+28(FP), AX
|
||||
MOVOU X3, 48(AX)
|
||||
MOVL CX, ret+36(FP)
|
||||
DONE:
|
||||
MOVL state+28(FP), State
|
||||
MOVOU X3, 3*16(State)
|
||||
RET
|
||||
|
||||
// func xorKeyStreamSSSE3(dst, src []byte, block, state *[64]byte, rounds int) int
|
||||
TEXT ·xorKeyStreamSSSE3(SB), 4, $64-40
|
||||
MOVL dst_base+0(FP), DI
|
||||
MOVL src_base+12(FP), SI
|
||||
MOVL src_len+16(FP), CX
|
||||
MOVL state+28(FP), AX
|
||||
MOVL rounds+32(FP), DX
|
||||
|
||||
MOVOU 48(AX), X3
|
||||
TESTL CX, CX
|
||||
JZ done
|
||||
|
||||
MOVL SP, BP
|
||||
ADDL $16, SP
|
||||
ANDL $-16, SP
|
||||
|
||||
MOVOU ·one<>(SB), X0
|
||||
MOVOU 16(AX), X1
|
||||
MOVOU 32(AX), X2
|
||||
MOVO X0, 0(SP)
|
||||
MOVO X1, 16(SP)
|
||||
MOVO X2, 32(SP)
|
||||
|
||||
MOVOU 0(AX), X0
|
||||
MOVOU ·rol16<>(SB), X1
|
||||
MOVOU ·rol8<>(SB), X2
|
||||
|
||||
at_least_64:
|
||||
MOVO X0, X4
|
||||
MOVO 16(SP), X5
|
||||
MOVO 32(SP), X6
|
||||
MOVO X3, X7
|
||||
|
||||
MOVL DX, BX
|
||||
|
||||
chacha_loop:
|
||||
CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, X1, X2)
|
||||
CHACHA_SHUFFLE(X5, X6, X7)
|
||||
CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, X1, X2)
|
||||
CHACHA_SHUFFLE(X7, X6, X5)
|
||||
SUBL $2, BX
|
||||
JA chacha_loop
|
||||
|
||||
MOVOU 0(AX), X0
|
||||
PADDL X0, X4
|
||||
PADDL 16(SP), X5
|
||||
PADDL 32(SP), X6
|
||||
PADDL X3, X7
|
||||
PADDQ 0(SP), X3
|
||||
|
||||
CMPL CX, $64
|
||||
JB less_than_64
|
||||
|
||||
XOR(DI, SI, 0, X4, X5, X6, X7, X0)
|
||||
MOVOU 0(AX), X0
|
||||
ADDL $64, SI
|
||||
ADDL $64, DI
|
||||
SUBL $64, CX
|
||||
JNZ at_least_64
|
||||
|
||||
less_than_64:
|
||||
MOVL BP, SP
|
||||
MOVL CX, BP
|
||||
TESTL BP, BP
|
||||
JE done
|
||||
|
||||
MOVL block+24(FP), BX
|
||||
MOVOU X4, 0(BX)
|
||||
MOVOU X5, 16(BX)
|
||||
MOVOU X6, 32(BX)
|
||||
MOVOU X7, 48(BX)
|
||||
FINALIZE(DI, SI, BX, BP, AX, DX)
|
||||
|
||||
done:
|
||||
MOVL state+28(FP), AX
|
||||
MOVOU X3, 48(AX)
|
||||
MOVL CX, ret+36(FP)
|
||||
RET
|
||||
|
||||
// func supportsSSE2() bool
|
||||
TEXT ·supportsSSE2(SB), NOSPLIT, $0-1
|
||||
XORL AX, AX
|
||||
INCL AX
|
||||
CPUID
|
||||
SHRL $26, DX
|
||||
ANDL $1, DX
|
||||
MOVB DX, ret+0(FP)
|
||||
RET
|
||||
|
||||
// func supportsSSSE3() bool
|
||||
TEXT ·supportsSSSE3(SB), NOSPLIT, $0-1
|
||||
XORL AX, AX
|
||||
INCL AX
|
||||
CPUID
|
||||
SHRL $9, CX
|
||||
ANDL $1, CX
|
||||
MOVB CX, ret+0(FP)
|
||||
RET
|
||||
|
||||
// func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte)
|
||||
TEXT ·hChaCha20SSE2(SB), 4, $0-12
|
||||
MOVL out+0(FP), DI
|
||||
MOVL nonce+4(FP), AX
|
||||
MOVL key+8(FP), BX
|
||||
|
||||
MOVOU ·sigma<>(SB), X0
|
||||
MOVOU 0(BX), X1
|
||||
MOVOU 16(BX), X2
|
||||
MOVOU 0(AX), X3
|
||||
|
||||
MOVL $20, CX
|
||||
|
||||
chacha_loop:
|
||||
CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
|
||||
CHACHA_SHUFFLE(X1, X2, X3)
|
||||
CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
|
||||
CHACHA_SHUFFLE(X3, X2, X1)
|
||||
SUBL $2, CX
|
||||
JNZ chacha_loop
|
||||
|
||||
MOVOU X0, 0(DI)
|
||||
MOVOU X3, 16(DI)
|
||||
RET
|
||||
|
||||
// func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte)
|
||||
TEXT ·hChaCha20SSSE3(SB), 4, $0-12
|
||||
MOVL out+0(FP), DI
|
||||
MOVL nonce+4(FP), AX
|
||||
MOVL key+8(FP), BX
|
||||
|
||||
MOVOU ·sigma<>(SB), X0
|
||||
MOVOU 0(BX), X1
|
||||
MOVOU 16(BX), X2
|
||||
MOVOU 0(AX), X3
|
||||
MOVOU ·rol16<>(SB), X5
|
||||
MOVOU ·rol8<>(SB), X6
|
||||
|
||||
MOVL $20, CX
|
||||
|
||||
chacha_loop:
|
||||
CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
|
||||
CHACHA_SHUFFLE(X1, X2, X3)
|
||||
CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
|
||||
CHACHA_SHUFFLE(X3, X2, X1)
|
||||
SUBL $2, CX
|
||||
JNZ chacha_loop
|
||||
|
||||
MOVOU X0, 0(DI)
|
||||
MOVOU X3, 16(DI)
|
||||
RET
|
||||
#undef State
|
||||
#undef Dst
|
||||
#undef Src
|
||||
#undef Len
|
||||
#undef Tmp0
|
||||
#undef Tmp1
|
||||
|
||||
Generated
Vendored
+23
-19
@@ -6,24 +6,19 @@
|
||||
|
||||
package chacha
|
||||
|
||||
import "golang.org/x/sys/cpu"
|
||||
|
||||
func init() {
|
||||
useSSE2 = true
|
||||
useSSSE3 = supportsSSSE3()
|
||||
useAVX2 = supportsAVX2()
|
||||
useSSE2 = cpu.X86.HasSSE2
|
||||
useSSSE3 = cpu.X86.HasSSSE3
|
||||
useAVX = cpu.X86.HasAVX
|
||||
useAVX2 = cpu.X86.HasAVX2
|
||||
}
|
||||
|
||||
// This function is implemented in chacha_amd64.s
|
||||
//go:noescape
|
||||
func initialize(state *[64]byte, key []byte, nonce *[16]byte)
|
||||
|
||||
// This function is implemented in chacha_amd64.s
|
||||
//go:noescape
|
||||
func supportsSSSE3() bool
|
||||
|
||||
// This function is implemented in chachaAVX2_amd64.s
|
||||
//go:noescape
|
||||
func supportsAVX2() bool
|
||||
|
||||
// This function is implemented in chacha_amd64.s
|
||||
//go:noescape
|
||||
func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte)
|
||||
@@ -44,29 +39,38 @@ func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int
|
||||
//go:noescape
|
||||
func xorKeyStreamSSSE3(dst, src []byte, block, state *[64]byte, rounds int) int
|
||||
|
||||
// This function is implemented in chacha_amd64.s
|
||||
//go:noescape
|
||||
func xorKeyStreamAVX(dst, src []byte, block, state *[64]byte, rounds int) int
|
||||
|
||||
// This function is implemented in chachaAVX2_amd64.s
|
||||
//go:noescape
|
||||
func xorKeyStreamAVX2(dst, src []byte, block, state *[64]byte, rounds int) int
|
||||
|
||||
func hChaCha20(out *[32]byte, nonce *[16]byte, key *[32]byte) {
|
||||
if useAVX2 {
|
||||
switch {
|
||||
case useAVX:
|
||||
hChaCha20AVX(out, nonce, key)
|
||||
} else if useSSSE3 {
|
||||
case useSSSE3:
|
||||
hChaCha20SSSE3(out, nonce, key)
|
||||
} else if useSSE2 { // on amd64 this is always true - neccessary for testing generic on amd64
|
||||
case useSSE2:
|
||||
hChaCha20SSE2(out, nonce, key)
|
||||
} else {
|
||||
default:
|
||||
hChaCha20Generic(out, nonce, key)
|
||||
}
|
||||
}
|
||||
|
||||
func xorKeyStream(dst, src []byte, block, state *[64]byte, rounds int) int {
|
||||
if useAVX2 {
|
||||
switch {
|
||||
case useAVX2:
|
||||
return xorKeyStreamAVX2(dst, src, block, state, rounds)
|
||||
} else if useSSSE3 {
|
||||
case useAVX:
|
||||
return xorKeyStreamAVX(dst, src, block, state, rounds)
|
||||
case useSSSE3:
|
||||
return xorKeyStreamSSSE3(dst, src, block, state, rounds)
|
||||
} else if useSSE2 { // on amd64 this is always true - neccessary for testing generic on amd64
|
||||
case useSSE2:
|
||||
return xorKeyStreamSSE2(dst, src, block, state, rounds)
|
||||
default:
|
||||
return xorKeyStreamGeneric(dst, src, block, state, rounds)
|
||||
}
|
||||
return xorKeyStreamGeneric(dst, src, block, state, rounds)
|
||||
}
|
||||
+1035
-751
File diff suppressed because it is too large
Load Diff
-56
@@ -1,56 +0,0 @@
|
||||
// Copyright (c) 2017 Andreas Auernhammer. All rights reserved.
|
||||
// Use of this source code is governed by a license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
// +build amd64,!gccgo,!appengine,!nacl,!go1.7
|
||||
|
||||
package chacha
|
||||
|
||||
func init() {
|
||||
useSSE2 = true
|
||||
useSSSE3 = supportsSSSE3()
|
||||
useAVX2 = false
|
||||
}
|
||||
|
||||
// This function is implemented in chacha_amd64.s
|
||||
//go:noescape
|
||||
func initialize(state *[64]byte, key []byte, nonce *[16]byte)
|
||||
|
||||
// This function is implemented in chacha_amd64.s
|
||||
//go:noescape
|
||||
func supportsSSSE3() bool
|
||||
|
||||
// This function is implemented in chacha_amd64.s
|
||||
//go:noescape
|
||||
func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte)
|
||||
|
||||
// This function is implemented in chacha_amd64.s
|
||||
//go:noescape
|
||||
func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte)
|
||||
|
||||
// This function is implemented in chacha_amd64.s
|
||||
//go:noescape
|
||||
func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int
|
||||
|
||||
// This function is implemented in chacha_amd64.s
|
||||
//go:noescape
|
||||
func xorKeyStreamSSSE3(dst, src []byte, block, state *[64]byte, rounds int) int
|
||||
|
||||
func hChaCha20(out *[32]byte, nonce *[16]byte, key *[32]byte) {
|
||||
if useSSSE3 {
|
||||
hChaCha20SSSE3(out, nonce, key)
|
||||
} else if useSSE2 { // on amd64 this is always true - used to test generic on amd64
|
||||
hChaCha20SSE2(out, nonce, key)
|
||||
} else {
|
||||
hChaCha20Generic(out, nonce, key)
|
||||
}
|
||||
}
|
||||
|
||||
func xorKeyStream(dst, src []byte, block, state *[64]byte, rounds int) int {
|
||||
if useSSSE3 {
|
||||
return xorKeyStreamSSSE3(dst, src, block, state, rounds)
|
||||
} else if useSSE2 { // on amd64 this is always true - used to test generic on amd64
|
||||
return xorKeyStreamSSE2(dst, src, block, state, rounds)
|
||||
}
|
||||
return xorKeyStreamGeneric(dst, src, block, state, rounds)
|
||||
}
|
||||
+7
@@ -8,6 +8,13 @@ package chacha
|
||||
|
||||
import "encoding/binary"
|
||||
|
||||
func init() {
|
||||
useSSE2 = false
|
||||
useSSSE3 = false
|
||||
useAVX = false
|
||||
useAVX2 = false
|
||||
}
|
||||
|
||||
func initialize(state *[64]byte, key []byte, nonce *[16]byte) {
|
||||
binary.LittleEndian.PutUint32(state[0:], sigma[0])
|
||||
binary.LittleEndian.PutUint32(state[4:], sigma[1])
|
||||
|
||||
+53
@@ -0,0 +1,53 @@
|
||||
// Copyright (c) 2018 Andreas Auernhammer. All rights reserved.
|
||||
// Use of this source code is governed by a license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
// +build 386,!gccgo,!appengine,!nacl amd64,!gccgo,!appengine,!nacl
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
DATA ·sigma<>+0x00(SB)/4, $0x61707865
|
||||
DATA ·sigma<>+0x04(SB)/4, $0x3320646e
|
||||
DATA ·sigma<>+0x08(SB)/4, $0x79622d32
|
||||
DATA ·sigma<>+0x0C(SB)/4, $0x6b206574
|
||||
GLOBL ·sigma<>(SB), (NOPTR+RODATA), $16 // The 4 ChaCha initialization constants
|
||||
|
||||
// SSE2/SSE3/AVX constants
|
||||
|
||||
DATA ·one<>+0x00(SB)/8, $1
|
||||
DATA ·one<>+0x08(SB)/8, $0
|
||||
GLOBL ·one<>(SB), (NOPTR+RODATA), $16 // The constant 1 as 128 bit value
|
||||
|
||||
DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
|
||||
DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
|
||||
GLOBL ·rol16<>(SB), (NOPTR+RODATA), $16 // The PSHUFB 16 bit left rotate constant
|
||||
|
||||
DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
|
||||
DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
|
||||
GLOBL ·rol8<>(SB), (NOPTR+RODATA), $16 // The PSHUFB 8 bit left rotate constant
|
||||
|
||||
// AVX2 constants
|
||||
|
||||
DATA ·one_AVX2<>+0x00(SB)/8, $0
|
||||
DATA ·one_AVX2<>+0x08(SB)/8, $0
|
||||
DATA ·one_AVX2<>+0x10(SB)/8, $1
|
||||
DATA ·one_AVX2<>+0x18(SB)/8, $0
|
||||
GLOBL ·one_AVX2<>(SB), (NOPTR+RODATA), $32 // The constant 1 as 256 bit value
|
||||
|
||||
DATA ·two_AVX2<>+0x00(SB)/8, $2
|
||||
DATA ·two_AVX2<>+0x08(SB)/8, $0
|
||||
DATA ·two_AVX2<>+0x10(SB)/8, $2
|
||||
DATA ·two_AVX2<>+0x18(SB)/8, $0
|
||||
GLOBL ·two_AVX2<>(SB), (NOPTR+RODATA), $32
|
||||
|
||||
DATA ·rol16_AVX2<>+0x00(SB)/8, $0x0504070601000302
|
||||
DATA ·rol16_AVX2<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
|
||||
DATA ·rol16_AVX2<>+0x10(SB)/8, $0x0504070601000302
|
||||
DATA ·rol16_AVX2<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
|
||||
GLOBL ·rol16_AVX2<>(SB), (NOPTR+RODATA), $32 // The VPSHUFB 16 bit left rotate constant
|
||||
|
||||
DATA ·rol8_AVX2<>+0x00(SB)/8, $0x0605040702010003
|
||||
DATA ·rol8_AVX2<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
|
||||
DATA ·rol8_AVX2<>+0x10(SB)/8, $0x0605040702010003
|
||||
DATA ·rol8_AVX2<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
|
||||
GLOBL ·rol8_AVX2<>(SB), (NOPTR+RODATA), $32 // The VPSHUFB 8 bit left rotate constant
|
||||
+163
@@ -0,0 +1,163 @@
|
||||
// Copyright (c) 2018 Andreas Auernhammer. All rights reserved.
|
||||
// Use of this source code is governed by a license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
// +build 386,!gccgo,!appengine,!nacl amd64,!gccgo,!appengine,!nacl
|
||||
|
||||
// ROTL_SSE rotates all 4 32 bit values of the XMM register v
|
||||
// left by n bits using SSE2 instructions (0 <= n <= 32).
|
||||
// The XMM register t is used as a temp. register.
|
||||
#define ROTL_SSE(n, t, v) \
|
||||
MOVO v, t; \
|
||||
PSLLL $n, t; \
|
||||
PSRLL $(32-n), v; \
|
||||
PXOR t, v
|
||||
|
||||
// ROTL_AVX rotates all 4/8 32 bit values of the AVX/AVX2 register v
|
||||
// left by n bits using AVX/AVX2 instructions (0 <= n <= 32).
|
||||
// The AVX/AVX2 register t is used as a temp. register.
|
||||
#define ROTL_AVX(n, t, v) \
|
||||
VPSLLD $n, v, t; \
|
||||
VPSRLD $(32-n), v, v; \
|
||||
VPXOR v, t, v
|
||||
|
||||
// CHACHA_QROUND_SSE2 performs a ChaCha quarter-round using the
|
||||
// 4 XMM registers v0, v1, v2 and v3. It uses only ROTL_SSE2 for
|
||||
// rotations. The XMM register t is used as a temp. register.
|
||||
#define CHACHA_QROUND_SSE2(v0, v1, v2, v3, t) \
|
||||
PADDL v1, v0; \
|
||||
PXOR v0, v3; \
|
||||
ROTL_SSE(16, t, v3); \
|
||||
PADDL v3, v2; \
|
||||
PXOR v2, v1; \
|
||||
ROTL_SSE(12, t, v1); \
|
||||
PADDL v1, v0; \
|
||||
PXOR v0, v3; \
|
||||
ROTL_SSE(8, t, v3); \
|
||||
PADDL v3, v2; \
|
||||
PXOR v2, v1; \
|
||||
ROTL_SSE(7, t, v1)
|
||||
|
||||
// CHACHA_QROUND_SSSE3 performs a ChaCha quarter-round using the
|
||||
// 4 XMM registers v0, v1, v2 and v3. It uses PSHUFB for 8/16 bit
|
||||
// rotations. The XMM register t is used as a temp. register.
|
||||
//
|
||||
// r16 holds the PSHUFB constant for a 16 bit left rotate.
|
||||
// r8 holds the PSHUFB constant for a 8 bit left rotate.
|
||||
#define CHACHA_QROUND_SSSE3(v0, v1, v2, v3, t, r16, r8) \
|
||||
PADDL v1, v0; \
|
||||
PXOR v0, v3; \
|
||||
PSHUFB r16, v3; \
|
||||
PADDL v3, v2; \
|
||||
PXOR v2, v1; \
|
||||
ROTL_SSE(12, t, v1); \
|
||||
PADDL v1, v0; \
|
||||
PXOR v0, v3; \
|
||||
PSHUFB r8, v3; \
|
||||
PADDL v3, v2; \
|
||||
PXOR v2, v1; \
|
||||
ROTL_SSE(7, t, v1)
|
||||
|
||||
// CHACHA_QROUND_AVX performs a ChaCha quarter-round using the
|
||||
// 4 AVX/AVX2 registers v0, v1, v2 and v3. It uses VPSHUFB for 8/16 bit
|
||||
// rotations. The AVX/AVX2 register t is used as a temp. register.
|
||||
//
|
||||
// r16 holds the VPSHUFB constant for a 16 bit left rotate.
|
||||
// r8 holds the VPSHUFB constant for a 8 bit left rotate.
|
||||
#define CHACHA_QROUND_AVX(v0, v1, v2, v3, t, r16, r8) \
|
||||
VPADDD v0, v1, v0; \
|
||||
VPXOR v3, v0, v3; \
|
||||
VPSHUFB r16, v3, v3; \
|
||||
VPADDD v2, v3, v2; \
|
||||
VPXOR v1, v2, v1; \
|
||||
ROTL_AVX(12, t, v1); \
|
||||
VPADDD v0, v1, v0; \
|
||||
VPXOR v3, v0, v3; \
|
||||
VPSHUFB r8, v3, v3; \
|
||||
VPADDD v2, v3, v2; \
|
||||
VPXOR v1, v2, v1; \
|
||||
ROTL_AVX(7, t, v1)
|
||||
|
||||
// CHACHA_SHUFFLE_SSE performs a ChaCha shuffle using the
|
||||
// 3 XMM registers v1, v2 and v3. The inverse shuffle is
|
||||
// performed by switching v1 and v3: CHACHA_SHUFFLE_SSE(v3, v2, v1).
|
||||
#define CHACHA_SHUFFLE_SSE(v1, v2, v3) \
|
||||
PSHUFL $0x39, v1, v1; \
|
||||
PSHUFL $0x4E, v2, v2; \
|
||||
PSHUFL $0x93, v3, v3
|
||||
|
||||
// CHACHA_SHUFFLE_AVX performs a ChaCha shuffle using the
|
||||
// 3 AVX/AVX2 registers v1, v2 and v3. The inverse shuffle is
|
||||
// performed by switching v1 and v3: CHACHA_SHUFFLE_AVX(v3, v2, v1).
|
||||
#define CHACHA_SHUFFLE_AVX(v1, v2, v3) \
|
||||
VPSHUFD $0x39, v1, v1; \
|
||||
VPSHUFD $0x4E, v2, v2; \
|
||||
VPSHUFD $0x93, v3, v3
|
||||
|
||||
// XOR_SSE extracts 4x16 byte vectors from src at
|
||||
// off, xors all vectors with the corresponding XMM
|
||||
// register (v0 - v3) and writes the result to dst
|
||||
// at off.
|
||||
// The XMM register t is used as a temp. register.
|
||||
#define XOR_SSE(dst, src, off, v0, v1, v2, v3, t) \
|
||||
MOVOU 0+off(src), t; \
|
||||
PXOR v0, t; \
|
||||
MOVOU t, 0+off(dst); \
|
||||
MOVOU 16+off(src), t; \
|
||||
PXOR v1, t; \
|
||||
MOVOU t, 16+off(dst); \
|
||||
MOVOU 32+off(src), t; \
|
||||
PXOR v2, t; \
|
||||
MOVOU t, 32+off(dst); \
|
||||
MOVOU 48+off(src), t; \
|
||||
PXOR v3, t; \
|
||||
MOVOU t, 48+off(dst)
|
||||
|
||||
// XOR_AVX extracts 4x16 byte vectors from src at
|
||||
// off, xors all vectors with the corresponding AVX
|
||||
// register (v0 - v3) and writes the result to dst
|
||||
// at off.
|
||||
// The XMM register t is used as a temp. register.
|
||||
#define XOR_AVX(dst, src, off, v0, v1, v2, v3, t) \
|
||||
VPXOR 0+off(src), v0, t; \
|
||||
VMOVDQU t, 0+off(dst); \
|
||||
VPXOR 16+off(src), v1, t; \
|
||||
VMOVDQU t, 16+off(dst); \
|
||||
VPXOR 32+off(src), v2, t; \
|
||||
VMOVDQU t, 32+off(dst); \
|
||||
VPXOR 48+off(src), v3, t; \
|
||||
VMOVDQU t, 48+off(dst)
|
||||
|
||||
#define XOR_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \
|
||||
VMOVDQU (0+off)(src), t0; \
|
||||
VPERM2I128 $32, v1, v0, t1; \
|
||||
VPXOR t0, t1, t0; \
|
||||
VMOVDQU t0, (0+off)(dst); \
|
||||
VMOVDQU (32+off)(src), t0; \
|
||||
VPERM2I128 $32, v3, v2, t1; \
|
||||
VPXOR t0, t1, t0; \
|
||||
VMOVDQU t0, (32+off)(dst); \
|
||||
VMOVDQU (64+off)(src), t0; \
|
||||
VPERM2I128 $49, v1, v0, t1; \
|
||||
VPXOR t0, t1, t0; \
|
||||
VMOVDQU t0, (64+off)(dst); \
|
||||
VMOVDQU (96+off)(src), t0; \
|
||||
VPERM2I128 $49, v3, v2, t1; \
|
||||
VPXOR t0, t1, t0; \
|
||||
VMOVDQU t0, (96+off)(dst)
|
||||
|
||||
#define XOR_UPPER_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \
|
||||
VMOVDQU (0+off)(src), t0; \
|
||||
VPERM2I128 $32, v1, v0, t1; \
|
||||
VPXOR t0, t1, t0; \
|
||||
VMOVDQU t0, (0+off)(dst); \
|
||||
VMOVDQU (32+off)(src), t0; \
|
||||
VPERM2I128 $32, v3, v2, t1; \
|
||||
VPXOR t0, t1, t0; \
|
||||
VMOVDQU t0, (32+off)(dst); \
|
||||
|
||||
#define EXTRACT_LOWER(dst, v0, v1, v2, v3, t0) \
|
||||
VPERM2I128 $49, v1, v0, t0; \
|
||||
VMOVDQU t0, 0(dst); \
|
||||
VPERM2I128 $49, v3, v2, t0; \
|
||||
VMOVDQU t0, 32(dst)
|
||||
Reference in New Issue
Block a user