02ff96
--- a/configure	2023-03-01 08:14:25.954898388 +0200
02ff96
+++ b/configure	2023-03-01 08:24:45.239014676 +0200
02ff96
@@ -115,6 +115,8 @@
e6944d
       echo '    [--static] [--64] [--libdir=LIBDIR] [--sharedlibdir=LIBDIR]' | tee -a configure.log
e6944d
       echo '    [--includedir=INCLUDEDIR] [--archs="-arch i386 -arch x86_64"]' | tee -a configure.log
e6944d
       echo '    [--dfltcc]' | tee -a configure.log
02ff96
+      echo '    [--enable-sse-slide]' | tee -a configure.log
02ff96
+      echo '    [--enable-avx2-slide]' | tee -a configure.log
e6944d
         exit 0 ;;
e6944d
     -p*=* | --prefix=*) prefix=`echo $1 | sed 's/.*=//'`; shift ;;
e6944d
     -e*=* | --eprefix=*) exec_prefix=`echo $1 | sed 's/.*=//'`; shift ;;
02ff96
@@ -144,6 +146,18 @@
e6944d
 	    PIC_OBJC="$PIC_OBJC dfltcc.lo"
e6944d
       shift
e6944d
       ;; 
02ff96
+    --enable-sse-slide) # opt in to the SSE hash-slide routine from slide_sse.c
e6944d
+	    CFLAGS="$CFLAGS -DUSE_SSE_SLIDE" # deflate.c tests this macro to reference slide_hash_sse
e6944d
+	    OBJC="$OBJC slide_sse.o" # extra object, built by the slide_sse.o rule in Makefile.in
e6944d
+	    PIC_OBJC="$PIC_OBJC slide_sse.lo" # matching -DPIC object (slide_sse.lo rule)
e6944d
+      shift
e6944d
+      ;; 
02ff96
+    --enable-avx2-slide) # opt in to the AVX2 hash-slide routine from slide_avx2.c
02ff96
+	    CFLAGS="$CFLAGS -DUSE_AVX2_SLIDE" # deflate.c tests this macro to reference slide_hash_avx2
02ff96
+	    OBJC="$OBJC slide_avx2.o" # extra object, built by the slide_avx2.o rule in Makefile.in
02ff96
+	    PIC_OBJC="$PIC_OBJC slide_avx2.lo" # matching -DPIC object (slide_avx2.lo rule)
02ff96
+      shift
02ff96
+      ;; 
e6944d
     *)
e6944d
       echo "unknown option: $1" | tee -a configure.log
e6944d
       echo "$0 --help for help" | tee -a configure.log
02ff96
--- a/Makefile.in	2023-03-01 08:14:25.950898032 +0200
02ff96
+++ b/Makefile.in	2023-03-01 08:28:07.734042879 +0200
02ff96
@@ -151,6 +151,22 @@
e6944d
 	$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/dfltcc.o $(SRCDIR)contrib/s390/dfltcc.c
e6944d
 	-@mv objs/dfltcc.o $@
e6944d
 
e6944d
+slide_sse.o: $(SRCDIR)slide_sse.c $(SRCDIR)deflate.h $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h # SSE2 slide object; headers listed so deflate_state changes trigger a rebuild
e6944d
+	$(CC) $(CFLAGS) $(ZINC) -msse2 -c -o $@ $(SRCDIR)slide_sse.c
e6944d
+
e6944d
+slide_sse.lo: $(SRCDIR)slide_sse.c $(SRCDIR)deflate.h $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h # -DPIC variant: compiled into objs/ and then moved onto the .lo name
e6944d
+	-@mkdir objs 2>/dev/null || test -d objs
e6944d
+	$(CC) $(SFLAGS) $(ZINC) -DPIC -msse2 -c -o objs/slide_sse.o $(SRCDIR)slide_sse.c
e6944d
+	-@mv objs/slide_sse.o $@
e6944d
+
02ff96
+slide_avx2.o: $(SRCDIR)slide_avx2.c $(SRCDIR)deflate.h $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h # AVX2 slide object; headers listed so deflate_state changes trigger a rebuild
02ff96
+	$(CC) $(CFLAGS) $(ZINC) -mavx2 -c -o $@ $(SRCDIR)slide_avx2.c
02ff96
+
02ff96
+slide_avx2.lo: $(SRCDIR)slide_avx2.c $(SRCDIR)deflate.h $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h # -DPIC variant: compiled into objs/ and then moved onto the .lo name
02ff96
+	-@mkdir objs 2>/dev/null || test -d objs
02ff96
+	$(CC) $(SFLAGS) $(ZINC) -DPIC -mavx2 -c -o objs/slide_avx2.o $(SRCDIR)slide_avx2.c
02ff96
+	-@mv objs/slide_avx2.o $@
02ff96
+
e6944d
 example.o: $(SRCDIR)test/example.c $(SRCDIR)zlib.h zconf.h
e6944d
 	$(CC) $(CFLAGS) $(ZINCOUT) -c -o $@ $(SRCDIR)test/example.c
e6944d
 
02ff96
--- a/deflate.c	2023-03-01 14:04:13.871373364 +0200
02ff96
+++ b/deflate.c	2023-03-01 14:15:35.345276070 +0200
02ff96
@@ -90,6 +90,13 @@
e6944d
 
e6944d
 local int deflateStateCheck      OF((z_streamp strm));
e6944d
 local void slide_hash     OF((deflate_state *s));
e6944d
+local void slide_hash_c     OF((deflate_state *s)); /* scalar C slide; this is the routine previously named slide_hash */
02ff96
+#ifdef USE_SSE_SLIDE
02ff96
+extern void slide_hash_sse(deflate_state *s); /* defined in slide_sse.c */
02ff96
+#endif
02ff96
+#ifdef USE_AVX2_SLIDE
02ff96
+extern void slide_hash_avx2(deflate_state *s); /* defined in slide_avx2.c */
02ff96
+#endif
e6944d
 local void fill_window    OF((deflate_state *s));
e6944d
 local block_state deflate_stored OF((deflate_state *s, int flush));
e6944d
 local block_state deflate_fast   OF((deflate_state *s, int flush));
02ff96
@@ -212,7 +219,7 @@
e6944d
  * bit values at the expense of memory usage). We slide even when level == 0 to
e6944d
  * keep the hash table consistent if we switch back to level > 0 later.
e6944d
  */
e6944d
-local void slide_hash(s)
e6944d
+local void slide_hash_c(s)
e6944d
     deflate_state *s;
e6944d
 {
e6944d
     unsigned n, m;
02ff96
@@ -238,6 +245,17 @@
e6944d
 #endif
e6944d
 }
e6944d
 
e6944d
+local void slide_hash(deflate_state *s) /* slide the hash tables of s once, using exactly one implementation */
e6944d
+{
e6944d
+#if defined(USE_AVX2_SLIDE) /* AVX2 takes precedence when both SIMD variants are configured */
e6944d
+	slide_hash_avx2(s);
e6944d
+#elif defined(USE_SSE_SLIDE) /* otherwise the SSE2 variant */
02ff96
+	slide_hash_sse(s);
02ff96
+#else /* no SIMD variant configured: scalar C routine */
02ff96
+	slide_hash_c(s); /* must not also run after a SIMD slide: that would subtract w_size a second time */
02ff96
+#endif
e6944d
+}
e6944d
+
e6944d
 /* ========================================================================= */
e6944d
 int ZEXPORT deflateInit_(strm, level, version, stream_size)
e6944d
     z_streamp strm;
e6944d
--- /dev/null
e6944d
+++ b/slide_sse.c
02ff96
@@ -0,0 +1,47 @@
e6944d
+/*
e6944d
+ * SSE optimized hash slide
e6944d
+ *
e6944d
+ * Copyright (C) 2017 Intel Corporation
e6944d
+ * Authors:
e6944d
+ *   Arjan van de Ven	<arjan@linux.intel.com>
e6944d
+ *   Jim Kukunas	<james.t.kukunas@linux.intel.com>
e6944d
+ *
e6944d
+ * For conditions of distribution and use, see copyright notice in zlib.h
e6944d
+ */
e6944d
+#include "deflate.h"
e6944d
+#include <immintrin.h>
e6944d
+
e6944d
+void slide_hash_sse(deflate_state *s) /* subtract w_size from every head[] entry (and prev[] unless FASTEST), clamping at 0 */
e6944d
+{
e6944d
+    unsigned n; /* entries still to process in the current table */
e6944d
+    Posf *p; /* start of the 8-entry group being processed */
e6944d
+    uInt wsize = s->w_size; /* only read in the prev[] pass below */
e6944d
+    z_const __m128i xmm_wsize = _mm_set1_epi16(s->w_size); /* w_size replicated into each 16-bit lane */
e6944d
+
e6944d
+    n = s->hash_size;
e6944d
+    p = &s->head[n] - 8; /* begin with the last 8 entries and walk towards head[0] */
e6944d
+    do {
e6944d
+        __m128i value, result;
e6944d
+
e6944d
+	value = _mm_loadu_si128((__m128i *)p); /* unaligned 128-bit load; assumes Posf is 16 bits so this is 8 entries - TODO confirm */
e6944d
+	result= _mm_subs_epu16(value, xmm_wsize); /* unsigned saturating subtract: lanes below w_size become 0 */
e6944d
+	_mm_storeu_si128((__m128i *)p, result);
e6944d
+	p -= 8;
e6944d
+	n -= 8;
e6944d
+    } while (n > 0); /* n is unsigned: ends cleanly only if hash_size is a non-zero multiple of 8 */
e6944d
+
e6944d
+#ifndef FASTEST
e6944d
+    n = wsize;
e6944d
+    p = &s->prev[n] - 8; /* same walk over prev[], from its last 8 entries downwards */
e6944d
+    do {
e6944d
+        __m128i value, result;
e6944d
+
e6944d
+	value = _mm_loadu_si128((__m128i *)p);
e6944d
+	result= _mm_subs_epu16(value, xmm_wsize); /* unsigned saturating subtract, as in the head[] pass */
e6944d
+	_mm_storeu_si128((__m128i *)p, result);
e6944d
+
e6944d
+	p -= 8;
e6944d
+	n -= 8;
e6944d
+    } while (n > 0); /* ends cleanly only if w_size is a non-zero multiple of 8 */
e6944d
+#endif
e6944d
+}
02ff96
--- /dev/null
02ff96
+++ b/slide_avx2.c
02ff96
@@ -0,0 +1,44 @@
02ff96
+/*
02ff96
+ * AVX2 optimized hash slide
02ff96
+ *
02ff96
+ * Copyright (C) 2020 Intel Corporation
02ff96
+ *
02ff96
+ * For conditions of distribution and use, see copyright notice in zlib.h
02ff96
+ */
02ff96
+#include "deflate.h"
02ff96
+#include <immintrin.h>
02ff96
+
02ff96
+void slide_hash_avx2(deflate_state *s) /* subtract w_size from every head[] entry (and prev[] unless FASTEST), clamping at 0 */
02ff96
+{
02ff96
+    unsigned n; /* entries still to process in the current table */
02ff96
+    Posf *p; /* start of the 16-entry group being processed */
02ff96
+    uInt wsize = s->w_size; /* only read in the prev[] pass below */
02ff96
+    z_const __m256i ymm_wsize = _mm256_set1_epi16(s->w_size); /* w_size replicated into each 16-bit lane */
02ff96
+
02ff96
+    n = s->hash_size;
02ff96
+    p = &s->head[n] - 16; /* begin with the last 16 entries and walk towards head[0] */
02ff96
+    do {
02ff96
+        __m256i value, result;
e6944d
+
02ff96
+        value = _mm256_loadu_si256((__m256i *)p); /* unaligned 256-bit load; assumes Posf is 16 bits so this is 16 entries - TODO confirm */
02ff96
+        result= _mm256_subs_epu16(value, ymm_wsize); /* unsigned saturating subtract: lanes below w_size become 0 */
02ff96
+        _mm256_storeu_si256((__m256i *)p, result);
02ff96
+        p -= 16;
02ff96
+        n -= 16;
02ff96
+    } while (n > 0); /* n is unsigned: ends cleanly only if hash_size is a non-zero multiple of 16 */
02ff96
+
02ff96
+#ifndef FASTEST
02ff96
+    n = wsize;
02ff96
+    p = &s->prev[n] - 16; /* same walk over prev[], from its last 16 entries downwards */
02ff96
+    do {
02ff96
+        __m256i value, result;
e6944d
+
02ff96
+        value = _mm256_loadu_si256((__m256i *)p);
02ff96
+        result= _mm256_subs_epu16(value, ymm_wsize); /* unsigned saturating subtract, as in the head[] pass */
02ff96
+        _mm256_storeu_si256((__m256i *)p, result);
02ff96
+
02ff96
+        p -= 16;
02ff96
+        n -= 16;
02ff96
+    } while (n > 0); /* ends cleanly only if w_size is a non-zero multiple of 16 */
02ff96
+#endif
02ff96
+}