Import Skein 1.3 [vendor/skein/1.3, vendor/skein]

Bruce Schneier's hashing algorithm. Used by newer versions of ZFS.
author: Allan Jude <allanjude@FreeBSD.org> 2016-05-27 02:42:46 +0000
committer: Allan Jude <allanjude@FreeBSD.org> 2016-05-27 02:42:46 +0000
commit: 92f76dc624c277a7c731733a4e51997c0e9ad981 (patch)
tree: 8fef288bf5f480e476a789ae0525520eeea04f99
download: src-test-vendor/skein.tar.gz
src-test-vendor/skein.zip
69 files changed, 31494 insertions, 0 deletions
diff --git a/Additional_Implementations/Atmel_AVR.c b/Additional_Implementations/Atmel_AVR.c
new file mode 100644
index 0000000000000..11cfdd8d74f81
--- /dev/null
+++ b/Additional_Implementations/Atmel_AVR.c
@@ -0,0 +1,77 @@
+#include <stdio.h>
+#include "skein.h"
+
+#define   SKEIN_CODE_SIZE (1)       /* instantiate code size routines */
+#define   SKEIN_LOOP    (111)       /* unroll only 8 rounds */
+#define   SKEIN_USE_ASM (512+1024)  /* what to exclude here */
+#include "skein.c"
+#include "skein_block.c"
+
+/* for code size limitations, make "dummy" (empty-bodied) versions of the block functions whose bit is set in SKEIN_USE_ASM, i.e. the unused ones */
+#if SKEIN_USE_ASM & 256
+void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) { }   /* stub: does nothing */
+#endif
+#if SKEIN_USE_ASM & 512
+void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) { }   /* stub: does nothing */
+#endif
+#if SKEIN_USE_ASM & 1024
+void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) { }   /* stub: does nothing */
+#endif
+
+const u08b_t msg[1] = 
+  {
+  0
+  };
+
+/*
+ * Code-size probe and smoke test.
+ *
+ * For each Skein variant that SKEIN_USE_ASM does not exclude: record twice the
+ * values reported by the variant's *_CodeSize() routines plus its unroll count,
+ * hash the one-byte message msg[] into hash[], then call the variant's block
+ * function once directly.  Afterwards print the recorded numbers (those of the
+ * last enabled variant) and the first five hash bytes, each followed by the
+ * running XOR of the bytes printed so far.
+ *
+ * Every variant runs in its own block scope: the three sections each declare a
+ * `ctx` of a different type, which would collide in a shared scope as soon as
+ * more than one variant is enabled.
+ *
+ * The direct *_Process_Block() call is given hash[] as its input block: the
+ * function consumes one full block (up to 1024/8 bytes), which the one-byte
+ * msg[] cannot supply, while hash[] is large enough for every variant and is
+ * only read by the call.
+ *
+ * argc/argv are unused.  Returns 0.
+ */
+int main(int argc,char *argv[])
+    {
+    u08b_t hash[1024/8] = { 0 };    /* largest digest/block; zeroed so the dump below is defined even if no variant is enabled */
+    u08b_t i,x;
+    static size_t aBytes,bBytes,uCount;
+
+    (void) argc;
+    (void) argv;
+
+#if !(SKEIN_USE_ASM & 256)
+        {
+        Skein_256_Ctxt_t ctx;
+
+        /* NOTE(review): the factor 2 presumably converts AVR words to bytes -- confirm */
+        aBytes = 2*Skein_256_API_CodeSize();
+        bBytes = 2*Skein_256_Process_Block_CodeSize();
+        uCount =   Skein_256_Unroll_Cnt();
+
+        Skein_256_Init  (&ctx,256);
+        Skein_256_Update(&ctx,msg,sizeof(msg));
+        Skein_256_Final (&ctx,hash);
+
+        Skein_256_Process_Block(&ctx,hash,1,256);
+        }
+#endif
+
+#if !(SKEIN_USE_ASM & 512)
+        {
+        Skein_512_Ctxt_t ctx;
+
+        aBytes = 2*Skein_512_API_CodeSize();
+        bBytes = 2*Skein_512_Process_Block_CodeSize();
+        uCount =   Skein_512_Unroll_Cnt();
+
+        Skein_512_Init  (&ctx,512);
+        Skein_512_Update(&ctx,msg,sizeof(msg));
+        Skein_512_Final (&ctx,hash);
+
+        Skein_512_Process_Block(&ctx,hash,1,512);
+        }
+#endif
+
+#if !(SKEIN_USE_ASM & 1024)
+        {
+        Skein1024_Ctxt_t ctx;
+
+        aBytes = 2*Skein1024_API_CodeSize();
+        bBytes = 2*Skein1024_Process_Block_CodeSize();
+        uCount =   Skein1024_Unroll_Cnt();
+
+        Skein1024_Init  (&ctx,1024);
+        Skein1024_Update(&ctx,msg,sizeof(msg));
+        Skein1024_Final (&ctx,hash);
+
+        Skein1024_Process_Block(&ctx,hash,1,1024);
+        }
+#endif
+    /* size_t does not match %d; the values are small, so print them as unsigned */
+    printf("API size = %4u bytes. Block size = %4u bytes. Unroll=%u\n",
+           (unsigned) aBytes,(unsigned) bBytes,(unsigned) uCount);
+    for (i=x=0;i<5;i++)
+        {
+        x ^= hash[i];               /* running XOR of the bytes shown so far */
+        printf("hash[%d] = %02X [%02X]\n",i,(unsigned) hash[i],(unsigned) x);
+        }
+    return 0;
+    }
diff --git a/Additional_Implementations/skein_8bit_estimates.xls b/Additional_Implementations/skein_8bit_estimates.xls
new file mode 100644
index 0000000000000..ecc66a28f2054
--- /dev/null
+++ b/Additional_Implementations/skein_8bit_estimates.xls
diff --git a/Additional_Implementations/skein_MSC_v9_perf.txt b/Additional_Implementations/skein_MSC_v9_perf.txt
new file mode 100644
index 0000000000000..9e8f125a45c5b
--- /dev/null
+++ b/Additional_Implementations/skein_MSC_v9_perf.txt
@@ -0,0 +1,129 @@
+File STDIN:
+      1_ ||  2802.00  2814.00  |  5952.00  5952.00  | 30606.00 30606.00  | //: 32-bit, MSC_v9.00 [ C =...]
+     10_ ||   278.40   278.40  |   593.40   593.40  |  3063.00  3063.00  | //: 32-bit, MSC_v9.00 [ C =...]
+    100_ ||    65.52    65.58  |    88.02    88.08  |   306.30   306.30  | //: 32-bit, MSC_v9.00 [ C =...]
+   1000_ ||    41.26    41.41  |    47.96    47.96  |   135.28   135.29  | //: 32-bit, MSC_v9.00 [ C =...]
+  10000_ ||    38.86    39.08  |    44.13    44.21  |   119.88   120.11  | //: 32-bit, MSC_v9.00 [ C =...]
+ 100000_ ||    38.85    39.09  |    43.56    43.77  |   105.79   114.18  | //: 32-bit, MSC_v9.00 [ C =...]
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [ C =...]
+  Block  ||        10192 bytes |        22960 bytes |        53072 bytes | //: 32-bit, MSC_v9.00 [ C =...]
+      1_ ||   780.00   786.00  |  1110.00  1110.00  |  3288.00  3318.00  | //: 64-bit, MSC_v9.00 [ C =...]
+     10_ ||    78.60    79.80  |   109.80   109.80  |   331.20   331.80  | //: 64-bit, MSC_v9.00 [ C =...]
+    100_ ||    16.74    16.80  |    15.54    15.54  |    33.30    33.30  | //: 64-bit, MSC_v9.00 [ C =...]
+   1000_ ||     9.88    10.67  |     7.38     7.38  |    14.16    14.17  | //: 64-bit, MSC_v9.00 [ C =...]
+  10000_ ||     9.21     9.22  |     6.60     6.60  |    12.27    12.39  | //: 64-bit, MSC_v9.00 [ C =...]
+ 100000_ ||     9.98    10.01  |     7.04     7.08  |    12.36    13.14  | //: 64-bit, MSC_v9.00 [ C =...]
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [ C =...]
+  Block  ||         2272 bytes |         4944 bytes |        15264 bytes | //: 64-bit, MSC_v9.00 [ C =...]
+      1_ ||  2484.00  2490.00  |  4830.00  4836.00  | 22182.00 22188.00  | //: 32-bit, MSC_v9.00 [asm=...]
+     10_ ||   250.20   252.00  |   485.40   488.40  |  1936.80  1959.00  | //: 32-bit, MSC_v9.00 [asm=...]
+    100_ ||    58.62    58.68  |    70.74    70.80  |   221.76   221.76  | //: 32-bit, MSC_v9.00 [asm=...]
+   1000_ ||    34.12    34.16  |    35.44    35.44  |    85.27    85.31  | //: 32-bit, MSC_v9.00 [asm=...]
+  10000_ ||    34.78    34.98  |    35.36    35.36  |    86.31    86.35  | //: 32-bit, MSC_v9.00 [asm=...]
+ 100000_ ||    32.96    33.40  |    33.29    33.60  |    75.79    76.81  | //: 32-bit, MSC_v9.00 [asm=...]
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [asm=...]
+  Block  ||         7588 bytes |        16636 bytes |        38262 bytes | //: 32-bit, MSC_v9.00 [asm=...]
+      1_ ||   672.00   672.00  |  1068.00  1068.00  |  1920.00  1926.00  | //: 64-bit, MSC_v9.00 [asm=...]
+     10_ ||    64.80    65.40  |   107.40   108.00  |   192.00   192.60  | //: 64-bit, MSC_v9.00 [asm=...]
+    100_ ||    15.54    15.60  |    16.20    16.26  |    21.06    21.06  | //: 64-bit, MSC_v9.00 [asm=...]
+   1000_ ||     8.18     8.18  |     6.97     6.97  |     7.77     7.78  | //: 64-bit, MSC_v9.00 [asm=...]
+  10000_ ||     7.59     7.59  |     6.23     6.23  |     6.69     6.69  | //: 64-bit, MSC_v9.00 [asm=...]
+ 100000_ ||     7.55     7.71  |     6.14     6.38  |     6.56     6.86  | //: 64-bit, MSC_v9.00 [asm=...]
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [asm=...]
+  Block  ||         2323 bytes |         4733 bytes |        11817 bytes | //: 64-bit, MSC_v9.00 [asm=...]
+      1_ ||  2952.00  2958.00  |  6030.00  6036.00  | 13668.00 13674.00  | //: 32-bit, MSC_v9.00 [ C =111]
+     10_ ||   295.80   295.80  |   603.00   603.60  |  1366.80  1366.80  | //: 32-bit, MSC_v9.00 [ C =111]
+    100_ ||    69.96    70.02  |    88.98    89.04  |   136.92   137.52  | //: 32-bit, MSC_v9.00 [ C =111]
+   1000_ ||    43.90    43.96  |    48.78    48.85  |    60.08    60.11  | //: 32-bit, MSC_v9.00 [ C =111]
+  10000_ ||    41.53    41.59  |    44.76    44.80  |    53.01    53.01  | //: 32-bit, MSC_v9.00 [ C =111]
+ 100000_ ||    41.32    41.60  |    44.52    44.62  |    51.75    51.92  | //: 32-bit, MSC_v9.00 [ C =111]
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [ C =111]
+  Block  ||         1712 bytes |         3664 bytes |         7200 bytes | //: 32-bit, MSC_v9.00 [ C =111]
+      1_ ||   780.00   786.00  |  1422.00  1434.00  |  3810.00  3816.00  | //: 64-bit, MSC_v9.00 [ C =111]
+     10_ ||    75.60    76.20  |   140.40   140.40  |   380.40   381.00  | //: 64-bit, MSC_v9.00 [ C =111]
+    100_ ||    17.16    17.22  |    20.52    21.00  |    38.22    38.28  | //: 64-bit, MSC_v9.00 [ C =111]
+   1000_ ||     9.69     9.69  |    10.42    10.42  |    16.51    16.51  | //: 64-bit, MSC_v9.00 [ C =111]
+  10000_ ||     8.97     8.97  |     9.38     9.38  |    14.38    14.40  | //: 64-bit, MSC_v9.00 [ C =111]
+ 100000_ ||     9.18     9.71  |     9.35     9.49  |    14.79    14.99  | //: 64-bit, MSC_v9.00 [ C =111]
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [ C =111]
+  Block  ||          704 bytes |         1456 bytes |         2976 bytes | //: 64-bit, MSC_v9.00 [ C =111]
+      1_ ||  2580.00  2598.00  |  4842.00  4848.00  | 10578.00 10602.00  | //: 32-bit, MSC_v9.00 [asm=111]
+     10_ ||   259.80   259.80  |   484.20   484.20  |  1059.60  1060.20  | //: 32-bit, MSC_v9.00 [asm=111]
+    100_ ||    57.18    57.24  |    66.42    66.48  |    98.40    98.46  | //: 32-bit, MSC_v9.00 [asm=111]
+   1000_ ||    35.56    35.59  |    35.96    35.96  |    42.79    42.80  | //: 32-bit, MSC_v9.00 [asm=111]
+  10000_ ||    33.69    36.50  |    33.29    33.42  |    37.98    41.34  | //: 32-bit, MSC_v9.00 [asm=111]
+ 100000_ ||    33.96    34.57  |    33.93    35.69  |    38.04    38.20  | //: 32-bit, MSC_v9.00 [asm=111]
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [asm=111]
+  Block  ||         1276 bytes |         2532 bytes |         4983 bytes | //: 32-bit, MSC_v9.00 [asm=111]
+      1_ ||   678.00   678.00  |  1098.00  1098.00  |  2034.00  2040.00  | //: 64-bit, MSC_v9.00 [asm=111]
+     10_ ||    66.60    66.60  |   109.80   109.80  |   204.00   204.00  | //: 64-bit, MSC_v9.00 [asm=111]
+    100_ ||    15.48    16.68  |    16.98    16.98  |    22.38    22.38  | //: 64-bit, MSC_v9.00 [asm=111]
+   1000_ ||     8.45     8.45  |     7.93     7.93  |     8.39     8.39  | //: 64-bit, MSC_v9.00 [asm=111]
+  10000_ ||     7.81     7.81  |     6.50     6.50  |     7.18     7.18  | //: 64-bit, MSC_v9.00 [asm=111]
+ 100000_ ||     8.08     8.09  |     6.40     6.71  |     6.98     7.21  | //: 64-bit, MSC_v9.00 [asm=111]
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [asm=111]
+  Block  ||          664 bytes |         1074 bytes |         2221 bytes | //: 64-bit, MSC_v9.00 [asm=111]
+      1_ ||  2988.00  2994.00  |  6240.00  6246.00  | 13794.00 13800.00  | //: 32-bit, MSC_v9.00 [ C =332]
+     10_ ||   297.60   299.40  |   623.40   624.00  |  1379.40  1380.00  | //: 32-bit, MSC_v9.00 [ C =332]
+    100_ ||    70.26    70.32  |    91.92    91.92  |   138.00   138.06  | //: 32-bit, MSC_v9.00 [ C =332]
+   1000_ ||    44.88    44.89  |    50.20    50.20  |    60.44    60.45  | //: 32-bit, MSC_v9.00 [ C =332]
+  10000_ ||    42.42    42.42  |    46.30    46.31  |    53.29    53.31  | //: 32-bit, MSC_v9.00 [ C =332]
+ 100000_ ||    42.21    42.50  |    43.60    45.77  |    49.55    50.03  | //: 32-bit, MSC_v9.00 [ C =332]
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [ C =332]
+  Block  ||         4560 bytes |         9232 bytes |        12560 bytes | //: 32-bit, MSC_v9.00 [ C =332]
+      1_ ||   780.00   798.00  |  1920.00  1920.00  |  3732.00  3732.00  | //: 64-bit, MSC_v9.00 [ C =332]
+     10_ ||    76.80    76.80  |   189.00   191.40  |   402.60   402.60  | //: 64-bit, MSC_v9.00 [ C =332]
+    100_ ||    17.10    17.16  |    27.66    27.90  |    37.62    37.62  | //: 64-bit, MSC_v9.00 [ C =332]
+   1000_ ||     9.98    10.12  |    14.23    14.25  |    16.13    16.13  | //: 64-bit, MSC_v9.00 [ C =332]
+  10000_ ||     9.27     9.28  |    12.89    12.99  |    13.98    13.98  | //: 64-bit, MSC_v9.00 [ C =332]
+ 100000_ ||     9.32     9.56  |    13.12    13.19  |    14.15    14.23  | //: 64-bit, MSC_v9.00 [ C =332]
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [ C =332]
+  Block  ||         1200 bytes |         2928 bytes |         5008 bytes | //: 64-bit, MSC_v9.00 [ C =332]
+      1_ ||  2598.00  2604.00  |  4866.00  4878.00  | 10614.00 10632.00  | //: 32-bit, MSC_v9.00 [asm=332]
+     10_ ||   260.40   261.00  |   490.20   490.20  |  1067.40  1067.40  | //: 32-bit, MSC_v9.00 [asm=332]
+    100_ ||    60.78    60.78  |    72.00    72.00  |   106.86   106.92  | //: 32-bit, MSC_v9.00 [asm=332]
+   1000_ ||    38.38    38.42  |    39.17    39.19  |    46.49    46.61  | //: 32-bit, MSC_v9.00 [asm=332]
+  10000_ ||    40.98    47.69  |    35.81    35.86  |    40.96    43.93  | //: 32-bit, MSC_v9.00 [asm=332]
+ 100000_ ||    34.46    36.34  |    34.07    37.16  |    39.60    43.18  | //: 32-bit, MSC_v9.00 [asm=332]
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [asm=332]
+  Block  ||         3060 bytes |         6300 bytes |         8835 bytes | //: 32-bit, MSC_v9.00 [asm=332]
+      1_ ||   684.00   690.00  |  1104.00  1104.00  |  2028.00  2034.00  | //: 64-bit, MSC_v9.00 [asm=332]
+     10_ ||    70.80    70.80  |   120.00   120.00  |   219.00   219.00  | //: 64-bit, MSC_v9.00 [asm=332]
+    100_ ||    15.72    15.72  |    16.74    16.74  |    22.20    22.20  | //: 64-bit, MSC_v9.00 [asm=332]
+   1000_ ||     8.42     8.42  |     7.22     7.22  |     8.30     8.30  | //: 64-bit, MSC_v9.00 [asm=332]
+  10000_ ||     7.85     8.51  |     6.58     6.58  |     7.11     7.12  | //: 64-bit, MSC_v9.00 [asm=332]
+ 100000_ ||     7.80     9.43  |     6.90     7.71  |     7.18     8.48  | //: 64-bit, MSC_v9.00 [asm=332]
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [asm=332]
+  Block  ||         1288 bytes |         2182 bytes |         3449 bytes | //: 64-bit, MSC_v9.00 [asm=332]
+      1_ ||  2994.00  2994.00  |  6240.00  6240.00  | 14598.00 14604.00  | //: 32-bit, MSC_v9.00 [ C =335]
+     10_ ||   300.60   301.20  |   624.00   624.60  |  1459.20  1461.00  | //: 32-bit, MSC_v9.00 [ C =335]
+    100_ ||    70.62    70.68  |    91.86    91.92  |   146.10   146.16  | //: 32-bit, MSC_v9.00 [ C =335]
+   1000_ ||    44.65    44.65  |    50.20    50.20  |    62.74    62.76  | //: 32-bit, MSC_v9.00 [ C =335]
+  10000_ ||    42.16    42.42  |    46.31    46.73  |    55.11    55.13  | //: 32-bit, MSC_v9.00 [ C =335]
+ 100000_ ||    40.09    40.55  |    45.76    45.97  |    51.00    53.08  | //: 32-bit, MSC_v9.00 [ C =335]
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [ C =335]
+  Block  ||         4560 bytes |         9232 bytes |        29280 bytes | //: 32-bit, MSC_v9.00 [ C =335]
+      1_ ||   780.00   798.00  |  1890.00  1920.00  |  3498.00  3498.00  | //: 64-bit, MSC_v9.00 [ C =335]
+     10_ ||    77.40    78.00  |   190.80   195.00  |   350.40   379.20  | //: 64-bit, MSC_v9.00 [ C =335]
+    100_ ||    17.10    17.10  |    27.72    28.08  |    35.28    35.28  | //: 64-bit, MSC_v9.00 [ C =335]
+   1000_ ||     9.95    10.00  |    14.23    14.24  |    15.09    15.10  | //: 64-bit, MSC_v9.00 [ C =335]
+  10000_ ||     9.30    10.06  |    12.94    14.10  |    13.07    14.36  | //: 64-bit, MSC_v9.00 [ C =335]
+ 100000_ ||     9.33     9.58  |    13.94    13.95  |    13.24    13.92  | //: 64-bit, MSC_v9.00 [ C =335]
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [ C =335]
+  Block  ||         1200 bytes |         2928 bytes |        10880 bytes | //: 64-bit, MSC_v9.00 [ C =335]
+      1_ ||  2586.00  2592.00  |  4896.00  4902.00  | 10668.00 10668.00  | //: 32-bit, MSC_v9.00 [asm=335]
+     10_ ||   263.40   263.40  |   489.60   489.60  |  1069.20  1069.80  | //: 32-bit, MSC_v9.00 [asm=335]
+    100_ ||    61.08    61.14  |    72.30    72.36  |   107.04   107.10  | //: 32-bit, MSC_v9.00 [asm=335]
+   1000_ ||    35.57    35.57  |    36.11    36.12  |    43.07    43.12  | //: 32-bit, MSC_v9.00 [asm=335]
+  10000_ ||    33.68    34.51  |    33.29    36.32  |    37.91    39.80  | //: 32-bit, MSC_v9.00 [asm=335]
+ 100000_ ||    36.32    36.43  |    35.91    35.98  |    38.02    38.19  | //: 32-bit, MSC_v9.00 [asm=335]
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [asm=335]
+  Block  ||         3060 bytes |         6300 bytes |        20391 bytes | //: 32-bit, MSC_v9.00 [asm=335]
+      1_ ||   684.00   690.00  |  1104.00  1104.00  |  2022.00  2022.00  | //: 64-bit, MSC_v9.00 [asm=335]
+     10_ ||    65.40    65.40  |   109.80   109.80  |   201.60   202.20  | //: 64-bit, MSC_v9.00 [asm=335]
+    100_ ||    15.78    15.78  |    16.80    16.80  |    22.02    22.08  | //: 64-bit, MSC_v9.00 [asm=335]
+   1000_ ||     8.41     8.42  |     7.21     7.22  |     8.24     8.26  | //: 64-bit, MSC_v9.00 [asm=335]
+  10000_ ||     7.84     7.84  |     6.45     6.50  |     7.12     7.12  | //: 64-bit, MSC_v9.00 [asm=335]
+ 100000_ ||     8.11     8.11  |     6.49     6.74  |     6.95     7.26  | //: 64-bit, MSC_v9.00 [asm=335]
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [asm=335]
+  Block  ||         1288 bytes |         2182 bytes |         7133 bytes | //: 64-bit, MSC_v9.00 [asm=335]
diff --git a/Additional_Implementations/skein_block_x64.asm b/Additional_Implementations/skein_block_x64.asm
new file mode 100644
index 0000000000000..b5221ae423ad9
--- /dev/null
+++ b/Additional_Implementations/skein_block_x64.asm
@@ -0,0 +1,1335 @@
+;
+;----------------------------------------------------------------
+; 64-bit x86 assembler code (Microsoft ML64) for Skein block functions
+;
+; Author: Doug Whiting, Hifn
+;
+; This code is released to the public domain.
+;----------------------------------------------------------------
+;
+    .code
+;
+_MASK_ALL_  equ (256+512+1024)      ;all three algorithm bits
+_MAX_FRAME_ equ 240
+; _USE_ASM_ = mask of the block functions (256/512/1024) to assemble from this file, taken from SKEIN_USE_ASM
+;;;;;;;;;;;;;;;;;
+ifndef SKEIN_USE_ASM
+_USE_ASM_        = _MASK_ALL_               ;nothing requested: assemble all three
+elseif SKEIN_USE_ASM and _MASK_ALL_
+_USE_ASM_        = SKEIN_USE_ASM            ;at least one valid bit: use the request as given
+else
+_USE_ASM_        = _MASK_ALL_               ;no valid bit set: fall back to all three
+endif
+;;;;;;;;;;;;;;;;;
+ifndef SKEIN_LOOP                           ;configure loop unrolling
+_SKEIN_LOOP       = 0                       ;default is all fully unrolled
+else
+_SKEIN_LOOP       = SKEIN_LOOP
+endif
+; the unroll counts (0 --> fully unrolled): one decimal digit of SKEIN_LOOP per block size
+SKEIN_UNROLL_256  = (_SKEIN_LOOP / 100) mod 10      ;hundreds digit
+SKEIN_UNROLL_512  = (_SKEIN_LOOP /  10) mod 10      ;tens digit
+SKEIN_UNROLL_1024 = (_SKEIN_LOOP      ) mod 10      ;units digit
+; SKEIN_ASM_UNROLL = mask of the block sizes whose unroll digit is 0, i.e. that are fully unrolled
+SKEIN_ASM_UNROLL  = 0
+  irp _NN_,<256,512,1024>
+    if (SKEIN_UNROLL_&_NN_) eq 0
+SKEIN_ASM_UNROLL  = SKEIN_ASM_UNROLL + _NN_
+    endif
+  endm
+;;;;;;;;;;;;;;;;;
+; round counts: defaults below, else one decimal digit d of SKEIN_ROUNDS per block size with rounds = 8*(((d+5) mod 10)+5), so d=5..9 gives 40..72 and d=0..4 gives 80..112
+ifndef SKEIN_ROUNDS
+ROUNDS_256  =   72
+ROUNDS_512  =   72
+ROUNDS_1024 =   80
+else
+ROUNDS_256  = 8*((((SKEIN_ROUNDS / 100) + 5) mod 10) + 5)   ;hundreds digit
+ROUNDS_512  = 8*((((SKEIN_ROUNDS /  10) + 5) mod 10) + 5)   ;tens digit
+ROUNDS_1024 = 8*((((SKEIN_ROUNDS      ) + 5) mod 10) + 5)   ;units digit
+endif
+; echo the round count of every block size selected in _USE_ASM_ at assembly time (the two %out forms differ only in spacing)
+irp _NN_,<256,512,1024>
+  if _USE_ASM_ and _NN_
+    irp _RR_,<%(ROUNDS_&_NN_)>
+      if _NN_ eq 1024
+%out  +++ SKEIN_ROUNDS_&_NN_ = _RR_
+      else
+%out  +++ SKEIN_ROUNDS_&_NN_  = _RR_
+      endif
+    endm
+  endif
+endm
+;;;;;;;;;;;;;;;;;
+;
+ifndef SKEIN_CODE_SIZE
+ifdef  SKEIN_PERF
+SKEIN_CODE_SIZE equ (1)
+endif
+endif
+;
+;;;;;;;;;;;;;;;;;
+;
+ifndef SKEIN_DEBUG
+_SKEIN_DEBUG      = 0
+else
+_SKEIN_DEBUG      = 1
+endif
+;;;;;;;;;;;;;;;;;
+;
+; define offsets of fields in hash context structure
+;
+HASH_BITS   =   0                   ;# bits of hash output
+BCNT        =   8 + HASH_BITS       ;number of bytes in BUFFER[]
+TWEAK       =   8 + BCNT            ;tweak values[0..1]
+X_VARS      =  16 + TWEAK           ;chaining vars
+;
+;(Note: buffer[] in context structure is NOT needed here :-)
+;
+r08     equ     <r8>
+r09     equ     <r9>
+;
+KW_PARITY   =   01BD11BDAA9FC1A22h  ;overall parity of key schedule words
+FIRST_MASK  =   NOT (1 SHL 62)
+;
+; rotation constants for Skein
+;
+RC_256_0_0  = 14
+RC_256_0_1  = 16
+
+RC_256_1_0  = 52
+RC_256_1_1  = 57
+
+RC_256_2_0  = 23
+RC_256_2_1  = 40
+
+RC_256_3_0  =  5
+RC_256_3_1  = 37
+
+RC_256_4_0  = 25
+RC_256_4_1  = 33
+
+RC_256_5_0  = 46
+RC_256_5_1  = 12
+
+RC_256_6_0  = 58
+RC_256_6_1  = 22
+
+RC_256_7_0  = 32
+RC_256_7_1  = 32
+
+RC_512_0_0  = 46
+RC_512_0_1  = 36
+RC_512_0_2  = 19
+RC_512_0_3  = 37
+
+RC_512_1_0  = 33
+RC_512_1_1  = 27
+RC_512_1_2  = 14
+RC_512_1_3  = 42
+
+RC_512_2_0  = 17
+RC_512_2_1  = 49
+RC_512_2_2  = 36
+RC_512_2_3  = 39
+
+RC_512_3_0  = 44
+RC_512_3_1  =  9
+RC_512_3_2  = 54
+RC_512_3_3  = 56
+
+RC_512_4_0  = 39
+RC_512_4_1  = 30
+RC_512_4_2  = 34
+RC_512_4_3  = 24
+
+RC_512_5_0  = 13
+RC_512_5_1  = 50
+RC_512_5_2  = 10
+RC_512_5_3  = 17
+
+RC_512_6_0  = 25
+RC_512_6_1  = 29
+RC_512_6_2  = 39
+RC_512_6_3  = 43
+
+RC_512_7_0  =  8
+RC_512_7_1  = 35
+RC_512_7_2  = 56
+RC_512_7_3  = 22
+
+RC_1024_0_0 = 24
+RC_1024_0_1 = 13
+RC_1024_0_2 =  8
+RC_1024_0_3 = 47
+RC_1024_0_4 =  8
+RC_1024_0_5 = 17
+RC_1024_0_6 = 22
+RC_1024_0_7 = 37
+
+RC_1024_1_0 = 38
+RC_1024_1_1 = 19
+RC_1024_1_2 = 10
+RC_1024_1_3 = 55
+RC_1024_1_4 = 49
+RC_1024_1_5 = 18
+RC_1024_1_6 = 23
+RC_1024_1_7 = 52
+
+RC_1024_2_0 = 33
+RC_1024_2_1 =  4
+RC_1024_2_2 = 51
+RC_1024_2_3 = 13
+RC_1024_2_4 = 34
+RC_1024_2_5 = 41
+RC_1024_2_6 = 59
+RC_1024_2_7 = 17
+
+RC_1024_3_0 =  5
+RC_1024_3_1 = 20
+RC_1024_3_2 = 48
+RC_1024_3_3 = 41
+RC_1024_3_4 = 47
+RC_1024_3_5 = 28
+RC_1024_3_6 = 16
+RC_1024_3_7 = 25
+
+RC_1024_4_0 = 41
+RC_1024_4_1 =  9
+RC_1024_4_2 = 37
+RC_1024_4_3 = 31
+RC_1024_4_4 = 12
+RC_1024_4_5 = 47
+RC_1024_4_6 = 44
+RC_1024_4_7 = 30
+
+RC_1024_5_0 = 16
+RC_1024_5_1 = 34
+RC_1024_5_2 = 56
+RC_1024_5_3 = 51
+RC_1024_5_4 =  4
+RC_1024_5_5 = 53
+RC_1024_5_6 = 42
+RC_1024_5_7 = 41
+
+RC_1024_6_0 = 31
+RC_1024_6_1 = 44
+RC_1024_6_2 = 47
+RC_1024_6_3 = 46
+RC_1024_6_4 = 19
+RC_1024_6_5 = 42
+RC_1024_6_6 = 44
+RC_1024_6_7 = 25
+
+RC_1024_7_0 =  9
+RC_1024_7_1 = 48
+RC_1024_7_2 = 35
+RC_1024_7_3 = 52
+RC_1024_7_4 = 23
+RC_1024_7_5 = 31
+RC_1024_7_6 = 37
+RC_1024_7_7 = 20
+; MACRO: RotL64 -- rotate a register left by one of the rotation constants defined above
+;  Input:  reg = register to rotate; BLK_SIZE, ROUND_NUM, MIX_NUM pick the constant RC_BlkSize_roundNum_mixNum
+; Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024
+; (no instruction is emitted when the count, reduced mod 64, is zero)
+RotL64 macro reg,BLK_SIZE,ROUND_NUM,MIX_NUM
+_RCNT_ = ( RC_&BLK_SIZE&_&ROUND_NUM&_&MIX_NUM AND 63 )
+  if _RCNT_  ;is there anything to do?
+    rol     reg,_RCNT_
+  endif
+endm
+;
+;----------------------------------------------------------------
+;
+; MACROS: define local vars and configure stack
+;
+;----------------------------------------------------------------
+; declare allocated space on the stack: localName becomes the current offset, then the offset advances by localSize (assembly-time bookkeeping only, no code is emitted)
+StackVar    macro localName,localSize
+localName   =   _STK_OFFS_
+_STK_OFFS_  =   _STK_OFFS_+(localSize)
+endm ;StackVar
+;
+;----------------------------------------------------------------
+; MACRO: Configure stack frame, allocate local vars (procedure prologue: push nonvolatile regs, lay out locals, set rbp, store rcx/rdx/r08/r09, load rdi with the context pointer)
+;   BLK_BITS = block size in bits (WCNT = BLK_BITS/64); KS_CNT sizes ksRot, which is reserved only when BLK_BITS is not fully unrolled
+;   NO_FRAME = when non-blank, .setframe is omitted; debugCnt = when non-blank in a debug build, reserves xDebug_(BLK_BITS) of 8*debugCnt bytes
+Setup_Stack macro BLK_BITS,KS_CNT,NO_FRAME,debugCnt
+    WCNT    =    (BLK_BITS)/64
+;
+_PushCnt_   =   0                   ;save nonvolatile regs on stack
+  irp _reg_,<rbp,rsi,rdi,rbx,r12,r13,r14,r15>
+       push     _reg_
+      .pushreg  _reg_               ;pseudo-op push for exception handling
+_PushCnt_ = _PushCnt_ + 1           ;track count to keep alignment
+  endm
+;
+_STK_OFFS_  =   0                   ;starting offset from rsp
+    ;---- local  variables         ;<-- rsp
+    StackVar    X_stk  ,8*(WCNT)    ;local context vars
+    StackVar    ksTwk  ,8*3         ;key schedule: tweak words
+    StackVar    ksKey  ,8*(WCNT)+8  ;key schedule: key   words
+  if (SKEIN_ASM_UNROLL and (BLK_BITS)) eq 0
+    StackVar    ksRot ,16*(KS_CNT+0);leave space for "rotation" to happen
+  endif
+    StackVar    Wcopy  ,8*(WCNT)    ;copy of input block    
+  if _SKEIN_DEBUG
+  ifnb  <debugCnt>                  ;temp location for debug X[] info
+    StackVar    xDebug_&BLK_BITS ,8*(debugCnt)
+  endif
+  endif
+  if ((8*_PushCnt_ + _STK_OFFS_) and 8) eq 0
+    StackVar    align16,8           ;keep 16-byte aligned (adjust for retAddr?)
+tmpStk_&BLK_BITS = align16          ;use this
+  endif
+LOCAL_SIZE  =   _STK_OFFS_          ;size of local vars
+    ;---- 
+    StackVar    savRegs,8*_PushCnt_ ;saved registers
+    StackVar    retAddr,8           ;return address
+    ;---- caller parameters
+    StackVar    ctxPtr ,8           ;context ptr
+    StackVar    blkPtr ,8           ;pointer to block data
+    StackVar    blkCnt ,8           ;number of full blocks to process
+    StackVar    bitAdd ,8           ;bit count to add to tweak
+    ;---- caller's stack frame
+;
+; set up the stack frame pointer (rbp)
+;
+FRAME_OFFS  =   ksTwk + 128         ;allow short (negative) offset to ksTwk, ksKey
+  if FRAME_OFFS gt _STK_OFFS_       ;keep rbp in the "locals" range
+FRAME_OFFS  =      _STK_OFFS_
+  endif
+  if FRAME_OFFS gt _MAX_FRAME_      ;keep Microsoft .setframe happy
+FRAME_OFFS  =      _MAX_FRAME_
+  endif
+; NOTE(review): Wcopy is always below savRegs, so the second test (lt Wcopy) can never be the first true one; the first two messages look swapped -- confirm
+ifdef SKEIN_ASM_INFO
+  if     FRAME_OFFS+128 lt savRegs
+%out +++ SKEIN_&BLK_BITS: Unable to reach all of Wcopy with short offset from rbp.
+  elseif FRAME_OFFS+128 lt Wcopy
+%out +++ SKEIN_&BLK_BITS: Unable to reach end of Wcopy with short offset from rbp.
+  elseif FRAME_OFFS+128 lt _STK_OFFS_
+%out +++ SKEIN_&BLK_BITS: Unable to reach caller parms with short offset from rbp
+  endif
+endif
+  ;put some useful defines in the .lst file (for grep)
+__STK_LCL_SIZE_&BLK_BITS = LOCAL_SIZE
+__STK_TOT_SIZE_&BLK_BITS = _STK_OFFS_
+__STK_FRM_OFFS_&BLK_BITS = FRAME_OFFS
+;
+; Notes on stack frame setup:
+;   * the most frequently used variable is X_stk[], based at [rsp+0]
+;   * the next most used is the key schedule arrays, ksKey and ksTwk
+;       so rbp is "centered" there, allowing short offsets to the key 
+;       schedule even in 1024-bit Skein case
+;   * the Wcopy variables are infrequently accessed, but they have long 
+;       offsets from both rsp and rbp only in the 1024-bit case.
+;   * all other local vars and calling parameters can be accessed 
+;       with short offsets, except in the 1024-bit case
+;
+    sub     rsp,LOCAL_SIZE          ;make room for the locals
+    .allocstack LOCAL_SIZE          ;pseudo op for exception handling
+    lea     rbp,[rsp+FRAME_OFFS]    ;maximize use of short offsets
+  ifb <NO_FRAME>
+    .setframe rbp,   FRAME_OFFS     ;pseudo op for exception handling
+  endif
+    mov         [FP_+ctxPtr],rcx    ;save caller's parameters on the stack
+    mov         [FP_+blkPtr],rdx
+    mov         [FP_+blkCnt],r08
+    mov         [FP_+bitAdd],r09
+    .endprolog                      ;pseudo op to support exception handling
+
+    mov     rdi,[FP_+ctxPtr ]       ;rdi --> context
+;
+endm ;Setup_Stack
+;
+FP_         equ <rbp-FRAME_OFFS>    ;keep as many short offsets as possible
+;
+;----------------------------------------------------------------
+; MACRO: Reset_Stack -- undo Setup_Stack (drop the locals, pop the saved regs in reverse push order) and report the size of the code that starts at label procStart; no ret is emitted here
+Reset_Stack macro   procStart
+    add     rsp,LOCAL_SIZE          ;get rid of locals (wipe??)
+  irp _reg_,<r15,r14,r13,r12,rbx,rdi,rsi,rbp>
+    pop     _reg_
+_PushCnt_ = _PushCnt_ - 1
+  endm
+  if _PushCnt_                      ;must be back at zero after the pops
+    .err    "Mismatched push/pops?"
+  endif
+
+    ;display code size in bytes to stdout (the three forms differ only in padding)
+  irp  _BCNT_,<%($+1-procStart)>    ;account for return opcode
+_ProcBytes_ = _BCNT_
+if     _BCNT_ ge 10000
+%out procStart code size = _BCNT_ bytes  
+elseif _BCNT_ ge  1000
+%out procStart code size =  _BCNT_ bytes  
+else
+%out procStart code size =   _BCNT_ bytes  
+endif
+  endm ;irp _BCNT_
+endm ; Reset_Stack
+;
+;----------------------------------------------------------------
+; macros to help debug internals
+;
+if _SKEIN_DEBUG
+    extrn   Skein_Show_Block:proc   ;calls to C routines
+    extrn   Skein_Show_Round:proc
+;
+SKEIN_RND_SPECIAL       =   1000
+SKEIN_RND_KEY_INITIAL   =   SKEIN_RND_SPECIAL+0
+SKEIN_RND_KEY_INJECT    =   SKEIN_RND_SPECIAL+1
+SKEIN_RND_FEED_FWD      =   SKEIN_RND_SPECIAL+2
+;
+; MACRO: Skein_Debug_Block BLK_BITS -- dump the block being processed via the external C routine
+;   Saves rax, rcx, rdx and r08..r11, pushes all seven arguments (last argument first) and
+;   also loads the first four into rcx, rdx, r08, r09, calls Skein_Show_Block, then discards
+;   the seven stack slots and restores the saved registers.
+Skein_Debug_Block macro BLK_BITS
+;
+;void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
+;                     const u08b_t *blkPtr, const u64b_t *wPtr, 
+;                     const u64b_t *ksPtr,const u64b_t *tsPtr);
+;
+  irp _reg_,<rax,rcx,rdx,r08,r09,r10,r11>
+    push    _reg_                   ;save all volatile regs on stack before the call
+  endm
+    ; get and push call parameters
+    lea     rax,[FP_+ksTwk]         ;tweak pointer
+    push    rax
+    lea     rax,[FP_+ksKey]         ;key pointer
+    push    rax
+    lea     rax,[FP_+Wcopy]         ;wPtr
+    push    rax
+    mov     r09,[FP_+blkPtr]        ;blkPtr
+    push    r09                     ;(push register parameters anyway to make room on stack)
+    mov     rdx,[FP_+ctxPtr]        
+    lea     r08,[rdx+X_VARS]        ;X (pointer)
+    push    r08
+    push    rdx                     ;h (pointer)
+    mov     rcx, BLK_BITS           ;bits
+    push    rcx                     ;slot of the first parameter holds bits, matching the register
+    call    Skein_Show_Block        ;call external debug handler
+    add     rsp,7*8                 ;discard parameters on stack
+  irp _reg_,<r11,r10,r09,r08,rdx,rcx,rax>
+    pop     _reg_                   ;restore regs
+  endm
+endm ; Skein_Debug_Block
+;
+; MACRO: Skein_Debug_Round -- the macro to "call" to debug a round; r08 carries the round number and is preserved
+;   BLK_BITS = block size, selects the routine Skein_Debug_Round_(BLK_BITS); R = round number or one of the SKEIN_RND_xxx special values
+;   RDI_OFFS = optional offset added to a computed round number (blank counts as 0); afterOp = optional statement expanded after the call
+Skein_Debug_Round macro BLK_BITS,R,RDI_OFFS,afterOp
+    ; call the appropriate (local) debug function
+    push    r08
+  if (SKEIN_ASM_UNROLL and BLK_BITS) or (R ge SKEIN_RND_SPECIAL)
+    mov     r08, R
+  else                              ;compute round number from the loop counter (rdi, or rIdx on the stack for 1024)
+_rOffs_ = RDI_OFFS + 0
+   if BLK_BITS eq 1024
+    mov     r08,[rsp+8+rIdx_offs]   ;get rIdx off the stack (adjust for push r08)
+    lea     r08,[4*r08+1+(((R)-1) and 3)+_rOffs_]
+   else
+    lea     r08,[4*rdi+1+(((R)-1) and 3)+_rOffs_]
+   endif
+  endif
+    call    Skein_Debug_Round_&BLK_BITS
+    pop     r08
+;
+  afterOp
+endm  ;  Skein_Debug_Round
+else  ;------- _SKEIN_DEBUG (dummy macros if debug not enabled)
+Skein_Debug_Block macro BLK_BITS,afterOp
+endm    ;non-debug build: expands to nothing
+; note: the empty bodies also drop afterOp, so a statement passed that way is emitted only in a debug build
+Skein_Debug_Round macro BLK_BITS,R,RDI_OFFS,afterOp
+endm    ;non-debug build: expands to nothing
+;
+endif ; _SKEIN_DEBUG
+;
+;----------------------------------------------------------------
+; MACRO: addReg -- dstReg += source, where the source operand name is srcReg_A and srcReg_B pasted together; with immOffs also adds that constant (always via lea); otherwise uses lea unless useAddOp is non-zero or ASM_NO_LEA is defined, in which case add is used
+addReg  macro   dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
+  ifnb <immOffs>
+       lea     dstReg,[srcReg_A&&srcReg_B + dstReg + immOffs]
+  elseif ((useAddOp + 0) eq 0)
+    ifndef ASM_NO_LEA
+      ;lea seems to be faster on Core 2 Duo CPUs!
+       lea     dstReg,[srcReg_A&&srcReg_B + dstReg]   
+    else
+       add     dstReg, srcReg_A&&srcReg_B
+    endif
+  else
+       add     dstReg, srcReg_A&&srcReg_B
+  endif
+endm
+;
+;=================================== Skein_256 =============================================
+;
+if _USE_ASM_ and 256
+    public  Skein_256_Process_Block
+;
+; void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+;;;;;;;;;;;;;;;;;
+; Processes blkCnt input blocks (four 64-bit words are read per block), updating [ctx+X_VARS].
+; Per block: T[0] += bitAdd, initial key injection, the rounds, then the feedforward XOR.
+; NOTE(review): Setup_Stack/Reset_Stack are defined outside this section; presumably they build FP_.
+Skein_256_Process_Block proc frame
+    Setup_Stack 256,((ROUNDS_256/8)+1)
+    mov     r14,[rdi+TWEAK+8]       ;r14 = T[1]; carried in r14 from block to block
+    jmp   short Skein_256_block_loop ;jump over the alignment padding
+    align   16
+    ; main hash loop for Skein_256
+Skein_256_block_loop:
+    ;
+    ; general register usage:
+    ;   RAX..RDX        = X0..X3
+    ;   R08..R12        = ks[0..4]
+    ;   R13..R15        = ts[0..2]
+    ;   RSP, RBP        = stack/frame pointers
+    ;   RDI             = round counter or context pointer
+    ;   RSI             = temp
+    ;
+    mov     r13,[rdi+TWEAK+0]
+    add     r13,[FP_+bitAdd]        ;computed updated tweak value T0
+    mov     r15,r14
+    xor     r15,r13                 ;r15 = T0 xor T1, so r13..r15 hold the tweak schedule
+
+    mov     r12,KW_PARITY
+    mov     r08,[rdi+X_VARS+ 0]
+    mov     r09,[rdi+X_VARS+ 8]
+    mov     r10,[rdi+X_VARS+16]
+    mov     r11,[rdi+X_VARS+24]
+    mov         [rdi+TWEAK+0],r13   ;save updated tweak value ctx->h.T[0]
+    xor     r12,r08                 ;start accumulating overall parity
+
+    mov     rsi,[FP_+blkPtr ]       ;rsi --> input block
+    xor     r12,r09
+    mov     rax,[rsi+ 0]            ;load the four input words into the X[0..3] registers
+    xor     r12,r10
+    mov     rbx,[rsi+ 8]
+    xor     r12,r11                 ;r12 = KW_PARITY xor ks[0] xor ks[1] xor ks[2] xor ks[3]
+    mov     rcx,[rsi+16]
+    mov     rdx,[rsi+24]
+
+    mov         [FP_+Wcopy+ 0],rax  ;save copy of input block
+    mov         [FP_+Wcopy+ 8],rbx
+    mov         [FP_+Wcopy+16],rcx
+    mov         [FP_+Wcopy+24],rdx
+
+    add     rax, r08                ;initial key injection
+    add     rbx, r09
+    add     rcx, r10
+    add     rdx, r11
+    add     rbx, r13                ;X1 += ts[0]
+    add     rcx, r14                ;X2 += ts[1]
+
+if _SKEIN_DEBUG
+    mov         [rdi+TWEAK+ 8],r14  ;store the T[1] value held in r14 back into the context
+    mov         [FP_+ksKey+ 0],r08  ;save key schedule on stack for Skein_Debug_Block
+    mov         [FP_+ksKey+ 8],r09
+    mov         [FP_+ksKey+16],r10
+    mov         [FP_+ksKey+24],r11
+    mov         [FP_+ksKey+32],r12
+
+    mov         [FP_+ksTwk+ 0],r13
+    mov         [FP_+ksTwk+ 8],r14
+    mov         [FP_+ksTwk+16],r15
+
+    mov         [rsp+X_stk + 0],rax ;save X[] on stack for Skein_Debug_Block
+    mov         [rsp+X_stk + 8],rbx
+    mov         [rsp+X_stk +16],rcx
+    mov         [rsp+X_stk +24],rdx
+
+    Skein_Debug_Block 256           ;debug dump
+    Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
+endif
+;
+if ((SKEIN_ASM_UNROLL and 256) eq 0)
+    mov         [FP_+ksKey+40],r08 ;save key schedule on stack for looping code (ks[0] goes in slot 5:
+    mov         [FP_+ksKey+ 8],r09 ; the round loop only reads slots 8*rdi+8*1 and above)
+    mov         [FP_+ksKey+16],r10
+    mov         [FP_+ksKey+24],r11
+    mov         [FP_+ksKey+32],r12
+
+    mov         [FP_+ksTwk+24],r13 ;likewise ts[0] goes in slot 3 (loop reads 8*rdi+8*1 and above)
+    mov         [FP_+ksTwk+ 8],r14
+    mov         [FP_+ksTwk+16],r15
+endif
+    add     rsi, WCNT*8             ;skip the block
+    mov         [FP_+blkPtr   ],rsi ;update block pointer
+; opLoop: emits its first argument in the looping build, its second in the fully unrolled build
+opLoop macro op1,op2
+  if (SKEIN_ASM_UNROLL and 256) eq 0
+    op1
+  else
+    op2
+  endif
+endm
+;
+    ;
+    ; now the key schedule is computed. Start the rounds
+    ;
+if SKEIN_ASM_UNROLL and 256
+_UNROLL_CNT =   ROUNDS_256/8
+else
+_UNROLL_CNT =   SKEIN_UNROLL_256
+  if ((ROUNDS_256/8) mod _UNROLL_CNT)
+    .err "Invalid SKEIN_UNROLL_256"
+  endif
+    xor     rdi,rdi                   ;rdi = iteration count
+Skein_256_round_loop:
+endif
+_Rbase_ = 0
+rept _UNROLL_CNT*2
+    ; all X and ks vars in regs     ; (ops to "rotate" ks vars, via mem, if not unrolled)
+    ; round 4*_RBase_ + 0
+    addReg  rax, rbx
+    RotL64  rbx, 256,%((4*_RBase_+0) and 7),0
+    addReg  rcx, rdx
+                    opLoop  <mov r08,[FP_+ksKey+8*rdi+8*1]>
+    xor     rbx, rax
+    RotL64  rdx, 256,%((4*_RBase_+0) and 7),1
+    xor     rdx, rcx
+ if SKEIN_ASM_UNROLL and 256
+    irp _r0_,<%(08+(_Rbase_+3) mod 5)>
+    irp _r1_,<%(13+(_Rbase_+2) mod 3)>
+      lea   rdi,[r&_r0_+r&_r1_]     ;precompute key injection value for rcx
+    endm
+    endm
+ endif
+                    opLoop  <mov r13,[FP_+ksTwk+8*rdi+8*1]>
+    Skein_Debug_Round 256,%(4*_RBase_+1)
+
+    ; round 4*_RBase_ + 1
+    addReg  rax, rdx
+    RotL64  rdx, 256,%((4*_RBase_+1) and 7),0
+    xor     rdx, rax
+                    opLoop  <mov r09,[FP_+ksKey+8*rdi+8*2]>
+    addReg  rcx, rbx
+    RotL64  rbx, 256,%((4*_RBase_+1) and 7),1
+    xor     rbx, rcx
+                    opLoop  <mov r11,[FP_+ksKey+8*rdi+8*4]>
+    Skein_Debug_Round 256,%(4*_RBase_+2)
+ if SKEIN_ASM_UNROLL and 256
+    irp _r0_,<%(08+(_Rbase_+2) mod 5)>
+    irp _r1_,<%(13+(_Rbase_+1) mod 3)>
+      lea   rsi,[r&_r0_+r&_r1_]     ;precompute key injection value for rbx
+    endm
+    endm
+ endif
+    ; round 4*_RBase_ + 2
+    addReg  rax, rbx
+    RotL64  rbx, 256,%((4*_RBase_+2) and 7),0
+    addReg  rcx, rdx
+                    opLoop  <mov r10,[FP_+ksKey+8*rdi+8*3]>
+    xor     rbx, rax
+    RotL64  rdx, 256,%((4*_RBase_+2) and 7),1
+    xor     rdx, rcx
+                    opLoop  <mov     [FP_+ksKey+8*rdi+8*6],r08> ;"rotate" the key
+                    opLoop  <lea r11,[r11+rdi+1]>   ;precompute key + tweak
+    Skein_Debug_Round 256,%(4*_RBase_+3)
+    ; round 4*_RBase_ + 3
+    addReg  rax, rdx
+    RotL64  rdx, 256,%((4*_RBase_+3) and 7),0
+    addReg  rcx, rbx
+                    opLoop  <add r10,[FP_+ksTwk+8*rdi+8*2]>    ;precompute key + tweak
+                    opLoop  <mov     [FP_+ksTwk+8*rdi+8*4],r13> ;"rotate" the tweak
+    xor     rdx, rax
+    RotL64  rbx, 256,%((4*_RBase_+3) and 7),1
+    xor     rbx, rcx
+    Skein_Debug_Round 256,%(4*_RBase_+4)
+                    opLoop  <addReg r09,r13>    ;precompute key+tweak
+      ;inject key schedule words
+_Rbase_ = _Rbase_+1
+  if SKEIN_ASM_UNROLL and 256
+      addReg    rax,r,%(08+((_Rbase_+0) mod 5))
+      addReg    rbx,rsi
+      addReg    rcx,rdi
+      addReg    rdx,r,%(08+((_Rbase_+3) mod 5)),,_Rbase_
+  else
+      inc       rdi
+      addReg    rax,r08
+      addReg    rcx,r10
+      addReg    rbx,r09
+      addReg    rdx,r11
+  endif
+      Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
+endm ;rept _UNROLL_CNT*2
+
+;
+if (SKEIN_ASM_UNROLL and 256) eq 0
+    cmp     rdi,2*(ROUNDS_256/8)
+    jb      Skein_256_round_loop
+endif ; (SKEIN_ASM_UNROLL and 256) eq 0
+    mov     rdi,[FP_+ctxPtr ]           ;restore rdi --> context
+
+    ;----------------------------
+    ; feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..3}
+    xor     rax,[FP_+Wcopy + 0]
+    mov     r14,FIRST_MASK
+    xor     rbx,[FP_+Wcopy + 8]
+    xor     rcx,[FP_+Wcopy +16]
+    xor     rdx,[FP_+Wcopy +24]
+    mov         [rdi+X_VARS+ 0],rax     ;store final result
+    and     r14,[rdi+TWEAK + 8]         ;r14 = T[1] and FIRST_MASK, used as T[1] for the next block
+    dec     qword ptr [FP_+blkCnt]      ;set zero flag
+    mov         [rdi+X_VARS+ 8],rbx     ;(the mov stores below leave the flags from dec intact)
+    mov         [rdi+X_VARS+16],rcx
+    mov         [rdi+X_VARS+24],rdx
+
+    Skein_Debug_Round 256,SKEIN_RND_FEED_FWD,,<cmp qword ptr [FP_+blkCnt],0>
+
+    ; go back for more blocks, if needed
+    jnz     Skein_256_block_loop
+    mov         [rdi+TWEAK + 8],r14     ;all blocks done: write the masked T[1] back to the context
+    Reset_Stack Skein_256_Process_Block
+    ret
+
+  if _SKEIN_DEBUG
+Skein_Debug_Round_256:
+    mov         [FP_+X_stk+ 0],rax  ;first, save X[] state on stack so debug routines can access it
+    mov         [FP_+X_stk+ 8],rbx  ;(use FP_ since rsp has changed!)
+    mov         [FP_+X_stk+16],rcx
+    mov         [FP_+X_stk+24],rdx
+    push    rdx                     ;save two regs for BLK_BITS-specific parms
+    push    rcx
+    mov     rdx,[FP_+ctxPtr]        ;ctx_hdr_ptr
+    mov     rcx, 256                ;block size
+    jmp     Skein_Debug_Round_Common
+  endif
+
+Skein_256_Process_Block endp
+;
+ifdef SKEIN_CODE_SIZE                       ;optional code-size / unroll-count query helpers
+    public  Skein_256_Process_Block_CodeSize
+Skein_256_Process_Block_CodeSize proc
+    mov     rax,_ProcBytes_                 ;rax = _ProcBytes_ (symbol is set outside this block)
+    ret
+Skein_256_Process_Block_CodeSize endp
+;
+    public  Skein_256_Unroll_Cnt
+Skein_256_Unroll_Cnt proc
+  if _UNROLL_CNT eq ROUNDS_256/8
+    xor     rax,rax                         ;count equals ROUNDS_256/8: report 0
+  else
+    mov     rax,_UNROLL_CNT                 ;otherwise report the configured unroll count
+  endif
+    ret
+Skein_256_Unroll_Cnt endp
+endif
+;
+endif ;_USE_ASM_ and 256
+;
+;=================================== Skein_512 =============================================
+;
+if _USE_ASM_ and 512
+    public  Skein_512_Process_Block
+;
+; void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+; The eight state words are named by index through the rX_512_n equates below.
+rX_512_0    equ r08         ;register assignments for X[] values during rounds
+rX_512_1    equ r09
+rX_512_2    equ r10
+rX_512_3    equ r11
+rX_512_4    equ r12
+rX_512_5    equ r13
+rX_512_6    equ r14
+rX_512_7    equ r15
+;
+;;;;;;;;;;;;;;;;;
+; MACRO: one round for 512-bit blocks
+;   args: eight X[] indices (four add/rotate/xor pairs), the round number, four optional extra ops
+R_512_OneRound  macro r0,r1,r2,r3,r4,r5,r6,r7,_Rn_,op1,op2,op3,op4
+;   each optional extra op is emitted right after its pair (callers pass key schedule loads)
+    addReg      rX_512_&r0, rX_512_&r1
+    RotL64      rX_512_&r1, 512,%((_Rn_) and 7),0
+    xor         rX_512_&r1, rX_512_&r0
+            op1
+    addReg      rX_512_&r2, rX_512_&r3
+    RotL64      rX_512_&r3, 512,%((_Rn_) and 7),1
+    xor         rX_512_&r3, rX_512_&r2
+            op2
+    addReg      rX_512_&r4, rX_512_&r5
+    RotL64      rX_512_&r5, 512,%((_Rn_) and 7),2
+    xor         rX_512_&r5, rX_512_&r4
+            op3
+    addReg      rX_512_&r6, rX_512_&r7
+    RotL64      rX_512_&r7, 512,%((_Rn_) and 7),3
+    xor         rX_512_&r7, rX_512_&r6
+            op4
+    Skein_Debug_Round 512,%(_Rn_+1),-4
+;
+endm ;R_512_OneRound
+;
+;;;;;;;;;;;;;;;;;
+; MACRO: four rounds for 512-bit blocks, followed by one key injection
+;
+R_512_FourRounds macro _RR_    ;RR = base round number (0 mod 4)
+  if SKEIN_ASM_UNROLL and 512
+    ; here for fully unrolled case.
+    _II_ = ((_RR_)/4) + 1       ;key injection counter
+    R_512_OneRound 0,1,2,3,4,5,6,7,%((_RR_)+0),<mov rax,[FP_+ksKey+8*(((_II_)+3) mod 9)]>,,<mov rbx,[FP_+ksKey+8*(((_II_)+4) mod 9)]>
+    R_512_OneRound 2,1,4,7,6,5,0,3,%((_RR_)+1),<mov rcx,[FP_+ksKey+8*(((_II_)+5) mod 9)]>,,<mov rdx,[FP_+ksKey+8*(((_II_)+6) mod 9)]>
+    R_512_OneRound 4,1,6,3,0,5,2,7,%((_RR_)+2),<mov rsi,[FP_+ksKey+8*(((_II_)+7) mod 9)]>,,<add rcx,[FP_+ksTwk+8*(((_II_)+0) mod 3)]>
+    R_512_OneRound 6,1,0,7,2,5,4,3,%((_RR_)+3),<add rdx,[FP_+ksTwk+8*(((_II_)+1) mod 3)]>,
+    ; inject the key schedule
+    add     r08,[FP_+ksKey+8*(((_II_)+0) mod 9)]
+    addReg  r11,rax
+    add     r09,[FP_+ksKey+8*(((_II_)+1) mod 9)]
+    addReg  r12,rbx
+    add     r10,[FP_+ksKey+8*(((_II_)+2) mod 9)]
+    addReg  r13,rcx             ;rcx already includes the first tweak word
+    addReg  r14,rdx             ;rdx already includes the second tweak word
+    addReg  r15,rsi,,,(_II_)
+  else
+    ; here for looping case                                                    ;"rotate" key/tweak schedule (move up on stack)
+    inc     rdi                 ;bump key injection counter
+    R_512_OneRound 0,1,2,3,4,5,6,7,%((_RR_)+0),<mov rdx,[FP_+ksKey+8*rdi+8*6]>,<mov rax,[FP_+ksTwk+8*rdi-8*1]>    ,<mov rsi,[FP_+ksKey+8*rdi-8*1]>
+    R_512_OneRound 2,1,4,7,6,5,0,3,%((_RR_)+1),<mov rcx,[FP_+ksKey+8*rdi+8*5]>,<mov     [FP_+ksTwk+8*rdi+8*2],rax>,<mov     [FP_+ksKey+8*rdi+8*8],rsi>
+    R_512_OneRound 4,1,6,3,0,5,2,7,%((_RR_)+2),<mov rbx,[FP_+ksKey+8*rdi+8*4]>,<add rdx,[FP_+ksTwk+8*rdi+8*1]>    ,<mov rsi,[FP_+ksKey+8*rdi+8*7]>    
+    R_512_OneRound 6,1,0,7,2,5,4,3,%((_RR_)+3),<mov rax,[FP_+ksKey+8*rdi+8*3]>,<add rcx,[FP_+ksTwk+8*rdi+8*0]>
+    ; inject the key schedule
+    add     r08,[FP_+ksKey+8*rdi+8*0]
+    addReg  r11,rax
+    addReg  r12,rbx
+    add     r09,[FP_+ksKey+8*rdi+8*1]
+    addReg  r13,rcx             ;rcx already includes the first tweak word
+    addReg  r14,rdx             ;rdx already includes the second tweak word
+    add     r10,[FP_+ksKey+8*rdi+8*2]
+    addReg  r15,rsi
+    addReg  r15,rdi              ;inject the round number
+  endif
+    ;show the result of the key injection
+    Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT
+endm ;R_512_FourRounds
+;
+;;;;;;;;;;;;;;;;;
+; instantiated code: processes blkCnt blocks of eight 64-bit words, updating [ctx+X_VARS]
+; NOTE(review): Setup_Stack/Reset_Stack are defined outside this section; presumably they build FP_.
+Skein_512_Process_Block proc frame
+    Setup_Stack 512,ROUNDS_512/8
+    mov     rbx,[rdi+TWEAK+ 8]      ;rbx = T[1]; re-established in rbx at the end of each block
+    jmp   short Skein_512_block_loop ;jump over the alignment padding
+    align  16
+    ; main hash loop for Skein_512
+Skein_512_block_loop:
+    ; general register usage:
+    ;   RAX..RDX        = temps for key schedule pre-loads
+    ;   R08..R15        = X0..X7
+    ;   RSP, RBP        = stack/frame pointers
+    ;   RDI             = round counter or context pointer
+    ;   RSI             = temp
+    ;
+    mov     rax,[rdi+TWEAK+ 0]
+    add     rax,[FP_+bitAdd]        ;computed updated tweak value T0
+    mov     rcx,rbx
+    xor     rcx,rax                 ;rax/rbx/rcx = tweak schedule
+    mov         [rdi+TWEAK+ 0],rax  ;save updated tweak value ctx->h.T[0]
+    mov         [FP_+ksTwk+ 0],rax
+    mov     rdx,KW_PARITY
+    mov     rsi,[FP_+blkPtr ]       ;rsi --> input block
+    mov         [FP_+ksTwk+ 8],rbx
+    mov         [FP_+ksTwk+16],rcx
+
+    irp _Rn_,<0,1,2,3,4,5,6,7>
+      mov   rX_512_&_Rn_,[rdi+X_VARS+8*(_Rn_)]
+      xor   rdx,rX_512_&_Rn_        ;compute overall parity
+      mov   [FP_+ksKey+8*(_Rn_)],rX_512_&_Rn_
+    endm                            ;load state into r08..r15, compute parity
+      mov   [FP_+ksKey+8*(8)],rdx   ;save key schedule parity
+
+    addReg  rX_512_5,rax            ;precompute key injection for tweak
+    addReg  rX_512_6,rbx
+if _SKEIN_DEBUG
+    mov         [rdi+TWEAK+ 8],rbx  ;save updated tweak value ctx->h.T[1] for Skein_Debug_Block below
+endif
+    mov     rax,[rsi+ 0]            ;load input block
+    mov     rbx,[rsi+ 8]
+    mov     rcx,[rsi+16]
+    mov     rdx,[rsi+24]
+    addReg  r08,rax                 ;do initial key injection
+    addReg  r09,rbx
+    mov         [FP_+Wcopy+ 0],rax  ;keep local copy for feedforward
+    mov         [FP_+Wcopy+ 8],rbx
+    addReg  r10,rcx
+    addReg  r11,rdx
+    mov         [FP_+Wcopy+16],rcx
+    mov         [FP_+Wcopy+24],rdx
+
+    mov     rax,[rsi+32]            ;second half of the input block
+    mov     rbx,[rsi+40]
+    mov     rcx,[rsi+48]
+    mov     rdx,[rsi+56]
+    addReg  r12,rax
+    addReg  r13,rbx
+    addReg  r14,rcx
+    addReg  r15,rdx
+    mov         [FP_+Wcopy+32],rax
+    mov         [FP_+Wcopy+40],rbx
+    mov         [FP_+Wcopy+48],rcx
+    mov         [FP_+Wcopy+56],rdx
+
+if _SKEIN_DEBUG
+    irp _Rn_,<0,1,2,3,4,5,6,7>      ;save values on stack for debug output
+      mov       [rsp+X_stk+8*(_Rn_)],rX_512_&_Rn_
+    endm
+
+    Skein_Debug_Block 512           ;debug dump
+    Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL
+endif
+    add     rsi, 8*WCNT             ;skip the block
+    mov         [FP_+blkPtr   ],rsi ;update block pointer
+    ;
+    ;;;;;;;;;;;;;;;;;
+    ; now the key schedule is computed. Start the rounds
+    ;
+if SKEIN_ASM_UNROLL and 512
+_UNROLL_CNT =   ROUNDS_512/8
+else
+_UNROLL_CNT =   SKEIN_UNROLL_512
+  if ((ROUNDS_512/8) mod _UNROLL_CNT)
+    .err "Invalid SKEIN_UNROLL_512"
+  endif
+    xor     rdi,rdi                 ;rdi = round counter
+Skein_512_round_loop:
+endif
+;
+_Rbase_ = 0
+rept _UNROLL_CNT*2
+      R_512_FourRounds %(4*_Rbase_+00)
+_Rbase_ = _Rbase_+1
+endm ;rept _UNROLL_CNT*2
+;
+if (SKEIN_ASM_UNROLL and 512) eq 0
+    cmp     rdi,2*(ROUNDS_512/8)
+    jb      Skein_512_round_loop
+    mov     rdi,[FP_+ctxPtr ]           ;restore rdi --> context
+endif
+    ; end of rounds
+    ;;;;;;;;;;;;;;;;;
+    ; feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..7}
+    irp _Rn_,<0,1,2,3,4,5,6,7>
+  if (_Rn_ eq 0)
+    mov     rbx,FIRST_MASK
+  endif
+      xor   rX_512_&_Rn_,[FP_+Wcopy+8*(_Rn_)]       ;feedforward XOR
+      mov       [rdi+X_VARS+8*(_Rn_)],rX_512_&_Rn_  ;and store result
+  if (_Rn_ eq 6)
+    and     rbx,[rdi+TWEAK+ 8]                      ;rbx = T[1] and FIRST_MASK, used by the next block
+  endif
+    endm
+    Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
+
+    ; go back for more blocks, if needed
+    dec     qword ptr [FP_+blkCnt]
+    jnz     Skein_512_block_loop
+    mov         [rdi+TWEAK + 8],rbx     ;all blocks done: write the masked T[1] back to the context
+
+    Reset_Stack Skein_512_Process_Block
+    ret
+;
+  if _SKEIN_DEBUG
+; call here with r08 = "round number"
+Skein_Debug_Round_512:
+    push    rdx                     ;save two regs for BLK_BITS-specific parms
+    push    rcx
+    mov     rcx,[rsp+24]            ;get back original r08 (pushed on stack in macro call)
+    mov         [FP_+X_stk],rcx     ;and save it in X_stk
+  irp _Rn_,<1,2,3,4,5,6,7>          ;save rest of X[] state on stack so debug routines can access it
+    mov         [FP_+X_stk+8*(_Rn_)],rX_512_&_Rn_ 
+  endm
+    mov     rdx,[FP_+ctxPtr]        ;ctx_hdr_ptr
+    mov     rcx, 512                ;block size
+    jmp     Skein_Debug_Round_Common
+  endif
+;
+Skein_512_Process_Block endp
+;
+ifdef SKEIN_CODE_SIZE                       ;optional code-size / unroll-count query helpers
+    public  Skein_512_Process_Block_CodeSize
+Skein_512_Process_Block_CodeSize proc
+    mov     rax,_ProcBytes_                 ;rax = _ProcBytes_ (symbol is set outside this block)
+    ret
+Skein_512_Process_Block_CodeSize endp
+;
+    public  Skein_512_Unroll_Cnt
+Skein_512_Unroll_Cnt proc
+  if _UNROLL_CNT eq ROUNDS_512/8
+    xor     rax,rax                         ;count equals ROUNDS_512/8: report 0
+  else
+    mov     rax,_UNROLL_CNT                 ;otherwise report the configured unroll count
+  endif
+    ret
+Skein_512_Unroll_Cnt endp
+endif
+;
+endif ; _USE_ASM_ and 512
+;
+;=================================== Skein1024 =============================================
+if _USE_ASM_ and 1024
+    public  Skein1024_Process_Block
+;
+; void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+;;;;;;;;;;;;;;;;;
+; use details of permutation to make register assignments
+; (sixteen state words but only fifteen registers named here, so X4 and X6 take turns in rcx)
+r1K_x0    equ rdi 
+r1K_x1    equ rsi
+r1K_x2    equ rbp
+r1K_x3    equ rax
+r1K_x4    equ rcx           ;"shared" with X6, since X4/X6 alternate
+r1K_x5    equ rbx
+r1K_x6    equ rcx
+r1K_x7    equ rdx
+r1K_x8    equ r08
+r1K_x9    equ r09
+r1K_xA    equ r10
+r1K_xB    equ r11
+r1K_xC    equ r12
+r1K_xD    equ r13
+r1K_xE    equ r14
+r1K_xF    equ r15
+;
+rIdx      equ r1K_x0        ;index register for looping versions
+rIdx_offs equ tmpStk_1024   ;stack slot that holds the loop index between uses
+; MACRO: one add/rotate/xor step on a pair of state words, plus inline key injection on rounds 3 mod 4
+R1024_Mix  macro w0,w1,_RN0_,_Rn1_,op1
+_w0  = 0&w0&h               ;handle the hex conversion
+_w1  = 0&w1&h
+_II_ = ((_RN0_)/4)+1        ;injection count
+     ;
+    addReg      r1K_x&w0 , r1K_x&w1                     ;perform the MIX
+    RotL64      r1K_x&w1 , 1024,%((_RN0_) and 7),_Rn1_
+    xor         r1K_x&w1 , r1K_x&w0
+ if ((_RN0_) and 3) eq 3                                ;time to do key injection?
+  if _SKEIN_DEBUG
+    mov         [rsp+xDebug_1024+8*_w0],r1K_x&w0        ;save intermediate values for Debug_Round
+    mov         [rsp+xDebug_1024+8*_w1],r1K_x&w1        ; (before inline key injection)
+  endif
+  if SKEIN_ASM_UNROLL and 1024  ;here to do fully unrolled key injection
+    add         r1K_x&w0, [rsp+ksKey+      8*((_II_+_w0) mod 17)]
+    add         r1K_x&w1, [rsp+ksKey+      8*((_II_+_w1) mod 17)]
+   if     _w1 eq 13                                     ;tweak injection
+    add         r1K_x&w1, [rsp+ksTwk+      8*((_II_+0  ) mod  3)]
+   elseif _w0 eq 14
+    add         r1K_x&w0, [rsp+ksTwk+      8*((_II_+1  ) mod  3)]
+   elseif _w1 eq 15
+    add         r1K_x&w1, _II_                          ;(injection counter)
+   endif
+  else                          ;here to do looping  key injection
+   if  (_w0 eq 0)
+    mov                   [rsp+X_stk+8*_w0],r1K_x0      ;if so, store N0 so we can use reg as index
+    mov         rIdx,     [rsp+rIdx_offs]               ;get the injection counter index into rIdx (N0)
+   else
+    add         r1K_x&w0, [rsp+ksKey+8+8*rIdx+8*_w0]    ;even key injection
+   endif
+   if     _w1 eq 13                                     ;tweak injection
+    add         r1K_x&w1, [rsp+ksTwk+8+8*rIdx+8*0  ]
+   elseif _w0 eq 14
+    add         r1K_x&w0, [rsp+ksTwk+8+8*rIdx+8*1  ]
+   elseif _w1 eq 15
+    addReg      r1K_x&w1, rIdx,,,1                      ;(injection counter)
+   endif
+    add         r1K_x&w1, [rsp+ksKey+8+8*rIdx+8*_w1]    ;odd  key injection
+  endif
+ endif
+    ; insert the op provided, if any
+    op1
+endm
+;;;;;;;;;;;;;;;;;
+; MACRO: one round for 1024-bit blocks (eight R1024_Mix steps; args give the word order for this round)
+;
+R1024_OneRound  macro x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE,xF,_Rn_
+  if (x0 ne 0) or ((x4 ne 4) and (x4 ne 6)) or (x4 ne (x6 xor 2))
+    .err "faulty register assignment!"
+  endif
+    R1024_Mix   x0,x1,_Rn_,0
+    R1024_Mix   x2,x3,_Rn_,1 
+    R1024_Mix   x4,x5,_Rn_,2, <mov        [rsp+X_stk+8*0&x4&h],r1K_x4>  ;save x4  on  stack (x4/x6 alternate)
+    R1024_Mix   x8,x9,_Rn_,4, <mov r1K_x6,[rsp+X_stk+8*0&x6&h]>         ;load x6 from stack 
+    R1024_Mix   xA,xB,_Rn_,5
+    R1024_Mix   xC,xD,_Rn_,6
+    R1024_Mix   x6,x7,_Rn_,3
+    R1024_Mix   xE,xF,_Rn_,7
+  if _SKEIN_DEBUG
+    Skein_Debug_Round 1024,%(_Rn_+1)
+  endif
+endm ;R1024_OneRound
+;;;;;;;;;;;;;;;;;
+; MACRO: four rounds for 1024-bit blocks (key injection happens inside the fourth round)
+;
+R1024_FourRounds macro _RR_    ;RR = base round number (0 mod 4)
+    ; should be here with r1K_x4 set properly, x6 stored on stack
+    R1024_OneRound 0,1,2,3,4,5,6,7,8,9,A,B,C,D,E,F,%((_RR_)+0)
+    R1024_OneRound 0,9,2,D,6,B,4,F,A,7,C,3,E,5,8,1,%((_RR_)+1)
+    R1024_Oneround 0,7,2,5,4,3,6,1,C,F,E,D,8,B,A,9,%((_RR_)+2)
+    R1024_Oneround 0,F,2,B,6,D,4,9,E,1,8,5,A,3,C,7,%((_RR_)+3)
+  if (SKEIN_ASM_UNROLL and 1024) eq 0       ;here with r1K_x0 == rIdx, X0 on stack
+    ;rotate the key schedule on the stack
+    mov            [rsp+X_stk+       8* 8],r1K_x8;free up a reg
+    mov     r1K_x8,[rsp+ksKey+8*rIdx+8* 0]          ;get key
+    mov            [rsp+ksKey+8*rIdx+8*17],r1K_x8   ;rotate it (must do key first or tweak clobbers it!)
+    mov     r1K_x8,[rsp+ksTwk+8*rIdx+8* 0]          ;get tweak
+    mov            [rsp+ksTwk+8*rIdx+8* 3],r1K_x8   ;rotate it
+    mov     r1K_x8,[rsp+X_stk+       8* 8]      ;get the reg back
+    inc     rIdx                                ;bump the index
+    mov            [rsp+rIdx_offs],rIdx         ;save it
+    mov     r1K_x0,[rsp+ksKey+8*rIdx]           ;get the key schedule word for X0
+    add     r1K_x0,[rsp+X_stk+8*0]              ;perform the X0 key injection
+  endif
+    ;show the result of the key injection
+    Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT
+endm ;R1024_FourRounds
+;
+;;;;;;;;;;;;;;;;
+; code: processes blkCnt blocks of sixteen 64-bit words, updating the state at [ctx+X_VARS]
+; NOTE(review): Setup_Stack/Reset_Stack/FP_ are defined outside this section -- confirm frame layout there.
+Skein1024_Process_Block proc frame
+;
+    Setup_Stack 1024,ROUNDS_1024/8,NO_FRAME,<WCNT>
+    mov     r09,[rdi+TWEAK+ 8]      ;r09 = T[1]; re-established in r09 at the end of each block
+    jmp   short Skein1024_block_loop ;jump over the alignment padding
+    align  16
+    ; main hash loop for Skein1024
+Skein1024_block_loop:
+    ; register usage during the rounds (per the r1K_x* equates):
+    ;   RSP             = stack pointer
+    ;   RDI,RSI,RBP,RAX = X0..X3     (state words)
+    ;   RCX,RBX,RDX     = X4/X6 (shared), X5, X7
+    ;   R08..R15        = X8..X15    (state words)
+    ;
+  if (SKEIN_ASM_UNROLL and 1024) eq 0
+    xor     rax,rax                 ;init loop index on the stack
+    mov     [rsp+rIdx_offs],rax
+  endif
+    mov     r08,[rdi+TWEAK+ 0]
+    add     r08,[FP_+bitAdd]        ;computed updated tweak value T0
+    mov     r10,r09
+    xor     r10,r08                 ;r08/r09/r10 = tweak schedule
+    mov         [rdi+TWEAK+ 0],r08  ;save updated tweak value ctx->h.T[0]
+    mov         [FP_+ksTwk+ 0],r08
+    mov         [FP_+ksTwk+ 8],r09  ;keep values in r08,r09 for initial tweak injection below
+    mov         [FP_+ksTwk+16],r10
+  if _SKEIN_DEBUG
+    mov         [rdi+TWEAK+ 8],r09  ;save updated tweak value ctx->h.T[1] for Skein_Debug_Block
+  endif
+    mov     rsi ,[FP_+blkPtr ]      ;rsi --> input block
+    mov     rax , KW_PARITY         ;overall key schedule parity
+
+    ; logic here assumes the set {rdi,rsi,rbp,rax} = r1K_x{0,1,2,3}
+
+    irp _rN_,<0,1,2,3,4,6>            ;process the "initial" words, using r14,r15 as temps
+      mov       r14,[rdi+X_VARS+8*_rN_]                 ;get state word
+      mov       r15,[rsi+       8*_rN_]                 ;get msg   word
+      xor       rax,r14                                 ;update key schedule parity
+      mov           [FP_+ksKey +8*_rN_],r14             ;save key schedule word on stack
+      mov           [FP_+Wcopy +8*_rN_],r15             ;save local msg Wcopy
+      add       r14,r15                                 ;do the initial key injection
+      mov           [rsp+X_stk +8*_rN_],r14             ;save initial state var on stack
+    endm
+    ; now process the rest, using the "real" registers 
+    ;     (MUST do it in reverse order to inject tweaks r08/r09 first)
+    irp _rN_,<F,E,D,C,B,A,9,8,7,5>
+_rr_ = 0&_rN_&h
+      mov   r1K_x&_rN_,[rdi+X_VARS+8*_rr_]              ;get key schedule word from context
+      mov   r1K_x4    ,[rsi+       8*_rr_]              ;get next input msg word
+      mov              [rsp+ksKey +8*_rr_],r1K_x&_rN_   ;save key schedule on stack
+      xor   rax       , r1K_x&_rN_                      ;accumulate key schedule parity
+      mov              [FP_+Wcopy +8*_rr_],r1K_x4       ;save copy of msg word for feedforward
+      add   r1K_x&_rN_, r1K_x4                          ;do the initial  key  injection
+      if     _rr_ eq 13                                 ;do the initial tweak injection
+        addReg r1K_x&_rN_,r08                           ;          (only in words 13/14)
+      elseif _rr_ eq 14
+        addReg r1K_x&_rN_,r09
+      endif
+    endm
+    mov                [FP_+ksKey+8*WCNT],rax           ;save key schedule parity
+if _SKEIN_DEBUG
+    Skein_Debug_Block 1024           ;debug dump
+endif
+    addReg  rsi,8*WCNT                                  ;bump the msg ptr
+    mov                [FP_+blkPtr],rsi                 ;save bumped msg ptr
+    ; re-load words 0..4 [rdi,rsi,rbp,rax,rcx] from stack, enter the main loop
+    irp _rN_,<0,1,2,3,4>                                ;(no need to re-load x6)
+      mov   r1K_x&_rN_,[rsp+X_stk+8*_rN_]               ;re-load state and get ready to go!
+    endm
+if _SKEIN_DEBUG
+    Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL        ;show state after initial key injection
+endif
+    ;
+    ;;;;;;;;;;;;;;;;;
+    ; now the key schedule is computed. Start the rounds
+    ;
+if SKEIN_ASM_UNROLL and 1024
+_UNROLL_CNT =   ROUNDS_1024/8
+else
+_UNROLL_CNT =   SKEIN_UNROLL_1024
+  if ((ROUNDS_1024/8) mod _UNROLL_CNT)
+    .err "Invalid SKEIN_UNROLL1024"
+  endif
+Skein1024_round_loop:
+endif
+;
+_Rbase_ = 0
+rept _UNROLL_CNT*2                   ;implement the rounds, 4 at a time
+      R1024_FourRounds %(4*_Rbase_+00)
+_Rbase_ = _Rbase_+1
+endm ;rept _UNROLL_CNT*2
+;
+if (SKEIN_ASM_UNROLL and 1024) eq 0
+    cmp     qword ptr [rsp+tmpStk_1024],2*(ROUNDS_1024/8) ;see if we are done
+    jb      Skein1024_round_loop    
+endif
+    ; end of rounds
+    ;;;;;;;;;;;;;;;;;
+    ;
+    ; feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..15}
+    mov     [rsp+X_stk+8*7],r1K_x7  ;we need a register. x6 already on stack
+    mov     r1K_x7,[rsp+ctxPtr]     ;the x7 register (rdx) now points at the context
+    
+    irp _rN_,<0,1,2,3,4,5,8,9,A,B,C,D,E,F>              ;do all but x6,x7
+      xor   r1K_x&_rN_,[rsp   +Wcopy +8*(0&_rN_&h)]     ;feedforward XOR
+      mov              [r1K_x7+X_VARS+8*(0&_rN_&h)],r1K_x&_rN_ ;save result into context
+  if (0&_rN_&h eq 9)
+    mov     r09,FIRST_MASK          ;word 9 has just been stored, so r09 is free to reuse
+  endif
+  if (0&_rN_&h eq 0eh)
+    and     r09,[r1K_x7+TWEAK+ 8]   ;r09 = T[1] and FIRST_MASK, used by the next block
+  endif
+    endm
+    ;
+    mov     rax,[rsp+X_stk    +8*6] ;now process x6,x7
+    mov     rbx,[rsp+X_stk    +8*7]
+    xor     rax,[rsp+Wcopy    +8*6]
+    xor     rbx,[rsp+Wcopy    +8*7]
+    mov         [r1K_x7+X_VARS+8*6],rax
+    dec     qword ptr [rsp+blkCnt]  ;set zero flag iff done
+    mov         [r1K_x7+X_VARS+8*7],rbx
+
+    Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,,<cmp qword ptr [rsp+blkCnt],0>
+    ; go back for more blocks, if needed
+    mov     rdi,[rsp+ctxPtr]        ;don't muck with the flags here!
+    lea     rbp,[rsp+FRAME_OFFS]    ;rbp was used for X2 during the rounds; recompute it from rsp
+    jnz     Skein1024_block_loop
+    mov         [r1K_x7+TWEAK+ 8],r09 ;all blocks done: write the masked T[1] back to the context
+    Reset_Stack Skein1024_Process_Block
+    ret
+;
+if _SKEIN_DEBUG
+; call here with r08 = "round number"
+Skein_Debug_Round_1024:
+_SP_OFFS_ = 8*2                     ;stack "offset" here: r08, return addr
+ SP_ equ <rsp + _SP_OFFS_>          ;useful shorthand below
+;
+  irp _wN_,<1,2,3,5,7,9,A,B,C,D,E,F> ;save rest of X[] state on stack so debug routines can access it
+    mov         [SP_+X_stk+8*(0&_wN_&h)],r1K_x&_wN_
+  endm
+    ;figure out what to do with x0. On rounds R where R==0 mod 4, it's already on the stack
+    cmp     r08,SKEIN_RND_SPECIAL   ;special rounds always save
+    jae     save_x0
+    test    r08,3
+    jz      save_x0_not
+save_x0:
+    mov     [SP_+X_stk+8*0],r1K_x0
+save_x0_not:
+    ;figure out the x4/x6 swapping state and save the correct one!
+    cmp     r08,SKEIN_RND_SPECIAL   ;special rounds always do x4
+    jae     save_x4
+    test    r08,1                   ;and even ones have x4 as well
+    jz      save_x4
+    mov     [SP_+X_stk+8*6],r1K_x6
+    jmp     short debug_1024_go
+save_x4:
+    mov     [SP_+X_stk+8*4],r1K_x4
+debug_1024_go:
+    ;now all is saved in Xstk[] except for X8
+    push    rdx                     ;save two regs for BLK_BITS-specific parms
+    push    rcx
+_SP_OFFS_ = _SP_OFFS_ + 16          ;adjust stack offset accordingly
+    ; now stack offset is 32 to X_stk
+    mov     rcx,[SP_ - 8]           ;get back original r08 (pushed on stack in macro call)
+    mov         [SP_+X_stk+8*8],rcx ;and save it in its rightful place in X_stk[8]
+    mov     rdx,[SP_+ctxPtr]        ;ctx_hdr_ptr
+    mov     rcx, 1024               ;block size
+    jmp     Skein_Debug_Round_Common
+endif
+;
+Skein1024_Process_Block endp
+;
+ifdef SKEIN_CODE_SIZE                       ;optional code-size / unroll-count query helpers
+    public  Skein1024_Process_Block_CodeSize
+Skein1024_Process_Block_CodeSize proc
+    mov     rax,_ProcBytes_                 ;rax = _ProcBytes_ (symbol is set outside this block)
+    ret
+Skein1024_Process_Block_CodeSize endp
+;
+    public  Skein1024_Unroll_Cnt
+Skein1024_Unroll_Cnt proc
+  if _UNROLL_CNT eq ROUNDS_1024/8
+    xor     rax,rax                         ;count equals ROUNDS_1024/8: report 0
+  else
+    mov     rax,_UNROLL_CNT                 ;otherwise report the configured unroll count
+  endif
+    ret
+Skein1024_Unroll_Cnt endp
+endif
+;
+endif ; _USE_ASM_ and 1024
+;
+if _SKEIN_DEBUG
+;----------------------------------------------------------------
+;local debug routine to set up for calls to:
+;  void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X);
+;
+; here with r08 = round number
+;           rdx = ctx_hdr_ptr
+;           rcx = block size (256/512/1024)
+; reached by jmp from the per-size Skein_Debug_Round_* entry points, after they pushed rdx and rcx
+Skein_Debug_Round_Common:
+_SP_OFFS_ = 32                      ;current stack "offset": r08, retAddr, rcx, rdx
+    irp _rr_,<rax,rbx,rsi,rdi,rbp,r09,r10,r11,r12,r13,r14,r15>  ;save the rest of the regs
+      push  _rr_
+_SP_OFFS_ = _SP_OFFS_+8
+    endm
+ if (_SP_OFFS_ and 0Fh)             ; make sure stack is still 16-byte aligned here
+    .err    "Debug_Round_Common: stack alignment"
+ endif
+    ; compute r09 = ptr to the X[] array on the stack
+    lea     r09,[SP_+X_stk]         ;adjust for reg pushes, return address
+    cmp     r08,SKEIN_RND_FEED_FWD  ;special handling for feedforward "round"?
+    jnz     _got_r09a
+    lea     r09,[rdx+X_VARS]        ;feedforward: show the values stored in the context instead
+_got_r09a:
+  if _USE_ASM_ and 1024
+    ; special handling for 1024-bit case
+    ;    (for rounds right before with key injection: 
+    ;        use xDebug_1024[] instead of X_stk[])
+    cmp     r08,SKEIN_RND_SPECIAL
+    jae     _got_r09b               ;special round code: keep r09 as computed above
+    or      r08,r08
+    jz      _got_r09b               ;round number 0: keep r09
+    test    r08,3
+    jne     _got_r09b               ;round number not a multiple of 4: keep r09
+    cmp     rcx,1024                ;only 1024-bit(s) for now
+    jne     _got_r09b
+    lea     r09,[SP_+xDebug_1024]
+_got_r09b:
+  endif
+    sub     rsp, 8*4                ;make room for parms on stack
+    call    Skein_Show_Round        ;call external debug handler
+    add     rsp, 8*4                ;discard parm space on the stack
+
+    irp _rr_,<r15,r14,r13,r12,r11,r10,r09,rbp,rdi,rsi,rbx,rax>  ;restore regs
+      pop   _rr_
+_SP_OFFS_ = _SP_OFFS_-8
+    endm
+ if _SP_OFFS_ - 32
+    .err    "Debug_Round_Common: push/pop misalignment!"
+ endif    
+    pop     rcx                     ;undo the two pushes made by the per-size entry point
+    pop     rdx
+    ret
+endif
+;----------------------------------------------------------------
+    end
diff --git a/Additional_Implementations/skein_block_x64.s b/Additional_Implementations/skein_block_x64.s
new file mode 100644
index 0000000000000..b2d0a83acbe93
--- /dev/null
+++ b/Additional_Implementations/skein_block_x64.s
@@ -0,0 +1,1328 @@
+#
+#----------------------------------------------------------------
+# 64-bit x86 assembler code (gnu as) for Skein block functions
+#
+# Author: Doug Whiting, Hifn/Exar
+#
+# This code is released to the public domain.
+#----------------------------------------------------------------
+#
+    .text
+    .altmacro                               #alternate macro syntax (the percent-expression and angle-bracket argument forms are used below)
+    .psize 0,128                            #list file has no page boundaries
+#
+_MASK_ALL_  =  (256+512+1024)               #all three algorithm bits
+_MAX_FRAME_ =  240
+#
+#################
+# _USE_ASM_ is a bit mask of the block sizes (256/512/1024) assembled from this
+# file; it defaults to all three when SKEIN_USE_ASM is not defined.
+.ifndef SKEIN_USE_ASM
+_USE_ASM_         = _MASK_ALL_
+.else
+_USE_ASM_         = SKEIN_USE_ASM
+.endif
+#################
+# _SKEIN_LOOP is read as three decimal digits, one unroll count per block size:
+# hundreds digit for 256, tens digit for 512, ones digit for 1024.
+.ifndef SKEIN_LOOP                          #configure loop unrolling
+_SKEIN_LOOP       =   2                     #default is fully unrolled for 256/512, twice for 1024
+.else
+_SKEIN_LOOP       = SKEIN_LOOP
+  .irp _NN_,%_SKEIN_LOOP                #only display loop unrolling if default changed on command line
+.print  "+++ SKEIN_LOOP = \_NN_"
+  .endr
+.endif
+# the unroll counts (0 --> fully unrolled)
+SKEIN_UNROLL_256  = (_SKEIN_LOOP / 100) % 10
+SKEIN_UNROLL_512  = (_SKEIN_LOOP /  10) % 10
+SKEIN_UNROLL_1024 = (_SKEIN_LOOP      ) % 10
+#
+# SKEIN_ASM_UNROLL accumulates the block sizes whose unroll count is 0, giving
+# a bit mask of the fully unrolled variants.
+SKEIN_ASM_UNROLL  = 0
+  .irp _NN_,256,512,1024
+    .if (SKEIN_UNROLL_\_NN_) == 0
+SKEIN_ASM_UNROLL  = SKEIN_ASM_UNROLL + \_NN_
+    .endif
+  .endr
+#################
+#
+# Round counts per block size. When SKEIN_ROUNDS is defined it is read as three
+# decimal digits (hundreds for 256, tens for 512, ones for 1024); a digit d
+# selects 8*(((d+5) % 10)+5) rounds, so d=5 gives 40, d=9 gives 72, d=0 gives 80
+# and d=4 gives 112.
+.ifndef SKEIN_ROUNDS
+ROUNDS_256  =   72
+ROUNDS_512  =   72
+ROUNDS_1024 =   80
+.else
+ROUNDS_256  = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)
+ROUNDS_512  = 8*((((SKEIN_ROUNDS /  10) + 5) % 10) + 5)
+ROUNDS_1024 = 8*((((SKEIN_ROUNDS      ) + 5) % 10) + 5)
+# only display rounds if default size is changed on command line
+# NOTE(review): the test below is written with a doubled ampersand although a
+# bit test against the block size looks intended, and the inner test names
+# _NN_ without a backslash; both presumably rely on .altmacro handling inside
+# .irp bodies -- confirm with the assembler in use.
+.irp _NN_,256,512,1024
+  .if _USE_ASM_ && \_NN_
+    .irp _RR_,%(ROUNDS_\_NN_)
+      .if _NN_ < 1024
+.print  "+++ SKEIN_ROUNDS_\_NN_  = \_RR_"
+      .else
+.print  "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
+      .endif
+    .endr
+  .endif
+.endr
+.endif
+#################
+#
+# _SKEIN_CODE_SIZE is 1 when either SKEIN_CODE_SIZE or SKEIN_PERF is defined,
+# otherwise 0; it gates the *_CodeSize and *_Unroll_Cnt helper routines.
+.ifdef SKEIN_CODE_SIZE
+_SKEIN_CODE_SIZE = (1)
+.else
+.ifdef  SKEIN_PERF                           #use code size if SKEIN_PERF is defined
+_SKEIN_CODE_SIZE = (1)
+.else
+_SKEIN_CODE_SIZE = (0)
+.endif
+.endif
+#
+#################
+#
+# _SKEIN_DEBUG is 1 when SKEIN_DEBUG is defined (to any value), otherwise 0;
+# it selects the real or the empty debug macros further down.
+.ifndef SKEIN_DEBUG
+_SKEIN_DEBUG      = 0
+.else
+_SKEIN_DEBUG      = 1
+.endif
+#################
+#
+# define offsets of fields in hash context structure
+#
+# each offset is built from the previous one, giving byte offsets
+# HASH_BITS=0, BCNT=8, TWEAK=16 and X_VARS=32
+HASH_BITS   =   0                   #bits of hash output
+BCNT        =   8 + HASH_BITS       #number of bytes in BUFFER[]
+TWEAK       =   8 + BCNT            #tweak values[0..1]
+X_VARS      =  16 + TWEAK           #chaining vars
+#
+#(Note: buffer[] in context structure is NOT needed here :-)
+#
+KW_PARITY   =   0x1BD11BDAA9FC1A22  #overall parity of key schedule words
+FIRST_MASK  =   ~ (1 <<  6)         #all ones except bit 6
+FIRST_MASK64=   ~ (1 << 62)         #all ones except bit 62; ANDed with the word at TWEAK+8 after each block
+#
+# rotation constants for Skein
+#
+# Skein-256 rotation counts, named RC_256_<round mod 8>_<mix index>; RotL64
+# builds these names from its arguments.
+RC_256_0_0  = 14
+RC_256_0_1  = 16
+
+RC_256_1_0  = 52
+RC_256_1_1  = 57
+
+RC_256_2_0  = 23
+RC_256_2_1  = 40
+
+RC_256_3_0  =  5
+RC_256_3_1  = 37
+
+RC_256_4_0  = 25
+RC_256_4_1  = 33
+
+RC_256_5_0  = 46
+RC_256_5_1  = 12
+
+RC_256_6_0  = 58
+RC_256_6_1  = 22
+
+RC_256_7_0  = 32
+RC_256_7_1  = 32
+
+# Skein-512 rotation counts, named RC_512_<round mod 8>_<mix index>; RotL64
+# builds these names from its arguments.
+RC_512_0_0  = 46
+RC_512_0_1  = 36
+RC_512_0_2  = 19
+RC_512_0_3  = 37
+
+RC_512_1_0  = 33
+RC_512_1_1  = 27
+RC_512_1_2  = 14
+RC_512_1_3  = 42
+
+RC_512_2_0  = 17
+RC_512_2_1  = 49
+RC_512_2_2  = 36
+RC_512_2_3  = 39
+
+RC_512_3_0  = 44
+RC_512_3_1  =  9
+RC_512_3_2  = 54
+RC_512_3_3  = 56
+
+RC_512_4_0  = 39
+RC_512_4_1  = 30
+RC_512_4_2  = 34
+RC_512_4_3  = 24
+
+RC_512_5_0  = 13
+RC_512_5_1  = 50
+RC_512_5_2  = 10
+RC_512_5_3  = 17
+
+RC_512_6_0  = 25
+RC_512_6_1  = 29
+RC_512_6_2  = 39
+RC_512_6_3  = 43
+
+RC_512_7_0  =  8
+RC_512_7_1  = 35
+RC_512_7_2  = 56
+RC_512_7_3  = 22
+
+# Skein-1024 rotation counts, named RC_1024_<round mod 8>_<mix index>; RotL64
+# builds these names from its arguments.
+RC_1024_0_0 = 24
+RC_1024_0_1 = 13
+RC_1024_0_2 =  8
+RC_1024_0_3 = 47
+RC_1024_0_4 =  8
+RC_1024_0_5 = 17
+RC_1024_0_6 = 22
+RC_1024_0_7 = 37
+
+RC_1024_1_0 = 38
+RC_1024_1_1 = 19
+RC_1024_1_2 = 10
+RC_1024_1_3 = 55
+RC_1024_1_4 = 49
+RC_1024_1_5 = 18
+RC_1024_1_6 = 23
+RC_1024_1_7 = 52
+
+RC_1024_2_0 = 33
+RC_1024_2_1 =  4
+RC_1024_2_2 = 51
+RC_1024_2_3 = 13
+RC_1024_2_4 = 34
+RC_1024_2_5 = 41
+RC_1024_2_6 = 59
+RC_1024_2_7 = 17
+
+RC_1024_3_0 =  5
+RC_1024_3_1 = 20
+RC_1024_3_2 = 48
+RC_1024_3_3 = 41
+RC_1024_3_4 = 47
+RC_1024_3_5 = 28
+RC_1024_3_6 = 16
+RC_1024_3_7 = 25
+
+RC_1024_4_0 = 41
+RC_1024_4_1 =  9
+RC_1024_4_2 = 37
+RC_1024_4_3 = 31
+RC_1024_4_4 = 12
+RC_1024_4_5 = 47
+RC_1024_4_6 = 44
+RC_1024_4_7 = 30
+
+RC_1024_5_0 = 16
+RC_1024_5_1 = 34
+RC_1024_5_2 = 56
+RC_1024_5_3 = 51
+RC_1024_5_4 =  4
+RC_1024_5_5 = 53
+RC_1024_5_6 = 42
+RC_1024_5_7 = 41
+
+RC_1024_6_0 = 31
+RC_1024_6_1 = 44
+RC_1024_6_2 = 47
+RC_1024_6_3 = 46
+RC_1024_6_4 = 19
+RC_1024_6_5 = 42
+RC_1024_6_6 = 44
+RC_1024_6_7 = 25
+
+RC_1024_7_0 =  9
+RC_1024_7_1 = 48
+RC_1024_7_2 = 35
+RC_1024_7_3 = 52
+RC_1024_7_4 = 23
+RC_1024_7_5 = 31
+RC_1024_7_6 = 37
+RC_1024_7_7 = 20
+#
+#  Input:  reg
+# Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024
+#
+# RotL64: rotate the 64-bit register reg left by the constant named
+# RC_<BLK_SIZE>_<ROUND_NUM>_<MIX_NUM>. The ampersands in the name expression
+# separate each parameter from the text that follows it. No instruction is
+# emitted when the looked-up count is zero.
+.macro RotL64   reg,BLK_SIZE,ROUND_NUM,MIX_NUM
+_RCNT_ = RC_\BLK_SIZE&_\ROUND_NUM&_\MIX_NUM
+  .if _RCNT_  #is there anything to do?
+    rolq    $_RCNT_,%\reg
+  .endif
+.endm
+#
+#----------------------------------------------------------------
+#
+# MACROS: define local vars and configure stack
+#
+#----------------------------------------------------------------
+# declare allocated space on the stack
+# StackVar: define localName as the current running offset _STK_OFFS_, then
+# advance _STK_OFFS_ by localSize bytes. The caller must set _STK_OFFS_ first.
+.macro StackVar localName,localSize
+\localName  =   _STK_OFFS_
+_STK_OFFS_  =   _STK_OFFS_+(\localSize)
+.endm #StackVar
+#
+#----------------------------------------------------------------
+#
+# MACRO: Configure stack frame, allocate local vars
+#
+# Setup_Stack: function prologue. Pushes six registers, assigns stack offsets
+# to all local variables with StackVar, reserves LOCAL_SIZE bytes, points rbp
+# into the locals and stores the four incoming argument registers.
+#   BLK_BITS = block size in bits (WCNT becomes BLK_BITS/64)
+#   KS_CNT   = sizes the ksRot area (16 bytes per count) in the looping build
+#   debugCnt = optional word count for the xDebug_<BLK_BITS> area (debug only)
+.macro Setup_Stack BLK_BITS,KS_CNT,debugCnt
+    WCNT    =    (\BLK_BITS)/64
+#
+_PushCnt_   =   0                   #save nonvolatile regs on stack
+  .irp _reg_,rbp,rbx,r12,r13,r14,r15
+       pushq    %\_reg_
+_PushCnt_ = _PushCnt_ + 1           #track count to keep alignment
+  .endr
+#
+_STK_OFFS_  =   0                   #starting offset from rsp
+    #---- local  variables         #<-- rsp
+    StackVar    X_stk  ,8*(WCNT)    #local context vars
+    StackVar    ksTwk  ,8*3         #key schedule: tweak words
+    StackVar    ksKey  ,8*(WCNT)+8  #key schedule: key   words
+  # ksRot is reserved only when the unroll-mask test for this block size is zero
+  # NOTE(review): doubled ampersand where a bit test looks intended -- presumably
+  # relies on .altmacro handling inside macro bodies; confirm.
+  .if (SKEIN_ASM_UNROLL && (\BLK_BITS)) == 0
+    StackVar    ksRot ,16*(\KS_CNT) #leave space for "rotation" to happen
+  .endif
+    StackVar    Wcopy  ,8*(WCNT)    #copy of input block    
+  .if _SKEIN_DEBUG
+  .if \debugCnt + 0                 #temp location for debug X[] info
+    StackVar    xDebug_\BLK_BITS ,8*(\debugCnt)
+  .endif
+  .endif
+  # NOTE(review): every size added above is a multiple of 8, so this test looks
+  # always true; tmpStk_<BLK_BITS> is defined only inside it. Confirm whether a
+  # modulus of 16 was intended before changing anything.
+  .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0
+    StackVar    align16,8           #keep 16-byte aligned (adjust for retAddr?)
+tmpStk_\BLK_BITS = align16          #use this
+  .endif
+    #---- saved caller parameters (from regs rdi, rsi, rdx, rcx)
+    StackVar    ctxPtr ,8           #context ptr
+    StackVar    blkPtr ,8           #pointer to block data
+    StackVar    blkCnt ,8           #number of full blocks to process
+    StackVar    bitAdd ,8           #bit count to add to tweak
+LOCAL_SIZE  =   _STK_OFFS_          #size of "local" vars
+    #---- 
+    StackVar    savRegs,8*_PushCnt_ #saved registers
+    StackVar    retAddr,8           #return address
+    #---- caller's stack frame (aligned mod 16)
+#
+# set up the stack frame pointer (rbp)
+#
+FRAME_OFFS  =   ksTwk + 128         #allow short (negative) offset to ksTwk, ksKey
+  .if FRAME_OFFS > _STK_OFFS_       #keep rbp in the "locals" range
+FRAME_OFFS  =      _STK_OFFS_
+  .endif
+# F_O converts an rsp-relative local offset into an rbp-relative one:
+# name+F_O(%rbp) addresses the same byte as name(%rsp) did right after the subq below
+F_O         =   -FRAME_OFFS
+#
+  #put some useful defines in the .lst file (for grep)
+__STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE
+__STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_
+__STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS
+#
+# Notes on stack frame setup:
+#   * the most frequently used variable is X_stk[], based at [rsp+0]
+#   * the next most used is the key schedule arrays, ksKey and ksTwk
+#       so rbp is "centered" there, allowing short offsets to the key 
+#       schedule even in 1024-bit Skein case
+#   * the Wcopy variables are infrequently accessed, but they have long 
+#       offsets from both rsp and rbp only in the 1024-bit case.
+#   * all other local vars and calling parameters can be accessed 
+#       with short offsets, except in the 1024-bit case
+#
+    subq    $LOCAL_SIZE,%rsp        #make room for the locals
+    leaq    FRAME_OFFS(%rsp),%rbp   #maximize use of short offsets
+    movq    %rdi, ctxPtr+F_O(%rbp)  #save caller's parameters on the stack
+    movq    %rsi, blkPtr+F_O(%rbp)
+    movq    %rdx, blkCnt+F_O(%rbp)
+    movq    %rcx, bitAdd+F_O(%rbp)
+#
+.endm #Setup_Stack
+#
+#----------------------------------------------------------------
+#
+# Reset_Stack: function epilogue matching Setup_Stack. Releases the local
+# variable area, pops the six saved registers in the reverse of the order
+# Setup_Stack pushed them, and fails the build if the push counter does not
+# return to zero.
+.macro Reset_Stack
+    addq    $LOCAL_SIZE,%rsp        #release the locals (not wiped)
+    popq    %r15                    #restore the registers of the caller
+    popq    %r14
+    popq    %r13
+    popq    %r12
+    popq    %rbx
+    popq    %rbp
+_PushCnt_ = _PushCnt_ - 6           #account for the six pops above
+  .if _PushCnt_
+    .error  "Mismatched push/pops?"
+  .endif
+.endm # Reset_Stack
+#
+#----------------------------------------------------------------
+# macros to help debug internals
+#
+.if _SKEIN_DEBUG
+    .extern  Skein_Show_Block     #calls to C routines
+    .extern  Skein_Show_Round
+#
+# Round "numbers" at or above SKEIN_RND_SPECIAL are event codes rather than
+# real round indices; Skein_Debug_Round always passes them as immediates.
+SKEIN_RND_SPECIAL       =   1000
+SKEIN_RND_KEY_INITIAL   =   SKEIN_RND_SPECIAL+0
+SKEIN_RND_KEY_INJECT    =   SKEIN_RND_SPECIAL+1
+SKEIN_RND_FEED_FWD      =   SKEIN_RND_SPECIAL+2
+#
+# Skein_Debug_Block: call the external Skein_Show_Block routine with the
+# current block state, preserving the nine registers pushed below.
+#   BLK_BITS = block size in bits, passed as the first argument
+# Six arguments go in rdi, rsi, rdx, rcx, r8, r9 and the seventh (tweak
+# pointer) is pushed on the stack. _NN_ counts the register pushes so the
+# macro can check parity and push/pop balance at assembly time.
+.macro Skein_Debug_Block BLK_BITS
+#
+#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
+#                     const u08b_t *blkPtr, const u64b_t *wPtr, 
+#                     const u64b_t *ksPtr,const u64b_t *tsPtr)
+#
+_NN_ = 0
+  .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11
+    pushq   %\_reg_                 #save all volatile regs on stack before the call
+_NN_ = _NN_ + 1
+  .endr
+    # get and push call parameters
+    movq    $\BLK_BITS      ,%rdi   #bits
+    movq    ctxPtr+F_O(%rbp),%rsi   #h (pointer)
+    leaq    X_VARS    (%rsi),%rdx   #X (pointer)
+    movq    blkPtr+F_O(%rbp),%rcx   #blkPtr
+    leaq    Wcopy +F_O(%rbp),%r8    #wPtr
+    leaq    ksKey +F_O(%rbp),%r9    #key pointer
+    leaq    ksTwk +F_O(%rbp),%rax   #tweak pointer
+    pushq   %rax                    #   (pass on the stack)
+    call    Skein_Show_Block        #call external debug handler
+    addq    $8*1,%rsp               #discard parameters on stack
+  # an even register-push count plus the one parameter push is rejected here
+  .if (_NN_ % 2 ) == 0              #check stack alignment
+    .error "Stack misalignment problem in Skein_Debug_Block_\BLK_BITS"
+  .endif
+  .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax
+    popq    %\_reg_                 #restore regs
+_NN_ = _NN_ - 1
+  .endr
+  .if _NN_
+    .error "Push/pop mismatch problem in Skein_Debug_Block_\BLK_BITS"
+  .endif
+.endm # Skein_Debug_Block
+#
+# the macro to "call" to debug a round
+#
+# Skein_Debug_Round: load a round "number" into rdx and call the local routine
+# Skein_Debug_Round_<BLK_BITS>; rdx is saved before and restored after.
+#   BLK_BITS = block size, selects the routine called
+#   R        = round number or one of the SKEIN_RND_* event codes
+#   RDI_OFFS = optional constant added when the number is computed from a counter
+#   afterOp  = optional text emitted after the call (no caller in view passes it)
+# When the first test below holds, R is used as an immediate; otherwise rdx is
+# computed as 4*counter plus a constant, the counter being rdi (or, for 1024,
+# the value stored at rIdx_offs on the stack).
+# NOTE(review): the doubled ampersands read like bit tests (unroll mask against
+# the block size, round number against 3); presumably .altmacro handling inside
+# macro bodies makes them so -- confirm before editing these expressions.
+.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
+    # call the appropriate (local) debug "function"
+    pushq   %rdx                    #save rdx, so we can use it for round "number"
+  .if (SKEIN_ASM_UNROLL && \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
+    movq    $\R,%rdx
+  .else                             #compute round number using edi
+_rOffs_ = \RDI_OFFS + 0
+   .if \BLK_BITS == 1024
+    movq    rIdx_offs+8(%rsp),%rdx  #get rIdx off the stack (adjust for pushq rdx above)
+    leaq    1+(((\R)-1) && 3)+_rOffs_(,%rdx,4),%rdx
+   .else
+    leaq    1+(((\R)-1) && 3)+_rOffs_(,%rdi,4),%rdx
+   .endif
+  .endif
+    call    Skein_Debug_Round_\BLK_BITS
+    popq    %rdx                    #restore original rdx value
+#
+    afterOp
+.endm  #  Skein_Debug_Round
+.else  #------- _SKEIN_DEBUG (dummy macros if debug not enabled)
+# Empty stand-ins used when _SKEIN_DEBUG is 0: same names and parameter lists
+# as the real debug macros, but they expand to nothing.
+.macro Skein_Debug_Block BLK_BITS
+.endm
+#
+.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
+.endm
+#
+.endif # _SKEIN_DEBUG
+#
+#----------------------------------------------------------------
+#
+# addReg: dstReg += source register (+ optional constant).
+#   dstReg            = destination register name (no percent sign)
+#   srcReg_A,srcReg_B = concatenated to form the source register name, so a
+#                       caller can pass a prefix and a computed number
+#   useAddOp          = nonzero forces addq instead of leaq
+#   immOffs           = optional constant; when nonzero it is folded in via leaq
+# Instruction choice: leaq with displacement when immOffs is nonzero; else,
+# when useAddOp is zero or omitted, leaq unless ASM_NO_LEA is defined (then
+# addq); else addq.
+.macro  addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
+  .if \immOffs + 0
+       leaq    \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
+  .elseif ((\useAddOp + 0) == 0)
+    .ifndef ASM_NO_LEA  #lea seems to be faster on Core 2 Duo CPUs!
+       leaq   (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
+    .else
+       addq    %\srcReg_A\srcReg_B,%\dstReg
+    .endif
+  .else
+       addq    %\srcReg_A\srcReg_B,%\dstReg
+  .endif
+.endm
+
+# xorReg: dstReg ^= source register, where the source name is srcReg_A and
+# srcReg_B concatenated.
+# keep Intel-style ordering here, to match addReg
+.macro  xorReg dstReg,srcReg_A,srcReg_B
+        xorq   %\srcReg_A\srcReg_B,%\dstReg
+.endm
+#
+#----------------------------------------------------------------
+#
+# C_label: export an entry point under two spellings, lName and _lName, and
+# place both labels at the current location so either linkage convention
+# resolves to the same address.
+.macro C_label lName
+    .global  \lName
+    .global _\lName
+ \lName:
+_\lName:
+.endm
+#
+#=================================== Skein_256 =============================================
+#
+.if _USE_ASM_ & 256
+#
+# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
+#
+#################
+#
+# code
+#
+# Skein_256_Process_Block: entry point. Setup_Stack saves the four incoming
+# argument registers (rdi, rsi, rdx, rcx) as ctxPtr, blkPtr, blkCnt and bitAdd.
+# Each pass of Skein_256_block_loop consumes WCNT*8 bytes of input, runs the
+# rounds and writes the feedforward result to the X_VARS field of the context.
+C_label Skein_256_Process_Block
+    Setup_Stack 256,((ROUNDS_256/8)+1)
+    movq    TWEAK+8(%rdi),%r14               #r14 = second tweak word, kept across blocks
+    jmp     Skein_256_block_loop
+    .p2align 4
+    # main hash loop for Skein_256
+Skein_256_block_loop:
+    #
+    # general register usage:
+    #   RAX..RDX        = X0..X3    
+    #   R08..R12        = ks[0..4]
+    #   R13..R15        = ts[0..2]
+    #   RSP, RBP        = stack/frame pointers
+    #   RDI             = round counter or context pointer
+    #   RSI             = temp
+    #
+    movq    TWEAK+0(%rdi)     ,%r13
+    addq    bitAdd+F_O(%rbp)  ,%r13  #computed updated tweak value T0
+    movq    %r14              ,%r15
+    xorq    %r13              ,%r15  #now %r13.%r15 is set as the tweak 
+
+    movq    $KW_PARITY        ,%r12
+    movq       X_VARS+ 0(%rdi),%r8
+    movq       X_VARS+ 8(%rdi),%r9 
+    movq       X_VARS+16(%rdi),%r10
+    movq       X_VARS+24(%rdi),%r11
+    movq    %r13,TWEAK+0(%rdi)       #save updated tweak value ctx->h.T[0]
+    xorq    %r8               ,%r12  #start accumulating overall parity
+
+    movq    blkPtr +F_O(%rbp) ,%rsi  #rsi --> input block
+    xorq    %r9               ,%r12
+    movq     0(%rsi)          ,%rax  #get X[0..3]
+    xorq    %r10              ,%r12
+    movq     8(%rsi)          ,%rbx
+    xorq    %r11              ,%r12
+    movq    16(%rsi)          ,%rcx
+    movq    24(%rsi)          ,%rdx
+
+    movq    %rax,Wcopy+ 0+F_O(%rbp)  #save copy of input block
+    movq    %rbx,Wcopy+ 8+F_O(%rbp)    
+    movq    %rcx,Wcopy+16+F_O(%rbp)    
+    movq    %rdx,Wcopy+24+F_O(%rbp)    
+
+    addq    %r8 ,%rax                #initial key injection
+    addq    %r9 ,%rbx 
+    addq    %r10,%rcx
+    addq    %r11,%rdx
+    addq    %r13,%rbx
+    addq    %r14,%rcx
+
+.if _SKEIN_DEBUG
+    movq    %r14,TWEAK+ 8(%rdi)      #save updated tweak T[1] (start bit cleared?)
+    movq    %r8 ,ksKey+ 0+F_O(%rbp)  #save key schedule on stack for Skein_Debug_Block
+    movq    %r9 ,ksKey+ 8+F_O(%rbp)    
+    movq    %r10,ksKey+16+F_O(%rbp)    
+    movq    %r11,ksKey+24+F_O(%rbp)    
+    movq    %r12,ksKey+32+F_O(%rbp)    
+                                       
+    movq    %r13,ksTwk+ 0+F_O(%rbp)    
+    movq    %r14,ksTwk+ 8+F_O(%rbp)    
+    movq    %r15,ksTwk+16+F_O(%rbp)    
+                                       
+    movq    %rax,X_stk + 0(%rsp)     #save X[] on stack for Skein_Debug_Block
+    movq    %rbx,X_stk + 8(%rsp)       
+    movq    %rcx,X_stk +16(%rsp)       
+    movq    %rdx,X_stk +24(%rsp)       
+
+    Skein_Debug_Block 256            #debug dump
+    Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
+.endif
+#
+# Looping build only: store the schedule where the rdi-indexed loads in the
+# round loop will find it. Note r8 goes to slot 5 (ksKey+40) and r13 to slot 3
+# (ksTwk+24), not to slot 0; this presumably feeds the "rotate" stores inside
+# the loop -- confirm against the round loop before changing any offset.
+.if ((SKEIN_ASM_UNROLL & 256) == 0)
+    movq    %r8 ,ksKey+40+F_O(%rbp)  #save key schedule on stack for looping code
+    movq    %r9 ,ksKey+ 8+F_O(%rbp)    
+    movq    %r10,ksKey+16+F_O(%rbp)    
+    movq    %r11,ksKey+24+F_O(%rbp)    
+    movq    %r12,ksKey+32+F_O(%rbp)    
+                                       
+    movq    %r13,ksTwk+24+F_O(%rbp)    
+    movq    %r14,ksTwk+ 8+F_O(%rbp)    
+    movq    %r15,ksTwk+16+F_O(%rbp)    
+.endif
+    addq    $WCNT*8,%rsi             #skip the block
+    movq    %rsi,blkPtr  +F_O(%rbp)  #update block pointer
+    #
+    # now the key schedule is computed. Start the rounds
+    #
+.if SKEIN_ASM_UNROLL & 256
+_UNROLL_CNT =   ROUNDS_256/8
+.else
+_UNROLL_CNT =   SKEIN_UNROLL_256
+  .if ((ROUNDS_256/8) % _UNROLL_CNT)
+    .error "Invalid SKEIN_UNROLL_256"
+  .endif
+    xorq    %rdi,%rdi                #rdi = iteration count
+Skein_256_round_loop:
+.endif
+# each .rept pass below emits four rounds followed by one key injection
+_Rbase_ = 0
+.rept _UNROLL_CNT*2
+    # all X and ks vars in regs      # (ops to "rotate" ks vars, via mem, if not unrolled)
+    # round 4*_RBase_ + 0
+    addReg  rax, rbx
+    RotL64  rbx, 256,%((4*_Rbase_+0) % 8),0
+    addReg  rcx, rdx
+                .if (SKEIN_ASM_UNROLL & 256) == 0
+                    movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8
+                .endif
+    xorReg  rbx, rax
+    RotL64  rdx, 256,%((4*_Rbase_+0) % 8),1
+    xorReg  rdx, rcx
+  .if SKEIN_ASM_UNROLL & 256
+    .irp _r0_,%( 8+(_Rbase_+3) % 5)
+    .irp _r1_,%(13+(_Rbase_+2) % 3)
+      leaq   (%r\_r0_,%r\_r1_),%rdi    #precompute key injection value for %rcx
+    .endr
+    .endr
+  .endif
+                .if (SKEIN_ASM_UNROLL & 256) == 0
+                    movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13
+                .endif
+    Skein_Debug_Round 256,%(4*_Rbase_+1)
+
+    # round 4*_Rbase_ + 1
+    addReg  rax, rdx
+    RotL64  rdx, 256,%((4*_Rbase_+1) % 8),0
+    xorReg  rdx, rax
+                .if (SKEIN_ASM_UNROLL & 256) == 0
+                    movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9
+                .endif
+    addReg  rcx, rbx
+    RotL64  rbx, 256,%((4*_Rbase_+1) % 8),1
+    xorReg  rbx, rcx
+                .if (SKEIN_ASM_UNROLL & 256) == 0
+                    movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11
+                .endif
+    Skein_Debug_Round 256,%(4*_Rbase_+2)
+ .if SKEIN_ASM_UNROLL & 256
+    .irp _r0_,%( 8+(_Rbase_+2) % 5)
+    .irp _r1_,%(13+(_Rbase_+1) % 3)
+      leaq   (%r\_r0_,%r\_r1_),%rsi     #precompute key injection value for %rbx
+    .endr
+    .endr
+ .endif
+    # round 4*_Rbase_ + 2
+    addReg  rax, rbx
+    RotL64  rbx, 256,%((4*_Rbase_+2) % 8),0
+    addReg  rcx, rdx
+                .if (SKEIN_ASM_UNROLL & 256) == 0
+                    movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10
+                .endif
+    xorReg  rbx, rax
+    RotL64  rdx, 256,%((4*_Rbase_+2) % 8),1
+    xorReg  rdx, rcx
+                .if (SKEIN_ASM_UNROLL & 256) == 0
+                    movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8)  #"rotate" the key
+                    leaq 1(%r11,%rdi),%r11               #precompute key + tweak
+                .endif
+    Skein_Debug_Round 256,%(4*_Rbase_+3)
+    # round 4*_Rbase_ + 3
+    addReg  rax, rdx
+    RotL64  rdx, 256,%((4*_Rbase_+3) % 8),0
+    addReg  rcx, rbx
+                .if (SKEIN_ASM_UNROLL & 256) == 0
+                    addq      ksTwk+8*2+F_O(%rbp,%rdi,8),%r10  #precompute key + tweak
+                    movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8)       #"rotate" the tweak
+                .endif
+    xorReg  rdx, rax
+    RotL64  rbx, 256,%((4*_Rbase_+3) % 8),1
+    xorReg  rbx, rcx
+    Skein_Debug_Round 256,%(4*_Rbase_+4)
+                .if (SKEIN_ASM_UNROLL & 256) == 0
+                    addReg r9 ,r13           #precompute key+tweak
+                .endif
+      #inject key schedule words
+_Rbase_ = _Rbase_+1
+  .if SKEIN_ASM_UNROLL & 256
+    addReg    rax,r,%(8+((_Rbase_+0) % 5))
+    addReg    rbx,rsi
+    addReg    rcx,rdi
+    addReg    rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_
+  .else
+    incq      %rdi
+    addReg    rax,r8 
+    addReg    rcx,r10
+    addReg    rbx,r9 
+    addReg    rdx,r11
+  .endif
+    Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
+.endr #rept _UNROLL_CNT*2
+#
+# looping build: repeat until rdi has counted 2*(ROUNDS_256/8) key injections
+.if (SKEIN_ASM_UNROLL & 256) == 0
+    cmpq    $2*(ROUNDS_256/8),%rdi
+    jb      Skein_256_round_loop
+.endif # (SKEIN_ASM_UNROLL & 256) == 0
+    movq    ctxPtr +F_O(%rbp),%rdi           #restore rdi --> context
+
+    #----------------------------
+    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..3}
+    # r14 is rebuilt here as the stored second tweak word ANDed with
+    # FIRST_MASK64; it serves the next block and is written back after the last
+    movq    $FIRST_MASK64 ,%r14
+    xorq    Wcopy + 0+F_O (%rbp),%rax
+    xorq    Wcopy + 8+F_O (%rbp),%rbx
+    xorq    Wcopy +16+F_O (%rbp),%rcx
+    xorq    Wcopy +24+F_O (%rbp),%rdx
+    andq    TWEAK + 8     (%rdi),%r14
+    movq    %rax,X_VARS+ 0(%rdi)             #store final result
+    movq    %rbx,X_VARS+ 8(%rdi)        
+    movq    %rcx,X_VARS+16(%rdi)        
+    movq    %rdx,X_VARS+24(%rdi)        
+
+    Skein_Debug_Round 256,SKEIN_RND_FEED_FWD
+
+    # go back for more blocks, if needed
+    decq    blkCnt+F_O(%rbp)
+    jnz     Skein_256_block_loop
+    movq    %r14,TWEAK + 8(%rdi)
+    Reset_Stack
+    ret
+Skein_256_Process_Block_End:
+
+# Skein_Debug_Round_256 (debug builds only): target of the call emitted by
+# Skein_Debug_Round for 256-bit blocks. Copies the four state words into
+# X_stk, loads rsi and rdi with the context pointer and the block size, and
+# jumps to Skein_Debug_Round_Common.
+  .if _SKEIN_DEBUG
+Skein_Debug_Round_256:               #here with rdx == round "number" from macro
+    pushq   %rsi                     #save two regs for BLK_BITS-specific parms
+    pushq   %rdi
+    # stack now holds rdi, rsi, return address, then the rdx pushed by the macro
+    movq    24(%rsp),%rdi            #get back original rdx (pushed on stack in macro call) to rdi
+    movq    %rax,X_stk+ 0+F_O(%rbp)  #save X[] state on stack so debug routines can access it
+    movq    %rbx,X_stk+ 8+F_O(%rbp)  #(use FP_ since rsp has changed!)
+    movq    %rcx,X_stk+16+F_O(%rbp)
+    movq    %rdi,X_stk+24+F_O(%rbp)
+
+    movq    ctxPtr+F_O(%rbp),%rsi    #ctx_hdr_ptr
+    movq    $256,%rdi                #now <rdi,rsi,rdx> are set for the call
+    jmp     Skein_Debug_Round_Common
+  .endif
+#
+# Optional query helpers, assembled only when _SKEIN_CODE_SIZE is nonzero.
+.if _SKEIN_CODE_SIZE
+# returns in rax the distance in bytes from the entry label to the End label
+C_label  Skein_256_Process_Block_CodeSize
+    movq    $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax
+    ret
+#
+# returns in rax the value of _UNROLL_CNT, or 0 when it equals ROUNDS_256/8
+C_label Skein_256_Unroll_Cnt
+  .if _UNROLL_CNT == ROUNDS_256/8
+    xorq    %rax,%rax
+  .else
+    movq    $_UNROLL_CNT,%rax
+  .endif
+    ret
+.endif
+#
+.endif #_USE_ASM_ & 256
+#
+#=================================== Skein_512 =============================================
+#
+.if _USE_ASM_ & 512
+#
+# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
+#
+# X[i] == %r[8+i]          #register assignments for X[] values during rounds (i=0..7)
+#
+#################
+# MACRO: one round for 512-bit blocks
+#
+# R_512_OneRound: one round of four add/rotate/xor steps on register pairs.
+#   rn0..rn7 = register numbers (the macro prefixes each with r); pairs are
+#              (rn0,rn1), (rn2,rn3), (rn4,rn5), (rn6,rn7)
+#   _Rn_     = round number; its value mod 8 selects the rotation constants
+#   op1..op4 = optional instruction text emitted after each of the four steps
+# The parameter names op1..op4 and _Rn_ appear without a backslash in the body;
+# presumably .altmacro substitutes them as bare names -- confirm before renaming.
+.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4
+#
+    addReg      r\rn0, r\rn1
+    RotL64      r\rn1, 512,%((_Rn_) % 8),0
+    xorReg      r\rn1, r\rn0
+            op1
+    addReg      r\rn2, r\rn3
+    RotL64      r\rn3, 512,%((_Rn_) % 8),1
+    xorReg      r\rn3, r\rn2
+            op2
+    addReg      r\rn4, r\rn5
+    RotL64      r\rn5, 512,%((_Rn_) % 8),2
+    xorReg      r\rn5, r\rn4
+            op3
+    addReg      r\rn6, r\rn7
+    RotL64      r\rn7, 512,%((_Rn_) % 8),3
+    xorReg      r\rn7, r\rn6
+            op4
+    Skein_Debug_Round 512,%(_Rn_+1),-4
+#
+.endm #R_512_OneRound
+#
+#################
+# MACRO: eight rounds for 512-bit blocks
+#
+# R_512_FourRounds: four rounds followed by one key injection.
+# The four R_512_OneRound calls pass the state registers in a different order
+# each time, and hand in key-schedule loads and adds as the interleaved ops.
+#   fully unrolled build: schedule words are addressed with assembly-time
+#                         indices derived from _II_ = (_RR_/4)+1
+#   looping build:        schedule words are addressed through rdi, which is
+#                         incremented first and also added to r15 at the end
+# NOTE(review): the doubled ampersand in the first test reads like a bit test
+# of the unroll mask; presumably .altmacro makes it one -- confirm.
+.macro R_512_FourRounds _RR_    #RR = base round number (0 % 8)
+  .if (SKEIN_ASM_UNROLL && 512)
+    # here for fully unrolled case.
+    _II_ = ((_RR_)/4) + 1       #key injection counter
+    R_512_OneRound  8, 9,10,11,12,13,14,15,%((_RR_)+0),<movq ksKey+8*(((_II_)+3) % 9)+F_O(%rbp),%rax>,,<movq ksKey+8*(((_II_)+4) % 9)+F_O(%rbp),%rbx>
+    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),<movq ksKey+8*(((_II_)+5) % 9)+F_O(%rbp),%rcx>,,<movq ksKey+8*(((_II_)+6) % 9)+F_O(%rbp),%rdx>
+    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),<movq ksKey+8*(((_II_)+7) % 9)+F_O(%rbp),%rsi>,,<addq ksTwk+8*(((_II_)+0) % 3)+F_O(%rbp),%rcx>
+    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),<addq ksTwk+8*(((_II_)+1) % 3)+F_O(%rbp),%rdx>,
+    # inject the key schedule
+    addq    ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8
+    addReg   r11, rax
+    addq    ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9
+    addReg   r12, rbx
+    addq    ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10
+    addReg   r13, rcx
+    addReg   r14, rdx
+    addReg   r15, rsi,,,(_II_)
+  .else
+    # here for looping case                                                    #"rotate" key/tweak schedule (move up on stack)
+    incq    %rdi                 #bump key injection counter
+    R_512_OneRound  8, 9,10,11,12,13,14,15,%((_RR_)+0),<movq ksKey+8*6+F_O(%rbp,%rdi,8),%rdx>,<movq      ksTwk-8*1+F_O(%rbp,%rdi,8),%rax>,<movq      ksKey-8*1+F_O(%rbp,%rdi,8),%rsi>
+    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),<movq ksKey+8*5+F_O(%rbp,%rdi,8),%rcx>,<movq %rax,ksTwk+8*2+F_O(%rbp,%rdi,8)     >,<movq %rsi,ksKey+8*8+F_O(%rbp,%rdi,8)>
+    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),<movq ksKey+8*4+F_O(%rbp,%rdi,8),%rbx>,<addq      ksTwk+8*1+F_O(%rbp,%rdi,8),%rdx>,<movq      ksKey+8*7+F_O(%rbp,%rdi,8),%rsi>    
+    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),<movq ksKey+8*3+F_O(%rbp,%rdi,8),%rax>,<addq      ksTwk+8*0+F_O(%rbp,%rdi,8),%rcx>
+    # inject the key schedule
+    addq    ksKey+8*0+F_O(%rbp,%rdi,8),%r8
+    addReg   r11, rax
+    addReg   r12, rbx
+    addq    ksKey+8*1+F_O(%rbp,%rdi,8),%r9
+    addReg   r13, rcx
+    addReg   r14, rdx
+    addq    ksKey+8*2+F_O(%rbp,%rdi,8),%r10
+    addReg   r15, rsi
+    addReg   r15, rdi              #inject the round number
+  .endif
+
+    #show the result of the key injection
+    Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT
+.endm #R_512_FourRounds
+#
+#################
+# instantiated code
+#
+# Skein_512_Process_Block: entry point. Setup_Stack saves the four incoming
+# argument registers (rdi, rsi, rdx, rcx) as ctxPtr, blkPtr, blkCnt and bitAdd.
+# Each pass of Skein_512_block_loop builds the key and tweak schedule on the
+# stack, adds the input words to the state, runs the rounds via
+# R_512_FourRounds and writes the feedforward result to X_VARS in the context.
+C_label Skein_512_Process_Block
+    Setup_Stack 512,ROUNDS_512/8
+    movq    TWEAK+ 8(%rdi),%rbx               #rbx = second tweak word on loop entry
+    jmp     Skein_512_block_loop
+    .p2align 4
+    # main hash loop for Skein_512
+Skein_512_block_loop:
+    # general register usage:
+    #   RAX..RDX       = temps for key schedule pre-loads
+    #   R8 ..R15       = X0..X7
+    #   RSP, RBP       = stack/frame pointers
+    #   RDI            = round counter or context pointer
+    #   RSI            = temp
+    #
+    movq    TWEAK +  0(%rdi),%rax
+    addq    bitAdd+F_O(%rbp),%rax     #computed updated tweak value T0
+    movq    %rbx,%rcx
+    xorq    %rax,%rcx                 #%rax/%rbx/%rcx = tweak schedule
+    movq    %rax,TWEAK+ 0    (%rdi)   #save updated tweak value ctx->h.T[0]
+    movq    %rax,ksTwk+ 0+F_O(%rbp)
+    movq    $KW_PARITY,%rdx
+    movq    blkPtr +F_O(%rbp),%rsi    #%rsi --> input block
+    movq    %rbx,ksTwk+ 8+F_O(%rbp)
+    movq    %rcx,ksTwk+16+F_O(%rbp)
+    .irp _Rn_,8,9,10,11,12,13,14,15
+      movq  X_VARS+8*(_Rn_-8)(%rdi),%r\_Rn_
+      xorq  %r\_Rn_,%rdx              #compute overall parity
+      movq  %r\_Rn_,ksKey+8*(_Rn_-8)+F_O(%rbp)
+    .endr                             #load state into %r8 ..%r15, compute parity
+      movq  %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity
+
+    addReg   r13,rax                  #precompute key injection for tweak
+    addReg   r14, rbx
+.if _SKEIN_DEBUG
+    movq    %rbx,TWEAK+ 8(%rdi)       #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below
+.endif
+    # rax..rdx are reused from here on as input-word temporaries
+    movq     0(%rsi),%rax             #load input block
+    movq     8(%rsi),%rbx 
+    movq    16(%rsi),%rcx 
+    movq    24(%rsi),%rdx 
+    addReg   r8 , rax                 #do initial key injection
+    addReg   r9 , rbx
+    movq    %rax,Wcopy+ 0+F_O(%rbp)   #keep local copy for feedforward
+    movq    %rbx,Wcopy+ 8+F_O(%rbp)
+    addReg   r10, rcx
+    addReg   r11, rdx
+    movq    %rcx,Wcopy+16+F_O(%rbp)
+    movq    %rdx,Wcopy+24+F_O(%rbp)
+
+    movq    32(%rsi),%rax
+    movq    40(%rsi),%rbx 
+    movq    48(%rsi),%rcx 
+    movq    56(%rsi),%rdx
+    addReg   r12, rax
+    addReg   r13, rbx
+    addReg   r14, rcx
+    addReg   r15, rdx
+    movq    %rax,Wcopy+32+F_O(%rbp)    
+    movq    %rbx,Wcopy+40+F_O(%rbp)    
+    movq    %rcx,Wcopy+48+F_O(%rbp)    
+    movq    %rdx,Wcopy+56+F_O(%rbp)    
+
+.if _SKEIN_DEBUG
+    .irp _Rn_,8,9,10,11,12,13,14,15   #save values on stack for debug output
+      movq  %r\_Rn_,X_stk+8*(_Rn_-8)(%rsp)
+    .endr
+
+    Skein_Debug_Block 512             #debug dump
+    Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL
+.endif
+    addq    $8*WCNT,%rsi              #skip the block
+    movq    %rsi,blkPtr+F_O(%rbp)     #update block pointer
+    #
+    #################
+    # now the key schedule is computed. Start the rounds
+    #
+.if SKEIN_ASM_UNROLL & 512
+_UNROLL_CNT =   ROUNDS_512/8
+.else
+_UNROLL_CNT =   SKEIN_UNROLL_512
+  .if ((ROUNDS_512/8) % _UNROLL_CNT)
+    # use .error (not .err) so the message text is actually reported,
+    # matching the 256-bit path
+    .error "Invalid SKEIN_UNROLL_512"
+  .endif
+    xorq    %rdi,%rdi                 #rdi = round counter
+Skein_512_round_loop:
+.endif
+#
+# each .rept pass below emits four rounds plus one key injection
+_Rbase_ = 0
+.rept _UNROLL_CNT*2
+      R_512_FourRounds %(4*_Rbase_+00)
+_Rbase_ = _Rbase_+1
+.endr #rept _UNROLL_CNT*2
+#
+# looping build: repeat until rdi has counted 2*(ROUNDS_512/8) key injections,
+# then reload rdi, which served as the counter, with the context pointer
+.if (SKEIN_ASM_UNROLL & 512) == 0
+    cmpq    $2*(ROUNDS_512/8),%rdi
+    jb      Skein_512_round_loop
+    movq    ctxPtr +F_O(%rbp),%rdi           #restore rdi --> context
+.endif
+    # end of rounds
+    #################
+    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..7}
+    # rbx is rebuilt inside the loop below as the stored second tweak word
+    # ANDed with FIRST_MASK64; it serves the next block and is written back
+    # after the last one
+    .irp _Rn_,8,9,10,11,12,13,14,15
+  .if (_Rn_ == 8)
+    movq    $FIRST_MASK64,%rbx
+  .endif
+      xorq  Wcopy+8*(_Rn_-8)+F_O(%rbp),%r\_Rn_  #feedforward XOR
+      movq  %r\_Rn_,X_VARS+8*(_Rn_-8)(%rdi)     #and store result
+  .if (_Rn_ == 14)
+    andq    TWEAK+ 8(%rdi),%rbx
+  .endif
+    .endr
+    Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
+
+    # go back for more blocks, if needed
+    decq    blkCnt+F_O(%rbp)
+    jnz     Skein_512_block_loop
+    movq    %rbx,TWEAK + 8(%rdi)
+
+    Reset_Stack
+    ret
+Skein_512_Process_Block_End:
+#
+# Skein_Debug_Round_512 (debug builds only): target of the call emitted by
+# Skein_Debug_Round for 512-bit blocks. Copies r8..r15 into X_stk, loads rsi
+# and rdi with the context pointer and the block size, and jumps to
+# Skein_Debug_Round_Common.
+  .if _SKEIN_DEBUG
+# call here with rdx  = "round number"
+Skein_Debug_Round_512:
+    pushq   %rsi                     #save two regs for BLK_BITS-specific parms
+    pushq   %rdi
+  .irp _Rn_,8,9,10,11,12,13,14,15    #save X[] state on stack so debug routines can access it
+    movq    %r\_Rn_,X_stk+8*(_Rn_-8)+F_O(%rbp)
+  .endr
+    movq    ctxPtr+F_O(%rbp),%rsi    #ctx_hdr_ptr
+    movq    $512,%rdi                #now <rdi,rsi,rdx> are set for the call
+    jmp     Skein_Debug_Round_Common
+  .endif
+#
+# Optional query helpers, assembled only when _SKEIN_CODE_SIZE is nonzero.
+.if _SKEIN_CODE_SIZE
+# returns in rax the distance in bytes from the entry label to the End label
+C_label Skein_512_Process_Block_CodeSize
+    movq    $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax
+    ret
+#
+# returns in rax the value of _UNROLL_CNT, or 0 when it equals ROUNDS_512/8
+C_label Skein_512_Unroll_Cnt
+  .if _UNROLL_CNT == (ROUNDS_512/8)
+    xorq    %rax,%rax
+  .else
+    movq    $_UNROLL_CNT,%rax
+  .endif
+    ret
+.endif
+#
+.endif # _USE_ASM_ & 512
+#
+#=================================== Skein1024 =============================================
+.if _USE_ASM_ & 1024
+#
+# void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
+#
+#################
+# use details of permutation to make register assignments
+# 
+# o1K_<reg> gives the X[] word index associated with each register in the
+# 1024-bit code. There is no entry with value 6: per the rcx note below, rcx
+# alternates between X4 and X6.
+o1K_rdi =  0        #offsets in X[] associated with each register
+o1K_rsi =  1 
+o1K_rbp =  2 
+o1K_rax =  3 
+o1K_rcx =  4        #rcx is "shared" with X6, since X4/X6 alternate
+o1K_rbx =  5 
+o1K_rdx =  7 
+o1K_r8  =  8  
+o1K_r9  =  9  
+o1K_r10 = 10
+o1K_r11 = 11
+o1K_r12 = 12
+o1K_r13 = 13
+o1K_r14 = 14
+o1K_r15 = 15
+#
+# stack slot holding the injection counter index; tmpStk_1024 is defined by
+# Setup_Stack when it is invoked for 1024 bits
+rIdx_offs = tmpStk_1024
+#
+# r1024_Mix: one add/rotate/xor step on the register pair (reg0,reg1), plus,
+# on the rounds selected by the first test below, the key injection for the
+# two state words involved.
+#   w0,w1     = X[] word indices held in reg0 and reg1
+#   reg0,reg1 = register names (no percent sign)
+#   _RN0_     = round number; its value mod 8 selects the rotation constants
+#   _Rn1_     = mix index within the round (last part of the constant name)
+#   op1       = optional instruction text emitted at the end
+# Injection details as coded: words 13 and 14 also receive a tweak word and
+# word 15 the injection counter. In the looping variant the w0 == 0 call
+# stores rdi into the X_stk slot for w0 and reloads rdi from rIdx_offs, so
+# rdi can serve as the index register for the remaining injections.
+# NOTE(review): the doubled ampersands (compared against 3, and against the
+# unroll mask) read like bit tests, and w0, w1, _RN0_, _Rn1_, op1 are used
+# without a backslash; both presumably depend on .altmacro -- confirm.
+.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1
+    addReg      \reg0 , \reg1                      #perform the MIX
+    RotL64      \reg1 , 1024,%((_RN0_) % 8),_Rn1_
+    xorReg      \reg1 , \reg0
+.if ((_RN0_) && 3) == 3         #time to do key injection?
+ .if _SKEIN_DEBUG
+    movq       %\reg0 , xDebug_1024+8*w0(%rsp)     #save intermediate values for Debug_Round
+    movq       %\reg1 , xDebug_1024+8*w1(%rsp)     # (before inline key injection)
+ .endif
+_II_ = ((_RN0_)/4)+1            #injection count
+ .if SKEIN_ASM_UNROLL && 1024   #here to do fully unrolled key injection
+    addq        ksKey+ 8*((_II_+w0) % 17)(%rsp),%\reg0
+    addq        ksKey+ 8*((_II_+w1) % 17)(%rsp),%\reg1
+  .if     w1 == 13                                 #tweak injection
+    addq        ksTwk+ 8*((_II_+ 0) %  3)(%rsp),%\reg1
+  .elseif w0 == 14
+    addq        ksTwk+ 8*((_II_+ 1) %  3)(%rsp),%\reg0
+  .elseif w1 == 15
+    addq        $_II_, %\reg1                      #(injection counter)
+  .endif
+ .else                          #here to do looping  key injection
+  .if  (w0 == 0)
+    movq        %rdi, X_stk+8*w0(%rsp)             #if so, store N0 so we can use reg as index
+    movq         rIdx_offs(%rsp),%rdi              #get the injection counter index into rdi
+  .else
+    addq         ksKey+8+8*w0(%rsp,%rdi,8),%\reg0  #even key injection
+  .endif
+  .if     w1 == 13                                 #tweak injection
+    addq         ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1  
+  .elseif w0 == 14
+    addq         ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0  
+  .elseif w1 == 15
+    addReg      \reg1,rdi,,,1                      #(injection counter)
+  .endif
+    addq         ksKey+8+8*w1(%rsp,%rdi,8),%\reg1  #odd key injection
+ .endif
+.endif
+    # insert the op provided, .if any
+    op1
+.endm
+#################
+# MACRO: four rounds for 1024-bit blocks
+#
+# Emits four consecutive rounds, each made of eight r1024_Mix steps.
+# rcx alternately carries X4 and X6; the op argument on the third and fourth
+# step of every round spills one of them to X_stk[] and reloads the other.
+# The key injection for the fourth round is emitted inside r1024_Mix.  In the
+# looping build the tail of this macro also rotates the key/tweak schedule
+# on the stack, bumps the index stored at rIdx_offs and finishes the X0 key
+# injection that r1024_Mix deferred.
+.macro r1024_FourRounds _RR_    #RR = base round number (0 mod 4)
+    # should be here with X4 set properly, X6 stored on stack
+_Rn_ = (_RR_) + 0
+        r1024_Mix  0, 1,rdi,rsi,_Rn_,0
+        r1024_Mix  2, 3,rbp,rax,_Rn_,1
+        r1024_Mix  4, 5,rcx,rbx,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)>       #save X4  on  stack (x4/x6 alternate)
+        r1024_Mix  8, 9,r8 ,r9 ,_Rn_,4,<movq      X_stk+8*6(%rsp),%rcx>  #load X6 from stack 
+        r1024_Mix 10,11,r10,r11,_Rn_,5
+        r1024_Mix 12,13,r12,r13,_Rn_,6
+        r1024_Mix  6, 7,rcx,rdx,_Rn_,3
+        r1024_Mix 14,15,r14,r15,_Rn_,7
+    .if _SKEIN_DEBUG
+      Skein_Debug_Round 1024,%(_Rn_+1)
+    .endif
+_Rn_ = (_RR_) + 1
+        r1024_Mix  0, 9,rdi,r9 ,_Rn_,0
+        r1024_Mix  2,13,rbp,r13,_Rn_,1
+        r1024_Mix  6,11,rcx,r11,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)>       #save X6  on  stack (x4/x6 alternate)
+        r1024_Mix 10, 7,r10,rdx,_Rn_,4,<movq      X_stk+8*4(%rsp),%rcx>  #load X4 from stack 
+        r1024_Mix 12, 3,r12,rax,_Rn_,5
+        r1024_Mix 14, 5,r14,rbx,_Rn_,6
+        r1024_Mix  4,15,rcx,r15,_Rn_,3
+        r1024_Mix  8, 1,r8 ,rsi,_Rn_,7
+    .if _SKEIN_DEBUG
+      Skein_Debug_Round 1024,%(_Rn_+1)
+    .endif
+_Rn_ = (_RR_) + 2
+        r1024_Mix  0, 7,rdi,rdx,_Rn_,0
+        r1024_Mix  2, 5,rbp,rbx,_Rn_,1
+        r1024_Mix  4, 3,rcx,rax,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)>       #save X4  on  stack (x4/x6 alternate)
+        r1024_Mix 12,15,r12,r15,_Rn_,4,<movq      X_stk+8*6(%rsp),%rcx>  #load X6 from stack 
+        r1024_Mix 14,13,r14,r13,_Rn_,5
+        r1024_Mix  8,11,r8 ,r11,_Rn_,6
+        r1024_Mix  6, 1,rcx,rsi,_Rn_,3
+        r1024_Mix 10, 9,r10,r9 ,_Rn_,7
+    .if _SKEIN_DEBUG
+      Skein_Debug_Round 1024,%(_Rn_+1)
+    .endif
+_Rn_ = (_RR_) + 3
+        r1024_Mix  0,15,rdi,r15,_Rn_,0
+        r1024_Mix  2,11,rbp,r11,_Rn_,1
+        r1024_Mix  6,13,rcx,r13,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)>       #save X6  on  stack (x4/x6 alternate)
+        r1024_Mix 14, 1,r14,rsi,_Rn_,4,<movq      X_stk+8*4(%rsp),%rcx>  #load X4 from stack 
+        r1024_Mix  8, 5,r8 ,rbx,_Rn_,5
+        r1024_Mix 10, 3,r10,rax,_Rn_,6
+        r1024_Mix  4, 9,rcx,r9 ,_Rn_,3
+        r1024_Mix 12, 7,r12,rdx,_Rn_,7
+    .if _SKEIN_DEBUG
+      Skein_Debug_Round 1024,%(_Rn_+1)
+    .endif
+
+  .if (SKEIN_ASM_UNROLL && 1024) == 0           #here with rdi == rIdx, X0 on stack
+    #"rotate" the key schedule on the stack
+i8 = o1K_r8
+i0 = o1K_rdi
+    movq    %r8 , X_stk+8*i8(%rsp)              #free up a register (save it on the stack)
+    movq          ksKey+8* 0(%rsp,%rdi,8),%r8   #get  key  word
+    movq    %r8 , ksKey+8*17(%rsp,%rdi,8)       #rotate key (must do key first or tweak clobbers it!)
+    movq          ksTwk+8* 0(%rsp,%rdi,8),%r8   #get tweak word
+    movq    %r8 , ksTwk+8* 3(%rsp,%rdi,8)       #rotate tweak (onto the stack)
+    movq          X_stk+8*i8(%rsp)       ,%r8   #get the reg back
+    incq    %rdi                                #bump the index
+    movq    %rdi, rIdx_offs (%rsp)              #save rdi again
+    movq          ksKey+8*i0(%rsp,%rdi,8),%rdi  #get the key schedule word for X0 back
+    addq          X_stk+8*i0(%rsp)       ,%rdi  #perform the X0 key injection
+  .endif
+    #show the result of the key injection
+    Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT
+.endm #r1024_FourRounds
+#
+################
+# code
+#
+# Skein1024_Process_Block: hash blkCnt consecutive 1024-bit message blocks
+# into the context.  For every block it adds bitAdd to tweak word T0, builds
+# the key and tweak schedules on the stack, performs the initial key
+# injection, runs the rounds, and XORs the message words back into the
+# result (feedforward) before storing it in the context.  T1 is carried in
+# r9 between blocks and written back to the context after the last block.
+# NOTE(review): Setup_Stack / Reset_Stack are defined earlier in this file;
+#   this code relies on Setup_Stack leaving rdi = ctx and rbp = frame pointer.
+C_label Skein1024_Process_Block
+#
+    Setup_Stack 1024,ROUNDS_1024/8,WCNT
+    movq    TWEAK+ 8(%rdi),%r9             #r9 = tweak word T1 (kept in r9 from block to block)
+    jmp     Skein1024_block_loop
+    # main hash loop for Skein1024
+    .p2align 4
+Skein1024_block_loop:
+    # general register usage during the rounds (see the o1K_* table):
+    #   RSP              = stack pointer
+    #   RDI,RSI,RBP,RAX  = X0..X3     (state words)
+    #   RCX              = X4 or X6   (they alternate; the other one is in X_stk[])
+    #   RBX,RDX          = X5,X7      (state words)
+    #   R8 ..R15         = X8..X15    (state words)
+    # during the setup below: RDI = ctx, RSI = msg block, RBP = frame pointer
+    #
+  .if (SKEIN_ASM_UNROLL & 1024) == 0
+    xorq    %rax,%rax                      #init loop index on the stack
+    movq    %rax,rIdx_offs(%rsp)
+  .endif
+    movq         TWEAK+     0(%rdi),%r8
+    addq         bitAdd+  F_O(%rbp),%r8    #computed updated tweak value T0
+    movq    %r9 ,%r10 
+    xorq    %r8 ,%r10                      #%r8/%r9/%r10 = tweak schedule (T0, T1, T0^T1)
+    movq    %r8 ,TWEAK+     0(%rdi)        #save updated tweak value ctx->h.T[0]
+    movq    %r8 ,ksTwk+ 0+F_O(%rbp)
+    movq    %r9 ,ksTwk+ 8+F_O(%rbp)        #keep values in %r8 ,%r9  for initial tweak injection below
+    movq    %r10,ksTwk+16+F_O(%rbp)
+  .if _SKEIN_DEBUG
+    movq    %r9 ,TWEAK+     8(%rdi)        #save updated tweak value ctx->h.T[1] for Skein_Debug_Block
+  .endif
+    movq         blkPtr +F_O(%rbp),%rsi    # rsi --> input block
+    movq        $KW_PARITY        ,%rax    #overall key schedule parity
+
+    # the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3]
+    .irp _rN_,0,1,2,3,4,6                  #process the "initial" words, using r14/r15 as temps
+      movq       X_VARS+8*_rN_(%rdi),%r14  #get state word
+      movq              8*_rN_(%rsi),%r15  #get msg   word
+      xorq  %r14,%rax                      #update key schedule overall parity
+      movq  %r14,ksKey +8*_rN_+F_O(%rbp)   #save key schedule word on stack
+      movq  %r15,Wcopy +8*_rN_+F_O(%rbp)   #save local msg Wcopy 
+      addq  %r15,%r14                      #do the initial key injection
+      movq  %r14,X_stk +8*_rN_    (%rsp)   #save initial state var on stack
+    .endr
+    # now process the rest, using the "real" registers 
+    #     (MUST do it in reverse order to inject tweaks r8/r9 first)
+    .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rdx,rbx
+_oo_ = o1K_\_rr_                           #offset associated with the register
+      movq  X_VARS+8*_oo_(%rdi),%\_rr_     #get key schedule word from context
+      movq         8*_oo_(%rsi),%rcx       #get next input msg word
+      movq  %\_rr_, ksKey +8*_oo_(%rsp)    #save key schedule on stack
+      xorq  %\_rr_, %rax                   #accumulate key schedule parity
+      movq  %rcx,Wcopy+8*_oo_+F_O(%rbp)    #save copy of msg word for feedforward
+      addq  %rcx,%\_rr_                    #do the initial  key  injection
+      .if    _oo_ == 13                    #do the initial tweak injection
+        addReg _rr_,r8                     #          (only in words 13/14)
+      .elseif _oo_ == 14
+        addReg _rr_,r9 
+      .endif
+    .endr
+    movq    %rax,ksKey+8*WCNT+F_O(%rbp)    #save key schedule parity
+.if _SKEIN_DEBUG
+    Skein_Debug_Block 1024                 #initial debug dump
+.endif
+    addq     $8*WCNT,%rsi                  #bump the msg ptr
+    movq     %rsi,blkPtr+F_O(%rbp)         #save bumped msg ptr
+    # re-load words 0..4 from stack, enter the main loop
+    .irp _rr_,rdi,rsi,rbp,rax,rcx          #(no need to re-load x6, already on stack)
+      movq  X_stk+8*o1K_\_rr_(%rsp),%\_rr_ #re-load state and get ready to go!
+    .endr
+.if _SKEIN_DEBUG
+    Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL        #show state after initial key injection
+.endif
+    #
+    #################
+    # now the key schedule is computed. Start the rounds
+    #
+.if SKEIN_ASM_UNROLL & 1024
+_UNROLL_CNT =   ROUNDS_1024/8
+.else
+_UNROLL_CNT =   SKEIN_UNROLL_1024
+  .if ((ROUNDS_1024/8) % _UNROLL_CNT)
+    .error "Invalid SKEIN_UNROLL_1024"
+  .endif
+Skein1024_round_loop:
+.endif
+#
+_Rbase_ = 0
+.rept _UNROLL_CNT*2                        #implement the rounds, 4 at a time
+      r1024_FourRounds %(4*_Rbase_+00)
+_Rbase_ = _Rbase_+1
+.endr #rept _UNROLL_CNT
+#
+.if (SKEIN_ASM_UNROLL & 1024) == 0
+    cmpq    $2*(ROUNDS_1024/8),tmpStk_1024(%rsp) #see if we are done
+    jb      Skein1024_round_loop    
+.endif
+    # end of rounds
+    #################
+    #
+    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..15}
+    movq    %rdx,X_stk+8*o1K_rdx(%rsp) #we need a register. x6 already on stack
+    movq       ctxPtr(%rsp),%rdx
+    
+    # once r9 (X9) has been stored it is reused to form T1 with the FIRST
+    # flag cleared, which is the T1 value used for any following block
+    .irp _rr_,rdi,rsi,rbp,rax,rcx,rbx,r8,r9,r10,r11,r12,r13,r14,r15   #do all but x6,x7
+_oo_ = o1K_\_rr_
+      xorq  Wcopy +8*_oo_(%rsp),%\_rr_ #feedforward XOR
+      movq  %\_rr_,X_VARS+8*_oo_(%rdx) #save result into context
+      .if (_oo_ ==  9)
+        movq   $FIRST_MASK64 ,%r9
+      .endif
+      .if (_oo_ == 14)
+        andq   TWEAK+ 8(%rdx),%r9
+      .endif
+    .endr
+    # 
+    movq         X_stk +8*6(%rsp),%rax #now process x6,x7 (skipped in .irp above)
+    movq         X_stk +8*7(%rsp),%rbx
+    xorq         Wcopy +8*6(%rsp),%rax
+    xorq         Wcopy +8*7(%rsp),%rbx
+    movq    %rax,X_VARS+8*6(%rdx)
+    decq             blkCnt(%rsp)      #set zero flag iff done
+    movq    %rbx,X_VARS+8*7(%rdx)
+
+    # NOTE(review): the cmpq argument presumably re-creates the zero flag
+    #   after the debug call -- confirm in the Skein_Debug_Round macro
+    Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,,<cmpq $0,blkCnt(%rsp)>
+    # go back for more blocks, if needed
+    movq             ctxPtr(%rsp),%rdi #don't muck with the flags here!
+    lea          FRAME_OFFS(%rsp),%rbp
+    jnz     Skein1024_block_loop
+    movq    %r9 ,TWEAK+   8(%rdx)      #last block done: store T1 (FIRST flag cleared)
+    Reset_Stack
+    ret
+#
+Skein1024_Process_Block_End:
+#
+.if _SKEIN_DEBUG
+# Debug helper for the 1024-bit code: copies the register-resident state
+# words into X_stk[] so the C debug routine can see all sixteen of them,
+# then jumps to Skein_Debug_Round_Common with rsi = ctx pointer and
+# rdi = 1024.  The caller has already pushed its rdx, which now holds the
+# round number.
+Skein_Debug_Round_1024:
+    # call here with rdx  = "round number",
+_SP_OFFS_ = 8*2                     #stack "offset" here: rdx, return addr
+    #
+  #save rest of X[] state on stack so debug routines can access it
+  .irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r15
+    movq    %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp)
+  .endr
+    # Figure out what to do with x0 (rdi).  When rdx == 0 mod 4, it's already on stack
+    cmpq    $SKEIN_RND_SPECIAL,%rdx #special rounds always save
+    jae     save_x0
+    testq   $3,%rdx                 #otherwise only if rdx != 0 mod 4
+    jz      save_x0_not
+save_x0:
+    movq    %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp)
+save_x0_not:
+    #figure out the x4/x6 swapping state and save the correct one!
+    cmpq    $SKEIN_RND_SPECIAL,%rdx #special rounds always do x4
+    jae     save_x4
+    testq   $1,%rdx                  #and even ones have r4 as well
+    jz      save_x4
+    movq    %rcx,X_stk+8*6+_SP_OFFS_(%rsp) #odd round number: rcx holds x6
+    jmp     debug_1024_go
+save_x4:
+    movq    %rcx,X_stk+8*4+_SP_OFFS_(%rsp)
+debug_1024_go:
+    #now all is saved in Xstk[] except for rdx
+    push    %rsi                    #save two regs for BLK_BITS-specific parms
+    push    %rdi
+_SP_OFFS_ = _SP_OFFS_ + 16          #adjust stack offset accordingly (now 32)
+
+    movq    _SP_OFFS_-8(%rsp),%rsi  #get back original %rdx (pushed on stack in macro call)
+    movq    %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[]
+
+    movq    ctxPtr+_SP_OFFS_(%rsp),%rsi  #rsi = ctx_hdr_ptr
+    movq    $1024,%rdi                   #rdi = block size
+    jmp     Skein_Debug_Round_Common
+.endif
+#
+.if _SKEIN_CODE_SIZE
+# Skein1024_Process_Block_CodeSize: return in %rax the distance in bytes
+# from the Skein1024_Process_Block label to Skein1024_Process_Block_End.
+C_label Skein1024_Process_Block_CodeSize
+    movq    $(Skein1024_Process_Block_End-Skein1024_Process_Block),%rax
+    ret
+#
+# Skein1024_Unroll_Cnt: report the assembly-time unroll count in %rax.
+#   Yields 0 when _UNROLL_CNT equals ROUNDS_1024/8, otherwise _UNROLL_CNT.
+C_label Skein1024_Unroll_Cnt
+  .if _UNROLL_CNT == (ROUNDS_1024/8)
+    xorq    %rax,%rax
+  .else
+    movq    $_UNROLL_CNT,%rax
+  .endif
+    ret
+.endif
+#
+.endif # _USE_ASM_ and 1024
+#
+.if _SKEIN_DEBUG
+#----------------------------------------------------------------
+#local debug routine to set up for calls to:
+#  void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)
+#                       [       rdi                        rsi   rdx              rcx]
+#
+# here with %rdx = round number
+#           %rsi = ctx_hdr_ptr
+#           %rdi = block size (256/512/1024)
+# on stack: saved rdi, saved rsi, retAddr, saved rdx  
+#
+# The X pointer passed in rcx is X_stk[] by default, the X[] array inside
+# the context for the feedforward pseudo-round, and xDebug_1024[] for
+# 1024-bit rounds that are a nonzero multiple of four.  On exit the saved
+# rdi/rsi are popped; the saved rdx is left on the stack for the caller.
+#
+Skein_Debug_Round_Common:
+_SP_OFFS_ = 32                        #account for four words on stack already
+  .irp _rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15  #save the rest of the regs
+    pushq %\_rr_
+_SP_OFFS_ = _SP_OFFS_+8
+  .endr
+  .if (_SP_OFFS_ % 16)                # make sure stack is still 16-byte aligned here
+    .error  "Debug_Round_Common: stack alignment"
+  .endif
+    # compute %rcx  = ptr to the X[] array on the stack (final parameter to call)
+    leaq    X_stk+_SP_OFFS_(%rsp),%rcx #adjust for reg pushes, return address
+    cmpq    $SKEIN_RND_FEED_FWD,%rdx   #special handling for feedforward "round"?
+    jnz     _got_rcxA
+    leaq    X_VARS(%rsi),%rcx
+_got_rcxA:
+  .if _USE_ASM_ & 1024
+    # special handling for 1024-bit case
+    #    (for rounds right before with key injection: 
+    #        use xDebug_1024[] instead of X_stk[])
+    cmpq    $SKEIN_RND_SPECIAL,%rdx
+    jae     _got_rcxB               #special pseudo-round: keep rcx as is
+    orq     %rdx,%rdx
+    jz      _got_rcxB               #round number 0: keep rcx as is
+    test    $3,%rdx
+    jne     _got_rcxB               #not a multiple of four: keep rcx as is
+    cmp     $1024,%rdi              #only 1024-bit(s) for now
+    jne     _got_rcxB
+    leaq    xDebug_1024+_SP_OFFS_(%rsp),%rcx
+_got_rcxB:
+  .endif
+    call    Skein_Show_Round        #call external debug handler
+
+  .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax  #restore regs
+    popq  %\_rr_
+_SP_OFFS_ = _SP_OFFS_-8
+  .endr
+  .if _SP_OFFS_ - 32
+    .error   "Debug_Round_Common: push/pop misalignment!"
+  .endif    
+    popq    %rdi
+    popq    %rsi
+    ret
+.endif
+#----------------------------------------------------------------
+    .end
diff --git a/Additional_Implementations/skein_block_x86.asm b/Additional_Implementations/skein_block_x86.asm
new file mode 100644
index 0000000000000..4679e991fe048
--- /dev/null
+++ b/Additional_Implementations/skein_block_x86.asm
@@ -0,0 +1,1180 @@
+;
+;----------------------------------------------------------------
+; 32-bit x86 assembler code for Skein block functions
+;
+; Author: Doug Whiting, Hifn
+;
+; This code is released to the public domain.
+;----------------------------------------------------------------
+;
+    .386p
+    .model flat
+    .code
+;
+; _USE_ASM_ is the bit mask of block sizes (256/512/1024) to assemble here.
+; It is taken from SKEIN_USE_ASM when that symbol is defined and has at
+; least one of the three known bits set; otherwise all three are built.
+_MASK_ALL_  equ (256+512+1024)              ;all three algorithm bits
+;
+;;;;;;;;;;;;;;;;;
+ifndef SKEIN_USE_ASM
+_USE_ASM_        = _MASK_ALL_
+elseif SKEIN_USE_ASM and _MASK_ALL_
+_USE_ASM_        = SKEIN_USE_ASM
+else
+_USE_ASM_        = _MASK_ALL_
+endif
+;;;;;;;;;;;;;;;;;
+; SKEIN_LOOP is read as three decimal digits: hundreds = unroll count for
+; Skein-256, tens = Skein-512, ones = Skein-1024.  SKEIN_ASM_UNROLL collects
+; the block-size bits of the variants whose digit is 0 (fully unrolled).
+ifndef SKEIN_LOOP  
+_SKEIN_LOOP       = 0                       ;default is all fully unrolled
+else
+_SKEIN_LOOP       = SKEIN_LOOP
+endif
+; the unroll counts (0 --> fully unrolled)
+SKEIN_UNROLL_256  = (_SKEIN_LOOP / 100) mod 10
+SKEIN_UNROLL_512  = (_SKEIN_LOOP /  10) mod 10
+SKEIN_UNROLL_1024 = (_SKEIN_LOOP      ) mod 10
+;
+SKEIN_ASM_UNROLL  = 0
+  irp _NN_,<256,512,1024>
+    if (SKEIN_UNROLL_&_NN_) eq 0
+SKEIN_ASM_UNROLL  = SKEIN_ASM_UNROLL + _NN_
+    endif
+  endm
+;;;;;;;;;;;;;;;;;
+;
+; Round counts per block size.  Defaults are 72/72/80.  When SKEIN_ROUNDS is
+; defined, each of its decimal digits d (hundreds = 256, tens = 512,
+; ones = 1024) selects 8*(((d+5) mod 10)+5) rounds, so digit 5 gives 40,
+; digit 9 gives 72, digit 0 gives 80 and digit 4 gives 112.
+; The irp loop afterwards prints the round count of every enabled variant
+; at assembly time.
+ifndef SKEIN_ROUNDS
+ROUNDS_256  =   72
+ROUNDS_512  =   72
+ROUNDS_1024 =   80
+else
+ROUNDS_256  = 8*((((SKEIN_ROUNDS / 100) + 5) mod 10) + 5)
+ROUNDS_512  = 8*((((SKEIN_ROUNDS /  10) + 5) mod 10) + 5)
+ROUNDS_1024 = 8*((((SKEIN_ROUNDS      ) + 5) mod 10) + 5)
+endif
+irp _NN_,<256,512,1024>
+  if _USE_ASM_ and _NN_
+    irp _RR_,<%(ROUNDS_&_NN_)>
+      if _NN_ eq 1024
+%out  +++ SKEIN_ROUNDS_&_NN_ = _RR_
+      else
+%out  +++ SKEIN_ROUNDS_&_NN_  = _RR_
+      endif
+    endm
+  endif
+endm
+;;;;;;;;;;;;;;;;;
+;
+; _SKEIN_CODE_SIZE is defined (as 1) when either SKEIN_CODE_SIZE or
+; SKEIN_PERF is defined, and is left undefined otherwise.
+ifdef SKEIN_CODE_SIZE
+_SKEIN_CODE_SIZE equ (1)
+else
+ifdef  SKEIN_PERF                           ;use code size if SKEIN_PERF is defined
+_SKEIN_CODE_SIZE equ (1)
+endif
+endif
+;
+;;;;;;;;;;;;;;;;;
+;
+; _SKEIN_DEBUG is always defined: 1 when SKEIN_DEBUG is defined, else 0
+ifndef SKEIN_DEBUG
+_SKEIN_DEBUG      = 0
+else
+_SKEIN_DEBUG      = 1
+endif
+;;;;;;;;;;;;;;;;;
+;
+; define offsets of fields in hash context structure
+;
+; (the layout below allots 4 bytes each to the hash-bits and byte-count
+;  fields and 16 bytes to the two tweak words)
+HASH_BITS   =   0                           ;# bits of hash output
+BCNT        =   4 + HASH_BITS               ;number of bytes in BUFFER[]
+TWEAK       =   4 + BCNT                    ;tweak values[0..1]
+X_VARS      =  16 + TWEAK                   ;chaining vars
+;
+;(Note: buffer[] in context structure is NOT needed here :-)
+;
+; the 64-bit key schedule parity constant, split into 32-bit halves
+KW_PARITY_LO=   0A9FC1A22h                  ;overall parity of key schedule words (hi32/lo32)
+KW_PARITY_HI=   01BD11BDAh                  ;overall parity of key schedule words (hi32/lo32)
+; 32-bit mask, ANDed into the dword at TWEAK+12 (the high half of T1)
+FIRST_MASK  =   NOT (1 SHL 30)              ;FIRST block flag bit
+;
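+; rotation constants for Skein, named RC_(block bits)_(round mod 8)_(mix index);
+; the RotL64 macro builds these names to look up the rotation count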
+;
+RC_256_0_0  = 14
+RC_256_0_1  = 16
+
+RC_256_1_0  = 52
+RC_256_1_1  = 57
+
+RC_256_2_0  = 23
+RC_256_2_1  = 40
+
+RC_256_3_0  =  5
+RC_256_3_1  = 37
+
+RC_256_4_0  = 25
+RC_256_4_1  = 33
+
+RC_256_5_0  = 46
+RC_256_5_1  = 12
+
+RC_256_6_0  = 58
+RC_256_6_1  = 22
+
+RC_256_7_0  = 32
+RC_256_7_1  = 32
+
+RC_512_0_0  = 46
+RC_512_0_1  = 36
+RC_512_0_2  = 19
+RC_512_0_3  = 37
+
+RC_512_1_0  = 33
+RC_512_1_1  = 27
+RC_512_1_2  = 14
+RC_512_1_3  = 42
+
+RC_512_2_0  = 17
+RC_512_2_1  = 49
+RC_512_2_2  = 36
+RC_512_2_3  = 39
+
+RC_512_3_0  = 44
+RC_512_3_1  =  9
+RC_512_3_2  = 54
+RC_512_3_3  = 56
+
+RC_512_4_0  = 39
+RC_512_4_1  = 30
+RC_512_4_2  = 34
+RC_512_4_3  = 24
+
+RC_512_5_0  = 13
+RC_512_5_1  = 50
+RC_512_5_2  = 10
+RC_512_5_3  = 17
+
+RC_512_6_0  = 25
+RC_512_6_1  = 29
+RC_512_6_2  = 39
+RC_512_6_3  = 43
+
+RC_512_7_0  =  8
+RC_512_7_1  = 35
+RC_512_7_2  = 56
+RC_512_7_3  = 22
+
+RC_1024_0_0 = 24
+RC_1024_0_1 = 13
+RC_1024_0_2 =  8
+RC_1024_0_3 = 47
+RC_1024_0_4 =  8
+RC_1024_0_5 = 17
+RC_1024_0_6 = 22
+RC_1024_0_7 = 37
+
+RC_1024_1_0 = 38
+RC_1024_1_1 = 19
+RC_1024_1_2 = 10
+RC_1024_1_3 = 55
+RC_1024_1_4 = 49
+RC_1024_1_5 = 18
+RC_1024_1_6 = 23
+RC_1024_1_7 = 52
+
+RC_1024_2_0 = 33
+RC_1024_2_1 =  4
+RC_1024_2_2 = 51
+RC_1024_2_3 = 13
+RC_1024_2_4 = 34
+RC_1024_2_5 = 41
+RC_1024_2_6 = 59
+RC_1024_2_7 = 17
+
+RC_1024_3_0 =  5
+RC_1024_3_1 = 20
+RC_1024_3_2 = 48
+RC_1024_3_3 = 41
+RC_1024_3_4 = 47
+RC_1024_3_5 = 28
+RC_1024_3_6 = 16
+RC_1024_3_7 = 25
+
+RC_1024_4_0 = 41
+RC_1024_4_1 =  9
+RC_1024_4_2 = 37
+RC_1024_4_3 = 31
+RC_1024_4_4 = 12
+RC_1024_4_5 = 47
+RC_1024_4_6 = 44
+RC_1024_4_7 = 30
+
+RC_1024_5_0 = 16
+RC_1024_5_1 = 34
+RC_1024_5_2 = 56
+RC_1024_5_3 = 51
+RC_1024_5_4 =  4
+RC_1024_5_5 = 53
+RC_1024_5_6 = 42
+RC_1024_5_7 = 41
+
+RC_1024_6_0 = 31
+RC_1024_6_1 = 44
+RC_1024_6_2 = 47
+RC_1024_6_3 = 46
+RC_1024_6_4 = 19
+RC_1024_6_5 = 42
+RC_1024_6_6 = 44
+RC_1024_6_7 = 25
+
+RC_1024_7_0 =  9
+RC_1024_7_1 = 48
+RC_1024_7_2 = 35
+RC_1024_7_3 = 52
+RC_1024_7_4 = 23
+RC_1024_7_5 = 31
+RC_1024_7_6 = 37
+RC_1024_7_7 = 20
+;
+; Rol64: rotate the 64-bit value held in a 32-bit register pair left by
+;        _RCNT_ bits, in place.
+;   rHi,rLo = the two halves of the value
+;   tmp     = scratch register (written unless _RCNT_ is 0 or 32)
+;   _RCNT_  = assembly-time rotation count; a count of 0 emits no code
+Rol64 macro rHi,rLo,tmp,_RCNT_
+  if _RCNT_ ne 0
+    if _RCNT_ eq 32
+      xchg  rHi,rLo
+    elseif _RCNT_ lt 32
+      mov   tmp,rLo
+      shld  rLo,rHi,_RCNT_
+      shld  rHi,tmp,_RCNT_
+    else
+      mov   tmp,rLo
+      shrd  rLo,rHi,((64-_RCNT_) AND 63)
+      shrd  rHi,tmp,((64-_RCNT_) AND 63)
+    endif
+  endif
+endm
+;
+;  Input:  rHi,rLo = register pair to rotate, tmp = scratch register
+;          BLK_SIZE,ROUND_NUM,MIX_NUM = select the constant
+;              RC_(BLK_SIZE)_(ROUND_NUM)_(MIX_NUM)
+; Output: the pair rotated left by that constant (taken mod 64) via Rol64;
+;         tmp may be trashed
+RotL64 macro rHi,rLo,tmp,BLK_SIZE,ROUND_NUM,MIX_NUM
+_RCNT_ = ( RC_&BLK_SIZE&_&ROUND_NUM&_&MIX_NUM AND 63 )
+    Rol64 rHi,rLo,tmp,_RCNT_
+endm
+;
+;----------------------------------------------------------------
+; declare allocated space on the stack
+;   localName = symbol to define as the current running offset _STK_OFFS_
+;   localSize = number of bytes by which _STK_OFFS_ is then advanced
+StackVar    macro localName,localSize
+localName   =   _STK_OFFS_
+_STK_OFFS_  =   _STK_OFFS_+(localSize)
+endm ;StackVar
+;
+;----------------------------------------------------------------
+;
+; MACRO: Configure stack frame, allocate local vars
+;
+;   WCNT   = number of 64-bit words per block
+;   KS_CNT = key schedule count, used to size the ksRot area, which is
+;            reserved only when this block size is not fully unrolled
+; Defines the stack offsets of all locals and caller parameters, then emits
+; the prologue: pushad, reserve LOCAL_SIZE bytes, point ebp into the frame
+; and load edi with the context pointer.
+Setup_Stack macro WCNT,KS_CNT
+_STK_OFFS_  =   0                   ;starting offset from esp 
+    ;----- local  variables         ;<-- esp
+    StackVar    X_stk  ,8*(WCNT)    ;local context vars
+    StackVar    Wcopy  ,8*(WCNT)    ;copy of input block    
+    StackVar    ksTwk  ,8*3         ;key schedule: tweak words
+    StackVar    ksKey  ,8*(WCNT)+8  ;key schedule: key   words
+  if WCNT le 8
+FRAME_OFFS  =   _STK_OFFS_          ;<-- ebp
+  else
+FRAME_OFFS  =   _STK_OFFS_-8*4      ;<-- ebp
+  endif
+  if (SKEIN_ASM_UNROLL and (WCNT*64)) eq 0
+    StackVar    ksRot ,16*(KS_CNT+0);leave space for "rotation" to happen
+  endif
+LOCAL_SIZE  =   _STK_OFFS_          ;size of local vars
+    ;----- 
+    StackVar    savRegs,8*4         ;pushad data
+    StackVar    retAddr,4           ;return address
+    ;----- caller parameters
+    StackVar    ctxPtr ,4           ;context ptr
+    StackVar    blkPtr ,4           ;pointer to block data
+    StackVar    blkCnt ,4           ;number of full blocks to process
+    StackVar    bitAdd ,4           ;bit count to add to tweak
+    ;----- caller's stack frame
+;
+; Notes on stack frame setup:
+;   * the most frequently used variable is X_stk[], based at [esp+0]
+;   * the next most used is the key schedule words
+;       so ebp is "centered" there, allowing short offsets to the key/tweak
+;       schedule even in 1024-bit Skein case
+;   * the Wcopy variables are infrequently accessed, but they have long 
+;       offsets from both esp and ebp only in the 1024-bit case.
+;   * all other local vars and calling parameters can be accessed 
+;       with short offsets, except in the 1024-bit case
+;
+    pushad                          ;save all regs
+    sub     esp,LOCAL_SIZE          ;make room for the locals
+    lea     ebp,[esp+FRAME_OFFS]    ;maximize use of short offsets
+    mov     edi,[FP_+ctxPtr ]       ;edi --> context
+;
+endm ;Setup_Stack
+;
+; FP_ + offset addresses the same location as esp + offset had right after
+; the prologue, but through ebp
+FP_         equ <ebp-FRAME_OFFS>    ;keep as many short offsets as possible
+;
+;----------------------------------------------------------------
+;
+; Epilogue matching Setup_Stack: releases the locals and restores the
+; registers saved by pushad.  It does not emit the return instruction.
+;   procStart = label at the start of the procedure; used only to print
+;               the procedure's code size in bytes at assembly time
+Reset_Stack macro   procStart
+    add     esp,LOCAL_SIZE          ;get rid of locals (wipe??)
+    popad                           ;restore all regs
+
+    ;display code size in bytes to stdout
+  irp  _BCNT_,<%($+1-procStart)>    ;account for return opcode
+if     _BCNT_ ge 10000              ;(align it all pretty)
+%out procStart code size = _BCNT_ bytes  
+elseif _BCNT_ ge  1000
+%out procStart code size =  _BCNT_ bytes  
+else
+%out procStart code size =   _BCNT_ bytes  
+endif
+  endm ;irp _BCNT_
+
+endm ; Reset_Stack
+;
+;----------------------------------------------------------------
+; macros to help debug internals
+;
+if _SKEIN_DEBUG
+    extrn   _Skein_Show_Block:near   ;calls to C routines
+    extrn   _Skein_Show_Round:near
+;
+; pseudo round numbers passed to Skein_Show_Round for states that are not
+; an ordinary round; every value at or above SKEIN_RND_SPECIAL is "special"
+SKEIN_RND_SPECIAL       =   1000
+SKEIN_RND_KEY_INITIAL   =   SKEIN_RND_SPECIAL+0
+SKEIN_RND_KEY_INJECT    =   SKEIN_RND_SPECIAL+1
+SKEIN_RND_FEED_FWD      =   SKEIN_RND_SPECIAL+2
+;
+; Calls the C routine Skein_Show_Block with all registers preserved.
+;   BLK_BITS = block size in bits, passed as the first argument
+; The seven arguments are pushed last-to-first and removed after the call.
+Skein_Debug_Block macro BLK_BITS
+;
+;void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
+;                     const u08b_t *blkPtr, const u64b_t *wPtr, 
+;                     const u64b_t *ksPtr,const u64b_t *tsPtr);
+;
+    pushad                          ;save all regs
+    lea     eax,[FP_+ksTwk]
+    lea     ebx,[FP_+ksKey]
+    lea     ecx,[esp+32+Wcopy]      ;(+32 skips the pushad data)
+    mov     edx,[FP_+ctxPtr]        ;ctx_hdr_ptr
+    lea     edx,[edx+X_VARS]        ;edx ==> cxt->X[]
+    push    eax                     ;tsPtr
+    push    ebx                     ;ksPtr
+    push    ecx                     ;wPtr
+    push    dword ptr [FP_+blkPtr]  ;blkPtr
+    push    edx                     ;ctx->Xptr
+    push    dword ptr [FP_+ctxPtr]  ;ctx_hdr_ptr
+    mov     eax,BLK_BITS
+    push    eax                     ;bits
+  ifdef _MINGW_
+    call    _Skein_Show_Block-4     ;strange linkage??
+  else
+    call    _Skein_Show_Block
+  endif
+    add     esp,7*4                 ;discard parameter space on stack
+    popad                           ;restore regs
+endm ;Skein_Debug_Block
+
+;
+; Calls the C routine Skein_Show_Round with all registers preserved.
+;   BLK_SIZE = block size in bits
+;   R        = round number, or one of the SKEIN_RND_* pseudo rounds
+;   saveRegs = when non-blank, eax..edx are first stored to X_stk[0..1]
+; The X pointer passed is X_stk[], except for SKEIN_RND_FEED_FWD where it is
+; the X[] array inside the context.  In the looping build the round number
+; of an ordinary round is derived from the loop counter in edi.
+Skein_Debug_Round macro BLK_SIZE,R,saveRegs
+;
+;void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X);
+;
+  ifnb <saveRegs>
+    mov         [esp+X_stk+ 0],eax  ;save internal vars for debug dump
+    mov         [esp+X_stk+ 4],ebx
+    mov         [esp+X_stk+ 8],ecx
+    mov         [esp+X_stk+12],edx
+  endif
+    pushad                          ;save all regs
+  if R ne SKEIN_RND_FEED_FWD
+    lea     eax,[esp+32+X_stk]
+  else
+    mov     eax,[FP_+ctxPtr]
+    add     eax,X_VARS
+  endif
+    push    eax                     ;Xptr
+  if (SKEIN_ASM_UNROLL and BLK_SIZE) or (R ge SKEIN_RND_SPECIAL)
+    mov     eax,R
+  else
+    lea     eax,[4*edi+1+(((R)-1) and 3)] ;compute round number using edi
+  endif
+    push    eax                     ;round number
+    push    dword ptr [FP_+ctxPtr]  ;ctx_hdr_ptr
+    mov     eax,BLK_SIZE
+    push    eax                     ;bits
+  ifdef _MINGW_
+    call    _Skein_Show_Round-4     ;strange linkage??
+  else
+    call    _Skein_Show_Round
+  endif
+    add     esp,4*4                 ;discard parameter space on stack
+    popad                           ;restore regs
+endm  ;Skein_Debug_Round
+endif ;ifdef SKEIN_DEBUG
+;
+;----------------------------------------------------------------
+;
+; MACRO: a mix step
+;
+;   BLK_SIZE  = block size in bits (selects the rotation constant set)
+;   ld_A,ld_C = X_stk[] indices to load into ebx:eax and edx:ecx first;
+;               leave blank when the value is already in those registers
+;   st_A      = X_stk[] index to store the sum to (blank = do not store)
+;   st_C      = X_stk[] index to store the rotated word to; when blank the
+;               ld_C index is used instead
+;   RotNum0   = round number (taken mod 8 for the rotation constant)
+;   RotNum1   = mix index within the round
+;   _debug_   = when nonzero: in a debug build, call Skein_Debug_Round
+;               after the step; in a non-debug build, skip the store of the
+;               rotated word so that it stays only in edx:ecx
+; esi is used as the scratch register for the rotation.
+MixStep     macro   BLK_SIZE,ld_A,ld_C,st_A,st_C,RotNum0,RotNum1,_debug_
+  ifnb <ld_A>
+    mov     eax,[esp+X_stk+8*(ld_A)+0]
+    mov     ebx,[esp+X_stk+8*(ld_A)+4]
+  endif
+  ifnb <ld_C>
+    mov     ecx,[esp+X_stk+8*(ld_C)+0]
+    mov     edx,[esp+X_stk+8*(ld_C)+4]
+  endif
+    add     eax, ecx                ;X[A] += X[C]
+    adc     ebx, edx
+  ifnb <st_A>
+    mov         [esp+X_stk+8*(st_A)+0],eax
+    mov         [esp+X_stk+8*(st_A)+4],ebx
+  endif
+__rNum0 = (RotNum0) AND 7
+    RotL64  ecx, edx, esi,%(BLK_SIZE),%(__rNum0),%(RotNum1) ;X[C] <<<= RC_<BLK_BITS,RotNum0,RotNum1>
+    xor     ecx, eax                ;X[C] ^= X[A]
+    xor     edx, ebx
+  if _SKEIN_DEBUG or  (0 eq (_debug_ + 0))
+   ifb <st_C>
+    mov         [esp+X_stk+8*(ld_C)+0],ecx
+    mov         [esp+X_stk+8*(ld_C)+4],edx
+   else
+    mov         [esp+X_stk+8*(st_C)+0],ecx
+    mov         [esp+X_stk+8*(st_C)+4],edx
+   endif
+  endif
+  if _SKEIN_DEBUG and (0 ne (_debug_ + 0))
+    Skein_Debug_Round BLK_SIZE,%(RotNum0+1)
+  endif
+endm ;MixStep
+;
+;;;;;;;;;;;;;;;;;
+;
+; MACRO: key schedule injection
+;
+;   BLK_SIZE  = block size in bits
+;   X_load    = X_stk[] index to load into rHi:rLo first (blank = the value
+;               is already in the registers)
+;   X_stor    = X_stk[] index to store the result to (blank = no store)
+;   rLo,rHi   = register pair holding the 64-bit state word
+;   rndBase   = injection number; used only in the fully unrolled build,
+;               where it fixes the schedule offsets at assembly time
+;   keyIdx    = key schedule word index added to the state word
+;   twkIdx    = tweak schedule word index to add as well (blank = none)
+;   ROUND_ADD = when non-blank, also add the injection number: the given
+;               value in the unrolled build, edi in the looping build
+; In the looping build the schedule words are addressed through edi.
+ks_Inject macro BLK_SIZE,X_load,X_stor,rLo,rHi,rndBase,keyIdx,twkIdx,ROUND_ADD
+    ;are rLo,rHi values already loaded? if not, load them now
+  ifnb <X_load> 
+    mov     rLo,[esp+X_stk +8*(X_load)  ]
+    mov     rHi,[esp+X_stk +8*(X_load)+4]
+  endif
+
+  ;inject the 64-bit key schedule value (and maybe the tweak as well)
+if SKEIN_ASM_UNROLL and BLK_SIZE
+_kOffs_ = ((rndBase)+(keyIdx)) mod ((BLK_SIZE/64)+1)
+    add     rLo,[FP_+ksKey+8*_kOffs_+ 0]
+    adc     rHi,[FP_+ksKey+8*_kOffs_+ 4]
+  ifnb <twkIdx>
+_tOffs_ = ((rndBase)+(twkIdx)) mod 3
+    add     rLo,[FP_+ksTwk+8*_tOffs_+ 0]
+    adc     rHi,[FP_+ksTwk+8*_tOffs_+ 4]
+  endif
+  ifnb <ROUND_ADD>
+    add     rLo,(ROUND_ADD)
+    adc     rHi,0
+  endif
+else
+    add     rLo,[FP_+ksKey+8*(keyIdx)+8*edi  ]
+    adc     rHi,[FP_+ksKey+8*(keyIdx)+8*edi+4]
+  ifnb <twkIdx>
+    add     rLo,[FP_+ksTwk+8*(twkIdx)+8*edi  ]
+    adc     rHi,[FP_+ksTwk+8*(twkIdx)+8*edi+4]
+  endif
+  ifnb <ROUND_ADD>
+    add     rLo,edi                     ;edi is the round number 
+    adc     rHi,0
+  endif
+endif
+
+  ;do we need to store updated rLo,rHi values? if so, do it now
+  ifnb <X_stor>
+    mov         [esp+X_stk +8*(X_stor)  ],rLo
+    mov         [esp+X_stk +8*(X_stor)+4],rHi
+  endif
+endm ;ks_Inject
+;
+;----------------------------------------------------------------
+; MACRO: key schedule rotation
+;
+;   rLo,rHi = scratch register pair (overwritten)
+;   WCNT    = number of 64-bit words per block
+; Copies key schedule word [edi] to slot [edi+WCNT+1] and tweak schedule
+; word [edi] to slot [edi+3], so that schedule words indexed through edi
+; stay available as edi advances.
+ks_Rotate macro rLo,rHi,WCNT
+    mov   rLo,[FP_+ksKey+8*edi+ 0]       ;"rotate" the key schedule in memory
+    mov   rHi,[FP_+ksKey+8*edi+ 4]
+    mov       [FP_+ksKey+8*edi+8*(WCNT+1)+ 0],rLo
+    mov       [FP_+ksKey+8*edi+8*(WCNT+1)+ 4],rHi
+    mov   rLo,[FP_+ksTwk+8*edi+ 0]
+    mov   rHi,[FP_+ksTwk+8*edi+ 4]
+    mov       [FP_+ksTwk+8*edi+8*3+ 0],rLo
+    mov       [FP_+ksTwk+8*edi+8*3+ 4],rHi
+endm
+;
+;----------------------------------------------------------------
+;
+if _USE_ASM_ and 256
+    public      _Skein_256_Process_Block
+;
+; void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+;;;;;;;;;;;;;;;;;
+;
+; MACRO: two rounds
+;
+;   _RR_ = number of the first of the two rounds
+;   ld_0 = X_stk[] index to load X[0] from for the first mix; leave blank
+;          when X[0] is already in ebx:eax
+; Each round is two MixStep calls; the second call of each round passes a
+; nonzero debug flag, so that step is followed by the debug dump in a debug
+; build and leaves its rotated word only in edx:ecx otherwise.
+R_256_TwoRounds macro _RR_,ld_0
+    ; here with edx:ecx = X[1]
+    ;--------- round _RR_
+    MixStep 256,ld_0, ,0,1,((_RR_)+0),0
+    MixStep 256,   2,3,2,3,((_RR_)+0),1,1
+
+    ; here with edx:ecx = X[3]
+    ;--------- round _RR_ + 1
+    MixStep 256,   0, ,0,3,((_RR_)+1),0
+    MixStep 256,   2,1,2,1,((_RR_)+1),1,1
+
+    ; here with edx:ecx = X[1]
+endm ;R_256_TwoRounds
+;
+;;;;;;;;;;;;;;;;;
+;
+; code
+;
+_Skein_256_Process_Block proc near
+    WCNT    =   4                   ;WCNT=4 for Skein-256
+    Setup_Stack WCNT,(ROUNDS_256/8)
+
+    ; main hash loop for Skein_256
+Skein_256_block_loop:
+    mov     eax,[edi+TWEAK+ 0]      ;ebx:eax = tweak word T0
+    mov     ebx,[edi+TWEAK+ 4]
+    mov     ecx,[edi+TWEAK+ 8]      ;edx:ecx = tweak word T1
+    mov     edx,[edi+TWEAK+12]
+
+    add     eax,[FP_+bitAdd  ]      ;bump T0 by the bitAdd parameter
+    adc     ebx, 0
+    mov         [edi+TWEAK   ],eax  ;save updated tweak value T0
+    mov         [edi+TWEAK+ 4],ebx
+
+    mov         [FP_+ksTwk   ],eax  ;build the tweak schedule on the stack
+    mov         [FP_+ksTwk+ 4],ebx
+    xor     eax,ecx                 ;ebx:eax = T0 ^ T1
+    xor     ebx,edx
+    mov         [FP_+ksTwk+ 8],ecx
+    mov         [FP_+ksTwk+12],edx
+    mov         [FP_+ksTwk+16],eax
+    mov         [FP_+ksTwk+20],ebx
+
+    mov     eax,KW_PARITY_LO        ;init parity accumulator
+    mov     ebx,KW_PARITY_HI
+;
+_NN_ = 0
+  rept WCNT                         ;copy in the chaining vars
+    mov     ecx,[edi+X_VARS+_NN_   ]
+    mov     edx,[edi+X_VARS+_NN_+ 4]
+    xor     eax,ecx                 ;compute overall parity along the way
+    xor     ebx,edx
+    mov         [FP_+ksKey +_NN_   ],ecx
+    mov         [FP_+ksKey +_NN_+ 4],edx
+_NN_ = _NN_+8
+  endm
+;
+    mov         [FP_+ksKey +_NN_   ],eax ;save overall parity at the end of the array
+    mov         [FP_+ksKey +_NN_+ 4],ebx
+
+    mov     esi,[FP_+blkPtr ]       ;esi --> input block
+;
+_NN_ = WCNT*8-16                    ;work down from the end
+  rept WCNT/2                       ;perform initial key injection
+    mov     eax,[esi+_NN_       + 0]
+    mov     ebx,[esi+_NN_       + 4]
+    mov     ecx,[esi+_NN_       + 8]
+    mov     edx,[esi+_NN_       +12]
+    mov         [esp+_NN_+Wcopy + 0],eax
+    mov         [esp+_NN_+Wcopy + 4],ebx
+    mov         [esp+_NN_+Wcopy + 8],ecx
+    mov         [esp+_NN_+Wcopy +12],edx
+    add     eax,[FP_+_NN_+ksKey + 0]
+    adc     ebx,[FP_+_NN_+ksKey + 4]
+    add     ecx,[FP_+_NN_+ksKey + 8]
+    adc     edx,[FP_+_NN_+ksKey +12]
+   if     _NN_ eq (WCNT*8-16)       ;inject the tweak words
+    add     eax,[FP_+     ksTwk + 8];   (at the appropriate points)
+    adc     ebx,[FP_+     ksTwk +12]
+   elseif _NN_ eq (WCNT*8-32)
+    add     ecx,[FP_+     ksTwk + 0]
+    adc     edx,[FP_+     ksTwk + 4]
+   endif
+   if _NN_ or _SKEIN_DEBUG
+    mov         [esp+_NN_+X_stk + 0],eax
+    mov         [esp+_NN_+X_stk + 4],ebx
+    mov         [esp+_NN_+X_stk + 8],ecx
+    mov         [esp+_NN_+X_stk +12],edx
+   endif
+_NN_ = _NN_ - 16                    ;end at X[0], so regs are already loaded for first MIX!
+  endm
+;
+if _SKEIN_DEBUG                     ;debug dump of state at this point
+    Skein_Debug_Block WCNT*64 
+    Skein_Debug_Round WCNT*64,SKEIN_RND_KEY_INITIAL
+endif
+    add     esi, WCNT*8             ;skip the block
+    mov         [FP_+blkPtr   ],esi ;update block pointer
+    ;
+    ; now the key schedule is computed. Start the rounds
+    ;
+if SKEIN_ASM_UNROLL and 256
+_UNROLL_CNT =   ROUNDS_256/8
+else
+_UNROLL_CNT =   SKEIN_UNROLL_256    ;unroll count
+  if ((ROUNDS_256/8) mod _UNROLL_CNT)
+    .err "Invalid SKEIN_UNROLL_256"
+  endif
+    xor     edi,edi                 ;edi = iteration count
+Skein_256_round_loop:
+endif
+_Rbase_ = 0
+rept _UNROLL_CNT*2
+      ; here with X[0], X[1] already loaded into eax..edx
+      R_256_TwoRounds %(4*_Rbase_+00),
+      R_256_TwoRounds %(4*_Rbase_+02),0
+
+      ;inject key schedule
+  if _UNROLL_CNT ne (ROUNDS_256/8)
+      ks_Rotate eax,ebx,WCNT
+      inc   edi                     ;edi = round number
+  endif
+_Rbase_ = _Rbase_+1
+      ks_Inject 256,3,3,eax,ebx,_Rbase_,3, ,_Rbase_
+      ks_Inject 256,2,2,eax,ebx,_Rbase_,2,1
+      ks_Inject 256, , ,ecx,edx,_Rbase_,1,0
+      ks_Inject 256,0, ,eax,ebx,_Rbase_,0
+  if _SKEIN_DEBUG
+      Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT,saveRegs
+  endif
+endm ;rept _UNROLL_CNT
+;
+  if _UNROLL_CNT ne (ROUNDS_256/8)
+    cmp     edi,2*(ROUNDS_256/8)
+    jb      Skein_256_round_loop
+    mov     edi,[FP_+ctxPtr ]           ;restore edi --> context
+  endif
+    ;----------------------------
+    ; feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..3}
+_NN_ = 0
+ rept WCNT/2
+   if _NN_  ;eax..edx already loaded the first time
+    mov     eax,[esp+X_stk + _NN_ + 0]
+    mov     ebx,[esp+X_stk + _NN_ + 4]
+    mov     ecx,[esp+X_stk + _NN_ + 8]
+    mov     edx,[esp+X_stk + _NN_ +12]
+   endif
+   if _NN_ eq 0
+    and     dword ptr [edi +TWEAK +12],FIRST_MASK
+   endif
+    xor     eax,[esp+Wcopy + _NN_ + 0]
+    xor     ebx,[esp+Wcopy + _NN_ + 4]
+    xor     ecx,[esp+Wcopy + _NN_ + 8]
+    xor     edx,[esp+Wcopy + _NN_ +12]
+    mov         [edi+X_VARS+ _NN_ + 0],eax
+    mov         [edi+X_VARS+ _NN_ + 4],ebx
+    mov         [edi+X_VARS+ _NN_ + 8],ecx
+    mov         [edi+X_VARS+ _NN_ +12],edx
+_NN_ = _NN_+16
+  endm
+if _SKEIN_DEBUG
+    Skein_Debug_Round 256,SKEIN_RND_FEED_FWD
+endif
+    ; go back for more blocks, if needed
+    dec     dword ptr [FP_+blkCnt]
+    jnz     Skein_256_block_loop
+    
+    Reset_Stack _Skein_256_Process_Block
+    ret
+_Skein_256_Process_Block endp
+;
+ifdef _SKEIN_CODE_SIZE              ;optional helper routines, assembled only when _SKEIN_CODE_SIZE is defined
+    public  _Skein_256_Process_Block_CodeSize
+_Skein_256_Process_Block_CodeSize proc ;returns in eax the byte distance from _Skein_256_Process_Block to this label
+    mov     eax,_Skein_256_Process_Block_CodeSize - _Skein_256_Process_Block
+    ret
+_Skein_256_Process_Block_CodeSize endp
+; _Skein_256_Unroll_Cnt: returns in eax the assembled unroll count, or 0 when the rounds are fully unrolled
+    public  _Skein_256_Unroll_Cnt
+_Skein_256_Unroll_Cnt proc
+  if _UNROLL_CNT ne ROUNDS_256/8      ;looping build: report the unroll count
+    mov     eax,_UNROLL_CNT
+  else                                ;_UNROLL_CNT equals ROUNDS_256/8: fully unrolled, report 0
+    xor     eax,eax
+  endif
+    ret
+_Skein_256_Unroll_Cnt endp
+endif ;_SKEIN_CODE_SIZE
+endif ;_USE_ASM_ and 256
+;
+;----------------------------------------------------------------
+;
+if _USE_ASM_ and 512
+    public      _Skein_512_Process_Block
+;
+; void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+;;;;;;;;;;;;;;;;;
+; MACRO: four rounds of Skein-512 (rounds _RR_+0 .. _RR_+3), four MixStep invocations per round
+;   _RR_ = number of the first round; ld_0 is handed to the first MixStep only (as its 2nd argument)
+R_512_FourRounds macro _RR_,ld_0
+    ; here with edx:ecx = X[1]
+    ;--------- round _RR_
+    ; R512(0,1,2,3,4,5,6,7,R_0, 1);
+    MixStep 512, ld_0, ,0,1,((_RR_)+0),0
+    MixStep 512,    2,3,2,3,((_RR_)+0),1
+    MixStep 512,    4,5,4,5,((_RR_)+0),2
+    MixStep 512,    6,7,6, ,((_RR_)+0),3,1
+
+    ; round _RR_+1 -- here with edx:ecx = X[7]
+    ; R512(2,1,4,7,6,5,0,3,R_1, 2);
+    MixStep 512,    4, ,4,7,((_RR_)+1),1
+    MixStep 512,    6,5,6,5,((_RR_)+1),2
+    MixStep 512,    0,3,0,3,((_RR_)+1),3
+    MixStep 512,    2,1,2, ,((_RR_)+1),0,1
+
+    ; round _RR_+2 -- here with edx:ecx = X[1]
+    ; R512(4,1,6,3,0,5,2,7,R_2, 3);
+    MixStep 512,    4, ,4,1,((_RR_)+2),0
+    MixStep 512,    6,3,6,3,((_RR_)+2),1
+    MixStep 512,    0,5,0,5,((_RR_)+2),2
+    MixStep 512,    2,7,2, ,((_RR_)+2),3,1
+
+    ; round _RR_+3 -- here with edx:ecx = X[7]
+    ; R512(6,1,0,7,2,5,4,3,R_3, 4);
+    MixStep 512,    0, ,0,7,((_RR_)+3),1
+    MixStep 512,    2,5,2,5,((_RR_)+3),2
+    MixStep 512,    4,3,4,3,((_RR_)+3),3
+    MixStep 512,    6,1,6, ,((_RR_)+3),0,1
+
+endm ;R_512_FourRounds
+;
+;;;;;;;;;;;;;;;;;
+; code: _Skein_512_Process_Block -- per input block: bump tweak T0 by bitAdd, build the tweak/key schedule
+;       on the stack, do the initial key injection, run the rounds, then feed forward into ctx->X[]
+_Skein_512_Process_Block proc near
+    WCNT    =   8                   ;WCNT=8 for Skein-512
+    Setup_Stack WCNT,(ROUNDS_512/8)
+
+    ; main hash loop for Skein_512
+Skein_512_block_loop:
+    mov     eax,[edi+TWEAK+ 0]      ;ebx:eax = tweak word T0
+    mov     ebx,[edi+TWEAK+ 4]
+    mov     ecx,[edi+TWEAK+ 8]      ;edx:ecx = tweak word T1
+    mov     edx,[edi+TWEAK+12]
+
+    add     eax,[FP_+bitAdd  ]      ;bump T0 by the bitAdd parameter
+    adc     ebx, 0
+    mov         [edi+TWEAK   ],eax  ;save updated tweak value T0
+    mov         [edi+TWEAK+ 4],ebx
+
+    mov         [FP_+ksTwk   ],eax  ;build the tweak schedule on the stack
+    mov         [FP_+ksTwk+ 4],ebx
+    xor     eax,ecx                 ;ebx:eax = T0 ^ T1
+    xor     ebx,edx
+    mov         [FP_+ksTwk+ 8],ecx
+    mov         [FP_+ksTwk+12],edx
+    mov         [FP_+ksTwk+16],eax  ;third tweak schedule word = T0 ^ T1
+    mov         [FP_+ksTwk+20],ebx
+
+    mov     eax,KW_PARITY_LO        ;init parity accumulator
+    mov     ebx,KW_PARITY_HI
+;
+_NN_ = 0
+  rept WCNT                         ;copy in the chaining vars
+    mov     ecx,[edi+X_VARS+_NN_   ]
+    mov     edx,[edi+X_VARS+_NN_+ 4]
+    xor     eax,ecx                 ;compute overall parity along the way
+    xor     ebx,edx
+    mov         [FP_+ksKey +_NN_   ],ecx
+    mov         [FP_+ksKey +_NN_+ 4],edx
+_NN_ = _NN_+8
+  endm
+;
+    mov         [FP_+ksKey +_NN_   ],eax ;save overall parity at the end of the array
+    mov         [FP_+ksKey +_NN_+ 4],ebx
+
+    mov     esi,[FP_+blkPtr ]       ;esi --> input block
+;
+_NN_ = WCNT*8-16                    ;work down from the end
+  rept WCNT/2                       ;perform initial key injection
+    mov     eax,[esi+_NN_       + 0]
+    mov     ebx,[esi+_NN_       + 4]
+    mov     ecx,[esi+_NN_       + 8]
+    mov     edx,[esi+_NN_       +12]
+    mov         [esp+_NN_+Wcopy + 0],eax
+    mov         [esp+_NN_+Wcopy + 4],ebx
+    mov         [esp+_NN_+Wcopy + 8],ecx
+    mov         [esp+_NN_+Wcopy +12],edx
+    add     eax,[FP_+_NN_+ksKey + 0]
+    adc     ebx,[FP_+_NN_+ksKey + 4]
+    add     ecx,[FP_+_NN_+ksKey + 8]
+    adc     edx,[FP_+_NN_+ksKey +12]
+   if     _NN_ eq (WCNT*8-16)       ;inject the tweak words
+    add     eax,[FP_+     ksTwk + 8];   (at the appropriate points)
+    adc     ebx,[FP_+     ksTwk +12]
+   elseif _NN_ eq (WCNT*8-32)
+    add     ecx,[FP_+     ksTwk + 0]
+    adc     edx,[FP_+     ksTwk + 4]
+   endif
+   if _NN_ or _SKEIN_DEBUG          ;X[0],X[1] stay in registers; they are stored only for debug dumps
+    mov         [esp+_NN_+X_stk + 0],eax
+    mov         [esp+_NN_+X_stk + 4],ebx
+    mov         [esp+_NN_+X_stk + 8],ecx
+    mov         [esp+_NN_+X_stk +12],edx
+   endif
+_NN_ = _NN_ - 16                    ;end at X[0], so regs are already loaded for first MIX!
+  endm
+;
+if _SKEIN_DEBUG                     ;debug dump of state at this point
+    Skein_Debug_Block WCNT*64 
+    Skein_Debug_Round WCNT*64,SKEIN_RND_KEY_INITIAL
+endif
+    add     esi, WCNT*8             ;skip the block
+    mov         [FP_+blkPtr   ],esi ;update block pointer
+    ;
+    ; now the key schedule is computed. Start the rounds
+    ;
+if SKEIN_ASM_UNROLL and 512         ;fully unrolled build: straight-line code, no round loop
+_UNROLL_CNT =   ROUNDS_512/8
+else                                ;looping build: edi is borrowed as the loop counter
+_UNROLL_CNT =   SKEIN_UNROLL_512
+  if ((ROUNDS_512/8) mod _UNROLL_CNT)
+    .err "Invalid SKEIN_UNROLL_512"
+  endif
+    xor     edi,edi                 ;edi = round counter
+Skein_512_round_loop:
+endif
+_Rbase_ = 0
+rept _UNROLL_CNT*2
+      ; here with X[0], X[1] already loaded into eax..edx
+      R_512_FourRounds %(4*_Rbase_+00),
+
+      ;inject odd  key schedule words
+  if _UNROLL_CNT ne (ROUNDS_512/8)
+      ks_Rotate eax,ebx,WCNT
+      inc   edi                     ;edi = round number
+  endif
+_Rbase_ = _Rbase_+1
+      ks_Inject 512,7,7,eax,ebx,_Rbase_,7, ,_Rbase_
+      ks_Inject 512,6,6,eax,ebx,_Rbase_,6,1
+      ks_Inject 512,5,5,eax,ebx,_Rbase_,5,0
+      ks_Inject 512,4,4,eax,ebx,_Rbase_,4
+      ks_Inject 512,3,3,eax,ebx,_Rbase_,3
+      ks_Inject 512,2,2,eax,ebx,_Rbase_,2
+      ks_Inject 512, , ,ecx,edx,_Rbase_,1
+      ks_Inject 512,0, ,eax,ebx,_Rbase_,0
+  if _SKEIN_DEBUG
+      Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT ,saveRegs
+  endif
+endm ;rept _UNROLL_CNT*2
+;
+if (SKEIN_ASM_UNROLL and 512) eq 0  ;looping build only
+    cmp     edi,2*(ROUNDS_512/8)    ;edi is bumped once per four rounds
+    jb      Skein_512_round_loop
+    mov     edi,[FP_+ctxPtr ]           ;restore edi --> context
+endif
+    ;----------------------------
+    ; feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..7}
+_NN_ = 0
+ rept WCNT/2
+   if _NN_  ;eax..edx already loaded the first time
+    mov     eax,[esp+X_stk + _NN_ + 0]
+    mov     ebx,[esp+X_stk + _NN_ + 4]
+    mov     ecx,[esp+X_stk + _NN_ + 8]
+    mov     edx,[esp+X_stk + _NN_ +12]
+   endif
+   if _NN_ eq 0
+    and     dword ptr [edi + TWEAK+12],FIRST_MASK ;high dword of T1; presumably clears the FIRST-block flag -- confirm FIRST_MASK
+   endif
+    xor     eax,[esp+Wcopy + _NN_ + 0]
+    xor     ebx,[esp+Wcopy + _NN_ + 4]
+    xor     ecx,[esp+Wcopy + _NN_ + 8]
+    xor     edx,[esp+Wcopy + _NN_ +12]
+    mov         [edi+X_VARS+ _NN_ + 0],eax
+    mov         [edi+X_VARS+ _NN_ + 4],ebx
+    mov         [edi+X_VARS+ _NN_ + 8],ecx
+    mov         [edi+X_VARS+ _NN_ +12],edx
+_NN_ = _NN_+16
+  endm
+if _SKEIN_DEBUG
+    Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
+endif
+    ; go back for more blocks, if needed
+    dec     dword ptr [FP_+blkCnt]  ;one block done; the count lives in the caller's parameter slot
+    jnz     Skein_512_block_loop
+
+    Reset_Stack _Skein_512_Process_Block
+    ret
+_Skein_512_Process_Block endp
+;
+ifdef _SKEIN_CODE_SIZE              ;optional helper routines, assembled only when _SKEIN_CODE_SIZE is defined
+    public  _Skein_512_Process_Block_CodeSize
+_Skein_512_Process_Block_CodeSize proc ;returns in eax the byte distance from _Skein_512_Process_Block to this label
+    mov     eax,_Skein_512_Process_Block_CodeSize - _Skein_512_Process_Block
+    ret
+_Skein_512_Process_Block_CodeSize endp
+; _Skein_512_Unroll_Cnt: returns in eax the assembled unroll count, or 0 when the rounds are fully unrolled
+    public  _Skein_512_Unroll_Cnt
+_Skein_512_Unroll_Cnt proc
+  if _UNROLL_CNT ne ROUNDS_512/8      ;looping build: report the unroll count
+    mov     eax,_UNROLL_CNT
+  else                                ;_UNROLL_CNT equals ROUNDS_512/8: fully unrolled, report 0
+    xor     eax,eax
+  endif
+    ret
+_Skein_512_Unroll_Cnt endp
+endif ;_SKEIN_CODE_SIZE
+;
+endif ; _USE_ASM_ and 512
+;
+;----------------------------------------------------------------
+;
+if _USE_ASM_ and 1024
+    public      _Skein1024_Process_Block
+;
+; void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+;;;;;;;;;;;;;;;;;
+; MACRO: four rounds of Skein-1024 (rounds _RR_+0 .. _RR_+3), eight MixStep invocations per round
+;   _RR_ = number of the first round; ld_0 is handed to the first MixStep only (as its 2nd argument)
+R_1024_FourRounds macro _RR_,ld_0
+    ; here with edx:ecx = X[1]
+
+    ;--------- round _RR_
+    MixStep 1024, ld_0,  , 0, 1,((_RR_)+0),0
+    MixStep 1024,    2, 3, 2, 3,((_RR_)+0),1
+    MixStep 1024,    4, 5, 4, 5,((_RR_)+0),2
+    MixStep 1024,    6, 7, 6, 7,((_RR_)+0),3
+    MixStep 1024,    8, 9, 8, 9,((_RR_)+0),4
+    MixStep 1024,   10,11,10,11,((_RR_)+0),5
+    MixStep 1024,   12,13,12,13,((_RR_)+0),6
+    MixStep 1024,   14,15,14,  ,((_RR_)+0),7,1
+    ; here with edx:ecx = X[15]
+
+    ;--------- round _RR_+1
+    MixStep 1024,    4,  , 4,15,((_RR_)+1),3
+    MixStep 1024,    0, 9, 0, 9,((_RR_)+1),0
+    MixStep 1024,    2,13, 2,13,((_RR_)+1),1
+    MixStep 1024,    6,11, 6,11,((_RR_)+1),2
+    MixStep 1024,   10, 7,10, 7,((_RR_)+1),4
+    MixStep 1024,   12, 3,12, 3,((_RR_)+1),5
+    MixStep 1024,   14, 5,14, 5,((_RR_)+1),6
+    MixStep 1024,    8, 1, 8,  ,((_RR_)+1),7,1
+    ; here with edx:ecx = X[1]
+
+    ;--------- round _RR_+2
+    MixStep 1024,    6,  , 6, 1,((_RR_)+2),3    
+    MixStep 1024,    0, 7, 0, 7,((_RR_)+2),0    
+    MixStep 1024,    2, 5, 2, 5,((_RR_)+2),1
+    MixStep 1024,    4, 3, 4, 3,((_RR_)+2),2    
+    MixStep 1024,   12,15,12,15,((_RR_)+2),4
+    MixStep 1024,   14,13,14,13,((_RR_)+2),5    
+    MixStep 1024,    8,11, 8,11,((_RR_)+2),6    
+    MixStep 1024,   10, 9,10,  ,((_RR_)+2),7,1
+    ; here with edx:ecx = X[9]
+
+    ;--------- round _RR_+3
+    MixStep 1024,    4,  , 4, 9,((_RR_)+3),3
+    MixStep 1024,    0,15, 0,15,((_RR_)+3),0
+    MixStep 1024,    2,11, 2,11,((_RR_)+3),1
+    MixStep 1024,    6,13, 6,13,((_RR_)+3),2
+    MixStep 1024,    8, 5, 8, 5,((_RR_)+3),5
+    MixStep 1024,   10, 3,10, 3,((_RR_)+3),6
+    MixStep 1024,   12, 7,12, 7,((_RR_)+3),7
+    MixStep 1024,   14, 1,14,  ,((_RR_)+3),4,1
+
+    ; here with edx:ecx = X[1]
+endm ;R_1024_FourRounds
+;
+;;;;;;;;;;;;;;;;;
+; code: _Skein1024_Process_Block -- per input block: bump tweak T0 by bitAdd, build the tweak/key schedule
+;       on the stack, do the initial key injection, run the rounds, then feed forward into ctx->X[]
+_Skein1024_Process_Block proc near
+;
+    WCNT    =   16                   ;WCNT=16 for Skein-1024
+    Setup_Stack WCNT,(ROUNDS_1024/8)
+
+    ; main hash loop for Skein1024
+Skein1024_block_loop:
+    mov     eax,[edi+TWEAK+ 0]      ;ebx:eax = tweak word T0
+    mov     ebx,[edi+TWEAK+ 4]
+    mov     ecx,[edi+TWEAK+ 8]      ;edx:ecx = tweak word T1
+    mov     edx,[edi+TWEAK+12]
+
+    add     eax,[FP_+bitAdd  ]      ;bump T0 by the bitAdd parameter
+    adc     ebx, 0
+    mov         [edi+TWEAK   ],eax  ;save updated tweak value T0
+    mov         [edi+TWEAK+ 4],ebx
+
+    mov         [FP_+ksTwk   ],eax  ;build the tweak schedule on the stack
+    mov         [FP_+ksTwk+ 4],ebx
+    xor     eax,ecx                 ;ebx:eax = T0 ^ T1
+    xor     ebx,edx
+    mov         [FP_+ksTwk+ 8],ecx
+    mov         [FP_+ksTwk+12],edx
+    mov         [FP_+ksTwk+16],eax  ;third tweak schedule word = T0 ^ T1
+    mov         [FP_+ksTwk+20],ebx
+
+    mov     eax,KW_PARITY_LO        ;init parity accumulator
+    mov     ebx,KW_PARITY_HI
+EDI_BIAS    equ 70h                 ;bias the edi offsets to make them short!
+    add     edi, EDI_BIAS
+CT_ equ     <edi-EDI_BIAS>          ;[CT_+field] addresses a context field while edi is biased
+;
+_NN_ = 0
+  rept WCNT                         ;copy in the chaining vars
+    mov     ecx,[CT_+X_VARS+_NN_   ]
+    mov     edx,[CT_+X_VARS+_NN_+ 4]
+    xor     eax,ecx                 ;compute overall parity along the way
+    xor     ebx,edx
+    mov         [FP_+ksKey +_NN_   ],ecx
+    mov         [FP_+ksKey +_NN_+ 4],edx
+_NN_ = _NN_+8
+  endm
+;
+    mov         [FP_+ksKey +_NN_   ],eax ;save overall parity at the end of the array
+    mov         [FP_+ksKey +_NN_+ 4],ebx
+
+    mov     esi,[FP_+blkPtr ]       ;esi --> input block
+    lea     edi,[esp+Wcopy]         ;edi --> Wcopy (the context pointer is reloaded from ctxPtr after the rounds)
+;
+_NN_ = WCNT*8-16                    ;work down from the end
+  rept WCNT/2                       ;perform initial key injection
+    mov     eax,[esi+_NN_       + 0]
+    mov     ebx,[esi+_NN_       + 4]
+    mov     ecx,[esi+_NN_       + 8]
+    mov     edx,[esi+_NN_       +12]
+    mov         [edi+_NN_+      + 0],eax
+    mov         [edi+_NN_+      + 4],ebx
+    mov         [edi+_NN_+      + 8],ecx
+    mov         [edi+_NN_+      +12],edx
+    add     eax,[FP_+_NN_+ksKey + 0]
+    adc     ebx,[FP_+_NN_+ksKey + 4]
+    add     ecx,[FP_+_NN_+ksKey + 8]
+    adc     edx,[FP_+_NN_+ksKey +12]
+   if     _NN_ eq (WCNT*8-16)       ;inject the tweak words
+    add     eax,[FP_+     ksTwk + 8];   (at the appropriate points)
+    adc     ebx,[FP_+     ksTwk +12]
+   elseif _NN_ eq (WCNT*8-32)
+    add     ecx,[FP_+     ksTwk + 0]
+    adc     edx,[FP_+     ksTwk + 4]
+   endif
+   if _NN_ or _SKEIN_DEBUG          ;X[0],X[1] stay in registers; they are stored only for debug dumps
+    mov         [esp+_NN_+X_stk + 0],eax
+    mov         [esp+_NN_+X_stk + 4],ebx
+    mov         [esp+_NN_+X_stk + 8],ecx
+    mov         [esp+_NN_+X_stk +12],edx
+   endif
+_NN_ = _NN_ - 16                    ;end at X[0], so regs are already loaded for first MIX!
+  endm
+;
+if _SKEIN_DEBUG                     ;debug dump of state at this point
+    Skein_Debug_Block WCNT*64 
+    Skein_Debug_Round WCNT*64,SKEIN_RND_KEY_INITIAL
+endif
+    sub     esi,-WCNT*8             ;skip the block (short immediate)
+    mov         [FP_+blkPtr   ],esi ;update block pointer
+    ;
+    ; now the key schedule is computed. Start the rounds
+    ;
+if SKEIN_ASM_UNROLL and 1024        ;fully unrolled build: straight-line code, no round loop
+_UNROLL_CNT =   ROUNDS_1024/8
+else                                ;looping build: edi is used as the loop counter
+_UNROLL_CNT =   SKEIN_UNROLL_1024
+  if ((ROUNDS_1024/8) mod _UNROLL_CNT)
+    .err "Invalid SKEIN_UNROLL_1024"
+  endif
+    xor     edi,edi                 ;edi = round counter
+Skein_1024_round_loop:
+endif
+
+_Rbase_ = 0
+rept _UNROLL_CNT*2
+      ; here with X[0], X[1] already loaded into eax..edx
+      R_1024_FourRounds %(4*_Rbase_+00),
+
+      ;inject odd  key schedule words
+      ;  (looping build only: ks_Rotate runs and the edi counter is bumped before the injection)
+  if _UNROLL_CNT ne (ROUNDS_1024/8)
+      ks_Rotate eax,ebx,WCNT
+      inc   edi                     ;edi = round number
+  endif
+_Rbase_ = _Rbase_+1
+      ks_Inject 1024,15,15,eax,ebx,_Rbase_,15, ,_Rbase_
+      ks_Inject 1024,14,14,eax,ebx,_Rbase_,14,1
+      ks_Inject 1024,13,13,eax,ebx,_Rbase_,13,0
+  irp _w,<12,11,10,9,8,7,6,5,4,3,2>
+      ks_Inject 1024,_w,_w,eax,ebx,_Rbase_,_w
+  endm
+      ks_Inject 1024,  ,  ,ecx,edx,_Rbase_,1
+      ks_Inject 1024, 0,  ,eax,ebx,_Rbase_,0
+
+  if _SKEIN_DEBUG
+      Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT ,saveRegs
+  endif
+endm ;rept _UNROLL_CNT*2
+;
+if (SKEIN_ASM_UNROLL and 1024) eq 0 ;looping build only
+    cmp     edi,2*(ROUNDS_1024/8)   ;edi is bumped once per four rounds
+    jb      Skein_1024_round_loop
+endif
+    mov     edi,[FP_+ctxPtr ]           ;restore edi --> context
+    add     edi,EDI_BIAS                ;and bias it for short offsets below
+    ;----------------------------
+    ; feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..15}
+    lea     esi,[esp+Wcopy]             ;use short offsets below
+_NN_ = 0
+ rept WCNT/2
+   if _NN_  ;eax..edx already loaded the first time
+    mov     eax,[esp+X_stk + _NN_ + 0]
+    mov     ebx,[esp+X_stk + _NN_ + 4]
+    mov     ecx,[esp+X_stk + _NN_ + 8]
+    mov     edx,[esp+X_stk + _NN_ +12]
+   endif
+   if _NN_ eq 0
+    and     dword ptr [CT_ + TWEAK+12],FIRST_MASK ;high dword of T1; presumably clears the FIRST-block flag -- confirm FIRST_MASK
+   endif
+    xor     eax,[esi       + _NN_ + 0]
+    xor     ebx,[esi       + _NN_ + 4]
+    xor     ecx,[esi       + _NN_ + 8]
+    xor     edx,[esi       + _NN_ +12]
+    mov         [CT_+X_VARS+ _NN_ + 0],eax
+    mov         [CT_+X_VARS+ _NN_ + 4],ebx
+    mov         [CT_+X_VARS+ _NN_ + 8],ecx
+    mov         [CT_+X_VARS+ _NN_ +12],edx
+_NN_ = _NN_+16
+  endm
+    sub     edi,EDI_BIAS                ;undo the bias for return
+
+if _SKEIN_DEBUG
+    Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD
+endif
+    ; go back for more blocks, if needed
+    dec     dword ptr [FP_+blkCnt]  ;one block done; the count lives in the caller's parameter slot
+    jnz     Skein1024_block_loop
+
+    Reset_Stack _Skein1024_Process_Block
+    ret
+_Skein1024_Process_Block endp
+;
+ifdef _SKEIN_CODE_SIZE              ;optional helper routines, assembled only when _SKEIN_CODE_SIZE is defined
+    public  _Skein1024_Process_Block_CodeSize
+_Skein1024_Process_Block_CodeSize proc ;returns in eax the byte distance from _Skein1024_Process_Block to this label
+    mov     eax,_Skein1024_Process_Block_CodeSize - _Skein1024_Process_Block
+    ret
+_Skein1024_Process_Block_CodeSize endp
+; _Skein1024_Unroll_Cnt: returns in eax the assembled unroll count, or 0 when the rounds are fully unrolled
+    public  _Skein1024_Unroll_Cnt
+_Skein1024_Unroll_Cnt proc
+  if _UNROLL_CNT ne ROUNDS_1024/8     ;looping build: report the unroll count
+    mov     eax,_UNROLL_CNT
+  else                                ;_UNROLL_CNT equals ROUNDS_1024/8: fully unrolled, report 0
+    xor     eax,eax
+  endif
+    ret
+_Skein1024_Unroll_Cnt endp
+endif ;_SKEIN_CODE_SIZE
+;
+endif ; _USE_ASM_ and 1024
+;----------------------------------------------------------------
+    end
diff --git a/Additional_Implementations/skein_block_xmm32.asm b/Additional_Implementations/skein_block_xmm32.asm
new file mode 100644
index 0000000000000..96ef121cd49ab
--- /dev/null
+++ b/Additional_Implementations/skein_block_xmm32.asm
@@ -0,0 +1,1167 @@
+;
+;----------------------------------------------------------------
+; 32-bit x86 assembler code for Skein block functions using XMM registers
+;
+; Author: Doug Whiting, Hifn
+;
+; This code is released to the public domain.
+;----------------------------------------------------------------
+;
+    .386p
+    .model flat
+    .code
+    .xmm                                    ;enable XMM instructions
+;
+_MASK_ALL_  equ (256+512+1024)              ;all three algorithm bits
+; _USE_ASM_ = bit mask (256/512/1024) selecting which block functions this file assembles
+;;;;;;;;;;;;;;;;;
+ifndef SKEIN_USE_ASM                        ;nothing requested: assemble all three
+_USE_ASM_        = _MASK_ALL_
+elseif SKEIN_USE_ASM and _MASK_ALL_         ;at least one valid bit requested: honor the request
+_USE_ASM_        = SKEIN_USE_ASM
+else                                        ;no valid bit set: fall back to all three
+_USE_ASM_        = _MASK_ALL_
+endif
+;
+;;;;;;;;;;;;;;;;;
+ifndef SKEIN_LOOP                           ;SKEIN_LOOP not supplied by the build
+_SKEIN_LOOP       = 0                       ;default is all fully unrolled
+else
+_SKEIN_LOOP       = SKEIN_LOOP
+endif
+;--------------
+; the unroll counts (0 --> fully unrolled), one decimal digit of SKEIN_LOOP per block size
+SKEIN_UNROLL_256  = (_SKEIN_LOOP / 100) mod 10  ;hundreds digit
+SKEIN_UNROLL_512  = (_SKEIN_LOOP /  10) mod 10  ;tens digit
+SKEIN_UNROLL_1024 = (_SKEIN_LOOP      ) mod 10  ;units digit
+; SKEIN_ASM_UNROLL = sum of the block sizes (256/512/1024) whose unroll count is 0, i.e. fully unrolled
+SKEIN_ASM_UNROLL  = 0
+  irp _NN_,<256,512,1024>
+    if (SKEIN_UNROLL_&_NN_) eq 0
+SKEIN_ASM_UNROLL  = SKEIN_ASM_UNROLL + _NN_
+    endif
+  endm
+;
+;;;;;;;;;;;;;;;;;
+;
+ifndef SKEIN_ROUNDS                         ;no override: default round counts
+ROUNDS_256  =   72
+ROUNDS_512  =   72
+ROUNDS_1024 =   80
+else                                        ;override: decimal digit d selects 8*(((d+5) mod 10)+5) rounds (d=9 -> 72, d=0 -> 80)
+ROUNDS_256  = 8*((((SKEIN_ROUNDS / 100) + 5) mod 10) + 5)
+ROUNDS_512  = 8*((((SKEIN_ROUNDS /  10) + 5) mod 10) + 5)
+ROUNDS_1024 = 8*((((SKEIN_ROUNDS      ) + 5) mod 10) + 5)
+endif
+irp _NN_,<256,512,1024>                     ;report the round count of each assembled block size at assembly time
+  if _USE_ASM_ and _NN_
+    irp _RR_,<%(ROUNDS_&_NN_)>
+      if _NN_ eq 1024
+%out  +++ SKEIN_ROUNDS_&_NN_ = _RR_
+      else
+%out  +++ SKEIN_ROUNDS_&_NN_  = _RR_
+      endif
+    endm
+  endif
+endm
+;;;;;;;;;;;;;;;;;
+;
+ifdef SKEIN_CODE_SIZE                       ;_SKEIN_CODE_SIZE gates the optional code-size helper routines
+_SKEIN_CODE_SIZE equ (1)
+else
+ifdef  SKEIN_PERF                           ;use code size if SKEIN_PERF is defined
+_SKEIN_CODE_SIZE equ (1)
+endif
+endif                                       ;(left undefined when neither symbol is defined)
+;
+;;;;;;;;;;;;;;;;;
+;
+ifndef SKEIN_DEBUG                          ;_SKEIN_DEBUG is always defined (0 or 1) so it can be used in "if" expressions
+_SKEIN_DEBUG      = 0
+else
+_SKEIN_DEBUG      = 1                       ;enables the Skein_Debug_* macros and their call sites
+endif
+;;;;;;;;;;;;;;;;;
+;
+; define offsets of fields in hash context structure (each offset is the previous one plus that field's size)
+;
+HASH_BITS   =   0                           ;offset  0: # bits of hash output
+BCNT        =   4 + HASH_BITS               ;offset  4: number of bytes in BUFFER[]
+TWEAK       =   4 + BCNT                    ;offset  8: tweak values[0..1] (16 bytes)
+X_VARS      =  16 + TWEAK                   ;offset 24: chaining vars
+;
+;(Note: buffer[] in context structure is NOT needed here :-)
+;
+KW_PARITY_LO=   0A9FC1A22h                  ;overall parity of key schedule words (hi32/lo32)
+KW_PARITY_HI=   01BD11BDAh                  ;  (upper 32 bits of the same 64-bit constant)
+FIRST_MASK8 =   NOT (1 SHL 6)               ;mask with bit 6 cleared: FIRST block flag bit
+;
+; rotation constants for Skein, named RC_<bits>_<r>_<j>
+;   (R_256_OneRound picks r = round number and 7, j = 0/1; the 512/1024 names presumably follow the same scheme)
+RC_256_0_0  = 14
+RC_256_0_1  = 16
+
+RC_256_1_0  = 52
+RC_256_1_1  = 57
+
+RC_256_2_0  = 23
+RC_256_2_1  = 40
+
+RC_256_3_0  =  5
+RC_256_3_1  = 37
+
+RC_256_4_0  = 25
+RC_256_4_1  = 33
+
+RC_256_5_0  = 46
+RC_256_5_1  = 12
+
+RC_256_6_0  = 58
+RC_256_6_1  = 22
+
+RC_256_7_0  = 32
+RC_256_7_1  = 32
+
+RC_512_0_0  = 46
+RC_512_0_1  = 36
+RC_512_0_2  = 19
+RC_512_0_3  = 37
+
+RC_512_1_0  = 33
+RC_512_1_1  = 27
+RC_512_1_2  = 14
+RC_512_1_3  = 42
+
+RC_512_2_0  = 17
+RC_512_2_1  = 49
+RC_512_2_2  = 36
+RC_512_2_3  = 39
+
+RC_512_3_0  = 44
+RC_512_3_1  =  9
+RC_512_3_2  = 54
+RC_512_3_3  = 56
+
+RC_512_4_0  = 39
+RC_512_4_1  = 30
+RC_512_4_2  = 34
+RC_512_4_3  = 24
+
+RC_512_5_0  = 13
+RC_512_5_1  = 50
+RC_512_5_2  = 10
+RC_512_5_3  = 17
+
+RC_512_6_0  = 25
+RC_512_6_1  = 29
+RC_512_6_2  = 39
+RC_512_6_3  = 43
+
+RC_512_7_0  =  8
+RC_512_7_1  = 35
+RC_512_7_2  = 56
+RC_512_7_3  = 22
+
+RC_1024_0_0 = 24
+RC_1024_0_1 = 13
+RC_1024_0_2 =  8
+RC_1024_0_3 = 47
+RC_1024_0_4 =  8
+RC_1024_0_5 = 17
+RC_1024_0_6 = 22
+RC_1024_0_7 = 37
+
+RC_1024_1_0 = 38
+RC_1024_1_1 = 19
+RC_1024_1_2 = 10
+RC_1024_1_3 = 55
+RC_1024_1_4 = 49
+RC_1024_1_5 = 18
+RC_1024_1_6 = 23
+RC_1024_1_7 = 52
+
+RC_1024_2_0 = 33
+RC_1024_2_1 =  4
+RC_1024_2_2 = 51
+RC_1024_2_3 = 13
+RC_1024_2_4 = 34
+RC_1024_2_5 = 41
+RC_1024_2_6 = 59
+RC_1024_2_7 = 17
+
+RC_1024_3_0 =  5
+RC_1024_3_1 = 20
+RC_1024_3_2 = 48
+RC_1024_3_3 = 41
+RC_1024_3_4 = 47
+RC_1024_3_5 = 28
+RC_1024_3_6 = 16
+RC_1024_3_7 = 25
+
+RC_1024_4_0 = 41
+RC_1024_4_1 =  9
+RC_1024_4_2 = 37
+RC_1024_4_3 = 31
+RC_1024_4_4 = 12
+RC_1024_4_5 = 47
+RC_1024_4_6 = 44
+RC_1024_4_7 = 30
+
+RC_1024_5_0 = 16
+RC_1024_5_1 = 34
+RC_1024_5_2 = 56
+RC_1024_5_3 = 51
+RC_1024_5_4 =  4
+RC_1024_5_5 = 53
+RC_1024_5_6 = 42
+RC_1024_5_7 = 41
+
+RC_1024_6_0 = 31
+RC_1024_6_1 = 44
+RC_1024_6_2 = 47
+RC_1024_6_3 = 46
+RC_1024_6_4 = 19
+RC_1024_6_5 = 42
+RC_1024_6_6 = 44
+RC_1024_6_7 = 25
+
+RC_1024_7_0 =  9
+RC_1024_7_1 = 48
+RC_1024_7_2 = 35
+RC_1024_7_3 = 52
+RC_1024_7_4 = 23
+RC_1024_7_5 = 31
+RC_1024_7_6 = 37
+RC_1024_7_7 = 20
+;
+mov64 macro x0,x1                   ;64-bit move x0 <- x1; a thin wrapper that expands to movq
+    movq    x0,x1
+endm ;mov64
+;
+;----------------------------------------------------------------
+; declare allocated space on the stack: localName = current _STK_OFFS_, which then advances by localSize bytes
+StackVar    macro localName,localSize
+localName   =   _STK_OFFS_
+_STK_OFFS_  =   _STK_OFFS_+(localSize)
+endm ;StackVar
+;
+;----------------------------------------------------------------
+;
+; MACRO: Configure stack frame, allocate local vars (WCNT = # of 64-bit state words; RND_CNT sizes ksRot)
+;   on exit: ebx --> pushad frame + caller parms, esp = 16-byte aligned locals, edi = ctx, ebp = esp+FRAME_OFFS, ecx = blkCnt
+Setup_Stack macro WCNT,RND_CNT
+_STK_OFFS_  =   0                   ;starting offset from esp, forced on 16-byte alignment
+    ;----- local  variables         ;<-- esp
+    StackVar    X_stk  , 8*(WCNT)   ;local context vars
+    StackVar    Wcopy  , 8*(WCNT)   ;copy of input block    
+    StackVar    ksTwk  ,16*3        ;key schedule: tweak words
+    StackVar    ksKey  ,16*(WCNT)+16;key schedule: key   words
+FRAME_OFFS  =   ksTwk+128           ;<-- ebp
+  if (SKEIN_ASM_UNROLL and (WCNT*64)) eq 0 ;looping build for this block size
+    StackVar    ksRot,16*(RND_CNT/4);leave space for ks "rotation" to happen
+  endif
+LOCAL_SIZE  =   _STK_OFFS_          ;size of local vars
+    ;
+    ;"restart" the stack defns, because we relocate esp to guarantee alignment
+    ;    (i.e., these vars are NOT at fixed offsets from esp)
+_STK_OFFS_  =   0
+    ;----- 
+    StackVar    savRegs,8*4         ;pushad data
+    StackVar    retAddr,4           ;return address
+    ;----- caller parameters
+    StackVar    ctxPtr ,4           ;context ptr
+    StackVar    blkPtr ,4           ;pointer to block data
+    StackVar    blkCnt ,4           ;number of full blocks to process
+    StackVar    bitAdd ,4           ;bit count to add to tweak
+    ;----- caller's stack frame
+;
+; Notes on stack frame setup:
+;   * the most used variable (except for Skein-256) is X_stk[], based at [esp+0]
+;   * the next most used is the key schedule words
+;       so ebp is "centered" there, allowing short offsets to the key/tweak
+;       schedule in 256/512-bit Skein cases, but not possible for Skein-1024 :-(
+;   * the Wcopy variables are infrequently accessed, and they have long 
+;       offsets from both esp and ebp only in the 1024-bit case.
+;   * all other local vars and calling parameters can be accessed 
+;       with short offsets, except in the 1024-bit case
+;
+    pushad                          ;save all regs
+    mov     ebx,esp                 ;keep ebx as pointer to caller parms
+    sub     esp,LOCAL_SIZE          ;make room for the locals
+    and     esp,not 15              ;force alignment
+    mov     edi,[ebx+ctxPtr ]       ;edi --> Skein context
+    lea     ebp,[esp+FRAME_OFFS]    ;maximize use of short offsets from ebp
+    mov     ecx,ptr32 [ebx+blkCnt]  ;keep block cnt in ecx
+;
+endm ;Setup_Stack
+;
+FP_         equ <ebp-FRAME_OFFS>    ;keep as many short offsets as possible (Setup_Stack sets ebp = esp+FRAME_OFFS)
+SI_         equ <esi-FRAME_OFFS>    ;same offset scheme, based on esi (used by the looping key injection)
+ptr64       equ <qword ptr>         ;useful abbreviations
+ptr32       equ <dword ptr>
+ptr08       equ <byte  ptr>
+;
+;----------------------------------------------------------------
+;
+Reset_Stack macro   procStart       ;procStart = label at the start of the procedure (used for the size message)
+    mov     esp,ebx                 ;get rid of locals (wipe??); ebx = esp as saved right after pushad
+    popad                           ;restore all regs
+
+    ;display code size in bytes to stdout (assembly-time message only, no code is emitted)
+  irp  _BCNT_,<%($+1-procStart)>    ;account for return opcode
+if     _BCNT_ ge 10000              ;(align it all pretty)
+%out procStart code size = _BCNT_ bytes  
+elseif _BCNT_ ge  1000
+%out procStart code size =  _BCNT_ bytes  
+else
+%out procStart code size =   _BCNT_ bytes  
+endif
+  endm ;irp _BCNT_
+
+endm ; Reset_Stack
+;
+;----------------------------------------------------------------
+; macros to help debug internals
+;
+if _SKEIN_DEBUG
+    extrn   _Skein_Show_Block:near   ;calls to C routines
+    extrn   _Skein_Show_Round:near
+; pseudo round numbers passed to Skein_Debug_Round; values >= SKEIN_RND_SPECIAL are passed as constants
+SKEIN_RND_SPECIAL       =   1000
+SKEIN_RND_KEY_INITIAL   =   SKEIN_RND_SPECIAL+0
+SKEIN_RND_KEY_INJECT    =   SKEIN_RND_SPECIAL+1
+SKEIN_RND_FEED_FWD      =   SKEIN_RND_SPECIAL+2 ;makes Skein_Debug_Round dump ctx->X[] instead of X_stk
+;
+Skein_Debug_Block macro BLK_BITS
+; Calls the C routine below with 7 cdecl arguments; BLK_BITS selects Put_XMM_<bits>/Get_XMM_<bits>.
+;void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
+;                     const u08b_t *blkPtr, const u64b_t *wPtr, 
+;                     const u64b_t *ksPtr,const u64b_t *tsPtr);
+;
+    Put_XMM_&BLK_BITS               ;store the XMM state first
+    pushad                          ;save all regs
+    lea     eax,[FP_+ksTwk+1]       ;+1 = flag: "stride" size = 2 qwords
+    lea     esi,[FP_+ksKey+1]
+    lea     ecx,[esp+32+Wcopy]      ;adjust offset by 32 for pushad
+    mov     edx,[ebx+ctxPtr]        ;ctx_hdr_ptr
+    lea     edx,[edx+X_VARS]        ;edx ==> cxt->X[]
+    push    eax                     ;tsPtr
+    push    esi                     ;ksPtr
+    push    ecx                     ;wPtr
+    push    ptr32 [ebx+blkPtr]      ;blkPtr
+    push    edx                     ;ctx->Xptr
+    push    ptr32 [ebx+ctxPtr]      ;ctx_hdr_ptr
+    mov     eax,BLK_BITS
+    push    eax                     ;bits
+  ifdef _MINGW_
+    call    _Skein_Show_Block-4     ;strange linkage??
+  else
+    call    _Skein_Show_Block
+  endif
+    add     esp,7*4                 ;discard parameter space on stack
+    popad                           ;restore regs
+;
+    Get_XMM_&BLK_BITS               ;reload the XMM state
+endm ;Skein_Debug_Block
+
+;
+Skein_Debug_Round macro BLK_BITS,R,saveRegs
+; R = round number or a SKEIN_RND_* code; a non-blank saveRegs wraps the call in Put_XMM_/Get_XMM_.
+;void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X);
+;
+  ifnb <saveRegs>
+      Put_XMM_&BLK_BITS       ;save internal vars for debug dump
+  endif
+    pushad                          ;save all regs
+  if R ne SKEIN_RND_FEED_FWD
+    lea     eax,[esp+32+X_stk]      ;adjust offset by 32 for pushad
+  else                              ;after feedforward, dump the chaining vars in the context instead
+    mov     eax,[ebx+ctxPtr]
+    add     eax,X_VARS
+  endif
+    push    eax                     ;Xptr
+  if (SKEIN_ASM_UNROLL and BLK_BITS) or (R ge SKEIN_RND_SPECIAL) ;round number is an assembly-time constant
+    mov     eax,R
+  else
+    lea     eax,[4*edx+1+(((R)-1) and 3)] ;compute round number using edx
+  endif
+    push    eax                     ;round number
+    push    ptr32 [ebx+ctxPtr]      ;ctx_hdr_ptr
+    mov     eax,BLK_BITS
+    push    eax                     ;bits
+  ifdef _MINGW_
+    call    _Skein_Show_Round-4     ;strange linkage??
+  else
+    call    _Skein_Show_Round
+  endif
+    add     esp,4*4                 ;discard parameter space on stack
+    popad                           ;restore regs
+
+  ifnb <saveRegs>
+      Get_XMM_&BLK_BITS       ;reload internal vars after the debug dump
+  endif
+endm  ;Skein_Debug_Round
+endif ;ifdef SKEIN_DEBUG
+;
+;----------------------------------------------------------------
+; useful macros
+_ldX macro xn                       ;load xmm<xn> from X_stk[xn]; emits nothing when xn is blank
+  ifnb <xn>
+    mov64 xmm&xn,ptr64 [esp+X_stk+8*xn]
+  endif
+endm ;_ldX
+
+_stX macro xn                       ;store xmm<xn> to X_stk[xn]; emits nothing when xn is blank
+  ifnb <xn>
+    mov64        ptr64 [esp+X_stk+8*xn],xmm&xn
+  endif
+endm ;_stX
+;
+;----------------------------------------------------------------
+;
+if _USE_ASM_ and 256
+    public      _Skein_256_Process_Block
+;
+; void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+;;;;;;;;;;;;;;;;;
+;
+; Skein-256 round macros
+; R_256_OneRound: x0 += x1, x1 = rotl(x1,_RC0_) ^ x0;  x2 += x3, x3 = rotl(x3,_RC1_) ^ x2  (x*/t* = xmm register numbers)
+R_256_OneRound macro _RR_,x0,x1,x2,x3,t0,t1
+  irp _qq_,<%((_RR_) and 7)>        ;figure out which rotation constants to use
+    if x0 eq 0                      ;x0 = 0: constant _0 goes with the first pair
+_RC0_ =   RC_256_&_qq_&_0
+_RC1_ =   RC_256_&_qq_&_1
+    else                            ;otherwise the two constants are swapped
+_RC0_ =   RC_256_&_qq_&_1
+_RC1_ =   RC_256_&_qq_&_0
+    endif
+  endm
+;
+    paddq   xmm&x0,xmm&x1
+    mov64   xmm&t0,xmm&x1           ;t0 = copy of x1; the rotate is built from two shifts
+    psllq   xmm&x1,   _RC0_
+    psrlq   xmm&t0,64-_RC0_
+    xorpd   xmm&x1,xmm&x0
+    xorpd   xmm&x1,xmm&t0
+;
+    paddq   xmm&x2,xmm&x3
+    mov64   xmm&t1,xmm&x3           ;t1 = copy of x3, same pattern with _RC1_
+    psllq   xmm&x3,   _RC1_
+    psrlq   xmm&t1,64-_RC1_
+    xorpd   xmm&x3,xmm&x2
+    xorpd   xmm&x3,xmm&t1
+  if _SKEIN_DEBUG
+    Skein_Debug_Round 256,%(_RR_+1),saveRegs
+  endif
+endm ;R_256_OneRound
+;
+R_256_FourRounds macro _RN_       ;four rounds starting at round _RN_, followed by one key injection
+    R_256_OneRound (_RN_+0),0,1,2,3,4,5
+    R_256_OneRound (_RN_+1),2,1,0,3,4,5
+
+    R_256_OneRound (_RN_+2),0,1,2,3,4,5
+    R_256_OneRound (_RN_+3),2,1,0,3,4,5
+
+    ;inject key schedule
+    inc   edx                     ;bump round number
+    movd  xmm4,edx                ;xmm4 = round number, added to xmm3 below
+  if _UNROLL_CNT eq (ROUNDS_256/8)
+    ;fully unrolled version
+_RK_ = ((_RN_)/4)                 ;key injection counter
+    paddq xmm0,[FP_+ksKey+16*((_RK_+1) mod 5)]
+    paddq xmm1,[FP_+ksKey+16*((_RK_+2) mod 5)]
+    paddq xmm2,[FP_+ksKey+16*((_RK_+3) mod 5)]
+    paddq xmm3,[FP_+ksKey+16*((_RK_+4) mod 5)]
+    paddq xmm1,[FP_+ksTwk+16*((_RK_+1) mod 3)]
+    paddq xmm2,[FP_+ksTwk+16*((_RK_+2) mod 3)]
+    paddq xmm3,xmm4
+  else ;looping version
+    paddq xmm0,[SI_+ksKey+16*1]
+    paddq xmm1,[SI_+ksKey+16*2]
+    paddq xmm2,[SI_+ksKey+16*3]
+    paddq xmm3,[SI_+ksKey+16*4]
+    paddq xmm1,[SI_+ksTwk+16*1]
+    paddq xmm2,[SI_+ksTwk+16*2]
+    paddq xmm3,xmm4
+;   
+    mov64 xmm4,<ptr64 [SI_+ksKey]>;first, "rotate" key schedule on the stack
+    mov64 xmm5,<ptr64 [SI_+ksTwk]>;    (for next time through)
+    mov64      <ptr64 [SI_+ksKey+16*(WCNT+1)]>,xmm4
+    mov64      <ptr64 [SI_+ksTwk+16*3]>,xmm5
+    add   esi,16                  ;bump rolling pointer
+  endif
+  if _SKEIN_DEBUG
+      Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT,saveRegs
+  endif
+endm ;R_256_FourRounds
+;
+if _SKEIN_DEBUG ; macros for saving/restoring X_stk for debug routines
+Put_XMM_256 equ <call _Put_XMM_256>
+Get_XMM_256 equ <call _Get_XMM_256>
+
+_Put_XMM_256:
+  irp _NN_,<0,1,2,3>
+    mov64           ptr64 [esp+X_stk+4+_NN_*8],xmm&_NN_
+  endm
+    ret
+;
+_Get_XMM_256:
+  irp _NN_,<0,1,2,3>
+    mov64  xmm&_NN_,ptr64 [esp+X_stk+4+_NN_*8]
+  endm
+    ret
+endif
+;
+;;;;;;;;;;;;;;;;;
+;
+; code
+;
+_Skein_256_Process_Block proc near
+    WCNT    =   4                   ;WCNT=4 for Skein-256
+    Setup_Stack WCNT,ROUNDS_256
+    ; main hash loop for Skein_256
+Skein_256_block_loop:
+    movd    xmm4,ptr32 [ebx+bitAdd]
+    mov64   xmm5,ptr64 [edi+TWEAK+0]
+    mov64   xmm6,ptr64 [edi+TWEAK+8]
+    paddq   xmm5,xmm4               ;bump T0 by the bitAdd parameter
+    mov64   ptr64 [edi+TWEAK],xmm5  ;save updated tweak value T0 (for next time)
+    movapd  xmm7,xmm6
+    xorpd   xmm7,xmm5               ;compute overall tweak parity
+    movdqa  [FP_+ksTwk   ],xmm5     ;save the expanded tweak schedule on the stack
+    movdqa  [FP_+ksTwk+16],xmm6
+    movdqa  [FP_+ksTwk+32],xmm7
+
+    mov     esi,[ebx+blkPtr]        ;esi --> input block
+    mov     eax,KW_PARITY_LO        ;init key schedule parity accumulator
+    mov     edx,KW_PARITY_HI
+    movd    xmm4,eax
+    movd    xmm0,edx
+    unpcklps xmm4,xmm0              ;pack two 32-bit words into xmm4
+;
+  irp _NN_,<0,1,2,3>                ;copy in the chaining vars
+    mov64   xmm&_NN_,ptr64 [edi+X_VARS+8*_NN_]
+    xorpd   xmm4,xmm&_NN_           ;update overall parity
+    movdqa  [FP_+ksKey+16*_NN_],xmm&_NN_
+  endm
+    movdqa  [FP_+ksKey+16*WCNT],xmm4;save overall parity at the end of the array
+;
+    paddq   xmm1,xmm5               ;inject the initial tweak words
+    paddq   xmm2,xmm6
+;
+  irp _NN_,<0,1,2,3>                ;perform the initial key injection
+    mov64   xmm4,ptr64 [esi+8*_NN_] ;and save a copy of the input block on stack
+    mov64        ptr64 [esp+8*_NN_+Wcopy],xmm4
+    paddq   xmm&_NN_,xmm4
+  endm
+;
+if _SKEIN_DEBUG                     ;debug dump of state at this point
+    Skein_Debug_Block 256
+    Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL,saveRegs
+endif
+    add     esi, WCNT*8             ;skip to the next block
+    mov         [ebx+blkPtr   ],esi ;save the updated block pointer
+    ;
+    ; now the key schedule is computed. Start the rounds
+    ;
+    xor     edx,edx                 ;edx = iteration count
+if SKEIN_ASM_UNROLL and 256
+_UNROLL_CNT =   ROUNDS_256/8        ;fully unrolled
+else
+_UNROLL_CNT =   SKEIN_UNROLL_256    ;partial unroll count
+  if ((ROUNDS_256/8) mod _UNROLL_CNT)
+    .err "Invalid SKEIN_UNROLL_256" ;sanity check
+  endif
+    mov     esi,ebp                 ;use this as "rolling" pointer into ksTwk/ksKey
+Skein_256_round_loop:               ;   (since there's no 16* scaled address mode)
+endif
+;
+_Rbase_ = 0
+rept _UNROLL_CNT*2                  ; here with X[0..3] in XMM0..XMM3
+      R_256_FourRounds _Rbase_
+_Rbase_ = _Rbase_+4
+endm ;rept _UNROLL_CNT*2
+;
+  if _UNROLL_CNT ne (ROUNDS_256/8)
+    cmp     edx,2*(ROUNDS_256/8)
+    jb      Skein_256_round_loop
+  endif
+    ;----------------------------
+    ; feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..3}
+    irp _NN_,<0,1,2,3>
+        mov64   xmm4,ptr64 [esp+Wcopy+8*_NN_]
+        xorpd   xmm&_NN_,xmm4
+        mov64        ptr64 [edi+X_VARS+8*_NN_],xmm&_NN_
+    endm
+    and     ptr08 [edi +TWEAK +15],FIRST_MASK8
+if _SKEIN_DEBUG
+    Skein_Debug_Round 256,SKEIN_RND_FEED_FWD,saveRegs
+endif
+    ; go back for more blocks, if needed
+    dec     ecx
+    jnz     Skein_256_block_loop
+    
+    Reset_Stack _Skein_256_Process_Block
+    ret
+;
+_Skein_256_Process_Block endp
+;
+ifdef _SKEIN_CODE_SIZE
+    public  _Skein_256_Process_Block_CodeSize
+_Skein_256_Process_Block_CodeSize proc
+    mov     eax,_Skein_256_Process_Block_CodeSize - _Skein_256_Process_Block
+    ret
+_Skein_256_Process_Block_CodeSize endp
+;
+    public  _Skein_256_Unroll_Cnt
+_Skein_256_Unroll_Cnt proc
+  if _UNROLL_CNT ne ROUNDS_256/8
+    mov     eax,_UNROLL_CNT
+  else
+    xor     eax,eax
+  endif
+    ret
+_Skein_256_Unroll_Cnt endp
+endif
+endif ;_USE_ASM_ and 256
+;
+;----------------------------------------------------------------
+;
+if _USE_ASM_ and 512
+    public      _Skein_512_Process_Block
+;
+; void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+;;;;;;;;;;;;;;;;;
+; MACRO: one round
+;
+R_512_Round macro _RR_, a0,a1,Ra, b0,b1,Rb, c0,c1,Rc, d0,d1,Rd
+irp _nr_,<%((_RR_) and 7)>             ;pick the rotation constant row for this round
+_Ra_ = RC_512_&_nr_&_&Ra
+_Rb_ = RC_512_&_nr_&_&Rb
+_Rc_ = RC_512_&_nr_&_&Rc
+_Rd_ = RC_512_&_nr_&_&Rd
+endm
+    paddq   xmm&a0,xmm&a1           ;pair a: a0 += a1
+                            _stX c0 ;spill c0 so its register can serve as scratch
+    mov64   xmm&c0,xmm&a1
+    psllq   xmm&a1,   _Ra_
+    psrlq   xmm&c0,64-_Ra_
+    xorpd   xmm&a1,xmm&c0           ;a1 = rotate-left(a1,_Ra_) ...
+    xorpd   xmm&a1,xmm&a0           ;... xor a0
+
+    paddq   xmm&b0,xmm&b1           ;pair b: b0 += b1
+                            _stX a0 ;spill a0: it is the scratch register from here on
+    mov64   xmm&a0,xmm&b1
+    psllq   xmm&b1,   _Rb_
+    psrlq   xmm&a0,64-_Rb_
+    xorpd   xmm&b1,xmm&b0
+                            _ldX c0 ;bring c0 back before pair c uses it
+    xorpd   xmm&b1,xmm&a0
+                             
+    paddq   xmm&c0,xmm&c1           ;pair c: c0 += c1
+    mov64   xmm&a0,xmm&c1
+    psllq   xmm&c1,   _Rc_
+    psrlq   xmm&a0,64-_Rc_
+    xorpd   xmm&c1,xmm&c0
+    xorpd   xmm&c1,xmm&a0
+                             
+    paddq   xmm&d0,xmm&d1           ;pair d: d0 += d1
+    mov64   xmm&a0,xmm&d1
+    psllq   xmm&d1,   _Rd_
+    psrlq   xmm&a0,64-_Rd_
+    xorpd   xmm&d1,xmm&a0
+                            _ldX a0 ;scratch use is over: restore a0
+    xorpd   xmm&d1,xmm&d0
+  if _SKEIN_DEBUG
+    Skein_Debug_Round 512,%(_RR_+1),saveRegs
+  endif
+endm ;R_512_Round
+;
+; MACRO: four rounds
+R_512_FourRounds macro _RN_
+    R_512_Round (_RN_)  , 0,1,0, 2,3,1, 4,5,2, 6,7,3
+    R_512_Round (_RN_)+1, 2,1,0, 4,7,1, 6,5,2, 0,3,3
+    R_512_Round (_RN_)+2, 4,1,0, 6,3,1, 0,5,2, 2,7,3
+    R_512_Round (_RN_)+3, 6,1,0, 0,7,1, 2,5,2, 4,3,3
+
+    ;inject key schedule
+  irp _NN_,<0,1,2,3,4,5,6,7>
+   if _UNROLL_CNT eq (ROUNDS_512/8)  ;fully unrolled: key slot is an assembly-time constant
+    paddq xmm&_NN_,[FP_+ksKey+16*((((_RN_)/4)+(_NN_)+1) mod 9)]
+    else                             ;looping: fixed offsets from the rolling pointer
+    paddq xmm&_NN_,[SI_+ksKey+16*((_NN_)+1)]
+    endif
+  endm
+    _stX  0                       ;free up a register
+    inc   edx                     ;bump round counter
+    movd  xmm0,edx                ;inject the tweak
+  if _UNROLL_CNT eq (ROUNDS_512/8) ;callers pass a multiple of 4, so N mod 3 equals (N/4) mod 3 here
+    paddq xmm5,[FP_+ksTwk+16*(((_RN_)+1) mod 3)]
+    paddq xmm6,[FP_+ksTwk+16*(((_RN_)+2) mod 3)]
+    paddq xmm7,xmm0
+  else ;looping version
+    paddq xmm5,[SI_+ksTwk+16*1]
+    paddq xmm6,[SI_+ksTwk+16*2]
+    paddq xmm7,xmm0
+;
+    mov64 xmm0,<ptr64 [SI_+ksKey]>;first, "rotate" key schedule on the stack
+    mov64      <ptr64 [SI_+ksKey+16*(WCNT+1)]>,xmm0
+    mov64 xmm0,<ptr64 [SI_+ksTwk]>;    (for next time through)
+    mov64      <ptr64 [SI_+ksTwk+16*3]>,xmm0
+    add   esi,16                  ;bump rolling pointer
+  endif
+    _ldX  0                       ;restore X0
+  if _SKEIN_DEBUG
+      Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT,saveRegs
+  endif
+endm ;R_512_FourRounds
+;;;;;;;;;;;;;;;;;
+if _SKEIN_DEBUG ; macros for saving/restoring X_stk for debug routines
+Put_XMM_512 equ <call _Put_XMM_512>
+Get_XMM_512 equ <call _Get_XMM_512>
+
+_Put_XMM_512:
+  irp _NN_,<0,1,2,3,4,5,6,7>
+    mov64          ptr64 [esp+X_stk+4+_NN_*8],xmm&_NN_
+  endm
+    ret
+;
+_Get_XMM_512:
+  irp _NN_,<0,1,2,3,4,5,6,7>
+    mov64  xmm&_NN_,ptr64 [esp+X_stk+4+_NN_*8]
+  endm
+    ret
+endif
+;
+;;;;;;;;;;;;;;;;;
+; code
+;
+_Skein_512_Process_Block proc near
+    WCNT    =   8                   ;WCNT=8 for Skein-512
+    Setup_Stack WCNT,ROUNDS_512
+    ; main hash loop for Skein_512
+Skein_512_block_loop:
+    movd    xmm0,ptr32 [ebx+bitAdd]
+    mov64   xmm1,ptr64 [edi+TWEAK+0]
+    mov64   xmm2,ptr64 [edi+TWEAK+8]
+    paddq   xmm1,xmm0               ;bump T0 by the bitAdd parameter
+    mov64   ptr64 [edi+TWEAK],xmm1  ;save updated tweak value T0 (for next time)
+    mov64   xmm0,xmm2
+    xorpd   xmm0,xmm1               ;compute overall tweak parity
+    movdqa  [FP_+ksTwk     ],xmm1   ;save the expanded tweak schedule on the stack
+    movdqa  [FP_+ksTwk+16*1],xmm2
+    movdqa  [FP_+ksTwk+16*2],xmm0
+
+    mov     esi,[ebx+blkPtr]        ;esi --> input block
+    mov     eax,KW_PARITY_LO        ;init key schedule parity accumulator
+    mov     edx,KW_PARITY_HI
+    movd    xmm0,eax
+    movd    xmm7,edx
+    unpcklps xmm0,xmm7              ;pack two 32-bit words into xmm0
+;
+  irp _NN_,<7,6,5,4,3,2,1>          ;copy in the chaining vars (skip #0 for now)
+    mov64   xmm&_NN_,ptr64 [edi+X_VARS+8*_NN_]
+    xorpd   xmm0,xmm&_NN_           ;update overall parity
+    movdqa  [FP_+ksKey+16*_NN_],xmm&_NN_
+   if _NN_ eq 5
+    paddq   xmm5,xmm1               ;inject the initial tweak words
+    paddq   xmm6,xmm2               ;  (before they get trashed in xmm1/2)
+   endif
+  endm
+    mov64   xmm4,ptr64 [edi+X_VARS] ;handle #0 now
+    xorpd   xmm0,xmm4               ;update overall parity
+    movdqa  [FP_+ksKey+16* 0  ],xmm4;save the key value in slot #0
+    movdqa  [FP_+ksKey+16*WCNT],xmm0;save overall parity at the end of the array
+;
+    mov64   xmm0,xmm4
+  irp _NN_,<7,6,5,  4,3,2,1,0>      ;perform the initial key injection (except #4)
+    mov64   xmm4,ptr64 [esi+ 8*_NN_];and save a copy of the input block on stack
+    mov64        ptr64 [esp+ 8*_NN_+Wcopy],xmm4
+    paddq   xmm&_NN_,xmm4
+  endm
+    mov64   xmm4,ptr64 [esi+ 8*4]   ;get input block word #4
+    mov64        ptr64 [esp+ 8*4+Wcopy],xmm4
+    paddq   xmm4,[FP_+ksKey+16*4]   ;inject the initial key
+;
+if _SKEIN_DEBUG                     ;debug dump of state at this point
+    Skein_Debug_Block 512
+    Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL,saveRegs
+endif
+    add     esi, WCNT*8             ;skip to the next block
+    mov         [ebx+blkPtr],esi    ;save the updated block pointer
+    ;
+    ; now the key schedule is computed. Start the rounds
+    ;
+    xor     edx,edx                 ;edx = round counter
+if SKEIN_ASM_UNROLL and 512
+_UNROLL_CNT =   ROUNDS_512/8
+else
+_UNROLL_CNT =   SKEIN_UNROLL_512
+  if ((ROUNDS_512/8) mod _UNROLL_CNT)
+    .err "Invalid SKEIN_UNROLL_512"
+  endif
+    mov     esi,ebp                 ;use this as "rolling" pointer into ksTwk/ksKey
+Skein_512_round_loop:               ;   (since there's no 16* scaled address mode)
+endif
+_Rbase_ = 0
+rept _UNROLL_CNT*2
+      R_512_FourRounds _Rbase_
+_Rbase_ = _Rbase_+4
+endm ;rept _UNROLL_CNT
+;
+if (SKEIN_ASM_UNROLL and 512) eq 0
+    cmp     edx,2*(ROUNDS_512/8)
+    jb      Skein_512_round_loop
+endif
+    ;----------------------------
+    ; feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..7}
+    and     ptr08 [edi +TWEAK +15],FIRST_MASK8
+irp _NN_,<0,2,4,6>                  ;do the aligned ones first
+    xorpd   xmm&_NN_,[esp+Wcopy+8*_NN_]
+    mov64   ptr64 [edi+X_VARS+8*_NN_],xmm&_NN_
+endm
+irp _NN_,<1,3,5,7>                  ;now we have some register space available
+    mov64   xmm0,ptr64 [esp+Wcopy+8*_NN_]
+    xorpd   xmm&_NN_,xmm0
+    mov64   ptr64 [edi+X_VARS+8*_NN_],xmm&_NN_
+endm
+if _SKEIN_DEBUG
+    Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
+endif
+    ; go back for more blocks, if needed
+    dec     ecx
+    jnz     Skein_512_block_loop
+
+    Reset_Stack _Skein_512_Process_Block
+    ret
+_Skein_512_Process_Block endp
+;
+ifdef _SKEIN_CODE_SIZE
+    public  _Skein_512_Process_Block_CodeSize
+_Skein_512_Process_Block_CodeSize proc
+    mov     eax,_Skein_512_Process_Block_CodeSize - _Skein_512_Process_Block
+    ret
+_Skein_512_Process_Block_CodeSize endp
+;
+    public  _Skein_512_Unroll_Cnt
+_Skein_512_Unroll_Cnt proc
+  if _UNROLL_CNT ne ROUNDS_512/8
+    mov     eax,_UNROLL_CNT
+  else
+    xor     eax,eax
+  endif
+    ret
+_Skein_512_Unroll_Cnt endp
+endif
+;
+endif ; _USE_ASM_ and 512
+;
+;----------------------------------------------------------------
+;
+if _USE_ASM_ and 1024
+    public      _Skein1024_Process_Block
+;
+; void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+R_1024_REGS equ     (5)     ;keep this many block variables in registers
+;
+;;;;;;;;;;;;;;;;
+if _SKEIN_DEBUG ; macros for saving/restoring X_stk for debug routines
+Put_XMM_1024 equ <call _Put_XMM_1024>
+Get_XMM_1024 equ <call _Get_XMM_1024>
+
+_Put_XMM_1024:
+_NN_ = 0
+  rept R_1024_REGS
+   irp _rr_,<%(_NN_)>
+    mov64           ptr64 [esp+X_stk+4+8*_NN_],xmm&_rr_
+   endm
+_NN_ = _NN_+1
+  endm
+    ret
+;
+_Get_XMM_1024:
+_NN_ = 0
+  rept R_1024_REGS
+   irp _rr_,<%(_NN_)>
+    mov64  xmm&_rr_,ptr64 [esp+X_stk+4+8*_NN_]
+   endm
+_NN_ = _NN_+1
+  endm
+    ret
+endif
+;
+;;;;;;;;;;;;;;;;;
+; MACRO: one mix step
+MixStep_1024 macro  x0,x1,rotIdx0,rotIdx1,_debug_
+_r0_ =  x0      ;default, if already loaded
+_r1_ =  x1
+  ; load the regs (if necessary)
+  if (x0 ge R_1024_REGS)          ;word x0 is kept on the stack: stage it in xmm5
+_r0_ =       5
+  mov64   xmm5,ptr64 [esp+X_stk+8*(x0)]
+  endif
+  if (x1 ge R_1024_REGS)          ;word x1 is kept on the stack: stage it in xmm6
+_r1_ =       6
+    mov64 xmm6,ptr64 [esp+X_stk+8*(x1)]
+  endif
+  ; do the mix
+  irp _rx_,<%((rotIdx0) and 7)>
+_Rc_ = RC_1024_&_rx_&_&rotIdx1  ;rotation constant
+  endm
+  irp _x0_,<%_r0_>
+  irp _x1_,<%_r1_>
+    paddq   xmm&_x0_,xmm&_x1_     ;x0 += x1
+    mov64   xmm7    ,xmm&_x1_     ;xmm7 = scratch copy of x1 for the rotate
+    psllq   xmm&_x1_,   _Rc_
+    psrlq   xmm7    ,64-_Rc_
+    xorpd   xmm&_x1_,xmm&_x0_     ;x1 = rotate-left(x1,_Rc_) xor x0
+    xorpd   xmm&_x1_,xmm7
+  endm
+  endm
+  ; save the regs (if necessary)
+  if (x0 ge R_1024_REGS)
+    mov64   ptr64 [esp+X_stk+8*(x0)],xmm5
+  endif
+  if (x1 ge R_1024_REGS)
+    mov64   ptr64 [esp+X_stk+8*(x1)],xmm6
+  endif
+  ; debug output (parameter name spelled exactly as declared, so it also works case-sensitively)
+  if _SKEIN_DEBUG and (0 ne (_debug_ + 0))
+    Skein_Debug_Round 1024,%((rotIdx0)+1),saveRegs
+  endif
+endm ;MixStep_1024
+;;;;;;;;;;;;;;;;;
+; MACRO: four rounds
+;
+R_1024_FourRounds macro _RR_
+    ;--------- round _RR_
+    MixStep_1024     0, 1,%((_RR_)+0),0
+    MixStep_1024     2, 3,%((_RR_)+0),1
+    MixStep_1024     4, 5,%((_RR_)+0),2
+    MixStep_1024     6, 7,%((_RR_)+0),3
+    MixStep_1024     8, 9,%((_RR_)+0),4
+    MixStep_1024    10,11,%((_RR_)+0),5
+    MixStep_1024    12,13,%((_RR_)+0),6
+    MixStep_1024    14,15,%((_RR_)+0),7,1
+    ;--------- round _RR_+1
+    MixStep_1024     0, 9,%((_RR_)+1),0
+    MixStep_1024     2,13,%((_RR_)+1),1
+    MixStep_1024     6,11,%((_RR_)+1),2
+    MixStep_1024     4,15,%((_RR_)+1),3
+    MixStep_1024    10, 7,%((_RR_)+1),4
+    MixStep_1024    12, 3,%((_RR_)+1),5
+    MixStep_1024    14, 5,%((_RR_)+1),6
+    MixStep_1024     8, 1,%((_RR_)+1),7,1
+    ;--------- round _RR_+2
+    MixStep_1024     0, 7,%((_RR_)+2),0    
+    MixStep_1024     2, 5,%((_RR_)+2),1
+    MixStep_1024     4, 3,%((_RR_)+2),2    
+    MixStep_1024     6, 1,%((_RR_)+2),3    
+    MixStep_1024    12,15,%((_RR_)+2),4
+    MixStep_1024    14,13,%((_RR_)+2),5    
+    MixStep_1024     8,11,%((_RR_)+2),6    
+    MixStep_1024    10, 9,%((_RR_)+2),7,1
+    ;--------- round _RR_+3
+    MixStep_1024     0,15,%((_RR_)+3),0
+    MixStep_1024     2,11,%((_RR_)+3),1
+    MixStep_1024     6,13,%((_RR_)+3),2
+    MixStep_1024     4, 9,%((_RR_)+3),3
+    MixStep_1024    14, 1,%((_RR_)+3),4
+    MixStep_1024     8, 5,%((_RR_)+3),5
+    MixStep_1024    10, 3,%((_RR_)+3),6
+    MixStep_1024    12, 7,%((_RR_)+3),7,1
+
+    inc   edx                     ;edx = round number
+    movd  xmm7,edx
+    ;inject the key
+irp _NN_,<15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0>
+  if _UNROLL_CNT ne (ROUNDS_1024/8)
+    if _NN_ lt R_1024_REGS
+      paddq xmm&_NN_,ptr64 [SI_+ksKey+16*_NN_+16]
+    else
+      mov64 xmm6    ,ptr64 [esp+X_stk+ 8*_NN_]
+     if     _NN_ eq 15
+      paddq xmm6,xmm7
+     elseif _NN_ eq 14
+      paddq xmm6,ptr64 [SI_+ksTwk+16*2]
+     elseif _NN_ eq 13
+      paddq xmm6,ptr64 [SI_+ksTwk+16*1]
+     endif
+      paddq xmm6    ,ptr64 [SI_+ksKey+16*_NN_+16]
+      mov64          ptr64 [esp+X_stk+ 8*_NN_],xmm6
+    endif
+  else
+    if _NN_ lt R_1024_REGS
+      paddq xmm&_NN_,ptr64 [FP_+ksKey+16*(((_Rbase_/4)+(_NN_)+1) mod 17)]
+    else
+      mov64 xmm6,ptr64 [esp+X_stk+ 8*_NN_]
+      paddq xmm6,ptr64 [FP_+ksKey+16*(((_Rbase_/4)+(_NN_)+1) mod 17)]
+     if     _NN_ eq 15
+      paddq xmm6,xmm7
+     elseif _NN_ eq 14
+      paddq xmm6,ptr64 [FP_+ksTwk+16*(((_Rbase_/4)+2) mod  3)]
+     elseif _NN_ eq 13
+      paddq xmm6,ptr64 [FP_+ksTwk+16*(((_Rbase_/4)+1) mod  3)]
+     endif
+      mov64      ptr64 [esp+X_stk+ 8*_NN_],xmm6
+    endif
+  endif
+endm
+if _UNROLL_CNT ne (ROUNDS_1024/8) ;rotate the key schedule on the stack
+    mov64 xmm6,ptr64 [SI_+ksKey]
+    mov64 xmm7,ptr64 [SI_+ksTwk]
+    mov64      ptr64 [SI_+ksKey+16*(WCNT+1)],xmm6
+    mov64      ptr64 [SI_+ksTwk+16* 3      ],xmm7
+    add   esi,16                  ;bump rolling pointer
+endif
+if _SKEIN_DEBUG
+      Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT ,saveRegs
+endif
+endm ;R_1024_FourRounds
+;;;;;;;;;;;;;;;;
+; code
+;
+_Skein1024_Process_Block proc near
+;
+    WCNT    =   16                  ;WCNT=16 for Skein-1024
+    Setup_Stack WCNT,ROUNDS_1024
+    add     edi,80h                 ;bias the edi ctxt offsets to keep them all short
+ctx equ    <edi-80h>                ;offset alias
+    ; main hash loop for Skein1024
+Skein1024_block_loop:
+    movd    xmm0,ptr32 [ebx+bitAdd]
+    mov64   xmm1,ptr64 [ctx+TWEAK+0]
+    mov64   xmm2,ptr64 [ctx+TWEAK+8]
+    paddq   xmm1,xmm0               ;bump T0 by the bitAdd parameter
+    mov64   ptr64 [ctx+TWEAK],xmm1  ;save updated tweak value T0 (for next time)
+    mov64   xmm0,xmm2
+    xorpd   xmm0,xmm1               ;compute overall tweak parity
+    movdqa  [FP_+ksTwk   ],xmm1     ;save the expanded tweak schedule on the stack
+    movdqa  [FP_+ksTwk+16],xmm2
+    movdqa  [FP_+ksTwk+32],xmm0
+
+    mov     esi,[ebx+blkPtr]        ;esi --> input block
+    mov     eax,KW_PARITY_LO        ;init key schedule parity accumulator
+    mov     edx,KW_PARITY_HI
+    movd    xmm7,eax
+    movd    xmm6,edx
+    unpcklps xmm7,xmm6              ;pack two 32-bit words into xmm7
+;
+    lea     eax,[esp+80h]           ;use short offsets for Wcopy, X_stk writes below
+SP_ equ    <eax-80h>                ;[eax+OFFS] mode is one byte shorter than [esp+OFFS]
+irp _NN_,<15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0>
+    mov64   xmm6,ptr64 [ctx+X_VARS+8*_NN_]
+    xorpd   xmm7,xmm6               ;update overall parity
+    movdqa  [FP_+ksKey+16*_NN_],xmm6;save the key schedule on the stack
+  if _NN_ lt R_1024_REGS
+     _rr_  =  _NN_
+  else
+     _rr_  =   R_1024_REGS
+  endif
+  irp _rn_,<%(_rr_)>
+    mov64   xmm&_rn_,ptr64 [esi+         8*_NN_];save copy of the input block on stack
+    mov64            ptr64 [SP_+ Wcopy + 8*_NN_],xmm&_rn_   ;(for feedforward later)
+    paddq   xmm&_rn_,xmm6               ;inject the key into the block
+   if _NN_ eq 13
+    paddq   xmm&_rn_,xmm1               ;inject the initial tweak words
+   elseif _NN_ eq 14
+    paddq   xmm&_rn_,xmm2
+   endif
+   if _NN_ ge R_1024_REGS               ;only save X[5..15] on stack, leave X[0..4] in regs
+    mov64   ptr64 [SP_+X_stk+8*_NN_],xmm&_rn_ 
+   endif
+  endm
+endm
+    movdqa  [FP_+ksKey+16*WCNT],xmm7;save overall key parity at the end of the array
+;
+if _SKEIN_DEBUG                     ;debug dump of state at this point
+    Skein_Debug_Block 1024
+    Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL,saveRegs
+endif
+    add     esi, WCNT*8             ;skip to the next block
+    mov         [ebx+blkPtr],esi    ;save the updated block pointer
+    ;
+    ; now the key schedule is computed. Start the rounds
+    ;
+    xor     edx,edx                 ;edx = round counter
+if SKEIN_ASM_UNROLL and 1024
+_UNROLL_CNT =   ROUNDS_1024/8
+else
+_UNROLL_CNT =   SKEIN_UNROLL_1024
+  if ((ROUNDS_1024/8) mod _UNROLL_CNT)
+    .err "Invalid SKEIN_UNROLL_1024"
+  endif
+    mov     esi,ebp                 ;use this as "rolling" pointer into ksTwk/ksKey
+Skein_1024_round_loop:
+endif
+;
+_Rbase_ = 0
+rept _UNROLL_CNT*2
+    R_1024_FourRounds %_Rbase_
+_Rbase_ = _Rbase_+4
+endm ;rept _UNROLL_CNT*2
+;
+if (SKEIN_ASM_UNROLL and 1024) eq 0
+    cmp     edx,2*(ROUNDS_1024/8)
+    jb      Skein_1024_round_loop
+endif
+    and     ptr08 [ctx +TWEAK +15],FIRST_MASK8      ;clear tweak bit for next time thru
+    ;----------------------------
+    ; feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..15}
+    lea     eax,[esp+80h]                           ;allow short offsets to X_stk and Wcopy
+irp _NN_,<0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15>
+  if _NN_ lt R_1024_REGS
+    if _NN_ and 1                                   ;already in regs: no load needed
+      mov64 xmm7  ,ptr64 [SP_+ Wcopy + 8*_NN_]      ;unaligned
+      xorpd xmm&_NN_,xmm7
+    else
+      xorpd xmm&_NN_,    [SP_+ Wcopy + 8*_NN_]      ;aligned
+    endif
+      mov64        ptr64 [ctx+ X_VARS+ 8*_NN_],xmm&_NN_   ;X_VARS spelled as everywhere else in the file
+  else
+      mov64   xmm7,ptr64 [SP_+ X_stk + 8*_NN_]      ;load X value from stack
+    if _NN_ and 1
+      mov64   xmm6,ptr64 [SP_+ Wcopy + 8*_NN_]      ;unaligned
+      xorpd   xmm7,xmm6
+    else
+      xorpd   xmm7,      [SP_+ Wcopy + 8*_NN_]      ;aligned
+    endif
+      mov64        ptr64 [ctx+ X_VARS+ 8*_NN_],xmm7
+ endif
+endm
+if _SKEIN_DEBUG
+    Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD   ;no need to save regs on stack here
+endif
+    ; go back for more blocks, if needed
+    dec     ecx
+    jnz     Skein1024_block_loop
+
+    Reset_Stack _Skein1024_Process_Block
+    ret
+_Skein1024_Process_Block endp
+;
+ifdef _SKEIN_CODE_SIZE
+    public  _Skein1024_Process_Block_CodeSize
+_Skein1024_Process_Block_CodeSize proc
+    mov     eax,_Skein1024_Process_Block_CodeSize - _Skein1024_Process_Block
+    ret
+_Skein1024_Process_Block_CodeSize endp
+;
+    public  _Skein1024_Unroll_Cnt
+_Skein1024_Unroll_Cnt proc
+  if _UNROLL_CNT ne ROUNDS_1024/8
+    mov     eax,_UNROLL_CNT
+  else
+    xor     eax,eax
+  endif
+    ret
+_Skein1024_Unroll_Cnt endp
+endif
+;
+endif ; _USE_ASM_ and 1024
+;----------------------------------------------------------------
+    end
diff --git a/Additional_Implementations/skein_block_xmm32.s b/Additional_Implementations/skein_block_xmm32.s
new file mode 100644
index 0000000000000..fa10bd2b98c1f
--- /dev/null
+++ b/Additional_Implementations/skein_block_xmm32.s
@@ -0,0 +1,1110 @@
+#
+#----------------------------------------------------------------
+# 32-bit x86 assembler code for Skein block functions using XMM registers
+#
+# Author: Doug Whiting, Hifn/Exar
+#
+# This code is released to the public domain.
+#----------------------------------------------------------------
+#
+    .text
+    .altmacro                               #use advanced macro features
+    .psize 0,128                            #list file has no page boundaries
+#
+_MASK_ALL_  =   (256+512+1024)              #all three algorithm bits
+SAVE_REGS   =   1
+#
+#################
+.ifndef SKEIN_USE_ASM
+_USE_ASM_        = _MASK_ALL_
+.elseif SKEIN_USE_ASM & _MASK_ALL_
+_USE_ASM_        = SKEIN_USE_ASM
+.else
+_USE_ASM_        = _MASK_ALL_
+.endif
+#
+#################
+.ifndef SKEIN_LOOP  
+_SKEIN_LOOP       = 002                     #default is all fully unrolled, except Skein1024
+.else
+_SKEIN_LOOP       = SKEIN_LOOP
+.endif
+#--------------
+# the unroll counts (0 --> fully unrolled)
+SKEIN_UNROLL_256  = (_SKEIN_LOOP / 100) % 10
+SKEIN_UNROLL_512  = (_SKEIN_LOOP /  10) % 10
+SKEIN_UNROLL_1024 = (_SKEIN_LOOP      ) % 10
+#
+SKEIN_ASM_UNROLL  = 0
+  .irp _NN_,256,512,1024
+    .if (SKEIN_UNROLL_\_NN_) == 0
+SKEIN_ASM_UNROLL  = SKEIN_ASM_UNROLL + \_NN_
+    .endif
+  .endr
+#
+#################
+#
+.ifndef SKEIN_ROUNDS                        #no override: use the standard round counts
+ROUNDS_256  =   72
+ROUNDS_512  =   72
+ROUNDS_1024 =   80
+.else                                       #one decimal digit of SKEIN_ROUNDS per variant
+ROUNDS_256  = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)
+ROUNDS_512  = 8*((((SKEIN_ROUNDS /  10) + 5) % 10) + 5)
+ROUNDS_1024 = 8*((((SKEIN_ROUNDS      ) + 5) % 10) + 5)
+.irp _NN_,256,512,1024
+  .if _USE_ASM_ & \_NN_                     #bit test: report only the variants being assembled
+    .irp _RR_,%(ROUNDS_\_NN_)
+      .if \_NN_ < 1024
+.print  "+++ SKEIN_ROUNDS_\_NN_  = \_RR_"
+      .else
+.print  "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
+      .endif
+    .endr
+  .endif
+.endr
+.endif
+#################
+#
+.ifdef SKEIN_CODE_SIZE
+_SKEIN_CODE_SIZE = (1)
+.else
+.ifdef  SKEIN_PERF                           #use code size if SKEIN_PERF is defined
+_SKEIN_CODE_SIZE = (1)
+.endif
+.endif
+#
+#################
+#
+.ifndef SKEIN_DEBUG
+_SKEIN_DEBUG      = 0
+.else
+_SKEIN_DEBUG      = 1
+.endif
+#################
+#
+# define offsets of fields in hash context structure
+#
+HASH_BITS   =   0                           ## bits of hash output
+BCNT        =   4 + HASH_BITS               #number of bytes in BUFFER[]
+TWEAK       =   4 + BCNT                    #tweak values[0..1]
+X_VARS      =  16 + TWEAK                   #chaining vars
+#
+#(Note: buffer[] in context structure is NOT needed here :-)
+#
+KW_PARITY_LO=   0xA9FC1A22                  #overall parity of key schedule words (hi32/lo32)
+KW_PARITY_HI=   0x1BD11BDA
+FIRST_MASK8 =   ~ (1 << 6)                  #FIRST block flag bit
+#
+# rotation constants for Skein
+#
+RC_256_0_0  = 14
+RC_256_0_1  = 16
+
+RC_256_1_0  = 52
+RC_256_1_1  = 57
+
+RC_256_2_0  = 23
+RC_256_2_1  = 40
+
+RC_256_3_0  =  5
+RC_256_3_1  = 37
+
+RC_256_4_0  = 25
+RC_256_4_1  = 33
+
+RC_256_5_0  = 46
+RC_256_5_1  = 12
+
+RC_256_6_0  = 58
+RC_256_6_1  = 22
+
+RC_256_7_0  = 32
+RC_256_7_1  = 32
+
+RC_512_0_0  = 46
+RC_512_0_1  = 36
+RC_512_0_2  = 19
+RC_512_0_3  = 37
+
+RC_512_1_0  = 33
+RC_512_1_1  = 27
+RC_512_1_2  = 14
+RC_512_1_3  = 42
+
+RC_512_2_0  = 17
+RC_512_2_1  = 49
+RC_512_2_2  = 36
+RC_512_2_3  = 39
+
+RC_512_3_0  = 44
+RC_512_3_1  =  9
+RC_512_3_2  = 54
+RC_512_3_3  = 56
+
+RC_512_4_0  = 39
+RC_512_4_1  = 30
+RC_512_4_2  = 34
+RC_512_4_3  = 24
+
+RC_512_5_0  = 13
+RC_512_5_1  = 50
+RC_512_5_2  = 10
+RC_512_5_3  = 17
+
+RC_512_6_0  = 25
+RC_512_6_1  = 29
+RC_512_6_2  = 39
+RC_512_6_3  = 43
+
+RC_512_7_0  =  8
+RC_512_7_1  = 35
+RC_512_7_2  = 56
+RC_512_7_3  = 22
+
+RC_1024_0_0 = 24
+RC_1024_0_1 = 13
+RC_1024_0_2 =  8
+RC_1024_0_3 = 47
+RC_1024_0_4 =  8
+RC_1024_0_5 = 17
+RC_1024_0_6 = 22
+RC_1024_0_7 = 37
+
+RC_1024_1_0 = 38
+RC_1024_1_1 = 19
+RC_1024_1_2 = 10
+RC_1024_1_3 = 55
+RC_1024_1_4 = 49
+RC_1024_1_5 = 18
+RC_1024_1_6 = 23
+RC_1024_1_7 = 52
+
+RC_1024_2_0 = 33
+RC_1024_2_1 =  4
+RC_1024_2_2 = 51
+RC_1024_2_3 = 13
+RC_1024_2_4 = 34
+RC_1024_2_5 = 41
+RC_1024_2_6 = 59
+RC_1024_2_7 = 17
+
+RC_1024_3_0 =  5
+RC_1024_3_1 = 20
+RC_1024_3_2 = 48
+RC_1024_3_3 = 41
+RC_1024_3_4 = 47
+RC_1024_3_5 = 28
+RC_1024_3_6 = 16
+RC_1024_3_7 = 25
+
+RC_1024_4_0 = 41
+RC_1024_4_1 =  9
+RC_1024_4_2 = 37
+RC_1024_4_3 = 31
+RC_1024_4_4 = 12
+RC_1024_4_5 = 47
+RC_1024_4_6 = 44
+RC_1024_4_7 = 30
+
+RC_1024_5_0 = 16
+RC_1024_5_1 = 34
+RC_1024_5_2 = 56
+RC_1024_5_3 = 51
+RC_1024_5_4 =  4
+RC_1024_5_5 = 53
+RC_1024_5_6 = 42
+RC_1024_5_7 = 41
+
+RC_1024_6_0 = 31
+RC_1024_6_1 = 44
+RC_1024_6_2 = 47
+RC_1024_6_3 = 46
+RC_1024_6_4 = 19
+RC_1024_6_5 = 42
+RC_1024_6_6 = 44
+RC_1024_6_7 = 25
+
+RC_1024_7_0 =  9
+RC_1024_7_1 = 48
+RC_1024_7_2 = 35
+RC_1024_7_3 = 52
+RC_1024_7_4 = 23
+RC_1024_7_5 = 31
+RC_1024_7_6 = 37
+RC_1024_7_7 = 20
+#
+#----------------------------------------------------------------
+# declare allocated space on the stack
+.macro StackVar  localName,localSize
+\localName  =   _STK_OFFS_                  #the name becomes the current running offset
+_STK_OFFS_  =   _STK_OFFS_+(\localSize)     #advance the running offset past this variable
+.endm #StackVar
+#
+#----------------------------------------------------------------
+#
+# MACRO: Configure stack frame, allocate local vars
+#
+.macro Setup_Stack WCNT,RND_CNT
+_STK_OFFS_  =   0                   #starting offset from esp, forced on 16-byte alignment
+    #----- local  variables         #<-- esp
+    StackVar    X_stk  , 8*(WCNT)   #local context vars
+    StackVar    Wcopy  , 8*(WCNT)   #copy of input block    
+    StackVar    ksTwk  ,16*3        #key schedule: tweak words
+    StackVar    ksKey  ,16*(WCNT)+16#key schedule: key   words
+FRAME_OFFS  =   ksTwk+128           #<-- ebp
+F_O         =   FRAME_OFFS          #syntactic shorthand
+  .if (SKEIN_ASM_UNROLL & (WCNT*64)) == 0   #bit test (not logical &&): looping variants only
+    StackVar    ksRot,16*(RND_CNT/4)#leave space for ks "rotation" to happen
+  .endif
+LOCAL_SIZE  =   _STK_OFFS_          #size of local vars
+    #
+    #"restart" the stack defns, because we relocate esp to guarantee alignment
+    #    (i.e., these vars are NOT at fixed offsets from esp)
+_STK_OFFS_  =   0
+    #----- 
+    StackVar    savRegs,8*4         #pushad data
+    StackVar    retAddr,4           #return address
+    #----- caller parameters
+    StackVar    ctxPtr ,4           #context ptr
+    StackVar    blkPtr ,4           #pointer to block data
+    StackVar    blkCnt ,4           #number of full blocks to process
+    StackVar    bitAdd ,4           #bit count to add to tweak
+    #----- caller's stack frame
+#
+# Notes on stack frame setup:
+#   * the most used variable (except for Skein-256) is X_stk[], based at [esp+0]
+#   * the next most used is the key schedule words
+#       so ebp is "centered" there, allowing short offsets to the key/tweak
+#       schedule in 256/512-bit Skein cases, but not possible for Skein-1024 :-(
+#   * the Wcopy variables are infrequently accessed, and they have long 
+#       offsets from both esp and ebp only in the 1024-bit case.
+#   * all other local vars and calling parameters can be accessed 
+#       with short offsets, except in the 1024-bit case
+#
+    pushal                          #save all regs
+    movl    %esp,%ebx               #keep ebx as pointer to caller parms
+    subl    $LOCAL_SIZE,%esp        #make room for the locals
+    andl    $~15,%esp               #force alignment
+    movl    ctxPtr(%ebx),%edi       #edi --> Skein context
+    leal    FRAME_OFFS(%esp),%ebp   #maximize use of short offsets from ebp
+    movl    blkCnt(%ebx),%ecx       #keep block cnt in ecx
+.endm #Setup_Stack
+#
+#----------------------------------------------------------------
+#
+.macro Reset_Stack,procStart       #procStart is accepted but not used in the body
+    movl     %ebx,%esp              #get rid of locals (wipe??)
+    popal                           #restore all regs
+.endm # Reset_Stack
+#
+#----------------------------------------------------------------
+# macros to help debug internals
+#
+.if _SKEIN_DEBUG
+    .extern   _Skein_Show_Block   #calls to C routines
+    .extern   _Skein_Show_Round
+#
+SKEIN_RND_SPECIAL       =   1000
+SKEIN_RND_KEY_INITIAL   =   SKEIN_RND_SPECIAL+0
+SKEIN_RND_KEY_INJECT    =   SKEIN_RND_SPECIAL+1
+SKEIN_RND_FEED_FWD      =   SKEIN_RND_SPECIAL+2
+#
+.macro Skein_Debug_Block BLK_BITS
+#
+#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
+#                     const u08b_t *blkPtr, const u64b_t *wPtr, 
+#                     const u64b_t *ksPtr,const u64b_t *tsPtr);
+#
+    call    _Put_XMM_\BLK_BITS
+    pushal                          #save all regs
+    leal    ksTwk+1-F_O(%ebp),%eax  #+1 = flag: "stride" size = 2 qwords
+    leal    ksKey+1-F_O(%ebp),%esi
+    leal    Wcopy+32(%esp),%ecx     #adjust offset by 32 for pushal
+    movl    ctxPtr(%ebx)  ,%edx     #ctx_hdr_ptr
+    leal    X_VARS(%edx)  ,%edx     #edx ==> ctx->X[]
+    pushl   %eax                    #tsPtr
+    pushl   %esi                    #ksPtr
+    pushl   %ecx                    #wPtr
+    pushl   blkPtr(%ebx)            #blkPtr
+    pushl   %edx                    #ctx->Xptr
+    pushl   ctxPtr(%ebx)            #ctx_hdr_ptr
+    movl    $\BLK_BITS,%eax
+    pushl   %eax                    #bits
+    call    _Skein_Show_Block
+    addl    $7*4,%esp               #discard parameter space on stack
+    popal                           #restore regs
+#
+    call    _Get_XMM_\BLK_BITS
+.endm #Skein_Debug_Block
+
+#
+# Skein_Debug_Round: report the state after a round through the external C
+#   routine _Skein_Show_Round.
+#   BLK_BITS  pushed as the bits argument and pasted into the helper names
+#   R         round number, or one of the SKEIN_RND_ special codes
+#   saveRegs  nonzero: spill the XMM block words to X_stk before the call
+#             and reload them afterwards (defaults to 0)
+#   The X pointer handed to C is X_stk on the stack, except for
+#   SKEIN_RND_FEED_FWD where it is the X array inside the context.
+#   NOTE(review): the doubled ampersand in the two round-number expressions
+#   below reads as if a bitwise AND was intended - confirm how the target
+#   assembler expands and evaluates it inside a macro body.
+.macro Skein_Debug_Round BLK_BITS,R,saveRegs=0
+#
+#void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)#
+#
+  .if \saveRegs
+    call    _Put_XMM_\BLK_BITS
+  .endif
+    pushal                          #save all regs
+  .if R <> SKEIN_RND_FEED_FWD
+    leal    32+X_stk(%esp),%eax     #adjust offset by 32 for pushal
+  .else
+    movl    ctxPtr(%ebx),%eax
+    addl    $X_VARS,%eax
+  .endif
+    pushl   %eax                    #Xptr
+  .if (SKEIN_ASM_UNROLL && \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
+    movl    $\R,%eax
+  .else     #compute round number from edx, R
+    leal    1+(((\R)-1) && 3)(,%edx,4),%eax
+  .endif
+    pushl   %eax                    #round number
+    pushl   ctxPtr(%ebx)            #ctx_hdr_ptr
+    movl    $\BLK_BITS,%eax
+    pushl   %eax                    #bits
+    call    _Skein_Show_Round
+    addl    $4*4,%esp               #discard parameter space on stack
+    popal                           #restore regs
+  .if \saveRegs
+    call  _Get_XMM_\BLK_BITS        #reload the block words spilled above
+  .endif
+.endm  #Skein_Debug_Round
+.endif #ifdef SKEIN_DEBUG
+#
+#----------------------------------------------------------------
+# useful macros
+# _ldX xn: load the 64-bit X_stk slot number xn (relative to esp) into the
+#   XMM register with the same number.
+.macro _ldX xn
+    movq          X_stk+8*(\xn)(%esp),%xmm\xn
+.endm
+
+# _stX xn: store the low 64 bits of XMM register xn into the X_stk slot with
+#   the same number (relative to esp).  Counterpart of _ldX.
+.macro _stX xn
+    movq  %xmm\xn,X_stk+8*(\xn)(%esp)
+.endm
+#
+#----------------------------------------------------------------
+#
+# C_label lName: define an entry point under two spellings at the current
+#   location, the plain name and the name with a leading underscore, and
+#   export both with .global.
+.macro C_label lName
+ \lName:        #use both "genders" to work across linkage conventions
+_\lName:
+    .global  \lName
+    .global _\lName
+.endm
+#
+
+.if _USE_ASM_ & 256
+#
+# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
+#
+#################
+#
+# Skein-256 round macros
+#
+# R_256_OneRound: one Skein-256 round, made of two add/rotate/xor steps.
+#   _RR_         round number, used to pick the rotation constants
+#   x0,x1,x2,x3  XMM register numbers holding the four block words
+#   t0,t1        XMM register numbers used as scratch
+#   When x0 is register 0 the first pair uses constant _0 and the second
+#   pair constant _1 of the selected set, otherwise the two are swapped.
+#   The 64-bit rotate is built from a left shift, a right shift by the
+#   complementary count, and xor.
+#   NOTE(review): the doubled ampersand in the constant selection below
+#   depends on altmacro ampersand handling - confirm with the target
+#   assembler version.
+.macro R_256_OneRound _RR_,x0,x1,x2,x3,t0,t1
+  .irp _qq_,%((\_RR_) && 7)        #figure out which rotation constants to use
+    .if \x0 == 0
+_RC0_ =   RC_256_\_qq_&&_0
+_RC1_ =   RC_256_\_qq_&&_1
+    .else
+_RC0_ =   RC_256_\_qq_&&_1
+_RC1_ =   RC_256_\_qq_&&_0
+    .endif
+  .endr
+# first pair: add, then rotate left by _RC0_ and xor with the sum
+    paddq    %xmm\x1,%xmm\x0
+    movq     %xmm\x1,%xmm\t0
+    psllq  $   _RC0_,%xmm\x1
+    psrlq  $64-_RC0_,%xmm\t0
+    xorpd    %xmm\x0,%xmm\x1
+    xorpd    %xmm\t0,%xmm\x1
+# second pair: same pattern with _RC1_
+    paddq    %xmm\x3,%xmm\x2
+    movq     %xmm\x3,%xmm\t1
+    psllq  $   _RC1_,%xmm\x3
+    psrlq  $64-_RC1_,%xmm\t1
+    xorpd    %xmm\x2,%xmm\x3
+    xorpd    %xmm\t1,%xmm\x3
+  .if _SKEIN_DEBUG
+    Skein_Debug_Round 256,%(\_RR_+1),SAVE_REGS
+  .endif
+.endm #R_256_OneRound
+#
+# R_256_FourRounds: four Skein-256 rounds followed by one key injection.
+#   _RN_ is the number of the first of the four rounds.
+#   Rounds alternate between register order 0,1,2,3 and 2,1,0,3, with
+#   xmm4/xmm5 as scratch.  After the rounds edx is incremented and added
+#   into xmm3 together with the key and tweak schedule words.
+#   Fully unrolled build: schedule slots are chosen at assembly time from
+#   _RN_ and addressed off ebp.
+#   Looping build: slots are addressed off the rolling pointer esi, the
+#   first key and tweak entries are copied past the end of their arrays,
+#   and esi advances by one 16-byte slot.
+.macro R_256_FourRounds _RN_
+    R_256_OneRound %(_RN_+0),0,1,2,3,4,5
+    R_256_OneRound (_RN_+1),2,1,0,3,4,5
+
+    R_256_OneRound (_RN_+2),0,1,2,3,4,5
+    R_256_OneRound (_RN_+3),2,1,0,3,4,5
+
+    #inject key schedule
+    incl  %edx                     #bump round number
+    movd  %edx,%xmm4
+  .if _UNROLL_CNT == (ROUNDS_256/8)
+    #fully unrolled version
+_RK_ = ((_RN_)/4)                 #key injection counter
+    paddq ksKey+16*((_RK_+1) % 5)-F_O(%ebp),%xmm0
+    paddq ksKey+16*((_RK_+2) % 5)-F_O(%ebp),%xmm1
+    paddq ksKey+16*((_RK_+3) % 5)-F_O(%ebp),%xmm2
+    paddq ksKey+16*((_RK_+4) % 5)-F_O(%ebp),%xmm3
+    paddq ksTwk+16*((_RK_+1) % 3)-F_O(%ebp),%xmm1
+    paddq ksTwk+16*((_RK_+2) % 3)-F_O(%ebp),%xmm2
+    paddq %xmm4,%xmm3
+  .else #looping version
+    paddq ksKey+16*1-F_O(%esi),%xmm0
+    paddq ksKey+16*2-F_O(%esi),%xmm1
+    paddq ksKey+16*3-F_O(%esi),%xmm2
+    paddq ksKey+16*4-F_O(%esi),%xmm3
+    paddq ksTwk+16*1-F_O(%esi),%xmm1
+    paddq ksTwk+16*2-F_O(%esi),%xmm2
+    paddq %xmm4,%xmm3
+#   
+    movq        ksKey-F_O(%esi),%xmm4   #first, "rotate" key schedule on the stack
+    movq        ksTwk-F_O(%esi),%xmm5   #    (for next time through)
+    movq  %xmm4,ksKey+16*(WCNT+1)-F_O(%esi)
+    movq  %xmm5,ksTwk+16*3-F_O(%esi)
+    addl  $16,%esi                     #bump rolling pointer
+  .endif
+  .if _SKEIN_DEBUG
+      Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT,SAVE_REGS
+  .endif
+.endm #R_256_FourRounds
+#
+.if _SKEIN_DEBUG # subroutines for saving/restoring X_stk for debug routines
+# _Put_XMM_256: store xmm0..xmm3 into X_stk.  Entered with call, so the
+#   extra 4 in the offset skips the return address on the stack.
+_Put_XMM_256:
+  .irp _NN_,0,1,2,3
+    movq  %xmm\_NN_,X_stk+4+\_NN_*8(%esp)
+  .endr
+    ret
+#
+# _Get_XMM_256: reload xmm0..xmm3 from X_stk (inverse of _Put_XMM_256).
+#   NOTE(review): the slot index here is written without the backslash
+#   used in _Put_XMM_256 - this relies on the assembler substituting bare
+#   loop variable names, confirm for the target assembler.
+_Get_XMM_256:
+  .irp _NN_,0,1,2,3
+    movq            X_stk+4+_NN_*8(%esp),%xmm\_NN_
+  .endr
+    ret
+.endif
+#
+#################
+#
+# code
+#
+# Skein_256_Process_Block: process blkCnt consecutive input blocks of
+#   WCNT*8 = 32 bytes each, updating the chaining words and the tweak in
+#   the context.  C prototype: (ctx, blkPtr, blkCnt, bitcntAdd).
+# Register roles, as set up by Setup_Stack and the code below:
+#   ebx  pointer to the caller parameters     edi  Skein context
+#   ebp  base for the ksKey/ksTwk offsets     ecx  blocks remaining
+#   esi  input block pointer, then (looping build) rolling schedule pointer
+#   edx  key injection counter                xmm0-3  block words
+# Per block: add bitAdd to T0, build the tweak and key schedules on the
+#   stack, add the input words to the state, run the rounds, then xor the
+#   saved input words back in and store the result to the context.
+C_label Skein_256_Process_Block
+    WCNT    =   4                   #WCNT=4 for Skein-256
+    Setup_Stack WCNT,ROUNDS_256
+    # main hash loop for Skein_256
+Skein_256_block_loop:
+    movd    bitAdd (%ebx),%xmm4
+    movq    TWEAK+0(%edi),%xmm5
+    movq    TWEAK+8(%edi),%xmm6
+    paddq   %xmm4        ,%xmm5     #bump T0 by the bitAdd parameter
+    movq    %xmm5,TWEAK(%edi)       #save updated tweak value T0 (for next time)
+    movapd  %xmm6,%xmm7
+    xorpd   %xmm5,%xmm7             #compute overall tweak parity
+    movdqa  %xmm5,ksTwk   -F_O(%ebp)#save the expanded tweak schedule on the stack
+    movdqa  %xmm6,ksTwk+16-F_O(%ebp)        
+    movdqa  %xmm7,ksTwk+32-F_O(%ebp)        
+
+    movl    blkPtr(%ebx),%esi       #esi --> input block
+    movl    $KW_PARITY_LO,%eax      #init key schedule parity accumulator
+    movl    $KW_PARITY_HI,%edx 
+    movd    %eax ,%xmm4
+    movd    %edx ,%xmm0
+    unpcklps %xmm0,%xmm4            #combine the LO and HI dwords into one 64-bit value
+#
+  .irp _NN_,0,1,2,3                 #copy in the chaining vars
+    movq    X_VARS+8*\_NN_(%edi),%xmm\_NN_
+    xorpd   %xmm\_NN_,%xmm4         #update overall parity
+    movdqa  %xmm\_NN_,ksKey+16*_NN_-F_O(%ebp)
+  .endr
+    movdqa  %xmm4,ksKey+16*WCNT-F_O(%ebp)#save overall parity at the end of the array
+#
+    paddq   %xmm5,%xmm1             #inject the initial tweak words
+    paddq   %xmm6,%xmm2
+#
+  .irp _NN_,0,1,2,3                 #perform the initial key injection
+    movq          8*\_NN_(%esi),%xmm4#and save a copy of the input block on stack
+    movq    %xmm4,8*\_NN_+Wcopy(%esp)
+    paddq   %xmm4,%xmm\_NN_         #inject the key word
+  .endr
+#
+.if _SKEIN_DEBUG                    #debug dump of state at this point
+    Skein_Debug_Block 256
+    Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL,SAVE_REGS
+.endif
+    addl    $WCNT*8,%esi            #skip to the next block
+    movl    %esi,blkPtr(%ebx)       #save the updated block pointer
+    #
+    # now the key schedule is computed. Start the rounds
+    #
+    xorl    %edx,%edx               #edx = iteration count
+.if SKEIN_ASM_UNROLL & 256
+_UNROLL_CNT =   ROUNDS_256/8        #fully unrolled
+.else
+_UNROLL_CNT =   SKEIN_UNROLL_256    #partial unroll count
+  .if ((ROUNDS_256/8) % _UNROLL_CNT)
+    .error "Invalid SKEIN_UNROLL_256" #sanity check
+  .endif
+    movl    %ebp,%esi               #use this as "rolling" pointer into ksTwk/ksKey
+Skein_256_round_loop:               #   (since there's no 16* scaled address mode)
+.endif
+#
+_Rbase_ = 0
+.rept _UNROLL_CNT*2                  # here with X[0..3] in XMM0..XMM3
+      R_256_FourRounds _Rbase_
+_Rbase_ = _Rbase_+4
+.endr #rept _UNROLL_CNT*2
+#
+  .if _UNROLL_CNT <> (ROUNDS_256/8)
+    cmpl    $2*(ROUNDS_256/8),%edx  #one key injection per four rounds
+    jb      Skein_256_round_loop
+  .endif
+    #----------------------------
+    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..3}
+  .irp _NN_,0,1,2,3
+    movq    Wcopy+8*\_NN_(%esp),%xmm4
+    xorpd   %xmm4,%xmm\_NN_
+    movq    %xmm\_NN_,X_VARS+8*\_NN_(%edi)
+  .endr
+    andb    $FIRST_MASK8,TWEAK +15(%edi)  #mask the top tweak byte for the next block
+.if _SKEIN_DEBUG
+    Skein_Debug_Round 256,SKEIN_RND_FEED_FWD,SAVE_REGS
+.endif
+    # go back for more blocks, if needed
+    decl    %ecx
+    jnz     Skein_256_block_loop
+    Reset_Stack _Skein_256_Process_Block
+    ret
+#
+# Optional size/unroll query entry points, assembled only when
+# _SKEIN_CODE_SIZE is defined.  Both return their result in eax.
+.ifdef _SKEIN_CODE_SIZE
+# Returns the byte distance from the start of Skein_256_Process_Block to
+# this label, which directly follows that routine.
+C_label  Skein_256_Process_Block_CodeSize
+    movl    $_Skein_256_Process_Block_CodeSize - _Skein_256_Process_Block,%eax
+    ret
+#
+# Returns _UNROLL_CNT for a looping build, or 0 when fully unrolled.
+C_label  Skein_256_Unroll_Cnt
+  .if _UNROLL_CNT <> ROUNDS_256/8
+    movl    $_UNROLL_CNT,%eax
+  .else
+    xorl    %eax,%eax
+  .endif
+    ret
+.endif
+.endif #_USE_ASM_ & 256
+#
+#----------------------------------------------------------------
+#
+.if _USE_ASM_ & 512
+#
+# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
+#
+#################
+# MACRO: one round
+#
+# R_512_Round: one Skein-512 round, made of four add/rotate/xor steps.
+#   _RR_          round number, used to pick the rotation constant set
+#   a0,a1,Ra ...  for each of the four steps: the two XMM register numbers
+#                 and the index of the rotation constant within the set
+#   All eight XMM registers hold block words, so scratch space is made by
+#   spilling: word c0 is stored to X_stk and its register is the scratch
+#   for step a, then word a0 is stored and its register is the scratch for
+#   steps b, c and d.  Each spilled word is reloaded before it is needed.
+#   NOTE(review): the doubled ampersand in the constant selection below
+#   depends on altmacro ampersand handling - confirm with the target
+#   assembler version.
+.macro R_512_Round _RR_, a0,a1,Ra, b0,b1,Rb, c0,c1,Rc, d0,d1,Rd
+  .irp _qq_,%((\_RR_) && 7)
+_Ra_ = RC_512_\_qq_&&_\Ra
+_Rb_ = RC_512_\_qq_&&_\Rb
+_Rc_ = RC_512_\_qq_&&_\Rc
+_Rd_ = RC_512_\_qq_&&_\Rd
+  .endr
+    paddq   %xmm\a1 , %xmm\a0 
+                              _stX c0
+    movq    %xmm\a1 , %xmm\c0 
+    psllq  $   _Ra_ , %xmm\a1 
+    psrlq  $64-_Ra_ , %xmm\c0 
+    xorpd   %xmm\c0 , %xmm\a1 
+    xorpd   %xmm\a0 , %xmm\a1 
+                                    
+    paddq   %xmm\b1 , %xmm\b0 
+                              _stX a0
+    movq    %xmm\b1 , %xmm\a0 
+    psllq  $   _Rb_ , %xmm\b1 
+    psrlq  $64-_Rb_ , %xmm\a0 
+    xorpd   %xmm\b0 , %xmm\b1 
+                              _ldX c0
+    xorpd   %xmm\a0 , %xmm\b1 
+                               
+    paddq   %xmm\c1 , %xmm\c0 
+    movq    %xmm\c1 , %xmm\a0 
+    psllq  $   _Rc_ , %xmm\c1 
+    psrlq  $64-_Rc_ , %xmm\a0 
+    xorpd   %xmm\c0 , %xmm\c1 
+    xorpd   %xmm\a0 , %xmm\c1 
+                               
+    paddq   %xmm\d1 , %xmm\d0 
+    movq    %xmm\d1 , %xmm\a0           
+    psllq  $   _Rd_ , %xmm\d1 
+    psrlq  $64-_Rd_ , %xmm\a0 
+    xorpd   %xmm\a0 , %xmm\d1 
+                              _ldX a0
+    xorpd   %xmm\d0 , %xmm\d1 
+  .if _SKEIN_DEBUG
+    Skein_Debug_Round 512,%(_RR_+1),SAVE_REGS
+  .endif
+.endm
+#
+# MACRO: four rounds
+# R_512_FourRounds: four Skein-512 rounds followed by one key injection.
+#   _RN_ is the number of the first of the four rounds.
+#   Key words are added to all eight registers first.  xmm0 is then spilled
+#   with _stX so it can carry the incremented counter from edx, which is
+#   added to xmm7 while tweak words are added to xmm5 and xmm6.  xmm0 is
+#   reloaded at the end.
+#   Fully unrolled build: schedule slots are chosen at assembly time and
+#   addressed off ebp.  Looping build: slots are addressed off the rolling
+#   pointer esi, the first key and tweak entries are copied past the end of
+#   their arrays, and esi advances by one 16-byte slot.
+.macro R_512_FourRounds _RN_
+    R_512_Round %((_RN_)  ), 0,1,0, 2,3,1, 4,5,2, 6,7,3
+    R_512_Round %((_RN_)+1), 2,1,0, 4,7,1, 6,5,2, 0,3,3
+    R_512_Round %((_RN_)+2), 4,1,0, 6,3,1, 0,5,2, 2,7,3
+    R_512_Round %((_RN_)+3), 6,1,0, 0,7,1, 2,5,2, 4,3,3
+
+    #inject key schedule
+.irp _NN_,0,1,2,3,4,5,6,7
+  .if _UNROLL_CNT == (ROUNDS_512/8)
+    paddq ksKey+16*((((\_RN_)/4)+(\_NN_)+1)%9)-F_O(%ebp),%xmm\_NN_
+  .else
+    paddq ksKey+16*((\_NN_)+1)-F_O(%esi),%xmm\_NN_
+  .endif
+.endr
+    _stX  0                       #free up a register
+    incl  %edx                    #bump round counter
+    movd  %edx,%xmm0              #inject the tweak
+  .if _UNROLL_CNT == (ROUNDS_512/8)
+    paddq ksTwk+16*(((_RN_)+1) % 3)-F_O(%ebp),%xmm5
+    paddq ksTwk+16*(((_RN_)+2) % 3)-F_O(%ebp),%xmm6
+    paddq %xmm0                              ,%xmm7
+  .else #looping version
+    paddq ksTwk+16*1-F_O(%esi),%xmm5
+    paddq ksTwk+16*2-F_O(%esi),%xmm6
+    paddq %xmm0               ,%xmm7
+    # "rotate" key schedule on the stack (for next time through)
+    movq        ksKey            -F_O(%esi),%xmm0
+    movq  %xmm0,ksKey+16*(WCNT+1)-F_O(%esi)
+    movq        ksTwk            -F_O(%esi),%xmm0
+    movq  %xmm0,ksTwk+16*3       -F_O(%esi)
+    addl  $16,%esi                #bump rolling pointer
+  .endif
+    _ldX  0                       #restore X0
+  .if _SKEIN_DEBUG
+    Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT,SAVE_REGS
+  .endif
+.endm #R_512_FourRounds
+#################
+.if _SKEIN_DEBUG # subroutines for saving/restoring X_stk for debug routines
+# _Put_XMM_512: store xmm0..xmm7 into X_stk.  Entered with call, so the
+#   extra 4 in the offset skips the return address on the stack.
+_Put_XMM_512:
+  .irp _NN_,0,1,2,3,4,5,6,7
+    movq  %xmm\_NN_,X_stk+4+\_NN_*8(%esp)
+  .endr
+    ret
+#
+# _Get_XMM_512: reload xmm0..xmm7 from X_stk (inverse of _Put_XMM_512).
+_Get_XMM_512:
+  .irp _NN_,0,1,2,3,4,5,6,7
+    movq            X_stk+4+\_NN_*8(%esp),%xmm\_NN_
+  .endr
+    ret
+.endif
+#
+#################
+#
+# Skein_512_Process_Block: process blkCnt consecutive input blocks of
+#   WCNT*8 = 64 bytes each, updating the chaining words and the tweak in
+#   the context.  C prototype: (ctx, blkPtr, blkCnt, bitcntAdd).
+# Register roles, as set up by Setup_Stack and the code below:
+#   ebx  pointer to the caller parameters     edi  Skein context
+#   ebp  base for the ksKey/ksTwk offsets     ecx  blocks remaining
+#   esi  input block pointer, then (looping build) rolling schedule pointer
+#   edx  key injection counter                xmm0-7  block words
+# All eight XMM registers end up holding block words, so the setup below
+#   loads the chaining words in a specific order and borrows xmm0 and xmm4
+#   as temporaries before they receive their final values.
+C_label Skein_512_Process_Block
+    WCNT    =   8                   #WCNT=8 for Skein-512
+    Setup_Stack WCNT,ROUNDS_512
+    # main hash loop for Skein_512
+Skein_512_block_loop:
+    movd    bitAdd(%ebx) ,%xmm0
+    movq    TWEAK+0(%edi),%xmm1
+    movq    TWEAK+8(%edi),%xmm2
+    paddq   %xmm0,%xmm1               #bump T0 by the bitAdd parameter
+    movq    %xmm1,TWEAK(%edi)         #save updated tweak value T0 (for next time)
+    movq    %xmm2,%xmm0
+    xorpd   %xmm1,%xmm0               #compute overall tweak parity
+    movdqa  %xmm1,ksTwk     -F_O(%ebp)#save the expanded tweak schedule on the stack
+    movdqa  %xmm2,ksTwk+16*1-F_O(%ebp)    
+    movdqa  %xmm0,ksTwk+16*2-F_O(%ebp)    
+
+    movl    blkPtr(%ebx),%esi         #esi --> input block
+    movl    $KW_PARITY_LO,%eax        #init key schedule parity accumulator
+    movl    $KW_PARITY_HI,%edx 
+    movd    %eax ,%xmm0
+    movd    %edx ,%xmm7
+    unpcklps %xmm7,%xmm0              #combine the LO and HI dwords into one 64-bit value
+#
+  .irp _NN_,7,6,5,4,3,2,1             #copy in the chaining vars (skip #0 for now)
+    movq    X_VARS+8*\_NN_(%edi),%xmm\_NN_
+    xorpd   %xmm\_NN_,%xmm0           #update overall parity
+    movdqa  %xmm\_NN_,ksKey+16*\_NN_-F_O(%ebp)
+   .if \_NN_ == 5
+    paddq   %xmm1,%xmm5               #inject the initial tweak words
+    paddq   %xmm2,%xmm6               #  (before they get trashed in %xmm1/2)
+   .endif
+  .endr
+    movq    X_VARS(%edi),%xmm4        #handle #0 now
+    xorpd   %xmm4,%xmm0               #update overall parity
+    movdqa  %xmm4,ksKey+16* 0  -F_O(%ebp) #save the key value in slot #0
+    movdqa  %xmm0,ksKey+16*WCNT-F_O(%ebp) #save overall parity at the end of the array
+#
+    movq    %xmm4,%xmm0
+  .irp _NN_,7,6,5,  4,3,2,1,0         #initial key injection (#4 is redone below, xmm4 is the temp here)
+    movq    8*\_NN_(%esi),%xmm4       #and save a copy of the input block on stack
+    movq    %xmm4,8*\_NN_+Wcopy(%esp)
+    paddq   %xmm4,%xmm\_NN_
+  .endr
+    movq    8*4(%esi),%xmm4           #get input block word #4
+    movq    %xmm4,8*4+Wcopy(%esp)
+    paddq   ksKey+16*4-F_O(%ebp),%xmm4#inject the initial key
+#
+.if _SKEIN_DEBUG                      #debug dump of state at this point
+    Skein_Debug_Block 512
+    Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL,SAVE_REGS
+.endif
+    addl    $WCNT*8,%esi              #skip to the next block
+    movl    %esi,blkPtr(%ebx)         #save the updated block pointer
+    #
+    # now the key schedule is computed. Start the rounds
+    #
+    xorl    %edx,%edx                 #edx = round counter
+.if SKEIN_ASM_UNROLL & 512
+_UNROLL_CNT =   ROUNDS_512/8
+.else
+_UNROLL_CNT =   SKEIN_UNROLL_512
+  .if ((ROUNDS_512/8) % _UNROLL_CNT)
+    .error "Invalid SKEIN_UNROLL_512"
+  .endif
+    movl    %ebp,%esi                 #use this as "rolling" pointer into ksTwk/ksKey
+Skein_512_round_loop:                 #   (since there's no 16* scaled address mode)
+.endif
+_Rbase_ = 0
+.rept _UNROLL_CNT*2
+      R_512_FourRounds %_Rbase_
+_Rbase_ = _Rbase_+4
+.endr #rept _UNROLL_CNT*2
+#
+.if (SKEIN_ASM_UNROLL & 512) == 0
+    cmpl    $2*(ROUNDS_512/8),%edx    #one key injection per four rounds
+    jb      Skein_512_round_loop
+.endif
+    #----------------------------
+    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..7}
+    andb    $FIRST_MASK8,TWEAK +15(%edi)  #mask the top tweak byte for the next block
+.irp _NN_,0,2,4,6                   #do the aligned ones first
+    xorpd   Wcopy+8*\_NN_(%esp),%xmm\_NN_
+    movq    %xmm\_NN_,X_VARS+8*_NN_(%edi)
+.endr
+.irp _NN_,1,3,5,7                   #now we have some register space available
+    movq    Wcopy+8*\_NN_(%esp),%xmm0
+    xorpd   %xmm0,%xmm&\_NN_
+    movq    %xmm&\_NN_,X_VARS+8*\_NN_(%edi)
+.endr
+.if _SKEIN_DEBUG
+    Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
+.endif
+    # go back for more blocks, if needed
+    decl    %ecx
+    jnz     Skein_512_block_loop
+
+    Reset_Stack _Skein_512_Process_Block
+    ret
+#
+# Optional size/unroll query entry points, assembled only when
+# _SKEIN_CODE_SIZE is defined.  Both return their result in eax.
+.ifdef _SKEIN_CODE_SIZE
+# Returns the byte distance from the start of Skein_512_Process_Block to
+# this label, which directly follows that routine.
+C_label Skein_512_Process_Block_CodeSize
+    movl    $(_Skein_512_Process_Block_CodeSize - _Skein_512_Process_Block),%eax
+    ret
+#
+# Returns _UNROLL_CNT for a looping build, or 0 when fully unrolled.
+C_label Skein_512_Unroll_Cnt
+  .if _UNROLL_CNT <> ROUNDS_512/8
+    movl    $_UNROLL_CNT,%eax
+  .else
+    xorl    %eax,%eax
+  .endif
+    ret
+.endif
+#
+.endif # _USE_ASM_ & 512
+#
+#----------------------------------------------------------------
+#
+.if _USE_ASM_ & 1024
+    .global      _Skein1024_Process_Block
+#
+# void Skein_1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
+#
+R_1024_REGS =     (5)     #keep this many block variables in registers
+#
+################
+.if _SKEIN_DEBUG # subroutines for saving/restoring X_stk for debug routines
+# _Put_XMM_1024: store the first R_1024_REGS XMM registers into X_stk.
+#   Only those block words live in registers, the rest are already on the
+#   stack.  Entered with call, so the extra 4 in the offset skips the
+#   return address.  _NN_ is an assembler symbol used as the loop counter.
+_Put_XMM_1024:
+_NN_ = 0
+  .rept R_1024_REGS
+   .irp _rr_,%(_NN_)
+    movq   %xmm\_rr_,X_stk+4+8*_NN_(%esp)
+   .endr
+_NN_ = _NN_+1
+  .endr
+    ret
+#
+# _Get_XMM_1024: reload the same registers from X_stk (inverse of
+#   _Put_XMM_1024).
+_Get_XMM_1024:
+_NN_ = 0
+  .rept R_1024_REGS
+   .irp _rr_,%(_NN_)
+    movq             X_stk+4+8*_NN_(%esp),%xmm\_rr_
+   .endr
+_NN_ = _NN_+1
+  .endr
+    ret
+.endif
+#
+#################
+# MACRO: one mix step
+# MixStep_1024: one add/rotate/xor step on a pair of Skein-1024 block words.
+#   x0,x1     block word numbers (0..15)
+#   rotIdx0   round number, selects the rotation constant set
+#   rotIdx1   index of the rotation constant within that set
+#   _debug_   nonzero: emit a debug round dump after the step (default 0)
+#   Words numbered below R_1024_REGS live in the XMM register of the same
+#   number.  Any other word is loaded from X_stk into xmm5 (for x0) or xmm6
+#   (for x1) and written back afterwards.  xmm7 is scratch for the rotate.
+#   The debug branch previously spelled the round parameter with a capital
+#   letter, which does not match the declared parameter name, it now uses
+#   the declared spelling.
+#   NOTE(review): the doubled ampersand in the constant selection below
+#   depends on altmacro ampersand handling - confirm with the target
+#   assembler version.
+.macro MixStep_1024  x0,x1,rotIdx0,rotIdx1,_debug_=0
+_r0_ =  \x0      #default, if already loaded
+_r1_ =  \x1
+  # load the regs (if necessary)
+  .if (\x0 >= R_1024_REGS)
+_r0_ =       5
+    movq    X_stk+8*(\x0)(%esp),%xmm5
+  .endif
+  .if (\x1 >= R_1024_REGS)
+_r1_ =       6     
+    movq  X_stk+8*(\x1)(%esp),%xmm6
+  .endif
+  # do the mix
+  .irp _rx_,%((\rotIdx0) && 7)
+_Rc_ = RC_1024_\_rx_&&_\rotIdx1  #rotation constant
+  .endr
+  .irp _x0_,%_r0_
+  .irp _x1_,%_r1_
+    paddq   %xmm\_x1_,%xmm\_x0_
+    movq    %xmm\_x1_,%xmm7    
+    psllq  $   _Rc_  ,%xmm\_x1_
+    psrlq  $64-_Rc_  ,%xmm7    
+    xorpd   %xmm\_x0_,%xmm\_x1_
+    xorpd   %xmm7    ,%xmm\_x1_
+  .endr
+  .endr
+  # save the regs (if necessary)
+  .if (\x0 >= R_1024_REGS)
+    movq    %xmm5,X_stk+8*(\x0)(%esp)
+  .endif
+  .if (\x1 >= R_1024_REGS)
+    movq    %xmm6,X_stk+8*(\x1)(%esp)
+  .endif
+  # debug output
+  .if _SKEIN_DEBUG && (\_debug_)
+    Skein_Debug_Round 1024,%((\rotIdx0)+1),SAVE_REGS
+  .endif
+.endm
+#################
+# MACRO: four rounds
+#
+# R_1024_FourRounds: four Skein-1024 rounds followed by one key injection.
+#   _RR_ is the number of the first of the four rounds.
+#   Each round is eight MixStep_1024 calls.  The last call of every round
+#   passes 1 as the debug flag so a debug build dumps once per round.
+#   After the rounds edx is incremented and copied to xmm7.  The key
+#   injection walks words 15 down to 0: words held in registers are
+#   updated in place, the others go through xmm6 and back to X_stk.
+#   Word 15 also gets the counter, words 14 and 13 the tweak words.
+#   Looping build: schedule slots are addressed off the rolling pointer
+#   esi, then the first key and tweak entries are copied past the end of
+#   their arrays and esi advances by one 16-byte slot.
+#   Fully unrolled build: slots are chosen at assembly time off ebp.
+#   NOTE(review): the unrolled slot index is computed from the global
+#   symbol _Rbase_ rather than from the _RR_ argument - the caller in this
+#   file passes _Rbase_, confirm before invoking with anything else.
+.macro R_1024_FourRounds _RR_
+    #--------- round _RR_
+    MixStep_1024     0, 1,%((\_RR_)+0),0
+    MixStep_1024     2, 3,%((\_RR_)+0),1
+    MixStep_1024     4, 5,%((\_RR_)+0),2
+    MixStep_1024     6, 7,%((\_RR_)+0),3
+    MixStep_1024     8, 9,%((\_RR_)+0),4
+    MixStep_1024    10,11,%((\_RR_)+0),5
+    MixStep_1024    12,13,%((\_RR_)+0),6
+    MixStep_1024    14,15,%((\_RR_)+0),7,1
+    #--------- round _RR_+1
+    MixStep_1024     0, 9,%((\_RR_)+1),0
+    MixStep_1024     2,13,%((\_RR_)+1),1
+    MixStep_1024     6,11,%((\_RR_)+1),2
+    MixStep_1024     4,15,%((\_RR_)+1),3
+    MixStep_1024    10, 7,%((\_RR_)+1),4
+    MixStep_1024    12, 3,%((\_RR_)+1),5
+    MixStep_1024    14, 5,%((\_RR_)+1),6
+    MixStep_1024     8, 1,%((\_RR_)+1),7,1
+    #--------- round _RR_+2
+    MixStep_1024     0, 7,%((\_RR_)+2),0    
+    MixStep_1024     2, 5,%((\_RR_)+2),1
+    MixStep_1024     4, 3,%((\_RR_)+2),2    
+    MixStep_1024     6, 1,%((\_RR_)+2),3    
+    MixStep_1024    12,15,%((\_RR_)+2),4
+    MixStep_1024    14,13,%((\_RR_)+2),5    
+    MixStep_1024     8,11,%((\_RR_)+2),6    
+    MixStep_1024    10, 9,%((\_RR_)+2),7,1
+    #--------- round _RR_+3
+    MixStep_1024     0,15,%((\_RR_)+3),0
+    MixStep_1024     2,11,%((\_RR_)+3),1
+    MixStep_1024     6,13,%((\_RR_)+3),2
+    MixStep_1024     4, 9,%((\_RR_)+3),3
+    MixStep_1024    14, 1,%((\_RR_)+3),4
+    MixStep_1024     8, 5,%((\_RR_)+3),5
+    MixStep_1024    10, 3,%((\_RR_)+3),6
+    MixStep_1024    12, 7,%((\_RR_)+3),7,1
+
+    incl  %edx                     #edx = round number
+    movd  %edx,%xmm7
+
+    #inject the key
+.irp _NN_,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+  .if _UNROLL_CNT <> (ROUNDS_1024/8)
+    .if \_NN_ < R_1024_REGS
+      paddq ksKey+16*\_NN_+16-F_O(%esi),%xmm&\_NN_
+    .else
+      movq  X_stk+ 8*\_NN_(%esp),%xmm6
+     .if     \_NN_ == 15
+      paddq %xmm7,%xmm6
+     .elseif \_NN_ == 14
+      paddq ksTwk+16*2-F_O(%esi),%xmm6
+     .elseif \_NN_ == 13
+      paddq ksTwk+16*1-F_O(%esi),%xmm6
+     .endif
+      paddq       ksKey+16*\_NN_+16-F_O(%esi),%xmm6
+      movq  %xmm6,X_stk+ 8*\_NN_(%esp)
+    .endif
+  .else
+    .if \_NN_ < R_1024_REGS
+      paddq ksKey+16*(((_Rbase_/4)+(\_NN_)+1) % 17)-F_O(%ebp),%xmm&\_NN_
+    .else
+      movq  X_stk+ 8*\_NN_(%esp), %xmm6
+      paddq ksKey+16*(((_Rbase_/4)+(\_NN_)+1) % 17)-F_O(%ebp),%xmm6
+     .if     \_NN_ == 15
+      paddq %xmm7,%xmm6
+     .elseif \_NN_ == 14
+      paddq ksTwk+16*(((_Rbase_/4)+2) %  3)-F_O(%ebp),%xmm6
+     .elseif \_NN_ == 13
+      paddq ksTwk+16*(((_Rbase_/4)+1) %  3)-F_O(%ebp),%xmm6
+     .endif
+      movq %xmm6,X_stk+ 8*\_NN_(%esp)
+    .endif
+  .endif
+.endr
+  .if _UNROLL_CNT <> (ROUNDS_1024/8) #rotate the key schedule on the stack
+    movq ksKey-F_O(%esi), %xmm6
+    movq ksTwk-F_O(%esi), %xmm7
+    movq %xmm6,ksKey+16*(WCNT+1)-F_O(%esi)
+    movq %xmm7,ksTwk+16* 3      -F_O(%esi)
+    addl $16,%esi                   #bump rolling pointer
+  .endif
+  .if _SKEIN_DEBUG
+      Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT ,SAVE_REGS
+  .endif
+.endm #R_1024_FourRounds
+#
+################
+#
+# Skein1024_Process_Block: process blkCnt consecutive input blocks of
+#   WCNT*8 = 128 bytes each, updating the chaining words and the tweak in
+#   the context.  C prototype: (ctx, blkPtr, blkCnt, bitcntAdd).
+# Register roles, as set up by Setup_Stack and the code below:
+#   ebx  pointer to the caller parameters
+#   edi  Skein context plus 0x80, every context offset subtracts 0x80
+#   eax  esp plus 0x80 while Wcopy/X_stk are accessed with short offsets
+#   ebp  base for the ksKey/ksTwk offsets     ecx  blocks remaining
+#   esi  input block pointer, then (looping build) rolling schedule pointer
+#   edx  key injection counter
+# Only block words 0..R_1024_REGS-1 stay in XMM registers, the remaining
+#   words live in X_stk on the stack.
+C_label Skein1024_Process_Block
+#
+    WCNT    =   16                  #WCNT=16 for Skein-1024
+    Setup_Stack WCNT,ROUNDS_1024
+    addl    $0x80,%edi              #bias the edi ctxt offsets to keep them all short
+    # main hash loop for Skein1024
+Skein1024_block_loop:
+    movd    bitAdd(%ebx)      ,%xmm0
+    movq    TWEAK+0-0x80(%edi),%xmm1
+    movq    TWEAK+8-0x80(%edi),%xmm2
+    paddq   %xmm0,%xmm1             #bump T0 by the bitAdd parameter
+    movq    %xmm1,TWEAK-0x80(%edi)  #save updated tweak value T0 (for next time)
+    movq    %xmm2,%xmm0
+    xorpd   %xmm1,%xmm0             #compute overall tweak parity
+    movdqa  %xmm1,ksTwk   -F_O(%ebp)#save the expanded tweak schedule on the stack
+    movdqa  %xmm2,ksTwk+16-F_O(%ebp)
+    movdqa  %xmm0,ksTwk+32-F_O(%ebp)
+
+    movl    blkPtr(%ebx),%esi       #esi --> input block
+    movl    $KW_PARITY_LO,%eax      #init key schedule parity accumulator
+    movl    $KW_PARITY_HI,%edx 
+    movd    %eax ,%xmm7
+    movd    %edx ,%xmm6
+    unpcklps %xmm6,%xmm7            #combine the LO and HI dwords into one 64-bit value
+#
+    leal    0x80(%esp),%eax         #use short offsets for Wcopy, X_stk writes below
+.irp _NN_,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+    movq    X_VARS+8*\_NN_-0x80(%edi),%xmm6
+    xorpd   %xmm6,%xmm7             #update overall parity
+    movdqa  %xmm6,ksKey+16*\_NN_-F_O(%ebp) #save the key schedule on the stack
+  .if \_NN_ < R_1024_REGS
+    _rr_  =  \_NN_
+  .else
+    _rr_  =   R_1024_REGS
+  .endif
+  .irp _rn_,%(_rr_)
+    movq    8*\_NN_(%esi),%xmm\_rn_ #save copy of the input block on stack
+    movq    %xmm\_rn_,Wcopy+8*\_NN_-0x80(%eax)  #(for feedforward later)
+    paddq   %xmm6,%xmm\_rn_         #inject the key into the block
+   .if \_NN_ == 13
+    paddq   %xmm1,%xmm\_rn_         #inject the initial tweak words
+   .elseif \_NN_ == 14
+    paddq   %xmm2,%xmm\_rn_
+   .endif
+   .if \_NN_ >= R_1024_REGS         #only save X[5..15] on stack, leave X[0..4] in regs
+    movq    %xmm\_rn_,X_stk+8*\_NN_-0x80(%eax)
+   .endif
+  .endr
+.endr
+    movdqa  %xmm7,ksKey+16*WCNT-F_O(%ebp) #save overall key parity at the end of the array
+#
+.if _SKEIN_DEBUG                    #debug dump of state at this point
+    Skein_Debug_Block 1024
+    Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL,SAVE_REGS
+.endif
+    addl    $WCNT*8,%esi            #skip to the next block
+    movl    %esi,blkPtr(%ebx)       #save the updated block pointer
+    #
+    # now the key schedule is computed. Start the rounds
+    #
+    xorl    %edx,%edx               #edx = round counter
+.if SKEIN_ASM_UNROLL & 1024
+_UNROLL_CNT =   ROUNDS_1024/8
+.else
+_UNROLL_CNT =   SKEIN_UNROLL_1024
+  .if ((ROUNDS_1024/8) % _UNROLL_CNT)
+    .error "Invalid SKEIN_UNROLL_1024"
+  .endif
+    movl    %ebp,%esi               #use this as "rolling" pointer into ksTwk/ksKey
+Skein_1024_round_loop:
+.endif
+#
+_Rbase_ = 0
+.rept _UNROLL_CNT*2
+    R_1024_FourRounds %_Rbase_
+_Rbase_ = _Rbase_+4
+.endr #rept _UNROLL_CNT*2
+#
+.if (SKEIN_ASM_UNROLL & 1024) == 0
+    cmp     $2*(ROUNDS_1024/8),%edx #one key injection per four rounds
+    jb      Skein_1024_round_loop
+.endif
+    andb    $FIRST_MASK8,TWEAK +15-0x80(%edi)      #clear tweak bit for next time thru
+    #----------------------------
+    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..15}
+    leal    0x80(%esp),%eax                        #allow short offsets to X_stk and Wcopy
+.irp _NN_,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+  .if \_NN_ < R_1024_REGS
+    .if \_NN_ && 1                                 #already in regs: no load needed
+      movq  Wcopy+ 8*\_NN_-0x80(%eax),%xmm7        #unaligned
+      xorpd %xmm7,%xmm\_NN_
+    .else
+      xorpd Wcopy+ 8*\_NN_-0x80(%eax),%xmm\_NN_    #aligned
+    .endif
+      movq  %xmm\_NN_,X_VARS+8*\_NN_-0x80(%edi)
+  .else
+      movq    X_stk+8*\_NN_-0x80(%eax),%xmm7       #load X value from stack
+    .if \_NN_ && 1
+      movq    Wcopy+8*\_NN_-0x80(%eax),%xmm6       #unaligned
+      xorpd   %xmm6,%xmm7
+    .else
+      xorpd   Wcopy+8*\_NN_-0x80(%eax),%xmm7       #aligned
+    .endif
+      movq    %xmm7,X_VARS+8*\_NN_-0x80(%edi)
+ .endif
+.endr
+.if _SKEIN_DEBUG
+    Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD   #no need to save regs on stack here
+.endif
+    # go back for more blocks, if needed
+    decl    %ecx
+    jnz     Skein1024_block_loop
+
+    Reset_Stack _Skein1024_Process_Block
+    ret
+#
+# Optional size/unroll query entry points, assembled only when
+# _SKEIN_CODE_SIZE is defined.  Both return their result in eax.
+.ifdef _SKEIN_CODE_SIZE
+# Returns the byte distance from the start of Skein1024_Process_Block to
+# this label, which directly follows that routine.
+C_label Skein1024_Process_Block_CodeSize
+    movl    $(_Skein1024_Process_Block_CodeSize - _Skein1024_Process_Block),%eax
+    ret
+#
+# Returns _UNROLL_CNT for a looping build, or 0 when fully unrolled.
+C_label Skein1024_Unroll_Cnt
+  .if _UNROLL_CNT <> ROUNDS_1024/8
+    movl    $_UNROLL_CNT,%eax
+  .else
+    xorl    %eax,%eax
+  .endif
+    ret
+.endif
+#
+.endif # _USE_ASM_ & 1024
+#----------------------------------------------------------------
+    .end
diff --git a/Additional_Implementations/skein_perf_core2.txt b/Additional_Implementations/skein_perf_core2.txt
new file mode 100644
index 0000000000000..d8b795675c3d9
--- /dev/null
+++ b/Additional_Implementations/skein_perf_core2.txt
@@ -0,0 +1,1440 @@
+
+Skein performance, in clks per byte, dtMin =   24 clks.
+         [compiled 14:27:59,Oct  7 2008  by  'GCC_v3.42', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  3450.00  3450.00  |  8718.00  8718.00  | 41700.00 41730.00  | //: 32-bit, GCC_v3.42 [ C =...]
+      2_ ||  1719.00  1725.00  |  4323.00  4326.00  | 20835.00 20850.00  | //: 32-bit, GCC_v3.42 [ C =...]
+      4_ ||   861.00   861.00  |  2149.50  2151.00  | 10408.50 11277.00  | //: 32-bit, GCC_v3.42 [ C =...]
+      8_ ||   429.00   429.75  |  1074.75  1074.75  |  5204.25  5205.00  | //: 32-bit, GCC_v3.42 [ C =...]
+     10_ ||   343.80   344.40  |   865.80   866.40  |  4167.00  4167.60  | //: 32-bit, GCC_v3.42 [ C =...]
+     16_ ||   214.88   214.88  |   538.50   538.50  |  2603.25  2603.63  | //: 32-bit, GCC_v3.42 [ C =...]
+     32_ ||   107.06   115.88  |   269.25   269.25  |  1301.25  1301.25  | //: 32-bit, GCC_v3.42 [ C =...]
+     64_ ||    85.31    85.41  |   132.66   132.75  |   650.53   650.63  | //: 32-bit, GCC_v3.42 [ C =...]
+    100_ ||    82.20    88.86  |   126.78   126.78  |   416.46   416.46  | //: 32-bit, GCC_v3.42 [ C =...]
+    128_ ||    69.42    69.56  |    97.83    97.83  |   324.98   325.03  | //: 32-bit, GCC_v3.42 [ C =...]
+    256_ ||    56.70    56.74  |    76.34    76.34  |   242.95   242.98  | //: 32-bit, GCC_v3.42 [ C =...]
+    512_ ||    53.06    53.12  |    65.50    65.53  |   200.66   200.67  | //: 32-bit, GCC_v3.42 [ C =...]
+   1000_ ||    52.33    52.42  |    61.66    61.69  |   183.89   183.92  | //: 32-bit, GCC_v3.42 [ C =...]
+   1024_ ||    51.15    51.23  |    60.07    60.08  |   179.52   179.55  | //: 32-bit, GCC_v3.42 [ C =...]
+   2048_ ||    50.20    50.30  |    57.36    57.42  |   168.86   168.97  | //: 32-bit, GCC_v3.42 [ C =...]
+   4096_ ||    49.71    49.77  |    56.00    56.01  |   163.65   166.96  | //: 32-bit, GCC_v3.42 [ C =...]
+   8192_ ||    49.48    50.94  |    55.33    57.07  |   169.60   184.62  | //: 32-bit, GCC_v3.42 [ C =...]
+  10000_ ||    53.64    53.70  |    60.89    60.99  |   186.13   186.98  | //: 32-bit, GCC_v3.42 [ C =...]
+  16384_ ||    53.48    53.80  |    60.35    60.70  |   164.26   167.23  | //: 32-bit, GCC_v3.42 [ C =...]
+  32768_ ||    53.47    53.50  |    60.22    60.37  |   164.15   182.33  | //: 32-bit, GCC_v3.42 [ C =...]
+ 100000_ ||    53.85   100.93  |    60.43    61.36  |   164.25   169.78  | //: 32-bit, GCC_v3.42 [ C =...]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||         1568 bytes |         1264 bytes |         1472 bytes | //: 32-bit, GCC_v3.42 [ C =...]
+  Block  ||        14464 bytes |        32544 bytes |        83024 bytes | //: 32-bit, GCC_v3.42 [ C =...]
+
+Skein performance, in clks per byte, dtMin =   24 clks.
+         [compiled 14:28:12,Oct  7 2008  by  'MSC_v9.00', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  2802.00  2814.00  |  5952.00  5952.00  | 30606.00 30606.00  | //: 32-bit, MSC_v9.00 [ C =...]
+      2_ ||  1392.00  1395.00  |  2976.00  2979.00  | 15309.00 15309.00  | //: 32-bit, MSC_v9.00 [ C =...]
+      4_ ||   696.00   697.50  |  1486.50  1486.50  |  7653.00  7654.50  | //: 32-bit, MSC_v9.00 [ C =...]
+      8_ ||   347.25   348.00  |   741.75   742.50  |  3825.75  3827.25  | //: 32-bit, MSC_v9.00 [ C =...]
+     10_ ||   278.40   278.40  |   593.40   593.40  |  3063.00  3063.00  | //: 32-bit, MSC_v9.00 [ C =...]
+     16_ ||   174.38   174.38  |   370.50   370.50  |  1913.25  1913.25  | //: 32-bit, MSC_v9.00 [ C =...]
+     32_ ||    86.25    86.25  |   186.00   186.75  |   957.00   957.19  | //: 32-bit, MSC_v9.00 [ C =...]
+     64_ ||    62.91    62.91  |    92.91    92.91  |   478.50   478.50  | //: 32-bit, MSC_v9.00 [ C =...]
+    100_ ||    65.52    65.58  |    88.02    88.08  |   306.30   306.30  | //: 32-bit, MSC_v9.00 [ C =...]
+    128_ ||    50.72    50.72  |    68.53    68.58  |   238.64   238.88  | //: 32-bit, MSC_v9.00 [ C =...]
+    256_ ||    44.88    45.05  |    56.11    56.13  |   178.17   178.24  | //: 32-bit, MSC_v9.00 [ C =...]
+    512_ ||    41.79    41.86  |    49.79    49.91  |   147.39   147.47  | //: 32-bit, MSC_v9.00 [ C =...]
+   1000_ ||    41.26    41.41  |    47.96    47.96  |   135.28   135.29  | //: 32-bit, MSC_v9.00 [ C =...]
+   1024_ ||    40.40    40.40  |    46.79    46.81  |   132.05   132.08  | //: 32-bit, MSC_v9.00 [ C =...]
+   2048_ ||    39.62    39.62  |    45.23    45.23  |   124.39   124.40  | //: 32-bit, MSC_v9.00 [ C =...]
+   4096_ ||    38.98    38.99  |    44.34    44.44  |   120.58   120.60  | //: 32-bit, MSC_v9.00 [ C =...]
+   8192_ ||    38.83    38.87  |    44.06    47.57  |   118.65   119.36  | //: 32-bit, MSC_v9.00 [ C =...]
+  10000_ ||    38.86    39.08  |    44.13    44.21  |   119.88   120.11  | //: 32-bit, MSC_v9.00 [ C =...]
+  16384_ ||    38.74    39.03  |    43.76    44.01  |   108.36   117.94  | //: 32-bit, MSC_v9.00 [ C =...]
+  32768_ ||    36.77    38.19  |    41.28    41.57  |   105.50   114.79  | //: 32-bit, MSC_v9.00 [ C =...]
+ 100000_ ||    38.85    39.09  |    43.56    43.77  |   105.79   114.18  | //: 32-bit, MSC_v9.00 [ C =...]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [ C =...]
+  Block  ||        10192 bytes |        22960 bytes |        53072 bytes | //: 32-bit, MSC_v9.00 [ C =...]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:28:29,Oct  7 2008  by  'MSC_v6.00', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  8688.00  8712.00  | 26466.00 26472.00  | 61638.00 61680.00  | //: 32-bit, MSC_v6.00 [ C =...]
+      2_ ||  4347.00  4362.00  | 13293.00 13302.00  | 30036.00 30372.00  | //: 32-bit, MSC_v6.00 [ C =...]
+      4_ ||  2184.00  2199.00  |  6457.50  6508.50  | 15267.00 15285.00  | //: 32-bit, MSC_v6.00 [ C =...]
+      8_ ||  1093.50  1098.75  |  3227.25  3227.25  |  7398.75  7467.75  | //: 32-bit, MSC_v6.00 [ C =...]
+     10_ ||   873.60   878.40  |  2405.40  2574.00  |  5661.00  5668.20  | //: 32-bit, MSC_v6.00 [ C =...]
+     16_ ||   522.00   524.25  |  1455.00  1455.38  |  3459.38  3489.38  | //: 32-bit, MSC_v6.00 [ C =...]
+     32_ ||   260.06   261.00  |   727.69   732.56  |  1727.44  1728.00  | //: 32-bit, MSC_v6.00 [ C =...]
+     64_ ||   186.66   186.94  |   362.16   362.25  |   848.25   856.97  | //: 32-bit, MSC_v6.00 [ C =...]
+    100_ ||   194.10   194.10  |   344.52   344.58  |   542.22   545.28  | //: 32-bit, MSC_v6.00 [ C =...]
+    128_ ||   151.27   151.36  |   266.06   268.59  |   426.23   426.33  | //: 32-bit, MSC_v6.00 [ C =...]
+    256_ ||   137.67   137.91  |   219.66   219.68  |   314.74   317.74  | //: 32-bit, MSC_v6.00 [ C =...]
+    512_ ||   130.21   130.22  |   195.96   218.33  |   263.45   266.96  | //: 32-bit, MSC_v6.00 [ C =...]
+   1000_ ||   129.17   129.60  |   183.96   185.04  |   237.88   240.37  | //: 32-bit, MSC_v6.00 [ C =...]
+   1024_ ||   126.35   126.53  |   178.10   178.44  |   251.47   256.21  | //: 32-bit, MSC_v6.00 [ C =...]
+   2048_ ||   133.77   133.81  |   190.95   191.16  |   240.21   242.87  | //: 32-bit, MSC_v6.00 [ C =...]
+   4096_ ||   116.36   124.81  |   169.39   178.51  |   220.31   222.41  | //: 32-bit, MSC_v6.00 [ C =...]
+   8192_ ||   123.60   125.60  |   171.36   174.54  |   215.79   233.44  | //: 32-bit, MSC_v6.00 [ C =...]
+  10000_ ||   124.42   127.19  |   183.83   188.08  |   236.37   238.04  | //: 32-bit, MSC_v6.00 [ C =...]
+  16384_ ||   133.15   133.46  |   172.38   183.65  |   222.17   232.39  | //: 32-bit, MSC_v6.00 [ C =...]
+  32768_ ||   126.67   128.66  |   180.67   186.04  |   225.01   227.24  | //: 32-bit, MSC_v6.00 [ C =...]
+ 100000_ ||   123.83   125.06  |   172.26   186.33  |   205.87   224.72  | //: 32-bit, MSC_v6.00 [ C =...]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||         1486 bytes |         1348 bytes |         1445 bytes | //: 32-bit, MSC_v6.00 [ C =...]
+  Block  ||        14094 bytes |        35580 bytes |        69258 bytes | //: 32-bit, MSC_v6.00 [ C =...]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:28:44,Oct  7 2008  by  'MSC_v4.20', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  5028.00  5058.00  |  9948.00 10044.00  | 35466.00 35520.00  | //: 32-bit, MSC_v4.20 [ C =...]
+      2_ ||  2508.00  2511.00  |  5070.00  5076.00  | 18090.00 18132.00  | //: 32-bit, MSC_v4.20 [ C =...]
+      4_ ||  1255.50  1255.50  |  2523.00  2523.00  |  9063.00  9063.00  | //: 32-bit, MSC_v4.20 [ C =...]
+      8_ ||   627.75   627.75  |  1261.50  1261.50  |  4536.00  4536.00  | //: 32-bit, MSC_v4.20 [ C =...]
+     10_ ||   502.20   502.20  |  1013.40  1014.60  |  3685.80  3685.80  | //: 32-bit, MSC_v4.20 [ C =...]
+     16_ ||   313.88   313.88  |   624.75   632.63  |  2284.88  2287.88  | //: 32-bit, MSC_v4.20 [ C =...]
+     32_ ||   155.25   155.25  |   312.38   312.38  |  1143.75  1143.75  | //: 32-bit, MSC_v4.20 [ C =...]
+     64_ ||   114.56   114.66  |   155.72   155.72  |   569.91   569.91  | //: 32-bit, MSC_v4.20 [ C =...]
+    100_ ||   120.66   120.78  |   148.92   148.98  |   363.60   363.66  | //: 32-bit, MSC_v4.20 [ C =...]
+    128_ ||    93.84    93.89  |   116.58   116.63  |   284.58   284.58  | //: 32-bit, MSC_v4.20 [ C =...]
+    256_ ||    83.46    83.48  |    95.20    95.20  |   213.77   213.77  | //: 32-bit, MSC_v4.20 [ C =...]
+    512_ ||    78.18    78.19  |    85.08    85.09  |   177.38   177.38  | //: 32-bit, MSC_v4.20 [ C =...]
+   1000_ ||    77.42    77.42  |    81.88    81.88  |   161.92   161.92  | //: 32-bit, MSC_v4.20 [ C =...]
+   1024_ ||    75.54    75.55  |    79.53    79.53  |   158.23   158.23  | //: 32-bit, MSC_v4.20 [ C =...]
+   2048_ ||    74.22    74.23  |    77.37    77.38  |   148.59   149.28  | //: 32-bit, MSC_v4.20 [ C =...]
+   4096_ ||    73.56    73.57  |    76.51    76.51  |   142.82   145.02  | //: 32-bit, MSC_v4.20 [ C =...]
+   8192_ ||    73.23    73.24  |    73.69    74.43  |   143.02   143.30  | //: 32-bit, MSC_v4.20 [ C =...]
+  10000_ ||    73.30    73.36  |    73.75    75.53  |   144.04   144.80  | //: 32-bit, MSC_v4.20 [ C =...]
+  16384_ ||    69.35    73.11  |    71.79    73.01  |   131.31   140.35  | //: 32-bit, MSC_v4.20 [ C =...]
+  32768_ ||    73.05    73.36  |    72.43    74.54  |   126.01   139.95  | //: 32-bit, MSC_v4.20 [ C =...]
+ 100000_ ||    69.28    70.12  |    66.33    67.35  |   129.68   136.97  | //: 32-bit, MSC_v4.20 [ C =...]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||         1152 bytes |         1024 bytes |         1088 bytes | //: 32-bit, MSC_v4.20 [ C =...]
+  Block  ||        11968 bytes |        23776 bytes |        55360 bytes | //: 32-bit, MSC_v4.20 [ C =...]
+
+Skein performance, in clks per byte, dtMin =   24 clks.
+         [compiled 14:28:57,Oct  7 2008  by  'MSC_v9.00', 64-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||   780.00   786.00  |  1110.00  1110.00  |  3288.00  3318.00  | //: 64-bit, MSC_v9.00 [ C =...]
+      2_ ||   402.00   402.00  |   549.00   552.00  |  1659.00  1659.00  | //: 64-bit, MSC_v9.00 [ C =...]
+      4_ ||   199.50   201.00  |   274.50   276.00  |   829.50   829.50  | //: 64-bit, MSC_v9.00 [ C =...]
+      8_ ||    96.75    97.50  |   134.25   135.00  |   414.75   414.75  | //: 64-bit, MSC_v9.00 [ C =...]
+     10_ ||    78.60    79.80  |   109.80   109.80  |   331.20   331.80  | //: 64-bit, MSC_v9.00 [ C =...]
+     16_ ||    48.38    48.38  |    67.13    67.13  |   224.25   224.63  | //: 64-bit, MSC_v9.00 [ C =...]
+     32_ ||    26.63    26.81  |    36.38    36.38  |   112.31   112.31  | //: 64-bit, MSC_v9.00 [ C =...]
+     64_ ||    17.06    17.06  |    16.78    16.78  |    51.66    51.75  | //: 64-bit, MSC_v9.00 [ C =...]
+    100_ ||    16.74    16.80  |    15.54    15.54  |    33.30    33.30  | //: 64-bit, MSC_v9.00 [ C =...]
+    128_ ||    12.98    13.08  |    11.95    12.00  |    25.78    25.83  | //: 64-bit, MSC_v9.00 [ C =...]
+    256_ ||    10.99    10.99  |     9.19     9.21  |    19.03    19.03  | //: 64-bit, MSC_v9.00 [ C =...]
+    512_ ||    10.14    10.18  |     7.84     7.85  |    15.60    15.60  | //: 64-bit, MSC_v9.00 [ C =...]
+   1000_ ||     9.88    10.67  |     7.38     7.38  |    14.16    14.17  | //: 64-bit, MSC_v9.00 [ C =...]
+   1024_ ||     9.60     9.64  |     7.18     7.18  |    13.74    13.74  | //: 64-bit, MSC_v9.00 [ C =...]
+   2048_ ||     9.35     9.38  |     6.83     6.83  |    12.84    12.84  | //: 64-bit, MSC_v9.00 [ C =...]
+   4096_ ||     9.28     9.28  |     6.69     6.70  |    12.40    12.40  | //: 64-bit, MSC_v9.00 [ C =...]
+   8192_ ||     9.18     9.21  |     6.58     6.59  |    12.28    12.28  | //: 64-bit, MSC_v9.00 [ C =...]
+  10000_ ||     9.21     9.22  |     6.60     6.60  |    12.27    12.39  | //: 64-bit, MSC_v9.00 [ C =...]
+  16384_ ||     9.19     9.20  |     6.53     6.55  |    12.12    12.12  | //: 64-bit, MSC_v9.00 [ C =...]
+  32768_ ||     9.16     9.17  |     6.51     6.55  |    12.08    12.53  | //: 64-bit, MSC_v9.00 [ C =...]
+ 100000_ ||     9.98    10.01  |     7.04     7.08  |    12.36    13.14  | //: 64-bit, MSC_v9.00 [ C =...]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [ C =...]
+  Block  ||         2272 bytes |         4944 bytes |        15264 bytes | //: 64-bit, MSC_v9.00 [ C =...]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:28:59,Oct  7 2008  by  'BCC_v5.51', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  6204.00  6252.00  | 11058.00 11124.00  | 25662.00 25788.00  | //: 32-bit, BCC_v5.51 [ C =...]
+      2_ ||  3048.00  3060.00  |  5469.00  5481.00  | 12576.00 12672.00  | //: 32-bit, BCC_v5.51 [ C =...]
+      4_ ||  1515.00  1521.00  |  2731.50  2733.00  |  6303.00  6348.00  | //: 32-bit, BCC_v5.51 [ C =...]
+      8_ ||   756.75   760.50  |  1364.25  1367.25  |  3147.75  3162.75  | //: 32-bit, BCC_v5.51 [ C =...]
+     10_ ||   605.40   607.80  |  1092.60  1095.00  |  2541.60  2545.80  | //: 32-bit, BCC_v5.51 [ C =...]
+     16_ ||   379.50   380.62  |   682.88   683.25  |  1584.38  1590.00  | //: 32-bit, BCC_v5.51 [ C =...]
+     32_ ||   187.88   188.62  |   340.69   341.06  |   794.81   797.62  | //: 32-bit, BCC_v5.51 [ C =...]
+     64_ ||   138.19   138.28  |   169.69   169.78  |   420.75   420.84  | //: 32-bit, BCC_v5.51 [ C =...]
+    100_ ||   145.02   145.08  |   160.80   160.86  |   269.16   269.52  | //: 32-bit, BCC_v5.51 [ C =...]
+    128_ ||   112.92   112.92  |   125.39   125.44  |   210.00   210.23  | //: 32-bit, BCC_v5.51 [ C =...]
+    256_ ||   100.27   100.29  |   103.08   103.08  |   156.33   156.42  | //: 32-bit, BCC_v5.51 [ C =...]
+    512_ ||    93.98    94.00  |    91.90    91.91  |   129.40   129.41  | //: 32-bit, BCC_v5.51 [ C =...]
+   1000_ ||    93.02    93.02  |    88.44    88.44  |   118.61   118.69  | //: 32-bit, BCC_v5.51 [ C =...]
+   1024_ ||    90.80    90.81  |    86.34    86.34  |   115.78   115.86  | //: 32-bit, BCC_v5.51 [ C =...]
+   2048_ ||    89.22    89.22  |    77.11    83.54  |    95.12    95.16  | //: 32-bit, BCC_v5.51 [ C =...]
+   4096_ ||    81.62    81.62  |    75.83    75.83  |    92.17   100.45  | //: 32-bit, BCC_v5.51 [ C =...]
+   8192_ ||    81.80    88.11  |    75.18    78.19  |    90.69    92.09  | //: 32-bit, BCC_v5.51 [ C =...]
+  10000_ ||    81.32    84.70  |    76.65    78.80  |    92.85    94.82  | //: 32-bit, BCC_v5.51 [ C =...]
+  16384_ ||    83.13    83.59  |    76.92    77.00  |    92.05    93.27  | //: 32-bit, BCC_v5.51 [ C =...]
+  32768_ ||    83.07    84.01  |    76.76    77.91  |    92.12    94.08  | //: 32-bit, BCC_v5.51 [ C =...]
+ 100000_ ||    83.48    84.08  |    77.08    78.59  |    93.38   102.21  | //: 32-bit, BCC_v5.51 [ C =...]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          996 bytes |         1000 bytes |         1068 bytes | //: 32-bit, BCC_v5.51 [ C =...]
+  Block  ||        10732 bytes |        20964 bytes |        45988 bytes | //: 32-bit, BCC_v5.51 [ C =...]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:29:07,Oct  7 2008  by  'BCC_v5.51', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  2592.00  2604.00  |  4848.00  4854.00  | 22278.00 22284.00  | //: 32-bit, BCC_v5.51 [asm=...]
+      2_ ||  1287.00  1293.00  |  2430.00  2430.00  | 11139.00 11139.00  | //: 32-bit, BCC_v5.51 [asm=...]
+      4_ ||   637.50   639.00  |  1213.50  1213.50  |  5565.00  5566.50  | //: 32-bit, BCC_v5.51 [asm=...]
+      8_ ||   318.75   319.50  |   606.75   606.75  |  2782.50  2783.25  | //: 32-bit, BCC_v5.51 [asm=...]
+     10_ ||   255.60   255.60  |   486.00   486.60  |  2228.40  2228.40  | //: 32-bit, BCC_v5.51 [asm=...]
+     16_ ||   159.75   159.75  |   301.88   302.25  |  1391.25  1391.62  | //: 32-bit, BCC_v5.51 [asm=...]
+     32_ ||    78.75    78.75  |   151.31   151.31  |   695.44   695.62  | //: 32-bit, BCC_v5.51 [asm=...]
+     64_ ||    55.69    57.28  |    74.81    74.91  |   347.81   347.81  | //: 32-bit, BCC_v5.51 [asm=...]
+    100_ ||    57.42    57.48  |    69.84    69.90  |   222.60   222.60  | //: 32-bit, BCC_v5.51 [asm=...]
+    128_ ||    44.53    44.58  |    54.38    54.38  |   173.67   173.67  | //: 32-bit, BCC_v5.51 [asm=...]
+    256_ ||    38.55    38.55  |    43.99    44.02  |   129.05   129.05  | //: 32-bit, BCC_v5.51 [asm=...]
+    512_ ||    35.60    35.60  |    38.66    38.67  |   106.62   106.62  | //: 32-bit, BCC_v5.51 [asm=...]
+   1000_ ||    34.89    34.89  |    37.18    37.18  |    97.72    97.72  | //: 32-bit, BCC_v5.51 [asm=...]
+   1024_ ||    34.23    34.98  |    35.85    35.86  |    95.40    95.40  | //: 32-bit, BCC_v5.51 [asm=...]
+   2048_ ||    33.86    33.86  |    34.66    34.66  |    89.79    89.80  | //: 32-bit, BCC_v5.51 [asm=...]
+   4096_ ||    33.22    33.59  |    33.92    34.26  |    86.99    86.99  | //: 32-bit, BCC_v5.51 [asm=...]
+   8192_ ||    33.11    33.11  |    33.80    33.92  |    74.64    77.45  | //: 32-bit, BCC_v5.51 [asm=...]
+  10000_ ||    31.46    33.40  |    31.80    32.58  |    78.61    78.66  | //: 32-bit, BCC_v5.51 [asm=...]
+  16384_ ||    31.49    32.78  |    32.16    33.51  |    76.07    76.13  | //: 32-bit, BCC_v5.51 [asm=...]
+  32768_ ||    32.01    32.58  |    32.74    33.18  |    75.73    76.02  | //: 32-bit, BCC_v5.51 [asm=...]
+ 100000_ ||    32.23    32.45  |    33.33    61.75  |    84.30    85.34  | //: 32-bit, BCC_v5.51 [asm=...]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          996 bytes |         1000 bytes |         1068 bytes | //: 32-bit, BCC_v5.51 [asm=...]
+  Block  ||         7588 bytes |        16636 bytes |        38262 bytes | //: 32-bit, BCC_v5.51 [asm=...]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:29:12,Oct  7 2008  by  'MSC_v9.00', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  2484.00  2490.00  |  4830.00  4836.00  | 22182.00 22188.00  | //: 32-bit, MSC_v9.00 [asm=...]
+      2_ ||  1254.00  1254.00  |  2415.00  2415.00  | 11091.00 11091.00  | //: 32-bit, MSC_v9.00 [asm=...]
+      4_ ||   627.00   627.00  |  1207.50  1207.50  |  5545.50  5545.50  | //: 32-bit, MSC_v9.00 [asm=...]
+      8_ ||   313.50   313.50  |   603.00   603.75  |  2390.25  2478.00  | //: 32-bit, MSC_v9.00 [asm=...]
+     10_ ||   250.20   252.00  |   485.40   488.40  |  1936.80  1959.00  | //: 32-bit, MSC_v9.00 [asm=...]
+     16_ ||   156.00   156.75  |   301.50   301.50  |  1386.00  1386.00  | //: 32-bit, MSC_v9.00 [asm=...]
+     32_ ||    77.81    77.81  |   150.94   151.31  |   692.81   692.81  | //: 32-bit, MSC_v9.00 [asm=...]
+     64_ ||    56.34    56.34  |    74.81    74.81  |   343.78   346.41  | //: 32-bit, MSC_v9.00 [asm=...]
+    100_ ||    58.62    58.68  |    70.74    70.80  |   221.76   221.76  | //: 32-bit, MSC_v9.00 [asm=...]
+    128_ ||    45.47    45.47  |    55.08    55.08  |   168.94   173.02  | //: 32-bit, MSC_v9.00 [asm=...]
+    256_ ||    40.10    40.10  |    44.95    44.95  |   128.88   128.88  | //: 32-bit, MSC_v9.00 [asm=...]
+    512_ ||    37.49    37.55  |    39.94    39.94  |    92.99    92.99  | //: 32-bit, MSC_v9.00 [asm=...]
+   1000_ ||    34.12    34.16  |    35.44    35.44  |    85.27    85.31  | //: 32-bit, MSC_v9.00 [asm=...]
+   1024_ ||    33.30    33.30  |    34.58    34.59  |    83.24    83.25  | //: 32-bit, MSC_v9.00 [asm=...]
+   2048_ ||    32.70    32.70  |    36.20    36.20  |    89.82    89.82  | //: 32-bit, MSC_v9.00 [asm=...]
+   4096_ ||    35.09    35.09  |    35.50    35.57  |    87.04    87.05  | //: 32-bit, MSC_v9.00 [asm=...]
+   8192_ ||    34.83    35.38  |    35.12    35.64  |    76.07    84.71  | //: 32-bit, MSC_v9.00 [asm=...]
+  10000_ ||    34.78    34.98  |    35.36    35.36  |    86.31    86.35  | //: 32-bit, MSC_v9.00 [asm=...]
+  16384_ ||    34.76    34.80  |    35.07    35.36  |    80.55    85.21  | //: 32-bit, MSC_v9.00 [asm=...]
+  32768_ ||    32.88    33.17  |    33.06    33.37  |    75.87    76.15  | //: 32-bit, MSC_v9.00 [asm=...]
+ 100000_ ||    32.96    33.40  |    33.29    33.60  |    75.79    76.81  | //: 32-bit, MSC_v9.00 [asm=...]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [asm=...]
+  Block  ||         7588 bytes |        16636 bytes |        38262 bytes | //: 32-bit, MSC_v9.00 [asm=...]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:29:17,Oct  7 2008  by  'GCC_v3.42', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  2490.00  2496.00  |  4824.00  4836.00  | 22332.00 22356.00  | //: 32-bit, GCC_v3.42 [asm=...]
+      2_ ||  1251.00  1260.00  |  2412.00  2415.00  | 11157.00 11166.00  | //: 32-bit, GCC_v3.42 [asm=...]
+      4_ ||   621.00   622.50  |  1204.50  1204.50  |  5571.00  5572.50  | //: 32-bit, GCC_v3.42 [asm=...]
+      8_ ||   310.50   311.25  |   602.25   602.25  |  2785.50  2786.25  | //: 32-bit, GCC_v3.42 [asm=...]
+     10_ ||   249.00   249.60  |   482.40   482.40  |  2233.20  2233.80  | //: 32-bit, GCC_v3.42 [asm=...]
+     16_ ||   155.25   155.63  |   300.75   301.50  |  1393.88  1393.88  | //: 32-bit, GCC_v3.42 [asm=...]
+     32_ ||    76.50    77.06  |   151.31   151.31  |   696.38   696.56  | //: 32-bit, GCC_v3.42 [asm=...]
+     64_ ||    55.78    56.06  |    75.19    75.19  |   348.19   348.19  | //: 32-bit, GCC_v3.42 [asm=...]
+    100_ ||    58.32    58.44  |    70.80    70.80  |   222.96   222.96  | //: 32-bit, GCC_v3.42 [asm=...]
+    128_ ||    45.14    45.52  |    55.08    55.13  |   173.72   173.77  | //: 32-bit, GCC_v3.42 [asm=...]
+    256_ ||    40.03    40.13  |    44.91    44.93  |   129.33   129.33  | //: 32-bit, GCC_v3.42 [asm=...]
+    512_ ||    37.38    37.50  |    39.77    39.79  |   106.58   106.66  | //: 32-bit, GCC_v3.42 [asm=...]
+   1000_ ||    36.94    37.03  |    38.19    38.19  |    97.66    97.69  | //: 32-bit, GCC_v3.42 [asm=...]
+   1024_ ||    35.75    36.13  |    37.24    37.24  |    95.29    95.32  | //: 32-bit, GCC_v3.42 [asm=...]
+   2048_ ||    35.36    35.44  |    35.94    35.94  |    88.77    89.67  | //: 32-bit, GCC_v3.42 [asm=...]
+   4096_ ||    35.02    35.02  |    35.31    35.38  |    77.07    86.35  | //: 32-bit, GCC_v3.42 [asm=...]
+   8192_ ||    32.18    32.20  |    32.30    32.31  |    74.72    77.04  | //: 32-bit, GCC_v3.42 [asm=...]
+  10000_ ||    32.28    32.34  |    32.41    32.43  |    78.36    78.77  | //: 32-bit, GCC_v3.42 [asm=...]
+  16384_ ||    32.16    33.29  |    32.20    33.16  |    76.16    78.39  | //: 32-bit, GCC_v3.42 [asm=...]
+  32768_ ||    33.20    33.70  |    33.22    33.33  |    75.84    76.10  | //: 32-bit, GCC_v3.42 [asm=...]
+ 100000_ ||    33.26    33.96  |    33.11    33.41  |    75.75    76.29  | //: 32-bit, GCC_v3.42 [asm=...]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||         1568 bytes |         1264 bytes |         1472 bytes | //: 32-bit, GCC_v3.42 [asm=...]
+  Block  ||         7588 bytes |        16636 bytes |        38262 bytes | //: 32-bit, GCC_v3.42 [asm=...]
+
+Skein performance, in clks per byte, dtMin =   24 clks.
+         [compiled 14:29:22,Oct  7 2008  by  'MSC_v9.00', 64-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||   672.00   672.00  |  1068.00  1068.00  |  1920.00  1926.00  | //: 64-bit, MSC_v9.00 [asm=...]
+      2_ ||   336.00   336.00  |   534.00   534.00  |   963.00   963.00  | //: 64-bit, MSC_v9.00 [asm=...]
+      4_ ||   166.50   168.00  |   267.00   267.00  |   481.50   483.00  | //: 64-bit, MSC_v9.00 [asm=...]
+      8_ ||    81.00    81.00  |   130.50   131.25  |   240.00   240.75  | //: 64-bit, MSC_v9.00 [asm=...]
+     10_ ||    64.80    65.40  |   107.40   108.00  |   192.00   192.60  | //: 64-bit, MSC_v9.00 [asm=...]
+     16_ ||    40.13    40.13  |    65.63    65.63  |   120.00   120.00  | //: 64-bit, MSC_v9.00 [asm=...]
+     32_ ||    20.06    20.06  |    32.81    32.81  |    59.63    59.81  | //: 64-bit, MSC_v9.00 [asm=...]
+     64_ ||    14.25    14.34  |    16.31    16.31  |    32.44    32.44  | //: 64-bit, MSC_v9.00 [asm=...]
+    100_ ||    15.54    15.60  |    16.20    16.26  |    21.06    21.06  | //: 64-bit, MSC_v9.00 [asm=...]
+    128_ ||    11.81    11.86  |    11.44    11.48  |    14.86    14.86  | //: 64-bit, MSC_v9.00 [asm=...]
+    256_ ||     9.28     9.28  |     8.81     8.81  |    10.83    10.83  | //: 64-bit, MSC_v9.00 [asm=...]
+    512_ ||     8.43     8.43  |     7.46     7.46  |     8.66     8.66  | //: 64-bit, MSC_v9.00 [asm=...]
+   1000_ ||     8.18     8.18  |     6.97     6.97  |     7.77     7.78  | //: 64-bit, MSC_v9.00 [asm=...]
+   1024_ ||     7.98     8.50  |     6.81     7.38  |     7.58     7.58  | //: 64-bit, MSC_v9.00 [asm=...]
+   2048_ ||     7.75     7.75  |     6.47     6.47  |     7.05     7.05  | //: 64-bit, MSC_v9.00 [asm=...]
+   4096_ ||     7.65     7.65  |     6.30     6.30  |     6.78     6.78  | //: 64-bit, MSC_v9.00 [asm=...]
+   8192_ ||     7.59     7.59  |     6.21     6.21  |     6.64     6.64  | //: 64-bit, MSC_v9.00 [asm=...]
+  10000_ ||     7.59     7.59  |     6.23     6.23  |     6.69     6.69  | //: 64-bit, MSC_v9.00 [asm=...]
+  16384_ ||     7.57     7.57  |     6.17     6.17  |     6.57     6.57  | //: 64-bit, MSC_v9.00 [asm=...]
+  32768_ ||     7.55     7.56  |     6.15     6.15  |     6.53     6.55  | //: 64-bit, MSC_v9.00 [asm=...]
+ 100000_ ||     7.55     7.71  |     6.14     6.38  |     6.56     6.86  | //: 64-bit, MSC_v9.00 [asm=...]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [asm=...]
+  Block  ||         2323 bytes |         4733 bytes |        11817 bytes | //: 64-bit, MSC_v9.00 [asm=...]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:29:24,Oct  7 2008  by  'GCC_v3.42', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  4728.00  4728.00  |  8352.00  8352.00  | 20034.00 20040.00  | //: 32-bit, GCC_v3.42 [ C =111]
+      2_ ||  2370.00  2370.00  |  4179.00  4179.00  |  9261.00  9264.00  | //: 32-bit, GCC_v3.42 [ C =111]
+      4_ ||  1092.00  1096.50  |  1924.50  1926.00  |  4624.50  4624.50  | //: 32-bit, GCC_v3.42 [ C =111]
+      8_ ||   544.50   545.25  |  1040.25  1047.75  |  2312.25  2313.00  | //: 32-bit, GCC_v3.42 [ C =111]
+     10_ ||   436.20   436.80  |   768.60   769.20  |  1852.20  1852.20  | //: 32-bit, GCC_v3.42 [ C =111]
+     16_ ||   272.63   273.00  |   480.38   519.38  |  1156.88  1157.25  | //: 32-bit, GCC_v3.42 [ C =111]
+     32_ ||   135.94   135.94  |   240.56   240.75  |   579.00   579.00  | //: 32-bit, GCC_v3.42 [ C =111]
+     64_ ||   100.88   101.53  |   129.75   129.84  |   289.59   289.69  | //: 32-bit, GCC_v3.42 [ C =111]
+    100_ ||   106.44   106.44  |   113.94   114.18  |   185.46   200.94  | //: 32-bit, GCC_v3.42 [ C =111]
+    128_ ||    83.06    83.06  |    89.11    89.11  |   144.61   144.61  | //: 32-bit, GCC_v3.42 [ C =111]
+    256_ ||    73.83    79.99  |    73.34    79.45  |   107.55   107.55  | //: 32-bit, GCC_v3.42 [ C =111]
+    512_ ||    69.16    69.18  |    65.32    65.39  |    88.89    88.92  | //: 32-bit, GCC_v3.42 [ C =111]
+   1000_ ||    68.45    68.45  |    62.84    62.92  |    81.36    81.38  | //: 32-bit, GCC_v3.42 [ C =111]
+   1024_ ||    66.83    66.86  |    61.34    61.34  |    79.42    79.43  | //: 32-bit, GCC_v3.42 [ C =111]
+   2048_ ||    65.67    65.73  |    59.33    59.33  |    74.70    74.71  | //: 32-bit, GCC_v3.42 [ C =111]
+   4096_ ||    65.08    65.15  |    58.33    58.33  |    72.33    72.34  | //: 32-bit, GCC_v3.42 [ C =111]
+   8192_ ||    65.76    70.08  |    62.66    62.66  |    77.08    77.15  | //: 32-bit, GCC_v3.42 [ C =111]
+  10000_ ||    70.01    70.33  |    62.84    62.92  |    77.70    77.70  | //: 32-bit, GCC_v3.42 [ C =111]
+  16384_ ||    69.93    70.32  |    62.63    62.71  |    72.64    72.73  | //: 32-bit, GCC_v3.42 [ C =111]
+  32768_ ||    69.31    69.90  |    58.90    59.54  |    73.37    76.24  | //: 32-bit, GCC_v3.42 [ C =111]
+ 100000_ ||    67.54    70.40  |    59.09    59.39  |    72.65    73.26  | //: 32-bit, GCC_v3.42 [ C =111]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||         1568 bytes |         1264 bytes |         1472 bytes | //: 32-bit, GCC_v3.42 [ C =111]
+  Block  ||         2928 bytes |         5568 bytes |        11712 bytes | //: 32-bit, GCC_v3.42 [ C =111]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:29:31,Oct  7 2008  by  'MSC_v9.00', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  2952.00  2958.00  |  6030.00  6036.00  | 13668.00 13674.00  | //: 32-bit, MSC_v9.00 [ C =111]
+      2_ ||  1476.00  1476.00  |  3015.00  3015.00  |  6831.00  6834.00  | //: 32-bit, MSC_v9.00 [ C =111]
+      4_ ||   738.00   739.50  |  1507.50  1507.50  |  3415.50  3415.50  | //: 32-bit, MSC_v9.00 [ C =111]
+      8_ ||   369.00   369.75  |   751.50   751.50  |  1707.00  1707.00  | //: 32-bit, MSC_v9.00 [ C =111]
+     10_ ||   295.80   295.80  |   603.00   603.60  |  1366.80  1366.80  | //: 32-bit, MSC_v9.00 [ C =111]
+     16_ ||   184.88   185.25  |   376.50   376.50  |   855.38   855.38  | //: 32-bit, MSC_v9.00 [ C =111]
+     32_ ||    91.31    91.50  |   188.44   188.63  |   427.50   427.50  | //: 32-bit, MSC_v9.00 [ C =111]
+     64_ ||    66.56    66.66  |    93.84    93.84  |   213.56   213.66  | //: 32-bit, MSC_v9.00 [ C =111]
+    100_ ||    69.96    70.02  |    88.98    89.04  |   136.92   137.52  | //: 32-bit, MSC_v9.00 [ C =111]
+    128_ ||    54.14    54.23  |    69.52    69.75  |   106.69   106.88  | //: 32-bit, MSC_v9.00 [ C =111]
+    256_ ||    47.70    47.77  |    57.12    57.19  |    79.24    79.29  | //: 32-bit, MSC_v9.00 [ C =111]
+    512_ ||    44.46    44.54  |    50.75    50.81  |    65.52    65.55  | //: 32-bit, MSC_v9.00 [ C =111]
+   1000_ ||    43.90    43.96  |    48.78    48.85  |    60.08    60.11  | //: 32-bit, MSC_v9.00 [ C =111]
+   1024_ ||    42.83    42.87  |    47.44    47.65  |    58.49    58.51  | //: 32-bit, MSC_v9.00 [ C =111]
+   2048_ ||    42.17    42.17  |    45.83    45.83  |    55.01    55.16  | //: 32-bit, MSC_v9.00 [ C =111]
+   4096_ ||    41.76    41.76  |    45.02    45.03  |    53.27    53.44  | //: 32-bit, MSC_v9.00 [ C =111]
+   8192_ ||    38.35    41.55  |    41.20    41.26  |    48.54    51.37  | //: 32-bit, MSC_v9.00 [ C =111]
+  10000_ ||    41.53    41.59  |    44.76    44.80  |    53.01    53.01  | //: 32-bit, MSC_v9.00 [ C =111]
+  16384_ ||    41.38    41.69  |    44.43    44.47  |    52.00    52.07  | //: 32-bit, MSC_v9.00 [ C =111]
+  32768_ ||    41.36    41.38  |    44.43    44.44  |    51.86    52.07  | //: 32-bit, MSC_v9.00 [ C =111]
+ 100000_ ||    41.32    41.60  |    44.52    44.62  |    51.75    51.92  | //: 32-bit, MSC_v9.00 [ C =111]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [ C =111]
+  Block  ||         1712 bytes |         3664 bytes |         7200 bytes | //: 32-bit, MSC_v9.00 [ C =111]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:29:36,Oct  7 2008  by  'MSC_v6.00', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  9150.00  9156.00  | 16794.00 16836.00  | 36456.00 37386.00  | //: 32-bit, MSC_v6.00 [ C =111]
+      2_ ||  4335.00  4335.00  |  8244.00  8352.00  | 18156.00 18246.00  | //: 32-bit, MSC_v6.00 [ C =111]
+      4_ ||  2167.50  2167.50  |  4117.50  4201.50  |  9031.50  9060.00  | //: 32-bit, MSC_v6.00 [ C =111]
+      8_ ||  1083.00  1083.75  |  2122.50  2125.50  |  4515.00  4611.00  | //: 32-bit, MSC_v6.00 [ C =111]
+     10_ ||   874.80   874.80  |  1683.60  1695.60  |  3621.00  3705.00  | //: 32-bit, MSC_v6.00 [ C =111]
+     16_ ||   541.50   541.88  |  1041.38  1042.50  |  2268.38  2274.00  | //: 32-bit, MSC_v6.00 [ C =111]
+     32_ ||   271.88   272.25  |   515.63   526.13  |  1133.81  1139.06  | //: 32-bit, MSC_v6.00 [ C =111]
+     64_ ||   201.00   201.09  |   259.59   263.72  |   567.47   569.34  | //: 32-bit, MSC_v6.00 [ C =111]
+    100_ ||   211.92   211.98  |   250.32   251.10  |   363.06   363.60  | //: 32-bit, MSC_v6.00 [ C =111]
+    128_ ||   166.78   167.11  |   196.73   198.28  |   283.45   284.20  | //: 32-bit, MSC_v6.00 [ C =111]
+    256_ ||   147.94   147.94  |   160.57   160.71  |   212.18   212.72  | //: 32-bit, MSC_v6.00 [ C =111]
+    512_ ||   139.32   139.37  |   143.68   143.70  |   175.95   176.36  | //: 32-bit, MSC_v6.00 [ C =111]
+   1000_ ||   138.17   138.18  |   140.23   140.80  |   168.46   168.46  | //: 32-bit, MSC_v6.00 [ C =111]
+   1024_ ||   134.92   134.92  |   135.90   136.72  |   164.48   164.48  | //: 32-bit, MSC_v6.00 [ C =111]
+   2048_ ||   132.76   132.76  |   132.19   132.25  |   154.34   155.67  | //: 32-bit, MSC_v6.00 [ C =111]
+   4096_ ||   131.66   131.74  |   132.76   133.34  |   149.64   150.49  | //: 32-bit, MSC_v6.00 [ C =111]
+   8192_ ||   131.21   135.88  |   120.29   124.46  |   142.16   147.73  | //: 32-bit, MSC_v6.00 [ C =111]
+  10000_ ||   124.18   125.11  |   123.38   125.46  |   139.12   140.88  | //: 32-bit, MSC_v6.00 [ C =111]
+  16384_ ||   124.27   130.94  |   122.04   127.55  |   137.91   146.32  | //: 32-bit, MSC_v6.00 [ C =111]
+  32768_ ||   123.57   128.76  |   120.72   121.97  |   138.10   140.89  | //: 32-bit, MSC_v6.00 [ C =111]
+ 100000_ ||   123.30   129.21  |   123.83   125.61  |   145.19   145.40  | //: 32-bit, MSC_v6.00 [ C =111]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||         1486 bytes |         1348 bytes |         1445 bytes | //: 32-bit, MSC_v6.00 [ C =111]
+  Block  ||         2435 bytes |         5119 bytes |         8894 bytes | //: 32-bit, MSC_v6.00 [ C =111]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:29:48,Oct  7 2008  by  'MSC_v4.20', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  5418.00  5430.00  |  9450.00  9462.00  | 20436.00 20472.00  | //: 32-bit, MSC_v4.20 [ C =111]
+      2_ ||  2709.00  2721.00  |  4725.00  4731.00  | 10212.00 10245.00  | //: 32-bit, MSC_v4.20 [ C =111]
+      4_ ||  1351.50  1354.50  |  2359.50  2361.00  |  5097.00  5107.50  | //: 32-bit, MSC_v4.20 [ C =111]
+      8_ ||   675.00   678.75  |  1179.75  1179.75  |  2549.25  2552.25  | //: 32-bit, MSC_v4.20 [ C =111]
+     10_ ||   540.60   546.60  |   943.20   944.40  |  2041.20  2041.80  | //: 32-bit, MSC_v4.20 [ C =111]
+     16_ ||   337.88   338.25  |   589.50   589.50  |  1273.88  1275.38  | //: 32-bit, MSC_v4.20 [ C =111]
+     32_ ||   167.81   167.81  |   294.94   295.13  |   636.75   637.13  | //: 32-bit, MSC_v4.20 [ C =111]
+     64_ ||   124.41   124.41  |   147.19   147.84  |   318.28   318.47  | //: 32-bit, MSC_v4.20 [ C =111]
+    100_ ||   131.46   131.52  |   140.10   140.28  |   203.76   203.94  | //: 32-bit, MSC_v4.20 [ C =111]
+    128_ ||   102.42   102.47  |   109.22   109.41  |   159.05   159.38  | //: 32-bit, MSC_v4.20 [ C =111]
+    256_ ||    91.10    91.27  |    90.59    90.59  |   118.73   118.78  | //: 32-bit, MSC_v4.20 [ C =111]
+    512_ ||    85.43    85.43  |    80.78    80.79  |    98.43    98.48  | //: 32-bit, MSC_v4.20 [ C =111]
+   1000_ ||    84.56    84.56  |    77.74    77.75  |    90.24    90.28  | //: 32-bit, MSC_v4.20 [ C =111]
+   1024_ ||    82.55    82.55  |    75.83    75.83  |    88.15    88.19  | //: 32-bit, MSC_v4.20 [ C =111]
+   2048_ ||    81.07    81.07  |    73.35    73.36  |    83.00    83.02  | //: 32-bit, MSC_v4.20 [ C =111]
+   4096_ ||    80.34    80.36  |    72.12    72.13  |    80.42    80.44  | //: 32-bit, MSC_v4.20 [ C =111]
+   8192_ ||    79.97    80.54  |    71.56    71.64  |    79.11    79.62  | //: 32-bit, MSC_v4.20 [ C =111]
+  10000_ ||    75.11    80.03  |    66.25    69.37  |    73.59    74.99  | //: 32-bit, MSC_v4.20 [ C =111]
+  16384_ ||    75.57    80.04  |    67.66    71.51  |    74.32    74.42  | //: 32-bit, MSC_v4.20 [ C =111]
+  32768_ ||    75.61    80.15  |    67.03    67.84  |    74.04    78.41  | //: 32-bit, MSC_v4.20 [ C =111]
+ 100000_ ||    77.96    80.31  |    67.58    67.84  |    74.31    74.73  | //: 32-bit, MSC_v4.20 [ C =111]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||         1152 bytes |         1024 bytes |         1088 bytes | //: 32-bit, MSC_v4.20 [ C =111]
+  Block  ||         2064 bytes |         3840 bytes |         7616 bytes | //: 32-bit, MSC_v4.20 [ C =111]
+
+Skein performance, in clks per byte, dtMin =   24 clks.
+         [compiled 14:29:54,Oct  7 2008  by  'MSC_v9.00', 64-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||   780.00   786.00  |  1422.00  1434.00  |  3810.00  3816.00  | //: 64-bit, MSC_v9.00 [ C =111]
+      2_ ||   384.00   390.00  |   705.00   708.00  |  1902.00  1902.00  | //: 64-bit, MSC_v9.00 [ C =111]
+      4_ ||   193.50   193.50  |   355.50   355.50  |   951.00   952.50  | //: 64-bit, MSC_v9.00 [ C =111]
+      8_ ||    93.75    93.75  |   171.00   171.75  |   474.75   475.50  | //: 64-bit, MSC_v9.00 [ C =111]
+     10_ ||    75.60    76.20  |   140.40   140.40  |   380.40   381.00  | //: 64-bit, MSC_v9.00 [ C =111]
+     16_ ||    51.38    51.38  |    93.00    93.00  |   257.25   257.63  | //: 64-bit, MSC_v9.00 [ C =111]
+     32_ ||    25.31    25.31  |    46.50    46.50  |   118.69   118.69  | //: 64-bit, MSC_v9.00 [ C =111]
+     64_ ||    16.69    16.69  |    21.38    21.38  |    59.53    59.53  | //: 64-bit, MSC_v9.00 [ C =111]
+    100_ ||    17.16    17.22  |    20.52    21.00  |    38.22    38.28  | //: 64-bit, MSC_v9.00 [ C =111]
+    128_ ||    13.27    13.27  |    15.80    15.80  |    29.63    29.67  | //: 64-bit, MSC_v9.00 [ C =111]
+    256_ ||    11.16    11.18  |    12.61    12.73  |    22.10    22.10  | //: 64-bit, MSC_v9.00 [ C =111]
+    512_ ||    10.05    10.07  |    11.00    11.07  |    18.18    19.68  | //: 64-bit, MSC_v9.00 [ C =111]
+   1000_ ||     9.69     9.69  |    10.42    10.42  |    16.51    16.51  | //: 64-bit, MSC_v9.00 [ C =111]
+   1024_ ||     9.44     9.44  |    10.18    10.18  |    16.11    16.12  | //: 64-bit, MSC_v9.00 [ C =111]
+   2048_ ||     9.21     9.21  |     9.62     9.62  |    15.06    15.06  | //: 64-bit, MSC_v9.00 [ C =111]
+   4096_ ||     9.10     9.10  |     9.36     9.37  |    14.55    14.55  | //: 64-bit, MSC_v9.00 [ C =111]
+   8192_ ||     8.97     8.97  |     9.20     9.21  |    14.48    14.66  | //: 64-bit, MSC_v9.00 [ C =111]
+  10000_ ||     8.97     8.97  |     9.38     9.38  |    14.38    14.40  | //: 64-bit, MSC_v9.00 [ C =111]
+  16384_ ||     8.95     9.01  |     9.26     9.26  |    14.16    14.29  | //: 64-bit, MSC_v9.00 [ C =111]
+  32768_ ||     8.90     9.24  |     9.18     9.18  |    14.46    14.75  | //: 64-bit, MSC_v9.00 [ C =111]
+ 100000_ ||     9.18     9.71  |     9.35     9.49  |    14.79    14.99  | //: 64-bit, MSC_v9.00 [ C =111]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [ C =111]
+  Block  ||          704 bytes |         1456 bytes |         2976 bytes | //: 64-bit, MSC_v9.00 [ C =111]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:29:57,Oct  7 2008  by  'BCC_v5.51', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  6420.00  6420.00  | 11040.00 11040.00  | 23358.00 23364.00  | //: 32-bit, BCC_v5.51 [ C =111]
+      2_ ||  3210.00  3210.00  |  5517.00  5520.00  | 11679.00 11682.00  | //: 32-bit, BCC_v5.51 [ C =111]
+      4_ ||  1605.00  1605.00  |  2758.50  2758.50  |  5832.00  5833.50  | //: 32-bit, BCC_v5.51 [ C =111]
+      8_ ||   802.50   802.50  |  1379.25  1379.25  |  2916.00  2916.75  | //: 32-bit, BCC_v5.51 [ C =111]
+     10_ ||   642.00   642.00  |  1103.40  1103.40  |  2335.80  2335.80  | //: 32-bit, BCC_v5.51 [ C =111]
+     16_ ||   400.88   401.25  |   689.25   689.62  |  1458.00  1458.00  | //: 32-bit, BCC_v5.51 [ C =111]
+     32_ ||   199.50   199.50  |   344.44   344.44  |   729.00   729.00  | //: 32-bit, BCC_v5.51 [ C =111]
+     64_ ||   146.06   146.25  |   171.66   172.50  |   364.41   364.50  | //: 32-bit, BCC_v5.51 [ C =111]
+    100_ ||   152.28   152.28  |   162.78   162.78  |   233.16   233.16  | //: 32-bit, BCC_v5.51 [ C =111]
+    128_ ||   118.69   118.69  |   126.89   126.89  |   181.88   181.88  | //: 32-bit, BCC_v5.51 [ C =111]
+    256_ ||   104.62   104.62  |   104.48   104.48  |   135.30   135.33  | //: 32-bit, BCC_v5.51 [ C =111]
+    512_ ||    97.50    97.50  |    93.13    93.14  |   112.00   112.00  | //: 32-bit, BCC_v5.51 [ C =111]
+   1000_ ||    96.26    96.26  |    89.53    89.54  |   102.70   102.71  | //: 32-bit, BCC_v5.51 [ C =111]
+   1024_ ||    93.91    93.91  |    87.40    87.40  |   100.27   100.27  | //: 32-bit, BCC_v5.51 [ C =111]
+   2048_ ||    92.14    92.14  |    84.56    84.56  |    94.38    94.39  | //: 32-bit, BCC_v5.51 [ C =111]
+   4096_ ||    91.28    91.28  |    76.72    83.12  |    84.42    86.14  | //: 32-bit, BCC_v5.51 [ C =111]
+   8192_ ||    83.85    86.88  |    76.06    80.17  |    83.06    87.27  | //: 32-bit, BCC_v5.51 [ C =111]
+  10000_ ||    83.92    87.25  |    76.30    83.56  |    86.42    87.19  | //: 32-bit, BCC_v5.51 [ C =111]
+  16384_ ||    85.71    87.12  |    77.78    77.82  |    84.43    84.51  | //: 32-bit, BCC_v5.51 [ C =111]
+  32768_ ||    85.60    86.59  |    77.64    78.17  |    84.32    84.94  | //: 32-bit, BCC_v5.51 [ C =111]
+ 100000_ ||    86.18    87.75  |    78.03    79.63  |    84.77    88.79  | //: 32-bit, BCC_v5.51 [ C =111]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          996 bytes |         1000 bytes |         1068 bytes | //: 32-bit, BCC_v5.51 [ C =111]
+  Block  ||         1888 bytes |         3028 bytes |         5864 bytes | //: 32-bit, BCC_v5.51 [ C =111]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:30:04,Oct  7 2008  by  'BCC_v5.51', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  2664.00  2664.00  |  4998.00  4998.00  | 10704.00 10704.00  | //: 32-bit, BCC_v5.51 [asm=111]
+      2_ ||  1338.00  1338.00  |  2505.00  2508.00  |  5352.00  5352.00  | //: 32-bit, BCC_v5.51 [asm=111]
+      4_ ||   669.00   669.00  |  1246.50  1246.50  |  2668.50  2670.00  | //: 32-bit, BCC_v5.51 [asm=111]
+      8_ ||   334.50   334.50  |   623.25   623.25  |  1334.25  1334.25  | //: 32-bit, BCC_v5.51 [asm=111]
+     10_ ||   266.40   266.40  |   501.00   501.00  |  1058.40  1058.40  | //: 32-bit, BCC_v5.51 [asm=111]
+     16_ ||   166.50   166.50  |   312.75   321.00  |   628.50   629.25  | //: 32-bit, BCC_v5.51 [asm=111]
+     32_ ||    79.88    79.88  |   147.75   147.75  |   312.19   312.38  | //: 32-bit, BCC_v5.51 [asm=111]
+     64_ ||    56.53    56.53  |    73.22    73.22  |   156.09   156.09  | //: 32-bit, BCC_v5.51 [asm=111]
+    100_ ||    58.08    58.08  |    68.52    74.10  |    99.36   107.52  | //: 32-bit, BCC_v5.51 [asm=111]
+    128_ ||    45.19    45.23  |    53.20    53.20  |    77.81    77.81  | //: 32-bit, BCC_v5.51 [asm=111]
+    256_ ||    39.26    39.28  |    43.24    43.24  |    57.52    62.32  | //: 32-bit, BCC_v5.51 [asm=111]
+    512_ ||    36.13    36.13  |    37.76    37.77  |    47.17    47.24  | //: 32-bit, BCC_v5.51 [asm=111]
+   1000_ ||    35.51    35.71  |    36.22    36.23  |    42.92    43.04  | //: 32-bit, BCC_v5.51 [asm=111]
+   1024_ ||    34.51    34.51  |    34.78    35.12  |    42.05    42.05  | //: 32-bit, BCC_v5.51 [asm=111]
+   2048_ ||    33.69    33.70  |    33.82    33.83  |    38.84    39.04  | //: 32-bit, BCC_v5.51 [asm=111]
+   4096_ ||    32.01    33.99  |    33.64    33.64  |    37.82    37.97  | //: 32-bit, BCC_v5.51 [asm=111]
+   8192_ ||    31.77    32.58  |    32.80    33.00  |    36.98    37.59  | //: 32-bit, BCC_v5.51 [asm=111]
+  10000_ ||    33.75    33.75  |    33.13    33.25  |    37.32    37.86  | //: 32-bit, BCC_v5.51 [asm=111]
+  16384_ ||    31.90    36.52  |    35.86    35.90  |    37.26    40.33  | //: 32-bit, BCC_v5.51 [asm=111]
+  32768_ ||    34.29    34.47  |    33.87    34.03  |    37.77    38.04  | //: 32-bit, BCC_v5.51 [asm=111]
+ 100000_ ||    33.20    34.48  |    33.75    33.91  |    37.98    38.23  | //: 32-bit, BCC_v5.51 [asm=111]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          996 bytes |         1000 bytes |         1068 bytes | //: 32-bit, BCC_v5.51 [asm=111]
+  Block  ||         1276 bytes |         2532 bytes |         4983 bytes | //: 32-bit, BCC_v5.51 [asm=111]
+
+Skein performance, in clks per byte, dtMin =   24 clks.
+         [compiled 14:30:08,Oct  7 2008  by  'MSC_v9.00', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  2580.00  2598.00  |  4842.00  4848.00  | 10578.00 10602.00  | //: 32-bit, MSC_v9.00 [asm=111]
+      2_ ||  1299.00  1302.00  |  2445.00  2445.00  |  5277.00  5283.00  | //: 32-bit, MSC_v9.00 [asm=111]
+      4_ ||   648.00   648.00  |  1213.50  1215.00  |  2644.50  2649.00  | //: 32-bit, MSC_v9.00 [asm=111]
+      8_ ||   324.00   324.75  |   610.50   610.50  |  1322.25  1323.00  | //: 32-bit, MSC_v9.00 [asm=111]
+     10_ ||   259.80   259.80  |   484.20   484.20  |  1059.60  1060.20  | //: 32-bit, MSC_v9.00 [asm=111]
+     16_ ||   162.00   162.38  |   302.63   302.63  |   660.38   662.63  | //: 32-bit, MSC_v9.00 [asm=111]
+     32_ ||    80.81    81.00  |   141.56   141.56  |   308.63   308.63  | //: 32-bit, MSC_v9.00 [asm=111]
+     64_ ||    54.38    54.47  |    70.41    70.41  |   154.41   154.59  | //: 32-bit, MSC_v9.00 [asm=111]
+    100_ ||    57.18    57.24  |    66.42    66.48  |    98.40    98.46  | //: 32-bit, MSC_v9.00 [asm=111]
+    128_ ||    48.28    48.28  |    51.75    51.75  |    76.97    77.02  | //: 32-bit, MSC_v9.00 [asm=111]
+    256_ ||    39.05    39.05  |    42.45    42.47  |    56.95    56.95  | //: 32-bit, MSC_v9.00 [asm=111]
+    512_ ||    36.09    36.11  |    37.65    37.66  |    47.05    47.06  | //: 32-bit, MSC_v9.00 [asm=111]
+   1000_ ||    35.56    35.59  |    35.96    35.96  |    42.79    42.80  | //: 32-bit, MSC_v9.00 [asm=111]
+   1024_ ||    34.62    34.63  |    35.28    35.28  |    41.47    41.47  | //: 32-bit, MSC_v9.00 [asm=111]
+   2048_ ||    33.91    33.91  |    34.00    34.08  |    39.33    39.33  | //: 32-bit, MSC_v9.00 [asm=111]
+   4096_ ||    33.38    33.66  |    33.49    33.49  |    38.04    38.23  | //: 32-bit, MSC_v9.00 [asm=111]
+   8192_ ||    33.15    33.23  |    32.76    33.07  |    37.21    37.22  | //: 32-bit, MSC_v9.00 [asm=111]
+  10000_ ||    33.69    36.50  |    33.29    33.42  |    37.98    41.34  | //: 32-bit, MSC_v9.00 [asm=111]
+  16384_ ||    33.07    35.17  |    33.08    34.97  |    37.10    38.12  | //: 32-bit, MSC_v9.00 [asm=111]
+  32768_ ||    34.35    34.53  |    33.80    34.05  |    38.21    40.46  | //: 32-bit, MSC_v9.00 [asm=111]
+ 100000_ ||    33.96    34.57  |    33.93    35.69  |    38.04    38.20  | //: 32-bit, MSC_v9.00 [asm=111]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [asm=111]
+  Block  ||         1276 bytes |         2532 bytes |         4983 bytes | //: 32-bit, MSC_v9.00 [asm=111]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:30:13,Oct  7 2008  by  'GCC_v3.42', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  2514.00  2514.00  |  4836.00  4836.00  | 10392.00 10398.00  | //: 32-bit, GCC_v3.42 [asm=111]
+      2_ ||  1254.00  1260.00  |  2409.00  2412.00  |  5181.00  5184.00  | //: 32-bit, GCC_v3.42 [asm=111]
+      4_ ||   628.50   628.50  |  1204.50  1204.50  |  2596.50  2598.00  | //: 32-bit, GCC_v3.42 [asm=111]
+      8_ ||   312.75   312.75  |   602.25   603.00  |  1298.25  1299.00  | //: 32-bit, GCC_v3.42 [asm=111]
+     10_ ||   250.80   251.40  |   482.40   483.00  |  1035.00  1035.60  | //: 32-bit, GCC_v3.42 [asm=111]
+     16_ ||   157.50   157.50  |   302.25   302.63  |   652.50   652.50  | //: 32-bit, GCC_v3.42 [asm=111]
+     32_ ||    78.19    78.38  |   151.88   152.06  |   326.81   326.81  | //: 32-bit, GCC_v3.42 [asm=111]
+     64_ ||    57.09    57.19  |    75.47    75.47  |   163.31   163.41  | //: 32-bit, GCC_v3.42 [asm=111]
+    100_ ||    60.06    60.06  |    71.22    71.28  |   104.58   104.58  | //: 32-bit, GCC_v3.42 [asm=111]
+    128_ ||    46.83    46.88  |    55.45    55.50  |    81.33    81.38  | //: 32-bit, GCC_v3.42 [asm=111]
+    256_ ||    41.32    41.34  |    45.47    45.49  |    59.91    59.91  | //: 32-bit, GCC_v3.42 [asm=111]
+    512_ ||    38.51    38.52  |    40.16    40.16  |    49.49    49.49  | //: 32-bit, GCC_v3.42 [asm=111]
+   1000_ ||    37.92    37.93  |    38.60    38.60  |    45.40    45.41  | //: 32-bit, GCC_v3.42 [asm=111]
+   1024_ ||    37.08    37.08  |    37.93    38.33  |    45.24    45.25  | //: 32-bit, GCC_v3.42 [asm=111]
+   2048_ ||    36.55    36.56  |    36.88    36.88  |    42.42    42.49  | //: 32-bit, GCC_v3.42 [asm=111]
+   4096_ ||    35.77    35.77  |    33.56    37.02  |    37.73    39.65  | //: 32-bit, GCC_v3.42 [asm=111]
+   8192_ ||    32.68    34.17  |    33.13    33.19  |    38.41    40.47  | //: 32-bit, GCC_v3.42 [asm=111]
+  10000_ ||    35.92    36.59  |    35.00    36.14  |    37.65    39.24  | //: 32-bit, GCC_v3.42 [asm=111]
+  16384_ ||    33.37    34.20  |    32.77    33.93  |    36.86    37.94  | //: 32-bit, GCC_v3.42 [asm=111]
+  32768_ ||    34.22    34.41  |    33.82    34.06  |    37.39    37.74  | //: 32-bit, GCC_v3.42 [asm=111]
+ 100000_ ||    34.23    34.34  |    33.81    34.20  |    37.34    37.86  | //: 32-bit, GCC_v3.42 [asm=111]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||         1568 bytes |         1264 bytes |         1472 bytes | //: 32-bit, GCC_v3.42 [asm=111]
+  Block  ||         1276 bytes |         2532 bytes |         4983 bytes | //: 32-bit, GCC_v3.42 [asm=111]
+
+Skein performance, in clks per byte, dtMin =   24 clks.
+         [compiled 14:30:17,Oct  7 2008  by  'MSC_v9.00', 64-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||   678.00   678.00  |  1098.00  1098.00  |  2034.00  2040.00  | //: 64-bit, MSC_v9.00 [asm=111]
+      2_ ||   339.00   339.00  |   546.00   546.00  |  1017.00  1020.00  | //: 64-bit, MSC_v9.00 [asm=111]
+      4_ ||   168.00   169.50  |   273.00   273.00  |   510.00   511.50  | //: 64-bit, MSC_v9.00 [asm=111]
+      8_ ||    81.75    82.50  |   134.25   134.25  |   254.25   255.00  | //: 64-bit, MSC_v9.00 [asm=111]
+     10_ ||    66.60    66.60  |   109.80   109.80  |   204.00   204.00  | //: 64-bit, MSC_v9.00 [asm=111]
+     16_ ||    40.88    40.88  |    66.75    67.13  |   127.50   127.50  | //: 64-bit, MSC_v9.00 [asm=111]
+     32_ ||    20.25    20.44  |    33.56    33.56  |    63.56    63.56  | //: 64-bit, MSC_v9.00 [asm=111]
+     64_ ||    14.91    15.00  |    16.50    16.50  |    31.69    31.69  | //: 64-bit, MSC_v9.00 [asm=111]
+    100_ ||    15.48    16.68  |    16.98    16.98  |    22.38    22.38  | //: 64-bit, MSC_v9.00 [asm=111]
+    128_ ||    12.80    12.80  |    12.94    12.94  |    15.84    15.89  | //: 64-bit, MSC_v9.00 [asm=111]
+    256_ ||     9.84     9.84  |     9.33     9.33  |    11.60    11.63  | //: 64-bit, MSC_v9.00 [asm=111]
+    512_ ||     8.75     8.79  |     8.53     8.57  |     9.36     9.38  | //: 64-bit, MSC_v9.00 [asm=111]
+   1000_ ||     8.45     8.45  |     7.93     7.93  |     8.39     8.39  | //: 64-bit, MSC_v9.00 [asm=111]
+   1024_ ||     8.25     8.25  |     7.14     7.14  |     8.19     8.19  | //: 64-bit, MSC_v9.00 [asm=111]
+   2048_ ||     8.00     8.00  |     6.77     7.33  |     7.58     7.58  | //: 64-bit, MSC_v9.00 [asm=111]
+   4096_ ||     7.88     7.88  |     6.58     6.58  |     7.29     7.29  | //: 64-bit, MSC_v9.00 [asm=111]
+   8192_ ||     7.81     7.81  |     6.49     6.49  |     7.13     7.15  | //: 64-bit, MSC_v9.00 [asm=111]
+  10000_ ||     7.81     7.81  |     6.50     6.50  |     7.18     7.18  | //: 64-bit, MSC_v9.00 [asm=111]
+  16384_ ||     7.79     7.79  |     6.42     6.42  |     7.04     7.04  | //: 64-bit, MSC_v9.00 [asm=111]
+  32768_ ||     7.77     7.77  |     6.40     6.40  |     7.03     7.03  | //: 64-bit, MSC_v9.00 [asm=111]
+ 100000_ ||     8.08     8.09  |     6.40     6.71  |     6.98     7.21  | //: 64-bit, MSC_v9.00 [asm=111]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [asm=111]
+  Block  ||          664 bytes |         1074 bytes |         2221 bytes | //: 64-bit, MSC_v9.00 [asm=111]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:30:19,Oct  7 2008  by  'GCC_v3.42', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  4272.00  4296.00  |  7974.00  7980.00  | 17484.00 17496.00  | //: 32-bit, GCC_v3.42 [ C =332]
+      2_ ||  2139.00  2154.00  |  3981.00  3996.00  |  8736.00  8754.00  | //: 32-bit, GCC_v3.42 [ C =332]
+      4_ ||  1069.50  1071.00  |  1995.00  2002.50  |  4377.00  4378.50  | //: 32-bit, GCC_v3.42 [ C =332]
+      8_ ||   536.25   538.50  |   998.25  1000.50  |  2183.25  2186.25  | //: 32-bit, GCC_v3.42 [ C =332]
+     10_ ||   429.00   430.20  |   798.60   807.60  |  1749.60  1752.00  | //: 32-bit, GCC_v3.42 [ C =332]
+     16_ ||   267.75   270.00  |   498.00   499.88  |  1092.00  1093.13  | //: 32-bit, GCC_v3.42 [ C =332]
+     32_ ||   132.75   133.50  |   249.19   249.75  |   546.38   547.50  | //: 32-bit, GCC_v3.42 [ C =332]
+     64_ ||    98.44    99.00  |   123.94   124.03  |   272.25   272.34  | //: 32-bit, GCC_v3.42 [ C =332]
+    100_ ||   103.08   103.08  |   117.96   117.96  |   174.24   174.42  | //: 32-bit, GCC_v3.42 [ C =332]
+    128_ ||    80.72   121.13  |    92.34   133.22  |   137.06   137.39  | //: 32-bit, GCC_v3.42 [ C =332]
+    256_ ||    71.91    72.21  |    75.84    76.01  |   101.93   102.09  | //: 32-bit, GCC_v3.42 [ C =332]
+    512_ ||    67.50    67.59  |    67.62    67.75  |    83.95    84.47  | //: 32-bit, GCC_v3.42 [ C =332]
+   1000_ ||    66.71    67.00  |    64.95    65.28  |    77.12    77.20  | //: 32-bit, GCC_v3.42 [ C =332]
+   1024_ ||    64.89    64.96  |    63.19    63.23  |    74.67    74.67  | //: 32-bit, GCC_v3.42 [ C =332]
+   2048_ ||    63.35    63.36  |    61.13    61.14  |    70.19    70.19  | //: 32-bit, GCC_v3.42 [ C =332]
+   4096_ ||    62.80    62.80  |    60.11    60.12  |    62.58    62.65  | //: 32-bit, GCC_v3.42 [ C =332]
+   8192_ ||    57.83    59.01  |    55.02    60.12  |    66.75    66.86  | //: 32-bit, GCC_v3.42 [ C =332]
+  10000_ ||    62.69    62.87  |    59.76    59.87  |    67.20    67.63  | //: 32-bit, GCC_v3.42 [ C =332]
+  16384_ ||    62.50    62.75  |    55.96    59.35  |    62.39    63.28  | //: 32-bit, GCC_v3.42 [ C =332]
+  32768_ ||    58.66    59.69  |    56.17    56.62  |    61.97    63.07  | //: 32-bit, GCC_v3.42 [ C =332]
+ 100000_ ||    59.31    59.99  |    56.30    57.46  |    62.79    63.27  | //: 32-bit, GCC_v3.42 [ C =332]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||         1568 bytes |         1264 bytes |         1472 bytes | //: 32-bit, GCC_v3.42 [ C =332]
+  Block  ||         6640 bytes |        13040 bytes |        18448 bytes | //: 32-bit, GCC_v3.42 [ C =332]
+
+Skein performance, in clks per byte, dtMin =   24 clks.
+         [compiled 14:30:25,Oct  7 2008  by  'MSC_v9.00', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  2988.00  2994.00  |  6240.00  6246.00  | 13794.00 13800.00  | //: 32-bit, MSC_v9.00 [ C =332]
+      2_ ||  1488.00  1503.00  |  3120.00  3126.00  |  6900.00  6903.00  | //: 32-bit, MSC_v9.00 [ C =332]
+      4_ ||   744.00   751.50  |  1560.00  1560.00  |  3445.50  3447.00  | //: 32-bit, MSC_v9.00 [ C =332]
+      8_ ||   372.00   372.75  |   777.75   779.25  |  1723.50  1723.50  | //: 32-bit, MSC_v9.00 [ C =332]
+     10_ ||   297.60   299.40  |   623.40   624.00  |  1379.40  1380.00  | //: 32-bit, MSC_v9.00 [ C =332]
+     16_ ||   186.38   186.38  |   389.25   389.63  |   861.75   861.75  | //: 32-bit, MSC_v9.00 [ C =332]
+     32_ ||    92.44    92.44  |   195.38   195.56  |   431.25   431.44  | //: 32-bit, MSC_v9.00 [ C =332]
+     64_ ||    67.59    67.78  |    97.03    97.13  |   215.53   215.63  | //: 32-bit, MSC_v9.00 [ C =332]
+    100_ ||    70.26    70.32  |    91.92    91.92  |   138.00   138.06  | //: 32-bit, MSC_v9.00 [ C =332]
+    128_ ||    54.98    55.08  |    71.44    71.48  |   107.58   107.58  | //: 32-bit, MSC_v9.00 [ C =332]
+    256_ ||    48.68    48.70  |    58.57    58.57  |    79.83    79.83  | //: 32-bit, MSC_v9.00 [ C =332]
+    512_ ||    45.43    45.46  |    52.22    52.23  |    65.98    66.01  | //: 32-bit, MSC_v9.00 [ C =332]
+   1000_ ||    44.88    44.89  |    50.20    50.20  |    60.44    60.45  | //: 32-bit, MSC_v9.00 [ C =332]
+   1024_ ||    43.81    43.81  |    48.98    48.99  |    59.00    59.00  | //: 32-bit, MSC_v9.00 [ C =332]
+   2048_ ||    43.00    43.00  |    47.36    47.37  |    55.50    55.50  | //: 32-bit, MSC_v9.00 [ C =332]
+   4096_ ||    42.59    42.59  |    46.56    46.57  |    53.75    53.75  | //: 32-bit, MSC_v9.00 [ C =332]
+   8192_ ||    42.38    42.39  |    46.16    46.16  |    52.87    52.87  | //: 32-bit, MSC_v9.00 [ C =332]
+  10000_ ||    42.42    42.42  |    46.30    46.31  |    53.29    53.31  | //: 32-bit, MSC_v9.00 [ C =332]
+  16384_ ||    42.28    42.60  |    45.96    46.75  |    52.45    52.52  | //: 32-bit, MSC_v9.00 [ C =332]
+  32768_ ||    42.25    42.36  |    45.84    45.85  |    52.30    52.32  | //: 32-bit, MSC_v9.00 [ C =332]
+ 100000_ ||    42.21    42.50  |    43.60    45.77  |    49.55    50.03  | //: 32-bit, MSC_v9.00 [ C =332]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [ C =332]
+  Block  ||         4560 bytes |         9232 bytes |        12560 bytes | //: 32-bit, MSC_v9.00 [ C =332]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:30:31,Oct  7 2008  by  'MSC_v6.00', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  9054.00  9060.00  | 17406.00 17514.00  | 36888.00 37032.00  | //: 32-bit, MSC_v6.00 [ C =332]
+      2_ ||  4341.00  4341.00  |  9129.00  9168.00  | 18273.00 18423.00  | //: 32-bit, MSC_v6.00 [ C =332]
+      4_ ||  2169.00  2170.50  |  4590.00  4636.50  |  9240.00  9334.50  | //: 32-bit, MSC_v6.00 [ C =332]
+      8_ ||  1083.75  1084.50  |  2223.00  2243.25  |  4581.75  4663.50  | //: 32-bit, MSC_v6.00 [ C =332]
+     10_ ||   867.00   867.60  |  1776.60  1790.40  |  3648.00  3672.00  | //: 32-bit, MSC_v6.00 [ C =332]
+     16_ ||   541.13   541.50  |  1044.75  1051.50  |  2274.38  2278.50  | //: 32-bit, MSC_v6.00 [ C =332]
+     32_ ||   271.13   271.88  |   567.56   573.75  |  1139.25  1140.00  | //: 32-bit, MSC_v6.00 [ C =332]
+     64_ ||   201.09   201.09  |   270.84   272.81  |   569.72   571.59  | //: 32-bit, MSC_v6.00 [ C =332]
+    100_ ||   212.70   213.24  |   261.12   262.02  |   365.16   365.28  | //: 32-bit, MSC_v6.00 [ C =332]
+    128_ ||   166.08   166.45  |   204.84   205.41  |   284.48   288.80  | //: 32-bit, MSC_v6.00 [ C =332]
+    256_ ||   148.69   149.34  |   169.59   169.95  |   221.65   221.79  | //: 32-bit, MSC_v6.00 [ C =332]
+    512_ ||   140.47   140.53  |   148.24   148.48  |   179.11   179.11  | //: 32-bit, MSC_v6.00 [ C =332]
+   1000_ ||   139.66   139.66  |   139.37   139.82  |   163.58   165.01  | //: 32-bit, MSC_v6.00 [ C =332]
+   1024_ ||   136.26   136.27  |   141.22   141.49  |   157.43   158.56  | //: 32-bit, MSC_v6.00 [ C =332]
+   2048_ ||   134.25   134.25  |   135.90   137.12  |   151.73   152.42  | //: 32-bit, MSC_v6.00 [ C =332]
+   4096_ ||   133.89   134.06  |   131.19   134.61  |   147.72   150.40  | //: 32-bit, MSC_v6.00 [ C =332]
+   8192_ ||   132.87   134.43  |   134.17   135.04  |   143.82   148.00  | //: 32-bit, MSC_v6.00 [ C =332]
+  10000_ ||   133.42   134.41  |   124.60   130.22  |   137.58   138.21  | //: 32-bit, MSC_v6.00 [ C =332]
+  16384_ ||   131.73   132.54  |   121.35   122.08  |   135.51   139.44  | //: 32-bit, MSC_v6.00 [ C =332]
+  32768_ ||   124.97   134.41  |   128.74   129.78  |   142.57   143.53  | //: 32-bit, MSC_v6.00 [ C =332]
+ 100000_ ||   126.77   134.34  |   126.62   129.40  |   135.08   139.54  | //: 32-bit, MSC_v6.00 [ C =332]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||         1486 bytes |         1348 bytes |         1445 bytes | //: 32-bit, MSC_v6.00 [ C =332]
+  Block  ||         6038 bytes |        13395 bytes |        15975 bytes | //: 32-bit, MSC_v6.00 [ C =332]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:30:42,Oct  7 2008  by  'MSC_v4.20', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  5436.00  5436.00  |  9474.00  9474.00  | 20430.00 20442.00  | //: 32-bit, MSC_v4.20 [ C =332]
+      2_ ||  2715.00  2718.00  |  4731.00  4734.00  | 10215.00 10218.00  | //: 32-bit, MSC_v4.20 [ C =332]
+      4_ ||  1359.00  1360.50  |  2364.00  2364.00  |  5098.50  5103.00  | //: 32-bit, MSC_v4.20 [ C =332]
+      8_ ||   680.25   680.25  |  1182.00  1182.75  |  2549.25  2550.00  | //: 32-bit, MSC_v4.20 [ C =332]
+     10_ ||   538.80   543.60  |   945.60   946.20  |  2042.40  2043.00  | //: 32-bit, MSC_v4.20 [ C =332]
+     16_ ||   339.38   339.75  |   590.63   591.00  |  1274.63  1275.00  | //: 32-bit, MSC_v4.20 [ C =332]
+     32_ ||   166.69   166.69  |   295.31   295.50  |   637.13   637.50  | //: 32-bit, MSC_v4.20 [ C =332]
+     64_ ||   123.47   123.66  |   147.28   147.38  |   318.56   318.75  | //: 32-bit, MSC_v4.20 [ C =332]
+    100_ ||   130.62   130.74  |   140.28   140.76  |   203.94   204.00  | //: 32-bit, MSC_v4.20 [ C =332]
+    128_ ||   101.44   101.67  |   109.31   109.45  |   159.14   159.33  | //: 32-bit, MSC_v4.20 [ C =332]
+    256_ ||    90.70    90.75  |    90.52    90.56  |   118.66   118.83  | //: 32-bit, MSC_v4.20 [ C =332]
+    512_ ||    85.00    85.03  |    80.81    80.87  |    98.46    98.46  | //: 32-bit, MSC_v4.20 [ C =332]
+   1000_ ||    84.00    84.01  |    77.78    77.78  |    90.31    90.32  | //: 32-bit, MSC_v4.20 [ C =332]
+   1024_ ||    81.99    82.13  |    75.93    75.93  |    88.18    88.18  | //: 32-bit, MSC_v4.20 [ C =332]
+   2048_ ||    80.66    80.68  |    73.43    73.43  |    83.02    83.02  | //: 32-bit, MSC_v4.20 [ C =332]
+   4096_ ||    79.98    80.38  |    72.23    72.27  |    80.45    80.45  | //: 32-bit, MSC_v4.20 [ C =332]
+   8192_ ||    79.63    80.15  |    71.66    71.73  |    79.15    79.22  | //: 32-bit, MSC_v4.20 [ C =332]
+  10000_ ||    79.65    80.07  |    71.85    72.33  |    79.79    79.82  | //: 32-bit, MSC_v4.20 [ C =332]
+  16384_ ||    79.66    79.71  |    71.40    71.41  |    78.77    78.85  | //: 32-bit, MSC_v4.20 [ C =332]
+  32768_ ||    75.39    79.68  |    67.83    71.25  |    78.23    78.50  | //: 32-bit, MSC_v4.20 [ C =332]
+ 100000_ ||    75.49    77.32  |    67.60    67.87  |    74.33    75.55  | //: 32-bit, MSC_v4.20 [ C =332]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||         1152 bytes |         1024 bytes |         1088 bytes | //: 32-bit, MSC_v4.20 [ C =332]
+  Block  ||         4736 bytes |         8976 bytes |        12896 bytes | //: 32-bit, MSC_v4.20 [ C =332]
+
+Skein performance, in clks per byte, dtMin =   24 clks.
+         [compiled 14:30:49,Oct  7 2008  by  'MSC_v9.00', 64-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||   780.00   798.00  |  1920.00  1920.00  |  3732.00  3732.00  | //: 64-bit, MSC_v9.00 [ C =332]
+      2_ ||   387.00   387.00  |   951.00   951.00  |  1866.00  1869.00  | //: 64-bit, MSC_v9.00 [ C =332]
+      4_ ||   199.50   199.50  |   477.00   480.00  |   936.00   936.00  | //: 64-bit, MSC_v9.00 [ C =332]
+      8_ ||    95.25    96.00  |   231.75   235.50  |   467.25   468.00  | //: 64-bit, MSC_v9.00 [ C =332]
+     10_ ||    76.80    76.80  |   189.00   191.40  |   402.60   402.60  | //: 64-bit, MSC_v9.00 [ C =332]
+     16_ ||    51.75    51.75  |   127.13   128.63  |   234.75   254.25  | //: 64-bit, MSC_v9.00 [ C =332]
+     32_ ||    23.63    23.63  |    58.13    58.31  |   115.69   115.69  | //: 64-bit, MSC_v9.00 [ C =332]
+     64_ ||    16.69    16.88  |    28.88    28.97  |    58.31    58.31  | //: 64-bit, MSC_v9.00 [ C =332]
+    100_ ||    17.10    17.16  |    27.66    27.90  |    37.62    37.62  | //: 64-bit, MSC_v9.00 [ C =332]
+    128_ ||    12.98    13.13  |    21.14    21.47  |    29.16    29.16  | //: 64-bit, MSC_v9.00 [ C =332]
+    256_ ||    11.27    11.30  |    17.04    17.18  |    21.66    21.73  | //: 64-bit, MSC_v9.00 [ C =332]
+    512_ ||    10.20    10.20  |    16.21    16.21  |    17.79    17.82  | //: 64-bit, MSC_v9.00 [ C =332]
+   1000_ ||     9.98    10.12  |    14.23    14.25  |    16.13    16.13  | //: 64-bit, MSC_v9.00 [ C =332]
+   1024_ ||     9.73    10.54  |    13.88    13.89  |    15.73    15.73  | //: 64-bit, MSC_v9.00 [ C =332]
+   2048_ ||     9.48     9.48  |    13.51    13.51  |    14.70    14.70  | //: 64-bit, MSC_v9.00 [ C =332]
+   4096_ ||     9.35     9.36  |    13.21    13.22  |    14.16    14.16  | //: 64-bit, MSC_v9.00 [ C =332]
+   8192_ ||     9.25     9.25  |    13.08    13.08  |    13.93    13.93  | //: 64-bit, MSC_v9.00 [ C =332]
+  10000_ ||     9.27     9.28  |    12.89    12.99  |    13.98    13.98  | //: 64-bit, MSC_v9.00 [ C =332]
+  16384_ ||     9.26     9.28  |    12.77    12.89  |    13.74    13.74  | //: 64-bit, MSC_v9.00 [ C =332]
+  32768_ ||     9.23     9.25  |    12.83    13.09  |    13.77    14.27  | //: 64-bit, MSC_v9.00 [ C =332]
+ 100000_ ||     9.32     9.56  |    13.12    13.19  |    14.15    14.23  | //: 64-bit, MSC_v9.00 [ C =332]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [ C =332]
+  Block  ||         1200 bytes |         2928 bytes |         5008 bytes | //: 64-bit, MSC_v9.00 [ C =332]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:30:52,Oct  7 2008  by  'BCC_v5.51', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  5958.00  5958.00  | 10182.00 10188.00  | 21522.00 21522.00  | //: 32-bit, BCC_v5.51 [ C =332]
+      2_ ||  3228.00  3228.00  |  5091.00  5091.00  | 10761.00 10761.00  | //: 32-bit, BCC_v5.51 [ C =332]
+      4_ ||  1491.00  1491.00  |  2544.00  2544.00  |  5374.50  5821.50  | //: 32-bit, BCC_v5.51 [ C =332]
+      8_ ||   744.75   745.50  |  1272.00  1272.00  |  2686.50  2686.50  | //: 32-bit, BCC_v5.51 [ C =332]
+     10_ ||   595.80   595.80  |  1017.60  1102.80  |  2151.60  2152.20  | //: 32-bit, BCC_v5.51 [ C =332]
+     16_ ||   372.00   372.38  |   636.00   636.00  |  1343.25  1343.62  | //: 32-bit, BCC_v5.51 [ C =332]
+     32_ ||   184.69   184.69  |   317.62   317.81  |   671.62   671.62  | //: 32-bit, BCC_v5.51 [ C =332]
+     64_ ||   135.56   135.56  |   171.56   171.56  |   335.72   335.81  | //: 32-bit, BCC_v5.51 [ C =332]
+    100_ ||   141.90   141.90  |   150.18   150.18  |   232.68   232.68  | //: 32-bit, BCC_v5.51 [ C =332]
+    128_ ||   119.67   119.67  |   126.84   126.84  |   181.50   181.55  | //: 32-bit, BCC_v5.51 [ C =332]
+    256_ ||   106.15   106.15  |   104.25   104.27  |   135.00   135.02  | //: 32-bit, BCC_v5.51 [ C =332]
+    512_ ||    99.42    99.46  |    93.00    93.01  |   111.75   111.76  | //: 32-bit, BCC_v5.51 [ C =332]
+   1000_ ||    98.08    98.08  |    89.47    89.47  |   102.47   102.49  | //: 32-bit, BCC_v5.51 [ C =332]
+   1024_ ||    95.70    95.75  |    87.33    87.33  |   100.03   100.04  | //: 32-bit, BCC_v5.51 [ C =332]
+   2048_ ||    86.60    86.77  |    78.01    78.01  |    86.94    86.94  | //: 32-bit, BCC_v5.51 [ C =332]
+   4096_ ||    85.95    85.96  |    76.71    76.71  |    84.25    84.25  | //: 32-bit, BCC_v5.51 [ C =332]
+   8192_ ||    85.56    92.63  |    76.07    82.40  |    89.80    90.33  | //: 32-bit, BCC_v5.51 [ C =332]
+  10000_ ||    86.68    90.16  |    79.43    81.88  |    89.21    90.98  | //: 32-bit, BCC_v5.51 [ C =332]
+  16384_ ||    87.25    92.78  |    82.30    82.31  |    89.34    89.41  | //: 32-bit, BCC_v5.51 [ C =332]
+  32768_ ||    92.38    92.56  |    81.90    82.46  |    89.01    89.03  | //: 32-bit, BCC_v5.51 [ C =332]
+ 100000_ ||    88.00    88.57  |    78.54    81.88  |    84.53    88.34  | //: 32-bit, BCC_v5.51 [ C =332]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          996 bytes |         1000 bytes |         1068 bytes | //: 32-bit, BCC_v5.51 [ C =332]
+  Block  ||         4340 bytes |         7660 bytes |        10408 bytes | //: 32-bit, BCC_v5.51 [ C =332]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:31:00,Oct  7 2008  by  'BCC_v5.51', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  2784.00  2784.00  |  5094.00  5100.00  | 10800.00 10806.00  | //: 32-bit, BCC_v5.51 [asm=332]
+      2_ ||  1374.00  1377.00  |  2538.00  2538.00  |  5370.00  5373.00  | //: 32-bit, BCC_v5.51 [asm=332]
+      4_ ||   687.00   687.00  |  1267.50  1267.50  |  2695.50  2697.00  | //: 32-bit, BCC_v5.51 [asm=332]
+      8_ ||   341.25   341.25  |   633.00   633.75  |  1348.50  1348.50  | //: 32-bit, BCC_v5.51 [asm=332]
+     10_ ||   278.40   282.00  |   512.40   513.00  |  1089.00  1089.00  | //: 32-bit, BCC_v5.51 [asm=332]
+     16_ ||   172.50   172.50  |   320.25   320.25  |   678.00   679.12  | //: 32-bit, BCC_v5.51 [asm=332]
+     32_ ||    85.88    85.88  |   159.38   159.56  |   339.75   339.75  | //: 32-bit, BCC_v5.51 [asm=332]
+     64_ ||    61.03    61.12  |    79.31    79.41  |   169.78   169.78  | //: 32-bit, BCC_v5.51 [asm=332]
+    100_ ||    62.40    62.46  |    73.92    73.98  |   108.48   108.72  | //: 32-bit, BCC_v5.51 [asm=332]
+    128_ ||    48.28    48.28  |    57.47    57.47  |    84.14    84.14  | //: 32-bit, BCC_v5.51 [asm=332]
+    256_ ||    42.00    42.02  |    46.03    46.31  |    61.90    61.90  | //: 32-bit, BCC_v5.51 [asm=332]
+    512_ ||    37.96    39.39  |    41.17    41.18  |    51.16    51.33  | //: 32-bit, BCC_v5.51 [asm=332]
+   1000_ ||    37.45    37.74  |    38.66    62.12  |    45.85    69.53  | //: 32-bit, BCC_v5.51 [asm=332]
+   1024_ ||    36.34    58.89  |    37.42    60.09  |    44.31    44.62  | //: 32-bit, BCC_v5.51 [asm=332]
+   2048_ ||    35.77    36.18  |    36.06    36.34  |    42.06    42.07  | //: 32-bit, BCC_v5.51 [asm=332]
+   4096_ ||    35.04    35.44  |    35.33    35.33  |    40.48    48.07  | //: 32-bit, BCC_v5.51 [asm=332]
+   8192_ ||    34.80    43.64  |    35.91    35.92  |    40.46    40.66  | //: 32-bit, BCC_v5.51 [asm=332]
+  10000_ ||    35.05    36.40  |    35.54    37.36  |    41.01    54.16  | //: 32-bit, BCC_v5.51 [asm=332]
+  16384_ ||    34.92    36.14  |    35.74    40.79  |    40.28    43.83  | //: 32-bit, BCC_v5.51 [asm=332]
+  32768_ ||    35.39    38.29  |    35.19    37.31  |    39.88    40.94  | //: 32-bit, BCC_v5.51 [asm=332]
+ 100000_ ||    36.40    38.36  |    35.18    37.16  |    40.05    40.36  | //: 32-bit, BCC_v5.51 [asm=332]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          996 bytes |         1000 bytes |         1068 bytes | //: 32-bit, BCC_v5.51 [asm=332]
+  Block  ||         3060 bytes |         6300 bytes |         8835 bytes | //: 32-bit, BCC_v5.51 [asm=332]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:31:04,Oct  7 2008  by  'MSC_v9.00', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  2598.00  2604.00  |  4866.00  4878.00  | 10614.00 10632.00  | //: 32-bit, MSC_v9.00 [asm=332]
+      2_ ||  1290.00  1299.00  |  2451.00  2460.00  |  5331.00  5334.00  | //: 32-bit, MSC_v9.00 [asm=332]
+      4_ ||   648.00   649.50  |  1222.50  1222.50  |  2647.50  2656.50  | //: 32-bit, MSC_v9.00 [asm=332]
+      8_ ||   326.25   327.75  |   612.00   614.25  |  1330.50  1332.75  | //: 32-bit, MSC_v9.00 [asm=332]
+     10_ ||   260.40   261.00  |   490.20   490.20  |  1067.40  1067.40  | //: 32-bit, MSC_v9.00 [asm=332]
+     16_ ||   162.38   162.38  |   306.00   306.00  |   661.50   661.88  | //: 32-bit, MSC_v9.00 [asm=332]
+     32_ ||    80.81    80.81  |   153.94   153.94  |   333.75   333.94  | //: 32-bit, MSC_v9.00 [asm=332]
+     64_ ||    58.78    58.78  |    76.13    76.41  |   166.88   166.88  | //: 32-bit, MSC_v9.00 [asm=332]
+    100_ ||    60.78    60.78  |    72.00    72.00  |   106.86   106.92  | //: 32-bit, MSC_v9.00 [asm=332]
+    128_ ||    47.58    47.63  |    55.92    55.92  |    83.16    83.20  | //: 32-bit, MSC_v9.00 [asm=332]
+    256_ ||    42.05    42.05  |    45.75    45.75  |    61.59    61.64  | //: 32-bit, MSC_v9.00 [asm=332]
+    512_ ||    39.18    39.19  |    40.82    41.02  |    50.93    51.02  | //: 32-bit, MSC_v9.00 [asm=332]
+   1000_ ||    38.38    38.42  |    39.17    39.19  |    46.49    46.61  | //: 32-bit, MSC_v9.00 [asm=332]
+   1024_ ||    37.38    37.78  |    38.02    60.78  |    67.69    68.44  | //: 32-bit, MSC_v9.00 [asm=332]
+   2048_ ||    36.80    48.25  |    36.66    48.20  |    42.67    42.81  | //: 32-bit, MSC_v9.00 [asm=332]
+   4096_ ||    36.57    36.59  |    36.25    36.26  |    41.31    41.40  | //: 32-bit, MSC_v9.00 [asm=332]
+   8192_ ||    36.21    36.30  |    35.84    38.76  |    40.68    40.71  | //: 32-bit, MSC_v9.00 [asm=332]
+  10000_ ||    40.98    47.69  |    35.81    35.86  |    40.96    43.93  | //: 32-bit, MSC_v9.00 [asm=332]
+  16384_ ||    36.27    38.04  |    35.77    43.58  |    40.33    43.27  | //: 32-bit, MSC_v9.00 [asm=332]
+  32768_ ||    36.04    41.09  |    35.57    35.89  |    40.17    40.36  | //: 32-bit, MSC_v9.00 [asm=332]
+ 100000_ ||    34.46    36.34  |    34.07    37.16  |    39.60    43.18  | //: 32-bit, MSC_v9.00 [asm=332]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [asm=332]
+  Block  ||         3060 bytes |         6300 bytes |         8835 bytes | //: 32-bit, MSC_v9.00 [asm=332]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:31:10,Oct  7 2008  by  'GCC_v3.42', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  2550.00  2568.00  |  4896.00  4902.00  | 10662.00 10728.00  | //: 32-bit, GCC_v3.42 [asm=332]
+      2_ ||  1275.00  1290.00  |  2445.00  2457.00  |  5355.00  5379.00  | //: 32-bit, GCC_v3.42 [asm=332]
+      4_ ||   640.50   645.00  |  1224.00  1225.50  |  2655.00  2668.50  | //: 32-bit, GCC_v3.42 [asm=332]
+      8_ ||   318.75   320.25  |   609.75   610.50  |  1328.25  1332.75  | //: 32-bit, GCC_v3.42 [asm=332]
+     10_ ||   254.40   257.40  |   488.40   490.20  |  1070.40  1074.60  | //: 32-bit, GCC_v3.42 [asm=332]
+     16_ ||   161.25   162.00  |   306.38   307.13  |   669.00   671.25  | //: 32-bit, GCC_v3.42 [asm=332]
+     32_ ||    79.88    80.06  |   153.56   153.75  |   333.00   335.63  | //: 32-bit, GCC_v3.42 [asm=332]
+     64_ ||    58.50    58.69  |    76.50    76.59  |   166.69   167.34  | //: 32-bit, GCC_v3.42 [asm=332]
+    100_ ||    60.78    61.02  |    72.36    73.08  |   107.04   107.58  | //: 32-bit, GCC_v3.42 [asm=332]
+    128_ ||    47.39    47.44  |    56.06    56.30  |    83.44    83.63  | //: 32-bit, GCC_v3.42 [asm=332]
+    256_ ||    41.79    41.88  |    46.03    46.10  |    61.71    61.92  | //: 32-bit, GCC_v3.42 [asm=332]
+    512_ ||    39.20    39.33  |    40.96    41.10  |    51.23    51.30  | //: 32-bit, GCC_v3.42 [asm=332]
+   1000_ ||    38.40    38.57  |    39.23    39.26  |    46.83    47.06  | //: 32-bit, GCC_v3.42 [asm=332]
+   1024_ ||    37.53    37.72  |    38.27    38.33  |    45.78    46.00  | //: 32-bit, GCC_v3.42 [asm=332]
+   2048_ ||    36.94    37.00  |    37.03    37.15  |    43.10    56.43  | //: 32-bit, GCC_v3.42 [asm=332]
+   4096_ ||    41.78    53.31  |    36.01    40.53  |    40.97    41.13  | //: 32-bit, GCC_v3.42 [asm=332]
+   8192_ ||    35.90    36.04  |    35.84    48.31  |    40.53    40.55  | //: 32-bit, GCC_v3.42 [asm=332]
+  10000_ ||    36.42    36.48  |    35.85    46.01  |    40.60    40.74  | //: 32-bit, GCC_v3.42 [asm=332]
+  16384_ ||    36.20    39.37  |    35.61    38.72  |    40.15    41.33  | //: 32-bit, GCC_v3.42 [asm=332]
+  32768_ ||    36.47    40.53  |    35.81    39.15  |    40.13    41.96  | //: 32-bit, GCC_v3.42 [asm=332]
+ 100000_ ||    36.70    43.77  |    35.89    37.72  |    40.20    44.66  | //: 32-bit, GCC_v3.42 [asm=332]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||         1568 bytes |         1264 bytes |         1472 bytes | //: 32-bit, GCC_v3.42 [asm=332]
+  Block  ||         3060 bytes |         6300 bytes |         8835 bytes | //: 32-bit, GCC_v3.42 [asm=332]
+
+Skein performance, in clks per byte, dtMin =   24 clks.
+         [compiled 14:31:14,Oct  7 2008  by  'MSC_v9.00', 64-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||   684.00   690.00  |  1104.00  1104.00  |  2028.00  2034.00  | //: 64-bit, MSC_v9.00 [asm=332]
+      2_ ||   339.00   339.00  |   549.00   549.00  |  1014.00  1017.00  | //: 64-bit, MSC_v9.00 [asm=332]
+      4_ ||   168.00   169.50  |   276.00   276.00  |   507.00   508.50  | //: 64-bit, MSC_v9.00 [asm=332]
+      8_ ||    81.75    81.75  |   135.00   146.25  |   273.75   273.75  | //: 64-bit, MSC_v9.00 [asm=332]
+     10_ ||    70.80    70.80  |   120.00   120.00  |   219.00   219.00  | //: 64-bit, MSC_v9.00 [asm=332]
+     16_ ||    44.25    44.25  |    74.25    74.25  |   126.00   126.38  | //: 64-bit, MSC_v9.00 [asm=332]
+     32_ ||    20.06    20.25  |    33.75    33.75  |    63.00    63.00  | //: 64-bit, MSC_v9.00 [asm=332]
+     64_ ||    14.53    14.53  |    16.69    16.97  |    34.13    34.13  | //: 64-bit, MSC_v9.00 [asm=332]
+    100_ ||    15.72    15.72  |    16.74    16.74  |    22.20    22.20  | //: 64-bit, MSC_v9.00 [asm=332]
+    128_ ||    11.06    11.11  |    11.77    11.81  |    15.70    15.70  | //: 64-bit, MSC_v9.00 [asm=332]
+    256_ ||     9.52     9.52  |     9.05     9.07  |    12.38    12.40  | //: 64-bit, MSC_v9.00 [asm=332]
+    512_ ||     9.35     9.35  |     7.72     7.72  |     9.26     9.26  | //: 64-bit, MSC_v9.00 [asm=332]
+   1000_ ||     8.42     8.42  |     7.22     7.22  |     8.30     8.30  | //: 64-bit, MSC_v9.00 [asm=332]
+   1024_ ||     8.19     8.87  |     7.62     7.63  |     8.12     8.12  | //: 64-bit, MSC_v9.00 [asm=332]
+   2048_ ||     7.97     7.97  |     7.25     7.38  |     7.52     8.15  | //: 64-bit, MSC_v9.00 [asm=332]
+   4096_ ||     7.86     7.88  |     6.54     7.09  |     7.84    11.52  | //: 64-bit, MSC_v9.00 [asm=332]
+   8192_ ||     8.49    11.80  |     9.78    10.72  |     7.05    10.38  | //: 64-bit, MSC_v9.00 [asm=332]
+  10000_ ||     7.85     8.51  |     6.58     6.58  |     7.11     7.12  | //: 64-bit, MSC_v9.00 [asm=332]
+  16384_ ||     7.86     7.88  |     6.41     6.41  |     7.00     7.01  | //: 64-bit, MSC_v9.00 [asm=332]
+  32768_ ||     7.89     9.85  |     6.50     7.00  |     6.94     6.97  | //: 64-bit, MSC_v9.00 [asm=332]
+ 100000_ ||     7.80     9.43  |     6.90     7.71  |     7.18     8.48  | //: 64-bit, MSC_v9.00 [asm=332]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [asm=332]
+  Block  ||         1288 bytes |         2182 bytes |         3449 bytes | //: 64-bit, MSC_v9.00 [asm=332]
+
+Skein performance, in clks per byte, dtMin =   24 clks.
+         [compiled 14:31:16,Oct  7 2008  by  'GCC_v3.42', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  3954.00  3966.00  |  7350.00  7350.00  | 40698.00 40704.00  | //: 32-bit, GCC_v3.42 [ C =335]
+      2_ ||  1977.00  1977.00  |  3678.00  3678.00  | 22035.00 24258.00  | //: 32-bit, GCC_v3.42 [ C =335]
+      4_ ||  1072.50  1072.50  |  1837.50  1839.00  | 10161.00 11007.00  | //: 32-bit, GCC_v3.42 [ C =335]
+      8_ ||   495.00   495.75  |   993.75   999.75  |  5456.25  8527.50  | //: 32-bit, GCC_v3.42 [ C =335]
+     10_ ||   429.60   430.80  |   730.80   801.60  |  4380.00  5951.40  | //: 32-bit, GCC_v3.42 [ C =335]
+     16_ ||   249.00   269.25  |   499.13   502.13  |  2741.63  4381.13  | //: 32-bit, GCC_v3.42 [ C =335]
+     32_ ||   133.31   135.19  |   249.19   251.25  |  1369.69  2140.69  | //: 32-bit, GCC_v3.42 [ C =335]
+     64_ ||    90.84    99.00  |   114.47   123.94  |   635.06   635.16  | //: 32-bit, GCC_v3.42 [ C =335]
+    100_ ||    95.28   103.56  |   108.90   109.44  |   406.50   580.20  | //: 32-bit, GCC_v3.42 [ C =335]
+    128_ ||    74.44    80.44  |    85.50    91.88  |   317.25   317.30  | //: 32-bit, GCC_v3.42 [ C =335]
+    256_ ||    66.00    71.48  |    69.70    69.80  |   237.12   237.12  | //: 32-bit, GCC_v3.42 [ C =335]
+    512_ ||    66.96    66.98  |    67.36    67.39  |   224.53   224.58  | //: 32-bit, GCC_v3.42 [ C =335]
+   1000_ ||    66.20    66.21  |    64.66    64.73  |   205.97   206.02  | //: 32-bit, GCC_v3.42 [ C =335]
+   1024_ ||    64.61    87.60  |    63.19    63.20  |   175.51   194.46  | //: 32-bit, GCC_v3.42 [ C =335]
+   2048_ ||    58.87    66.35  |    56.44    60.15  |   165.23   193.47  | //: 32-bit, GCC_v3.42 [ C =335]
+   4096_ ||    58.09    71.92  |    55.50    55.51  |   168.27   197.84  | //: 32-bit, GCC_v3.42 [ C =335]
+   8192_ ||    57.83    61.97  |    55.02    64.00  |   173.49   203.78  | //: 32-bit, GCC_v3.42 [ C =335]
+  10000_ ||    62.69    63.71  |    59.52    62.18  |   176.01   194.46  | //: 32-bit, GCC_v3.42 [ C =335]
+  16384_ ||    62.78    65.42  |    59.37    63.71  |   182.36   201.21  | //: 32-bit, GCC_v3.42 [ C =335]
+  32768_ ||    62.48    70.41  |    59.61    63.00  |   184.14   189.59  | //: 32-bit, GCC_v3.42 [ C =335]
+ 100000_ ||    61.14    68.82  |    59.72    62.87  |   190.22   202.45  | //: 32-bit, GCC_v3.42 [ C =335]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||         1568 bytes |         1264 bytes |         1472 bytes | //: 32-bit, GCC_v3.42 [ C =335]
+  Block  ||         6640 bytes |        13040 bytes |        41968 bytes | //: 32-bit, GCC_v3.42 [ C =335]
+
+Skein performance, in clks per byte, dtMin =   24 clks.
+         [compiled 14:31:27,Oct  7 2008  by  'MSC_v9.00', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  2994.00  2994.00  |  6240.00  6240.00  | 14598.00 14604.00  | //: 32-bit, MSC_v9.00 [ C =335]
+      2_ ||  1488.00  1494.00  |  3123.00  3126.00  |  7308.00  7311.00  | //: 32-bit, MSC_v9.00 [ C =335]
+      4_ ||   744.00   745.50  |  1558.50  1558.50  |  3646.50  3648.00  | //: 32-bit, MSC_v9.00 [ C =335]
+      8_ ||   372.00   372.00  |   779.25   780.00  |  1827.75  1827.75  | //: 32-bit, MSC_v9.00 [ C =335]
+     10_ ||   300.60   301.20  |   624.00   624.60  |  1459.20  1461.00  | //: 32-bit, MSC_v9.00 [ C =335]
+     16_ ||   187.13   187.88  |   389.63   389.63  |   913.50   913.88  | //: 32-bit, MSC_v9.00 [ C =335]
+     32_ ||    92.63    93.19  |   195.38   195.56  |   456.56   456.56  | //: 32-bit, MSC_v9.00 [ C =335]
+     64_ ||    67.69    67.78  |    97.03    97.13  |   228.66   228.75  | //: 32-bit, MSC_v9.00 [ C =335]
+    100_ ||    70.62    70.68  |    91.86    91.92  |   146.10   146.16  | //: 32-bit, MSC_v9.00 [ C =335]
+    128_ ||    54.84    55.27  |    71.48    71.48  |   112.88   112.92  | //: 32-bit, MSC_v9.00 [ C =335]
+    256_ ||    48.49    48.54  |    58.48    58.50  |    83.37    83.48  | //: 32-bit, MSC_v9.00 [ C =335]
+    512_ ||    45.42    45.42  |    52.07    52.23  |    68.57    68.60  | //: 32-bit, MSC_v9.00 [ C =335]
+   1000_ ||    44.65    44.65  |    50.20    50.20  |    62.74    62.76  | //: 32-bit, MSC_v9.00 [ C =335]
+   1024_ ||    43.80    43.80  |    48.98    48.99  |    61.13    61.14  | //: 32-bit, MSC_v9.00 [ C =335]
+   2048_ ||    43.00    43.00  |    47.36    47.37  |    57.45    57.47  | //: 32-bit, MSC_v9.00 [ C =335]
+   4096_ ||    42.33    42.34  |    46.57    46.57  |    55.59    55.60  | //: 32-bit, MSC_v9.00 [ C =335]
+   8192_ ||    42.21    42.25  |    46.16    46.17  |    54.66    54.67  | //: 32-bit, MSC_v9.00 [ C =335]
+  10000_ ||    42.16    42.42  |    46.31    46.73  |    55.11    55.13  | //: 32-bit, MSC_v9.00 [ C =335]
+  16384_ ||    42.28    42.29  |    46.21    46.24  |    54.20    54.24  | //: 32-bit, MSC_v9.00 [ C =335]
+  32768_ ||    42.35    42.36  |    45.95    46.10  |    50.90    51.80  | //: 32-bit, MSC_v9.00 [ C =335]
+ 100000_ ||    40.09    40.55  |    45.76    45.97  |    51.00    53.08  | //: 32-bit, MSC_v9.00 [ C =335]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [ C =335]
+  Block  ||         4560 bytes |         9232 bytes |        29280 bytes | //: 32-bit, MSC_v9.00 [ C =335]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:31:35,Oct  7 2008  by  'MSC_v6.00', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  8670.00  8682.00  | 17376.00 17436.00  | 55890.00 56922.00  | //: 32-bit, MSC_v6.00 [ C =335]
+      2_ ||  4344.00  4347.00  |  9315.00  9414.00  | 27747.00 27771.00  | //: 32-bit, MSC_v6.00 [ C =335]
+      4_ ||  2164.50  2164.50  |  4500.00  4522.50  | 13807.50 13896.00  | //: 32-bit, MSC_v6.00 [ C =335]
+      8_ ||  1082.25  1088.25  |  2235.00  2245.50  |  6796.50  6931.50  | //: 32-bit, MSC_v6.00 [ C =335]
+     10_ ||   866.40   871.80  |  1800.60  1811.40  |  5465.40  5560.80  | //: 32-bit, MSC_v6.00 [ C =335]
+     16_ ||   548.25   548.25  |  1124.25  1129.88  |  3447.00  3447.75  | //: 32-bit, MSC_v6.00 [ C =335]
+     32_ ||   272.25   272.63  |   560.44   563.81  |  1723.50  1723.88  | //: 32-bit, MSC_v6.00 [ C =335]
+     64_ ||   202.59   203.44  |   282.47   283.50  |   837.00   840.09  | //: 32-bit, MSC_v6.00 [ C =335]
+    100_ ||   214.38   215.88  |   269.28   269.82  |   532.74   532.86  | //: 32-bit, MSC_v6.00 [ C =335]
+    128_ ||   167.63   167.67  |   208.64   210.70  |   418.03   421.73  | //: 32-bit, MSC_v6.00 [ C =335]
+    256_ ||   149.41   150.16  |   173.23   173.79  |   317.27   320.23  | //: 32-bit, MSC_v6.00 [ C =335]
+    512_ ||   147.41   147.48  |   148.73   148.88  |   265.04   265.10  | //: 32-bit, MSC_v6.00 [ C =335]
+   1000_ ||   140.53   140.53  |   144.00   144.20  |   244.22   250.33  | //: 32-bit, MSC_v6.00 [ C =335]
+   1024_ ||   141.71   141.73  |   142.42   142.59  |   235.18   235.96  | //: 32-bit, MSC_v6.00 [ C =335]
+   2048_ ||   135.36   135.38  |   137.08   137.16  |   220.27   221.11  | //: 32-bit, MSC_v6.00 [ C =335]
+   4096_ ||   124.41   128.93  |   123.98   126.91  |   200.32   204.77  | //: 32-bit, MSC_v6.00 [ C =335]
+   8192_ ||   124.35   126.84  |   124.37   130.27  |   204.17   219.92  | //: 32-bit, MSC_v6.00 [ C =335]
+  10000_ ||   126.45   133.37  |   133.76   134.30  |   217.48   218.27  | //: 32-bit, MSC_v6.00 [ C =335]
+  16384_ ||   133.14   135.72  |   128.15   128.86  |   191.67   194.97  | //: 32-bit, MSC_v6.00 [ C =335]
+  32768_ ||   129.20   133.13  |   127.94   129.85  |   202.98   210.08  | //: 32-bit, MSC_v6.00 [ C =335]
+ 100000_ ||   130.83   133.01  |   121.08   129.21  |   192.14   200.80  | //: 32-bit, MSC_v6.00 [ C =335]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||         1486 bytes |         1348 bytes |         1445 bytes | //: 32-bit, MSC_v6.00 [ C =335]
+  Block  ||         6038 bytes |        13395 bytes |        37221 bytes | //: 32-bit, MSC_v6.00 [ C =335]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:31:48,Oct  7 2008  by  'MSC_v4.20', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  5430.00  5436.00  |  9474.00  9474.00  | 21330.00 21366.00  | //: 32-bit, MSC_v4.20 [ C =335]
+      2_ ||  2706.00  2724.00  |  4731.00  4734.00  | 10662.00 10683.00  | //: 32-bit, MSC_v4.20 [ C =335]
+      4_ ||  1359.00  1359.00  |  2364.00  2365.50  |  5322.00  5332.50  | //: 32-bit, MSC_v4.20 [ C =335]
+      8_ ||   675.00   679.50  |  1182.00  1182.75  |  2661.75  2666.25  | //: 32-bit, MSC_v4.20 [ C =335]
+     10_ ||   540.00   543.60  |   945.60   945.60  |  2132.40  2136.00  | //: 32-bit, MSC_v4.20 [ C =335]
+     16_ ||   337.50   339.75  |   590.63   591.00  |  1330.50  1332.75  | //: 32-bit, MSC_v4.20 [ C =335]
+     32_ ||   166.69   167.25  |   295.31   295.50  |   665.25   666.38  | //: 32-bit, MSC_v4.20 [ C =335]
+     64_ ||   123.66   123.84  |   147.19   147.28  |   332.63   333.19  | //: 32-bit, MSC_v4.20 [ C =335]
+    100_ ||   130.62   130.98  |   140.28   140.76  |   212.88   213.24  | //: 32-bit, MSC_v4.20 [ C =335]
+    128_ ||   101.44   101.72  |   109.31   109.45  |   166.08   166.41  | //: 32-bit, MSC_v4.20 [ C =335]
+    256_ ||    90.54    90.73  |    90.54    90.56  |   124.99   125.11  | //: 32-bit, MSC_v4.20 [ C =335]
+    512_ ||    84.93    85.02  |    80.95    80.95  |   101.98   101.98  | //: 32-bit, MSC_v4.20 [ C =335]
+   1000_ ||    84.00    84.02  |    77.78    77.79  |    93.61    93.62  | //: 32-bit, MSC_v4.20 [ C =335]
+   1024_ ||    81.96    82.10  |    75.93    75.93  |    91.37    91.39  | //: 32-bit, MSC_v4.20 [ C =335]
+   2048_ ||    80.68    80.69  |    73.49    73.49  |    85.58    85.59  | //: 32-bit, MSC_v4.20 [ C =335]
+   4096_ ||    79.98    80.00  |    72.23    72.24  |    82.21    82.57  | //: 32-bit, MSC_v4.20 [ C =335]
+   8192_ ||    79.62    80.01  |    71.61    72.15  |    80.57    81.37  | //: 32-bit, MSC_v4.20 [ C =335]
+  10000_ ||    79.72    80.04  |    71.86    71.92  |    81.67    81.67  | //: 32-bit, MSC_v4.20 [ C =335]
+  16384_ ||    79.47    79.72  |    67.22    67.80  |    76.81    77.22  | //: 32-bit, MSC_v4.20 [ C =335]
+  32768_ ||    75.20    79.32  |    67.07    68.60  |    75.91    78.02  | //: 32-bit, MSC_v4.20 [ C =335]
+ 100000_ ||    75.38    75.82  |    67.48    69.43  |    74.87    77.52  | //: 32-bit, MSC_v4.20 [ C =335]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||         1152 bytes |         1024 bytes |         1088 bytes | //: 32-bit, MSC_v4.20 [ C =335]
+  Block  ||         4736 bytes |         8976 bytes |        28880 bytes | //: 32-bit, MSC_v4.20 [ C =335]
+
+Skein performance, in clks per byte, dtMin =   24 clks.
+         [compiled 14:31:56,Oct  7 2008  by  'MSC_v9.00', 64-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||   780.00   798.00  |  1890.00  1920.00  |  3498.00  3498.00  | //: 64-bit, MSC_v9.00 [ C =335]
+      2_ ||   387.00   387.00  |   951.00   969.00  |  1752.00  1752.00  | //: 64-bit, MSC_v9.00 [ C =335]
+      4_ ||   201.00   201.00  |   477.00   478.50  |   877.50   877.50  | //: 64-bit, MSC_v9.00 [ C =335]
+      8_ ||    95.25    96.00  |   234.75   237.00  |   440.25   441.00  | //: 64-bit, MSC_v9.00 [ C =335]
+     10_ ||    77.40    78.00  |   190.80   195.00  |   350.40   379.20  | //: 64-bit, MSC_v9.00 [ C =335]
+     16_ ||    52.13    52.13  |   126.75   127.13  |   237.00   237.00  | //: 64-bit, MSC_v9.00 [ C =335]
+     32_ ||    23.81    25.69  |    58.69    59.25  |   109.88   109.88  | //: 64-bit, MSC_v9.00 [ C =335]
+     64_ ||    16.97    16.97  |    29.34    29.72  |    54.75    54.75  | //: 64-bit, MSC_v9.00 [ C =335]
+    100_ ||    17.10    17.10  |    27.72    28.08  |    35.28    35.28  | //: 64-bit, MSC_v9.00 [ C =335]
+    128_ ||    13.03    13.03  |    21.19    21.52  |    27.33    27.33  | //: 64-bit, MSC_v9.00 [ C =335]
+    256_ ||    11.20    11.23  |    17.02    17.16  |    20.23    20.25  | //: 64-bit, MSC_v9.00 [ C =335]
+    512_ ||    10.21    10.22  |    14.95    16.15  |    16.56    17.94  | //: 64-bit, MSC_v9.00 [ C =335]
+   1000_ ||     9.95    10.00  |    14.23    14.24  |    15.09    15.10  | //: 64-bit, MSC_v9.00 [ C =335]
+   1024_ ||     9.71    10.50  |    13.91    13.91  |    14.68    14.68  | //: 64-bit, MSC_v9.00 [ C =335]
+   2048_ ||     9.48     9.73  |    13.43    13.51  |    13.73    13.75  | //: 64-bit, MSC_v9.00 [ C =335]
+   4096_ ||     9.36     9.36  |    13.21    13.22  |    13.28    13.28  | //: 64-bit, MSC_v9.00 [ C =335]
+   8192_ ||     9.28     9.31  |    12.83    12.94  |    12.97    14.48  | //: 64-bit, MSC_v9.00 [ C =335]
+  10000_ ||     9.30    10.06  |    12.94    14.10  |    13.07    14.36  | //: 64-bit, MSC_v9.00 [ C =335]
+  16384_ ||     9.25     9.27  |    12.98    13.01  |    12.83    12.83  | //: 64-bit, MSC_v9.00 [ C =335]
+  32768_ ||     9.22     9.24  |    12.81    12.91  |    12.90    12.90  | //: 64-bit, MSC_v9.00 [ C =335]
+ 100000_ ||     9.33     9.58  |    13.94    13.95  |    13.24    13.92  | //: 64-bit, MSC_v9.00 [ C =335]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [ C =335]
+  Block  ||         1200 bytes |         2928 bytes |        10880 bytes | //: 64-bit, MSC_v9.00 [ C =335]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:32:00,Oct  7 2008  by  'BCC_v5.51', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  5964.00  6462.00  | 10182.00 10188.00  | 21516.00 21516.00  | //: 32-bit, BCC_v5.51 [ C =335]
+      2_ ||  2979.00  2982.00  |  5091.00  5091.00  | 10758.00 10761.00  | //: 32-bit, BCC_v5.51 [ C =335]
+      4_ ||  1489.50  1489.50  |  2545.50  2757.00  |  5374.50  5374.50  | //: 32-bit, BCC_v5.51 [ C =335]
+      8_ ||   744.75   745.50  |  1272.00  1272.00  |  2687.25  2687.25  | //: 32-bit, BCC_v5.51 [ C =335]
+     10_ ||   595.20   595.20  |  1017.60  1017.60  |  2151.60  2152.20  | //: 32-bit, BCC_v5.51 [ C =335]
+     16_ ||   372.38   372.38  |   636.00   636.00  |  1343.25  1343.62  | //: 32-bit, BCC_v5.51 [ C =335]
+     32_ ||   200.25   200.25  |   317.62   317.81  |   671.44   672.00  | //: 32-bit, BCC_v5.51 [ C =335]
+     64_ ||   135.56   135.56  |   158.34   158.34  |   335.72   335.72  | //: 32-bit, BCC_v5.51 [ C =335]
+    100_ ||   141.78   141.84  |   150.18   150.18  |   214.74   214.74  | //: 32-bit, BCC_v5.51 [ C =335]
+    128_ ||   110.44   110.44  |   117.05   117.09  |   167.53   167.53  | //: 32-bit, BCC_v5.51 [ C =335]
+    256_ ||   106.15   106.15  |   104.25   104.27  |   135.00   135.02  | //: 32-bit, BCC_v5.51 [ C =335]
+    512_ ||    99.46    99.50  |    93.00    93.01  |   111.62   111.63  | //: 32-bit, BCC_v5.51 [ C =335]
+   1000_ ||    98.07    98.17  |    89.48    89.48  |   102.39   102.39  | //: 32-bit, BCC_v5.51 [ C =335]
+   1024_ ||    95.84    95.84  |    87.35    87.35  |    99.96    99.96  | //: 32-bit, BCC_v5.51 [ C =335]
+   2048_ ||    93.84    94.01  |    84.51    84.52  |    94.10    94.10  | //: 32-bit, BCC_v5.51 [ C =335]
+   4096_ ||    93.12    93.13  |    83.10    83.10  |    84.16    91.18  | //: 32-bit, BCC_v5.51 [ C =335]
+   8192_ ||    92.65    93.16  |    82.39    82.91  |    89.71    89.79  | //: 32-bit, BCC_v5.51 [ C =335]
+  10000_ ||    92.75    93.17  |    77.57    78.30  |    83.49    85.13  | //: 32-bit, BCC_v5.51 [ C =335]
+  16384_ ||    87.43    88.16  |    77.83    79.58  |    84.22    84.89  | //: 32-bit, BCC_v5.51 [ C =335]
+  32768_ ||    87.33    88.36  |    77.63    78.37  |    84.49    86.56  | //: 32-bit, BCC_v5.51 [ C =335]
+ 100000_ ||    87.96    89.42  |    77.90    78.17  |    84.30    85.04  | //: 32-bit, BCC_v5.51 [ C =335]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          996 bytes |         1000 bytes |         1068 bytes | //: 32-bit, BCC_v5.51 [ C =335]
+  Block  ||         4340 bytes |         7660 bytes |        24192 bytes | //: 32-bit, BCC_v5.51 [ C =335]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:32:08,Oct  7 2008  by  'BCC_v5.51', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  2718.00  2718.00  |  5076.00  5082.00  | 10746.00 10752.00  | //: 32-bit, BCC_v5.51 [asm=335]
+      2_ ||  1359.00  1362.00  |  2499.00  2499.00  |  5373.00  5376.00  | //: 32-bit, BCC_v5.51 [asm=335]
+      4_ ||   679.50   681.00  |  1245.00  1251.00  |  2673.00  2674.50  | //: 32-bit, BCC_v5.51 [asm=335]
+      8_ ||   340.50   340.50  |   622.50   625.50  |  1335.75  1336.50  | //: 32-bit, BCC_v5.51 [asm=335]
+     10_ ||   269.40   269.40  |   499.20   499.80  |  1075.80  1075.80  | //: 32-bit, BCC_v5.51 [asm=335]
+     16_ ||   170.62   170.62  |   310.88   311.25  |   667.50   667.88  | //: 32-bit, BCC_v5.51 [asm=335]
+     32_ ||    82.88    83.06  |   156.00   156.00  |   334.31   334.31  | //: 32-bit, BCC_v5.51 [asm=335]
+     64_ ||    59.25    59.34  |    77.25    77.34  |   166.50   167.53  | //: 32-bit, BCC_v5.51 [asm=335]
+    100_ ||    60.48    60.54  |    72.30    72.36  |   107.16   107.22  | //: 32-bit, BCC_v5.51 [asm=335]
+    128_ ||    47.02    47.02  |    56.30    56.30  |    83.53    84.66  | //: 32-bit, BCC_v5.51 [asm=335]
+    256_ ||    42.21    42.23  |    46.59    46.62  |    62.53    62.53  | //: 32-bit, BCC_v5.51 [asm=335]
+    512_ ||    39.22    39.23  |    41.00    41.00  |    51.18    51.18  | //: 32-bit, BCC_v5.51 [asm=335]
+   1000_ ||    38.02    38.03  |    39.37    39.37  |    46.66    46.67  | //: 32-bit, BCC_v5.51 [asm=335]
+   1024_ ||    34.83    34.83  |    35.55    35.56  |    42.06    42.12  | //: 32-bit, BCC_v5.51 [asm=335]
+   2048_ ||    33.65    33.65  |    34.07    34.07  |    39.49    39.49  | //: 32-bit, BCC_v5.51 [asm=335]
+   4096_ ||    33.34    33.40  |    33.47    33.68  |    38.18    38.18  | //: 32-bit, BCC_v5.51 [asm=335]
+   8192_ ||    32.90    33.36  |    32.87    33.00  |    37.36    38.25  | //: 32-bit, BCC_v5.51 [asm=335]
+  10000_ ||    32.68    33.54  |    33.04    33.37  |    37.54    39.47  | //: 32-bit, BCC_v5.51 [asm=335]
+  16384_ ||    31.89    36.04  |    35.74    35.85  |    40.30    40.32  | //: 32-bit, BCC_v5.51 [asm=335]
+  32768_ ||    35.05    36.16  |    35.79    35.94  |    40.28    40.31  | //: 32-bit, BCC_v5.51 [asm=335]
+ 100000_ ||    34.30    35.27  |    33.66    34.18  |    38.25    39.54  | //: 32-bit, BCC_v5.51 [asm=335]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          996 bytes |         1000 bytes |         1068 bytes | //: 32-bit, BCC_v5.51 [asm=335]
+  Block  ||         3060 bytes |         6300 bytes |        20391 bytes | //: 32-bit, BCC_v5.51 [asm=335]
+
+Skein performance, in clks per byte, dtMin =   24 clks.
+         [compiled 14:32:11,Oct  7 2008  by  'MSC_v9.00', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  2586.00  2592.00  |  4896.00  4902.00  | 10668.00 10668.00  | //: 32-bit, MSC_v9.00 [asm=335]
+      2_ ||  1311.00  1317.00  |  2448.00  2451.00  |  5340.00  5343.00  | //: 32-bit, MSC_v9.00 [asm=335]
+      4_ ||   652.50   654.00  |  1224.00  1227.00  |  2665.50  2665.50  | //: 32-bit, MSC_v9.00 [asm=335]
+      8_ ||   327.00   328.50  |   613.50   614.25  |  1332.75  1332.75  | //: 32-bit, MSC_v9.00 [asm=335]
+     10_ ||   263.40   263.40  |   489.60   489.60  |  1069.20  1069.80  | //: 32-bit, MSC_v9.00 [asm=335]
+     16_ ||   163.88   163.88  |   306.00   306.38  |   666.38   666.75  | //: 32-bit, MSC_v9.00 [asm=335]
+     32_ ||    81.00    81.00  |   154.13   154.31  |   334.31   334.50  | //: 32-bit, MSC_v9.00 [asm=335]
+     64_ ||    58.88    58.97  |    76.41    76.59  |   167.16   167.25  | //: 32-bit, MSC_v9.00 [asm=335]
+    100_ ||    61.08    61.14  |    72.30    72.36  |   107.04   107.10  | //: 32-bit, MSC_v9.00 [asm=335]
+    128_ ||    47.81    47.86  |    56.16    56.20  |    83.34    83.34  | //: 32-bit, MSC_v9.00 [asm=335]
+    256_ ||    42.14    42.16  |    45.89    45.89  |    61.64    61.76  | //: 32-bit, MSC_v9.00 [asm=335]
+    512_ ||    36.23    36.23  |    37.66    37.66  |    46.98    46.99  | //: 32-bit, MSC_v9.00 [asm=335]
+   1000_ ||    35.57    35.57  |    36.11    36.12  |    43.07    43.12  | //: 32-bit, MSC_v9.00 [asm=335]
+   1024_ ||    34.85    34.85  |    35.24    35.24  |    42.05    42.06  | //: 32-bit, MSC_v9.00 [asm=335]
+   2048_ ||    34.17    34.17  |    33.88    34.06  |    39.46    39.53  | //: 32-bit, MSC_v9.00 [asm=335]
+   4096_ ||    33.74    33.85  |    33.38    33.46  |    38.32    38.32  | //: 32-bit, MSC_v9.00 [asm=335]
+   8192_ ||    33.65    33.67  |    33.17    34.20  |    37.70    37.71  | //: 32-bit, MSC_v9.00 [asm=335]
+  10000_ ||    33.68    34.51  |    33.29    36.32  |    37.91    39.80  | //: 32-bit, MSC_v9.00 [asm=335]
+  16384_ ||    33.26    35.05  |    32.98    35.06  |    37.34    39.35  | //: 32-bit, MSC_v9.00 [asm=335]
+  32768_ ||    36.34    36.38  |    35.62    35.76  |    40.21    41.08  | //: 32-bit, MSC_v9.00 [asm=335]
+ 100000_ ||    36.32    36.43  |    35.91    35.98  |    38.02    38.19  | //: 32-bit, MSC_v9.00 [asm=335]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [asm=335]
+  Block  ||         3060 bytes |         6300 bytes |        20391 bytes | //: 32-bit, MSC_v9.00 [asm=335]
+
+Skein performance, in clks per byte, dtMin =   36 clks.
+         [compiled 14:32:16,Oct  7 2008  by  'GCC_v3.42', 32-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||  2562.00  2562.00  |  4866.00  4866.00  | 10698.00 10722.00  | //: 32-bit, GCC_v3.42 [asm=335]
+      2_ ||  1269.00  1275.00  |  2436.00  2439.00  |  5343.00  5355.00  | //: 32-bit, GCC_v3.42 [asm=335]
+      4_ ||   645.00   645.00  |  1222.50  1224.00  |  2667.00  2676.00  | //: 32-bit, GCC_v3.42 [asm=335]
+      8_ ||   322.50   323.25  |   610.50   610.50  |  1332.75  1338.00  | //: 32-bit, GCC_v3.42 [asm=335]
+     10_ ||   254.40   255.60  |   486.60   487.20  |  1070.40  1072.20  | //: 32-bit, GCC_v3.42 [asm=335]
+     16_ ||   161.63   162.00  |   306.00   306.00  |   668.25   669.38  | //: 32-bit, GCC_v3.42 [asm=335]
+     32_ ||    73.69    73.69  |   141.56   141.94  |   307.31   307.50  | //: 32-bit, GCC_v3.42 [asm=335]
+     64_ ||    54.28    54.38  |    70.59    70.59  |   153.66   153.75  | //: 32-bit, GCC_v3.42 [asm=335]
+    100_ ||    56.16    56.40  |    66.66    66.66  |    98.40   107.22  | //: 32-bit, GCC_v3.42 [asm=335]
+    128_ ||    47.67    47.81  |    56.16    56.20  |    83.25    83.25  | //: 32-bit, GCC_v3.42 [asm=335]
+    256_ ||    41.72    41.86  |    45.84    45.87  |    61.48    61.52  | //: 32-bit, GCC_v3.42 [asm=335]
+    512_ ||    38.66    38.68  |    40.70    40.70  |    50.68    50.71  | //: 32-bit, GCC_v3.42 [asm=335]
+   1000_ ||    38.09    38.57  |    38.98    38.99  |    46.42    46.43  | //: 32-bit, GCC_v3.42 [asm=335]
+   1024_ ||    37.16    37.17  |    38.10    38.10  |    45.29    45.30  | //: 32-bit, GCC_v3.42 [asm=335]
+   2048_ ||    36.50    36.78  |    36.76    36.76  |    42.45    42.54  | //: 32-bit, GCC_v3.42 [asm=335]
+   4096_ ||    36.23    36.52  |    36.06    36.06  |    41.19    41.21  | //: 32-bit, GCC_v3.42 [asm=335]
+   8192_ ||    33.71    35.10  |    33.02    33.08  |    37.40    37.47  | //: 32-bit, GCC_v3.42 [asm=335]
+  10000_ ||    33.41    33.76  |    33.24    33.24  |    37.68    38.45  | //: 32-bit, GCC_v3.42 [asm=335]
+  16384_ ||    33.63    34.72  |    33.12    35.81  |    40.09    40.41  | //: 32-bit, GCC_v3.42 [asm=335]
+  32768_ ||    33.96    34.18  |    33.53    33.86  |    37.88    38.29  | //: 32-bit, GCC_v3.42 [asm=335]
+ 100000_ ||    35.61    36.38  |    34.24    35.18  |    37.98    38.64  | //: 32-bit, GCC_v3.42 [asm=335]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||         1568 bytes |         1264 bytes |         1472 bytes | //: 32-bit, GCC_v3.42 [asm=335]
+  Block  ||         3060 bytes |         6300 bytes |        20391 bytes | //: 32-bit, GCC_v3.42 [asm=335]
+
+Skein performance, in clks per byte, dtMin =   24 clks.
+         [compiled 14:32:20,Oct  7 2008  by  'MSC_v9.00', 64-bit]
+         =================================================================
+         ||                       Skein block size                       |
+         ||--------------------------------------------------------------|
+ Message ||       256 bits     |       512 bits     |      1024 bits     |
+ Length  ||====================|====================|====================|
+ (bytes) ||     min    median  |     min    median  |     min    median  |
+=========||====================|====================|====================|
+      1_ ||   684.00   690.00  |  1104.00  1104.00  |  2022.00  2022.00  | //: 64-bit, MSC_v9.00 [asm=335]
+      2_ ||   339.00   342.00  |   549.00   549.00  |  1011.00  1014.00  | //: 64-bit, MSC_v9.00 [asm=335]
+      4_ ||   168.00   169.50  |   277.50   277.50  |   505.50   505.50  | //: 64-bit, MSC_v9.00 [asm=335]
+      8_ ||    81.00    81.75  |   135.00   135.00  |   252.00   252.00  | //: 64-bit, MSC_v9.00 [asm=335]
+     10_ ||    65.40    65.40  |   109.80   109.80  |   201.60   202.20  | //: 64-bit, MSC_v9.00 [asm=335]
+     16_ ||    40.88    40.88  |    67.13    67.50  |   126.00   126.00  | //: 64-bit, MSC_v9.00 [asm=335]
+     32_ ||    20.06    20.25  |    33.56    33.75  |    62.81    63.00  | //: 64-bit, MSC_v9.00 [asm=335]
+     64_ ||    14.53    14.63  |    18.19    18.28  |    33.84    33.94  | //: 64-bit, MSC_v9.00 [asm=335]
+    100_ ||    15.78    15.78  |    16.80    16.80  |    22.02    22.08  | //: 64-bit, MSC_v9.00 [asm=335]
+    128_ ||    11.11    11.11  |    11.77    11.77  |    15.61    15.66  | //: 64-bit, MSC_v9.00 [asm=335]
+    256_ ||     9.52     9.52  |     9.07     9.09  |    11.41    11.41  | //: 64-bit, MSC_v9.00 [asm=335]
+    512_ ||     8.63     8.64  |     7.72     7.72  |     9.15     9.16  | //: 64-bit, MSC_v9.00 [asm=335]
+   1000_ ||     8.41     8.42  |     7.21     7.22  |     8.24     8.26  | //: 64-bit, MSC_v9.00 [asm=335]
+   1024_ ||     8.89     8.90  |     7.62     7.63  |     8.08     8.08  | //: 64-bit, MSC_v9.00 [asm=335]
+   2048_ ||     8.00     8.00  |     6.69     7.25  |     7.50     7.50  | //: 64-bit, MSC_v9.00 [asm=335]
+   4096_ ||     7.89     7.89  |     6.52     6.52  |     7.22     7.22  | //: 64-bit, MSC_v9.00 [asm=335]
+   8192_ ||     7.84     7.84  |     6.44     6.44  |     7.07     7.07  | //: 64-bit, MSC_v9.00 [asm=335]
+  10000_ ||     7.84     7.84  |     6.45     6.50  |     7.12     7.12  | //: 64-bit, MSC_v9.00 [asm=335]
+  16384_ ||     7.82     7.82  |     6.40     6.40  |     6.99     7.01  | //: 64-bit, MSC_v9.00 [asm=335]
+  32768_ ||     7.79     7.80  |     6.37     6.37  |     6.96     6.96  | //: 64-bit, MSC_v9.00 [asm=335]
+ 100000_ ||     8.11     8.11  |     6.49     6.74  |     6.95     7.26  | //: 64-bit, MSC_v9.00 [asm=335]
+=========||====================|====================|====================|
+Code Size||                    |                    |                    |
+=========||====================|====================|====================|
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [asm=335]
+  Block  ||         1288 bytes |         2182 bytes |         7133 bytes | //: 64-bit, MSC_v9.00 [asm=335]
diff --git a/Additional_Implementations/skein_rot_search2.c b/Additional_Implementations/skein_rot_search2.c
new file mode 100644
index 0000000000000..a47f5c81d3e39
--- /dev/null
+++ b/Additional_Implementations/skein_rot_search2.c
@@ -0,0 +1,2538 @@
+/***********************************************************************
+**
+** Generate Skein rotation constant candidate sets and test them.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+************************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <time.h>
+#include <ctype.h>
+#include <math.h>
+#include <assert.h>
+#include "brg_types.h"                  /* get Brian Gladman's platform-specific definitions */
+
+#define uint    unsigned int            /* short spelling used throughout this file */
+#define u08b    uint_8t                 /*  8-bit unsigned; uint_8t presumably comes from brg_types.h -- confirm */
+#define u32b    uint_32t                /* 32-bit unsigned; uint_32t presumably comes from brg_types.h -- confirm */
+#define u64b    uint_64t                /* 64-bit unsigned; uint_64t presumably comes from brg_types.h -- confirm */
+
+/* Threefish algorithm parameters */
+#ifndef BITS_PER_WORD                   /* only defined here if not already defined (e.g. by a -D option) */
+#define BITS_PER_WORD           (64)    /* number of bits in each word of a Threefish block */
+#endif
+
+#define ROUNDS_PER_CYCLE         (8)    /* when do we inject keys and start reusing rotation constants? */
+#define MAX_BITS_PER_BLK      (1024)    /* largest block size handled, in bits */
+
+#define MAX_WORDS_PER_BLK       (MAX_BITS_PER_BLK/BITS_PER_WORD)           /* 16 when BITS_PER_WORD is 64 */
+#define MAX_ROTS_PER_CYCLE      (MAX_WORDS_PER_BLK*(ROUNDS_PER_CYCLE/2))   /* one rotation per word pair per round */
+
+/* default search parameters for different block sizes (suffix _4/_8/_16 = words per block) */
+#define DEFAULT_GEN_CNT_4     (5500)    /* NOTE(review): looks like the default for testParms.genCntMax -- confirm */
+#define DEFAULT_ROUND_CNT_4     ( 8)    /* NOTE(review): looks like the default for testParms.rounds -- confirm */
+#define MIN_HW_OR_4             (50)    /* NOTE(review): looks like the default for testParms.minHW_or -- confirm */
+#define MAX_SAT_ROUNDS_4        ( 9)    /* NOTE(review): looks like the default for testParms.maxSatRnds -- confirm */
+
+#define DEFAULT_GEN_CNT_8     (1600)    /* 8-word counterparts of the four _4 values above */
+#define DEFAULT_ROUND_CNT_8     ( 8)
+#define MIN_HW_OR_8             (36)
+#define MAX_SAT_ROUNDS_8        (10)
+
+#define DEFAULT_GEN_CNT_16     (400)    /* the 1024-bit search is slower, so search for fewer iterations :-( */
+#define DEFAULT_ROUND_CNT_16    ( 9)
+#define MIN_HW_OR_16            (40)
+#define MAX_SAT_ROUNDS_16       (11)
+
+#define MAX_ROT_VER_CNT         ( 4)    /* number of entries in rSearchRec.hw_OR[] */
+#define MAX_ROT_VER_MASK        ((1 << MAX_ROT_VER_CNT ) - 1)    /* low MAX_ROT_VER_CNT bits set (0xF) */
+
+#define MAX_POP_CNT           (1024)    /* size of population */
+#define MIN_POP_CNT           (  32)    /* presumably the smallest population size accepted -- confirm */
+#define DEFAULT_POP_CNT       (MAX_POP_CNT)
+
+#define ID_RECALC_BIT_NUM       (16)    /* bit position of ID_RECALC_BIT */
+#define TWIDDLE_CNT_BIT0        (17)    /* rSearchRec.ID holds TwiddleCnt shifted left by this many bits */
+#define TWIDDLE_CNT_MASK    ((1 << TWIDDLE_CNT_BIT0  ) - 1)      /* bits 0..16, i.e. everything below the twiddle count */
+#define ID_RECALC_BIT       ( 1 << ID_RECALC_BIT_NUM )           /* bit 16 */
+#define ID_NUM_MASK         ((1 << ID_RECALC_BIT_NUM ) - 1)      /* bits 0..15 */
+
+#if     BITS_PER_WORD == 64
+typedef u64b    Word;                   /* the unit the round functions operate on */
+#elif   BITS_PER_WORD == 32
+typedef u32b    Word;
+#else
+#error  "Invalid BITS_PER_WORD"
+#endif                                  /* only 64 and 32 are accepted for BITS_PER_WORD */
+
+/* tstFlag bits: each is a distinct single-bit mask (presumably OR-ed into testParms.tstFlags -- confirm) */
+#define TST_FLG_SHOW        (1u << 0)
+#define TST_FLG_SHOW_HIST   (1u << 1)
+#define TST_FLG_VERBOSE     (1u << 2)
+#define TST_FLG_STDERR      (1u << 3)
+#define TST_FLG_QUICK_EXIT  (1u << 4)
+#define TST_FLG_USE_ABS     (1u << 5)
+#define TST_FLG_KEEP_MIN_HW (1u << 6)
+#define TST_FLG_WEIGHT_REP  (1u << 7)
+#define TST_FLG_CHECK_ONE   (1u << 8)
+#define TST_FLG_DO_RAND     (1u << 9)
+
+/* parameters for ShowSearchRec (distinct small integers 0..4, not bit flags) */
+#define SHOW_ROTS_FINAL     (4)          
+#define SHOW_ROTS_H         (3)
+#define SHOW_ROTS_PRELIM    (2)
+#define SHOW_ROTS           (1)
+#define SHOW_NONE           (0)
+
+typedef struct { Word x[MAX_WORDS_PER_BLK]; } Block;   /* word storage sized for the largest block */
+
+typedef void cycle_func(Word *b, const u08b *rotates, int rounds);   /* signature of the fwd_cycle_N / rev_cycle_N functions */
+
+typedef struct                          /* record for dealing with rotation searches */
+    {
+    u08b rotList[MAX_ROTS_PER_CYCLE];   /* rotation constants (room for one full cycle at the largest block size) */
+    uint CRC;                           /* CRC of rotates[] ("rotates[]" presumably means rotList) -- use as a quick "ID" */
+    uint ID;                            /* (get_rotation index) + (TwiddleCnt << TWIDDLE_CNT_BIT0) */
+    uint parentCRC;                     /* CRC of the parent (allows us to track genealogy a bit) */
+    uint rWorst;                        /* "worst" min bit-to-bit differential */
+    u08b hw_OR[MAX_ROT_VER_CNT];        /* min hamming weights (over all words), using OR; MAX_ROT_VER_CNT entries */
+    } rSearchRec;
+
+typedef struct                          /* pass a bunch of parameters to RunSearch */
+    {
+    uint    tstFlags;                   /* presumably a combination of the TST_FLG_* bits -- confirm */
+    uint    rounds;                     /* NOTE(review): likely the round count handed to the cycle functions -- confirm */
+    uint    minHW_or;
+    uint    minOffs;
+    uint    diffBits;
+    uint    genCntMax;
+    uint    sampleCnt;
+    uint    maxSatRnds;
+    uint    seed0;                      /* NOTE(review): looks like a PRNG seed (cf. Rand_Init) -- confirm */
+    uint    rotVerMask;
+    uint    popCnt;                     /* presumably the population size (cf. MIN_POP_CNT/MAX_POP_CNT) -- confirm */
+    uint    runHours;                   /* 0 ==> never */
+    uint    dupRotMask;                 /* zero --> allow dup rots within the same round */
+    uint    regradeCnt;                 /* default = 3 */
+    u64b    goodRotCntMask;             /* which rotation values are ok? bit n set ==> count n passes RotCnt_Bad */
+    } testParms;
+
+/* globals: the cycle_func pointers start out NULL and must be assigned before they are called */
+cycle_func *fwd_cycle       =   NULL;
+cycle_func *rev_cycle       =   NULL;
+cycle_func *fwd_cycle_or    =   NULL;   /* slow but steady */
+cycle_func *rev_cycle_or    =   NULL;
+cycle_func *fwd_cycle_or_rN =   NULL;   /* optimized for the current # rounds (for speed) */
+cycle_func *rev_cycle_or_rN =   NULL;
+const char *rotFileName     =   NULL;   /* read from file instead of generate random? */
+uint        bitsPerBlock    =      0;   /* default is to process all block sizes */
+uint        rotsPerCycle;               /* no initializer: starts at 0 like any file-scope object */
+uint        wordsPerBlock;              /* no initializer: starts at 0 like any file-scope object */
+
+/* macro "functions": RotCnt_Bad needs a variable 't' with a goodRotCntMask member in scope; rotates accept 0 <= N < BITS_PER_WORD */
+#define RotCnt_Bad(rotCnt) (((t.goodRotCntMask >> (rotCnt)) & 1) == 0)   /* true when bit rotCnt of the mask is clear */
+#define  left_rot(a,N)     (((a) << (N)) | ((a) >> ((BITS_PER_WORD - (N)) & (BITS_PER_WORD - 1))))   /* count masked: N == 0 is not a full-width shift */
+#define right_rot(a,N)     (((a) >> (N)) | ((a) << ((BITS_PER_WORD - (N)) & (BITS_PER_WORD - 1))))   /* count masked: N == 0 is not a full-width shift */
+#define DUP_64(w32)        ((w32) | (((u64b) (w32)) << 32))               /* OR w32 with a copy of itself shifted up 32 bits */
+
+/********************** use RC4 to generate test data ******************/
+/* Note: this works identically on all platforms (big/little-endian)   */
+static struct
+    {
+    uint I,J;                           /* RC4 vars: RandBytes keeps both reduced with & 0xFF */
+    u08b state[256];                    /* RC4 permutation: filled by Rand_Init, shuffled by RandBytes */
+    } prng;
+
+/* Write byteCnt bytes of RC4 keystream to dst, advancing the global prng state. */
+void RandBytes(void *dst,uint byteCnt)
+    {
+    u08b *out = (u08b *) dst;
+    while (byteCnt-- != 0)
+        {
+        u08b si,sj;                             /* the two state bytes that get swapped */
+        prng.I = (prng.I + 1) & 0xFF;
+        si     = prng.state[prng.I];
+        prng.J = (prng.J + si) & 0xFF;
+        sj     = prng.state[prng.J];
+        prng.state[prng.I] = sj;
+        prng.state[prng.J] = si;
+        *out++ = prng.state[(si + sj) & 0xFF];  /* output byte is picked by the sum of the pair */
+        }
+    }
+
+/* Return one keystream byte from RandBytes, widened to uint (0..255). */
+uint Rand08(void)
+    {
+    u08b oneByte[1];
+    RandBytes(oneByte,sizeof(oneByte));
+    return (uint) oneByte[0];
+    }
+
+/* Build a pseudo-random uint from sizeof(uint) keystream bytes, first byte most
+ * significant, so the value does not depend on the host's byte order. */
+uint Rand32(void)
+    {
+    u08b raw[sizeof(uint)];
+    uint acc = 0;
+    uint k   = 0;
+
+    RandBytes(raw,sizeof(raw));
+    while (k < sizeof(raw))
+        acc = (acc << 8) | raw[k++];
+    return acc;
+    }
+
+/* Build a pseudo-random u64b from sizeof(u64b) keystream bytes.  Bytes are folded in
+ * most-significant first, so the result is the same on big- and little-endian hosts. */
+u64b Rand64(void)
+    {
+    u08b raw[sizeof(u64b)];
+    u64b acc = 0;
+    uint k;
+
+    RandBytes(raw,sizeof(raw));
+    for (k = 0; k < sizeof(raw); k++)
+        {
+        acc = (acc << 8) | raw[k];
+        }
+    return acc;
+    }
+
+/* Seed the global RC4-based prng from a 64-bit seed, then discard early keystream. */
+void Rand_Init(u64b seed)
+    {
+    u08b key[8],hold;                   /* seed bytes (least significant first); swap temporary */
+    u08b discard[4*256];                /* sink for the keystream that is thrown away */
+    uint n,j;
+
+    /* serialize the seed with shifts, so the key bytes do not depend on endianness */
+    for (n = 0; n < sizeof(key); n++)
+        key[n] = (u08b) (seed >> (8*n));
+
+    /* start from the identity permutation */
+    for (n = 0; n < 256; n++)
+        prng.state[n] = (u08b) n;
+
+    /* RC4 key schedule: the 8 key bytes are reused cyclically over all 256 steps */
+    for (n = 0, j = 0; n < 256; n++)
+        {
+        j = (j + prng.state[n] + key[n % sizeof(key)]) & 0xFF;
+        hold          = prng.state[n];
+        prng.state[n] = prng.state[j];
+        prng.state[j] = hold;
+        }
+    prng.I = prng.J = 0;                /* keystream generation starts from I = J = 0 */
+    /* throw away the first sizeof(discard) keystream bytes before returning */
+    RandBytes(discard,sizeof(discard));
+    }
+
+/* implementations of Skein round functions for various block sizes (16, 8 and 4 words) */
+void fwd_cycle_16(Word *b, const u08b *rotates, int rounds)   /* 16-word block: runs 'rounds' rounds in place on b[] */
+    {                                   /* round r of a pass reads rotates[8*(r-1) .. 8*r-1]; no key injection is done here */
+    for (;rounds > 0;rounds -=8)        /* one pass = up to 8 rounds; each further pass restarts at rotates[0] */
+        {                               /* round 1: add, rotate, xor on each word pair */
+        b[ 0] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[ 0]); b[ 1] ^= b[ 0];
+        b[ 2] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[ 1]); b[ 3] ^= b[ 2];
+        b[ 4] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[ 2]); b[ 5] ^= b[ 4];
+        b[ 6] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[ 3]); b[ 7] ^= b[ 6];
+        b[ 8] += b[ 9]; b[ 9] = left_rot(b[ 9], rotates[ 4]); b[ 9] ^= b[ 8];
+        b[10] += b[11]; b[11] = left_rot(b[11], rotates[ 5]); b[11] ^= b[10];
+        b[12] += b[13]; b[13] = left_rot(b[13], rotates[ 6]); b[13] ^= b[12];
+        b[14] += b[15]; b[15] = left_rot(b[15], rotates[ 7]); b[15] ^= b[14];
+        if (rounds == 1) break;         /* leave the loop mid-pass once the requested count is reached */
+        /* round 2: the word pairing changes; there is no separate permutation step */
+        b[ 0] += b[ 9]; b[ 9] = left_rot(b[ 9], rotates[ 8]); b[ 9] ^= b[ 0];
+        b[ 2] += b[13]; b[13] = left_rot(b[13], rotates[ 9]); b[13] ^= b[ 2];
+        b[ 6] += b[11]; b[11] = left_rot(b[11], rotates[10]); b[11] ^= b[ 6];
+        b[ 4] += b[15]; b[15] = left_rot(b[15], rotates[11]); b[15] ^= b[ 4];
+        b[10] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[12]); b[ 7] ^= b[10];
+        b[12] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[13]); b[ 3] ^= b[12];
+        b[14] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[14]); b[ 5] ^= b[14];
+        b[ 8] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[15]); b[ 1] ^= b[ 8];
+        if (rounds == 2) break;
+        /* round 3 */
+        b[ 0] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[16]); b[ 7] ^= b[ 0];
+        b[ 2] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[17]); b[ 5] ^= b[ 2];
+        b[ 4] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[18]); b[ 3] ^= b[ 4];
+        b[ 6] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[19]); b[ 1] ^= b[ 6];
+        b[12] += b[15]; b[15] = left_rot(b[15], rotates[20]); b[15] ^= b[12];
+        b[14] += b[13]; b[13] = left_rot(b[13], rotates[21]); b[13] ^= b[14];
+        b[ 8] += b[11]; b[11] = left_rot(b[11], rotates[22]); b[11] ^= b[ 8];
+        b[10] += b[ 9]; b[ 9] = left_rot(b[ 9], rotates[23]); b[ 9] ^= b[10];
+        if (rounds == 3) break;
+        /* round 4 */
+        b[ 0] += b[15]; b[15] = left_rot(b[15], rotates[24]); b[15] ^= b[ 0];
+        b[ 2] += b[11]; b[11] = left_rot(b[11], rotates[25]); b[11] ^= b[ 2];
+        b[ 6] += b[13]; b[13] = left_rot(b[13], rotates[26]); b[13] ^= b[ 6];
+        b[ 4] += b[ 9]; b[ 9] = left_rot(b[ 9], rotates[27]); b[ 9] ^= b[ 4];
+        b[14] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[28]); b[ 1] ^= b[14];
+        b[ 8] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[29]); b[ 5] ^= b[ 8];
+        b[10] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[30]); b[ 3] ^= b[10];
+        b[12] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[31]); b[ 7] ^= b[12];
+        if (rounds == 4) break;
+        /* round 5: same pairing as round 1, next 8 rotation counts */
+        b[ 0] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[32]); b[ 1] ^= b[ 0];
+        b[ 2] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[33]); b[ 3] ^= b[ 2];
+        b[ 4] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[34]); b[ 5] ^= b[ 4];
+        b[ 6] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[35]); b[ 7] ^= b[ 6];
+        b[ 8] += b[ 9]; b[ 9] = left_rot(b[ 9], rotates[36]); b[ 9] ^= b[ 8];
+        b[10] += b[11]; b[11] = left_rot(b[11], rotates[37]); b[11] ^= b[10];
+        b[12] += b[13]; b[13] = left_rot(b[13], rotates[38]); b[13] ^= b[12];
+        b[14] += b[15]; b[15] = left_rot(b[15], rotates[39]); b[15] ^= b[14];
+        if (rounds == 5) break;
+        /* round 6: same pairing as round 2 */
+        b[ 0] += b[ 9]; b[ 9] = left_rot(b[ 9], rotates[40]); b[ 9] ^= b[ 0];
+        b[ 2] += b[13]; b[13] = left_rot(b[13], rotates[41]); b[13] ^= b[ 2];
+        b[ 6] += b[11]; b[11] = left_rot(b[11], rotates[42]); b[11] ^= b[ 6];
+        b[ 4] += b[15]; b[15] = left_rot(b[15], rotates[43]); b[15] ^= b[ 4];
+        b[10] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[44]); b[ 7] ^= b[10];
+        b[12] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[45]); b[ 3] ^= b[12];
+        b[14] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[46]); b[ 5] ^= b[14];
+        b[ 8] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[47]); b[ 1] ^= b[ 8];
+        if (rounds == 6) break;
+        /* round 7: same pairing as round 3 */
+        b[ 0] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[48]); b[ 7] ^= b[ 0];
+        b[ 2] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[49]); b[ 5] ^= b[ 2];
+        b[ 4] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[50]); b[ 3] ^= b[ 4];
+        b[ 6] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[51]); b[ 1] ^= b[ 6];
+        b[12] += b[15]; b[15] = left_rot(b[15], rotates[52]); b[15] ^= b[12];
+        b[14] += b[13]; b[13] = left_rot(b[13], rotates[53]); b[13] ^= b[14];
+        b[ 8] += b[11]; b[11] = left_rot(b[11], rotates[54]); b[11] ^= b[ 8];
+        b[10] += b[ 9]; b[ 9] = left_rot(b[ 9], rotates[55]); b[ 9] ^= b[10];
+        if (rounds == 7) break;
+        /* round 8: same pairing as round 4 */
+        b[ 0] += b[15]; b[15] = left_rot(b[15], rotates[56]); b[15] ^= b[ 0];
+        b[ 2] += b[11]; b[11] = left_rot(b[11], rotates[57]); b[11] ^= b[ 2];
+        b[ 6] += b[13]; b[13] = left_rot(b[13], rotates[58]); b[13] ^= b[ 6];
+        b[ 4] += b[ 9]; b[ 9] = left_rot(b[ 9], rotates[59]); b[ 9] ^= b[ 4];
+        b[14] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[60]); b[ 1] ^= b[14];
+        b[ 8] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[61]); b[ 5] ^= b[ 8];
+        b[10] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[62]); b[ 3] ^= b[10];
+        b[12] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[63]); b[ 7] ^= b[12];
+        }
+    }
+
+void fwd_cycle_8(Word *b, const u08b *rotates, int rounds)    /* 8-word block: runs 'rounds' rounds in place on b[] */
+    {                                   /* round r of a pass reads rotates[4*(r-1) .. 4*r-1]; no key injection is done here */
+    for (;rounds > 0;rounds -=8)        /* one pass = up to 8 rounds; each further pass restarts at rotates[0] */
+        {                               /* round 1: add, rotate, xor on each word pair */
+        b[ 0] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[ 0]); b[ 1] ^= b[ 0];
+        b[ 2] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[ 1]); b[ 3] ^= b[ 2];
+        b[ 4] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[ 2]); b[ 5] ^= b[ 4];
+        b[ 6] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[ 3]); b[ 7] ^= b[ 6];
+        if (rounds == 1) break;         /* leave the loop mid-pass once the requested count is reached */
+        /* round 2: the word pairing changes; there is no separate permutation step */
+        b[ 2] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[ 4]); b[ 1] ^= b[ 2];
+        b[ 4] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[ 5]); b[ 7] ^= b[ 4];
+        b[ 6] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[ 6]); b[ 5] ^= b[ 6];
+        b[ 0] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[ 7]); b[ 3] ^= b[ 0];
+        if (rounds == 2) break;
+        /* round 3 */
+        b[ 4] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[ 8]); b[ 1] ^= b[ 4];
+        b[ 6] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[ 9]); b[ 3] ^= b[ 6];
+        b[ 0] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[10]); b[ 5] ^= b[ 0];
+        b[ 2] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[11]); b[ 7] ^= b[ 2];
+        if (rounds == 3) break;
+        /* round 4 */
+        b[ 6] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[12]); b[ 1] ^= b[ 6];
+        b[ 0] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[13]); b[ 7] ^= b[ 0];
+        b[ 2] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[14]); b[ 5] ^= b[ 2];
+        b[ 4] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[15]); b[ 3] ^= b[ 4];
+        if (rounds == 4) break;
+        /* round 5: same pairing as round 1, next 4 rotation counts */
+        b[ 0] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[16]); b[ 1] ^= b[ 0];
+        b[ 2] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[17]); b[ 3] ^= b[ 2];
+        b[ 4] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[18]); b[ 5] ^= b[ 4];
+        b[ 6] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[19]); b[ 7] ^= b[ 6];
+        if (rounds == 5) break;
+        /* round 6: same pairing as round 2 */
+        b[ 2] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[20]); b[ 1] ^= b[ 2];
+        b[ 4] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[21]); b[ 7] ^= b[ 4];
+        b[ 6] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[22]); b[ 5] ^= b[ 6];
+        b[ 0] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[23]); b[ 3] ^= b[ 0];
+        if (rounds == 6) break;
+        /* round 7: same pairing as round 3 */
+        b[ 4] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[24]); b[ 1] ^= b[ 4];
+        b[ 6] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[25]); b[ 3] ^= b[ 6];
+        b[ 0] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[26]); b[ 5] ^= b[ 0];
+        b[ 2] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[27]); b[ 7] ^= b[ 2];
+        if (rounds == 7) break;
+        /* round 8: same pairing as round 4 */
+        b[ 6] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[28]); b[ 1] ^= b[ 6];
+        b[ 0] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[29]); b[ 7] ^= b[ 0];
+        b[ 2] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[30]); b[ 5] ^= b[ 2];
+        b[ 4] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[31]); b[ 3] ^= b[ 4];
+        }
+    }
+
+void fwd_cycle_4(Word *b, const u08b *rotates, int rounds)    /* 4-word block: runs 'rounds' rounds in place on b[] */
+    {                                   /* round r of a pass reads rotates[2*(r-1)] and rotates[2*(r-1)+1]; no key injection here */
+    for (;rounds > 0;rounds -=8)        /* one pass = up to 8 rounds; each further pass restarts at rotates[0] */
+        {                               /* round 1: pairs (0,1) and (2,3) */
+        b[ 0] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[ 0]); b[ 1] ^= b[ 0];
+        b[ 2] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[ 1]); b[ 3] ^= b[ 2];
+        if (rounds == 1) break;         /* leave the loop mid-pass once the requested count is reached */
+        /* round 2: pairs (0,3) and (2,1); odd and even rounds alternate between these two pairings */
+        b[ 0] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[ 2]); b[ 3] ^= b[ 0];
+        b[ 2] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[ 3]); b[ 1] ^= b[ 2];
+        if (rounds == 2) break;
+        /* round 3 */
+        b[ 0] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[ 4]); b[ 1] ^= b[ 0];
+        b[ 2] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[ 5]); b[ 3] ^= b[ 2];
+        if (rounds == 3) break;
+        /* round 4 */
+        b[ 0] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[ 6]); b[ 3] ^= b[ 0];
+        b[ 2] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[ 7]); b[ 1] ^= b[ 2];
+        if (rounds == 4) break;
+        /* round 5 */
+        b[ 0] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[ 8]); b[ 1] ^= b[ 0];
+        b[ 2] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[ 9]); b[ 3] ^= b[ 2];
+        if (rounds == 5) break;
+        /* round 6 */
+        b[ 0] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[10]); b[ 3] ^= b[ 0];
+        b[ 2] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[11]); b[ 1] ^= b[ 2];
+        if (rounds == 6) break;
+        /* round 7 */
+        b[ 0] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[12]); b[ 1] ^= b[ 0];
+        b[ 2] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[13]); b[ 3] ^= b[ 2];
+        if (rounds == 7) break;
+        /* round 8 */
+        b[ 0] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[14]); b[ 3] ^= b[ 0];
+        b[ 2] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[15]); b[ 1] ^= b[ 2];
+        }
+    }
+
+/* reverse versions of the cipher */
+void rev_cycle_16(Word *b, const u08b *rotates, int rounds)   /* undoes fwd_cycle_16 for the same rotates[] and rounds */
+    {                                   /* each step inverts one mix: xor, rotate right, then subtract */
+    for (;rounds > 0;rounds = (rounds-1) & ~7)    /* after a pass, drop to the next lower multiple of 8 */
+        {
+        switch (rounds & 7)             /* enter at the last round that was run; the cases fall through on purpose */
+            {
+            case 0:                     /* rounds is a multiple of 8: undo round 8 first */
+                    b[ 7] ^= b[12]; b[ 7] = right_rot(b[ 7], rotates[63]); b[12] -= b[ 7]; 
+                    b[ 3] ^= b[10]; b[ 3] = right_rot(b[ 3], rotates[62]); b[10] -= b[ 3]; 
+                    b[ 5] ^= b[ 8]; b[ 5] = right_rot(b[ 5], rotates[61]); b[ 8] -= b[ 5]; 
+                    b[ 1] ^= b[14]; b[ 1] = right_rot(b[ 1], rotates[60]); b[14] -= b[ 1]; 
+                    b[ 9] ^= b[ 4]; b[ 9] = right_rot(b[ 9], rotates[59]); b[ 4] -= b[ 9]; 
+                    b[13] ^= b[ 6]; b[13] = right_rot(b[13], rotates[58]); b[ 6] -= b[13]; 
+                    b[11] ^= b[ 2]; b[11] = right_rot(b[11], rotates[57]); b[ 2] -= b[11]; 
+                    b[15] ^= b[ 0]; b[15] = right_rot(b[15], rotates[56]); b[ 0] -= b[15];
+            case 7:                     /* undo round 7 (reached by fall-through or direct entry) */
+                    b[ 9] ^= b[10]; b[ 9] = right_rot(b[ 9], rotates[55]); b[10] -= b[ 9];
+                    b[11] ^= b[ 8]; b[11] = right_rot(b[11], rotates[54]); b[ 8] -= b[11];
+                    b[13] ^= b[14]; b[13] = right_rot(b[13], rotates[53]); b[14] -= b[13];
+                    b[15] ^= b[12]; b[15] = right_rot(b[15], rotates[52]); b[12] -= b[15];
+                    b[ 1] ^= b[ 6]; b[ 1] = right_rot(b[ 1], rotates[51]); b[ 6] -= b[ 1];
+                    b[ 3] ^= b[ 4]; b[ 3] = right_rot(b[ 3], rotates[50]); b[ 4] -= b[ 3];
+                    b[ 5] ^= b[ 2]; b[ 5] = right_rot(b[ 5], rotates[49]); b[ 2] -= b[ 5];
+                    b[ 7] ^= b[ 0]; b[ 7] = right_rot(b[ 7], rotates[48]); b[ 0] -= b[ 7];
+            case 6:                     /* undo round 6 */
+                    b[ 1] ^= b[ 8]; b[ 1] = right_rot(b[ 1], rotates[47]); b[ 8] -= b[ 1];
+                    b[ 5] ^= b[14]; b[ 5] = right_rot(b[ 5], rotates[46]); b[14] -= b[ 5];
+                    b[ 3] ^= b[12]; b[ 3] = right_rot(b[ 3], rotates[45]); b[12] -= b[ 3];
+                    b[ 7] ^= b[10]; b[ 7] = right_rot(b[ 7], rotates[44]); b[10] -= b[ 7];
+                    b[15] ^= b[ 4]; b[15] = right_rot(b[15], rotates[43]); b[ 4] -= b[15];
+                    b[11] ^= b[ 6]; b[11] = right_rot(b[11], rotates[42]); b[ 6] -= b[11];
+                    b[13] ^= b[ 2]; b[13] = right_rot(b[13], rotates[41]); b[ 2] -= b[13];
+                    b[ 9] ^= b[ 0]; b[ 9] = right_rot(b[ 9], rotates[40]); b[ 0] -= b[ 9];
+            case 5:                     /* undo round 5 */
+                    b[15] ^= b[14]; b[15] = right_rot(b[15], rotates[39]); b[14] -= b[15];
+                    b[13] ^= b[12]; b[13] = right_rot(b[13], rotates[38]); b[12] -= b[13];
+                    b[11] ^= b[10]; b[11] = right_rot(b[11], rotates[37]); b[10] -= b[11];
+                    b[ 9] ^= b[ 8]; b[ 9] = right_rot(b[ 9], rotates[36]); b[ 8] -= b[ 9];
+                    b[ 7] ^= b[ 6]; b[ 7] = right_rot(b[ 7], rotates[35]); b[ 6] -= b[ 7];
+                    b[ 5] ^= b[ 4]; b[ 5] = right_rot(b[ 5], rotates[34]); b[ 4] -= b[ 5];
+                    b[ 3] ^= b[ 2]; b[ 3] = right_rot(b[ 3], rotates[33]); b[ 2] -= b[ 3];
+                    b[ 1] ^= b[ 0]; b[ 1] = right_rot(b[ 1], rotates[32]); b[ 0] -= b[ 1];
+            case 4:                     /* undo round 4 */
+                    b[ 7] ^= b[12]; b[ 7] = right_rot(b[ 7], rotates[31]); b[12] -= b[ 7];
+                    b[ 3] ^= b[10]; b[ 3] = right_rot(b[ 3], rotates[30]); b[10] -= b[ 3];
+                    b[ 5] ^= b[ 8]; b[ 5] = right_rot(b[ 5], rotates[29]); b[ 8] -= b[ 5];
+                    b[ 1] ^= b[14]; b[ 1] = right_rot(b[ 1], rotates[28]); b[14] -= b[ 1];
+                    b[ 9] ^= b[ 4]; b[ 9] = right_rot(b[ 9], rotates[27]); b[ 4] -= b[ 9];
+                    b[13] ^= b[ 6]; b[13] = right_rot(b[13], rotates[26]); b[ 6] -= b[13];
+                    b[11] ^= b[ 2]; b[11] = right_rot(b[11], rotates[25]); b[ 2] -= b[11];
+                    b[15] ^= b[ 0]; b[15] = right_rot(b[15], rotates[24]); b[ 0] -= b[15];
+            case 3:                     /* undo round 3 */
+                    b[ 9] ^= b[10]; b[ 9] = right_rot(b[ 9], rotates[23]); b[10] -= b[ 9];
+                    b[11] ^= b[ 8]; b[11] = right_rot(b[11], rotates[22]); b[ 8] -= b[11];
+                    b[13] ^= b[14]; b[13] = right_rot(b[13], rotates[21]); b[14] -= b[13];
+                    b[15] ^= b[12]; b[15] = right_rot(b[15], rotates[20]); b[12] -= b[15];
+                    b[ 1] ^= b[ 6]; b[ 1] = right_rot(b[ 1], rotates[19]); b[ 6] -= b[ 1];
+                    b[ 3] ^= b[ 4]; b[ 3] = right_rot(b[ 3], rotates[18]); b[ 4] -= b[ 3];
+                    b[ 5] ^= b[ 2]; b[ 5] = right_rot(b[ 5], rotates[17]); b[ 2] -= b[ 5];
+                    b[ 7] ^= b[ 0]; b[ 7] = right_rot(b[ 7], rotates[16]); b[ 0] -= b[ 7];
+            case 2:                     /* undo round 2 */
+                    b[ 1] ^= b[ 8]; b[ 1] = right_rot(b[ 1], rotates[15]); b[ 8] -= b[ 1];
+                    b[ 5] ^= b[14]; b[ 5] = right_rot(b[ 5], rotates[14]); b[14] -= b[ 5];
+                    b[ 3] ^= b[12]; b[ 3] = right_rot(b[ 3], rotates[13]); b[12] -= b[ 3];
+                    b[ 7] ^= b[10]; b[ 7] = right_rot(b[ 7], rotates[12]); b[10] -= b[ 7];
+                    b[15] ^= b[ 4]; b[15] = right_rot(b[15], rotates[11]); b[ 4] -= b[15];
+                    b[11] ^= b[ 6]; b[11] = right_rot(b[11], rotates[10]); b[ 6] -= b[11];
+                    b[13] ^= b[ 2]; b[13] = right_rot(b[13], rotates[ 9]); b[ 2] -= b[13];
+                    b[ 9] ^= b[ 0]; b[ 9] = right_rot(b[ 9], rotates[ 8]); b[ 0] -= b[ 9];
+            case 1:                     /* undo round 1 */
+                    b[15] ^= b[14]; b[15] = right_rot(b[15], rotates[ 7]); b[14] -= b[15];
+                    b[13] ^= b[12]; b[13] = right_rot(b[13], rotates[ 6]); b[12] -= b[13];
+                    b[11] ^= b[10]; b[11] = right_rot(b[11], rotates[ 5]); b[10] -= b[11];
+                    b[ 9] ^= b[ 8]; b[ 9] = right_rot(b[ 9], rotates[ 4]); b[ 8] -= b[ 9];
+                    b[ 7] ^= b[ 6]; b[ 7] = right_rot(b[ 7], rotates[ 3]); b[ 6] -= b[ 7];
+                    b[ 5] ^= b[ 4]; b[ 5] = right_rot(b[ 5], rotates[ 2]); b[ 4] -= b[ 5];
+                    b[ 3] ^= b[ 2]; b[ 3] = right_rot(b[ 3], rotates[ 1]); b[ 2] -= b[ 3];
+                    b[ 1] ^= b[ 0]; b[ 1] = right_rot(b[ 1], rotates[ 0]); b[ 0] -= b[ 1];
+            }
+
+        }
+    }
+                                                                                          
+/*
+ * Run `rounds` rounds of the 8-word reverse cycle on b[0..7], in place.
+ *
+ * Steps are processed in descending rotation-index order.  Step k works on
+ * the word pair (ia, ir) = mix[k % 16] and performs
+ *     b[ir] ^= b[ia];  b[ir] = right_rot(b[ir], rotates[k]);  b[ia] -= b[ir];
+ * Four steps make one round, so rotates[0..31] covers a group of eight
+ * rounds.  When `rounds` is not a multiple of 8 the partial group
+ * (rounds % 8 rounds) is processed first, then full groups of eight.
+ * Nothing is done when rounds <= 0.
+ */
+void rev_cycle_8(Word *b, const u08b *rotates, int rounds)
+    {
+    /* {ia, ir} word pair for each of the 16 steps of a 4-round period */
+    static const unsigned char mix[16][2] =
+        {
+        { 0, 1}, { 2, 3}, { 4, 5}, { 6, 7},
+        { 2, 1}, { 4, 7}, { 6, 5}, { 0, 3},
+        { 4, 1}, { 6, 3}, { 0, 5}, { 2, 7},
+        { 6, 1}, { 0, 7}, { 2, 5}, { 4, 3}
+        };
+    int k, n;
+
+    while (rounds > 0)
+        {
+        n = rounds & 7;                 /* rounds in this group ...          */
+        if (n == 0)
+            n = 8;                      /* ... a multiple of 8 means all 8   */
+        for (k = 4*n - 1; k >= 0; k--)
+            {
+            const int ia = mix[k & 15][0];
+            const int ir = mix[k & 15][1];
+            b[ir] ^= b[ia];
+            b[ir]  = right_rot(b[ir], rotates[k]);
+            b[ia] -= b[ir];
+            }
+        rounds = (rounds - 1) & ~7;     /* drop down to the previous multiple of 8 */
+        }
+    }
+                                                                                          
+/*
+ * Run `rounds` rounds of the 4-word reverse cycle on b[0..3], in place.
+ *
+ * Steps are processed in descending rotation-index order.  Step k works on
+ * the word pair (ia, ir) = mix[k % 4] and performs
+ *     b[ir] ^= b[ia];  b[ir] = right_rot(b[ir], rotates[k]);  b[ia] -= b[ir];
+ * Two steps make one round, so rotates[0..15] covers a group of eight
+ * rounds.  When `rounds` is not a multiple of 8 the partial group
+ * (rounds % 8 rounds) is processed first, then full groups of eight.
+ * Nothing is done when rounds <= 0.
+ */
+void rev_cycle_4(Word *b, const u08b *rotates, int rounds)
+    {
+    /* {ia, ir} word pair for each of the 4 steps of a 2-round period */
+    static const unsigned char mix[4][2] =
+        {
+        { 0, 1}, { 2, 3},
+        { 0, 3}, { 2, 1}
+        };
+    int k, n;
+
+    while (rounds > 0)
+        {
+        n = rounds & 7;                 /* rounds in this group ...          */
+        if (n == 0)
+            n = 8;                      /* ... a multiple of 8 means all 8   */
+        for (k = 2*n - 1; k >= 0; k--)
+            {
+            const int ia = mix[k & 3][0];
+            const int ir = mix[k & 3][1];
+            b[ir] ^= b[ia];
+            b[ir]  = right_rot(b[ir], rotates[k]);
+            b[ia] -= b[ir];
+            }
+        rounds = (rounds - 1) & ~7;     /* drop down to the previous multiple of 8 */
+        }
+    }
+
+#ifdef TEST_OR  /* test aid: keep the real add/subtract/xor so the "_or" routines stay invertible (OR itself is not) */
+#define AddOp(I,J) b[I] += b[J]
+#define SubOp(I,J) b[I] -= b[J]
+#define XorOp(I,J) b[I] ^= b[J]
+#else           /* this is the "real" OR version: all three operations become a bitwise OR */
+#define AddOp(I,J) b[I] |= b[J]
+#define SubOp(I,J) b[I] |= b[J]
+#define XorOp(I,J) b[I] |= b[J]
+#endif          /* NOTE: each macro expands to an assignment on whatever array named b is in scope at the call site */
+
+/* "OR" versions of the cipher: replace ADD, XOR with OR */
+/*
+ * Run `rounds` rounds of the 16-word forward "OR" cycle on b[0..15], in place.
+ *
+ * Step k works on the word pair (ia, ir) = mix[k % 32] and performs
+ *     AddOp(ia, ir);  b[ir] = left_rot(b[ir], rotates[k]);  XorOp(ir, ia);
+ * where AddOp/XorOp are the file's operation macros.  Eight steps make one
+ * round, so rotates[0..63] covers a group of eight rounds; every further
+ * group starts again at rotates[0].  The last group stops after the
+ * remaining rounds.  Nothing is done when rounds <= 0.
+ */
+void fwd_cycle_16_or(Word *b, const u08b *rotates, int rounds)
+    {
+    /* {ia, ir} word pair for each of the 32 steps of a 4-round period */
+    static const unsigned char mix[32][2] =
+        {
+        { 0, 1}, { 2, 3}, { 4, 5}, { 6, 7}, { 8, 9}, {10,11}, {12,13}, {14,15},
+        { 0, 9}, { 2,13}, { 6,11}, { 4,15}, {10, 7}, {12, 3}, {14, 5}, { 8, 1},
+        { 0, 7}, { 2, 5}, { 4, 3}, { 6, 1}, {12,15}, {14,13}, { 8,11}, {10, 9},
+        { 0,15}, { 2,11}, { 6,13}, { 4, 9}, {14, 1}, { 8, 5}, {10, 3}, {12, 7}
+        };
+    int k, last;
+
+    while (rounds > 0)
+        {
+        last = 8 * ((rounds < 8) ? rounds : 8);   /* steps in this group */
+        for (k = 0; k < last; k++)
+            {
+            const int ia = mix[k & 31][0];
+            const int ir = mix[k & 31][1];
+            AddOp(ia, ir);
+            b[ir] = left_rot(b[ir], rotates[k]);
+            XorOp(ir, ia);
+            }
+        rounds -= 8;
+        }
+    }
+
+/*
+ * Run `rounds` rounds of the 8-word forward "OR" cycle on b[0..7], in place.
+ *
+ * Step k works on the word pair (ia, ir) = mix[k % 16] and performs
+ *     AddOp(ia, ir);  b[ir] = left_rot(b[ir], rotates[k]);  XorOp(ir, ia);
+ * where AddOp/XorOp are the file's operation macros.  Four steps make one
+ * round, so rotates[0..31] covers a group of eight rounds; every further
+ * group starts again at rotates[0].  The last group stops after the
+ * remaining rounds.  Nothing is done when rounds <= 0.
+ */
+void fwd_cycle_8_or(Word *b, const u08b *rotates, int rounds)
+    {
+    /* {ia, ir} word pair for each of the 16 steps of a 4-round period */
+    static const unsigned char mix[16][2] =
+        {
+        { 0, 1}, { 2, 3}, { 4, 5}, { 6, 7},
+        { 2, 1}, { 4, 7}, { 6, 5}, { 0, 3},
+        { 4, 1}, { 6, 3}, { 0, 5}, { 2, 7},
+        { 6, 1}, { 0, 7}, { 2, 5}, { 4, 3}
+        };
+    int k, last;
+
+    while (rounds > 0)
+        {
+        last = 4 * ((rounds < 8) ? rounds : 8);   /* steps in this group */
+        for (k = 0; k < last; k++)
+            {
+            const int ia = mix[k & 15][0];
+            const int ir = mix[k & 15][1];
+            AddOp(ia, ir);
+            b[ir] = left_rot(b[ir], rotates[k]);
+            XorOp(ir, ia);
+            }
+        rounds -= 8;
+        }
+    }
+
+/*
+ * Run `rounds` rounds of the 4-word forward "OR" cycle on b[0..3], in place.
+ *
+ * Step k works on the word pair (ia, ir) = mix[k % 4] and performs
+ *     AddOp(ia, ir);  b[ir] = left_rot(b[ir], rotates[k]);  XorOp(ir, ia);
+ * where AddOp/XorOp are the file's operation macros.  Two steps make one
+ * round, so rotates[0..15] covers a group of eight rounds; every further
+ * group starts again at rotates[0].  The last group stops after the
+ * remaining rounds.  Nothing is done when rounds <= 0.
+ */
+void fwd_cycle_4_or(Word *b, const u08b *rotates, int rounds)
+    {
+    /* {ia, ir} word pair for each of the 4 steps of a 2-round period */
+    static const unsigned char mix[4][2] =
+        {
+        { 0, 1}, { 2, 3},
+        { 0, 3}, { 2, 1}
+        };
+    int k, last;
+
+    while (rounds > 0)
+        {
+        last = 2 * ((rounds < 8) ? rounds : 8);   /* steps in this group */
+        for (k = 0; k < last; k++)
+            {
+            const int ia = mix[k & 3][0];
+            const int ir = mix[k & 3][1];
+            AddOp(ia, ir);
+            b[ir] = left_rot(b[ir], rotates[k]);
+            XorOp(ir, ia);
+            }
+        rounds -= 8;
+        }
+    }
+
+/* reverse versions of the cipher, using OR */
+/*
+ * Run `rounds` rounds of the 16-word reverse "OR" cycle on b[0..15], in place.
+ *
+ * Steps are processed in descending rotation-index order.  Step k works on
+ * the word pair (ia, ir) = mix[k % 32] and performs
+ *     XorOp(ir, ia);  b[ir] = right_rot(b[ir], rotates[k]);  SubOp(ia, ir);
+ * where XorOp/SubOp are the file's operation macros.  Eight steps make one
+ * round, so rotates[0..63] covers a group of eight rounds.  When `rounds`
+ * is not a multiple of 8 the partial group (rounds % 8 rounds) is processed
+ * first, then full groups of eight.  Nothing is done when rounds <= 0.
+ */
+void rev_cycle_16_or(Word *b, const u08b *rotates, int rounds)
+    {
+    /* {ia, ir} word pair for each of the 32 steps of a 4-round period */
+    static const unsigned char mix[32][2] =
+        {
+        { 0, 1}, { 2, 3}, { 4, 5}, { 6, 7}, { 8, 9}, {10,11}, {12,13}, {14,15},
+        { 0, 9}, { 2,13}, { 6,11}, { 4,15}, {10, 7}, {12, 3}, {14, 5}, { 8, 1},
+        { 0, 7}, { 2, 5}, { 4, 3}, { 6, 1}, {12,15}, {14,13}, { 8,11}, {10, 9},
+        { 0,15}, { 2,11}, { 6,13}, { 4, 9}, {14, 1}, { 8, 5}, {10, 3}, {12, 7}
+        };
+    int k, n;
+
+    while (rounds > 0)
+        {
+        n = rounds & 7;                 /* rounds in this group ...          */
+        if (n == 0)
+            n = 8;                      /* ... a multiple of 8 means all 8   */
+        for (k = 8*n - 1; k >= 0; k--)
+            {
+            const int ia = mix[k & 31][0];
+            const int ir = mix[k & 31][1];
+            XorOp(ir, ia);
+            b[ir] = right_rot(b[ir], rotates[k]);
+            SubOp(ia, ir);
+            }
+        rounds = (rounds - 1) & ~7;     /* drop down to the previous multiple of 8 */
+        }
+    }
+                                                                                          
+/*
+ * Run `rounds` rounds of the 8-word reverse "OR" cycle on b[0..7], in place.
+ *
+ * Steps are processed in descending rotation-index order.  Step k works on
+ * the word pair (ia, ir) = mix[k % 16] and performs
+ *     XorOp(ir, ia);  b[ir] = right_rot(b[ir], rotates[k]);  SubOp(ia, ir);
+ * where XorOp/SubOp are the file's operation macros.  Four steps make one
+ * round, so rotates[0..31] covers a group of eight rounds.  When `rounds`
+ * is not a multiple of 8 the partial group (rounds % 8 rounds) is processed
+ * first, then full groups of eight.  Nothing is done when rounds <= 0.
+ */
+void rev_cycle_8_or(Word *b, const u08b *rotates, int rounds)
+    {
+    /* {ia, ir} word pair for each of the 16 steps of a 4-round period */
+    static const unsigned char mix[16][2] =
+        {
+        { 0, 1}, { 2, 3}, { 4, 5}, { 6, 7},
+        { 2, 1}, { 4, 7}, { 6, 5}, { 0, 3},
+        { 4, 1}, { 6, 3}, { 0, 5}, { 2, 7},
+        { 6, 1}, { 0, 7}, { 2, 5}, { 4, 3}
+        };
+    int k, n;
+
+    while (rounds > 0)
+        {
+        n = rounds & 7;                 /* rounds in this group ...          */
+        if (n == 0)
+            n = 8;                      /* ... a multiple of 8 means all 8   */
+        for (k = 4*n - 1; k >= 0; k--)
+            {
+            const int ia = mix[k & 15][0];
+            const int ir = mix[k & 15][1];
+            XorOp(ir, ia);
+            b[ir] = right_rot(b[ir], rotates[k]);
+            SubOp(ia, ir);
+            }
+        rounds = (rounds - 1) & ~7;     /* drop down to the previous multiple of 8 */
+        }
+    }
+                                                                                          
+/*
+ * Run `rounds` rounds of the 4-word reverse "OR" cycle on b[0..3], in place.
+ *
+ * Steps are processed in descending rotation-index order.  Step k works on
+ * the word pair (ia, ir) = mix[k % 4] and performs
+ *     XorOp(ir, ia);  b[ir] = right_rot(b[ir], rotates[k]);  SubOp(ia, ir);
+ * where XorOp/SubOp are the file's operation macros.  Two steps make one
+ * round, so rotates[0..15] covers a group of eight rounds.  When `rounds`
+ * is not a multiple of 8 the partial group (rounds % 8 rounds) is processed
+ * first, then full groups of eight.  Nothing is done when rounds <= 0.
+ */
+void rev_cycle_4_or(Word *b, const u08b *rotates, int rounds)
+    {
+    /* {ia, ir} word pair for each of the 4 steps of a 2-round period */
+    static const unsigned char mix[4][2] =
+        {
+        { 0, 1}, { 2, 3},
+        { 0, 3}, { 2, 1}
+        };
+    int k, n;
+
+    while (rounds > 0)
+        {
+        n = rounds & 7;                 /* rounds in this group ...          */
+        if (n == 0)
+            n = 8;                      /* ... a multiple of 8 means all 8   */
+        for (k = 2*n - 1; k >= 0; k--)
+            {
+            const int ia = mix[k & 3][0];
+            const int ir = mix[k & 3][1];
+            XorOp(ir, ia);
+            b[ir] = right_rot(b[ir], rotates[k]);
+            SubOp(ia, ir);
+            }
+        rounds = (rounds - 1) & ~7;     /* drop down to the previous multiple of 8 */
+        }
+    }
+
+/* optimized versions for default round counts: every round is written out, with no loop and no test of "rounds" */
+#if   defined(__BORLANDC__)
+#pragma argsused
+#elif defined(_MSC_VER)
+#pragma warning(disable:4100)
+#endif
+void fwd_cycle_16_or_r9(Word *b, const u08b *rotates, int rounds)
+    {   /* 16 words b[0..15], updated in place; 9 rounds of 8 word-pair steps. Round 1: rotates[ 0.. 7] */
+        AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[ 0]); XorOp( 1, 0);
+        AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[ 1]); XorOp( 3, 2);
+        AddOp( 4, 5); b[ 5] = left_rot(b[ 5], rotates[ 2]); XorOp( 5, 4);
+        AddOp( 6, 7); b[ 7] = left_rot(b[ 7], rotates[ 3]); XorOp( 7, 6);
+        AddOp( 8, 9); b[ 9] = left_rot(b[ 9], rotates[ 4]); XorOp( 9, 8);
+        AddOp(10,11); b[11] = left_rot(b[11], rotates[ 5]); XorOp(11,10);
+        AddOp(12,13); b[13] = left_rot(b[13], rotates[ 6]); XorOp(13,12);
+        AddOp(14,15); b[15] = left_rot(b[15], rotates[ 7]); XorOp(15,14);
+        /* round 2: rotates[ 8..15] */
+        AddOp( 0, 9); b[ 9] = left_rot(b[ 9], rotates[ 8]); XorOp( 9, 0);
+        AddOp( 2,13); b[13] = left_rot(b[13], rotates[ 9]); XorOp(13, 2);
+        AddOp( 6,11); b[11] = left_rot(b[11], rotates[10]); XorOp(11, 6);
+        AddOp( 4,15); b[15] = left_rot(b[15], rotates[11]); XorOp(15, 4);
+        AddOp(10, 7); b[ 7] = left_rot(b[ 7], rotates[12]); XorOp( 7,10);
+        AddOp(12, 3); b[ 3] = left_rot(b[ 3], rotates[13]); XorOp( 3,12);
+        AddOp(14, 5); b[ 5] = left_rot(b[ 5], rotates[14]); XorOp( 5,14);
+        AddOp( 8, 1); b[ 1] = left_rot(b[ 1], rotates[15]); XorOp( 1, 8);
+        /* round 3: rotates[16..23] */
+        AddOp( 0, 7); b[ 7] = left_rot(b[ 7], rotates[16]); XorOp( 7, 0);
+        AddOp( 2, 5); b[ 5] = left_rot(b[ 5], rotates[17]); XorOp( 5, 2);
+        AddOp( 4, 3); b[ 3] = left_rot(b[ 3], rotates[18]); XorOp( 3, 4);
+        AddOp( 6, 1); b[ 1] = left_rot(b[ 1], rotates[19]); XorOp( 1, 6);
+        AddOp(12,15); b[15] = left_rot(b[15], rotates[20]); XorOp(15,12);
+        AddOp(14,13); b[13] = left_rot(b[13], rotates[21]); XorOp(13,14);
+        AddOp( 8,11); b[11] = left_rot(b[11], rotates[22]); XorOp(11, 8);
+        AddOp(10, 9); b[ 9] = left_rot(b[ 9], rotates[23]); XorOp( 9,10);
+        /* round 4: rotates[24..31] */
+        AddOp( 0,15); b[15] = left_rot(b[15], rotates[24]); XorOp(15, 0);
+        AddOp( 2,11); b[11] = left_rot(b[11], rotates[25]); XorOp(11, 2);
+        AddOp( 6,13); b[13] = left_rot(b[13], rotates[26]); XorOp(13, 6);
+        AddOp( 4, 9); b[ 9] = left_rot(b[ 9], rotates[27]); XorOp( 9, 4);
+        AddOp(14, 1); b[ 1] = left_rot(b[ 1], rotates[28]); XorOp( 1,14);
+        AddOp( 8, 5); b[ 5] = left_rot(b[ 5], rotates[29]); XorOp( 5, 8);
+        AddOp(10, 3); b[ 3] = left_rot(b[ 3], rotates[30]); XorOp( 3,10);
+        AddOp(12, 7); b[ 7] = left_rot(b[ 7], rotates[31]); XorOp( 7,12);
+        /* round 5: rotates[32..39], same word pairs as round 1 */
+        AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[32]); XorOp( 1, 0);
+        AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[33]); XorOp( 3, 2);
+        AddOp( 4, 5); b[ 5] = left_rot(b[ 5], rotates[34]); XorOp( 5, 4);
+        AddOp( 6, 7); b[ 7] = left_rot(b[ 7], rotates[35]); XorOp( 7, 6);
+        AddOp( 8, 9); b[ 9] = left_rot(b[ 9], rotates[36]); XorOp( 9, 8);
+        AddOp(10,11); b[11] = left_rot(b[11], rotates[37]); XorOp(11,10);
+        AddOp(12,13); b[13] = left_rot(b[13], rotates[38]); XorOp(13,12);
+        AddOp(14,15); b[15] = left_rot(b[15], rotates[39]); XorOp(15,14);
+        /* round 6: rotates[40..47], same word pairs as round 2 */
+        AddOp( 0, 9); b[ 9] = left_rot(b[ 9], rotates[40]); XorOp( 9, 0);
+        AddOp( 2,13); b[13] = left_rot(b[13], rotates[41]); XorOp(13, 2);
+        AddOp( 6,11); b[11] = left_rot(b[11], rotates[42]); XorOp(11, 6);
+        AddOp( 4,15); b[15] = left_rot(b[15], rotates[43]); XorOp(15, 4);
+        AddOp(10, 7); b[ 7] = left_rot(b[ 7], rotates[44]); XorOp( 7,10);
+        AddOp(12, 3); b[ 3] = left_rot(b[ 3], rotates[45]); XorOp( 3,12);
+        AddOp(14, 5); b[ 5] = left_rot(b[ 5], rotates[46]); XorOp( 5,14);
+        AddOp( 8, 1); b[ 1] = left_rot(b[ 1], rotates[47]); XorOp( 1, 8);
+        /* round 7: rotates[48..55], same word pairs as round 3 */
+        AddOp( 0, 7); b[ 7] = left_rot(b[ 7], rotates[48]); XorOp( 7, 0);
+        AddOp( 2, 5); b[ 5] = left_rot(b[ 5], rotates[49]); XorOp( 5, 2);
+        AddOp( 4, 3); b[ 3] = left_rot(b[ 3], rotates[50]); XorOp( 3, 4);
+        AddOp( 6, 1); b[ 1] = left_rot(b[ 1], rotates[51]); XorOp( 1, 6);
+        AddOp(12,15); b[15] = left_rot(b[15], rotates[52]); XorOp(15,12);
+        AddOp(14,13); b[13] = left_rot(b[13], rotates[53]); XorOp(13,14);
+        AddOp( 8,11); b[11] = left_rot(b[11], rotates[54]); XorOp(11, 8);
+        AddOp(10, 9); b[ 9] = left_rot(b[ 9], rotates[55]); XorOp( 9,10);
+        /* round 8: rotates[56..63], same word pairs as round 4 */
+        AddOp( 0,15); b[15] = left_rot(b[15], rotates[56]); XorOp(15, 0);
+        AddOp( 2,11); b[11] = left_rot(b[11], rotates[57]); XorOp(11, 2);
+        AddOp( 6,13); b[13] = left_rot(b[13], rotates[58]); XorOp(13, 6);
+        AddOp( 4, 9); b[ 9] = left_rot(b[ 9], rotates[59]); XorOp( 9, 4);
+        AddOp(14, 1); b[ 1] = left_rot(b[ 1], rotates[60]); XorOp( 1,14);
+        AddOp( 8, 5); b[ 5] = left_rot(b[ 5], rotates[61]); XorOp( 5, 8);
+        AddOp(10, 3); b[ 3] = left_rot(b[ 3], rotates[62]); XorOp( 3,10);
+        AddOp(12, 7); b[ 7] = left_rot(b[ 7], rotates[63]); XorOp( 7,12);
+        /* round 9: the rotation index wraps, so this repeats round 1 with rotates[ 0.. 7] */
+        AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[ 0]); XorOp( 1, 0);
+        AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[ 1]); XorOp( 3, 2);
+        AddOp( 4, 5); b[ 5] = left_rot(b[ 5], rotates[ 2]); XorOp( 5, 4);
+        AddOp( 6, 7); b[ 7] = left_rot(b[ 7], rotates[ 3]); XorOp( 7, 6);
+        AddOp( 8, 9); b[ 9] = left_rot(b[ 9], rotates[ 4]); XorOp( 9, 8);
+        AddOp(10,11); b[11] = left_rot(b[11], rotates[ 5]); XorOp(11,10);
+        AddOp(12,13); b[13] = left_rot(b[13], rotates[ 6]); XorOp(13,12);
+        AddOp(14,15); b[15] = left_rot(b[15], rotates[ 7]); XorOp(15,14);
+    }
+
+#if   defined(__BORLANDC__)
+#pragma argsused
+#endif
+void fwd_cycle_8_or_r8(Word *b, const u08b *rotates, int rounds)
+    {   /* 8 words b[0..7], updated in place; 8 unrolled rounds of 4 word-pair steps ("rounds" is not referenced). Round 1: rotates[ 0.. 3] */
+        AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[ 0]); XorOp( 1, 0);
+        AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[ 1]); XorOp( 3, 2);
+        AddOp( 4, 5); b[ 5] = left_rot(b[ 5], rotates[ 2]); XorOp( 5, 4);
+        AddOp( 6, 7); b[ 7] = left_rot(b[ 7], rotates[ 3]); XorOp( 7, 6);
+        /* round 2: rotates[ 4.. 7] */
+        AddOp( 2, 1); b[ 1] = left_rot(b[ 1], rotates[ 4]); XorOp( 1, 2);
+        AddOp( 4, 7); b[ 7] = left_rot(b[ 7], rotates[ 5]); XorOp( 7, 4);
+        AddOp( 6, 5); b[ 5] = left_rot(b[ 5], rotates[ 6]); XorOp( 5, 6);
+        AddOp( 0, 3); b[ 3] = left_rot(b[ 3], rotates[ 7]); XorOp( 3, 0);
+        /* round 3: rotates[ 8..11] */
+        AddOp( 4, 1); b[ 1] = left_rot(b[ 1], rotates[ 8]); XorOp( 1, 4);
+        AddOp( 6, 3); b[ 3] = left_rot(b[ 3], rotates[ 9]); XorOp( 3, 6);
+        AddOp( 0, 5); b[ 5] = left_rot(b[ 5], rotates[10]); XorOp( 5, 0);
+        AddOp( 2, 7); b[ 7] = left_rot(b[ 7], rotates[11]); XorOp( 7, 2);
+        /* round 4: rotates[12..15] */
+        AddOp( 6, 1); b[ 1] = left_rot(b[ 1], rotates[12]); XorOp( 1, 6);
+        AddOp( 0, 7); b[ 7] = left_rot(b[ 7], rotates[13]); XorOp( 7, 0);
+        AddOp( 2, 5); b[ 5] = left_rot(b[ 5], rotates[14]); XorOp( 5, 2);
+        AddOp( 4, 3); b[ 3] = left_rot(b[ 3], rotates[15]); XorOp( 3, 4);
+        /* round 5: rotates[16..19], same word pairs as round 1 */
+        AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[16]); XorOp( 1, 0);
+        AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[17]); XorOp( 3, 2);
+        AddOp( 4, 5); b[ 5] = left_rot(b[ 5], rotates[18]); XorOp( 5, 4);
+        AddOp( 6, 7); b[ 7] = left_rot(b[ 7], rotates[19]); XorOp( 7, 6);
+        /* round 6: rotates[20..23], same word pairs as round 2 */
+        AddOp( 2, 1); b[ 1] = left_rot(b[ 1], rotates[20]); XorOp( 1, 2);
+        AddOp( 4, 7); b[ 7] = left_rot(b[ 7], rotates[21]); XorOp( 7, 4);
+        AddOp( 6, 5); b[ 5] = left_rot(b[ 5], rotates[22]); XorOp( 5, 6);
+        AddOp( 0, 3); b[ 3] = left_rot(b[ 3], rotates[23]); XorOp( 3, 0);
+        /* round 7: rotates[24..27], same word pairs as round 3 */
+        AddOp( 4, 1); b[ 1] = left_rot(b[ 1], rotates[24]); XorOp( 1, 4);
+        AddOp( 6, 3); b[ 3] = left_rot(b[ 3], rotates[25]); XorOp( 3, 6);
+        AddOp( 0, 5); b[ 5] = left_rot(b[ 5], rotates[26]); XorOp( 5, 0);
+        AddOp( 2, 7); b[ 7] = left_rot(b[ 7], rotates[27]); XorOp( 7, 2);
+        /* round 8: rotates[28..31], same word pairs as round 4 */
+        AddOp( 6, 1); b[ 1] = left_rot(b[ 1], rotates[28]); XorOp( 1, 6);
+        AddOp( 0, 7); b[ 7] = left_rot(b[ 7], rotates[29]); XorOp( 7, 0);
+        AddOp( 2, 5); b[ 5] = left_rot(b[ 5], rotates[30]); XorOp( 5, 2);
+        AddOp( 4, 3); b[ 3] = left_rot(b[ 3], rotates[31]); XorOp( 3, 4);
+    }
+
+#ifdef __BORLANDC__
+#pragma argsused
+#endif
+void fwd_cycle_4_or_r8(Word *b, const u08b *rotates, int rounds)
+    {   /* 4 words b[0..3], updated in place; 8 unrolled rounds of 2 word-pair steps ("rounds" is not referenced). Round 1: rotates[ 0.. 1] */
+        AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[ 0]); XorOp( 1, 0);
+        AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[ 1]); XorOp( 3, 2);
+        /* round 2: rotates[ 2.. 3] */
+        AddOp( 0, 3); b[ 3] = left_rot(b[ 3], rotates[ 2]); XorOp( 3, 0);
+        AddOp( 2, 1); b[ 1] = left_rot(b[ 1], rotates[ 3]); XorOp( 1, 2);
+        /* round 3: rotates[ 4.. 5], same word pairs as round 1 */
+        AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[ 4]); XorOp( 1, 0);
+        AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[ 5]); XorOp( 3, 2);
+        /* round 4: rotates[ 6.. 7], same word pairs as round 2 */
+        AddOp( 0, 3); b[ 3] = left_rot(b[ 3], rotates[ 6]); XorOp( 3, 0);
+        AddOp( 2, 1); b[ 1] = left_rot(b[ 1], rotates[ 7]); XorOp( 1, 2);
+        /* round 5: rotates[ 8.. 9] */
+        AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[ 8]); XorOp( 1, 0);
+        AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[ 9]); XorOp( 3, 2);
+        /* round 6: rotates[10..11] */
+        AddOp( 0, 3); b[ 3] = left_rot(b[ 3], rotates[10]); XorOp( 3, 0);
+        AddOp( 2, 1); b[ 1] = left_rot(b[ 1], rotates[11]); XorOp( 1, 2);
+        /* round 7: rotates[12..13] */
+        AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[12]); XorOp( 1, 0);
+        AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[13]); XorOp( 3, 2);
+        /* round 8: rotates[14..15] */
+        AddOp( 0, 3); b[ 3] = left_rot(b[ 3], rotates[14]); XorOp( 3, 0);
+        AddOp( 2, 1); b[ 1] = left_rot(b[ 1], rotates[15]); XorOp( 1, 2);
+    }
+
+/* reverse versions of the cipher, using OR, for fixed round numbers: each one walks its fwd_cycle_*_or_r* counterpart's steps in reverse order */
+#ifdef __BORLANDC__
+#pragma argsused
+#endif
+void rev_cycle_16_or_r9(Word *b, const u08b *rotates, int rounds)
+    {   /* 16 words b[0..15], updated in place; 9 unrolled rounds taken last-to-first. Round 9 first: rotates[ 7.. 0] */
+    XorOp(15,14); b[15] = right_rot(b[15], rotates[ 7]); SubOp(14,15);
+    XorOp(13,12); b[13] = right_rot(b[13], rotates[ 6]); SubOp(12,13);
+    XorOp(11,10); b[11] = right_rot(b[11], rotates[ 5]); SubOp(10,11);
+    XorOp( 9, 8); b[ 9] = right_rot(b[ 9], rotates[ 4]); SubOp( 8, 9);
+    XorOp( 7, 6); b[ 7] = right_rot(b[ 7], rotates[ 3]); SubOp( 6, 7);
+    XorOp( 5, 4); b[ 5] = right_rot(b[ 5], rotates[ 2]); SubOp( 4, 5);
+    XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[ 1]); SubOp( 2, 3);
+    XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[ 0]); SubOp( 0, 1);
+    /* round 8: rotates[63..56] */
+    XorOp( 7,12); b[ 7] = right_rot(b[ 7], rotates[63]); SubOp(12, 7); 
+    XorOp( 3,10); b[ 3] = right_rot(b[ 3], rotates[62]); SubOp(10, 3); 
+    XorOp( 5, 8); b[ 5] = right_rot(b[ 5], rotates[61]); SubOp( 8, 5); 
+    XorOp( 1,14); b[ 1] = right_rot(b[ 1], rotates[60]); SubOp(14, 1); 
+    XorOp( 9, 4); b[ 9] = right_rot(b[ 9], rotates[59]); SubOp( 4, 9); 
+    XorOp(13, 6); b[13] = right_rot(b[13], rotates[58]); SubOp( 6,13); 
+    XorOp(11, 2); b[11] = right_rot(b[11], rotates[57]); SubOp( 2,11); 
+    XorOp(15, 0); b[15] = right_rot(b[15], rotates[56]); SubOp( 0,15);
+    /* round 7: rotates[55..48] */
+    XorOp( 9,10); b[ 9] = right_rot(b[ 9], rotates[55]); SubOp(10, 9); 
+    XorOp(11, 8); b[11] = right_rot(b[11], rotates[54]); SubOp( 8,11); 
+    XorOp(13,14); b[13] = right_rot(b[13], rotates[53]); SubOp(14,13); 
+    XorOp(15,12); b[15] = right_rot(b[15], rotates[52]); SubOp(12,15); 
+    XorOp( 1, 6); b[ 1] = right_rot(b[ 1], rotates[51]); SubOp( 6, 1); 
+    XorOp( 3, 4); b[ 3] = right_rot(b[ 3], rotates[50]); SubOp( 4, 3); 
+    XorOp( 5, 2); b[ 5] = right_rot(b[ 5], rotates[49]); SubOp( 2, 5); 
+    XorOp( 7, 0); b[ 7] = right_rot(b[ 7], rotates[48]); SubOp( 0, 7);
+    /* round 6: rotates[47..40] */
+    XorOp( 1, 8); b[ 1] = right_rot(b[ 1], rotates[47]); SubOp( 8, 1); 
+    XorOp( 5,14); b[ 5] = right_rot(b[ 5], rotates[46]); SubOp(14, 5); 
+    XorOp( 3,12); b[ 3] = right_rot(b[ 3], rotates[45]); SubOp(12, 3); 
+    XorOp( 7,10); b[ 7] = right_rot(b[ 7], rotates[44]); SubOp(10, 7); 
+    XorOp(15, 4); b[15] = right_rot(b[15], rotates[43]); SubOp( 4,15); 
+    XorOp(11, 6); b[11] = right_rot(b[11], rotates[42]); SubOp( 6,11); 
+    XorOp(13, 2); b[13] = right_rot(b[13], rotates[41]); SubOp( 2,13); 
+    XorOp( 9, 0); b[ 9] = right_rot(b[ 9], rotates[40]); SubOp( 0, 9);
+    /* round 5: rotates[39..32] */
+    XorOp(15,14); b[15] = right_rot(b[15], rotates[39]); SubOp(14,15); 
+    XorOp(13,12); b[13] = right_rot(b[13], rotates[38]); SubOp(12,13); 
+    XorOp(11,10); b[11] = right_rot(b[11], rotates[37]); SubOp(10,11); 
+    XorOp( 9, 8); b[ 9] = right_rot(b[ 9], rotates[36]); SubOp( 8, 9); 
+    XorOp( 7, 6); b[ 7] = right_rot(b[ 7], rotates[35]); SubOp( 6, 7); 
+    XorOp( 5, 4); b[ 5] = right_rot(b[ 5], rotates[34]); SubOp( 4, 5); 
+    XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[33]); SubOp( 2, 3); 
+    XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[32]); SubOp( 0, 1);
+    /* round 4: rotates[31..24] */
+    XorOp( 7,12); b[ 7] = right_rot(b[ 7], rotates[31]); SubOp(12, 7); 
+    XorOp( 3,10); b[ 3] = right_rot(b[ 3], rotates[30]); SubOp(10, 3); 
+    XorOp( 5, 8); b[ 5] = right_rot(b[ 5], rotates[29]); SubOp( 8, 5); 
+    XorOp( 1,14); b[ 1] = right_rot(b[ 1], rotates[28]); SubOp(14, 1); 
+    XorOp( 9, 4); b[ 9] = right_rot(b[ 9], rotates[27]); SubOp( 4, 9); 
+    XorOp(13, 6); b[13] = right_rot(b[13], rotates[26]); SubOp( 6,13); 
+    XorOp(11, 2); b[11] = right_rot(b[11], rotates[25]); SubOp( 2,11); 
+    XorOp(15, 0); b[15] = right_rot(b[15], rotates[24]); SubOp( 0,15);
+    /* round 3: rotates[23..16] */
+    XorOp( 9,10); b[ 9] = right_rot(b[ 9], rotates[23]); SubOp(10, 9);
+    XorOp(11, 8); b[11] = right_rot(b[11], rotates[22]); SubOp( 8,11);
+    XorOp(13,14); b[13] = right_rot(b[13], rotates[21]); SubOp(14,13);
+    XorOp(15,12); b[15] = right_rot(b[15], rotates[20]); SubOp(12,15);
+    XorOp( 1, 6); b[ 1] = right_rot(b[ 1], rotates[19]); SubOp( 6, 1);
+    XorOp( 3, 4); b[ 3] = right_rot(b[ 3], rotates[18]); SubOp( 4, 3);
+    XorOp( 5, 2); b[ 5] = right_rot(b[ 5], rotates[17]); SubOp( 2, 5);
+    XorOp( 7, 0); b[ 7] = right_rot(b[ 7], rotates[16]); SubOp( 0, 7);
+    /* round 2: rotates[15.. 8] */
+    XorOp( 1, 8); b[ 1] = right_rot(b[ 1], rotates[15]); SubOp( 8, 1);
+    XorOp( 5,14); b[ 5] = right_rot(b[ 5], rotates[14]); SubOp(14, 5);
+    XorOp( 3,12); b[ 3] = right_rot(b[ 3], rotates[13]); SubOp(12, 3);
+    XorOp( 7,10); b[ 7] = right_rot(b[ 7], rotates[12]); SubOp(10, 7);
+    XorOp(15, 4); b[15] = right_rot(b[15], rotates[11]); SubOp( 4,15);
+    XorOp(11, 6); b[11] = right_rot(b[11], rotates[10]); SubOp( 6,11);
+    XorOp(13, 2); b[13] = right_rot(b[13], rotates[ 9]); SubOp( 2,13);
+    XorOp( 9, 0); b[ 9] = right_rot(b[ 9], rotates[ 8]); SubOp( 0, 9);
+    /* round 1: rotates[ 7.. 0] (same steps as the round-9 group at the top) */
+    XorOp(15,14); b[15] = right_rot(b[15], rotates[ 7]); SubOp(14,15);
+    XorOp(13,12); b[13] = right_rot(b[13], rotates[ 6]); SubOp(12,13);
+    XorOp(11,10); b[11] = right_rot(b[11], rotates[ 5]); SubOp(10,11);
+    XorOp( 9, 8); b[ 9] = right_rot(b[ 9], rotates[ 4]); SubOp( 8, 9);
+    XorOp( 7, 6); b[ 7] = right_rot(b[ 7], rotates[ 3]); SubOp( 6, 7);
+    XorOp( 5, 4); b[ 5] = right_rot(b[ 5], rotates[ 2]); SubOp( 4, 5);
+    XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[ 1]); SubOp( 2, 3);
+    XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[ 0]); SubOp( 0, 1);
+    }
+                                                                                          
+#ifdef __BORLANDC__
+#pragma argsused
+#endif
+void rev_cycle_8_or_r8(Word *b, const u08b *rotates, int rounds)                             
+    {   /* 8 words b[0..7], updated in place; walks fwd_cycle_8_or_r8's steps last-to-first. Round 8 first: rotates[31..28] */
+    XorOp( 3, 4); b[ 3] = right_rot(b[ 3], rotates[31]); SubOp( 4, 3);
+    XorOp( 5, 2); b[ 5] = right_rot(b[ 5], rotates[30]); SubOp( 2, 5);
+    XorOp( 7, 0); b[ 7] = right_rot(b[ 7], rotates[29]); SubOp( 0, 7);
+    XorOp( 1, 6); b[ 1] = right_rot(b[ 1], rotates[28]); SubOp( 6, 1);
+    /* round 7: rotates[27..24] */
+    XorOp( 7, 2); b[ 7] = right_rot(b[ 7], rotates[27]); SubOp( 2, 7);
+    XorOp( 5, 0); b[ 5] = right_rot(b[ 5], rotates[26]); SubOp( 0, 5);
+    XorOp( 3, 6); b[ 3] = right_rot(b[ 3], rotates[25]); SubOp( 6, 3);
+    XorOp( 1, 4); b[ 1] = right_rot(b[ 1], rotates[24]); SubOp( 4, 1);
+    /* round 6: rotates[23..20] */
+    XorOp( 3, 0); b[ 3] = right_rot(b[ 3], rotates[23]); SubOp( 0, 3);
+    XorOp( 5, 6); b[ 5] = right_rot(b[ 5], rotates[22]); SubOp( 6, 5);
+    XorOp( 7, 4); b[ 7] = right_rot(b[ 7], rotates[21]); SubOp( 4, 7);
+    XorOp( 1, 2); b[ 1] = right_rot(b[ 1], rotates[20]); SubOp( 2, 1);
+    /* round 5: rotates[19..16] */
+    XorOp( 7, 6); b[ 7] = right_rot(b[ 7], rotates[19]); SubOp( 6, 7);
+    XorOp( 5, 4); b[ 5] = right_rot(b[ 5], rotates[18]); SubOp( 4, 5);
+    XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[17]); SubOp( 2, 3);
+    XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[16]); SubOp( 0, 1);
+    /* round 4: rotates[15..12] */
+    XorOp( 3, 4); b[ 3] = right_rot(b[ 3], rotates[15]); SubOp( 4, 3);
+    XorOp( 5, 2); b[ 5] = right_rot(b[ 5], rotates[14]); SubOp( 2, 5);
+    XorOp( 7, 0); b[ 7] = right_rot(b[ 7], rotates[13]); SubOp( 0, 7);
+    XorOp( 1, 6); b[ 1] = right_rot(b[ 1], rotates[12]); SubOp( 6, 1);
+    /* round 3: rotates[11.. 8] */
+    XorOp( 7, 2); b[ 7] = right_rot(b[ 7], rotates[11]); SubOp( 2, 7);
+    XorOp( 5, 0); b[ 5] = right_rot(b[ 5], rotates[10]); SubOp( 0, 5);
+    XorOp( 3, 6); b[ 3] = right_rot(b[ 3], rotates[ 9]); SubOp( 6, 3);
+    XorOp( 1, 4); b[ 1] = right_rot(b[ 1], rotates[ 8]); SubOp( 4, 1);
+    /* round 2: rotates[ 7.. 4] */
+    XorOp( 3, 0); b[ 3] = right_rot(b[ 3], rotates[ 7]); SubOp( 0, 3);
+    XorOp( 5, 6); b[ 5] = right_rot(b[ 5], rotates[ 6]); SubOp( 6, 5);
+    XorOp( 7, 4); b[ 7] = right_rot(b[ 7], rotates[ 5]); SubOp( 4, 7);
+    XorOp( 1, 2); b[ 1] = right_rot(b[ 1], rotates[ 4]); SubOp( 2, 1);
+    /* round 1: rotates[ 3.. 0] */
+    XorOp( 7, 6); b[ 7] = right_rot(b[ 7], rotates[ 3]); SubOp( 6, 7);
+    XorOp( 5, 4); b[ 5] = right_rot(b[ 5], rotates[ 2]); SubOp( 4, 5);
+    XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[ 1]); SubOp( 2, 3);
+    XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[ 0]); SubOp( 0, 1);
+    }                                                                                     
+                                                                                          
+#ifdef __BORLANDC__
+#pragma argsused
+#endif
+void rev_cycle_4_or_r8(Word *b, const u08b *rotates, int rounds)                             
+    {   /* 4 words b[0..3], updated in place; walks fwd_cycle_4_or_r8's steps last-to-first. Round 8 first: rotates[15..14] */
+    XorOp( 1, 2); b[ 1] = right_rot(b[ 1], rotates[15]); SubOp( 2, 1);
+    XorOp( 3, 0); b[ 3] = right_rot(b[ 3], rotates[14]); SubOp( 0, 3);
+    /* round 7: rotates[13..12] */
+    XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[13]); SubOp( 2, 3);
+    XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[12]); SubOp( 0, 1);
+    /* round 6: rotates[11..10] */
+    XorOp( 1, 2); b[ 1] = right_rot(b[ 1], rotates[11]); SubOp( 2, 1);
+    XorOp( 3, 0); b[ 3] = right_rot(b[ 3], rotates[10]); SubOp( 0, 3);
+    /* round 5: rotates[ 9.. 8] */
+    XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[ 9]); SubOp( 2, 3);
+    XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[ 8]); SubOp( 0, 1);
+    /* round 4: rotates[ 7.. 6] */
+    XorOp( 1, 2); b[ 1] = right_rot(b[ 1], rotates[ 7]); SubOp( 2, 1);
+    XorOp( 3, 0); b[ 3] = right_rot(b[ 3], rotates[ 6]); SubOp( 0, 3);
+    /* round 3: rotates[ 5.. 4] */
+    XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[ 5]); SubOp( 2, 3);
+    XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[ 4]); SubOp( 0, 1);
+    /* round 2: rotates[ 3.. 2] */
+    XorOp( 1, 2); b[ 1] = right_rot(b[ 1], rotates[ 3]); SubOp( 2, 1);
+    XorOp( 3, 0); b[ 3] = right_rot(b[ 3], rotates[ 2]); SubOp( 0, 3);
+    /* round 1: rotates[ 1.. 0] */
+    XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[ 1]); SubOp( 2, 3);
+    XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[ 0]); SubOp( 0, 1);
+    }
+
+
+/* test that fwd and rev ciphers are truly inverses, and cross-check the OR variants; any mismatch prints a message and calls exit() */
+void InverseChecks(void)
+    {
+    uint  i,j,k,wCnt,tstCnt;
+    int   r,rN;
+    Block pt,ct,xt;                                     /* pt: reference input; ct and xt: working copies */
+    u08b  rots[MAX_ROTS_PER_CYCLE];
+    uint  TEST_CNT = (sizeof(size_t) == 8) ? 64 : 8;    /* more trials where size_t is 8 bytes wide */
+    /* routines under test: selected per block size inside the wCnt loop below */
+    cycle_func *fwd;
+    cycle_func *rev;
+    cycle_func *fwd_or;
+    cycle_func *fwd_or_rN;
+#ifdef TEST_OR
+    cycle_func *rev_or;
+    cycle_func *rev_or_rN;
+#endif
+    
+    Rand_Init(0);
+    for (wCnt=4;wCnt<=MAX_WORDS_PER_BLK;wCnt *= 2)     /* block sizes of 4, 8, ... words (doubling) */
+        {
+        switch (wCnt)
+            {
+            case  4: fwd       = fwd_cycle_4        ; rev       = rev_cycle_4        ;
+                     fwd_or    = fwd_cycle_4_or     ; fwd_or_rN = fwd_cycle_4_or_r8  ; break;
+            case  8: fwd       = fwd_cycle_8        ; rev       = rev_cycle_8        ;
+                     fwd_or    = fwd_cycle_8_or     ; fwd_or_rN = fwd_cycle_8_or_r8  ; break;
+            default: fwd       = fwd_cycle_16       ; rev       = rev_cycle_16       ; 
+                     fwd_or    = fwd_cycle_16_or    ; fwd_or_rN = fwd_cycle_16_or_r9 ; break;
+            }
+#ifdef TEST_OR
+        switch (wCnt)
+            {
+            case  4: rev_or_rN = rev_cycle_4_or_r8  ; rev_or    = rev_cycle_4_or     ; break;
+            case  8: rev_or_rN = rev_cycle_8_or_r8  ; rev_or    = rev_cycle_8_or     ; break;
+            default: rev_or_rN = rev_cycle_16_or_r9 ; rev_or    = rev_cycle_16_or    ; break;
+            }
+#endif
+        for (tstCnt=0;tstCnt<TEST_CNT;tstCnt++)
+            {
+            if (tstCnt == 0)
+                {
+                memset(pt.x,0,sizeof(pt));      /* make the first test simple, for debug */
+                pt.x[0]++;
+                }
+            else
+                RandBytes(pt.x,wCnt*sizeof(pt.x[0]));
+
+            RandBytes(rots,sizeof(rots));       /* use random rotation constants */
+            for (i=0;i<MAX_ROTS_PER_CYCLE;i++)
+                rots[i] &= (BITS_PER_WORD-1);   /* mask each rotation count with BITS_PER_WORD-1 */
+            for (r=1;r<32;r++)                  /* try every round count from 1 to 31 */
+                {
+                ct=pt;                          /* work on a copy: pt stays as the reference */
+                rev(ct.x,rots,r);               /* rev then fwd must give back pt */
+                fwd(ct.x,rots,r);
+                if (memcmp(pt.x,ct.x,wCnt*sizeof(pt.x[0])))
+                    {
+                    printf("Inverse failure: #%03d: wCnt=%d. r=%2d",tstCnt,wCnt,r);
+                    exit(8);
+                    }
+                fwd(ct.x,rots,r);               /* and the other order: fwd then rev */
+                rev(ct.x,rots,r);
+                if (memcmp(pt.x,ct.x,wCnt*sizeof(pt.x[0])))
+                    {
+                    printf("Inverse failure: #%03d: wCnt=%d. r=%2d",tstCnt,wCnt,r);
+                    exit(8);
+                    }
+#ifdef TEST_OR
+                fwd_or(ct.x,rots,r);
+                rev   (ct.x,rots,r);
+                if (memcmp(pt.x,ct.x,wCnt*sizeof(pt.x[0])))
+                    {
+                    printf("Inverse failure (fwd_or): #%03d: wCnt=%d. r=%2d",tstCnt,wCnt,r);
+                    exit(8);
+                    }
+                fwd   (ct.x,rots,r);
+                rev_or(ct.x,rots,r);
+                if (memcmp(pt.x,ct.x,wCnt*sizeof(pt.x[0])))
+                    {
+                    printf("Inverse failure (rev_or): #%03d: wCnt=%d. r=%2d",tstCnt,wCnt,r);
+                    exit(8);
+                    }                
+                if (r != ((wCnt == 16) ? 9 : 8))    /* the _rN routines are only run at their own round count */
+                    continue;
+                fwd_or_rN(ct.x,rots,r);
+                rev      (ct.x,rots,r);
+                if (memcmp(pt.x,ct.x,wCnt*sizeof(pt.x[0])))
+                    {
+                    printf("Inverse failure (fwd_or_rN): #%03d: wCnt=%d. r=%2d",tstCnt,wCnt,r);
+                    exit(8);
+                    }                
+                fwd      (ct.x,rots,r);
+                rev_or_rN(ct.x,rots,r);
+                if (memcmp(pt.x,ct.x,wCnt*sizeof(pt.x[0])))
+                    {
+                    printf("Inverse failure (rev_or_rN): #%03d: wCnt=%d. r=%2d",tstCnt,wCnt,r);
+                    exit(8);
+                    }                
+#else
+                /* validate that "quick" Hamming weight checks are ok, using OR */
+                for (i=0;i<wCnt;i++)
+                    {
+                    memset(ct.x,0,sizeof(ct.x));
+                    ct.x[i]=1;                  /* reference run: only bit 0 of word i is set */
+                    fwd_or(ct.x,rots,r);
+                    for (j=1;j<64;j++)
+                        {
+                        memset(xt.x,0,sizeof(xt.x));
+                        xt.x[i]=((u64b) 1) << j;    /* same word, input bit moved up to position j */
+                        fwd_or(xt.x,rots,r);
+                        for (k=0;k<wCnt;k++)
+                            if (left_rot(ct.x[k],j) != xt.x[k])     /* each output word must be the reference rotated by j */
+                                {
+                                printf("Quick HW check failure: blk=%4d bits. r=%d. j=%d",wCnt*64,r,j);
+                                exit(2);
+                                }
+                        }
+                    }
+#endif
+                }
+            }
+        /* test the "hard coded" versions against variable versions of OR routines */
+        for (tstCnt=0;tstCnt<TEST_CNT;tstCnt++)
+            {
+            RandBytes(rots,sizeof(rots));
+            for (i=0;i<MAX_ROTS_PER_CYCLE;i++)
+                rots[i] &= (BITS_PER_WORD-1);
+            rN = (wCnt == 16) ? 9 : 8;          /* round count built into the _rN routine for this size */
+            for (i=0;i<wCnt*64;i++)             /* walk a single set bit across the whole block */
+                {
+                memset(pt.x,0,sizeof(pt));
+                pt.x[i / 64] = ((u64b) 1) << (i % 64);
+                ct=pt;
+                xt=pt;
+                fwd_or   (ct.x,rots,rN);        /* variable-round routine */
+                fwd_or_rN(xt.x,rots,rN);        /* fixed-round routine: must give the same block */
+                if (memcmp(xt.x,ct.x,wCnt*sizeof(xt.x[0])))
+                    {
+                    printf("OR failure: #%03d: wCnt=%d. i=%2d",tstCnt,wCnt,i);
+                    exit(8);
+                    }
+                }
+            }
+        }
+    }
+
+/* count the bits set in the word, by summing adjacent bit fields of doubling width */
+uint HammingWeight(Word x)
+    {
+#if BITS_PER_WORD == 64
+    x -=      (x >> 1) & DUP_64(0x55555555);                            /* 2-bit field sums  */
+    x  = (x & DUP_64(0x33333333)) + ((x >> 2) & DUP_64(0x33333333));    /* 4-bit field sums  */
+    x  = (x + (x >> 4)) & DUP_64(0x0F0F0F0F);                           /* 8-bit field sums  */
+    x  = (x + (x >> 8)) & DUP_64(0x00FF00FF);                           /* 16-bit field sums */
+    x  = (x + (x >>16)) & DUP_64(0x0000FFFF);                           /* 32-bit field sums */
+    x  = (x + (x >>32)) & DUP_64(0x000000FF);                           /* total lands in the low half */
+#else
+    x -=      (x >> 1) & 0x55555555;
+    x  = (x & 0x33333333) + ((x >> 2) & 0x33333333);
+    x  = (x + (x >> 4)) & 0x0F0F0F0F;
+    x  = (x + (x >> 8)) & 0x00FF00FF;
+    x  = (x + (x >>16)) & 0x0000FFFF;
+#endif
+    return (uint) x;    /* 64-bit case: upper half still holds a partial count; presumably uint is 32 bits so the cast drops it -- confirm */
+    }
+
+
+/* compute a CRC over r->rotList and store it in r->CRC: a quick ID to help identify/verify rotation sets */
+void Set_CRC(rSearchRec *r)
+    {
+#define CRC_FDBK ((0x04C11DB7u >> 1) ^ 0x80000000u) /* CRC-32-IEEE-802.3 (from Wikipedia) */
+    uint n,bit;
+    uint crc = ~0u;                     /* register starts as all ones */
+
+    for (n=0;n<rotsPerCycle;n++)
+        {
+        crc ^= r->rotList[n];           /* mix the next rotation constant into the low bits */
+        for (bit=0;bit<8;bit++)
+            {                           /* shift right eight times, applying feedback when a 1 drops out */
+            if (crc & 1)
+                crc = (crc >> 1) ^ CRC_FDBK;
+            else
+                crc >>= 1;
+            }
+        }
+
+    r->CRC = crc;                       /* stored as-is, with no final inversion */
+    }
+
+/* qsort routine for search records: keep in descending order */
+/*   primary key:   rWorst, larger values first                */
+/*   secondary key: ID, smaller values first (breaks ties)     */
+int Compare_SearchRec_Descending(const void *aPtr,const void *bPtr)
+    {
+    const rSearchRec *recA = (const rSearchRec *) aPtr;
+    const rSearchRec *recB = (const rSearchRec *) bPtr;
+    uint keyA,keyB;
+
+    keyA = recA->rWorst;
+    keyB = recB->rWorst;
+    if (keyA != keyB)
+        return (keyA < keyB) ? +1 : -1;
+
+    /* equal metric. Sort by ID number */
+    keyA = recA->ID;
+    keyB = recB->ID;
+    if (keyA != keyB)
+        return (keyA < keyB) ? -1 : +1;
+    return 0;
+    }
+
+const char *ASCII_TimeDate(void)
+    {   /* current time as ctime() text; the returned string is whatever buffer ctime() hands back */
+    time_t now = time(NULL);
+
+    return ctime(&now);
+    }
+
+/* Minimum Hamming weight of any output word, over the versions enabled in verMask.  */
+/*   Each trial sets one word of a zero block to 1 and runs an OR-based cycle on it. */
+/*   Returns 0 at once if any word's weight is below minHW, else the minimum seen.   */
+int Cycle_Min_HW(uint rounds, const u08b *rotList,uint minHW,uint verMask)
+    {
+    uint    ver,w,k,wt;
+    uint    lowest = BITS_PER_WORD;
+    u08b    rots[MAX_ROTS_PER_CYCLE];
+    Block   b;
+
+    for (ver=0;ver<MAX_ROT_VER_CNT;ver++)
+        {
+        if (!(verMask & (1 << ver)))
+            continue;                       /* this version is not enabled */
+
+        if ((ver & 1) == 0)
+            memcpy(rots,rotList,rotsPerCycle*sizeof(rots[0]));
+        else
+            { /* odd versions: do it on the "half-cycle" */
+            for (k=0;k<rotsPerCycle;k++)
+                {
+                rots[k] = rotList[(k < rotsPerCycle/2) ? k + rotsPerCycle/2 : k - rotsPerCycle/2];
+                }
+            }
+
+        for (w=0;w<wordsPerBlock;w++)
+            {
+            memset(b.x,0,wordsPerBlock*sizeof(b.x[0]));
+            b.x[w] = 1;                     /* test propagation into one word */
+
+            /* bit 1 of ver picks the direction; a nonzero minHW picks the "_rN" versions (speed), */
+            /* while minHW == 0 picks the variable-round versions (saturation check)              */
+            switch (((ver & 2) ? 1 : 0) | (minHW ? 2 : 0))
+                {
+                case 0:  fwd_cycle_or   (b.x,rots,(int)rounds); break;
+                case 1:  rev_cycle_or   (b.x,rots,(int)rounds); break;
+                case 2:  fwd_cycle_or_rN(b.x,rots,(int)rounds); break;
+                default: rev_cycle_or_rN(b.x,rots,(int)rounds); break;
+                }
+
+            for (k=0;k<wordsPerBlock;k++)
+                {
+                wt = HammingWeight(b.x[k]);
+                if (wt < minHW)
+                    return 0;               /* stop if this isn't good enough */
+                if (wt < lowest)            /* else keep track of min */
+                    lowest = wt;
+                }
+            }
+        }
+
+    return lowest;
+    }
+
+/* Refresh r->CRC, then record in r->hw_OR[v] the minimum Hamming weight found for */
+/*   each version v enabled in verMask (the others are left at BITS_PER_WORD).     */
+/*   Returns the minimum over all enabled versions; there is no early exit here.   */
+uint Set_Min_hw_OR(rSearchRec *r,uint verMask,uint rounds)
+    {
+    uint  ver,bitPos,k,wt;
+    uint  overall = BITS_PER_WORD;
+    u08b  rots[MAX_ROTS_PER_CYCLE];
+    Block b;
+
+    Set_CRC(r);
+    for (ver=0;ver<MAX_ROT_VER_CNT;ver++)
+        {
+        r->hw_OR[ver] = BITS_PER_WORD;      /* default, also kept for versions that are skipped */
+        if (!(verMask & (1 << ver)))
+            continue;
+
+        if ((ver & 1) == 0)
+            memcpy(rots,r->rotList,rotsPerCycle*sizeof(rots[0]));
+        else
+            { /* odd versions: do it on the "half-cycle" */
+            for (k=0;k<rotsPerCycle;k++)
+                rots[k] = r->rotList[(k < rotsPerCycle/2) ? k + rotsPerCycle/2 : k - rotsPerCycle/2];
+            }
+
+        for (bitPos=0;bitPos<bitsPerBlock;bitPos+=BITS_PER_WORD)
+            {   /* bitPos advances one whole word per trial */
+            memset(b.x,0,sizeof(b.x));
+            b.x[bitPos/BITS_PER_WORD] |= (((u64b) 1) << (bitPos%BITS_PER_WORD));
+            if ((ver & 2) == 0)
+                fwd_cycle_or(b.x,rots,(int) rounds);
+            else
+                rev_cycle_or(b.x,rots,(int) rounds);
+            for (k=0;k<wordsPerBlock;k++)
+                {
+                wt = HammingWeight(b.x[k]);
+                overall = (wt < overall) ? wt : overall;
+                if ((u08b) wt < r->hw_OR[ver])
+                    r->hw_OR[ver] = (u08b) wt;
+                }
+            }
+        }
+    return overall;
+    }
+
+/* print the minimum Hamming weight for each round count 4..11: overall, then per "version" */
+void Show_HW_rounds(const u08b *rotates)
+    {
+    uint ver,nRounds,lowest,perVer[4];
+
+    for (nRounds=4;nRounds<=11;nRounds++)
+        {
+        lowest = bitsPerBlock;
+        for (ver=0;ver<4;ver++)
+            {   /* one version bit at a time; a minHW of 0 means Cycle_Min_HW takes no early exit */
+            perVer[ver] = Cycle_Min_HW(nRounds,rotates,0,1 << ver);
+            lowest = (perVer[ver] < lowest) ? perVer[ver] : lowest;
+            }
+
+        printf("%2d rounds: minHW = %2d  [",nRounds,lowest);
+        for (ver=0;ver<4;ver++)   /* show the different "versions" */
+            printf(" %2d",perVer[ver]);
+        printf(" ]\n");
+        }
+    }
+
+/* read rotations value from file
+ *
+ * rfName : name of the search-results file.  A leading '+' is stripped from
+ *          the name and turns on display of every set that is read (the
+ *          values plus the Show_HW_rounds profile).
+ *
+ * The file is opened on the first call and kept open in a static FILE
+ * pointer between calls.  When it is opened, lines are skipped up to the
+ * "+++ ... Preliminary results:" header whose trailing "= <n> bits" matches
+ * bitsPerBlock.  Each call then consumes one "rMin =" line followed by
+ * rotsPerCycle rotation values.
+ *
+ * Returns a pointer to a static array holding the rotation values, or NULL
+ * when no further valid set can be read (the file is closed in that case).
+ * Calls exit(2) if the file cannot be opened.
+ */
+const u08b *get_rotation_file(const char *rfName)
+    {
+    enum   { MAX_LINE = 512 };
+    char   line[MAX_LINE+4];
+    uint   i,rotVal;
+    uint   rotShow=0;
+    static FILE *rf=NULL;
+    static u08b rotates[MAX_ROTS_PER_CYCLE];
+    static uint rotCnt =0;
+    static uint fileDone=0;     /* set when the previous set was followed directly by EOF */
+/**** sample format: 
++++++++++++++ Preliminary results: sampleCnt =  1024, block =  256 bits
+rMin = 0.425. #079C[*21] [CRC=D89E7C72. hw_OR=62. cnt= 1024. blkSize= 256]          
+   46   52
+   21   38
+   13   13
+   20   27
+   14   40
+   43   26
+   35   29
+   19   63
+rMin = 0.425. #0646[*17] [CRC=527174F3. hw_OR=61. cnt= 1024. blkSize= 256]          
+   26   24
+   50   48
+   40   25
+   36   55
+   10   20
+   10   16
+   60   55
+   18    7
+...
+****/
+    if (rfName[0] == '+')
+        {
+        rfName++;
+        rotShow = 1;
+        }
+    if (fileDone)
+        {   /* The previous call returned the final set and hit EOF while skipping
+             * the end of line.  Report exhaustion now; without this the file
+             * would be reopened below and the same sets handed out again. */
+        fileDone = 0;
+        return NULL;
+        }
+    if (rf == NULL)
+        {
+        rf = fopen(rfName,"rt");
+        if (rf == NULL)
+            {
+            printf("Unable to open rotation file '%s'",rfName);
+            exit(2);
+            }
+        rotCnt=0;
+        for (;;)        /* skip to "preliminary results" section */
+            {
+            line[0]=0;
+            if (fgets(line,sizeof(line)-4,rf) == NULL || line[0] == 0)
+                {
+                fclose(rf);                 /* eof --> stop */
+                rf = NULL;
+                return NULL;
+                }
+            /* check for the header */
+            if (line[0] != '+' || line[1] != '+' || line[2] != '+' ||
+                strstr(line,"reliminary results:") == NULL)
+                continue;
+            /* now check for the correct block size */
+            for (i=strlen(line);i;i--)      /* start at eol and look backwards */
+                if (line[i-1] == '=')       /* check for '=' sign for block size */
+                    break;
+            /* i is first the offset just past the last '=', then receives the parsed size */
+            if (i > 0 && sscanf(line+i,"%u bits",&i) == 1 && i == bitsPerBlock)
+                break;
+            }
+        }
+    /* now at the rMin line */
+    line[0]=0;
+    if (fgets(line,sizeof(line)-4,rf) == NULL || line[0] == 0 || strncmp(line,"rMin =",6))  
+        {
+        fclose(rf);
+        rf = NULL;
+        return NULL;
+        }
+
+    /* now read in all the rotation values */
+    for (i=0;i<rotsPerCycle;i++)
+        {
+        /* A rotation count applies to one BITS_PER_WORD-bit word and is stored
+         * in a u08b, so anything >= BITS_PER_WORD is rejected (checking against
+         * bitsPerBlock would let out-of-range values through, truncated). */
+        if (fscanf(rf,"%u",&rotVal) != 1 || rotVal >= BITS_PER_WORD)
+            {   /* Invalid rotation value */
+            fclose(rf);
+            rf = NULL;
+            return NULL;
+            }
+        rotates[i] = (u08b) rotVal;
+        }
+    if (fgets(line,sizeof(line)-4,rf) == NULL)          /* skip eol */
+        {   /* EOF right after the last value: this set is still valid, but
+             * remember that the file is finished for the next call */
+        fclose(rf);
+        rf = NULL;
+        fileDone = 1;
+        }
+    if (rotShow)
+        {   /* show the hamming weight profile */
+        printf("\n:::::::::::\n");
+        printf("Rot #%02d [%4d-bit blocks] read from file '%s':\n",rotCnt,bitsPerBlock,rfName);
+        for (i=0;i<rotsPerCycle;i++)
+            printf("%4d%s",rotates[i],((i+1)%(wordsPerBlock/2))?"":"\n");
+        Show_HW_rounds(rotates);     /* show HW results for different numbers of rounds */
+        printf(":::::::::::\n");
+        }
+    rotCnt++;
+    return rotates;
+    }
+
+/* generate a randomly chosen set of rotation constants of given minimum hamming weight (using OR)
+ *
+ * If the global rotFileName is set, the next set is taken from that file via
+ * get_rotation_file() instead: on success the values are copied to
+ * r->rotList and 1 is returned; once the file yields no more sets,
+ * rotFileName is cleared and 0 is returned.
+ *
+ * Otherwise the function loops until it has a set to hand out and returns 1.
+ * Search state (rotates[], rScale, hwBase, rID, ...) is kept in statics, so
+ * successive calls continue from where the previous call stopped: first the
+ * odd-scaled variants of an accepted random set, then a hill climb around it,
+ * then a fresh random set.
+ *
+ * On every return of 1, r->rotList is filled in, Set_Min_hw_OR() has been
+ * called on r, and r->ID holds a fresh ID.  r->rWorst is reset to 0 and
+ * r->parentCRC to ~0u on entry.
+ */
+/* (this may take a while, depending on minHW,rounds) */
+uint get_rotation(rSearchRec *r,testParms t)
+    {
+    static  u64b rCnt    = 1;                   /* bumped once per pass of the search loop below; not read in this function */
+    static  u64b rCntOK  = 0;                   /* bumped when a random set gets a non-zero first Cycle_Min_HW result; not read in this function */
+    static  uint rScale  = BITS_PER_WORD;       /* next scale factor to try; >= BITS_PER_WORD makes the scaling loop a no-op */
+    static  uint hwBase  = 0;                   /* hw of the current base set in rotates[]; 0 means "pick a new random set" */
+    static  uint rID     = 1;                   /* next ID to assign to a returned record */
+    uint    i,j,k,m,n,b,hw,q,qMask;
+    static  u08b rotates[MAX_ROTS_PER_CYCLE];   /*  last generated rotation set */
+    u08b    goodRots[BITS_PER_WORD];            /* the rotation values for which RotCnt_Bad() is false */
+    uint    goodRotCnt;
+    
+    r->rWorst       =  0;
+    r->parentCRC    = ~0u;
+
+    if (rotFileName)                            /* get from search results file? */
+        {
+        const u08b *rf = get_rotation_file(rotFileName);
+        if (rf)
+            {
+            for (i=0;i<rotsPerCycle;i++)
+                r->rotList[i] = rf[i];
+            Set_Min_hw_OR(r,t.rotVerMask,t.rounds);
+            r->ID = rID++;
+            return 1;
+            }
+        /* here with file exhausted. Keep going with randomized values */
+        rotFileName = NULL;                     /* don't use file any more */
+        return 0;                               /* this call reports "no set"; only later calls reach the random search below */
+        }
+    for (i=goodRotCnt=0;i<BITS_PER_WORD;i++)    /* build the table of acceptable rotation values */
+        if (!RotCnt_Bad(i))
+            {
+            goodRots[goodRotCnt++] = (u08b) i;
+            }
+    
+    qMask   = ((wordsPerBlock/2)-1) & t.dupRotMask;     /* filter for dup rotate counts in the same round? */
+    for (;;rCnt++)                              /* every exit from this loop is a "return 1" */
+        {
+        if (hwBase == 0)
+            {   /* pick a rotation set at random */
+            for (i=0;i<rotsPerCycle;)           /* i only advances when the drawn value is accepted */
+                {
+                rotates[i] = goodRots[Rand32() % goodRotCnt];
+                /* filter out unapproved rotation sets here */
+                for (q=i & ~qMask;q < i;q++)    /* check for dups in the same round */
+                    if (rotates[i] == rotates[q])
+                        break;
+                if (q >= i)                     /* no dup, value ok, so this value is ok */
+                    i++;
+                }
+            hw = Cycle_Min_HW(t.rounds,rotates,t.minHW_or-t.minOffs,t.rotVerMask);  /* NOTE(review): third argument looks like a cutoff below which 0 is returned -- confirm in Cycle_Min_HW */
+            if (hw == 0)                /* did we get close? */
+                continue;
+            rCntOK++;
+
+            hwBase = hw;                /* non-zero: the hill climb below now works on this set */
+            if (hw >= t.minHW_or)
+                if (Cycle_Min_HW(t.maxSatRnds, rotates,0,t.rotVerMask) == BITS_PER_WORD)
+                    {
+                    for (i=0;i<rotsPerCycle;i++)
+                        r->rotList[i] = rotates[i];
+                    rScale = 1;         /* set up for scaling below */
+                    }
+            }
+        /* use odd scaling for randomly generated rotations */
+        for (;rScale < BITS_PER_WORD;)
+            {
+            for (i=0;i<rotsPerCycle;i++)
+                {
+                r->rotList[i] = (rotates[i] * rScale) % BITS_PER_WORD;  /* scaled copy; rotates[] itself is left unchanged */
+                if (RotCnt_Bad(r->rotList[i]))
+                    break;              /* leaves i < rotsPerCycle, so this scale factor is rejected below */
+                }
+            rScale+=2;                  /* bump scale factor for next time */
+            if (i >= rotsPerCycle)
+                {   /* all values ok: this one's a keeper */
+                Set_Min_hw_OR(r,t.rotVerMask,t.rounds);
+                r->ID = rID++;
+                return 1;
+                }
+            }
+        /* Try nearby values to see if hw gets better: monotonic hill climb. */
+        /*      -- exhaustively try all possible values of pairs of changes  */
+        for (m=0;m<rotsPerCycle;m++)
+        for (b=0;b<BITS_PER_WORD ;b++)
+            {
+            k = rotsPerCycle-1-m;           /* work backwards, since we're already close */
+            rotates[k]++;
+            rotates[k] &= (BITS_PER_WORD-1);    /* wrap around within 0..BITS_PER_WORD-1 */
+            if (RotCnt_Bad(rotates[k]))
+                continue;
+            for (q=k | qMask;q > k;q--)    /* check for dups in the same round */
+                if (rotates[k] == rotates[q])
+                    break;
+            if (q > k)                      /* loop above stopped early: duplicate found */
+                continue;
+            for (i=m+1;i<rotsPerCycle;i++)
+                {
+                n = rotsPerCycle-1-i;   /* work backwards */
+                for (j=0;j<BITS_PER_WORD;j++)
+                    {
+                    rotates[n]++;       /* try another rotation value */
+                    rotates[n] &= (BITS_PER_WORD-1);
+                    if (RotCnt_Bad(rotates[n]))
+                        continue;
+                    for (q=n | qMask;q > n;q--)    /* check for dups in the same round */
+                        if (rotates[n] == rotates[q])
+                            break;
+                    if (q > n)      
+                        continue;  
+                    k  = (t.minHW_or > hwBase) ? t.minHW_or : hwBase;   /* k reused as the value passed to Cycle_Min_HW; it is reset as an index at the top of the next b iteration */
+                    hw = Cycle_Min_HW(t.rounds,rotates,k,t.rotVerMask);
+                    if (hw > hwBase)
+                        if (Cycle_Min_HW(t.maxSatRnds, rotates,0,t.rotVerMask) == BITS_PER_WORD)
+                            {   /* must improve hw to accept this new rotation set */
+                            assert(hw >= t.minHW_or);
+                            hwBase = hw;
+                            rScale = 3; /* set up for scaling next time */
+                            for (i=0;i<rotsPerCycle;i++)    /* reusing i is safe: the function returns right after */
+                                r->rotList[i] = rotates[i];
+                            Set_Min_hw_OR(r,t.rotVerMask,t.rounds);
+                            r->ID = rID++;
+                            return 1;
+                            }
+                    }
+                }
+            }
+        hwBase = 0;                     /* back to random  */
+        }
+    }
+
+/* Write one search record to f.
+ *
+ * A summary line is always written: rWorst scaled by t.sampleCnt, the mark
+ * character, CRC / parent CRC / ID, the smallest hw_OR entry, the sample
+ * count, the block size and a tag naming the show mode (the tag for
+ * SHOW_ROTS_FINAL includes showNum).  After that:
+ *   SHOW_NONE   - nothing more is written
+ *   SHOW_ROTS_H - the rotation constants as "R..._j_i=nn," entries
+ *   otherwise   - the rotation constants as a plain table, wordsPerBlock/2 per line
+ */
+void ShowSearchRec(FILE *f,const rSearchRec *r,testParms t,uint showMode,char markCh,uint showNum)
+    {
+    uint  row,col,idx,v,lowHW,halfWords,rowCnt;
+    const char *tag;
+    const char *absStr;
+    const char *recalcStr;
+    char  numTag[200];
+
+    /* smallest hw_OR entry over all rotation versions (at most BITS_PER_WORD) */
+    lowHW = BITS_PER_WORD;
+    for (v=0;v<MAX_ROT_VER_CNT;v++)
+        {
+        uint cur = (uint) r->hw_OR[v];
+        if (cur < lowHW)
+            lowHW = cur;
+        }
+
+    /* tag shown after the bracketed summary */
+    if (showMode == SHOW_ROTS_FINAL)
+        {
+        sprintf(numTag,".final:%02d " ,showNum);
+        tag = numTag;
+        }
+    else if (showMode == SHOW_ROTS_H)
+        tag = ".format";
+    else if (showMode == SHOW_ROTS_PRELIM)
+        tag = ".prelim";
+    else
+        tag = "";
+
+    absStr    = (t.tstFlags & TST_FLG_USE_ABS) ? " useAbs" : "";
+    recalcStr = (r->ID & ID_RECALC_BIT)        ? " recalc" : "";
+
+    fprintf(f,"rMin = %5.3f.%c [CRC=%08X. parent=%08X. ID=%08X. hw_OR=%2d. cnt=%5d. bits=%4u]%-10s%s%s\n",
+            r->rWorst/(double)t.sampleCnt,markCh,r->CRC,r->parentCRC,r->ID,
+            lowHW,t.sampleCnt,bitsPerBlock,tag,
+            absStr,recalcStr
+           );
+
+    if (showMode == SHOW_NONE)
+        return;                             /* summary line only */
+
+    halfWords = wordsPerBlock/2;            /* rotation constants per round */
+    if (showMode == SHOW_ROTS_H)
+        {   /* format for "skein.h" */
+        rowCnt = rotsPerCycle/halfWords;
+        idx    = 0;
+        for (row=0;row<rowCnt;row++)
+            {
+            fprintf(f,"   ");
+            for (col=0;col<halfWords;col++)
+                {
+                fprintf(f,(wordsPerBlock == 16)?" R%04d":" R_%03d",wordsPerBlock*64);
+                fprintf(f,"_%d_%d=%2d,",row,col,r->rotList[idx]);
+                idx++;
+                }
+            fprintf(f,"\n");
+            }
+        }
+    else
+        {   /* plain table: newline after every halfWords values */
+        for (idx=0;idx<rotsPerCycle;idx++)
+            fprintf(f,"   %2d%s",r->rotList[idx],((idx+1)%halfWords)?"":"\n");
+        }
+    }
+
+/* compute Skein differentials for a given rotation set
+ *
+ * For every rotation-schedule version v enabled in t.rotVerMask (odd v use
+ * the rotation list swapped by half a cycle, v with bit 1 set use rev_cycle
+ * instead of fwd_cycle) and every odd difference pattern d below
+ * 2**(t.diffBits mod BITS_PER_WORD):
+ *   - t.sampleCnt random blocks are run through t.rounds rounds;
+ *   - for each input bit position i the pattern d, rotated to position i
+ *     within its word, is xor-ed into the block and the block is run again;
+ *   - for each (i, output bit) pair the number of samples in which the
+ *     output bit differed is counted.
+ * The smallest such count (with TST_FLG_USE_ABS: the smaller of that and
+ * sampleCnt minus the largest count) is folded into r->rWorst.
+ *
+ * Flags in t.tstFlags used here:
+ *   TST_FLG_DO_RAND    - replace the second output by random bytes; only one
+ *                        d and one version are processed
+ *   TST_FLG_QUICK_EXIT - give up after 32 samples of d == 1 if some group of
+ *                        eight packed counters are all still below 2
+ *   TST_FLG_SHOW       - print per-(v,d) statistics to stdout
+ *   TST_FLG_SHOW_HIST  - additionally print the histogram (with SHOW)
+ *
+ * Returns r->rWorst: t.sampleCnt if no version was processed, 0 on a quick
+ * or early exit, otherwise the worst (smallest) count found.
+ */
+uint CheckDifferentials(rSearchRec *r,testParms t)
+    {
+    enum  { HIST_BINS =  20 };
+
+    uint    i,j,k,v,n,d,dMax,minCnt,maxCnt,vCnt,q;
+    uint    rMin,rMax,hwMin,hwMax,hw,rMinCnt,rMaxCnt,iMin,jMin,iMax,jMax;
+    uint    hist[HIST_BINS+1];
+    u08b    rots[MAX_ROTS_PER_CYCLE];
+    u64b    totSum,w,y,z,oMask;
+    double  fSum,fSqr,x,var,denom;
+    static  u64b onesCnt[3][MAX_BITS_PER_BLK][MAX_BITS_PER_BLK/8]; /* pack eight 8-bit counts into each u64b (for speed) */
+    u64b   *oPtr;
+    struct
+        {
+        Block pt,ct;
+        } a,b;
+
+    r->rWorst = t.sampleCnt;
+    dMax = 1u << (t.diffBits & (BITS_PER_WORD-1));
+    iMin = jMin = iMax = jMax = bitsPerBlock + 1;
+
+    for (v=vCnt=0;v < MAX_ROT_VER_CNT; v++)  
+        { /* different versions of rotation schedule, including "inverse" cipher */
+        if ((t.rotVerMask & (1 << v)) == 0)
+            continue;
+        vCnt++;     /* number of versions processed (including this one) */
+        if (v & 1)
+            { /* do it on the "half-cycle" */
+            for (i=0;i<rotsPerCycle;i++)
+                {
+                rots[i] = r->rotList[(i >= rotsPerCycle/2) ? i - rotsPerCycle/2 : i + rotsPerCycle/2];
+                }
+            }
+        else
+            memcpy(rots,r->rotList,rotsPerCycle*sizeof(rots[0]));
+        for (d=1; d < dMax; d+=2)    /* multi-bit difference patterns (must start with a '1' bit)  */
+            {
+            hwMax=0;
+            hwMin=bitsPerBlock+1;
+            memset(onesCnt,0,sizeof(onesCnt));      /* clear stats before starting */
+                
+            oMask = DUP_64(0x01010101);             /* mask for adding, 8 bins at a time */
+            for (n=1;n<=t.sampleCnt;n++)
+                {
+                for (i=0;i<wordsPerBlock;i++)       /* generate input blocks in a portable way */
+                    a.pt.x[i] = Rand64();
+                a.ct = a.pt;
+                if (v & 2)
+                    rev_cycle(a.ct.x,rots,t.rounds);
+                else
+                    fwd_cycle(a.ct.x,rots,t.rounds);
+                for (i=0;i<bitsPerBlock;i++)
+                    {
+                    b.pt = a.pt;
+                    b.pt.x[i/BITS_PER_WORD] ^= left_rot((u64b)d,(i%BITS_PER_WORD));  /* inject input difference  */
+                    b.ct = b.pt;
+                    if (t.tstFlags & TST_FLG_DO_RAND)
+                        RandBytes(b.ct.x,sizeof(b.ct.x));       /* random results as a comparison point */
+                    else if (v & 2)
+                        rev_cycle(b.ct.x,rots,t.rounds);        /* let Skein do the mixing */
+                    else
+                        fwd_cycle(b.ct.x,rots,t.rounds);        /* let Skein do the mixing */
+                    z  = 0;                                     /* accumulate total hamming weight in z */
+                    oPtr = onesCnt[0][i];
+                    for (j=0;j<wordsPerBlock;j++)
+                        {                                       /* inner-most loop: unroll it fully */
+                        w = b.ct.x[j] ^ a.ct.x[j];              /* xor difference in each ciphertext word */
+                        y = (w     ) & oMask; oPtr[0] += y; z += y;   /* sum 8 bins at a time (bits 0,8,16,24...,56) */
+                        y = (w >> 1) & oMask; oPtr[1] += y; z += y;
+                        y = (w >> 2) & oMask; oPtr[2] += y; z += y;   /* do it 8 times to cover all bits in w */
+                        y = (w >> 3) & oMask; oPtr[3] += y; z += y;
+
+                        y = (w >> 4) & oMask; oPtr[4] += y; z += y;
+                        y = (w >> 5) & oMask; oPtr[5] += y; z += y;
+                        y = (w >> 6) & oMask; oPtr[6] += y; z += y;
+                        y = (w >> 7) & oMask; oPtr[7] += y; z += y;
+                        oPtr += 8;
+                        }
+                    /* sum up the total hamming weight bins (very carefully) */
+                    z = (z & DUP_64(0x00FF00FF)) + ((z >> 8) & DUP_64(0x00FF00FF));
+                    hw  = (uint) (z + (z >> 16) + (z >> 32) + (z >> 48)) & 0xFFFF;
+                    if (hwMin > hw) hwMin = hw;                 /* update total hw min/max stats */
+                    if (hwMax < hw) hwMax = hw;
+                    }
+                if ((n & 0x7F) == 0)
+                    {   /* prevent onesCnt[0] overflow by "transferring" MSBs of 8-bit bytes into onesCnt[1] */
+                    for (i=0;i<bitsPerBlock  ;i++)
+                    for (j=0;j<bitsPerBlock/8;j++)
+                        {   /* add the MSB (bit 7) of each byte into onesCnt[1], then mask it off in onesCnt[0] */
+                        onesCnt[1][i][j] += (onesCnt[0][i][j] >> 7) & oMask;
+                        onesCnt[0][i][j] &= ~(oMask << 7);
+                        }
+                    if ((n & 0x3FFF) == 0)
+                        {   /* propagate overflow into onesCnt[2] (occasionally, as needed) */
+                        for (i=0;i<bitsPerBlock  ;i++)
+                        for (j=0;j<bitsPerBlock/8;j++)
+                            { 
+                            onesCnt[2][i][j] += (onesCnt[1][i][j] >> 7) & oMask;
+                            onesCnt[1][i][j] &= ~(oMask << 7);
+                            }
+                        }
+                    }
+                if (n == 32 && d == 1 && (t.tstFlags & TST_FLG_QUICK_EXIT))
+                    {   /* quick exit if not even close to random looking after a few samples */
+                    for (i=0;i<bitsPerBlock  ;i++)
+                    for (j=0;j<bitsPerBlock/8;j++)
+                        {
+                        if ((onesCnt[0][i][j] & ~oMask) == 0)  /* all eight packed counts in this word less than 2? */
+                            {
+                            /** Since an ideal random function has prob=0.5 each for input/output bit 
+                             ** pair, the expected distribution of onesCnt[i][j] is binomial. 
+                             ** Thus, at this point, the probability of onesCnt[i][j] < 2 is:
+                             **     ((1+32)/2)/(2**-32)
+                             ** This probability is roughly 2**(-27), so when we observe such an
+                             ** occurrence, we exit immediately to save taking a lot of stats just
+                             ** to fail later. This filter significantly speeds up the search, at a
+                             ** very low probability of improperly dismissing a "good" rotation set.
+                             **/
+                            if (t.tstFlags & TST_FLG_SHOW && vCnt > 1)
+                                {   /* show why we stopped, if we already showed something */
+                                printf("%23s/* quick exit: %d/%d */\n","",(uint)onesCnt[0][i][j],n);
+                                }
+                            return r->rWorst = 0;   /* not a good result */
+                            }
+                        }
+                    }
+                }
+            /* now process the stats from the samples we just generated */
+            assert(t.sampleCnt < (1 << 22));            /* 2**22 is big enough not to worry! */
+            memset(hist,0,sizeof(hist));
+            fSum  = fSqr = 0.0;
+            denom = 1.0 / (double) t.sampleCnt;
+            rMin  = minCnt = ~0u;
+            totSum= rMax = rMinCnt = rMaxCnt = maxCnt = 0;
+            for (i=0;i<bitsPerBlock;i++)
+                {
+                for (j=0;j<bitsPerBlock/8;j++)
+                    {
+                    w = onesCnt[0][i][j];               /* 7+ bits here */
+                    y = onesCnt[1][i][j];               /* 7+ bits here */
+                    z = onesCnt[2][i][j];               /* 8  bits here.  Total = 22 bits */
+                    for (k=0;k<8;k++,w >>= 8,y >>= 8,z >>= 8)
+                        {   /* reassemble the full count for one (input bit, output bit) pair */
+                        q = (uint) ((w & 0xFF) + ((y & 0xFF) << 7) + ((z & 0xFF) << 14));
+                        if (maxCnt < q) { maxCnt = q; iMax = i; jMax = j; if (rMax < q) { rMax = q; rMaxCnt = 0; } }
+                        if (minCnt > q) { minCnt = q; iMin = i; jMin = j; if (rMin > q) { rMin = q; rMinCnt = 0; } }
+                        if (rMin == minCnt) rMinCnt++;
+                        if (rMax == maxCnt) rMaxCnt++;
+                        if (t.tstFlags & TST_FLG_SHOW)
+                            {   /* compute more extensive stats only if showing results below */
+                            totSum  += q;
+                            x        = q*denom;                 /* update stats for stdDev  */
+                            fSum    += x;
+                            fSqr    += x*x;
+                            hist[(uint)floor(x*HIST_BINS)]++;   /* track histogram  */
+                            }
+                        }
+                    }
+                }
+            if (t.tstFlags & TST_FLG_USE_ABS && rMin > t.sampleCnt - rMax)
+                {
+                rMin = t.sampleCnt - rMax;                      /* use max variation from 1/2 */
+                iMin = iMax;
+                jMin = jMax;
+                }
+            if (r->rWorst > rMin)
+                {
+                r->rWorst = rMin;
+                if (rMin == 0)
+                    {  /* if far worse than current best, stop now (to speed up the search) */
+                    if (t.tstFlags & TST_FLG_SHOW && (d > 1 || vCnt > 1)) /* show why we stopped, if we already showed something */
+                        printf("%23s/* early exit */\n","");
+                    return r->rWorst = 0;
+                    }
+                }
+            if (t.tstFlags & TST_FLG_SHOW)
+                {         /* show some detailed results of the test */
+                if (d == 1)
+                    {     /* put out the rotation info the first time thru */
+                    if ((t.tstFlags & TST_FLG_DO_RAND) == 0)
+                        {
+                        printf("Rotation set [CRC=%08X. hw_OR=%2d. sampleCnt=%5d. block=%4d bits. v=%d]:\n",
+                               r->CRC,r->hw_OR[v],t.sampleCnt,bitsPerBlock,v);
+                        /* vCnt was already bumped for this version above, so the first
+                         * processed version is vCnt == 1 (a test against 0 never fires) */
+                        if (vCnt == 1)
+                            for (i=0;i<rotsPerCycle;i++)
+                                printf("   %2d%s",r->rotList[i],((i+1)%(wordsPerBlock/2))?"":"\n");
+                        }
+                    }
+                printf("rnds=%2d,cnt=%5d",t.rounds,t.sampleCnt);
+                x  =  fSum/(bitsPerBlock*bitsPerBlock);
+                var= (fSqr/(bitsPerBlock*bitsPerBlock)) - x*x;
+                printf(" min=%5.3f.[%c] max=%5.3f.[%c]  hw=%3d..%3d.  avg=%7.5f. std=%6.4f. d=%X. [%3d,%3d]",
+                       rMin*denom,(rMinCnt > 9) ? '+' : '0'+rMinCnt,
+                       rMax*denom,(rMaxCnt > 9) ? '+' : '0'+rMaxCnt,
+                       hwMin,hwMax,
+                       (totSum*denom)/(bitsPerBlock*bitsPerBlock),sqrt(var),(uint)d,iMin,jMin);
+                if (t.tstFlags & TST_FLG_SHOW_HIST)
+                    { /* very wide histogram display */
+                    for (i=0;i<=HIST_BINS;i++)
+                        if (hist[i])
+                            printf(" %7.5f",hist[i]/(double)(bitsPerBlock*bitsPerBlock));
+                        else
+                            printf("  _     ");
+                    }
+                if (t.tstFlags & TST_FLG_DO_RAND)
+                    printf(" [RANDOM] ");
+                printf("\n");
+                fflush(stdout);
+                }
+            if (t.tstFlags & TST_FLG_DO_RAND)
+                break;        /* no need to do more than one random setting per rotation set */
+            }   /* for (d=1;d<dMax;d+=2) */
+        if (t.tstFlags & TST_FLG_DO_RAND)
+            break;        /* no need to do more than one random setting per rotation set */
+        }
+    return r->rWorst;
+    }
+
+/* Randomly perturb a search record while keeping maxSatRnds satisfied.
+ *
+ * Between 1 and MAX_ROT_CNT distinct entries of r->rotList are replaced by
+ * other acceptable rotation values.  A candidate is kept when
+ * Cycle_Min_HW over t.maxSatRnds rounds returns BITS_PER_WORD and, during the
+ * first two of the four passes with TST_FLG_KEEP_MIN_HW set, the hw over
+ * t.rounds is still at least t.minHW_or.  Otherwise the old values are put
+ * back and another candidate is tried (MAX_TWIDDLE_CNT attempts in total).
+ * If nothing is accepted, a brand new set is fetched with get_rotation().
+ *
+ * On entry the twiddle count in r->ID is bumped, the recalc bit is cleared
+ * and r->parentCRC is set to the current r->CRC.
+ */
+void Twiddle(rSearchRec *r,testParms t)
+    {
+    enum { MAX_TWIDDLE_CNT = 100, MAX_ROT_CNT = 6 };
+    uint pass,s,attempt,changeCnt,idx[MAX_ROT_CNT];
+    u08b saved[MAX_ROT_CNT];
+    u64b usedBitmap;
+    u08b goodRots[BITS_PER_WORD];
+    uint goodRotCnt;
+
+    assert(rotsPerCycle <= sizeof(usedBitmap)*8);
+    r->ID += (1 << TWIDDLE_CNT_BIT0);           /* one more twiddle applied to this record */
+    r->ID &= ~ID_RECALC_BIT;                    /* the modified record has not been recalculated */
+    r->parentCRC = r->CRC;                      /* remember where it came from */
+
+    /* table of rotation values that RotCnt_Bad() does not reject */
+    goodRotCnt = 0;
+    for (s=0;s<BITS_PER_WORD;s++)
+        {
+        if (RotCnt_Bad(s))
+            continue;
+        goodRots[goodRotCnt++] = (u08b) s;
+        }
+
+    changeCnt = 1 + (Rand08() % MAX_ROT_CNT);   /* how many constants to change */
+    pass = 0;
+    while (pass < 4)
+        {
+        /* choose changeCnt distinct rotation indices and save their values */
+        usedBitmap = 0; 
+        for (s=0;s<changeCnt;s++)
+            {
+            do  {
+                idx[s] = Rand08() % rotsPerCycle;
+                }
+            while ((usedBitmap >> idx[s]) & 1); /* redraw until the index is unused */
+            usedBitmap |= (((u64b) 1) << idx[s]);
+            saved[s] = r->rotList[idx[s]];
+            }
+        attempt = 0;
+        while (attempt < MAX_TWIDDLE_CNT/4)
+            {
+            /* draw a new value, different from the saved one, for each chosen index */
+            for (s=0;s<changeCnt;s++)
+                {
+                do  {
+                    r->rotList[idx[s]] = goodRots[Rand32() % goodRotCnt];
+                    }
+                while (r->rotList[idx[s]] == saved[s]);
+                }
+            /* accept if saturation still holds and (where required) the min hw is kept */
+            if (Cycle_Min_HW(t.maxSatRnds,r->rotList,0,t.rotVerMask) == BITS_PER_WORD &&
+                (pass >= 2 || !(t.tstFlags & TST_FLG_KEEP_MIN_HW) ||
+                 Cycle_Min_HW(t.rounds,r->rotList,t.minHW_or,t.rotVerMask) >= (int) t.minHW_or))
+                {
+                Set_Min_hw_OR(r,t.rotVerMask,t.rounds);
+                return;
+                }
+            /* rejected: restore the saved values before the next attempt */
+            for (s=0;s<changeCnt;s++)
+                r->rotList[idx[s]] = saved[s];
+            attempt++;
+            }
+        pass++;
+        }
+    /* no acceptable twiddle found (very rare): start over with a fresh set */
+    get_rotation(r,t);
+    }
+
+/* run a full search */
+void RunSearch(testParms t)
+    {
+    enum        { KEEP_DIV = 16, KEEP_REP = 10, SHOW_CNT = 8 };
+    rSearchRec  popList[MAX_POP_CNT+2];
+    uint        i,j,k,n,repCnt,genCnt,keepCnt,prevBest[SHOW_CNT],showMask;
+    const       char *timeStr;
+    time_t      t0,t1;
+
+    Rand_Init(t.seed0 + (((u64b) bitsPerBlock) << 32));
+    memset(prevBest,0,sizeof(prevBest));
+
+    /* now set up the globals according to selected Skein blocksize */
+    switch (bitsPerBlock)
+        {
+        case  256:
+            t.genCntMax      = (t.genCntMax) ? t.genCntMax    : DEFAULT_GEN_CNT_4  ;
+            t.rounds         = (t.rounds)    ? t.rounds       : DEFAULT_ROUND_CNT_4;
+            t.minHW_or       = (t.minHW_or)  ? t.minHW_or     :         MIN_HW_OR_4;
+            t.maxSatRnds     = (t.maxSatRnds)? t.maxSatRnds   :    MAX_SAT_ROUNDS_4;
+            fwd_cycle_or_rN  = (t.rounds!=8) ? fwd_cycle_4_or :  fwd_cycle_4_or_r8 ;
+            rev_cycle_or_rN  = (t.rounds!=8) ? rev_cycle_4_or :  rev_cycle_4_or_r8 ;
+            fwd_cycle_or     = fwd_cycle_4_or;
+            rev_cycle_or     = fwd_cycle_4_or;
+            fwd_cycle        = fwd_cycle_4;
+            rev_cycle        = rev_cycle_4;
+            showMask         = 7;
+            break;
+        case  512:
+            t.genCntMax      = (t.genCntMax) ? t.genCntMax    : DEFAULT_GEN_CNT_8  ;
+            t.rounds         = (t.rounds)    ? t.rounds       : DEFAULT_ROUND_CNT_8;
+            t.minHW_or       = (t.minHW_or)  ? t.minHW_or     :         MIN_HW_OR_8;
+            t.maxSatRnds     = (t.maxSatRnds)? t.maxSatRnds   :    MAX_SAT_ROUNDS_8;
+            fwd_cycle_or_rN  = (t.rounds!=8) ? fwd_cycle_8_or :  fwd_cycle_8_or_r8 ;
+            rev_cycle_or_rN  = (t.rounds!=8) ? rev_cycle_8_or :  rev_cycle_8_or_r8 ;
+            fwd_cycle_or     = fwd_cycle_8_or;
+            rev_cycle_or     = rev_cycle_8_or;
+            fwd_cycle        = fwd_cycle_8;
+            rev_cycle        = rev_cycle_8;
+            showMask         = 3;
+            break;
+        case 1024:
+            t.genCntMax      = (t.genCntMax) ? t.genCntMax    : DEFAULT_GEN_CNT_16  ;
+            t.rounds         = (t.rounds)    ? t.rounds       : DEFAULT_ROUND_CNT_16;
+            t.minHW_or       = (t.minHW_or)  ? t.minHW_or     :         MIN_HW_OR_16;
+            t.maxSatRnds     = (t.maxSatRnds)? t.maxSatRnds   :    MAX_SAT_ROUNDS_16;
+            fwd_cycle_or_rN  = (t.rounds!=9) ? fwd_cycle_16_or: fwd_cycle_16_or_r9  ;
+            rev_cycle_or_rN  = (t.rounds!=9) ? rev_cycle_16_or: rev_cycle_16_or_r9  ;
+            fwd_cycle_or     = fwd_cycle_16_or;
+            rev_cycle_or     = rev_cycle_16_or;
+            fwd_cycle        = fwd_cycle_16;
+            rev_cycle        = rev_cycle_16;
+            showMask         = 1;
+            break;
+        default:
+            printf("Invalid block size!");
+            exit(2);
+        }
+    if (t.popCnt > MAX_POP_CNT)
+        t.popCnt = MAX_POP_CNT;
+    if (t.popCnt < MIN_POP_CNT)
+        t.popCnt = MIN_POP_CNT;
+    wordsPerBlock =   bitsPerBlock /      BITS_PER_WORD;
+    rotsPerCycle  = (wordsPerBlock / 2) * ROUNDS_PER_CYCLE;
+
+    keepCnt = t.popCnt/KEEP_DIV;
+    assert(keepCnt*(1+KEEP_REP) <= t.popCnt);
+    
+    printf("******************************************************************\n");
+    printf("Random seed = %u. BlockSize =%4d bits. sampleCnt =%6d. rounds = %2d. minHW_or=%d. CPU = %d-bit\n",
+                       t.seed0,bitsPerBlock,t.sampleCnt,t.rounds,t.minHW_or,(uint)sizeof(size_t)*8);
+    printf("Population  = %d. keepCnt = %d. repCnt = %d. rest = %d. keepMinHW = %d\n",
+            t.popCnt,keepCnt,KEEP_REP,t.popCnt-keepCnt*(1+KEEP_REP),(t.tstFlags & TST_FLG_KEEP_MIN_HW)?1:0); 
+    timeStr = ASCII_TimeDate();
+    if (t.tstFlags & TST_FLG_STDERR)
+        {
+        fprintf(stderr,"Start: %sBlock size = %d bits. popCnt = %d. sampleCnt = %d. keepMinHW = %d",
+                        timeStr,bitsPerBlock,t.popCnt,t.sampleCnt,(t.tstFlags & TST_FLG_KEEP_MIN_HW)?1:0);
+        if (t.runHours)
+            fprintf(stderr,". run time = %d hours",t.runHours);
+        fprintf(stderr,"\n");
+        }
+    else
+        showMask = 0;
+    printf("Start: %s  \n",timeStr);
+    time(&t0);
+    fflush(stdout);
+
+    for (n=0;n<t.popCnt;n++)
+        {   /* initialize the population with rotations that have "reasonable" hw_OR */
+        if (t.tstFlags & TST_FLG_STDERR)
+            fprintf(stderr,"\rGetRot: %04X    \r",t.popCnt-n);
+        if (get_rotation(&popList[n],t) == 0)
+            t.popCnt = n;               /* stop after end of file read in */
+        }
+    if (t.tstFlags & TST_FLG_STDERR)
+        fprintf(stderr,"\r%25s\r","");
+
+    for (genCnt=0;genCnt < t.genCntMax;genCnt++)
+        {   /* advance to the next generation */
+        for (i=0;i<t.popCnt;i++)
+            {   /* generate stats for all entries (this loop is where all the time is spent!) */
+            if ((i & showMask) == 1)
+                fprintf(stderr,"#%04X \r",t.popCnt-i);
+            if (genCnt == 0 || i >= keepCnt)
+                {
+                CheckDifferentials(&popList[i],t);
+                }
+            else if (i <= keepCnt/2 && (popList[i].ID & ID_RECALC_BIT) == 0)
+                {   /* recalc with bigger sampleCnt for better accuracy */
+                t.sampleCnt <<= 2;
+                CheckDifferentials(&popList[i],t);
+                t.sampleCnt >>= 2;
+                popList[i].rWorst = (popList[i].rWorst + 2) / 4;
+                popList[i].ID |= ID_RECALC_BIT;
+                }
+            }
+        qsort(popList,t.popCnt,sizeof(popList[0]),Compare_SearchRec_Descending);
+        if (t.genCntMax == 1)
+            { keepCnt = t.popCnt; break; }  /* allow quick processing from file */
+        /* now update the population for the next generation */
+        n = t.popCnt-1;                 /* start discarding at the end of the list */
+        for (i=0;i<keepCnt;i++)
+            {
+            if (t.tstFlags & TST_FLG_WEIGHT_REP)
+                repCnt = (i < keepCnt/2) ? KEEP_REP+2 : KEEP_REP-2 ;
+            else
+                repCnt = KEEP_REP;
+            for (j=0;j<repCnt;j++,n--)
+                {                       /* replicate the best ones, replacing the worst ones */
+                popList[n] = popList[i];
+                if (j == 0)
+                    {   /* splice two together, but only if they are from the same initial rotation set */
+                    k = Rand32() %  keepCnt;    
+                    if (((popList[n].ID ^ popList[k].ID) & ID_NUM_MASK) == 0)
+                        memcpy(popList[n].rotList,
+                               popList[k].rotList,
+                               rotsPerCycle*sizeof(popList[n].rotList[0])/2);
+                    }
+                Twiddle(&popList[n],t); /* tweak the replicate entry a bit */
+                assert(n >= keepCnt);   /* sanity check  */
+                }
+            }
+        for (;n>=keepCnt;n--)           /* just tweak the rest */
+            {
+            Twiddle(&popList[n],t);
+            }
+        time(&t1);
+        /* show current best */
+        if (t.tstFlags & TST_FLG_STDERR)
+            {   /* first to stderr (assuming redirected stdout */
+            fprintf(stderr,"\r%4d: ",genCnt+1);
+            for (i=j=0;i<SHOW_CNT;i++)
+                {
+                fprintf(stderr," %5.3f%c",popList[i].rWorst/(double)t.sampleCnt,(popList[i].ID & ID_RECALC_BIT)?'r':' ');
+                j |= (popList[i].rWorst ^ prevBest[i]); /* track changes */
+                prevBest[i] = popList[i].rWorst;
+                }
+            fprintf(stderr,"  {%6d sec%c}\n",(uint)(t1-t0),(j) ? '*':' ');
+            }
+        if (t.tstFlags & TST_FLG_VERBOSE)
+            {   /* then more details to stdout */
+            printf("::::: Gen =%5d. Best =%6.3f. PopCnt =%5d. SampleCnt =%5d. time=%6d.\n",
+                   genCnt+1,popList[0].rWorst/(double)t.sampleCnt,t.popCnt,t.sampleCnt,(uint)(t1-t0));
+            for (i=0;i<keepCnt;i++)
+                ShowSearchRec(stdout,&popList[i],t,SHOW_ROTS_PRELIM,(i)?' ':'-',i+1);
+            fflush(stdout);
+            }
+        if (t.runHours && t.runHours*3600 < (uint) (t1 - t0))
+            break;      /* timeout? */
+        }
+
+    /* re-grade the top entries using larger sampleCnt values */
+    printf("\n+++++++++++++ Preliminary results: sampleCnt = %5d, block = %4d bits\n",t.sampleCnt,bitsPerBlock);
+    qsort(popList,keepCnt,sizeof(popList[0]),Compare_SearchRec_Descending);
+    for (i=0;i<keepCnt;i++)
+        ShowSearchRec(stdout,&popList[i],t,SHOW_ROTS_PRELIM,' ',i+1);
+
+    /* re-run several times, since there will be statistical variations */
+    t.rotVerMask = MAX_ROT_VER_MASK;
+    t.diffBits   = (t.diffBits & 0x100) ? t.diffBits : 3;
+    t.sampleCnt *= 2;
+    t.tstFlags  |= TST_FLG_SHOW;
+    t.tstFlags  &= (TST_FLG_STDERR | TST_FLG_SHOW | TST_FLG_USE_ABS | TST_FLG_CHECK_ONE | TST_FLG_SHOW_HIST);
+
+    for (j=0;j < ((t.tstFlags & TST_FLG_CHECK_ONE) ? 1u:2u) ;j++)
+        {   /* do it twice, once with and once without USE_ABS, unless TST_FLG_CHECK_ONE set */
+        if (!(t.tstFlags & TST_FLG_CHECK_ONE))
+            t.tstFlags  ^= TST_FLG_USE_ABS;
+        for (n=0;n<t.regradeCnt;n++)
+            {
+            t.sampleCnt *= 2;
+            printf("+++ Re-running differentials with sampleCnt = %d, blockSize = %4d bits.%s\n",
+                   t.sampleCnt,bitsPerBlock,(t.tstFlags & TST_FLG_USE_ABS)?" absDiff":"" );
+            for (i=0;i<keepCnt;i++)
+                {
+                if (t.tstFlags & TST_FLG_STDERR)
+                    fprintf(stderr,"       Re-run: samples=%d, blk=%4d. #%02d.%s    \r",
+                            t.sampleCnt,bitsPerBlock,keepCnt-i,(t.tstFlags & TST_FLG_USE_ABS)?" absDiff":"" );
+                CheckDifferentials(&popList[i],t);
+                fflush(stdout);
+                }
+            if (keepCnt == 1)
+                {   /* show random comparison for final values */
+                printf("        RANDOM OUTPUT: /* useful stats for comparison to 'ideal' */\n");
+                t.tstFlags |=  TST_FLG_DO_RAND;
+                for (i=0;i<2;i++)
+                    {
+                    popList[keepCnt] =  popList[keepCnt-1];
+                    CheckDifferentials(&popList[keepCnt],t);
+                    }
+                t.tstFlags &= ~TST_FLG_DO_RAND;
+                }
+            /* sort per new stats */
+            if (t.tstFlags & TST_FLG_STDERR)
+                fprintf(stderr,"\r%60s\r","");
+            printf("\n+++++++++++++ Final results: sampleCnt = %5d, blockSize = %4d bits.%s\n",
+                   t.sampleCnt,bitsPerBlock,(t.tstFlags & TST_FLG_USE_ABS)?" absDiff":"" );
+            qsort(popList,keepCnt,sizeof(popList[0]),Compare_SearchRec_Descending);
+            for (i=keepCnt;i;i--)
+                ShowSearchRec(stdout,&popList[i-1],t,SHOW_ROTS_FINAL,(i==1)?'-':' ',i);
+            fflush(stdout);
+            }
+        printf("\n+++++++++++++ Formatted results: sampleCnt = %5d, blockSize = %4d bits. %s\n",
+               t.sampleCnt,bitsPerBlock,(t.tstFlags & TST_FLG_USE_ABS)?" absDiff":"" );
+        for (i=keepCnt;i;i--)
+            {
+            ShowSearchRec(stdout,&popList[i-1],t,SHOW_ROTS_H,' ',i);
+            printf("\n");
+            Show_HW_rounds(popList[i-1].rotList);
+            printf("\n");
+            }
+        fflush(stdout);
+        t.sampleCnt >>= n;  /* revert to original sampleCnt */
+        }
+
+    time(&t1);
+    printf("End:   %s\n",ASCII_TimeDate());
+    printf("Elapsed time = %6.3f hours\n\n",(t1-t0)/(double)3600.0);
+    if (t.tstFlags & TST_FLG_STDERR)
+        fprintf(stderr,"\r%60s\n","");    /* clear the screen if needed */
+    fflush(stdout);
+    }
+
+/*
+** Print the command-line usage summary to stdout and terminate the
+** process with exit status 0 (this routine never returns).
+** The text lists every option letter accepted by the switch in main();
+** note that verbose output is on by default, so -V turns it off.
+*/
+void GiveHelp(void)
+    {
+    printf("Usage:   skein_rot_search [options/flags]\n"
+           "Options: -Bnn   = set Skein block size in bits (default=512)\n"
+           "         -Cnn   = set count of random differentials taken\n"
+           "         -Dnn   = set number bits of difference pattern tested (default=1)\n"
+           "         -Gnn   = set min invalid rotation value (default 0)\n"
+           "         -Inn   = set rotation version mask\n"
+           "         -Jnn   = set number of re-grade passes (default=3)\n"
+           "         -Onn   = set Hamming weight offset\n"
+           "         -Pnn   = set population count\n"
+           "         -Rnn   = set round count\n"
+           "         -Snn   = set initial random seed (0 --> randomize)\n"
+           "         -Tnn   = set max time to run (in hours)\n"
+           "         -Wnn   = set minimum hamming weight\n"
+           "         -Xnn   = set max test rotation count\n"
+           "         -Znn   = set max rounds needed for saturation using OR\n"
+           "         @file  = read rotations from file\n"
+           "Flags:   -A     = use min, not absolute difference\n"
+           "         -E     = no stderr output\n"
+           "         -H     = show histogram (very wide)\n"
+           "         -K     = keep minHW_or during twiddling\n"
+           "         -Q     = disable quick exit in search\n"
+           "         -U     = weighted repeat count (repeat best more frequently)\n"
+           "         -V     = disable verbose mode\n"
+           "         -0     = skip inverse function checks at startup\n"
+           "         -1     = run final checks both with and without absolute difference\n"
+           "         -2     = set duplicate rotation mask (default allows duplicates in a round)\n"
+           "         -8     = optimize for 8-bit CPU performance\n"
+          );
+    exit(0);
+    }
+
+/*
+** Program entry point: parse the command line into a testParms block,
+** optionally run the forward/inverse self-checks, build the mask of
+** permitted rotation counts, then call RunSearch() once per block size
+** (a single size if -B was given, else 256, 512 and 1024).
+** Returns 0; GiveHelp() terminates the process directly.
+*/
+int main(int argc,char *argv[])
+    {
+    uint        i,bMin,bMax;
+    testParms   t;
+    uint chkInv =        1;   /* check inverse functions at startup (slow for debugging) */
+    uint goodRot=        2;   /* first allowed rotation value (+/-) */
+    uint seed   =        1;   /* 0 = randomize based on time, else use specified seed */
+    uint do8    =        0;   /* optimize 8-bit CPU performance */
+
+    t.rounds    =        0;   /* number of Skein rounds to test */
+    t.minHW_or  =        0;   /* minHW (using OR) required */
+    t.minOffs   =        4;   /* heuristic used to speed up rotation search */
+    t.diffBits  =        1;   /* # consecutive bits of differential inputs tested */
+    t.sampleCnt =     1024;   /* number of differential pairs tested */
+    t.genCntMax =        0;   /* number of "generations" tested */
+    t.maxSatRnds=        0;   /* number of rounds to Hamming weight "saturation" */
+    t.rotVerMask=        3;   /* mask of which versions to run */
+    t.runHours  =        0;   /* stop searching after this many hours */
+    t.dupRotMask=        0;   /* default is to allow same rotation value in a round */
+    t.regradeCnt=        3;   /* how many scaled up counts to try */
+    t.popCnt    = DEFAULT_POP_CNT;                      /* size of population */
+    t.tstFlags  = TST_FLG_STDERR | TST_FLG_VERBOSE | TST_FLG_USE_ABS | TST_FLG_CHECK_ONE; /* default flags */
+
+    for (i=1;i<(uint)argc;i++)
+        {   /* parse command line args */
+        if (argv[i][0] == '?')
+            GiveHelp();
+        else if (argv[i][0] == '-' || argv[i][0] == '+')
+            {
+            /* numeric value follows the option letter, optionally after '=' (e.g. -B512 or -B=512) */
+#define arg_toi(s) atoi((s) + (((s)[2] == '=') ? 3 : 2))
+            /* cast first: handing a negative plain char to toupper() is undefined behavior */
+            switch (toupper((unsigned char) argv[i][1]))
+                {
+                case '?': GiveHelp();                            break;
+                                                                 
+                case 'A': t.tstFlags   &= ~TST_FLG_USE_ABS;      break;
+                case 'E': t.tstFlags   &= ~TST_FLG_STDERR;       break;
+                case 'H': t.tstFlags   |=  TST_FLG_SHOW_HIST;    break;
+                case 'K': t.tstFlags   |=  TST_FLG_KEEP_MIN_HW;  break;
+                case 'Q': t.tstFlags   |=  TST_FLG_QUICK_EXIT;   break;
+                case 'U': t.tstFlags   |=  TST_FLG_WEIGHT_REP;   break;
+                case 'V': t.tstFlags   &= ~TST_FLG_VERBOSE;      break;
+                case '1': t.tstFlags   &= ~TST_FLG_CHECK_ONE;    break;
+
+                case 'B': bitsPerBlock  =  arg_toi(argv[i]);     break;
+                case 'C': t.sampleCnt   =  arg_toi(argv[i]);     break;
+                case 'D': t.diffBits    =  arg_toi(argv[i]);     break;
+                case 'G': goodRot       =  arg_toi(argv[i]);     break;
+                case 'I': t.rotVerMask  =  arg_toi(argv[i]);     break;
+                case 'J': t.regradeCnt  =  arg_toi(argv[i]);     break;
+                case 'O': t.minOffs     =  arg_toi(argv[i]);     break;
+                case 'P': t.popCnt      =  arg_toi(argv[i]);     break;
+                case 'R': t.rounds      =  arg_toi(argv[i]);     break;
+                case 'S': seed          =  arg_toi(argv[i]);     break;
+                case 'T': t.runHours    =  arg_toi(argv[i]);     break;
+                case 'W': t.minHW_or    =  arg_toi(argv[i]);     break;
+                case 'X': t.genCntMax   =  arg_toi(argv[i]);     break;
+                case 'Z': t.maxSatRnds  =  arg_toi(argv[i]);     break;
+                case '2': t.dupRotMask  = ~0u;                   break;
+                case '0': chkInv        =  0;                    break;
+                case '8': do8           =  1;                    break;
+
+                default : printf("Unknown option: %s\n",argv[i]); GiveHelp();     break;
+                }
+            }
+        else if (argv[i][0] == '@')
+            {
+            rotFileName = argv[i]+1;
+            t.genCntMax = 1;            /* stop after one generation */
+            }
+        }
+
+    if (chkInv)
+        InverseChecks();    /* check fwd vs. rev transforms (slow in debugger) */
+
+    /* Build the bit mask of allowed rotation counts: goodRot .. BITS_PER_WORD-goodRot.
+    ** Two guards keep user-supplied -G values from invoking undefined behavior:
+    **   - goodRot > BITS_PER_WORD would wrap the unsigned upper bound;
+    **   - -G0 would otherwise shift by the full width of the mask.          */
+    t.goodRotCntMask = 0;
+    if (goodRot <= BITS_PER_WORD)
+        for (i=goodRot; i <= BITS_PER_WORD - goodRot && i < 8*sizeof(t.goodRotCntMask) ;i++)
+            t.goodRotCntMask |= (((u64b) 1) << i);
+    if (do8) 
+        t.goodRotCntMask = (((u64b) 0x03838383) << 32) | 0x83838380;
+
+    if (bitsPerBlock == 0)
+        {
+        printf("Running search for all Skein block sizes (256, 512, and 1024)\n");
+        t.rounds   = 0;   /* use defaults, since otherwise it makes little sense */
+        t.minHW_or = 0;
+        }
+
+    bMin = (bitsPerBlock) ? bitsPerBlock :  256;
+    bMax = (bitsPerBlock) ? bitsPerBlock : 1024;
+
+    for (bitsPerBlock=bMin;bitsPerBlock<=bMax;bitsPerBlock*=2)
+        {
+        t.seed0 = (seed) ? seed : (uint) time(NULL);   /* randomize based on time if -s0 is given */
+        RunSearch(t);
+        }
+    
+    return 0;
+    }
diff --git a/Additional_Implementations/skein_test.c b/Additional_Implementations/skein_test.c
new file mode 100644
index 0000000000000..9d999e0d49c0c
--- /dev/null
+++ b/Additional_Implementations/skein_test.c
@@ -0,0 +1,1380 @@
+/***********************************************************************
+**
+** Test/verification code for the Skein block functions.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+** Testing:
+**   - buffering of incremental calls (random cnt steps)
+**   - partial input byte handling
+**   - output sample hash results (for comparison of ref vs. optimized)
+**   - performance
+**
+***********************************************************************/
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <time.h>
+#include <assert.h>
+
+#include "skein.h"
+#include "SHA3api_ref.h"
+
+static const uint_t HASH_BITS[] =    /* list of hash lengths to test */
+        { 160,224,256,384,512,1024, 256+8,512+8,1024+8,2048+8 };
+
+/* number of entries in HASH_BITS[] */
+#define HASH_BITS_CNT   (sizeof(HASH_BITS)/sizeof(HASH_BITS[0]))
+
+/* bits of the verbose flag word (each macro is a distinct single-bit mask) */
+/* NOTE(review): presumably tested against the file-scope 'verbose' variable -- confirm at the use sites */
+#define V_KAT_LONG      (1u << 0)
+#define V_KAT_SHORT     (1u << 1)
+#define V_KAT_NO_TREE   (1u << 2)
+#define V_KAT_NO_SEQ    (1u << 3)
+#define V_KAT_NO_3FISH  (1u << 4)
+#define V_KAT_DO_3FISH  (1u << 5)
+
+/* automatic compiler version number detection */
+/* - skipped entirely when CompilerVersion is already defined (e.g. on the command line) */
+/* - if none of the branches below matches, CompilerVersion is left undefined            */
+#if !defined(CompilerVersion)
+
+/* Microsoft: map ranges of _MSC_VER onto a fixed three-digit number */
+#if   defined(_MSC_VER) && (_MSC_VER >= 1400)
+#define CompilerVersion (900)
+#elif defined(_MSC_VER) && (_MSC_VER >= 1200)
+#define CompilerVersion (600)
+#elif defined(_MSC_VER) && (_MSC_VER >= 1000)
+#define CompilerVersion (420)
+/* GNU: decimal digits are major, minor, patchlevel */
+#elif defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
+#define CompilerVersion (100*__GNUC__ + 10*__GNUC_MINOR__ + __GNUC_PATCHLEVEL__)
+/* Borland: re-encode the hex nibbles of __BORLANDC__ as decimal digits */
+#elif defined(__BORLANDC__) /* this is in hex */
+#define CompilerVersion (100*(__BORLANDC__ >> 8) + 10*((__BORLANDC__ >> 4) & 0xF) + (__BORLANDC__ & 0xF))
+#endif
+
+#endif
+
+/* Three build configurations for the code-size / unroll-count helpers:       */
+/*   1. SKEIN_CODE_SIZE or SKEIN_PERF defined: only declared here, so the     */
+/*      definitions must be supplied by another translation unit.             */
+/*   2. else SKEIN_LOOP defined: unroll counts are the decimal digits of      */
+/*      SKEIN_LOOP (hundreds = 256, tens = 512, ones = 1024).                 */
+/*   3. otherwise: every unroll count is reported as 0.                       */
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) 
+/* external functions to determine code size (in bytes) */
+size_t  Skein_256_Process_Block_CodeSize(void);
+size_t  Skein_512_Process_Block_CodeSize(void);
+size_t  Skein1024_Process_Block_CodeSize(void);
+size_t  Skein_256_API_CodeSize(void);
+size_t  Skein_512_API_CodeSize(void);
+size_t  Skein1024_API_CodeSize(void);
+uint_t  Skein_256_Unroll_Cnt(void);
+uint_t  Skein_512_Unroll_Cnt(void);
+uint_t  Skein1024_Unroll_Cnt(void);
+#elif defined(SKEIN_LOOP)
+uint_t  Skein_256_Unroll_Cnt(void) { return (SKEIN_LOOP / 100) % 10; }
+uint_t  Skein_512_Unroll_Cnt(void) { return (SKEIN_LOOP /  10) % 10; }
+uint_t  Skein1024_Unroll_Cnt(void) { return (SKEIN_LOOP      ) % 10; }
+#else
+uint_t  Skein_256_Unroll_Cnt(void) { return 0; }
+uint_t  Skein_512_Unroll_Cnt(void) { return 0; }
+uint_t  Skein1024_Unroll_Cnt(void) { return 0; }
+#endif
+
+/* External function to process blkCnt (nonzero) full block(s) of data. */
+/* (prototypes only: one per Skein context type; not defined in this file section) */
+void    Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd);
+void    Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd);
+void    Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd);
+
+/********************** debug i/o helper routines **********************/
+/*
+** Report an unrecoverable error: format the printf-style message 's'
+** (plus its variadic arguments) to stdout, append a newline, and then
+** terminate the process with exit status 2.  Does not return.
+*/
+void FatalError(const char *s,...)
+    {
+    va_list args;
+
+    va_start(args,s);
+    vprintf(s,args);        /* message body, exactly as formatted by the caller */
+    va_end(args);
+
+    putchar('\n');          /* terminate the line */
+    exit(2);
+    }
+
+/* file-scope test state */
+static uint_t _quiet_   =   0;  /* quiet processing? */
+static uint_t verbose   =   0;  /* verbose flag bits */
+/* katHash starts at all-ones and is updated for every byte printed by ShowBytes() */
+static uint_t katHash   = ~0u;  /* use as a quick check on KAT results */
+
+/*
+** Hex-dump 'cnt' bytes from 'b' to stdout, 16 bytes per line with an
+** extra space between groups of four, and fold every byte into the
+** running katHash checksum.
+*/
+void ShowBytes(uint_t cnt,const u08b_t *b)
+    {
+    uint_t pos = 0;
+
+    while (pos < cnt)
+        {
+        const uint_t col = pos % 16;        /* column within the current output line */
+        uint_t       mix;
+
+        if (col == 0)
+            printf("    ");                 /* indent at the start of every line */
+        else if ((col % 4) == 0)
+            printf(" ");                    /* gap between 4-byte groups */
+        printf(" %02X",b[pos]);
+
+        /* two-step multiplicative mix of the byte into katHash */
+        mix     = (katHash ^ b[pos]) * 0xDEADBEEF;
+        katHash = (mix ^ (mix >> 23) ^ (mix >> 17) ^ (mix >> 9)) * 0xCAFEF00D;
+
+        if (col == 15 || pos+1 == cnt)      /* end of a full line, or last byte */
+            printf("\n");
+        pos++;
+        }
+    }
+
+/* when SKEIN_DEBUG is not defined, this file provides its own skein_DebugFlag (initially 0) */
+#ifndef SKEIN_DEBUG
+uint_t skein_DebugFlag     =   0;     /* dummy flags (if not defined elsewhere) */
+#endif
+
+/* SKEIN_DEBUG_SHORT is the OR of the six SKEIN_DEBUG_* bits named below; */
+/* SKEIN_DEBUG_DEFAULT is currently just an alias for it                   */
+#define SKEIN_DEBUG_SHORT   (SKEIN_DEBUG_HDR | SKEIN_DEBUG_STATE | SKEIN_DEBUG_TWEAK | SKEIN_DEBUG_KEY | SKEIN_DEBUG_INPUT_08 | SKEIN_DEBUG_FINAL)
+#define SKEIN_DEBUG_DEFAULT (SKEIN_DEBUG_SHORT)
+
+/*
+** printf-style trace output to stdout, emitted only while
+** skein_DebugFlag is nonzero; otherwise the call does nothing.
+*/
+void Show_Debug(const char *s,...)
+    {
+    va_list args;
+
+    if (!skein_DebugFlag)               /* debug display is switched off */
+        return;
+
+    va_start(args,s);
+    vprintf(s,args);
+    va_end(args);
+    }
+
+/************** Timing routine (for performance measurements) ***********/
+/* unfortunately, this is generally assembly code and not very portable */
+
+/* _Is_X86_ is defined when any of the common 32-/64-bit x86 target macros is present */
+#if defined(_M_IX86) || defined(__i386) || defined(_i386) || defined(__i386__) || defined(i386) || \
+    defined(_X86_)   || defined(__x86_64__) || defined(_M_X64) || defined(__x86_64)
+#define _Is_X86_    1
+#endif
+
+/* HI_RES_CLK_OK requires: an x86 target, not strict-ANSI mode, and one of the */
+/* compilers (Borland, MSVC, MinGW, GCC) that HiResTime() has code for          */
+#if  defined(_Is_X86_) && (!defined(__STRICT_ANSI__)) && (defined(__GNUC__) || !defined(__STDC__)) && \
+    (defined(__BORLANDC__) || defined(_MSC_VER) || defined(__MINGW_H) || defined(__GNUC__))
+#define HI_RES_CLK_OK         1          /* it's ok to use RDTSC opcode */
+
+/* 64-bit MSVC: use the __rdtsc intrinsic (HiResTime() uses no inline asm in this configuration) */
+#if defined(_MSC_VER) && defined(_M_X64)
+#include <intrin.h>
+#pragma intrinsic(__rdtsc)
+#endif
+
+#endif
+
+/*
+** Read the CPU time-stamp counter (RDTSC) and return its low 32 bits.
+** One compiler-specific code path is selected at build time; each path
+** also defines COMPILER_ID as a side effect.  When HI_RES_CLK_OK is not
+** defined the routine calls FatalError() (except in the MSVC ANSI-mode
+** configuration excluded below) and otherwise returns 0.
+*/
+uint_32t HiResTime(void)
+    {
+#if defined(HI_RES_CLK_OK)
+    uint_32t x[2];                  /* x[0] = low word (returned); x[1] is written only by the GCC path */
+#if   defined(__BORLANDC__)
+#define COMPILER_ID "BCC"
+    _asm { push edx };              /* save edx around the RDTSC opcode */
+    __emit__(0x0F,0x31);            /* RDTSC instruction */
+    _asm { pop  edx };
+    _asm { mov x[0],eax };
+#elif defined(_MSC_VER)
+#define COMPILER_ID "MSC"
+#if defined(_MSC_VER) && defined(_M_X64)
+    x[0] = (uint_32t) __rdtsc();    /* intrinsic result truncated to 32 bits */
+#else
+    _asm { push  edx };
+    _asm { _emit 0fh }; _asm { _emit 031h };    /* opcode bytes 0F 31 = RDTSC */
+    _asm { pop   edx };
+    _asm { mov x[0],eax };
+#endif
+#elif defined(__MINGW_H) || defined(__GNUC__)
+#define COMPILER_ID "GCC"
+    asm volatile("rdtsc" : "=a"(x[0]), "=d"(x[1]));    /* eax -> x[0], edx -> x[1] */
+#else
+#error  "HI_RES_CLK_OK -- but no assembler code for this platform (?)"
+#endif
+    return x[0];
+#else
+    /* avoid annoying MSVC 9.0 compiler warning #4720 in ANSI mode! */
+#if (!defined(_MSC_VER)) || (!defined(__STDC__)) || (_MSC_VER < 1300)
+    FatalError("No support for RDTSC on this CPU platform\n");
+#endif
+    return 0;
+#endif /* defined(HI_RES_CLK_OK) */
+    }
+
+/******** OS-specific calls for setting priorities and sleeping ******/
+#if (defined(_MSC_VER) && (_MSC_VER >= 1300) && !defined(__STRICT_ANSI__) && !defined(__STDC__)) \
+    && defined(_M_X64)
+#include <Windows.h>
+#include <WinBase.h>
+
+#ifdef  SKEIN_FORCE_LOCK_CPU            /* NielsF says this is not a good way to do things */
+#define SKEIN_LOCK_CPU_OK (1)
+/*
+** Restrict this process to the processor it is currently running on,
+** for more stable performance timing.
+**   -- thanks to Brian Gladman for this code
+** Returns 0 on success, 1 if the affinity mask could not be set,
+** 2 if the current affinity mask could not be read.
+*/
+int Lock_CPU(void)
+    {
+    HANDLE    proc = GetCurrentProcess();
+    DWORD_PTR procMask;
+    DWORD_PTR sysMask;
+
+    if (!GetProcessAffinityMask(proc, &procMask, &sysMask))
+        return 2;
+
+    /* keep only the bit of the processor we are executing on right now */
+    procMask &= (((size_t)1u) << GetCurrentProcessorNumber());
+    if (!SetProcessAffinityMask(proc, procMask))
+        return 1;
+
+    return 0;   /* success */
+    }
+#endif
+
+/* Windows build: mark OS_Sleep() as provided so the generic fallback is not compiled */
+#define _GOT_OS_SLEEP        (1)
+/* suspend the calling thread by forwarding 'msec' to the Win32 Sleep() call */
+void OS_Sleep(uint_t msec)
+    {
+    Sleep(msec);
+    }
+
+#define _GOT_OS_SET_PRIORITY (1)
+/*
+** Raise the current thread to THREAD_PRIORITY_HIGHEST and, when
+** SKEIN_LOCK_CPU_OK is defined, also pin the process via Lock_CPU().
+** Returns 0 on success, 1 if the priority change failed, 2 if the
+** CPU lock failed.
+*/
+int OS_Set_High_Priority(void)
+    {
+    int status = 0;
+
+    if (!SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST))
+        status = 1;
+#ifdef SKEIN_LOCK_CPU_OK    
+    else if (Lock_CPU())
+        status = 2;
+#endif
+    return status;
+    }
+
+/*
+** Put the current thread back to THREAD_PRIORITY_NORMAL.
+** Returns 0 on success, 1 if the priority change failed.
+*/
+int OS_Set_Normal_Priority(void)
+    {
+    return SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_NORMAL) ? 0 : 1;
+    }
+#endif
+
+#if defined(__linux) || defined(__linux__) || defined(linux) || defined(__gnu_linux__)
+#include <unistd.h>
+#define _GOT_OS_SLEEP        (1)
+/* Linux build: sleep for 'mSec' milliseconds by handing the equivalent microsecond count to usleep() */
+void OS_Sleep(uint_t mSec)
+    {
+    const uint_t uSec = mSec*1000;      /* milliseconds --> microseconds */
+
+    usleep(uSec);
+    }
+#endif
+
+/* compiled only when no OS-specific section above defined _GOT_OS_SET_PRIORITY */
+#ifndef _GOT_OS_SET_PRIORITY
+/* dummy routines if nothing is available */
+/* (both do nothing and unconditionally return 0) */
+int OS_Set_High_Priority(void)
+    {
+    return 0;
+    }
+int OS_Set_Normal_Priority(void)
+    {
+    return 0;
+    }
+#endif
+
+/* compiled only when no OS-specific section above defined _GOT_OS_SLEEP */
+/* NOTE(review): this fallback takes/returns uint_32t, while the OS-specific */
+/* versions above are declared 'void OS_Sleep(uint_t)' -- confirm callers    */
+/* never rely on the return value.                                           */
+#ifndef _GOT_OS_SLEEP
+uint_32t OS_Sleep(uint_32t mSec)
+    {
+    return mSec;    /* avoid compiler warnings */
+    }
+#endif
+   
+/* fallback name when HiResTime() did not select a compiler-specific path */
+#ifndef COMPILER_ID
+#define COMPILER_ID "(unknown)"
+#endif
+/********************** use RC4 to generate test data ******************/
+/* Note: this works identically on all platforms (big/little-endian)   */
+/* Single file-scope generator state: set up by Rand_Init(), advanced  */
+/* one step per output byte by RandBytes().                            */
+static struct
+    {
+    uint_t I,J;                         /* RC4 vars */
+    u08b_t state[256];                  /* permutation of the byte values 0..255 */
+    } prng;
+
+/*
+** Fill dst[0..byteCnt-1] with keystream bytes from the file-scope RC4
+** generator 'prng', advancing its I/J indices and state permutation by
+** one step per byte produced.
+*/
+void RandBytes(void *dst,uint_t byteCnt)
+    {
+    u08b_t *out = (u08b_t *) dst;
+    uint_t  k;
+
+    for (k=0;k<byteCnt;k++)             /* run RC4  */
+        {
+        u08b_t si,sj;
+
+        prng.I  = (prng.I+1)  & 0xFF;
+        si      =  prng.state[prng.I];
+        prng.J  = (prng.J+si) & 0xFF;
+        sj      =  prng.state[prng.J];
+
+        prng.state[prng.I] = sj;        /* swap state[I] <--> state[J] */
+        prng.state[prng.J] = si;
+
+        out[k]  =  prng.state[(si+sj) & 0xFF];
+        }
+    }
+
+/*
+** Return a pseudo-random integer assembled from four RandBytes() bytes,
+** first byte most significant, so the value does not depend on the
+** host byte order.
+*/
+uint_t Rand32(void)
+    {
+    u08b_t raw[4];
+    uint_t value = 0;
+    uint_t idx   = 0;
+
+    RandBytes(raw,sizeof(raw));
+
+    while (idx < sizeof(raw))
+        {
+        value = (value << 8) | raw[idx];    /* append next byte at the low end */
+        idx++;
+        }
+
+    return value;
+    }
+
+/*
+** (Re)initialize the RC4-based generator 'prng' from a 64-bit seed:
+** the seed is serialized least-significant byte first into an 8-byte
+** key, the RC4 key schedule is run over the identity permutation, and
+** the first 512 keystream bytes are thrown away.
+*/
+void Rand_Init(u64b_t seed)
+    {
+    uint_t i,j;
+    u08b_t key[8];
+    u08b_t discard[512];
+    u08b_t hold;
+
+    /* init the "key" in an endian-independent fashion */
+    for (i=0;i<8;i++)
+        key[i] = (u08b_t) (seed >> (8*i));
+
+    /* initialize the permutation */
+    for (i=0;i<256;i++)
+        prng.state[i]=(u08b_t) i;
+
+    /* now run the RC4 key schedule */
+    j = 0;
+    for (i=0;i<256;i++)
+        {
+        j = (j + prng.state[i] + key[i%8]) & 0xFF;
+        hold          = prng.state[i];      /* swap state[i] <--> state[j] */
+        prng.state[i] = prng.state[j];
+        prng.state[j] = hold;
+        }
+    prng.I = 0;             /* init I,J variables for RC4 */
+    prng.J = 0;
+
+    /* discard initial keystream before returning */
+    RandBytes(discard,sizeof(discard));
+    }
+    
+/***********************************************************************/
+/* An AHS-like API that allows explicit setting of block size          */
+/*    [i.e., the AHS API selects a block size based solely on the ]    */
+/*    [hash result length, while Skein allows independent hash    ]    */
+/*    [result size and block size                                 ]    */
+/***********************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/*
+** Select the context that matches blkSize (256, 512 or 1024 bits),
+** record the state size in state->statebits and run the matching
+** Skein init routine for a hashbitlen-bit result.
+** Returns that routine's result, or SKEIN_FAIL (leaving *state
+** untouched) for any other blkSize.
+*/
+int Skein_Init(int blkSize,hashState *state, int hashbitlen)
+    {
+    const size_t outBits = (size_t) hashbitlen;
+
+    if (blkSize ==  256)
+        {
+        state->statebits = 64*SKEIN_256_STATE_WORDS;
+        return Skein_256_Init(&state->u.ctx_256,outBits);
+        }
+    if (blkSize ==  512)
+        {
+        state->statebits = 64*SKEIN_512_STATE_WORDS;
+        return Skein_512_Init(&state->u.ctx_512,outBits);
+        }
+    if (blkSize == 1024)
+        {
+        state->statebits = 64*SKEIN1024_STATE_WORDS;
+        return Skein1024_Init(&state->u.ctx1024,outBits);
+        }
+    return SKEIN_FAIL;      /* unsupported block size */
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/*
+** Extended variant of Skein_Init(): same block-size dispatch, but the
+** tree-info word and an optional key (key/keyBytes) are forwarded to
+** the matching *_InitExt routine.
+** Returns that routine's result, or SKEIN_FAIL (leaving *state
+** untouched) when blkSize is not 256, 512 or 1024.
+*/
+int Skein_InitExt(int blkSize,hashState *state, int hashbitlen,u64b_t treeInfo,const u08b_t *key,size_t keyBytes)
+    {
+    const size_t outBits = (size_t) hashbitlen;
+
+    if (blkSize ==  256)
+        {
+        state->statebits = 64*SKEIN_256_STATE_WORDS;
+        return Skein_256_InitExt(&state->u.ctx_256,outBits,treeInfo,key,keyBytes);
+        }
+    if (blkSize ==  512)
+        {
+        state->statebits = 64*SKEIN_512_STATE_WORDS;
+        return Skein_512_InitExt(&state->u.ctx_512,outBits,treeInfo,key,keyBytes);
+        }
+    if (blkSize == 1024)
+        {
+        state->statebits = 64*SKEIN1024_STATE_WORDS;
+        return Skein1024_InitExt(&state->u.ctx1024,outBits,treeInfo,key,keyBytes);
+        }
+    return SKEIN_FAIL;      /* unsupported block size */
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/*
+** Process data to be hashed: feed 'databitlen' bits from 'data' into
+** the context selected by state->statebits.
+**
+** Whole-byte input is passed straight to the matching *_Update routine.
+** When databitlen is not a multiple of 8, the final partial byte is
+** included in the update, the bit-pad tweak flag is set, and that byte
+** is patched in the context buffer: the bit right after the last
+** message bit is set and all lower bits are cleared.
+**
+** Returns SKEIN_SUCCESS, the error returned by the underlying *_Update
+** routine, or SKEIN_FAIL when statebits is not 256, 512 or 1024.
+*/
+int Skein_Update(hashState *state, const BitSequence *data, DataLength databitlen)
+    {
+    /* only the final Update() call is allowed to do partial bytes, else assert an error */
+    Skein_Assert((state->u.h.T[1] & SKEIN_T1_FLAG_BIT_PAD) == 0 || databitlen == 0, FAIL);
+
+    if ((databitlen & 7) == 0)
+        {
+        switch (state->statebits)
+            {
+            case  512:  return Skein_512_Update(&state->u.ctx_512,data,databitlen >> 3);
+            case  256:  return Skein_256_Update(&state->u.ctx_256,data,databitlen >> 3);
+            case 1024:  return Skein1024_Update(&state->u.ctx1024,data,databitlen >> 3);
+            default: return SKEIN_FAIL;
+            }
+        }
+    else
+        {
+        size_t bCnt = (databitlen >> 3) + 1;                  /* number of bytes to handle */
+        u08b_t mask,*p;
+        int    r;                                             /* result of the underlying Update() */
+
+        /* (the predefined macro is _MSC_VER; the misspelled MSC_VER always evaluated to 0) */
+#if (!defined(_MSC_VER)) || (_MSC_VER >= 1200)                /* MSC v4.2 gives (invalid) warning here!!  */
+        Skein_assert(&state->u.h == &state->u.ctx_256.h);     /* sanity checks: allow u.h --> all contexts */
+        Skein_assert(&state->u.h == &state->u.ctx_512.h);
+        Skein_assert(&state->u.h == &state->u.ctx1024.h);
+#endif
+        switch (state->statebits)
+            {
+            case  512: r    = Skein_512_Update(&state->u.ctx_512,data,bCnt);
+                       p    = state->u.ctx_512.b;
+                       break;
+            case  256: r    = Skein_256_Update(&state->u.ctx_256,data,bCnt);
+                       p    = state->u.ctx_256.b;
+                       break;
+            case 1024: r    = Skein1024_Update(&state->u.ctx1024,data,bCnt);
+                       p    = state->u.ctx1024.b;
+                       break;
+            default:   return SKEIN_FAIL;
+            }
+        if (r != SKEIN_SUCCESS)         /* do not touch the buffer if the update itself failed */
+            return r;
+        Skein_Set_Bit_Pad_Flag(state->u.h);                     /* set tweak flag for the final call */
+        /* now "pad" the final partial byte the way NIST likes */
+        bCnt = state->u.h.bCnt;         /* get the bCnt value (same location for all block sizes) */
+        Skein_assert(bCnt != 0);        /* internal sanity check: there IS a partial byte in the buffer! */
+        mask = (u08b_t) (1u << (7 - (databitlen & 7)));         /* partial byte bit mask */
+        p[bCnt-1]  = (u08b_t)((p[bCnt-1] & (0-mask)) | mask);   /* apply bit padding on final byte (in the buffer) */
+        
+        return SKEIN_SUCCESS;
+        }
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/*
+** Finish the hash computation and write the result to 'hashval' by
+** calling the *_Final routine that matches state->statebits.
+** Returns that routine's result, or SKEIN_FAIL when statebits is not
+** 256, 512 or 1024.
+*/
+int Skein_Final(hashState *state, BitSequence *hashval)
+    {
+    if (state->statebits ==  512)
+        return Skein_512_Final(&state->u.ctx_512,hashval);
+    if (state->statebits ==  256)
+        return Skein_256_Final(&state->u.ctx_256,hashval);
+    if (state->statebits == 1024)
+        return Skein1024_Final(&state->u.ctx1024,hashval);
+    return SKEIN_FAIL;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/*
+** All-in-one hash function: Init + Update + Final on a local context.
+** Hashes 'databitlen' bits of 'data' with the given block size and
+** writes a 'hashbitlen'-bit result to 'hashval'.
+**
+** Returns SKEIN_SUCCESS, or the first error reported by Skein_Init(),
+** Skein_Update() or Skein_Final().  Skein_Final() is still called when
+** Skein_Update() fails, but its status is then not reported.
+*/
+int Skein_Hash(int blkSize,int hashbitlen, const BitSequence *data, /* all-in-one call */
+                DataLength databitlen,BitSequence *hashval)
+    {
+    hashState  state;
+    int r = Skein_Init(blkSize,&state,hashbitlen);
+    if (r == SKEIN_SUCCESS)
+        { /* these calls do not fail when called properly */
+        int rFinal;
+        r      = Skein_Update(&state,data,databitlen);
+        rFinal = Skein_Final(&state,hashval);
+        if (r == SKEIN_SUCCESS)
+            r = rFinal;             /* do not silently drop a Final() failure */
+        }
+    return r;
+    }
+
+/***********************************************************************/
+/* various self-consistency checks */
+/*
+** Parameters (all lengths are in bits):
+**   blkSize - Skein block size: 256, 512 or 1024
+**   maxLen  - largest message length tested (0 --> 2*blkSize; capped at 8*MAX_BUF)
+**   hashLen - hash result length, a multiple of 8 (0 --> blkSize; capped at 8*MAX_BUF)
+**   nStep   - increment of the message byte count between test points
+**             (0 --> print a banner line and step by 1)
+**   oneBlk  - if nonzero, just hash a single oneBlk-bit counting pattern and return 1
+**
+** For every message length the one-shot Skein_Hash() result is compared
+** against the AHS API Hash() (when that API selects the same block size)
+** and against four incremental Update() sequences; each result buffer is
+** also checked for writes beyond hashLen/8 bytes.  Any failure ends the
+** program via FatalError().
+**
+** Returns the number of message bit-lengths tested.
+*/
+uint_t Skein_Test(uint_t blkSize,uint_t maxLen,uint_t hashLen,uint_t nStep,uint_t oneBlk)
+    {
+    enum        { MAX_BUF=1024 };
+    u08b_t      b[MAX_BUF+4],hashVal[2][MAX_BUF+4];
+    uint_t      i,j,k,n,bCnt,useAHS,step,bitLen,testCnt=0;
+    hashState   s[2];
+
+    assert(blkSize > 0 && blkSize <= 1024 && (blkSize % 256) == 0);
+    assert((hashLen % 8) == 0);
+
+    if (maxLen  > MAX_BUF*8)     /* keep things reasonably small */
+        maxLen  = MAX_BUF*8;
+    if (hashLen > MAX_BUF*8)
+        hashLen = MAX_BUF*8;
+    if (maxLen  == 0)            /* default sizes */
+        maxLen  = blkSize*2;
+    if (hashLen == 0)
+        hashLen = blkSize;
+
+    if (oneBlk)
+        {
+        if (oneBlk > MAX_BUF*8)
+            oneBlk = MAX_BUF*8;
+        for (i=0;i<oneBlk/8;i++)
+            b[i] = (u08b_t) i;
+        if (Skein_Hash(blkSize,hashLen,b,oneBlk,hashVal[0]) != SKEIN_SUCCESS)
+            FatalError("Skein_Hash != SUCCESS");
+        return 1;
+        }
+
+    if (nStep == 0)
+        {
+        printf("Testing Skein: blkSize = %4d bits. hashLen=%4d bits. maxMsgLen = %4d bits.\n",
+               blkSize,hashLen,maxLen);
+        nStep = 1;
+        }
+
+    n = skein_DebugFlag;
+    skein_DebugFlag = 0;        /* turn off debug display for this "fake" AHS call */
+    if (Init(&s[0],hashLen) != SUCCESS) /* just see if AHS API supports this <blkSize,hashLen> pair */
+        FatalError("AHS_API Init() error!");
+    skein_DebugFlag = n;        /* restore debug display status */
+
+    useAHS = (s[0].statebits == blkSize);  /* does this <blkSize,hashLen> pair work via AHS_API? */
+    
+    bCnt = (maxLen + 7) / 8;    /* convert maxLen to bytes */
+    for (n=0;n < bCnt;n+=nStep) /* process all the data lengths (# bytes = n+1)*/
+        {
+        /* RandBytes() takes a BYTE count: passing maxLen (bits) here would write up to
+        ** 8x too much and overflow b[] whenever maxLen > sizeof(b) */
+        RandBytes(b,bCnt);      /* get something to hash */
+        for (j=8;j>0;j--)       /* j = # bits in final byte */
+            {
+            testCnt++;
+            memset(hashVal,0,sizeof(hashVal));
+            Show_Debug("\n*** Single Hash() call (%d bits)\n",8*n+j);
+            if (Skein_Hash(blkSize,hashLen,b,8*n+j,hashVal[0]) != SKEIN_SUCCESS)
+                FatalError("Skein_Hash != SUCCESS");
+            for (k=hashLen/8;k<=MAX_BUF;k++)
+                if (hashVal[0][k] != 0)
+                    FatalError("Skein hash output overrun!: hashLen = %d bits",hashLen);
+            if (useAHS)         /* compare using AHS API, if supported */
+                {      
+                Show_Debug("\n*** Single AHS API Hash() call\n");
+                if (Hash(hashLen,b,8*n+j,hashVal[1]) != SUCCESS)
+                    FatalError("AHS_API Hash != SUCCESS");
+                for (k=hashLen/8;k<=MAX_BUF;k++)
+                    if (hashVal[1][k] != 0)
+                        FatalError("Skein AHS_API hash output overrun!: hashLen = %d bits",hashLen);
+                if (memcmp(hashVal[1],hashVal[0],hashLen/8))
+                    FatalError("Skein vs. AHS API miscompare");
+                }
+            /* now try (randomized) steps thru entire input block */
+            for (i=0;i<4;i++) 
+                {  
+                Show_Debug("\n*** Multiple Update() calls [%s]",(i)?"random steps":"step==1");
+                if (i >= 2)
+                    {
+                    Show_Debug("  [re-use precomputed state]");
+                    s[0] = s[1]; 
+                    }
+                else
+                    {
+                    k = (i) ? Skein_Init   (blkSize,&s[0],hashLen) :
+                              Skein_InitExt(blkSize,&s[0],hashLen,SKEIN_CFG_TREE_INFO_SEQUENTIAL,NULL,0);
+                    if (k != SKEIN_SUCCESS)
+                        FatalError("Skein_Init != SUCCESS");
+                    s[1] = s[0];            /* make a copy for next time */
+                    }
+                Show_Debug("\n");
+                for (k=0;k<n+1;k+=step)     /* step thru with variable sized steps */
+                    {/* for i == 0, step one byte at a time. for i>0, randomly */
+                    step = (i == 0) ? 1 : 1 + (Rand32() % (n+1-k));     /* # bytes to process */
+                    bitLen = (k+step >= n+1) ? 8*(step-1) + j: 8*step;  /* partial final byte handling */
+                    if (Skein_Update(&s[0],&b[k],bitLen) != SKEIN_SUCCESS)
+                        FatalError("Skein_Update != SUCCESS");
+                    }
+                if (Skein_Final(&s[0],hashVal[1]) != SKEIN_SUCCESS)
+                    FatalError("Skein_Final != SUCCESS");
+                /* check the buffer Final() just wrote (hashVal[1]), not the one-shot result */
+                for (k=hashLen/8;k<=MAX_BUF;k++)
+                    if (hashVal[1][k] != 0)
+                        FatalError("Skein hash output overrun!: hashLen = %d bits",hashLen);
+                if (memcmp(hashVal[1],hashVal[0],hashLen/8))
+                    FatalError("Skein Hash() vs. Update() miscompare!");
+                }
+            }
+        }
+    return testCnt;
+    }
+
+/* Short-KAT filter: decide whether a <blkSize,hashBits> pair is part of the
+** reduced ("short") KAT set.  Returns 1 to keep the pair, 0 to skip it; any
+** blkSize other than 256/512/1024 is always skipped. */
+uint_t Short_KAT_OK(uint_t blkSize,uint_t hashBits)
+    {
+    uint_t keep = 0;                /* assume "filtered out" until a rule matches */
+
+    if (blkSize == 256)
+        {                           /* Skein-256: 224- and 256-bit results only */
+        keep = (hashBits == 224 || hashBits == 256);
+        }
+    else if (blkSize == 512)
+        {                           /* Skein-512: 256-, 384- and 512-bit results only */
+        keep = (hashBits == 256 || hashBits == 384 || hashBits == 512);
+        }
+    else if (blkSize == 1024)
+        {                           /* Skein-1024: 384-, 512- and 1024-bit results only */
+        keep = (hashBits == 384 || hashBits == 512 || hashBits == 1024);
+        }
+
+    return keep;
+    }
+
+#if SKEIN_TREE_HASH
+#define MAX_TREE_MSG_LEN  (1 << 12)
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* pad and process the final block (no OUTPUT stage), using the context selected by state->statebits */
+int Skein_Final_Pad(hashState *state, BitSequence *hashval)
+    {
+    if (state->statebits ==  256)       /* dispatch on the state size recorded in the hashState */
+        return Skein_256_Final_Pad(&state->u.ctx_256,hashval);
+    if (state->statebits ==  512)
+        return Skein_512_Final_Pad(&state->u.ctx_512,hashval);
+    if (state->statebits == 1024)
+        return Skein1024_Final_Pad(&state->u.ctx1024,hashval);
+    return SKEIN_FAIL;                  /* any other state size is rejected */
+    }
+/* run just the OUTPUT stage, using the context selected by state->statebits */
+int Skein_Output(hashState *state, BitSequence *hashval)
+    {
+    if (state->statebits ==  256)       /* dispatch on the state size recorded in the hashState */
+        return Skein_256_Output(&state->u.ctx_256,hashval);
+    if (state->statebits ==  512)
+        return Skein_512_Output(&state->u.ctx_512,hashval);
+    if (state->statebits == 1024)
+        return Skein1024_Output(&state->u.ctx1024,hashval);
+    return SKEIN_FAIL;                  /* any other state size is rejected */
+    }
+
+/* Compute a tree-mode Skein hash of msg[0..msgBytes-1] into hashRes in one "all-in-one" call.   */
+/* leaf/node are size exponents (a node spans blkBytes << exponent bytes); maxLevel caps the     */
+/* tree height. This is not intended to represent how a real multi-processor version would be   */
+/* implemented, but the results will be the same. Any Skein API failure ends in FatalError().    */
+void Skein_TreeHash
+    (uint_t blkSize,uint_t hashBits,const u08b_t *msg,size_t msgBytes,
+     uint_t leaf   ,uint_t node    ,uint_t maxLevel  ,u08b_t *hashRes)
+    {
+    enum      { MAX_HEIGHT = 32 };          /* how deep we can go here */
+    uint_t    height;
+    uint_t    blkBytes  = blkSize/8;
+    uint_t    saveDebug = skein_DebugFlag;
+    size_t    n,nodeLen,srcOffs,dstOffs,bCnt;
+    u64b_t    treeInfo;
+    u08b_t    M[MAX_TREE_MSG_LEN+4];        /* working buffer: message, then each level's node results */
+    hashState G,s;
+
+    assert(node < 256 && leaf < 256 && maxLevel < 256);
+    assert(node >  0  && leaf >  0  && maxLevel >  1 );
+    assert(blkSize == 256 || blkSize == 512 || blkSize == 1024);
+    assert(blkBytes <= sizeof(M));
+    assert(msgBytes <= sizeof(M));
+
+    /* precompute the config block result G for multiple uses below */
+#ifdef SKEIN_DEBUG
+    if (skein_DebugFlag)
+        skein_DebugFlag |= SKEIN_DEBUG_CONFIG;
+#endif
+    treeInfo = SKEIN_CFG_TREE_INFO(leaf,node,maxLevel);
+    if (Skein_InitExt(blkSize,&G,hashBits,treeInfo,NULL,0) != SKEIN_SUCCESS)
+        FatalError("Skein_InitExt() fails in tree");
+    skein_DebugFlag = saveDebug;
+
+    bCnt = msgBytes;
+    memcpy(M,msg,bCnt);
+    for (height=0;;height++)            /* walk up the tree */
+        {
+        if (height && (bCnt==blkBytes)) /* are we done (with only one block left)? */
+            break;
+        if (height+1 == maxLevel)       /* is this the final allowed level? */
+            {                           /* if so, do it as one big hash */
+            s = G;
+            Skein_Set_Tree_Level(s.u.h,height+1);
+            if (Skein_Update(&s,M,bCnt*8) != SKEIN_SUCCESS || Skein_Final_Pad(&s,M) != SKEIN_SUCCESS)
+                FatalError("Skein tree hash fails at final level");
+            break;
+            }
+        for (nodeLen=blkBytes,n=(height)?node:leaf; n && nodeLen < sizeof(M); n--)
+            nodeLen <<= 1;              /* == blkBytes << exponent, clamped once >= sizeof(M): cannot wrap to 0 */
+        for (srcOffs=dstOffs=0;srcOffs <= bCnt;)
+            {
+            n = bCnt - srcOffs;         /* number of bytes left at this level */
+            if (n > nodeLen)            /* limit to node size */
+                n = nodeLen;
+            s = G;
+            s.u.h.T[0] = srcOffs;       /* nonzero initial offset in tweak! */
+            Skein_Set_Tree_Level(s.u.h,height+1);
+            if (Skein_Update(&s,M+srcOffs,n*8) != SKEIN_SUCCESS || Skein_Final_Pad(&s,M+dstOffs) != SKEIN_SUCCESS)
+                FatalError("Skein tree hash fails at node level");  /* node result goes back into M[] */
+            dstOffs+=blkBytes;  srcOffs+=n;
+            if (srcOffs >= bCnt)        /* special logic to handle (msgBytes == 0) case */
+                break;
+            }
+        bCnt = dstOffs;                 /* next level hashes the node results just written */
+        }
+
+    if (Skein_Output(&s,hashRes) != SKEIN_SUCCESS)  /* output the result */
+        FatalError("Skein_Output() fails in tree");
+    }
+
+/*
+** Generate tree-mode hash KAT vectors for one block size and print them to
+** stdout (message bytes, tree parameters and the Skein_TreeHash() result).
+** Note:
+**    Tree vectors are different enough from non-tree vectors that it makes
+**    sense to keep them in their own function instead of the regular KAT logic.
+**/
+void Skein_GenKAT_Tree(uint_t blkSize)
+    {
+    static const struct
+        {
+        uint_t leaf,node,maxLevel,levels;   /* first three go to Skein_TreeHash(); levels only sizes the messages */
+        }
+        TREE_PARMS[] = { {2,2,2,2}, {1,2,3,2}, {2,1,0xFF,3} };
+#define TREE_PARM_CNT (sizeof(TREE_PARMS)/sizeof(TREE_PARMS[0]))
+
+    u08b_t  msg[MAX_TREE_MSG_LEN+4],hashVal[MAX_TREE_MSG_LEN+4];
+    uint_t  i,j,k,n,p,q,hashBits,node,leaf,leafBytes,msgBytes,byteCnt,levels,maxLevel;
+
+    assert(blkSize == 256 || blkSize == 512 || blkSize == 1024);
+    for (i=0;i<MAX_TREE_MSG_LEN;i+=2)
+        {   /* generate "incrementing" tree hash input msg data */
+        msg[i  ] = (u08b_t) ((i ^ blkSize) ^ (i >> 16));
+        msg[i+1] = (u08b_t) ((i ^ blkSize) >> 8);
+        }
+    for (k=q=n=0;k < HASH_BITS_CNT;k++)    /* q is never reset: it keeps cycling across all vectors */
+        {
+        hashBits = HASH_BITS[k];
+        if (!Short_KAT_OK(blkSize,hashBits))
+            continue;
+        if ((verbose & V_KAT_SHORT) && (hashBits != blkSize))
+            continue;
+        for (p=0;p <TREE_PARM_CNT;p++)
+            {
+            if (p && (verbose & V_KAT_SHORT))
+                continue;           /* keep short KATs short */
+            if (p && hashBits != blkSize)
+                continue;           /* we only need one "non-full" size */
+
+            leaf      = TREE_PARMS[p].leaf;
+            node      = TREE_PARMS[p].node;
+            maxLevel  = TREE_PARMS[p].maxLevel;
+            levels    = TREE_PARMS[p].levels;
+            leafBytes = (blkSize/8) << leaf;    /* number of bytes in a "full" leaf */
+
+            for (j=0;j<4;j++)       /* different numbers of leaf results */
+                {
+                if ((verbose & V_KAT_SHORT) && (j != 3) && (j != 0))
+                    continue;
+                if (j && (hashBits != blkSize))
+                    break;
+                switch (j)          /* choose n, the message size in units of full leaves */
+                    {
+                    case 0: n = 1;                                break;
+                    case 1: n = 2;                                break;         
+                    case 2: n = (1 << (node * (levels-2)))*3/2;
+                            if (n <= 2) continue;                 break;  /* "continue" advances the j loop */
+                    case 3: n = (1 << (node * (levels-1)));       break;
+                    }
+                byteCnt = n*leafBytes;
+                assert(byteCnt > 0);
+                if (byteCnt > MAX_TREE_MSG_LEN)     /* would not fit in msg[]: skip this vector */
+                    continue;
+                q = (q+1) % leafBytes;              /* 0 <= q < leafBytes */
+                msgBytes = byteCnt - q;             /* trim q bytes off the end of the last leaf */
+                switch (blkSize)
+                    {
+                    case  256: printf("\n:Skein-256: "); break;
+                    case  512: printf("\n:Skein-512: "); break;
+                    case 1024: printf("\n:Skein-1024:"); break;
+                    }
+                printf(" %4d-bit hash, msgLen =%6d bits",hashBits,msgBytes*8);
+                printf(". Tree: leaf=%02X, node=%02X, maxLevels=%02X\n",leaf,node,maxLevel);
+                printf("\nMessage data:\n");
+                if (msgBytes == 0)
+                    printf("    (none)\n");
+                else
+                    ShowBytes(msgBytes,msg);
+                
+                Skein_TreeHash(blkSize,hashBits,msg,msgBytes,leaf,node,maxLevel,hashVal);
+                
+                printf("Result:\n");
+                ShowBytes((hashBits+7)/8,hashVal);
+                printf("--------------------------------\n");
+                }
+            }
+        }
+    }
+#endif
+
+/*
+** Output some KAT values. This output is generally re-directed to a file and
+** can be compared across platforms to help validate an implementation on a
+** new platform (or compare reference vs. optimized code, for example). The
+** file will be provided as part of the Skein submission package to NIST.
+** blkSizeMask selects the block sizes to emit (OR of 256/512/1024; 0 = all).
+** When used in conjunction with the debug flag, this will output a VERY long
+** result. The verbose flag is used to output even more combinations of
+**      <blkSize,hashSize,msgLen>
+**
+** Note: this function does NOT output the NIST AHS KAT format.
+*/
+void Skein_ShowKAT(uint_t blkSizeMask)
+    {
+    enum
+        {
+        DATA_TYPE_ZERO  = 0,
+        DATA_TYPE_INC,
+        DATA_TYPE_RAND,
+        DATA_TYPE_MAC,
+        DATA_TYPE_TREE,
+        DATA_TYPE_CNT,
+
+        MAX_BYTES = 3*1024/8
+        };
+    static const char *TYPE_NAMES[] = { "zero","incrementing","random","random+MAC","tree",NULL };
+    static const uint_t  MSG_BITS[] =
+                { 0,1,2,3,4,5,6,7,8,9,10,32,64,128,192,
+                   256-1, 256, 256+1,  384,
+                   512-1, 512, 512+1,  768,
+                  1024-1,1024,1024+1,
+                  2048-1,2048,2048+1
+                };
+#define MSG_BITS_CNT (sizeof(MSG_BITS)/sizeof(MSG_BITS[0]))
+
+    uint_t      i,j,k,blkSize,dataType,hashBits,msgBits,keyBytes,blkBytes,keyType;
+    u08b_t      data[MAX_BYTES+4],key[MAX_BYTES+4],hashVal[MAX_BYTES+4];
+    const char *msgType;
+    hashState   s;
+
+    Rand_Init(SKEIN_MK_64(0xDEADBEEF,0)); /* init PRNG with repeatable value */
+    katHash = ~0u;                        /* reset the global katHash; it is reported on stderr at the end */
+    keyType =  0;                         /* position in the MAC key-length cycle used below */
+
+#ifdef SKEIN_DEBUG
+    /* first, show some "raw" Threefish + feedforward block calls, with round-by-round debug info if enabled */
+    if (skein_DebugFlag && !(verbose & V_KAT_NO_3FISH))
+        {
+        k = skein_DebugFlag;                                        /* save debug flag value */
+        skein_DebugFlag  = THREEFISH_DEBUG_ALL & ~ SKEIN_DEBUG_HDR; /* turn on full debug detail, use Threefish name */
+        skein_DebugFlag |= (k & SKEIN_DEBUG_PERMUTE);
+#else
+    if (verbose & V_KAT_DO_3FISH)                                   /* allow non-SKEIN_DEBUG testing */
+        {
+#endif
+        for (blkSize = 256;blkSize <= 1024; blkSize*=2)
+            {
+            if (blkSizeMask && (blkSize & blkSizeMask) == 0)
+                continue;
+            for (dataType=DATA_TYPE_ZERO; dataType <= DATA_TYPE_INC; dataType++)
+                {
+                switch (dataType)
+                    {
+                    case DATA_TYPE_ZERO:
+                            memset(data,0,sizeof(data));
+                            memset(key ,0,sizeof(key));
+                            break;
+                    case DATA_TYPE_INC:
+                            for (i=0;i<MAX_BYTES;i++)
+                                {
+                                key [i] = (u08b_t)      i ;
+                                data[i] = (u08b_t) ~key[i];
+                                }
+                            break;
+                    default:
+                        continue;
+                    }
+#ifdef SKEIN_DEBUG
+                switch (blkSize)
+                    {
+                    case  256: printf("\n:Threefish-256: "); break;
+                    case  512: printf("\n:Threefish-512: "); break;
+                    case 1024: printf("\n:Threefish-1024:"); break;
+                    }
+                printf(" encryption + plaintext feedforward (round-by-round):\n");
+#endif
+                memset(&s,0,sizeof(s));
+                s.u.h.hashBitLen = blkSize;
+                Skein_Get64_LSB_First(s.u.h.T      ,key,2);               /* init T[] */
+                Skein_Get64_LSB_First(s.u.ctx1024.X,key+2*8,blkSize/64);  /* init X[] */
+                switch (blkSize)
+                    {
+                    case  256: Skein_256_Process_Block(&s.u.ctx_256,data,1,0); break;
+                    case  512: Skein_512_Process_Block(&s.u.ctx_512,data,1,0); break;
+                    case 1024: Skein1024_Process_Block(&s.u.ctx1024,data,1,0); break;
+                    }
+#ifdef SKEIN_DEBUG
+                printf("++++++++++++++++++++++++++++++++++++++\n");
+#endif
+                }
+            }
+#ifdef SKEIN_DEBUG
+        skein_DebugFlag = k;                                        /* restore the saved debug flag value */
+#endif
+        }                               /* closes the "if" opened in whichever #ifdef branch was compiled above */
+
+    for (dataType=DATA_TYPE_ZERO; dataType < DATA_TYPE_CNT; dataType++)
+        {
+        msgType = TYPE_NAMES[dataType];
+        switch (dataType)               /* fill data[] (and key[]) for this data type */
+            {
+            case DATA_TYPE_ZERO:
+                    memset(data,0,sizeof(data));
+                    memset(key ,0,sizeof(key));
+                    break;
+            case DATA_TYPE_INC:
+                    for (i=0;i<MAX_BYTES;i++)
+                        {
+                        key [i] = (u08b_t)      i ;
+                        data[i] = (u08b_t) ~key[i];
+                        }
+                    break;
+            case DATA_TYPE_MAC:
+                    RandBytes(key ,sizeof(key ));   /* no break: falls through to randomize data[] as well */
+            case DATA_TYPE_RAND:
+                    RandBytes(data,sizeof(data));
+                    break;
+            case DATA_TYPE_TREE:
+                    if (verbose & V_KAT_NO_TREE)
+                        continue;                   /* tree vectors disabled: go to the next data type */
+                    break;
+            default:    /* should never get here */
+                    FatalError("Invalid data type: %d --> '%s'",dataType,msgType);
+                    break;
+            }
+        
+        for (blkSize = 256;blkSize <= 1024; blkSize*=2)
+            {
+            if (blkSizeMask && (blkSize & blkSizeMask) == 0)
+                continue;
+            if (dataType == DATA_TYPE_TREE)
+                {                       /* tree vectors come from their own generator (if compiled in) */
+#if SKEIN_TREE_HASH
+                Skein_GenKAT_Tree(blkSize);
+#endif
+                continue;
+                }
+            if (verbose & V_KAT_NO_SEQ)
+                continue;
+            blkBytes = blkSize/8;
+            for (j=0;j <  MSG_BITS_CNT;j++)
+            for (k=0;k < HASH_BITS_CNT;k++)     /* every <msgBits,hashBits> combination, filtered below */
+                {
+                msgBits  =  MSG_BITS[j];  /* message length   */
+                hashBits = HASH_BITS[k];  /* hash result size */
+                assert(MAX_BYTES*8 >= hashBits && MAX_BYTES*8 >= msgBits);
+                if (msgBits != 1024 && hashBits != blkSize && !(verbose & V_KAT_LONG))
+                    continue;   /* keep the output size reasonable, unless verbose */
+                if (verbose & V_KAT_SHORT)
+                    {           /* -v2 ==> generate "short" KAT set by filtering out most vectors */
+                    if (dataType != DATA_TYPE_INC)
+                        continue;
+                    if (msgBits != 8 && msgBits != blkSize && msgBits != 2*blkSize)
+                        continue;
+                    if (!Short_KAT_OK(blkSize,hashBits))
+                        continue;
+                    }
+                switch (blkSize)
+                    {
+                    case  256: printf("\n:Skein-256: "); break;
+                    case  512: printf("\n:Skein-512: "); break;
+                    case 1024: printf("\n:Skein-1024:"); break;
+                    }
+                printf(" %4d-bit hash, msgLen =%6d bits",hashBits,msgBits);
+                if (!(verbose & V_KAT_SHORT))
+                    printf(", data = '%s'",msgType);
+                printf("\n\nMessage data:\n");
+                if (msgBits == 0)
+                    printf("    (none)\n");
+                else
+                    ShowBytes((msgBits+7)/8,data);
+                switch (dataType)
+                    {
+                    default:                            /* straight hash value */
+                        if (Skein_Hash(blkSize,hashBits,data,msgBits,hashVal) != SKEIN_SUCCESS)
+                            FatalError("Skein_Hash() error!");
+                        break;
+                    case DATA_TYPE_MAC:                 /* include some MAC computations in KAT file */
+                        switch (keyType++)              /* sequence thru different MAC key lengths */
+                            {           
+                            case 0: keyBytes = blkBytes/2;   break;
+                            case 1: keyBytes = blkBytes;     break;
+                            case 2: keyBytes = blkBytes  +1; break;
+                            case 3: keyBytes = blkBytes*2+1; break;
+                            default:keyBytes = 0;       /* not actually a MAC this time, but use InitExt() */
+                                    keyType  = 0;       /* start the cycle again next time */
+                            }
+                        printf("MAC key = %4d bytes:\n",keyBytes);
+                        if (keyBytes)                   /* show MAC key, if any */
+                            ShowBytes(keyBytes,key);
+                        else
+                            printf("    (none)          /* use InitExt() call */\n");
+
+                        if (Skein_InitExt(blkSize,&s,hashBits,SKEIN_CFG_TREE_INFO_SEQUENTIAL,key,keyBytes) != SKEIN_SUCCESS)
+                            FatalError("Skein_InitExt() error!");
+                        if (Skein_Update(&s,data,msgBits) != SKEIN_SUCCESS)
+                            FatalError("Skein_Update() error!");
+                        if (Skein_Final(&s,hashVal) != SKEIN_SUCCESS)
+                            FatalError("Skein_Final() error!");
+                        break;
+                    case DATA_TYPE_TREE:
+                        assert(0);                      /* unreachable: the tree type was handled (and skipped) above */
+                        break;
+                    }
+                printf("Result:\n");
+                ShowBytes((hashBits+7)/8,hashVal);
+                printf("--------------------------------\n");
+                }
+            }
+        }
+    if (!_quiet_)
+        fprintf(stderr,"katHash = %08X\n",katHash ^ 0x150183D2);
+    }
+
+/* print (to stdout) a C header with the pre-computed Skein IVs for the common <blkSize,hashBits> pairs */
+void Skein_GenerateIV(void)
+    {
+    static const struct
+        { uint_t blkSize,hashBits; }
+            IV_TAB[] = /* which pairs to precompute */
+                { { 256, 128 }, { 256, 160 }, { 256, 224 }, { 256, 256 },
+                  { 512, 128 }, { 512, 160 }, { 512, 224 }, { 512, 256 },
+                  { 512, 384 }, { 512, 512 },
+                  {1024, 384 }, {1024, 512 }, {1024,1024 }
+                };
+    uint_t       idx,wordIdx,blkSize,hashBits;
+    hashState    state;
+    const u64b_t *ivWords;      /* X[] of the context variant matching blkSize */
+    const char   *tag;          /* block-size part of the emitted array name   */
+
+    printf("#ifndef _SKEIN_IV_H_\n"
+           "#define _SKEIN_IV_H_\n\n"
+           "#include \"skein.h\"    /* get Skein macros and types */\n\n"
+           "/*\n"
+           "***************** Pre-computed Skein IVs *******************\n"
+           "**\n"
+           "** NOTE: these values are not \"magic\" constants, but\n"
+           "** are generated using the Threefish block function.\n"
+           "** They are pre-computed here only for speed; i.e., to\n"
+           "** avoid the need for a Threefish call during Init().\n"
+           "**\n"
+           "** The IV for any fixed hash length may be pre-computed.\n"
+           "** Only the most common values are included here.\n"
+           "**\n"
+           "************************************************************\n"
+           "**/\n\n"
+           "#define MK_64 SKEIN_MK_64\n\n"
+          );
+    for (idx=0;idx < sizeof(IV_TAB)/sizeof(IV_TAB[0]); idx++)
+        {
+        blkSize  = IV_TAB[idx].blkSize;
+        hashBits = IV_TAB[idx].hashBits;
+        if      (blkSize ==  256) { ivWords = state.u.ctx_256.X;  tag = "_256"; }
+        else if (blkSize ==  512) { ivWords = state.u.ctx_512.X;  tag = "_512"; }
+        else if (blkSize == 1024) { ivWords = state.u.ctx1024.X;  tag = "1024"; }
+        else
+            {
+            FatalError("Invalid blkSize");
+            continue; /* should never happen, but avoids gcc warning */
+            }
+        /* Init() leaves the IV for this <blkSize,hashBits> pair in the context's X[] words */
+        if (Skein_Init(blkSize,&state,hashBits) != SKEIN_SUCCESS)
+            FatalError("Error generating IV: blkSize=%d, hashBits=%d",blkSize,hashBits);
+        printf("/* blkSize = %4d bits. hashSize = %4d bits */\n",blkSize,hashBits);
+        printf("const u64b_t SKEIN%s_IV_%d[] =\n    {\n",tag,hashBits);
+        for (wordIdx=0;wordIdx<blkSize/64;wordIdx++)    /* one MK_64() per word; no comma after the last */
+            printf("    MK_64(0x%08X,0x%08X)%s\n",(uint_32t)(ivWords[wordIdx] >> 32),
+                   (uint_32t)ivWords[wordIdx],(wordIdx+1 < blkSize/64)?",":"");
+        printf("    };\n\n");
+        }
+    printf("#endif /* _SKEIN_IV_H_ */\n");
+    }
+
+/* qsort() comparison callback: ascending order of two uint_32t values; returns -1, 0 or 1 */
+int compare_uint_32t(const void *aPtr,const void *bPtr)
+    {
+    const uint_32t a = * ((const uint_32t *) aPtr);  /* casts keep the const qualifier: the */
+    const uint_32t b = * ((const uint_32t *) bPtr);  /* callback only reads the elements    */
+    
+    if (a > b) return  1;
+    if (a < b) return -1;
+    return 0;
+    }
+
+void ShowCompiler(const char *CVER)
+    {   /* print a " //:" tag: word size, COMPILER_ID + CVER, optional unroll info (no newline) */
+    fputs(" //:",stdout);
+#ifdef SKEIN_XMM
+    fputs(" 32-XMM, ",stdout);
+#else
+    printf(" %2u-bit, ",(uint_t)(sizeof(size_t)*8));
+#endif
+    printf("%s%s",COMPILER_ID,CVER);
+
+    /* the unroll counts are only shown for asm builds or when SKEIN_LOOP is defined */
+#if defined(SKEIN_USE_ASM) && SKEIN_USE_ASM
+    fputs(" [asm=",stdout);
+#define _SC_DO_LOOP_ (1)
+#elif defined(SKEIN_LOOP)
+    fputs(" [ C =",stdout);
+#define _SC_DO_LOOP_ (1)
+#endif
+
+#ifdef  _SC_DO_LOOP_
+    printf("%c",(Skein_256_Unroll_Cnt())?'0'+Skein_256_Unroll_Cnt():'.');   /* '.' when the count is 0 */
+    printf("%c",(Skein_512_Unroll_Cnt())?'0'+Skein_512_Unroll_Cnt():'.');
+    printf("%c",(Skein1024_Unroll_Cnt())?'0'+Skein1024_Unroll_Cnt():'.');
+    putchar(']');
+#endif
+    }
+
+/* measure the speed (in CPU clks/byte) for a Skein implementation; target = optional "msgBytes[.repCnt]" text */
+void Skein_MeasurePerformance(const char *target)
+    {
+    const uint_t MSG_BYTES[] = {1,2,4,8,10,16,32,64,100,128,256,512,1000,1024,2048,4096,8192,10000,16384,32768,100000,0};
+    enum     {  TIMER_SAMPLE_CNT = 13, MAX_BUFFER=1024*100, PERF_TIMEOUT_CLKS = 500000 };
+    enum     {  _256 = 256, _512 = 512 };   /* lets OneTest(_256)/OneTest(_512) also pass a numeric size */
+    uint_32t dt[24][3][TIMER_SAMPLE_CNT],t0,t1;  /* dt[message size index][block size index][sample] */
+    uint_32t dtMin = ~0u;                   /* smallest back-to-back HiResTime() difference seen */
+    uint_t   targetSize = 0;                /* 0 = time every message size in MSG_BYTES[] */
+    uint_t   repCnt     = 1;
+    uint_t   i,k,n,r,blkSize,msgBytes;
+    u08b_t   b[MAX_BUFFER],hashVal[SKEIN1024_BLOCK_BYTES*4];
+    hashState s;
+#ifdef CompilerVersion
+    char     CVER[20];                      /* avoid ANSI compiler warnings for sprintf()! :-(( */
+    n          = CompilerVersion;
+    CVER[0]    = '_';
+    CVER[1]    = 'v';
+    CVER[2]    = (char)('0'+((n /100)%10));
+    CVER[3]    = '.';
+    CVER[4]    = (char)('0'+((n / 10)%10));
+    CVER[5]    = (char)('0'+((n /  1)%10));
+    CVER[6]    = 0;
+#else
+#define CVER ""
+#endif      
+    if (target && target[0])
+        {                                   /* parse "msgBytes[.repCnt]" */
+        targetSize = atoi(target);
+        for (i=0;target[i];i++)
+            if (target[i] == '.')
+                {
+                repCnt = atoi(target+i+1);
+                break;
+                }
+        if (repCnt == 0)
+            repCnt = 1;
+        }
+
+    assert(sizeof(dt)/(3*TIMER_SAMPLE_CNT*sizeof(dt[0][0][0])) >=   /* dt[] needs a row per MSG_BYTES[] entry */
+           sizeof(MSG_BYTES)/sizeof(MSG_BYTES[0]));
+    if (OS_Set_High_Priority())
+        printf("Unable to set thread to high priority\n");
+    fflush(stdout);                     /* let things calm down */
+    OS_Sleep(200);                      /* let things settle down for a bit */
+    memset(dt,0,sizeof(dt));
+    RandBytes(b,sizeof(b));             /* use random data for testing */
+    for (i=0;i<4*TIMER_SAMPLE_CNT;i++)  /* calibrate the overhead for measuring time */
+        {
+        t0 = HiResTime();
+        t1 = HiResTime();
+        if (dtMin > t1-t0)              /* keep only the minimum time */
+            dtMin = t1-t0;
+        }
+    for (r=0;r<repCnt;r++)
+        {
+        /* first take all the data and store it in dt, with no printf() activity */
+        for (n=0;n < sizeof(MSG_BYTES)/sizeof(MSG_BYTES[0]);n++)
+            {
+            msgBytes = MSG_BYTES[n];        /* pick the message size (in bytes) */
+            if (msgBytes > MAX_BUFFER || msgBytes == 0)
+                break;                      /* too big for b[], or the 0 that ends MSG_BYTES[] */
+            if (targetSize && targetSize != msgBytes)
+                continue;
+            for (k=0;k<3;k++)
+                {                           /* cycle thru the different block sizes */
+                blkSize=256 << k;
+                t0=HiResTime();
+                t1=HiResTime();
+#define OneTest(BITS)                                           \
+                Skein##BITS##_Init  (&s.u.ctx##BITS,BITS);      \
+                Skein##BITS##_Update(&s.u.ctx##BITS,b,msgBytes);\
+                Skein##BITS##_Final (&s.u.ctx##BITS,hashVal);
+
+                OS_Sleep(0);                        /* yield the time slice to OS */
+                for (i=0;i<TIMER_SAMPLE_CNT;i++)
+                    {
+                    HiResTime();                    /* prime the pump */
+                    switch (blkSize)
+                        {
+                        case  256:
+                            OneTest(_256);          /* prime the pump */
+                            t0 = HiResTime();
+                            OneTest(_256);          /* do it twice for some averaging */
+                            OneTest(_256);
+                            t1 = HiResTime();
+                            break;
+                        case  512:
+                            OneTest(_512);
+                            t0 = HiResTime();
+                            OneTest(_512);
+                            OneTest(_512);
+                            t1 = HiResTime();
+                            break;
+                        case 1024:
+                            OneTest(1024);
+                            t0 = HiResTime();
+                            OneTest(1024);
+                            OneTest(1024);
+                            t1 = HiResTime();
+                            break;
+                        }
+                    dt[n][k][i] = ((t1 - t0) - dtMin)/2; /* adjust for HiResTime() overhead; /2 for the two timed hashes */
+                    }
+                }
+            }
+        OS_Set_Normal_Priority();
+
+        if (targetSize == 0)
+            {
+            printf("\nSkein performance, in clks per byte, dtMin = %4d clks.\n",dtMin);
+            printf("         [compiled %s,%s  by  '%s%s', %u-bit]\n",__TIME__,__DATE__,COMPILER_ID,CVER,(uint_t)(8*sizeof(size_t)));
+            printf("         =================================================================\n");
+            printf("         ||                       Skein block size                       |\n");
+            printf("         ||--------------------------------------------------------------|\n");
+            printf(" Message ||       256 bits     |       512 bits     |      1024 bits     |\n");
+            printf(" Length  ||====================|====================|====================|\n");
+            printf(" (bytes) ||     min    median  |     min    median  |     min    median  |\n"); 
+            printf("=========||====================|====================|====================|\n");
+            }
+
+        /* now display the results */
+        for (n=0;n < sizeof(MSG_BYTES)/sizeof(MSG_BYTES[0]);n++)
+            {
+            msgBytes = MSG_BYTES[n];       /* pick the message size (in bytes) */
+            if (msgBytes > MAX_BUFFER || msgBytes == 0)
+                break;
+            if (targetSize && targetSize != msgBytes)
+                continue;
+            printf("%7d_ ||",msgBytes);
+            for (k=0;k<3;k++)              /* cycle thru the different Skein block sizes */
+                {   /* here with dt[n][k][] full of time differences */
+                    /* discard high/low, then show min/median of the rest, in clks/byte */
+                qsort(dt[n][k],TIMER_SAMPLE_CNT,sizeof(dt[0][0][0]),compare_uint_32t);
+                printf(" %8.2f %8.2f  |",dt[n][k][1]/(double)msgBytes,dt[n][k][TIMER_SAMPLE_CNT/2]/(double)msgBytes);
+                }
+            ShowCompiler(CVER);
+            printf("\n");
+            if (targetSize == 0 && target && target[0] && repCnt == 1)
+                {   /* show the details */
+                for (k=0;k<3;k++)
+                    {
+                    printf("%4d: ",256 << k);
+                    for (i=0;i<TIMER_SAMPLE_CNT;i++)
+                        printf("%8d",dt[n][k][i]);
+                    printf("\n");
+                    }
+                }
+            }
+        }
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+    if (targetSize == 0)
+        {                               /* append the code-size table when all sizes were measured */
+        printf("=========||====================|====================|====================|\n");
+        printf("Code Size||                    |                    |                    |\n");
+        printf("=========||====================|====================|====================|\n");
+        printf("    API  || %12d bytes | %12d bytes | %12d bytes |",
+               (int) Skein_256_API_CodeSize(),
+               (int) Skein_512_API_CodeSize(),
+               (int) Skein1024_API_CodeSize());
+        ShowCompiler(CVER);
+        printf("\n");
+        printf("  Block  || %12d bytes | %12d bytes | %12d bytes |",
+               (int) Skein_256_Process_Block_CodeSize(),
+               (int) Skein_512_Process_Block_CodeSize(),
+               (int) Skein1024_Process_Block_CodeSize());
+        ShowCompiler(CVER);
+        printf("\n");
+        }
+#endif
+    }
+
+void GiveHelp(void)
+    {   /* print the command-line syntax summary on stdout, then terminate with exit status 2 */
+    fputs("Syntax:  skein_test [options]\n"
+          "Options: -bNN  = set Skein block size to NN bits\n"
+          "         -lNN  = set max test length  to NN bits\n"
+          "         -tNN  = set Skein hash length to NN bits\n"
+          "         -sNN  = set initial random seed\n"
+          "         -g    = generate precomputed IV values to stdout\n"
+          "         -k    = output KAT results to stdout\n"
+          "         -p    = output performance (clks/byte)\n"
+          ,stdout);
+    exit(2);
+    }
+                   
+int main(int argc,char *argv[])
+    {   /* test driver: parse the switches, then emit KATs (-k) or run the randomized self-tests */
+    int    i,n;
+    uint_t testCnt;
+    uint_t doKAT   =    0;   /* generate KAT vectors?    */
+    uint_t blkSize =    0;   /* Skein state size in bits */
+    uint_t maxLen  = 1024;   /* max block size   in bits */
+    uint_t hashLen =    0;   /* hash length      in bits (0 --> all) */
+    uint_t seed0   = (uint_t) time(NULL); /* randomize based on time */
+    uint_t oneBlk  =    0;   /* test block size */
+
+    for (i=1;i<argc;i++)
+        {   /* process command-line switches */
+        if (argv[i][0] == '-')
+            {   /* <ctype.h> functions need an unsigned char value: plain char may be negative */
+            switch(toupper((unsigned char) argv[i][1]))
+                {
+                case '?': GiveHelp();                         break;
+                case 'B': blkSize       |= atoi(argv[i]+2);   break;
+                case 'L': maxLen         = atoi(argv[i]+2);   break;
+                case 'S': seed0          = atoi(argv[i]+2);   break;
+                case 'T': hashLen        = atoi(argv[i]+2);   break;
+                case 'K': doKAT          = 1;                 break;
+                case 'V': verbose       |= (argv[i][2]) ? atoi(argv[i]+2) : V_KAT_LONG; break;
+                case 'G': Skein_GenerateIV();                 return 0;
+                case 'P': Skein_MeasurePerformance(argv[i]+2);return 0;
+                case 'Q': _quiet_        = 1;                 break;
+                case 'D': switch (toupper((unsigned char) argv[i][2]))
+                              {
+#ifdef SKEIN_DEBUG
+                              case  0 : skein_DebugFlag |= SKEIN_DEBUG_DEFAULT; break;
+                              case '-': skein_DebugFlag |= SKEIN_DEBUG_SHORT;   break;
+                              case '+': skein_DebugFlag |= SKEIN_DEBUG_ALL;     break;
+                              case 'P': skein_DebugFlag |= SKEIN_DEBUG_PERMUTE; break;
+                              case 'I': skein_DebugFlag |= SKEIN_DEBUG_SHORT |  SKEIN_DEBUG_INJECT; break;
+                              case 'C': skein_DebugFlag |= SKEIN_DEBUG_SHORT & ~SKEIN_DEBUG_CONFIG; break;
+#endif
+                              default : skein_DebugFlag |= atoi(argv[i]+2);     break;
+                              }
+                          break;
+                default:  FatalError("Unsupported command-line option: %s",argv[i]);
+                          break;
+                }
+            }
+        else if (argv[i][0] == '?')
+            GiveHelp();
+        else if (isdigit((unsigned char) argv[i][0]))
+            oneBlk = atoi(argv[i]);     /* a bare number selects a single test block size */
+        }
+
+    if (blkSize == 0)                     /* default is all block sizes */
+        blkSize = 256 | 512 | 1024;
+    if (doKAT)
+        {
+        Skein_ShowKAT(blkSize);
+        }
+    else
+        {
+        if (oneBlk == 0)
+            printf("Seed0 = %d. Compiler = %s\n",seed0,COMPILER_ID);
+        Rand_Init(SKEIN_MK_64(0xDEADBEEF,seed0)); /* init PRNG for test data */
+
+        testCnt=0;
+        for (i=256;i<=1024;i*=2)
+            {
+            if (blkSize & i)            /* blkSize is a mask of the selected block sizes */
+                {
+                if (hashLen == 0)              /* use all hash sizes? */
+                    {
+                    for (n=0;n < HASH_BITS_CNT;n++)
+                        testCnt += Skein_Test(i,maxLen,HASH_BITS[n],0,oneBlk);
+                    }
+                else
+                    testCnt += Skein_Test(i,maxLen,hashLen,0,oneBlk);
+                }
+            }
+        if (oneBlk)
+            return 0;
+        if (testCnt)
+            printf("Success: %4d tests\n",testCnt);
+        }
+    /* do a quick final self-consistency check test to make sure nothing is broken */
+    skein_DebugFlag = 0;        /* no debug output here */
+    for (blkSize = 256;blkSize <= 1024; blkSize*=2)
+        {
+        Skein_Test(blkSize,16,0,1,0);
+        }
+
+    return 0;
+    }
diff --git a/Optimized_32bit/SHA3api_ref.c b/Optimized_32bit/SHA3api_ref.c
new file mode 100644
index 0000000000000..6861a3e4bffb2
--- /dev/null
+++ b/Optimized_32bit/SHA3api_ref.c
@@ -0,0 +1,115 @@
+/***********************************************************************
+**
+** Implementation of the AHS API using the Skein hash function.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+** 
+************************************************************************/
+
+#include <string.h>     /* get the memcpy/memset functions */
+#include "skein.h"      /* get the Skein API definitions   */
+#include "SHA3api_ref.h"/* get the  AHS  API definitions   */
+
+/******************************************************************/
+/*     AHS API code                                               */
+/******************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* select the context size and init the context
+**
+** The NIST API has no parameter for the Skein block size, so the
+** variant is chosen implicitly from hashbitlen:
+**   hashbitlen <= SKEIN_256_NIST_MAX_HASHBITS --> Skein-256 (branch is
+**                 compiled only when that constant is nonzero)
+**   hashbitlen <= SKEIN_512_NIST_MAX_HASHBITS --> Skein-512
+**   otherwise                                 --> Skein-1024
+** The choice is recorded in state->statebits for Update()/Final().
+**
+** Returns the result of the selected Skein_xxx_Init() call.  A
+** non-positive hashbitlen is rejected via Skein_Assert(..,BAD_HASHLEN).
+*/
+HashReturn Init(hashState *state, int hashbitlen)
+    {
+    /* check before any (size_t) cast below can turn a negative value into a huge length */
+    Skein_Assert(hashbitlen > 0,BAD_HASHLEN);
+
+#if SKEIN_256_NIST_MAX_HASHBITS      /* must match the spelling used in SHA3api_ref.h */
+    if (hashbitlen <= SKEIN_256_NIST_MAX_HASHBITS)
+        {
+        state->statebits = 64*SKEIN_256_STATE_WORDS;
+        return Skein_256_Init(&state->u.ctx_256,(size_t) hashbitlen);
+        }
+#endif
+    if (hashbitlen <= SKEIN_512_NIST_MAX_HASHBITS)
+        {
+        state->statebits = 64*SKEIN_512_STATE_WORDS;
+        return Skein_512_Init(&state->u.ctx_512,(size_t) hashbitlen);
+        }
+    else
+        {
+        state->statebits = 64*SKEIN1024_STATE_WORDS;
+        return Skein1024_Init(&state->u.ctx1024,(size_t) hashbitlen);
+        }
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* feed byteCnt whole bytes to the Skein variant selected by state->statebits */
+static HashReturn Update_Bytes(hashState *state, const BitSequence *data, size_t byteCnt)
+    {
+    switch ((state->statebits >> 8) & 3)    /* 256 --> 1, 512 --> 2, 1024 --> 0 */
+        {
+        case 2:  return Skein_512_Update(&state->u.ctx_512,data,byteCnt);
+        case 1:  return Skein_256_Update(&state->u.ctx_256,data,byteCnt);
+        case 0:  return Skein1024_Update(&state->u.ctx1024,data,byteCnt);
+        default: return FAIL;
+        }
+    }
+
+/* process data to be hashed
+**
+**   state      - context previously set up by Init()
+**   data       - message bits, most significant bit of each byte first
+**   databitlen - number of valid bits in data[]
+**
+** When databitlen is not a multiple of 8, the unused low-order bits of
+** the last byte are replaced by a single 1 bit followed by 0 bits and
+** the bit-pad tweak flag is set; no further data may follow such a call.
+**
+** Returns SUCCESS, or the first failure reported by the underlying
+** Skein_xxx_Update() call (FAIL if state->statebits is not valid).
+*/
+HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen)
+    {
+    /* only the final Update() call is allowed to do partial bytes, else assert an error */
+    Skein_Assert((state->u.h.T[1] & SKEIN_T1_FLAG_BIT_PAD) == 0 || databitlen == 0, FAIL);
+
+    Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,FAIL);
+    if ((databitlen & 7) == 0)  /* whole bytes only? */
+        {
+        return Update_Bytes(state,data,databitlen >> 3);
+        }
+    else
+        {   /* handle partial final byte */
+        size_t bCnt = (databitlen >> 3) + 1;                  /* number of bytes to handle (nonzero here!) */
+        u08b_t b,mask;
+        HashReturn r;
+
+        mask = (u08b_t) (1u << (7 - (databitlen & 7)));       /* partial byte bit mask */
+        b    = (u08b_t) ((data[bCnt-1] & (0-mask)) | mask);   /* apply bit padding on final byte */
+
+        r = Update_Bytes(state,data,bCnt-1);                  /* process all but the final byte    */
+        if (r != SUCCESS)
+            return r;
+        r = Update_Bytes(state,&b,1);                         /* process the (masked) partial byte */
+        if (r != SUCCESS)
+            return r;
+        Skein_Set_Bit_Pad_Flag(state->u.h);                   /* set tweak flag for the final call */
+
+        return SUCCESS;
+        }
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finish the hash and write the digest to hashval[];              */
+/* dispatches to the Skein variant recorded in state->statebits    */
+HashReturn Final(hashState *state, BitSequence *hashval)
+    {
+    uint_t variant;     /* 1, 2 or 0 for a 256-, 512- or 1024-bit state */
+
+    Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,FAIL);
+
+    variant = (state->statebits >> 8) & 3;
+    if (variant == 2)
+        return Skein_512_Final(&state->u.ctx_512,hashval);
+    if (variant == 1)
+        return Skein_256_Final(&state->u.ctx_256,hashval);
+    if (variant == 0)
+        return Skein1024_Final(&state->u.ctx1024,hashval);
+    return FAIL;        /* any other state size is not supported */
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* all-in-one hash function
+**
+**   hashbitlen - requested digest length in bits
+**   data       - message to hash (databitlen bits)
+**   hashval    - receives the digest
+**
+** Runs Init(), Update() and Final() in sequence, stopping at the first
+** step that does not return SUCCESS, and returns that step's status.
+** hashval is written only by Final().
+*/
+HashReturn Hash(int hashbitlen, const BitSequence *data, /* all-in-one call */
+                DataLength databitlen,BitSequence *hashval)
+    {
+    hashState  state;
+    HashReturn r = Init(&state,hashbitlen);
+    if (r == SUCCESS)
+        r = Update(&state,data,databitlen);
+    if (r == SUCCESS)
+        r = Final(&state,hashval);      /* report a failure of the final step as well */
+    return r;
+    }
diff --git a/Optimized_32bit/SHA3api_ref.h b/Optimized_32bit/SHA3api_ref.h
new file mode 100644
index 0000000000000..6d62304e59b7e
--- /dev/null
+++ b/Optimized_32bit/SHA3api_ref.h
@@ -0,0 +1,66 @@
+#ifndef _AHS_API_H_
+#define _AHS_API_H_
+
+/***********************************************************************
+**
+** Interface declarations of the AHS API using the Skein hash function.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+** 
+************************************************************************/
+
+#include "skein.h"
+
+/* Status codes returned by the AHS API functions declared below.   */
+/* Each one is defined as the SKEIN_xxx code of the same meaning.   */
+typedef enum
+    {
+    SUCCESS     = SKEIN_SUCCESS,
+    FAIL        = SKEIN_FAIL,
+    BAD_HASHLEN = SKEIN_BAD_HASHLEN
+    }
+    HashReturn;
+
+/* Basic AHS API data types.  A DataLength value counts bits, not bytes. */
+typedef size_t   DataLength;                /* bit count  type */
+typedef u08b_t   BitSequence;               /* bit stream type */
+
+/* Hash context for the AHS API.  statebits says which member of the */
+/* union is the live Skein context; Init() sets it and Update() and  */
+/* Final() use it to dispatch to the matching Skein_xxx routine.     */
+typedef struct
+    {
+    uint_t  statebits;                      /* 256, 512, or 1024 */
+    union
+        {
+        Skein_Ctxt_Hdr_t h;                 /* common header "overlay" */
+        Skein_256_Ctxt_t ctx_256;
+        Skein_512_Ctxt_t ctx_512;
+        Skein1024_Ctxt_t ctx1024;
+        } u;
+    }
+    hashState;
+
+/* "incremental" hashing API */
+/* Init:   set up state for a digest of hashbitlen bits             */
+/* Update: absorb databitlen bits of message data                   */
+/* Final:  finish the computation and write the digest to hashval   */
+HashReturn Init  (hashState *state, int hashbitlen);
+HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen);
+HashReturn Final (hashState *state,       BitSequence *hashval);
+
+/* "all-in-one" call */
+/* Hash:   Init + Update + Final on a local state                   */
+HashReturn Hash  (int hashbitlen,   const BitSequence *data, 
+                  DataLength databitlen,  BitSequence *hashval);
+
+
+/*
+** Re-define the compile-time constants below to change the selection
+** of the Skein state size in the Init() function in SHA3api_ref.c.
+**
+** That is, the NIST API does not allow for explicit selection of the
+** Skein block size, so it must be done implicitly in the Init() function.
+** The selection is controlled by these constants.
+*/
+/* upper bound on hashbitlen that Init() compares against for Skein-256 */
+#ifndef SKEIN_256_NIST_MAX_HASHBITS
+#define SKEIN_256_NIST_MAX_HASHBITS (0)
+#endif
+
+/* upper bound on hashbitlen for Skein-512; larger values use Skein-1024 */
+#ifndef SKEIN_512_NIST_MAX_HASHBITS
+#define SKEIN_512_NIST_MAX_HASHBITS (512)
+#endif
+
+#endif  /* ifndef _AHS_API_H_ */
diff --git a/Optimized_32bit/brg_endian.h b/Optimized_32bit/brg_endian.h
new file mode 100644
index 0000000000000..978eb33f08cf1
--- /dev/null
+++ b/Optimized_32bit/brg_endian.h
@@ -0,0 +1,148 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue 20/10/2006
+*/
+
+#ifndef BRG_ENDIAN_H
+#define BRG_ENDIAN_H
+
+#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
+#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */
+
+/* Include files where endian defines and byteswap functions may reside */
+/* BSD family: <sys/endian.h> */
+#if defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
+#  include <sys/endian.h>
+/* note: '&&' binds tighter than '||', so the BSD version test below */
+/* applies only to the BSD symbol, not to the other platforms listed */
+#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
+      defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
+#  include <machine/endian.h>
+/* Linux / GNU toolchains: <endian.h>, except for MinGW and AVR builds; */
+/* <byteswap.h> is additionally skipped on BeOS                        */
+#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
+#  if !defined( __MINGW32__ ) && !defined(AVR)
+#    include <endian.h>
+#    if !defined( __BEOS__ )
+#      include <byteswap.h>
+#    endif
+#  endif
+#endif
+
+/* Now attempt to set the define for platform byte order using any  */
+/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which  */
+/* seem to encompass most endian symbol definitions                 */
+/*                                                                  */
+/* Each form is handled the same way: when both xxx_ENDIAN symbols  */
+/* are defined they are treated as values and compared against the  */
+/* matching BYTE_ORDER symbol; when only one of them is defined,    */
+/* its presence alone selects the byte order.  The four tests are   */
+/* independent of each other (none checks whether an earlier one    */
+/* already defined PLATFORM_BYTE_ORDER).                            */
+
+/* form 1: BIG_ENDIAN / LITTLE_ENDIAN / BYTE_ORDER */
+#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
+#  if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+/* form 2: _BIG_ENDIAN / _LITTLE_ENDIAN / _BYTE_ORDER */
+#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
+#  if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( _BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( _LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+/* form 3: __BIG_ENDIAN / __LITTLE_ENDIAN / __BYTE_ORDER */
+#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
+#  if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( __BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+/* form 4: __BIG_ENDIAN__ / __LITTLE_ENDIAN__ / __BYTE_ORDER__ */
+#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
+#  if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( __BIG_ENDIAN__ )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN__ )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+/*  if the platform byte order could not be determined, then try to */
+/*  set this define using common machine defines                    */
+/*  (compilation stops with #error if no listed machine matches and */
+/*  neither "EDIT HERE" branch below has been enabled by hand)      */
+#if !defined(PLATFORM_BYTE_ORDER)
+
+/* machines / compilers taken to be little-endian */
+#if   defined( __alpha__ ) || defined( __alpha ) || defined( i386 )       || \
+      defined( __i386__ )  || defined( _M_I86 )  || defined( _M_IX86 )    || \
+      defined( __OS2__ )   || defined( sun386 )  || defined( __TURBOC__ ) || \
+      defined( vax )       || defined( vms )     || defined( VMS )        || \
+      defined( __VMS )     || defined( _M_X64 )  || defined( AVR )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+
+/* machines / compilers taken to be big-endian */
+#elif defined( AMIGA )   || defined( applec )    || defined( __AS400__ )  || \
+      defined( _CRAY )   || defined( __hppa )    || defined( __hp9000 )   || \
+      defined( ibm370 )  || defined( mc68000 )   || defined( m68k )       || \
+      defined( __MRC__ ) || defined( __MVS__ )   || defined( __MWERKS__ ) || \
+      defined( sparc )   || defined( __sparc)    || defined( SYMANTEC_C ) || \
+      defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM )   || \
+      defined( THINK_C ) || defined( __VMCMS__ )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+
+/* manual override: change one of the two "0" conditions to "1" */
+#elif 0     /* **** EDIT HERE IF NECESSARY **** */
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#elif 0     /* **** EDIT HERE IF NECESSARY **** */
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#else
+#  error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order
+#endif
+#endif
+
+/* special handler for IA64, which may be either endianness (?)  */
+/* here we assume little-endian, but this may need to be changed */
+/* IA64 also gets PLATFORM_MUST_ALIGN set to (1)                 */
+#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
+#  define PLATFORM_MUST_ALIGN (1)
+#ifndef PLATFORM_BYTE_ORDER
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+#endif
+
+/* default when nothing above (or the build) defined it: (0) */
+#ifndef   PLATFORM_MUST_ALIGN
+#  define PLATFORM_MUST_ALIGN (0)
+#endif
+
+#endif  /* ifndef BRG_ENDIAN_H */
diff --git a/Optimized_32bit/brg_types.h b/Optimized_32bit/brg_types.h
new file mode 100644
index 0000000000000..d6d6cdab9fbfd
--- /dev/null
+++ b/Optimized_32bit/brg_types.h
@@ -0,0 +1,188 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue 09/09/2006
+
+ The unsigned integer types defined here are of the form uint_<nn>t where
+ <nn> is the length of the type; for example, the unsigned 32-bit type is
+ 'uint_32t'.  These are NOT the same as the 'C99 integer types' that are
+ defined in the inttypes.h and stdint.h headers since attempts to use these
+ types have shown that support for them is still highly variable.  However,
+ since the latter are of the form uint<nn>_t, a regular expression search
+ and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t')
+ can be used to convert the types used here to the C99 standard types.
+*/
+
+#ifndef BRG_TYPES_H
+#define BRG_TYPES_H
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include <limits.h>
+
+/* uint_8t: unsigned char, accepted only if it is exactly 8 bits wide. */
+/* Predefine BRG_UI8 to supply the type elsewhere and skip this block. */
+#ifndef BRG_UI8
+#  define BRG_UI8
+#  if UCHAR_MAX == 255u
+     typedef unsigned char uint_8t;
+#  else
+#    error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h
+#  endif
+#endif
+
+/* uint_16t: unsigned short, accepted only if it is exactly 16 bits wide. */
+/* Predefine BRG_UI16 to supply the type elsewhere and skip this block.   */
+#ifndef BRG_UI16
+#  define BRG_UI16
+#  if USHRT_MAX == 65535u
+     typedef unsigned short uint_16t;
+#  else
+#    error Please define uint_16t as a 16-bit unsigned short type in brg_types.h
+#  endif
+#endif
+
+/* uint_32t: unsigned int if it is exactly 32 bits wide, otherwise      */
+/* unsigned long if that is.  li_32(h) turns the bare hex digits h into */
+/* a literal with the matching suffix, e.g. li_32(1f) gives 0x1fu (or   */
+/* 0x1ful).  Predefine BRG_UI32 to skip this block.                     */
+#ifndef BRG_UI32
+#  define BRG_UI32
+#  if UINT_MAX == 4294967295u
+#    define li_32(h) 0x##h##u
+     typedef unsigned int uint_32t;
+#  elif ULONG_MAX == 4294967295u
+#    define li_32(h) 0x##h##ul
+     typedef unsigned long uint_32t;
+#  elif defined( _CRAY )
+#    error This code needs 32-bit data types, which Cray machines do not provide
+#  else
+#    error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h
+#  endif
+#endif
+
+/* uint_64t: first match wins among the compiler-specific and          */
+/* <limits.h>-based tests below.  BRG_UI64 is defined only when a      */
+/* 64-bit type was found; li_64(h) builds a literal with the matching  */
+/* suffix from bare hex digits.  Unlike the smaller types, failing to  */
+/* find one is an error only if the includer defined NEED_UINT_64T.    */
+#ifndef BRG_UI64
+#  if defined( __BORLANDC__ ) && !defined( __MSDOS__ )
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ui64
+     typedef unsigned __int64 uint_64t;
+#  elif defined( _MSC_VER ) && ( _MSC_VER < 1300 )    /* 1300 == VC++ 7.0 */
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ui64
+     typedef unsigned __int64 uint_64t;
+#  elif defined( __sun ) && defined(ULONG_MAX) && ULONG_MAX == 0xfffffffful
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ull
+     typedef unsigned long long uint_64t;
+#  elif defined( UINT_MAX ) && UINT_MAX > 4294967295u
+#    if UINT_MAX == 18446744073709551615u
+#      define BRG_UI64
+#      define li_64(h) 0x##h##u
+       typedef unsigned int uint_64t;
+#    endif
+#  elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u
+#    if ULONG_MAX == 18446744073709551615ul
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ul
+       typedef unsigned long uint_64t;
+#    endif
+#  elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u
+#    if ULLONG_MAX == 18446744073709551615ull
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ull
+       typedef unsigned long long uint_64t;
+#    endif
+#  elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u
+#    if ULONG_LONG_MAX == 18446744073709551615ull
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ull
+       typedef unsigned long long uint_64t;
+#    endif
+#  elif defined(__GNUC__)  /* DLW: avoid mingw problem with -ansi */
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ull
+       typedef unsigned long long uint_64t;
+#  endif
+#endif
+
+/* no 64-bit type found above, but the including code requires one */
+#if defined( NEED_UINT_64T ) && !defined( BRG_UI64 )
+#  error Please define uint_64t as an unsigned 64 bit type in brg_types.h
+#endif
+
+/* VOID_RETURN / INT_RETURN: return-type prefixes for functions        */
+/* returning void / int.  With DLL_EXPORT or DLL_IMPORT defined they   */
+/* add the compiler's export/import decoration; on Watcom they add     */
+/* __cdecl; otherwise they are plain 'void' and 'int'.  Predefine      */
+/* RETURN_VALUES to supply your own definitions and skip this block.   */
+#ifndef RETURN_VALUES
+#  define RETURN_VALUES
+#  if defined( DLL_EXPORT )
+#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+#      define VOID_RETURN    __declspec( dllexport ) void __stdcall
+#      define INT_RETURN     __declspec( dllexport ) int  __stdcall
+#    elif defined( __GNUC__ )
+#      define VOID_RETURN    __declspec( __dllexport__ ) void
+#      define INT_RETURN     __declspec( __dllexport__ ) int
+#    else
+#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#    endif
+#  elif defined( DLL_IMPORT )
+#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+#      define VOID_RETURN    __declspec( dllimport ) void __stdcall
+#      define INT_RETURN     __declspec( dllimport ) int  __stdcall
+#    elif defined( __GNUC__ )
+#      define VOID_RETURN    __declspec( __dllimport__ ) void
+#      define INT_RETURN     __declspec( __dllimport__ ) int
+#    else
+#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#    endif
+#  elif defined( __WATCOMC__ )
+#    define VOID_RETURN  void __cdecl
+#    define INT_RETURN   int  __cdecl
+#  else
+#    define VOID_RETURN  void
+#    define INT_RETURN   int
+#  endif
+#endif
+
+/*  These defines are used to declare buffers in a way that allows
+    faster operations on longer variables to be used.  In all these
+    defines 'size' must be a power of 2 and >= 8
+
+    dec_unit_type(size,x)       declares a variable 'x' of length
+                                'size' bits
+
+    dec_bufr_type(size,bsize,x) declares a buffer 'x' of length 'bsize'
+                                bytes defined as an array of variables
+                                each of 'size' bits (bsize must be a
+                                multiple of size / 8)
+
+    ptr_cast(x,size)            casts a pointer to a pointer to a
+                                variable of length 'size' bits
+
+    'size' is pasted into a type name (uint_<size>t), so it must be
+    written as a plain number (8, 16, 32 or 64), not as an expression.
+    'bsize' may be any constant expression; it is parenthesized in the
+    expansion so that e.g. dec_bufr_type(16,4+4,x) yields 4 elements.
+*/
+
+#define ui_type(size)               uint_##size##t
+#define dec_unit_type(size,x)       typedef ui_type(size) x
+#define dec_bufr_type(size,bsize,x) typedef ui_type(size) x[(bsize) / ((size) >> 3)]
+#define ptr_cast(x,size)            ((ui_type(size)*)(x))
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/Optimized_32bit/skein.c b/Optimized_32bit/skein.c
new file mode 100644
index 0000000000000..c9289cd49e8ef
--- /dev/null
+++ b/Optimized_32bit/skein.c
@@ -0,0 +1,753 @@
+/***********************************************************************
+**
+** Implementation of the Skein hash function.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+** 
+************************************************************************/
+
+#define  SKEIN_PORT_CODE /* instantiate any code in skein_port.h */
+
+#include <string.h>      /* get the memcpy/memset functions */
+#include "skein.h"       /* get the Skein API definitions   */
+#include "skein_iv.h"    /* get precomputed IVs */
+
+/*****************************************************************/
+/* External function to process blkCnt (nonzero) full block(s) of data. */
+void    Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd);
+void    Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd);
+void    Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd);
+
+/*****************************************************************/
+/*     256-bit Skein                                             */
+/*****************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a straight hashing operation  */
+/*   ctx        - context to initialize                                  */
+/*   hashBitLen - requested output length in bits; checked to be > 0     */
+/*                via Skein_Assert(..,SKEIN_BAD_HASHLEN)                 */
+/* Returns SKEIN_SUCCESS once ctx->X holds the initial chaining values   */
+/* and the tweak is set up for message (MSG) blocks.                     */
+int Skein_256_Init(Skein_256_Ctxt_t *ctx, size_t hashBitLen)
+    {
+    union
+        {
+        u08b_t  b[SKEIN_256_STATE_BYTES];
+        u64b_t  w[SKEIN_256_STATE_WORDS];
+        } cfg;                              /* config block */
+        
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    ctx->h.hashBitLen = hashBitLen;         /* output hash bit count */
+
+    switch (hashBitLen)
+        {             /* use pre-computed values, where available */
+#ifndef SKEIN_NO_PRECOMP
+        /* tables come from skein_iv.h; defining SKEIN_NO_PRECOMP forces the default path */
+        case  256: memcpy(ctx->X,SKEIN_256_IV_256,sizeof(ctx->X));  break;
+        case  224: memcpy(ctx->X,SKEIN_256_IV_224,sizeof(ctx->X));  break;
+        case  160: memcpy(ctx->X,SKEIN_256_IV_160,sizeof(ctx->X));  break;
+        case  128: memcpy(ctx->X,SKEIN_256_IV_128,sizeof(ctx->X));  break;
+#endif
+        default:
+            /* here if there is no precomputed IV value available */
+            /* build/process the config block, type == CONFIG (could be precomputed) */
+            Skein_Start_New_Type(ctx,CFG_FINAL);        /* set tweaks: T0=0; T1=CFG | FINAL */
+
+            cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);  /* set the schema, version */
+            cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+            cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+            memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */
+
+            /* compute the initial chaining values from config block */
+            memset(ctx->X,0,sizeof(ctx->X));            /* zero the chaining variables */
+            Skein_256_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+            break;
+        }
+    /* The chaining vars ctx->X are now initialized for the given hashBitLen. */
+    /* Set up to process the data message portion of the hash (default) */
+    Skein_Start_New_Type(ctx,MSG);              /* T0=0, T1= MSG type */
+
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a MAC and/or tree hash operation */
+/* [identical to Skein_256_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */
+/*   ctx        - context to initialize                                  */
+/*   hashBitLen - requested output length in bits (must be > 0)          */
+/*   treeInfo   - value stored in config word 2                          */
+/*   key        - key bytes; may be NULL only when keyBytes == 0         */
+/*   keyBytes   - key length in bytes; 0 means "no key"                  */
+/* Returns SKEIN_SUCCESS; the two Skein_Assert checks below name         */
+/* SKEIN_BAD_HASHLEN and SKEIN_FAIL as their failure codes.              */
+int Skein_256_InitExt(Skein_256_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes)
+    {
+    union
+        {
+        u08b_t  b[SKEIN_256_STATE_BYTES];
+        u64b_t  w[SKEIN_256_STATE_WORDS];
+        } cfg;                              /* config block */
+        
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL);
+
+    /* compute the initial chaining values ctx->X[], based on key */
+    if (keyBytes == 0)                          /* is there a key? */
+        {                                   
+        memset(ctx->X,0,sizeof(ctx->X));        /* no key: use all zeroes as key for config block */
+        }
+    else                                        /* here to pre-process a key */
+        {
+        Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X));
+        /* do a mini-Init right here */
+        ctx->h.hashBitLen=8*sizeof(ctx->X);     /* set output hash bit count = state size */
+        Skein_Start_New_Type(ctx,KEY);          /* set tweaks: T0 = 0; T1 = KEY type */
+        memset(ctx->X,0,sizeof(ctx->X));        /* zero the initial chaining variables */
+        Skein_256_Update(ctx,key,keyBytes);     /* hash the key */
+        Skein_256_Final_Pad(ctx,cfg.b);         /* put result into cfg.b[] */
+        /* NOTE(review): the copy length is sizeof(cfg.b) while the assert above only */
+        /* guarantees sizeof(cfg.b) >= sizeof(ctx->X); presumably the two sizes are   */
+        /* equal -- confirm against the definitions in skein.h                        */
+        memcpy(ctx->X,cfg.b,sizeof(cfg.b));     /* copy over into ctx->X[] */
+#if SKEIN_NEED_SWAP
+        {
+        uint_t i;
+        for (i=0;i<SKEIN_256_STATE_WORDS;i++)   /* convert key bytes to context words */
+            ctx->X[i] = Skein_Swap64(ctx->X[i]);
+        }
+#endif
+        }
+    /* build/process the config block, type == CONFIG (could be precomputed for each key) */
+    ctx->h.hashBitLen = hashBitLen;             /* output hash bit count */
+    Skein_Start_New_Type(ctx,CFG_FINAL);
+
+    memset(&cfg.w,0,sizeof(cfg.w));             /* pre-pad cfg.w[] with zeroes */
+    cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+    cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+    cfg.w[2] = Skein_Swap64(treeInfo);          /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+
+    Skein_Show_Key(256,&ctx->h,key,keyBytes);
+
+    /* compute the initial chaining values from config block */
+    Skein_256_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+
+    /* The chaining vars ctx->X are now initialized */
+    /* Set up to process the data message portion of the hash (default) */
+    ctx->h.bCnt = 0;                            /* buffer b[] starts out empty */
+    Skein_Start_New_Type(ctx,MSG);
+    
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* absorb msgByteCnt bytes of message data into the hash state.       */
+/* Input is buffered in ctx->b[]; a block is processed only once more */
+/* data is known to follow it, so at the end of the call the buffer   */
+/* still holds the most recent (possibly full) block.                 */
+int Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
+    {
+    size_t fill;        /* bytes needed to top up the partially filled buffer */
+    size_t fullBlks;    /* whole blocks taken straight from msg[]             */
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* more than one block's worth pending: something can be processed now */
+    if (msgByteCnt + ctx->h.bCnt > SKEIN_256_BLOCK_BYTES)
+        {
+        if (ctx->h.bCnt != 0)                         /* buffered data goes first */
+            {
+            fill = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt;
+            if (fill != 0)
+                {
+                Skein_assert(fill < msgByteCnt);      /* check on our logic here */
+                memcpy(ctx->b + ctx->h.bCnt,msg,fill);
+                ctx->h.bCnt += fill;
+                msg         += fill;
+                msgByteCnt  -= fill;
+                }
+            Skein_assert(ctx->h.bCnt == SKEIN_256_BLOCK_BYTES);
+            Skein_256_Process_Block(ctx,ctx->b,1,SKEIN_256_BLOCK_BYTES);
+            ctx->h.bCnt = 0;
+            }
+        /* then whole blocks directly from the caller's data, always leaving at least one byte behind */
+        if (msgByteCnt > SKEIN_256_BLOCK_BYTES)
+            {
+            fullBlks = (msgByteCnt-1) / SKEIN_256_BLOCK_BYTES;
+            Skein_256_Process_Block(ctx,msg,fullBlks,SKEIN_256_BLOCK_BYTES);
+            msg        += fullBlks * SKEIN_256_BLOCK_BYTES;
+            msgByteCnt -= fullBlks * SKEIN_256_BLOCK_BYTES;
+            }
+        Skein_assert(ctx->h.bCnt == 0);
+        }
+
+    /* nothing left over? then we are done */
+    if (msgByteCnt == 0)
+        return SKEIN_SUCCESS;
+
+    /* stash the remaining bytes in b[] for a later call */
+    Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES);
+    memcpy(ctx->b + ctx->h.bCnt,msg,msgByteCnt);
+    ctx->h.bCnt += msgByteCnt;
+
+    return SKEIN_SUCCESS;
+    }
+   
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finalize the hash computation and output the result */
+/*   ctx     - context that has been fed with Skein_256_Update()          */
+/*   hashVal - receives (ctx->h.hashBitLen + 7) / 8 bytes of output       */
+/* Returns SKEIN_SUCCESS.  ctx->b[] is overwritten with the counter block */
+/* while the output is generated.                                         */
+int Skein_256_Final(Skein_256_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    size_t i,n,byteCnt;
+    u64b_t X[SKEIN_256_STATE_WORDS];
+    Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;                 /* tag as the final block */
+    if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES)            /* zero pad b[] if necessary */
+        memset(&ctx->b[ctx->h.bCnt],0,SKEIN_256_BLOCK_BYTES - ctx->h.bCnt);
+
+    Skein_256_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);  /* process the final block */
+    
+    /* now output the result */
+    byteCnt = (ctx->h.hashBitLen + 7) >> 3;             /* total number of output bytes */
+
+    /* run Threefish in "counter mode" to generate output */
+    memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
+    memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
+    /* each pass emits at most one block of output; i is the counter value */
+    for (i=0;i*SKEIN_256_BLOCK_BYTES < byteCnt;i++)
+        {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+        Skein_Start_New_Type(ctx,OUT_FINAL);
+        Skein_256_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+        n = byteCnt - i*SKEIN_256_BLOCK_BYTES;   /* number of output bytes left to go */
+        if (n >= SKEIN_256_BLOCK_BYTES)          /* clamp to one block per pass */
+            n  = SKEIN_256_BLOCK_BYTES;
+        Skein_Put64_LSB_First(hashVal+i*SKEIN_256_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
+        Skein_Show_Final(256,&ctx->h,n,hashVal+i*SKEIN_256_BLOCK_BYTES);
+        memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+        }
+    return SKEIN_SUCCESS;
+    }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+/* distance in bytes from the address of Skein_256_Init to the address */
+/* of this function, reported as the 256-bit API code size             */
+size_t Skein_256_API_CodeSize(void)
+    {
+    const u08b_t *first = (u08b_t *) Skein_256_Init;
+    const u08b_t *last  = (u08b_t *) Skein_256_API_CodeSize;
+
+    return last - first;
+    }
+#endif
+
+/*****************************************************************/
+/*     512-bit Skein                                             */
+/*****************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a straight hashing operation  */
+/*   ctx        - context to initialize                                  */
+/*   hashBitLen - requested output length in bits; checked to be > 0     */
+/*                via Skein_Assert(..,SKEIN_BAD_HASHLEN)                 */
+/* Returns SKEIN_SUCCESS once ctx->X holds the initial chaining values   */
+/* and the tweak is set up for message (MSG) blocks.                     */
+int Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen)
+    {
+    union
+        {
+        u08b_t  b[SKEIN_512_STATE_BYTES];
+        u64b_t  w[SKEIN_512_STATE_WORDS];
+        } cfg;                              /* config block */
+        
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    ctx->h.hashBitLen = hashBitLen;         /* output hash bit count */
+
+    switch (hashBitLen)
+        {             /* use pre-computed values, where available */
+#ifndef SKEIN_NO_PRECOMP
+        /* tables come from skein_iv.h; defining SKEIN_NO_PRECOMP forces the default path */
+        case  512: memcpy(ctx->X,SKEIN_512_IV_512,sizeof(ctx->X));  break;
+        case  384: memcpy(ctx->X,SKEIN_512_IV_384,sizeof(ctx->X));  break;
+        case  256: memcpy(ctx->X,SKEIN_512_IV_256,sizeof(ctx->X));  break;
+        case  224: memcpy(ctx->X,SKEIN_512_IV_224,sizeof(ctx->X));  break;
+#endif
+        default:
+            /* here if there is no precomputed IV value available */
+            /* build/process the config block, type == CONFIG (could be precomputed) */
+            Skein_Start_New_Type(ctx,CFG_FINAL);        /* set tweaks: T0=0; T1=CFG | FINAL */
+
+            cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);  /* set the schema, version */
+            cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+            cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+            memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */
+
+            /* compute the initial chaining values from config block */
+            memset(ctx->X,0,sizeof(ctx->X));            /* zero the chaining variables */
+            Skein_512_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+            break;
+        }
+
+    /* The chaining vars ctx->X are now initialized for the given hashBitLen. */
+    /* Set up to process the data message portion of the hash (default) */
+    Skein_Start_New_Type(ctx,MSG);              /* T0=0, T1= MSG type */
+
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a MAC and/or tree hash operation */
+/* [identical to Skein_512_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */
+/*
+**   ctx        -- context to initialize
+**   hashBitLen -- output size in bits (Skein_Assert: > 0, else SKEIN_BAD_HASHLEN)
+**   treeInfo   -- tree hash config word, or SKEIN_CFG_TREE_INFO_SEQUENTIAL
+**   key        -- key bytes (Skein_Assert: non-NULL when keyBytes != 0, else SKEIN_FAIL)
+**   keyBytes   -- key length in bytes; 0 selects the unkeyed path
+**
+**   Returns SKEIN_SUCCESS.
+*/
+int Skein_512_InitExt(Skein_512_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes)
+    {
+    union
+        {
+        u08b_t  b[SKEIN_512_STATE_BYTES];
+        u64b_t  w[SKEIN_512_STATE_WORDS];
+        } cfg;                              /* config block */
+        
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL);
+
+    /* compute the initial chaining values ctx->X[], based on key */
+    if (keyBytes == 0)                          /* is there a key? */
+        {                                   
+        memset(ctx->X,0,sizeof(ctx->X));        /* no key: use all zeroes as key for config block */
+        }
+    else                                        /* here to pre-process a key */
+        {
+        Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X));
+        /* do a mini-Init right here */
+        /* hashBitLen is only borrowed for the key hash; the caller's value is stored below */
+        ctx->h.hashBitLen=8*sizeof(ctx->X);     /* set output hash bit count = state size */
+        Skein_Start_New_Type(ctx,KEY);          /* set tweaks: T0 = 0; T1 = KEY type */
+        memset(ctx->X,0,sizeof(ctx->X));        /* zero the initial chaining variables */
+        Skein_512_Update(ctx,key,keyBytes);     /* hash the key */
+        Skein_512_Final_Pad(ctx,cfg.b);         /* put result into cfg.b[] */
+        memcpy(ctx->X,cfg.b,sizeof(cfg.b));     /* copy over into ctx->X[] */
+#if SKEIN_NEED_SWAP
+        {
+        uint_t i;
+        for (i=0;i<SKEIN_512_STATE_WORDS;i++)   /* convert key bytes to context words */
+            ctx->X[i] = Skein_Swap64(ctx->X[i]);
+        }
+#endif
+        }
+    /* build/process the config block, type == CONFIG (could be precomputed for each key) */
+    ctx->h.hashBitLen = hashBitLen;             /* output hash bit count */
+    Skein_Start_New_Type(ctx,CFG_FINAL);
+
+    /* cfg.b[] was used as scratch for the key hash above, so clear it all first */
+    memset(&cfg.w,0,sizeof(cfg.w));             /* pre-pad cfg.w[] with zeroes */
+    cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+    cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+    cfg.w[2] = Skein_Swap64(treeInfo);          /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+
+    Skein_Show_Key(512,&ctx->h,key,keyBytes);
+
+    /* compute the initial chaining values from config block */
+    Skein_512_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+
+    /* The chaining vars ctx->X are now initialized */
+    /* Set up to process the data message portion of the hash (default) */
+    ctx->h.bCnt = 0;                            /* buffer b[] starts out empty */
+    Skein_Start_New_Type(ctx,MSG);
+    
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Absorb msgByteCnt bytes of message data from msg into the hash state.
+**
+** Bytes are staged in ctx->b[]; a block is only processed once more data
+** than fits in b[] has arrived, so the last (possibly full) block always
+** stays buffered for the Final call.  Returns SKEIN_SUCCESS.
+*/
+int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
+    {
+    size_t room;        /* bytes still free in b[] */
+    size_t blkCnt;      /* whole blocks consumed straight from msg */
+    size_t direct;      /* byte length of those blocks */
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* more data than b[] can hold: at least one block gets processed */
+    if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES)
+        {
+        if (ctx->h.bCnt != 0)
+            {
+            /* top up the partially filled buffer and flush it */
+            room = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;
+            if (room != 0)
+                {
+                Skein_assert(room < msgByteCnt);      /* check on our logic here */
+                memcpy(ctx->b + ctx->h.bCnt,msg,room);
+                ctx->h.bCnt += room;
+                msg         += room;
+                msgByteCnt  -= room;
+                }
+            Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES);
+            Skein_512_Process_Block(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES);
+            ctx->h.bCnt = 0;
+            }
+
+        /* feed full blocks directly from the caller's buffer; the "-1" */
+        /* holds back the final block even when the length is a multiple */
+        if (msgByteCnt > SKEIN_512_BLOCK_BYTES)
+            {
+            blkCnt = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES;
+            direct = blkCnt * SKEIN_512_BLOCK_BYTES;
+            Skein_512_Process_Block(ctx,msg,blkCnt,SKEIN_512_BLOCK_BYTES);
+            msg        += direct;
+            msgByteCnt -= direct;
+            }
+        Skein_assert(ctx->h.bCnt == 0);
+        }
+
+    /* stash whatever is left in b[] */
+    if (msgByteCnt != 0)
+        {
+        Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES);
+        memcpy(ctx->b + ctx->h.bCnt,msg,msgByteCnt);
+        ctx->h.bCnt += msgByteCnt;
+        }
+
+    return SKEIN_SUCCESS;
+    }
+   
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Finalize the hash computation and output the result.
+**
+** Zero-pads and processes the buffered block as the final message block,
+** then writes (hashBitLen+7)/8 bytes to hashVal by running the block
+** function in "counter mode" over the resulting state.
+** Returns SKEIN_SUCCESS.
+*/
+int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    size_t  blkIdx;                             /* counter value == output block index */
+    size_t  offset;                             /* byte offset of that block in hashVal */
+    size_t  chunk;                              /* bytes emitted for that block */
+    size_t  outBytes;                           /* total number of output bytes */
+    u64b_t  savedX[SKEIN_512_STATE_WORDS];      /* counter mode "key" */
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* process the final (zero padded) message block */
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+    if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)
+        memset(ctx->b + ctx->h.bCnt,0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+    Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);
+
+    outBytes = (ctx->h.hashBitLen + 7) >> 3;
+
+    /* counter mode: b[] carries the counter, savedX the state to restart from */
+    memset(ctx->b,0,sizeof(ctx->b));
+    memcpy(savedX,ctx->X,sizeof(savedX));
+    for (blkIdx=0, offset=0; offset < outBytes; blkIdx++, offset += SKEIN_512_BLOCK_BYTES)
+        {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) blkIdx); /* build the counter block */
+        Skein_Start_New_Type(ctx,OUT_FINAL);
+        Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t));
+
+        chunk = outBytes - offset;              /* output bytes left to go ... */
+        if (chunk > SKEIN_512_BLOCK_BYTES)
+            chunk = SKEIN_512_BLOCK_BYTES;      /* ... capped at one block */
+
+        Skein_Put64_LSB_First(hashVal+offset,ctx->X,chunk);
+        Skein_Show_Final(512,&ctx->h,chunk,hashVal+offset);
+        memcpy(ctx->X,savedX,sizeof(savedX));   /* restore the key for the next counter value */
+        }
+    return SKEIN_SUCCESS;
+    }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+/* Byte distance from the entry of Skein_512_Init() to the entry of this
+** function, reported as the Skein-512 API code size.
+** NOTE(review): presumes the toolchain lays the functions out in source order. */
+size_t Skein_512_API_CodeSize(void)
+    {
+    const u08b_t *apiEnd   = (u08b_t *) Skein_512_API_CodeSize;
+    const u08b_t *apiStart = (u08b_t *) Skein_512_Init;
+
+    return apiEnd - apiStart;
+    }
+#endif
+
+/*****************************************************************/
+/*    1024-bit Skein                                             */
+/*****************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Init the context for a straight (unkeyed, sequential) Skein-1024 hash.
+**
+**   ctx        -- context to initialize
+**   hashBitLen -- digest size in bits (checked > 0 via Skein_Assert,
+**                 error code SKEIN_BAD_HASHLEN)
+**
+** Uses a precomputed IV when one exists for hashBitLen (unless
+** SKEIN_NO_PRECOMP is defined); otherwise derives the IV by processing a
+** configuration block.  Returns SKEIN_SUCCESS.
+*/
+int Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen)
+    {
+    union
+        {
+        u08b_t  b[SKEIN1024_STATE_BYTES];
+        u64b_t  w[SKEIN1024_STATE_WORDS];
+        } cfgBlk;                               /* config block */
+    const void *precompIV = NULL;               /* table for this hashBitLen, if any */
+
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    ctx->h.hashBitLen = hashBitLen;             /* output hash bit count */
+
+#ifndef SKEIN_NO_PRECOMP
+    /* look for a pre-computed IV */
+    if      (hashBitLen ==  512) precompIV = SKEIN1024_IV_512;
+    else if (hashBitLen ==  384) precompIV = SKEIN1024_IV_384;
+    else if (hashBitLen == 1024) precompIV = SKEIN1024_IV_1024;
+#endif
+
+    if (precompIV != NULL)
+        {
+        memcpy(ctx->X,precompIV,sizeof(ctx->X));
+        }
+    else
+        {
+        /* no table entry: build and process the config block (type == CONFIG) */
+        Skein_Start_New_Type(ctx,CFG_FINAL);    /* set tweaks: T0=0; T1=CFG | FINAL */
+
+        memset(&cfgBlk,0,sizeof(cfgBlk));       /* unused config words stay zero */
+        cfgBlk.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);               /* schema, version */
+        cfgBlk.w[1] = Skein_Swap64(hashBitLen);                     /* hash result length in bits */
+        cfgBlk.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+
+        /* chaining values start at zero, then absorb the config block */
+        memset(ctx->X,0,sizeof(ctx->X));
+        Skein1024_Process_Block(ctx,cfgBlk.b,1,SKEIN_CFG_STR_LEN);
+        }
+
+    /* ctx->X now holds the IV for hashBitLen; get ready for message data */
+    Skein_Start_New_Type(ctx,MSG);              /* T0=0, T1= MSG type */
+
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a MAC and/or tree hash operation */
+/* [identical to Skein1024_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */
+/*
+**   ctx        -- context to initialize
+**   hashBitLen -- output size in bits (Skein_Assert: > 0, else SKEIN_BAD_HASHLEN)
+**   treeInfo   -- tree hash config word, or SKEIN_CFG_TREE_INFO_SEQUENTIAL
+**   key        -- key bytes (Skein_Assert: non-NULL when keyBytes != 0, else SKEIN_FAIL)
+**   keyBytes   -- key length in bytes; 0 selects the unkeyed path
+**
+**   Returns SKEIN_SUCCESS.
+*/
+int Skein1024_InitExt(Skein1024_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes)
+    {
+    union
+        {
+        u08b_t  b[SKEIN1024_STATE_BYTES];
+        u64b_t  w[SKEIN1024_STATE_WORDS];
+        } cfg;                              /* config block */
+        
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL);
+
+    /* compute the initial chaining values ctx->X[], based on key */
+    if (keyBytes == 0)                          /* is there a key? */
+        {                                   
+        memset(ctx->X,0,sizeof(ctx->X));        /* no key: use all zeroes as key for config block */
+        }
+    else                                        /* here to pre-process a key */
+        {
+        Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X));
+        /* do a mini-Init right here */
+        /* hashBitLen is only borrowed for the key hash; the caller's value is stored below */
+        ctx->h.hashBitLen=8*sizeof(ctx->X);     /* set output hash bit count = state size */
+        Skein_Start_New_Type(ctx,KEY);          /* set tweaks: T0 = 0; T1 = KEY type */
+        memset(ctx->X,0,sizeof(ctx->X));        /* zero the initial chaining variables */
+        Skein1024_Update(ctx,key,keyBytes);     /* hash the key */
+        Skein1024_Final_Pad(ctx,cfg.b);         /* put result into cfg.b[] */
+        memcpy(ctx->X,cfg.b,sizeof(cfg.b));     /* copy over into ctx->X[] */
+#if SKEIN_NEED_SWAP
+        {
+        uint_t i;
+        for (i=0;i<SKEIN1024_STATE_WORDS;i++)   /* convert key bytes to context words */
+            ctx->X[i] = Skein_Swap64(ctx->X[i]);
+        }
+#endif
+        }
+    /* build/process the config block, type == CONFIG (could be precomputed for each key) */
+    ctx->h.hashBitLen = hashBitLen;             /* output hash bit count */
+    Skein_Start_New_Type(ctx,CFG_FINAL);
+
+    /* cfg.b[] was used as scratch for the key hash above, so clear it all first */
+    memset(&cfg.w,0,sizeof(cfg.w));             /* pre-pad cfg.w[] with zeroes */
+    cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+    cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+    cfg.w[2] = Skein_Swap64(treeInfo);          /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+
+    Skein_Show_Key(1024,&ctx->h,key,keyBytes);
+
+    /* compute the initial chaining values from config block */
+    Skein1024_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+
+    /* The chaining vars ctx->X are now initialized */
+    /* Set up to process the data message portion of the hash (default) */
+    ctx->h.bCnt = 0;                            /* buffer b[] starts out empty */
+    Skein_Start_New_Type(ctx,MSG);
+    
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Absorb msgByteCnt bytes of message data from msg into the hash state.
+**
+** Bytes are staged in ctx->b[]; a block is only processed once more data
+** than fits in b[] has arrived, so the last (possibly full) block always
+** stays buffered for the Final call.  Returns SKEIN_SUCCESS.
+*/
+int Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
+    {
+    size_t room;        /* bytes still free in b[] */
+    size_t blkCnt;      /* whole blocks consumed straight from msg */
+    size_t direct;      /* byte length of those blocks */
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* more data than b[] can hold: at least one block gets processed */
+    if (msgByteCnt + ctx->h.bCnt > SKEIN1024_BLOCK_BYTES)
+        {
+        if (ctx->h.bCnt != 0)
+            {
+            /* top up the partially filled buffer and flush it */
+            room = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt;
+            if (room != 0)
+                {
+                Skein_assert(room < msgByteCnt);      /* check on our logic here */
+                memcpy(ctx->b + ctx->h.bCnt,msg,room);
+                ctx->h.bCnt += room;
+                msg         += room;
+                msgByteCnt  -= room;
+                }
+            Skein_assert(ctx->h.bCnt == SKEIN1024_BLOCK_BYTES);
+            Skein1024_Process_Block(ctx,ctx->b,1,SKEIN1024_BLOCK_BYTES);
+            ctx->h.bCnt = 0;
+            }
+
+        /* feed full blocks directly from the caller's buffer; the "-1" */
+        /* holds back the final block even when the length is a multiple */
+        if (msgByteCnt > SKEIN1024_BLOCK_BYTES)
+            {
+            blkCnt = (msgByteCnt-1) / SKEIN1024_BLOCK_BYTES;
+            direct = blkCnt * SKEIN1024_BLOCK_BYTES;
+            Skein1024_Process_Block(ctx,msg,blkCnt,SKEIN1024_BLOCK_BYTES);
+            msg        += direct;
+            msgByteCnt -= direct;
+            }
+        Skein_assert(ctx->h.bCnt == 0);
+        }
+
+    /* stash whatever is left in b[] */
+    if (msgByteCnt != 0)
+        {
+        Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES);
+        memcpy(ctx->b + ctx->h.bCnt,msg,msgByteCnt);
+        ctx->h.bCnt += msgByteCnt;
+        }
+
+    return SKEIN_SUCCESS;
+    }
+   
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Finalize the hash computation and output the result.
+**
+** Zero-pads and processes the buffered block as the final message block,
+** then writes (hashBitLen+7)/8 bytes to hashVal by running the block
+** function in "counter mode" over the resulting state.
+** Returns SKEIN_SUCCESS.
+*/
+int Skein1024_Final(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    size_t  blkIdx;                             /* counter value == output block index */
+    size_t  offset;                             /* byte offset of that block in hashVal */
+    size_t  chunk;                              /* bytes emitted for that block */
+    size_t  outBytes;                           /* total number of output bytes */
+    u64b_t  savedX[SKEIN1024_STATE_WORDS];      /* counter mode "key" */
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* process the final (zero padded) message block */
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+    if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES)
+        memset(ctx->b + ctx->h.bCnt,0,SKEIN1024_BLOCK_BYTES - ctx->h.bCnt);
+    Skein1024_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);
+
+    outBytes = (ctx->h.hashBitLen + 7) >> 3;
+
+    /* counter mode: b[] carries the counter, savedX the state to restart from */
+    memset(ctx->b,0,sizeof(ctx->b));
+    memcpy(savedX,ctx->X,sizeof(savedX));
+    for (blkIdx=0, offset=0; offset < outBytes; blkIdx++, offset += SKEIN1024_BLOCK_BYTES)
+        {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) blkIdx); /* build the counter block */
+        Skein_Start_New_Type(ctx,OUT_FINAL);
+        Skein1024_Process_Block(ctx,ctx->b,1,sizeof(u64b_t));
+
+        chunk = outBytes - offset;              /* output bytes left to go ... */
+        if (chunk > SKEIN1024_BLOCK_BYTES)
+            chunk = SKEIN1024_BLOCK_BYTES;      /* ... capped at one block */
+
+        Skein_Put64_LSB_First(hashVal+offset,ctx->X,chunk);
+        Skein_Show_Final(1024,&ctx->h,chunk,hashVal+offset);
+        memcpy(ctx->X,savedX,sizeof(savedX));   /* restore the key for the next counter value */
+        }
+    return SKEIN_SUCCESS;
+    }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+/* Byte distance from the entry of Skein1024_Init() to the entry of this
+** function, reported as the Skein-1024 API code size.
+** NOTE(review): presumes the toolchain lays the functions out in source order. */
+size_t Skein1024_API_CodeSize(void)
+    {
+    const u08b_t *apiEnd   = (u08b_t *) Skein1024_API_CodeSize;
+    const u08b_t *apiStart = (u08b_t *) Skein1024_Init;
+
+    return apiEnd - apiStart;
+    }
+#endif
+
+/**************** Functions to support MAC/tree hashing ***************/
+/*   (this code is identical for Optimized and Reference versions)    */
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Finalize the hash computation and emit the raw chaining state; the
+** OUTPUT (counter mode) stage is not run.  hashVal receives
+** SKEIN_256_BLOCK_BYTES bytes.  Returns SKEIN_SUCCESS. */
+int Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* zero-fill the part of b[] that holds no message bytes */
+    if (SKEIN_256_BLOCK_BYTES > ctx->h.bCnt)
+        {
+        memset(ctx->b + ctx->h.bCnt,0,SKEIN_256_BLOCK_BYTES - ctx->h.bCnt);
+        }
+
+    /* the buffered block is the last one of the message */
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+    Skein_256_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);
+
+    /* "output" the state bytes */
+    Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN_256_BLOCK_BYTES);
+
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Finalize the hash computation and emit the raw chaining state; the
+** OUTPUT (counter mode) stage is not run.  hashVal receives
+** SKEIN_512_BLOCK_BYTES bytes.  Returns SKEIN_SUCCESS. */
+int Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* zero-fill the part of b[] that holds no message bytes */
+    if (SKEIN_512_BLOCK_BYTES > ctx->h.bCnt)
+        {
+        memset(ctx->b + ctx->h.bCnt,0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+        }
+
+    /* the buffered block is the last one of the message */
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+    Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);
+
+    /* "output" the state bytes */
+    Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN_512_BLOCK_BYTES);
+
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Finalize the hash computation and emit the raw chaining state; the
+** OUTPUT (counter mode) stage is not run.  hashVal receives
+** SKEIN1024_BLOCK_BYTES bytes.  Returns SKEIN_SUCCESS. */
+int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* zero-fill the part of b[] that holds no message bytes */
+    if (SKEIN1024_BLOCK_BYTES > ctx->h.bCnt)
+        {
+        memset(ctx->b + ctx->h.bCnt,0,SKEIN1024_BLOCK_BYTES - ctx->h.bCnt);
+        }
+
+    /* the buffered block is the last one of the message */
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+    Skein1024_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);
+
+    /* "output" the state bytes */
+    Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN1024_BLOCK_BYTES);
+
+    return SKEIN_SUCCESS;
+    }
+
+#if SKEIN_TREE_HASH
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Run only the OUTPUT stage: write (hashBitLen+7)/8 bytes to hashVal by
+** running the block function in "counter mode" over the current chaining
+** state.  ctx->X is restored after every output block; ctx->b[] is
+** overwritten with the counter.  Returns SKEIN_SUCCESS. */
+int Skein_256_Output(Skein_256_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    size_t  blkIdx;                             /* counter value == output block index */
+    size_t  offset;                             /* byte offset of that block in hashVal */
+    size_t  chunk;                              /* bytes emitted for that block */
+    size_t  outBytes;                           /* total number of output bytes */
+    u64b_t  savedX[SKEIN_256_STATE_WORDS];      /* counter mode "key" */
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    outBytes = (ctx->h.hashBitLen + 7) >> 3;
+
+    /* counter mode: b[] carries the counter, savedX the state to restart from */
+    memset(ctx->b,0,sizeof(ctx->b));
+    memcpy(savedX,ctx->X,sizeof(savedX));
+    for (blkIdx=0, offset=0; offset < outBytes; blkIdx++, offset += SKEIN_256_BLOCK_BYTES)
+        {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) blkIdx); /* build the counter block */
+        Skein_Start_New_Type(ctx,OUT_FINAL);
+        Skein_256_Process_Block(ctx,ctx->b,1,sizeof(u64b_t));
+
+        chunk = outBytes - offset;              /* output bytes left to go ... */
+        if (chunk > SKEIN_256_BLOCK_BYTES)
+            chunk = SKEIN_256_BLOCK_BYTES;      /* ... capped at one block */
+
+        Skein_Put64_LSB_First(hashVal+offset,ctx->X,chunk);
+        Skein_Show_Final(256,&ctx->h,chunk,hashVal+offset);
+        memcpy(ctx->X,savedX,sizeof(savedX));   /* restore the key for the next counter value */
+        }
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Just do the OUTPUT stage.
+**
+** Writes (hashBitLen+7)/8 bytes to hashVal by running the block function
+** in "counter mode" over the current chaining state.  ctx->X is restored
+** after every output block; ctx->b[] is overwritten with the counter.
+** Returns SKEIN_SUCCESS.
+*/
+int Skein_512_Output(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    size_t i,n,byteCnt;
+    u64b_t X[SKEIN_512_STATE_WORDS];
+    Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* now output the result */
+    byteCnt = (ctx->h.hashBitLen + 7) >> 3;    /* total number of output bytes */
+
+    /* run Threefish in "counter mode" to generate output */
+    memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
+    memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
+    for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++)
+        {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+        Skein_Start_New_Type(ctx,OUT_FINAL);
+        Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+        n = byteCnt - i*SKEIN_512_BLOCK_BYTES;   /* number of output bytes left to go */
+        if (n >= SKEIN_512_BLOCK_BYTES)
+            n  = SKEIN_512_BLOCK_BYTES;
+        Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
+        /* report the 512-bit state size to the debug callout, as Skein_512_Final() does */
+        Skein_Show_Final(512,&ctx->h,n,hashVal+i*SKEIN_512_BLOCK_BYTES);
+        memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+        }
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Just do the OUTPUT stage.
+**
+** Writes (hashBitLen+7)/8 bytes to hashVal by running the block function
+** in "counter mode" over the current chaining state.  ctx->X is restored
+** after every output block; ctx->b[] is overwritten with the counter.
+** Returns SKEIN_SUCCESS.
+*/
+int Skein1024_Output(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    size_t i,n,byteCnt;
+    u64b_t X[SKEIN1024_STATE_WORDS];
+    Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* now output the result */
+    byteCnt = (ctx->h.hashBitLen + 7) >> 3;    /* total number of output bytes */
+
+    /* run Threefish in "counter mode" to generate output */
+    memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
+    memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
+    for (i=0;i*SKEIN1024_BLOCK_BYTES < byteCnt;i++)
+        {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+        Skein_Start_New_Type(ctx,OUT_FINAL);
+        Skein1024_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+        n = byteCnt - i*SKEIN1024_BLOCK_BYTES;   /* number of output bytes left to go */
+        if (n >= SKEIN1024_BLOCK_BYTES)
+            n  = SKEIN1024_BLOCK_BYTES;
+        Skein_Put64_LSB_First(hashVal+i*SKEIN1024_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
+        /* report the 1024-bit state size to the debug callout, as Skein1024_Final() does */
+        Skein_Show_Final(1024,&ctx->h,n,hashVal+i*SKEIN1024_BLOCK_BYTES);
+        memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+        }
+    return SKEIN_SUCCESS;
+    }
+#endif
diff --git a/Optimized_32bit/skein.h b/Optimized_32bit/skein.h
new file mode 100644
index 0000000000000..721c9bc9ce0db
--- /dev/null
+++ b/Optimized_32bit/skein.h
@@ -0,0 +1,327 @@
+#ifndef _SKEIN_H_
+#define _SKEIN_H_     1
+/**************************************************************************
+**
+** Interface declarations and internal definitions for Skein hashing.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+***************************************************************************
+** 
+** The following compile-time switches may be defined to control some
+** tradeoffs between speed, code size, error checking, and security.
+**
+** The "default" note explains what happens when the switch is not defined.
+**
+**  SKEIN_DEBUG            -- make callouts from inside Skein code
+**                            to examine/display intermediate values.
+**                            [default: no callouts (no overhead)]
+**
+**  SKEIN_ERR_CHECK        -- how error checking is handled inside Skein
+**                            code. If not defined, most error checking 
+**                            is disabled (for performance). Otherwise, 
+**                            the switch value is interpreted as:
+**                                0: use assert()      to flag errors
+**                                1: return SKEIN_FAIL to flag errors
+**
+***************************************************************************/
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include <stddef.h>                          /* get size_t definition */
+#include "skein_port.h"                      /* get platform-specific definitions */
+
+/* Status codes returned by the Skein API functions declared below. */
+enum
+    {
+    SKEIN_SUCCESS         =      0,          /* return codes from Skein calls */
+    SKEIN_FAIL            =      1,          /* generic failure code passed to Skein_Assert checks */
+    SKEIN_BAD_HASHLEN     =      2           /* code for the hashBitLen > 0 check in the Init calls */
+    };
+
+#define  SKEIN_MODIFIER_WORDS  ( 2)          /* number of modifier (tweak) words */
+
+/* state size of each Skein variant, in 64-bit words */
+#define  SKEIN_256_STATE_WORDS ( 4)
+#define  SKEIN_512_STATE_WORDS ( 8)
+#define  SKEIN1024_STATE_WORDS (16)
+#define  SKEIN_MAX_STATE_WORDS (16)
+
+/* the same state sizes, in bytes */
+#define  SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
+#define  SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
+#define  SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)
+
+/* the same state sizes, in bits */
+#define  SKEIN_256_STATE_BITS  (64*SKEIN_256_STATE_WORDS)
+#define  SKEIN_512_STATE_BITS  (64*SKEIN_512_STATE_WORDS)
+#define  SKEIN1024_STATE_BITS  (64*SKEIN1024_STATE_WORDS)
+
+/* message block size in bytes (defined identically to the state size) */
+#define  SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
+#define  SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
+#define  SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)
+
+/* Header fields shared by the 256-, 512- and 1024-bit context structures. */
+typedef struct
+    {
+    size_t  hashBitLen;                      /* size of hash result, in bits */
+    size_t  bCnt;                            /* current byte count in buffer b[] */
+    u64b_t  T[SKEIN_MODIFIER_WORDS];         /* tweak words: T[0]=byte cnt, T[1]=flags */
+    } Skein_Ctxt_Hdr_t;
+
+/* Working state for one Skein-256 computation: header, SKEIN_256_STATE_WORDS */
+/* chaining words, and a SKEIN_256_BLOCK_BYTES staging buffer. */
+typedef struct                               /*  256-bit Skein hash context structure */
+    {
+    Skein_Ctxt_Hdr_t h;                      /* common header context variables */
+    u64b_t  X[SKEIN_256_STATE_WORDS];        /* chaining variables */
+    u08b_t  b[SKEIN_256_BLOCK_BYTES];        /* partial block buffer (8-byte aligned) */
+    } Skein_256_Ctxt_t;
+
+/* Working state for one Skein-512 computation: header, SKEIN_512_STATE_WORDS */
+/* chaining words, and a SKEIN_512_BLOCK_BYTES staging buffer. */
+typedef struct                               /*  512-bit Skein hash context structure */
+    {
+    Skein_Ctxt_Hdr_t h;                      /* common header context variables */
+    u64b_t  X[SKEIN_512_STATE_WORDS];        /* chaining variables */
+    u08b_t  b[SKEIN_512_BLOCK_BYTES];        /* partial block buffer (8-byte aligned) */
+    } Skein_512_Ctxt_t;
+
+/* Working state for one Skein-1024 computation: header, SKEIN1024_STATE_WORDS */
+/* chaining words, and a SKEIN1024_BLOCK_BYTES staging buffer. */
+typedef struct                               /* 1024-bit Skein hash context structure */
+    {
+    Skein_Ctxt_Hdr_t h;                      /* common header context variables */
+    u64b_t  X[SKEIN1024_STATE_WORDS];        /* chaining variables */
+    u08b_t  b[SKEIN1024_BLOCK_BYTES];        /* partial block buffer (8-byte aligned) */
+    } Skein1024_Ctxt_t;
+
+/*   Skein APIs for (incremental) "straight hashing" */
+/*   Init:   set up ctx for a digest of hashBitLen bits */
+int  Skein_256_Init  (Skein_256_Ctxt_t *ctx, size_t hashBitLen);
+int  Skein_512_Init  (Skein_512_Ctxt_t *ctx, size_t hashBitLen);
+int  Skein1024_Init  (Skein1024_Ctxt_t *ctx, size_t hashBitLen);
+
+/*   Update: absorb msgByteCnt bytes of message data from msg */
+int  Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
+int  Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
+int  Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
+
+/*   Final:  finish the hash and write (hashBitLen+7)/8 bytes to hashVal */
+int  Skein_256_Final (Skein_256_Ctxt_t *ctx, u08b_t * hashVal);
+int  Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
+int  Skein1024_Final (Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
+
+/*
+**   Skein APIs for "extended" initialization: MAC keys, tree hashing.
+**   After an InitExt() call, just use Update/Final calls as with Init().
+**
+**   Notes: Same parameters as _Init() calls, plus treeInfo/key/keyBytes.
+**          When keyBytes == 0 and treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL,
+**              the results of InitExt() are identical to calling Init().
+**          The function Init() may be called once to "precompute" the IV for
+**              a given hashBitLen value, then by saving a copy of the context
+**              the IV computation may be avoided in later calls.
+**          Similarly, the function InitExt() may be called once per MAC key 
+**              to precompute the MAC IV, then a copy of the context saved and
+**              reused for each new MAC computation.
+**/
+/*   key/keyBytes: optional key (keyBytes == 0 means no key); treeInfo: tree hash config word */
+int  Skein_256_InitExt(Skein_256_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes);
+int  Skein_512_InitExt(Skein_512_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes);
+int  Skein1024_InitExt(Skein1024_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes);
+
+/*
+**   Skein APIs for MAC and tree hash:
+**      Final_Pad:  pad, do final block, but no OUTPUT type
+**                  (hashVal receives one full block of state bytes)
+**      Output:     do just the output stage
+*/
+int  Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, u08b_t * hashVal);
+int  Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
+int  Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
+
+/* The Output() functions are declared unless the build defines SKEIN_TREE_HASH as 0 */
+#ifndef SKEIN_TREE_HASH
+#define SKEIN_TREE_HASH (1)
+#endif
+#if  SKEIN_TREE_HASH
+/* Output: write (hashBitLen+7)/8 bytes generated from the current state to hashVal */
+int  Skein_256_Output   (Skein_256_Ctxt_t *ctx, u08b_t * hashVal);
+int  Skein_512_Output   (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
+int  Skein1024_Output   (Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
+#endif
+
+/*****************************************************************
+** "Internal" Skein definitions
+**    -- not needed for sequential hashing API, but will be 
+**           helpful for other uses of Skein (e.g., tree hash mode).
+**    -- included here so that they can be shared between
+**           reference and optimized code.
+******************************************************************/
+
+/* tweak word T[1]: bit field starting positions */
+/* (positions are given as bit numbers of the full 128-bit tweak; SKEIN_T1_BIT rebases them to T[1]) */
+#define SKEIN_T1_BIT(BIT)       ((BIT) - 64)            /* offset 64 because it's the second word  */
+                                
+#define SKEIN_T1_POS_TREE_LVL   SKEIN_T1_BIT(112)       /* bits 112..118: level in hash tree       */
+#define SKEIN_T1_POS_BIT_PAD    SKEIN_T1_BIT(119)       /* bit  119     : partial final input byte */
+#define SKEIN_T1_POS_BLK_TYPE   SKEIN_T1_BIT(120)       /* bits 120..125: type field               */
+#define SKEIN_T1_POS_FIRST      SKEIN_T1_BIT(126)       /* bit  126     : first block flag         */
+#define SKEIN_T1_POS_FINAL      SKEIN_T1_BIT(127)       /* bit  127     : final block flag         */
+                                
+/* tweak word T[1]: flag bit definition(s) */
+#define SKEIN_T1_FLAG_FIRST     (((u64b_t)  1 ) << SKEIN_T1_POS_FIRST)
+#define SKEIN_T1_FLAG_FINAL     (((u64b_t)  1 ) << SKEIN_T1_POS_FINAL)
+#define SKEIN_T1_FLAG_BIT_PAD   (((u64b_t)  1 ) << SKEIN_T1_POS_BIT_PAD)
+                                
+/* tweak word T[1]: tree level bit field mask */
+#define SKEIN_T1_TREE_LVL_MASK  (((u64b_t)0x7F) << SKEIN_T1_POS_TREE_LVL)
+#define SKEIN_T1_TREE_LEVEL(n)  (((u64b_t) (n)) << SKEIN_T1_POS_TREE_LVL)
+
+/* tweak word T[1]: block type field */
+#define SKEIN_BLK_TYPE_KEY      ( 0)                    /* key, for MAC and KDF */
+#define SKEIN_BLK_TYPE_CFG      ( 4)                    /* configuration block */
+#define SKEIN_BLK_TYPE_PERS     ( 8)                    /* personalization string */
+#define SKEIN_BLK_TYPE_PK       (12)                    /* public key (for digital signature hashing) */
+#define SKEIN_BLK_TYPE_KDF      (16)                    /* key identifier for KDF */
+#define SKEIN_BLK_TYPE_NONCE    (20)                    /* nonce for PRNG */
+#define SKEIN_BLK_TYPE_MSG      (48)                    /* message processing */
+#define SKEIN_BLK_TYPE_OUT      (63)                    /* output stage */
+#define SKEIN_BLK_TYPE_MASK     (63)                    /* bit field mask */
+
+#define SKEIN_T1_BLK_TYPE(T)   (((u64b_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
+#define SKEIN_T1_BLK_TYPE_KEY   SKEIN_T1_BLK_TYPE(KEY)  /* key, for MAC and KDF */
+#define SKEIN_T1_BLK_TYPE_CFG   SKEIN_T1_BLK_TYPE(CFG)  /* configuration block */
+#define SKEIN_T1_BLK_TYPE_PERS  SKEIN_T1_BLK_TYPE(PERS) /* personalization string */
+#define SKEIN_T1_BLK_TYPE_PK    SKEIN_T1_BLK_TYPE(PK)   /* public key (for digital signature hashing) */
+#define SKEIN_T1_BLK_TYPE_KDF   SKEIN_T1_BLK_TYPE(KDF)  /* key identifier for KDF */
+#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE)/* nonce for PRNG */
+#define SKEIN_T1_BLK_TYPE_MSG   SKEIN_T1_BLK_TYPE(MSG)  /* message processing */
+#define SKEIN_T1_BLK_TYPE_OUT   SKEIN_T1_BLK_TYPE(OUT)  /* output stage */
+#define SKEIN_T1_BLK_TYPE_MASK  SKEIN_T1_BLK_TYPE(MASK) /* field bit mask */
+
+#define SKEIN_T1_BLK_TYPE_CFG_FINAL       (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL)
+#define SKEIN_T1_BLK_TYPE_OUT_FINAL       (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
+
+#define SKEIN_VERSION           (1)
+
+#ifndef SKEIN_ID_STRING_LE      /* allow compile-time personalization */
+#define SKEIN_ID_STRING_LE      (0x33414853)            /* "SHA3" (little-endian)*/
+#endif
+/* SKEIN_MK_64: build one u64b_t from two 32-bit halves; hi32 is shifted into bits 32..63 and lo32 is added in */
+#define SKEIN_MK_64(hi32,lo32)  ((lo32) + (((u64b_t) (hi32)) << 32))
+#define SKEIN_SCHEMA_VER        SKEIN_MK_64(SKEIN_VERSION,SKEIN_ID_STRING_LE)  /* version in high half, ID string in low half */
+#define SKEIN_KS_PARITY         SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)             /* XORed into the extra key schedule word by the block functions */
+
+#define SKEIN_CFG_STR_LEN       (4*8)                   /* NOTE(review): presumably 4 words of 8 bytes -- confirm against the config-block code */
+
+/* bit field definitions in config block treeInfo word */
+#define SKEIN_CFG_TREE_LEAF_SIZE_POS  ( 0)
+#define SKEIN_CFG_TREE_NODE_SIZE_POS  ( 8)
+#define SKEIN_CFG_TREE_MAX_LEVEL_POS  (16)
+/* each treeInfo field is 8 bits wide; these masks select one field in place */
+#define SKEIN_CFG_TREE_LEAF_SIZE_MSK  (((u64b_t) 0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS)
+#define SKEIN_CFG_TREE_NODE_SIZE_MSK  (((u64b_t) 0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS)
+#define SKEIN_CFG_TREE_MAX_LEVEL_MSK  (((u64b_t) 0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS)
+/* SKEIN_CFG_TREE_INFO: pack the three fields into one treeInfo word; the arguments are shifted but not masked to 8 bits */
+#define SKEIN_CFG_TREE_INFO(leaf,node,maxLvl)                   \
+    ( (((u64b_t)(leaf  )) << SKEIN_CFG_TREE_LEAF_SIZE_POS) |    \
+      (((u64b_t)(node  )) << SKEIN_CFG_TREE_NODE_SIZE_POS) |    \
+      (((u64b_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS) )
+
+#define SKEIN_CFG_TREE_INFO_SEQUENTIAL SKEIN_CFG_TREE_INFO(0,0,0) /* use as treeInfo in InitExt() call for sequential processing */
+
+/*
+**   Skein macros for getting/setting tweak words, etc.
+**   These are useful for partial input bytes, hash tree init/update, etc.
+**/
+#define Skein_Get_Tweak(ctxPtr,TWK_NUM)         ((ctxPtr)->h.T[TWK_NUM])
+#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal)    {(ctxPtr)->h.T[TWK_NUM] = (tVal);}
+/* the setter-style macros in this group expand to a brace block, not an expression; NOTE(review): take care inside an unbraced if/else */
+#define Skein_Get_T0(ctxPtr)    Skein_Get_Tweak(ctxPtr,0)
+#define Skein_Get_T1(ctxPtr)    Skein_Get_Tweak(ctxPtr,1)
+#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0)
+#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1)
+
+/* set both tweak words at once */
+#define Skein_Set_T0_T1(ctxPtr,T0,T1)           \
+    {                                           \
+    Skein_Set_T0(ctxPtr,(T0));                  \
+    Skein_Set_T1(ctxPtr,(T1));                  \
+    }
+/* Skein_Set_Type: overwrite all of T[1] with SKEIN_T1_BLK_TYPE_<BLK_TYPE> (the argument is token-pasted onto that prefix) */
+#define Skein_Set_Type(ctxPtr,BLK_TYPE)         \
+    Skein_Set_T1(ctxPtr,SKEIN_T1_BLK_TYPE_##BLK_TYPE)
+
+/* set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0; */
+#define Skein_Start_New_Type(ctxPtr,BLK_TYPE)   \
+    { Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; }
+/* flag helpers: hdr is the header object itself (accessed with '.'), not a pointer to it */
+#define Skein_Clear_First_Flag(hdr)      { (hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST;       }
+#define Skein_Set_Bit_Pad_Flag(hdr)      { (hdr).T[1] |=  SKEIN_T1_FLAG_BIT_PAD;     }
+/* ORs the shifted level into T[1]; tree-level bits that are already set are not cleared first */
+#define Skein_Set_Tree_Level(hdr,height) { (hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height);}
+
+/*****************************************************************
+** "Internal" Skein definitions for debugging and error checking
+******************************************************************/
+#ifdef  SKEIN_DEBUG             /* examine/display intermediate values? */
+#include "skein_debug.h"
+#else                           /* default is no callouts: every Skein_Show_* use expands to nothing */
+#define Skein_Show_Block(bits,ctx,X,blkPtr,wPtr,ksEvenPtr,ksOddPtr)
+#define Skein_Show_Round(bits,ctx,r,X)
+#define Skein_Show_R_Ptr(bits,ctx,r,X_ptr)
+#define Skein_Show_Final(bits,ctx,cnt,outPtr)
+#define Skein_Show_Key(bits,ctx,key,keyBytes)
+#endif
+/* Skein_Assert(x,retCode) guards caller errors, Skein_assert(x) internal errors; one of three build modes is selected below */
+#ifndef SKEIN_ERR_CHECK        /* run-time checks (e.g., bad params, uninitialized context)? */
+#define Skein_Assert(x,retCode)/* default: ignore all Asserts, for performance (x is not evaluated) */
+#define Skein_assert(x)
+#elif   defined(SKEIN_ASSERT)  /* SKEIN_ERR_CHECK + SKEIN_ASSERT: both kinds go through assert() */
+#include <assert.h>     
+#define Skein_Assert(x,retCode) assert(x) 
+#define Skein_assert(x)         assert(x) 
+#else                          /* SKEIN_ERR_CHECK only: caller errors make the enclosing function return retCode */
+#include <assert.h>     
+#define Skein_Assert(x,retCode) { if (!(x)) return retCode; } /*  caller  error */
+#define Skein_assert(x)         assert(x)                     /* internal error */
+#endif
+
+/*****************************************************************
+** Skein block function constants (shared across Ref and Opt code)
+******************************************************************/
+enum    
+    {   /* names are <prefix>_<d>_<j>: the block code passes <prefix>_<d> as ROT and pastes _<j> for the j-th word pair */
+        /* Skein_256 round rotation constants (8 rounds x 2 word pairs) */
+    R_256_0_0=14, R_256_0_1=16,
+    R_256_1_0=52, R_256_1_1=57,
+    R_256_2_0=23, R_256_2_1=40,
+    R_256_3_0= 5, R_256_3_1=37,
+    R_256_4_0=25, R_256_4_1=33,
+    R_256_5_0=46, R_256_5_1=12,
+    R_256_6_0=58, R_256_6_1=22,
+    R_256_7_0=32, R_256_7_1=32,
+
+        /* Skein_512 round rotation constants (8 rounds x 4 word pairs) */
+    R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
+    R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
+    R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39,
+    R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56,
+    R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24,
+    R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17,
+    R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43,
+    R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22,
+
+        /* Skein1024 round rotation constants (8 rounds x 8 word pairs) */
+    R1024_0_0=24, R1024_0_1=13, R1024_0_2= 8, R1024_0_3=47, R1024_0_4= 8, R1024_0_5=17, R1024_0_6=22, R1024_0_7=37,
+    R1024_1_0=38, R1024_1_1=19, R1024_1_2=10, R1024_1_3=55, R1024_1_4=49, R1024_1_5=18, R1024_1_6=23, R1024_1_7=52,
+    R1024_2_0=33, R1024_2_1= 4, R1024_2_2=51, R1024_2_3=13, R1024_2_4=34, R1024_2_5=41, R1024_2_6=59, R1024_2_7=17,
+    R1024_3_0= 5, R1024_3_1=20, R1024_3_2=48, R1024_3_3=41, R1024_3_4=47, R1024_3_5=28, R1024_3_6=16, R1024_3_7=25,
+    R1024_4_0=41, R1024_4_1= 9, R1024_4_2=37, R1024_4_3=31, R1024_4_4=12, R1024_4_5=47, R1024_4_6=44, R1024_4_7=30,
+    R1024_5_0=16, R1024_5_1=34, R1024_5_2=56, R1024_5_3=51, R1024_5_4= 4, R1024_5_5=53, R1024_5_6=42, R1024_5_7=41,
+    R1024_6_0=31, R1024_6_1=44, R1024_6_2=47, R1024_6_3=46, R1024_6_4=19, R1024_6_5=42, R1024_6_6=44, R1024_6_7=25,
+    R1024_7_0= 9, R1024_7_1=48, R1024_7_2=35, R1024_7_3=52, R1024_7_4=23, R1024_7_5=31, R1024_7_6=37, R1024_7_7=20
+    };
+
+#ifndef SKEIN_ROUNDS
+#define SKEIN_256_ROUNDS_TOTAL (72)          /* number of rounds for the different block sizes */
+#define SKEIN_512_ROUNDS_TOTAL (72)
+#define SKEIN1024_ROUNDS_TOTAL (80)
+#else                                        /* allow command-line define in range 8*(5..14); one decimal digit of SKEIN_ROUNDS per block size */
+#define SKEIN_256_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/100) + 5) % 10) + 5))  /* hundreds digit d: 5..9 -> 8*d rounds, 0..4 -> 8*(d+10) rounds */
+#define SKEIN_512_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/ 10) + 5) % 10) + 5))  /* tens digit, same mapping */
+#define SKEIN1024_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS    ) + 5) % 10) + 5))  /* ones digit, same mapping */
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* ifndef _SKEIN_H_ */
diff --git a/Optimized_32bit/skein_block.c b/Optimized_32bit/skein_block.c
new file mode 100644
index 0000000000000..bfd29d1eee2d8
--- /dev/null
+++ b/Optimized_32bit/skein_block.c
@@ -0,0 +1,689 @@
+/***********************************************************************
+**
+** Implementation of the Skein block functions.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+** Compile-time switches:
+**
+**  SKEIN_USE_ASM  -- set bits (256/512/1024) to select which
+**                    versions use ASM code for block processing
+**                    [default: use C for all block sizes]
+**
+************************************************************************/
+
+#include <string.h>
+#include "skein.h"
+
+#ifndef SKEIN_USE_ASM
+#define SKEIN_USE_ASM   (0)                     /* default is all C code (no ASM) */
+#endif
+/* SKEIN_LOOP digits: hundreds = Skein_256, tens = Skein_512, ones = Skein1024; digit 0 means fully unrolled. A leading 0 makes the literal octal in C. */
+#ifndef SKEIN_LOOP
+#define SKEIN_LOOP 001                          /* default: unroll 256 and 512, but not 1024 */
+#endif
+/* the names below are only meaningful inside a block function that declares a local kw[] array and a WCNT constant */
+#define BLK_BITS        (WCNT*64)               /* some useful definitions for code here */
+#define KW_TWK_BASE     (0)                     /* tweak words start at kw[0] */
+#define KW_KEY_BASE     (3)                     /* key schedule words start at kw[3] */
+#define ks              (kw + KW_KEY_BASE)      /* key schedule view into kw[] */
+#define ts              (kw + KW_TWK_BASE)      /* tweak view into kw[] (shares the same array as ks) */
+/* DebugSaveTweak: in SKEIN_DEBUG builds copy the working tweak words back into ctx->h.T; otherwise expands to nothing */
+#ifdef SKEIN_DEBUG
+#define DebugSaveTweak(ctx) { ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; }
+#else
+#define DebugSaveTweak(ctx)
+#endif
+
+/*****************************  Skein_256 ******************************/
+#if !(SKEIN_USE_ASM & 256)                      /* C version is compiled unless bit 256 of SKEIN_USE_ASM is set */
+void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
+    { /* do it in C: process blkCnt consecutive blocks starting at blkPtr, adding byteCntAdd to tweak word T[0] before each */
+    enum
+        {
+        WCNT = SKEIN_256_STATE_WORDS            /* number of 64-bit words handled per block (see BLK_BITS) */
+        };
+#undef  RCNT
+#define RCNT  (SKEIN_256_ROUNDS_TOTAL/8)        /* number of eight-round groups */
+/* SKEIN_UNROLL_256 == 0 selects the fully unrolled code; N > 0 selects a loop with N eight-round groups per pass */
+#ifdef  SKEIN_LOOP                              /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_256 (((SKEIN_LOOP)/100)%10) /* hundreds digit of SKEIN_LOOP */
+#else
+#define SKEIN_UNROLL_256 (0)
+#endif
+
+#if SKEIN_UNROLL_256
+#if (RCNT % SKEIN_UNROLL_256)
+#error "Invalid SKEIN_UNROLL_256"               /* sanity check on unroll count: it must divide RCNT */
+#endif
+    size_t  r;                                  /* loop counter, also the base index into ks/ts for key injection */
+    u64b_t  kw[WCNT+4+RCNT*2];                  /* key schedule words : chaining vars + tweak + "rotation"*/
+#else
+    u64b_t  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
+#endif
+    u64b_t  X0,X1,X2,X3;                        /* local copy of context vars, for speed */
+    u64b_t  w [WCNT];                           /* local copy of input block */
+#ifdef SKEIN_DEBUG
+    const u64b_t *Xptr[4];                      /* use for debugging (help compiler put Xn in registers) */
+    Xptr[0] = &X0;  Xptr[1] = &X1;  Xptr[2] = &X2;  Xptr[3] = &X3;
+#endif
+    Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! (the do/while body runs before blkCnt is tested) */
+    ts[0] = ctx->h.T[0];                        /* work on a local copy of the tweak; it is written back after the loop */
+    ts[1] = ctx->h.T[1];
+    do  {
+        /* this implementation only supports 2**64 input bytes (no carry out here) */
+        ts[0] += byteCntAdd;                    /* update processed length */
+
+        /* precompute the key schedule for this block */
+        ks[0] = ctx->X[0];     
+        ks[1] = ctx->X[1];
+        ks[2] = ctx->X[2];
+        ks[3] = ctx->X[3];
+        ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;  /* extra word: XOR of the key words and the parity constant */
+
+        ts[2] = ts[0] ^ ts[1];                  /* third tweak word is the XOR of the first two */
+
+        Skein_Get64_LSB_First(w,blkPtr,WCNT);   /* get input block in little-endian format */
+        DebugSaveTweak(ctx);
+        Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);
+
+        X0 = w[0] + ks[0];                      /* do the first full key injection */
+        X1 = w[1] + ks[1] + ts[0];
+        X2 = w[2] + ks[2] + ts[1];
+        X3 = w[3] + ks[3];
+
+        Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr);    /* show starting state values */
+
+        blkPtr += SKEIN_256_BLOCK_BYTES;        /* advance to the next input block; w[] already holds this one */
+
+        /* run the rounds */
+/* Round256: one round = add / rotate-left / xor on the word pairs (p0,p1) and (p2,p3); rNum is not used by Round256 itself */
+#define Round256(p0,p1,p2,p3,ROT,rNum)                              \
+    X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \
+    X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \
+
+#if SKEIN_UNROLL_256 == 0                       
+#define R256(p0,p1,p2,p3,ROT,rNum)           /* fully unrolled */   \
+    Round256(p0,p1,p2,p3,ROT,rNum)                                  \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr);
+/* I256(R), unrolled form: R is a literal expression at every use, so the modulo indices are constant expressions */
+#define I256(R)                                                     \
+    X0   += ks[((R)+1) % 5];    /* inject the key schedule value */ \
+    X1   += ks[((R)+2) % 5] + ts[((R)+1) % 3];                      \
+    X2   += ks[((R)+3) % 5] + ts[((R)+2) % 3];                      \
+    X3   += ks[((R)+4) % 5] +     (R)+1;                            \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
+#else                                       /* looping version */
+#define R256(p0,p1,p2,p3,ROT,rNum)                                  \
+    Round256(p0,p1,p2,p3,ROT,rNum)                                  \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr);
+/* I256(R), looping form: no modulo; each injection copies the retired ks/ts word one slot past the current window */
+#define I256(R)                                                     \
+    X0   += ks[r+(R)+0];        /* inject the key schedule value */ \
+    X1   += ks[r+(R)+1] + ts[r+(R)+0];                              \
+    X2   += ks[r+(R)+2] + ts[r+(R)+1];                              \
+    X3   += ks[r+(R)+3] +    r+(R)   ;                              \
+    ks[r + (R)+4    ]   = ks[r+(R)-1];     /* rotate key schedule */\
+    ts[r + (R)+2    ]   = ts[r+(R)-1];     /* same kw[] slot as the ks word just copied */ \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
+/* r advances by 2 per eight-round group because each group performs two key injections */
+    for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_256)  /* loop thru it */
+#endif  
+        {    /* body of the for loop in the looping version, a plain block in the fully unrolled version */
+#define R256_8_rounds(R)                  \
+        R256(0,1,2,3,R_256_0,8*(R) + 1);  \
+        R256(0,3,2,1,R_256_1,8*(R) + 2);  \
+        R256(0,1,2,3,R_256_2,8*(R) + 3);  \
+        R256(0,3,2,1,R_256_3,8*(R) + 4);  \
+        I256(2*(R));                      \
+        R256(0,1,2,3,R_256_4,8*(R) + 5);  \
+        R256(0,3,2,1,R_256_5,8*(R) + 6);  \
+        R256(0,1,2,3,R_256_6,8*(R) + 7);  \
+        R256(0,3,2,1,R_256_7,8*(R) + 8);  \
+        I256(2*(R)+1);
+
+        R256_8_rounds( 0);
+/* R256_Unroll_R(NN): non-zero when eight-round group NN must also be emitted inline (unrolled: NN < RCNT; looping: NN < unroll count) */
+#define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_256 > (NN)))
+
+  #if   R256_Unroll_R( 1)
+        R256_8_rounds( 1);
+  #endif
+  #if   R256_Unroll_R( 2)
+        R256_8_rounds( 2);
+  #endif
+  #if   R256_Unroll_R( 3)
+        R256_8_rounds( 3);
+  #endif
+  #if   R256_Unroll_R( 4)
+        R256_8_rounds( 4);
+  #endif
+  #if   R256_Unroll_R( 5)
+        R256_8_rounds( 5);
+  #endif
+  #if   R256_Unroll_R( 6)
+        R256_8_rounds( 6);
+  #endif
+  #if   R256_Unroll_R( 7)
+        R256_8_rounds( 7);
+  #endif
+  #if   R256_Unroll_R( 8)
+        R256_8_rounds( 8);
+  #endif
+  #if   R256_Unroll_R( 9)
+        R256_8_rounds( 9);
+  #endif
+  #if   R256_Unroll_R(10)
+        R256_8_rounds(10);
+  #endif
+  #if   R256_Unroll_R(11)
+        R256_8_rounds(11);
+  #endif
+  #if   R256_Unroll_R(12)
+        R256_8_rounds(12);
+  #endif
+  #if   R256_Unroll_R(13)
+        R256_8_rounds(13);
+  #endif
+  #if   R256_Unroll_R(14)
+        R256_8_rounds(14);
+  #endif
+  #if  (SKEIN_UNROLL_256 > 14)
+#error  "need more unrolling in Skein_256_Process_Block"
+  #endif
+        }
+        /* do the final "feedforward" xor, update context chaining vars */
+        ctx->X[0] = X0 ^ w[0];
+        ctx->X[1] = X1 ^ w[1];
+        ctx->X[2] = X2 ^ w[2];
+        ctx->X[3] = X3 ^ w[3];
+
+        Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
+
+        ts[1] &= ~SKEIN_T1_FLAG_FIRST;          /* clear the first-block flag for any following block and for the stored tweak */
+        }
+    while (--blkCnt);
+    ctx->h.T[0] = ts[0];                        /* store the updated tweak back into the context */
+    ctx->h.T[1] = ts[1];
+    }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t Skein_256_Process_Block_CodeSize(void)
+    {   /* byte distance from Skein_256_Process_Block to this function; NOTE(review): only a code size if the compiler places them adjacently in this order */
+    return ((u08b_t *) Skein_256_Process_Block_CodeSize) -
+           ((u08b_t *) Skein_256_Process_Block);
+    }
+uint_t Skein_256_Unroll_Cnt(void)
+    {   /* report the SKEIN_UNROLL_256 value this file was compiled with */
+    return SKEIN_UNROLL_256;
+    }
+#endif
+#endif
+
+/*****************************  Skein_512 ******************************/
+#if !(SKEIN_USE_ASM & 512)                      /* C version is compiled unless bit 512 of SKEIN_USE_ASM is set */
+void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
+    { /* do it in C: process blkCnt consecutive blocks starting at blkPtr, adding byteCntAdd to tweak word T[0] before each */
+    enum
+        {
+        WCNT = SKEIN_512_STATE_WORDS            /* number of 64-bit words handled per block (see BLK_BITS) */
+        };
+#undef  RCNT
+#define RCNT  (SKEIN_512_ROUNDS_TOTAL/8)        /* number of eight-round groups */
+/* SKEIN_UNROLL_512 == 0 selects the fully unrolled code; N > 0 selects a loop with N eight-round groups per pass */
+#ifdef  SKEIN_LOOP                              /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_512 (((SKEIN_LOOP)/10)%10) /* tens digit of SKEIN_LOOP */
+#else
+#define SKEIN_UNROLL_512 (0)
+#endif
+
+#if SKEIN_UNROLL_512
+#if (RCNT % SKEIN_UNROLL_512)
+#error "Invalid SKEIN_UNROLL_512"               /* sanity check on unroll count: it must divide RCNT */
+#endif
+    size_t  r;                                  /* loop counter, also the base index into ks/ts for key injection */
+    u64b_t  kw[WCNT+4+RCNT*2];                  /* key schedule words : chaining vars + tweak + "rotation"*/
+#else
+    u64b_t  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
+#endif
+    u64b_t  X0,X1,X2,X3,X4,X5,X6,X7;            /* local copy of vars, for speed */
+    u64b_t  w [WCNT];                           /* local copy of input block */
+#ifdef SKEIN_DEBUG
+    const u64b_t *Xptr[8];                      /* use for debugging (help compiler put Xn in registers) */
+    Xptr[0] = &X0;  Xptr[1] = &X1;  Xptr[2] = &X2;  Xptr[3] = &X3;
+    Xptr[4] = &X4;  Xptr[5] = &X5;  Xptr[6] = &X6;  Xptr[7] = &X7;
+#endif
+
+    Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! (the do/while body runs before blkCnt is tested) */
+    ts[0] = ctx->h.T[0];                        /* work on a local copy of the tweak; it is written back after the loop */
+    ts[1] = ctx->h.T[1];
+    do  {
+        /* this implementation only supports 2**64 input bytes (no carry out here) */
+        ts[0] += byteCntAdd;                    /* update processed length */
+
+        /* precompute the key schedule for this block */
+        ks[0] = ctx->X[0];
+        ks[1] = ctx->X[1];
+        ks[2] = ctx->X[2];
+        ks[3] = ctx->X[3];
+        ks[4] = ctx->X[4];
+        ks[5] = ctx->X[5];
+        ks[6] = ctx->X[6];
+        ks[7] = ctx->X[7];
+        ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ 
+                ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;  /* extra word: XOR of the key words and the parity constant */
+
+        ts[2] = ts[0] ^ ts[1];                  /* third tweak word is the XOR of the first two */
+
+        Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */
+        DebugSaveTweak(ctx);
+        Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);
+
+        X0   = w[0] + ks[0];                    /* do the first full key injection */
+        X1   = w[1] + ks[1];
+        X2   = w[2] + ks[2];
+        X3   = w[3] + ks[3];
+        X4   = w[4] + ks[4];
+        X5   = w[5] + ks[5] + ts[0];
+        X6   = w[6] + ks[6] + ts[1];
+        X7   = w[7] + ks[7];
+
+        blkPtr += SKEIN_512_BLOCK_BYTES;        /* advance to the next input block; w[] already holds this one */
+
+        Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr);
+        /* run the rounds -- Round512: one round = add / rotate-left / xor on four word pairs; rNum is not used by Round512 itself */
+#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                  \
+    X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \
+    X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \
+    X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \
+    X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \
+
+#if SKEIN_UNROLL_512 == 0                       
+#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)      /* unrolled */  \
+    Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr);
+/* I512(R), unrolled form: R is a literal expression at every use, so the modulo indices are constant expressions */
+#define I512(R)                                                     \
+    X0   += ks[((R)+1) % 9];   /* inject the key schedule value */  \
+    X1   += ks[((R)+2) % 9];                                        \
+    X2   += ks[((R)+3) % 9];                                        \
+    X3   += ks[((R)+4) % 9];                                        \
+    X4   += ks[((R)+5) % 9];                                        \
+    X5   += ks[((R)+6) % 9] + ts[((R)+1) % 3];                      \
+    X6   += ks[((R)+7) % 9] + ts[((R)+2) % 3];                      \
+    X7   += ks[((R)+8) % 9] +     (R)+1;                            \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
+#else                                       /* looping version */
+#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
+    Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr);
+/* I512(R), looping form: no modulo; each injection copies the retired ks/ts word one slot past the current window */
+#define I512(R)                                                     \
+    X0   += ks[r+(R)+0];        /* inject the key schedule value */ \
+    X1   += ks[r+(R)+1];                                            \
+    X2   += ks[r+(R)+2];                                            \
+    X3   += ks[r+(R)+3];                                            \
+    X4   += ks[r+(R)+4];                                            \
+    X5   += ks[r+(R)+5] + ts[r+(R)+0];                              \
+    X6   += ks[r+(R)+6] + ts[r+(R)+1];                              \
+    X7   += ks[r+(R)+7] +    r+(R)   ;                              \
+    ks[r +       (R)+8] = ks[r+(R)-1];  /* rotate key schedule */   \
+    ts[r +       (R)+2] = ts[r+(R)-1];  /* same kw[] slot as the ks word just copied */ \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
+/* r advances by 2 per eight-round group because each group performs two key injections */
+    for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_512)   /* loop thru it */
+#endif                         /* end of looped code definitions */
+        {   /* body of the for loop in the looping version, a plain block in the fully unrolled version */
+#define R512_8_rounds(R)  /* do 8 full rounds */  \
+        R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1);   \
+        R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2);   \
+        R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3);   \
+        R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4);   \
+        I512(2*(R));                              \
+        R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5);   \
+        R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6);   \
+        R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7);   \
+        R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8);   \
+        I512(2*(R)+1);        /* and key injection */
+
+        R512_8_rounds( 0);
+/* R512_Unroll_R(NN): non-zero when eight-round group NN must also be emitted inline (unrolled: NN < RCNT; looping: NN < unroll count) */
+#define R512_Unroll_R(NN) ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_512 > (NN)))
+
+  #if   R512_Unroll_R( 1)
+        R512_8_rounds( 1);
+  #endif
+  #if   R512_Unroll_R( 2)
+        R512_8_rounds( 2);
+  #endif
+  #if   R512_Unroll_R( 3)
+        R512_8_rounds( 3);
+  #endif
+  #if   R512_Unroll_R( 4)
+        R512_8_rounds( 4);
+  #endif
+  #if   R512_Unroll_R( 5)
+        R512_8_rounds( 5);
+  #endif
+  #if   R512_Unroll_R( 6)
+        R512_8_rounds( 6);
+  #endif
+  #if   R512_Unroll_R( 7)
+        R512_8_rounds( 7);
+  #endif
+  #if   R512_Unroll_R( 8)
+        R512_8_rounds( 8);
+  #endif
+  #if   R512_Unroll_R( 9)
+        R512_8_rounds( 9);
+  #endif
+  #if   R512_Unroll_R(10)
+        R512_8_rounds(10);
+  #endif
+  #if   R512_Unroll_R(11)
+        R512_8_rounds(11);
+  #endif
+  #if   R512_Unroll_R(12)
+        R512_8_rounds(12);
+  #endif
+  #if   R512_Unroll_R(13)
+        R512_8_rounds(13);
+  #endif
+  #if   R512_Unroll_R(14)
+        R512_8_rounds(14);
+  #endif
+  #if  (SKEIN_UNROLL_512 > 14)
+#error  "need more unrolling in Skein_512_Process_Block"
+  #endif
+        }
+
+        /* do the final "feedforward" xor, update context chaining vars */
+        ctx->X[0] = X0 ^ w[0];
+        ctx->X[1] = X1 ^ w[1];
+        ctx->X[2] = X2 ^ w[2];
+        ctx->X[3] = X3 ^ w[3];
+        ctx->X[4] = X4 ^ w[4];
+        ctx->X[5] = X5 ^ w[5];
+        ctx->X[6] = X6 ^ w[6];
+        ctx->X[7] = X7 ^ w[7];
+        Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
+
+        ts[1] &= ~SKEIN_T1_FLAG_FIRST;          /* clear the first-block flag for any following block and for the stored tweak */
+        }
+    while (--blkCnt);
+    ctx->h.T[0] = ts[0];                        /* store the updated tweak back into the context */
+    ctx->h.T[1] = ts[1];
+    }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t Skein_512_Process_Block_CodeSize(void)
+    {   /* byte distance from Skein_512_Process_Block to this function; NOTE(review): only a code size if the compiler places them adjacently in this order */
+    return ((u08b_t *) Skein_512_Process_Block_CodeSize) -
+           ((u08b_t *) Skein_512_Process_Block);
+    }
+uint_t Skein_512_Unroll_Cnt(void)
+    {   /* report the SKEIN_UNROLL_512 value this file was compiled with */
+    return SKEIN_UNROLL_512;
+    }
+#endif
+#endif
+
+/*****************************  Skein1024 ******************************/
+#if !(SKEIN_USE_ASM & 1024)
+void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
+    { /* do it in C, always looping (unrolled is bigger AND slower!) */
+    enum
+        {
+        WCNT = SKEIN1024_STATE_WORDS
+        };
+#undef  RCNT
+#define RCNT  (SKEIN1024_ROUNDS_TOTAL/8)
+
+#ifdef  SKEIN_LOOP                              /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10)
+#else
+#define SKEIN_UNROLL_1024 (0)
+#endif
+
+#if (SKEIN_UNROLL_1024 != 0)
+#if (RCNT % SKEIN_UNROLL_1024)
+#error "Invalid SKEIN_UNROLL_1024"              /* sanity check on unroll count */
+#endif
+    size_t  r;
+    u64b_t  kw[WCNT+4+RCNT*2];                  /* key schedule words : chaining vars + tweak + "rotation"*/
+#else
+    u64b_t  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
+#endif
+
+    u64b_t  X00,X01,X02,X03,X04,X05,X06,X07,    /* local copy of vars, for speed */
+            X08,X09,X10,X11,X12,X13,X14,X15;
+    u64b_t  w [WCNT];                           /* local copy of input block */
+#ifdef SKEIN_DEBUG
+    const u64b_t *Xptr[16];                     /* use for debugging (help compiler put Xn in registers) */
+    Xptr[ 0] = &X00;  Xptr[ 1] = &X01;  Xptr[ 2] = &X02;  Xptr[ 3] = &X03;
+    Xptr[ 4] = &X04;  Xptr[ 5] = &X05;  Xptr[ 6] = &X06;  Xptr[ 7] = &X07;
+    Xptr[ 8] = &X08;  Xptr[ 9] = &X09;  Xptr[10] = &X10;  Xptr[11] = &X11;
+    Xptr[12] = &X12;  Xptr[13] = &X13;  Xptr[14] = &X14;  Xptr[15] = &X15;
+#endif
+
+    Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! */
+    ts[0] = ctx->h.T[0];
+    ts[1] = ctx->h.T[1];
+    do  {
+        /* this implementation only supports 2**64 input bytes (no carry out here) */
+        ts[0] += byteCntAdd;                    /* update processed length */
+
+        /* precompute the key schedule for this block */
+        ks[ 0] = ctx->X[ 0];
+        ks[ 1] = ctx->X[ 1];
+        ks[ 2] = ctx->X[ 2];
+        ks[ 3] = ctx->X[ 3];
+        ks[ 4] = ctx->X[ 4];
+        ks[ 5] = ctx->X[ 5];
+        ks[ 6] = ctx->X[ 6];
+        ks[ 7] = ctx->X[ 7];
+        ks[ 8] = ctx->X[ 8];
+        ks[ 9] = ctx->X[ 9];
+        ks[10] = ctx->X[10];
+        ks[11] = ctx->X[11];
+        ks[12] = ctx->X[12];
+        ks[13] = ctx->X[13];
+        ks[14] = ctx->X[14];
+        ks[15] = ctx->X[15];
+        ks[16] = ks[ 0] ^ ks[ 1] ^ ks[ 2] ^ ks[ 3] ^
+                 ks[ 4] ^ ks[ 5] ^ ks[ 6] ^ ks[ 7] ^
+                 ks[ 8] ^ ks[ 9] ^ ks[10] ^ ks[11] ^
+                 ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY;
+
+        ts[2]  = ts[0] ^ ts[1];
+
+        Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */
+        DebugSaveTweak(ctx);
+        Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);
+
+        X00    = w[ 0] + ks[ 0];                 /* do the first full key injection */
+        X01    = w[ 1] + ks[ 1];
+        X02    = w[ 2] + ks[ 2];
+        X03    = w[ 3] + ks[ 3];
+        X04    = w[ 4] + ks[ 4];
+        X05    = w[ 5] + ks[ 5];
+        X06    = w[ 6] + ks[ 6];
+        X07    = w[ 7] + ks[ 7];
+        X08    = w[ 8] + ks[ 8];
+        X09    = w[ 9] + ks[ 9];
+        X10    = w[10] + ks[10];
+        X11    = w[11] + ks[11];
+        X12    = w[12] + ks[12];
+        X13    = w[13] + ks[13] + ts[0];
+        X14    = w[14] + ks[14] + ts[1];
+        X15    = w[15] + ks[15];
+
+        Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr);
+
+#define Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rNum) \
+    X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0;   \
+    X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2;   \
+    X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4;   \
+    X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6;   \
+    X##p8 += X##p9; X##p9 = RotL_64(X##p9,ROT##_4); X##p9 ^= X##p8;   \
+    X##pA += X##pB; X##pB = RotL_64(X##pB,ROT##_5); X##pB ^= X##pA;   \
+    X##pC += X##pD; X##pD = RotL_64(X##pD,ROT##_6); X##pD ^= X##pC;   \
+    X##pE += X##pF; X##pF = RotL_64(X##pF,ROT##_7); X##pF ^= X##pE;   \
+
+#if SKEIN_UNROLL_1024 == 0                      
+#define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
+    Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rn,Xptr);
+
+#define I1024(R)                                                      \
+    X00   += ks[((R)+ 1) % 17]; /* inject the key schedule value */   \
+    X01   += ks[((R)+ 2) % 17];                                       \
+    X02   += ks[((R)+ 3) % 17];                                       \
+    X03   += ks[((R)+ 4) % 17];                                       \
+    X04   += ks[((R)+ 5) % 17];                                       \
+    X05   += ks[((R)+ 6) % 17];                                       \
+    X06   += ks[((R)+ 7) % 17];                                       \
+    X07   += ks[((R)+ 8) % 17];                                       \
+    X08   += ks[((R)+ 9) % 17];                                       \
+    X09   += ks[((R)+10) % 17];                                       \
+    X10   += ks[((R)+11) % 17];                                       \
+    X11   += ks[((R)+12) % 17];                                       \
+    X12   += ks[((R)+13) % 17];                                       \
+    X13   += ks[((R)+14) % 17] + ts[((R)+1) % 3];                     \
+    X14   += ks[((R)+15) % 17] + ts[((R)+2) % 3];                     \
+    X15   += ks[((R)+16) % 17] +     (R)+1;                           \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); 
+#else                                       /* looping version */
+#define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
+    Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rn,Xptr);
+
+#define I1024(R)                                                      \
+    X00   += ks[r+(R)+ 0];    /* inject the key schedule value */     \
+    X01   += ks[r+(R)+ 1];                                            \
+    X02   += ks[r+(R)+ 2];                                            \
+    X03   += ks[r+(R)+ 3];                                            \
+    X04   += ks[r+(R)+ 4];                                            \
+    X05   += ks[r+(R)+ 5];                                            \
+    X06   += ks[r+(R)+ 6];                                            \
+    X07   += ks[r+(R)+ 7];                                            \
+    X08   += ks[r+(R)+ 8];                                            \
+    X09   += ks[r+(R)+ 9];                                            \
+    X10   += ks[r+(R)+10];                                            \
+    X11   += ks[r+(R)+11];                                            \
+    X12   += ks[r+(R)+12];                                            \
+    X13   += ks[r+(R)+13] + ts[r+(R)+0];                              \
+    X14   += ks[r+(R)+14] + ts[r+(R)+1];                              \
+    X15   += ks[r+(R)+15] +    r+(R)   ;                              \
+    ks[r  +       (R)+16] = ks[r+(R)-1];  /* rotate key schedule */   \
+    ts[r  +       (R)+ 2] = ts[r+(R)-1];                              \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
+
+    for (r=1;r <= 2*RCNT;r+=2*SKEIN_UNROLL_1024)    /* loop thru it */
+#endif  
+        {
+#define R1024_8_rounds(R)    /* do 8 full rounds */                               \
+        R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_0,8*(R) + 1); \
+        R1024(00,09,02,13,06,11,04,15,10,07,12,03,14,05,08,01,R1024_1,8*(R) + 2); \
+        R1024(00,07,02,05,04,03,06,01,12,15,14,13,08,11,10,09,R1024_2,8*(R) + 3); \
+        R1024(00,15,02,11,06,13,04,09,14,01,08,05,10,03,12,07,R1024_3,8*(R) + 4); \
+        I1024(2*(R));                                                             \
+        R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_4,8*(R) + 5); \
+        R1024(00,09,02,13,06,11,04,15,10,07,12,03,14,05,08,01,R1024_5,8*(R) + 6); \
+        R1024(00,07,02,05,04,03,06,01,12,15,14,13,08,11,10,09,R1024_6,8*(R) + 7); \
+        R1024(00,15,02,11,06,13,04,09,14,01,08,05,10,03,12,07,R1024_7,8*(R) + 8); \
+        I1024(2*(R)+1);
+
+        R1024_8_rounds( 0);
+
+#define R1024_Unroll_R(NN) ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_1024 > (NN)))
+
+  #if   R1024_Unroll_R( 1)
+        R1024_8_rounds( 1);
+  #endif
+  #if   R1024_Unroll_R( 2)
+        R1024_8_rounds( 2);
+  #endif
+  #if   R1024_Unroll_R( 3)
+        R1024_8_rounds( 3);
+  #endif
+  #if   R1024_Unroll_R( 4)
+        R1024_8_rounds( 4);
+  #endif
+  #if   R1024_Unroll_R( 5)
+        R1024_8_rounds( 5);
+  #endif
+  #if   R1024_Unroll_R( 6)
+        R1024_8_rounds( 6);
+  #endif
+  #if   R1024_Unroll_R( 7)
+        R1024_8_rounds( 7);
+  #endif
+  #if   R1024_Unroll_R( 8)
+        R1024_8_rounds( 8);
+  #endif
+  #if   R1024_Unroll_R( 9)
+        R1024_8_rounds( 9);
+  #endif
+  #if   R1024_Unroll_R(10)
+        R1024_8_rounds(10);
+  #endif
+  #if   R1024_Unroll_R(11)
+        R1024_8_rounds(11);
+  #endif
+  #if   R1024_Unroll_R(12)
+        R1024_8_rounds(12);
+  #endif
+  #if   R1024_Unroll_R(13)
+        R1024_8_rounds(13);
+  #endif
+  #if   R1024_Unroll_R(14)
+        R1024_8_rounds(14);
+  #endif
+  #if  (SKEIN_UNROLL_1024 > 14)
+#error  "need more unrolling in Skein_1024_Process_Block"
+  #endif
+        }
+        /* do the final "feedforward" xor, update context chaining vars */
+
+        ctx->X[ 0] = X00 ^ w[ 0];
+        ctx->X[ 1] = X01 ^ w[ 1];
+        ctx->X[ 2] = X02 ^ w[ 2];
+        ctx->X[ 3] = X03 ^ w[ 3];
+        ctx->X[ 4] = X04 ^ w[ 4];
+        ctx->X[ 5] = X05 ^ w[ 5];
+        ctx->X[ 6] = X06 ^ w[ 6];
+        ctx->X[ 7] = X07 ^ w[ 7];
+        ctx->X[ 8] = X08 ^ w[ 8];
+        ctx->X[ 9] = X09 ^ w[ 9];
+        ctx->X[10] = X10 ^ w[10];
+        ctx->X[11] = X11 ^ w[11];
+        ctx->X[12] = X12 ^ w[12];
+        ctx->X[13] = X13 ^ w[13];
+        ctx->X[14] = X14 ^ w[14];
+        ctx->X[15] = X15 ^ w[15];
+
+        Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
+        
+        ts[1] &= ~SKEIN_T1_FLAG_FIRST;
+        blkPtr += SKEIN1024_BLOCK_BYTES;
+        }
+    while (--blkCnt);
+    ctx->h.T[0] = ts[0];
+    ctx->h.T[1] = ts[1];
+    }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t Skein1024_Process_Block_CodeSize(void)
+    {   /* byte distance from Skein1024_Process_Block to this function's own address */
+    return ((u08b_t *) Skein1024_Process_Block_CodeSize) -
+           ((u08b_t *) Skein1024_Process_Block);   /* NOTE(review): presumably relies on link order -- confirm */
+    }
+uint_t Skein1024_Unroll_Cnt(void)
+    {   /* report the compile-time SKEIN_UNROLL_1024 value this file was built with */
+    return SKEIN_UNROLL_1024;
+    }
+#endif
+#endif
diff --git a/Optimized_32bit/skein_debug.c b/Optimized_32bit/skein_debug.c
new file mode 100644
index 0000000000000..fac5038598ea5
--- /dev/null
+++ b/Optimized_32bit/skein_debug.c
@@ -0,0 +1,247 @@
+/***********************************************************************
+**
+** Debug output functions for Skein hashing.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+************************************************************************/
+#include <stdio.h>
+
+#ifdef SKEIN_DEBUG  /* only instantiate this code if SKEIN_DEBUG is on */
+#include "skein.h"
+
+static const char INDENT[] =  "    ";  /* how much to indent on new line */
+
+uint_t skein_DebugFlag = 0;  /* off by default. Must be set externally */
+
+static void Show64_step(size_t cnt,const u64b_t *X,size_t step)
+    {   /* print the cnt words X[0],X[step],X[2*step],... in hex, four per row */
+    size_t n;
+    for (n=0;n < cnt;n++)
+        {
+        if ((n & 3) == 0) fputs(INDENT,stdout);                 /* start of a row    */
+        printf(" %08X.%08X ",(uint_32t)(X[n*step] >> 32),(uint_32t)X[n*step]);
+        if ((n & 3) == 3 || n+1 == cnt) fputs("\n",stdout);     /* row full, or done */
+        fflush(stdout);
+        }
+    }
+
+#define Show64(cnt,X) Show64_step(cnt,X,1)
+
+static void Show64_flag(size_t cnt,const u64b_t *X)
+    {   /* bit 0 of the pointer value is a flag: if set, clear it and show every 2nd word */
+    const size_t addr = (size_t) X;
+    if ((addr & 1) == 0)
+        {
+        Show64_step(cnt,X,1);       /* untagged pointer: consecutive words */
+        return;
+        }
+    Show64_step(cnt,(const u64b_t *) (addr & ~1),2);
+    }
+
+static void Show08(size_t cnt,const u08b_t *b)
+    {   /* hex-dump cnt bytes: 16 per row, with an extra space before every 4th byte */
+    size_t n;
+    for (n=0;n < cnt;n++)
+        {
+        if      ((n & 15) == 0) fputs(INDENT,stdout);
+        else if ((n &  3) == 0) fputs(" ",stdout);
+        printf(" %02X",b[n]);
+        if ((n & 15) == 15 || n+1 == cnt) fputs("\n",stdout);   /* row full, or done */
+        fflush(stdout);
+        }
+    }
+
+/* Return the banner that prefixes each debug dump.  The name is "Threefish"
+** when SKEIN_DEBUG_THREEFISH is set in skein_DebugFlag, otherwise "Skein".
+** An unexpected block size yields a "????" placeholder rather than NULL,
+** because every caller passes the result straight to printf("%s").
+*/
+static const char *AlgoHeader(uint_t bits)
+    {
+    const int isTF = (skein_DebugFlag & SKEIN_DEBUG_THREEFISH) != 0;
+
+    switch (bits)
+        {
+        case  256:  return isTF ? ":Threefish-256: " : ":Skein-256: ";
+        case  512:  return isTF ? ":Threefish-512: " : ":Skein-512: ";
+        case 1024:  return isTF ? ":Threefish-1024:" : ":Skein-1024:";
+        default:    break;      /* unknown size: use the placeholder below */
+        }
+    return isTF ? ":Threefish-????:" : ":Skein-????:";
+    }
+
+void Skein_Show_Final(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t cnt,const u08b_t *outPtr)
+    {   /* dump the cnt output bytes at outPtr when SKEIN_DEBUG_FINAL is set */
+    if (!(skein_DebugFlag & SKEIN_DEBUG_CONFIG) && (h->T[1] & SKEIN_T1_BLK_TYPE_MASK) == SKEIN_T1_BLK_TYPE_CFG)
+        return;                 /* config blocks are shown only with SKEIN_DEBUG_CONFIG */
+    if (!(skein_DebugFlag & SKEIN_DEBUG_FINAL))
+        return;
+    printf("\n%s Final output=\n",AlgoHeader(bits));
+    Show08(cnt,outPtr);
+    printf("    ++++++++++\n");
+    fflush(stdout);
+    }
+
+/* show state after round r, or after a "pseudo-round" (r >= SKEIN_RND_SPECIAL) */
+void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t r,const u64b_t *X)
+    {
+    static uint_t injectNum=0;  /* key injections since the last reset; not multi-thread safe! */
+
+    if (skein_DebugFlag & SKEIN_DEBUG_CONFIG || ((h->T[1] & SKEIN_T1_BLK_TYPE_MASK) != SKEIN_T1_BLK_TYPE_CFG))
+    if (skein_DebugFlag)
+        {
+        if (r >= SKEIN_RND_SPECIAL)
+            {       /* a key injection (or feedforward) point */
+            injectNum = (r == SKEIN_RND_KEY_INITIAL) ? 0 : injectNum+1;
+            if (  skein_DebugFlag & SKEIN_DEBUG_INJECT ||
+                ((skein_DebugFlag & SKEIN_DEBUG_FINAL) && r == SKEIN_RND_FEED_FWD))
+                {
+                printf("\n%s",AlgoHeader(bits));
+                switch (r)
+                    {
+                    case SKEIN_RND_KEY_INITIAL:
+                        printf(" [state after initial key injection]");
+                        break;
+                    case SKEIN_RND_KEY_INJECT:
+                        printf(" [state after key injection #%02u]",injectNum);    /* %u: injectNum is unsigned */
+                        break;
+                    case SKEIN_RND_FEED_FWD:
+                        printf(" [state after plaintext feedforward]");
+                        injectNum = 0;
+                        break;
+                    }
+                printf("=\n");
+                Show64(bits/64,X);
+                if (r== SKEIN_RND_FEED_FWD)
+                    printf("    ----------\n");
+                }
+            }
+        else if (skein_DebugFlag & SKEIN_DEBUG_ROUNDS)
+            {       /* an ordinary round: optionally reorder the words before printing */
+            uint_t j;
+            u64b_t p[SKEIN_MAX_STATE_WORDS];
+            const u08b_t *perm;
+            static const u08b_t PERM_256 [4][ 4] = { { 0,1,2,3 }, { 0,3,2,1 }, { 0,1,2,3 }, { 0,3,2,1 } };
+            static const u08b_t PERM_512 [4][ 8] = { { 0,1,2,3,4,5,6,7 },
+                                                     { 2,1,4,7,6,5,0,3 },
+                                                     { 4,1,6,3,0,5,2,7 },
+                                                     { 6,1,0,7,2,5,4,3 }
+                                                   };
+            static const u08b_t PERM_1024[4][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 },
+                                                     { 0, 9, 2,13, 6,11, 4,15,10, 7,12, 3,14, 5, 8, 1 },
+                                                     { 0, 7, 2, 5, 4, 3, 6, 1,12,15,14,13, 8,11,10, 9 },
+                                                     { 0,15, 2,11, 6,13, 4, 9,14, 1, 8, 5,10, 3,12, 7 }
+                                                   };
+            /* the permuted view is used only when r is not a multiple of 4 (row r&3 of the table) */
+            if ((skein_DebugFlag & SKEIN_DEBUG_PERMUTE) && (r & 3))
+                {
+                printf("\n%s [state after round %2d (permuted)]=\n",AlgoHeader(bits),(int)r);
+                switch (bits)
+                    {
+                    case  256: perm = PERM_256 [r&3];   break;
+                    case  512: perm = PERM_512 [r&3];   break;
+                    default:   perm = PERM_1024[r&3];   break;
+                    }
+                for (j=0;j<bits/64;j++)
+                    p[j] = X[perm[j]];
+                Show64(bits/64,p);
+                }
+            else
+                {
+                printf("\n%s [state after round %2d]=\n",AlgoHeader(bits),(int)r);
+                Show64(bits/64,X);
+                }
+            }
+        }
+    }
+
+/* same as Skein_Show_Round(), but the state is given as an array of pointers to words */
+void Skein_Show_R_Ptr(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t r,const u64b_t *X_ptr[])
+    {
+    u64b_t state[SKEIN_MAX_STATE_WORDS];
+    uint_t n,nWords = bits/64;
+
+    for (n=0;n != nWords;n++)           /* gather the pointed-to words */
+        state[n] = *X_ptr[n];
+    Skein_Show_Round(bits,h,r,state);
+    }
+
+
+/* show the state at the start of a block; each section is gated by its own debug flag */
+void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,const u08b_t *blkPtr,
+                      const u64b_t *wPtr, const u64b_t *ksPtr, const u64b_t *tsPtr)
+    {
+    uint_t n;
+    if (skein_DebugFlag & SKEIN_DEBUG_CONFIG || ((h->T[1] & SKEIN_T1_BLK_TYPE_MASK) != SKEIN_T1_BLK_TYPE_CFG))
+    if (skein_DebugFlag)
+        {
+        if (skein_DebugFlag & SKEIN_DEBUG_HDR)
+            {       /* one-line summary: output size, T0, block type, tweak flags */
+            printf("\n%s Block: outBits=%4u. T0=%06X.",AlgoHeader(bits),(uint_t) h->hashBitLen,(uint_t)h->T[0]);
+            printf(" Type=");
+            n = (uint_t) ((h->T[1] & SKEIN_T1_BLK_TYPE_MASK) >> SKEIN_T1_POS_BLK_TYPE);
+            switch (n)
+                {
+                case SKEIN_BLK_TYPE_KEY:  printf("KEY. ");  break;
+                case SKEIN_BLK_TYPE_CFG:  printf("CFG. ");  break;
+                case SKEIN_BLK_TYPE_PERS: printf("PERS.");  break;
+                case SKEIN_BLK_TYPE_PK :  printf("PK.  ");  break;
+                case SKEIN_BLK_TYPE_KDF:  printf("KDF. ");  break;
+                case SKEIN_BLK_TYPE_MSG:  printf("MSG. ");  break;
+                case SKEIN_BLK_TYPE_OUT:  printf("OUT. ");  break;
+                default:    printf("0x%02X.",n); break;
+                }
+            printf(" Flags=");  /* an unset flag prints blanks of equal width */
+            fputs((h->T[1] & SKEIN_T1_FLAG_FIRST)   ? " First":"      ",stdout);
+            fputs((h->T[1] & SKEIN_T1_FLAG_FINAL)   ? " Final":"      ",stdout);
+            fputs((h->T[1] & SKEIN_T1_FLAG_BIT_PAD) ? " Pad"  :"    "  ,stdout);
+            n = (uint_t) ((h->T[1] & SKEIN_T1_TREE_LVL_MASK) >> SKEIN_T1_POS_TREE_LVL);
+            if (n)
+                printf("  TreeLevel = %02X",n);
+            printf("\n");
+            fflush(stdout);
+            }
+        if (skein_DebugFlag & SKEIN_DEBUG_TWEAK)
+            {
+            printf("  Tweak:\n");
+            Show64(2,h->T);
+            }
+        if (skein_DebugFlag & SKEIN_DEBUG_STATE)
+            {
+            printf("  %s words:\n",(skein_DebugFlag & SKEIN_DEBUG_THREEFISH)?"Key":"State");
+            Show64(bits/64,X);
+            }
+        if (skein_DebugFlag & SKEIN_DEBUG_KEYSCHED)
+            {       /* tsPtr/ksPtr go through Show64_flag(), which inspects bit 0 of the pointer */
+            printf("  Tweak schedule:\n");
+            Show64_flag(3,tsPtr);
+            printf("  Key   schedule:\n");
+            Show64_flag((bits/64)+1,ksPtr);
+            }
+        if (skein_DebugFlag & SKEIN_DEBUG_INPUT_64)
+            {
+            printf("  Input block (words):\n");
+            Show64(bits/64,wPtr);
+            }
+        if (skein_DebugFlag & SKEIN_DEBUG_INPUT_08)
+            {
+            printf("  Input block (bytes):\n");
+            Show08(bits/8,blkPtr);
+            }
+        }
+    }
+
+void Skein_Show_Key(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u08b_t *key,size_t keyBytes)
+    {   /* dump a non-empty MAC key when SKEIN_DEBUG_KEY is set */
+    if (keyBytes == 0) return;              /* nothing to show */
+    if (!(skein_DebugFlag & SKEIN_DEBUG_CONFIG) && (h->T[1] & SKEIN_T1_BLK_TYPE_MASK) == SKEIN_T1_BLK_TYPE_CFG)
+        return;                             /* config blocks are shown only with SKEIN_DEBUG_CONFIG */
+    if (!(skein_DebugFlag & SKEIN_DEBUG_KEY))
+        return;
+    printf("\n%s MAC key = %4u bytes\n",AlgoHeader(bits),(unsigned) keyBytes);
+    Show08(keyBytes,key);
+    }
+#endif
diff --git a/Optimized_32bit/skein_debug.h b/Optimized_32bit/skein_debug.h
new file mode 100644
index 0000000000000..7775c0165c0ac
--- /dev/null
+++ b/Optimized_32bit/skein_debug.h
@@ -0,0 +1,48 @@
+#ifndef _SKEIN_DEBUG_H_
+#define _SKEIN_DEBUG_H_
+/***********************************************************************
+**
+** Interface definitions for Skein hashing debug output.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+************************************************************************/
+
+#ifdef  SKEIN_DEBUG
+/* callout functions used inside Skein code */
+void    Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,const u08b_t *blkPtr,
+                         const u64b_t *wPtr,const u64b_t *ksPtr,const u64b_t *tsPtr);
+void    Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t r,const u64b_t *X);
+void    Skein_Show_R_Ptr(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t r,const u64b_t *X_ptr[]);
+void    Skein_Show_Final(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t cnt,const u08b_t *outPtr);
+void    Skein_Show_Key  (uint_t bits,const Skein_Ctxt_Hdr_t *h,const u08b_t *key,size_t keyBytes);
+
+extern  uint_t skein_DebugFlag;            /* flags to control debug output (0 --> none) */
+
+#define SKEIN_RND_SPECIAL       (1000u)                 /* r >= this marks a pseudo-round, not a round number */
+#define SKEIN_RND_KEY_INITIAL   (SKEIN_RND_SPECIAL+0u)  /* state after the initial key injection */
+#define SKEIN_RND_KEY_INJECT    (SKEIN_RND_SPECIAL+1u)  /* state after a later key injection    */
+#define SKEIN_RND_FEED_FWD      (SKEIN_RND_SPECIAL+2u)  /* state after plaintext feedforward    */
+
+/* flag bits:  skein_DebugFlag  (bit 0 is not assigned) */
+#define SKEIN_DEBUG_KEY         (1u << 1)  /* show MAC key */
+#define SKEIN_DEBUG_CONFIG      (1u << 2)  /* show config block processing */
+#define SKEIN_DEBUG_STATE       (1u << 3)  /* show input state during Show_Block() */
+#define SKEIN_DEBUG_TWEAK       (1u << 4)  /* show input tweak during Show_Block() */
+#define SKEIN_DEBUG_KEYSCHED    (1u << 5)  /* show expanded key schedule */
+#define SKEIN_DEBUG_INPUT_64    (1u << 6)  /* show input block as 64-bit words */
+#define SKEIN_DEBUG_INPUT_08    (1u << 7)  /* show input block as  8-bit bytes */
+#define SKEIN_DEBUG_INJECT      (1u << 8)  /* show state after key injection & feedforward points */
+#define SKEIN_DEBUG_ROUNDS      (1u << 9)  /* show state after all rounds */
+#define SKEIN_DEBUG_FINAL       (1u <<10)  /* show final output of Skein */
+#define SKEIN_DEBUG_HDR         (1u <<11)  /* show block header */
+#define SKEIN_DEBUG_THREEFISH   (1u <<12)  /* use Threefish name instead of Skein */
+#define SKEIN_DEBUG_PERMUTE     (1u <<13)  /* use word permutations */
+#define SKEIN_DEBUG_ALL         ((~0u) & ~(SKEIN_DEBUG_THREEFISH | SKEIN_DEBUG_PERMUTE))
+#define THREEFISH_DEBUG_ALL     (SKEIN_DEBUG_ALL | SKEIN_DEBUG_THREEFISH)
+
+#endif /*  SKEIN_DEBUG    */
+
+#endif /* _SKEIN_DEBUG_H_ */
diff --git a/Optimized_32bit/skein_iv.h b/Optimized_32bit/skein_iv.h
new file mode 100644
index 0000000000000..a8f54a41d3450
--- /dev/null
+++ b/Optimized_32bit/skein_iv.h
@@ -0,0 +1,199 @@
+#ifndef _SKEIN_IV_H_
+#define _SKEIN_IV_H_
+
+#include "skein.h"    /* get Skein macros and types */
+
+/*
+***************** Pre-computed Skein IVs *******************
+**
+** NOTE: these values are not "magic" constants, but
+** are generated using the Threefish block function.
+** They are pre-computed here only for speed; i.e., to
+** avoid the need for a Threefish call during Init().
+**
+** The IV for any fixed hash length may be pre-computed.
+** Only the most common values are included here.
+**
+************************************************************
+**/
+
+#define MK_64 SKEIN_MK_64
+
+/* blkSize =  256 bits. hashSize =  128 bits */
+const u64b_t SKEIN_256_IV_128[] =
+    {
+    MK_64(0xE1111906,0x964D7260),
+    MK_64(0x883DAAA7,0x7C8D811C),
+    MK_64(0x10080DF4,0x91960F7A),
+    MK_64(0xCCF7DDE5,0xB45BC1C2)
+    };
+
+/* blkSize =  256 bits. hashSize =  160 bits */
+const u64b_t SKEIN_256_IV_160[] =
+    {
+    MK_64(0x14202314,0x72825E98),
+    MK_64(0x2AC4E9A2,0x5A77E590),
+    MK_64(0xD47A5856,0x8838D63E),
+    MK_64(0x2DD2E496,0x8586AB7D)
+    };
+
+/* blkSize =  256 bits. hashSize =  224 bits */
+const u64b_t SKEIN_256_IV_224[] =
+    {
+    MK_64(0xC6098A8C,0x9AE5EA0B),
+    MK_64(0x876D5686,0x08C5191C),
+    MK_64(0x99CB88D7,0xD7F53884),
+    MK_64(0x384BDDB1,0xAEDDB5DE)
+    };
+
+/* blkSize =  256 bits. hashSize =  256 bits */
+const u64b_t SKEIN_256_IV_256[] =
+    {
+    MK_64(0xFC9DA860,0xD048B449),
+    MK_64(0x2FCA6647,0x9FA7D833),
+    MK_64(0xB33BC389,0x6656840F),
+    MK_64(0x6A54E920,0xFDE8DA69)
+    };
+
+/* blkSize =  512 bits. hashSize =  128 bits */
+const u64b_t SKEIN_512_IV_128[] =
+    {
+    MK_64(0xA8BC7BF3,0x6FBF9F52),
+    MK_64(0x1E9872CE,0xBD1AF0AA),
+    MK_64(0x309B1790,0xB32190D3),
+    MK_64(0xBCFBB854,0x3F94805C),
+    MK_64(0x0DA61BCD,0x6E31B11B),
+    MK_64(0x1A18EBEA,0xD46A32E3),
+    MK_64(0xA2CC5B18,0xCE84AA82),
+    MK_64(0x6982AB28,0x9D46982D)
+    };
+
+/* blkSize =  512 bits. hashSize =  160 bits */
+const u64b_t SKEIN_512_IV_160[] =
+    {
+    MK_64(0x28B81A2A,0xE013BD91),
+    MK_64(0xC2F11668,0xB5BDF78F),
+    MK_64(0x1760D8F3,0xF6A56F12),
+    MK_64(0x4FB74758,0x8239904F),
+    MK_64(0x21EDE07F,0x7EAF5056),
+    MK_64(0xD908922E,0x63ED70B8),
+    MK_64(0xB8EC76FF,0xECCB52FA),
+    MK_64(0x01A47BB8,0xA3F27A6E)
+    };
+
+/* blkSize =  512 bits. hashSize =  224 bits */
+const u64b_t SKEIN_512_IV_224[] =
+    {
+    MK_64(0xCCD06162,0x48677224),
+    MK_64(0xCBA65CF3,0xA92339EF),
+    MK_64(0x8CCD69D6,0x52FF4B64),
+    MK_64(0x398AED7B,0x3AB890B4),
+    MK_64(0x0F59D1B1,0x457D2BD0),
+    MK_64(0x6776FE65,0x75D4EB3D),
+    MK_64(0x99FBC70E,0x997413E9),
+    MK_64(0x9E2CFCCF,0xE1C41EF7)
+    };
+
+/* blkSize =  512 bits. hashSize =  256 bits */
+const u64b_t SKEIN_512_IV_256[] =
+    {
+    MK_64(0xCCD044A1,0x2FDB3E13),
+    MK_64(0xE8359030,0x1A79A9EB),
+    MK_64(0x55AEA061,0x4F816E6F),
+    MK_64(0x2A2767A4,0xAE9B94DB),
+    MK_64(0xEC06025E,0x74DD7683),
+    MK_64(0xE7A436CD,0xC4746251),
+    MK_64(0xC36FBAF9,0x393AD185),
+    MK_64(0x3EEDBA18,0x33EDFC13)
+    };
+
+/* blkSize =  512 bits. hashSize =  384 bits */
+const u64b_t SKEIN_512_IV_384[] =
+    {
+    MK_64(0xA3F6C6BF,0x3A75EF5F),
+    MK_64(0xB0FEF9CC,0xFD84FAA4),
+    MK_64(0x9D77DD66,0x3D770CFE),
+    MK_64(0xD798CBF3,0xB468FDDA),
+    MK_64(0x1BC4A666,0x8A0E4465),
+    MK_64(0x7ED7D434,0xE5807407),
+    MK_64(0x548FC1AC,0xD4EC44D6),
+    MK_64(0x266E1754,0x6AA18FF8)
+    };
+
+/* blkSize =  512 bits. hashSize =  512 bits */
+const u64b_t SKEIN_512_IV_512[] =
+    {
+    MK_64(0x4903ADFF,0x749C51CE),
+    MK_64(0x0D95DE39,0x9746DF03),
+    MK_64(0x8FD19341,0x27C79BCE),
+    MK_64(0x9A255629,0xFF352CB1),
+    MK_64(0x5DB62599,0xDF6CA7B0),
+    MK_64(0xEABE394C,0xA9D5C3F4),
+    MK_64(0x991112C7,0x1A75B523),
+    MK_64(0xAE18A40B,0x660FCC33)
+    };
+
+/* blkSize = 1024 bits. hashSize =  384 bits */
+const u64b_t SKEIN1024_IV_384[] =
+    {
+    MK_64(0x5102B6B8,0xC1894A35),
+    MK_64(0xFEEBC9E3,0xFE8AF11A),
+    MK_64(0x0C807F06,0xE32BED71),
+    MK_64(0x60C13A52,0xB41A91F6),
+    MK_64(0x9716D35D,0xD4917C38),
+    MK_64(0xE780DF12,0x6FD31D3A),
+    MK_64(0x797846B6,0xC898303A),
+    MK_64(0xB172C2A8,0xB3572A3B),
+    MK_64(0xC9BC8203,0xA6104A6C),
+    MK_64(0x65909338,0xD75624F4),
+    MK_64(0x94BCC568,0x4B3F81A0),
+    MK_64(0x3EBBF51E,0x10ECFD46),
+    MK_64(0x2DF50F0B,0xEEB08542),
+    MK_64(0x3B5A6530,0x0DBC6516),
+    MK_64(0x484B9CD2,0x167BBCE1),
+    MK_64(0x2D136947,0xD4CBAFEA)
+    };
+
+/* blkSize = 1024 bits. hashSize =  512 bits */
+const u64b_t SKEIN1024_IV_512[] =
+    {
+    MK_64(0xCAEC0E5D,0x7C1B1B18),
+    MK_64(0xA01B0E04,0x5F03E802),
+    MK_64(0x33840451,0xED912885),
+    MK_64(0x374AFB04,0xEAEC2E1C),
+    MK_64(0xDF25A0E2,0x813581F7),
+    MK_64(0xE4004093,0x8B12F9D2),
+    MK_64(0xA662D539,0xC2ED39B6),
+    MK_64(0xFA8B85CF,0x45D8C75A),
+    MK_64(0x8316ED8E,0x29EDE796),
+    MK_64(0x053289C0,0x2E9F91B8),
+    MK_64(0xC3F8EF1D,0x6D518B73),
+    MK_64(0xBDCEC3C4,0xD5EF332E),
+    MK_64(0x549A7E52,0x22974487),
+    MK_64(0x67070872,0x5B749816),
+    MK_64(0xB9CD28FB,0xF0581BD1),
+    MK_64(0x0E2940B8,0x15804974)
+    };
+
+/* blkSize = 1024 bits. hashSize = 1024 bits */
+const u64b_t SKEIN1024_IV_1024[] =
+    {
+    MK_64(0xD593DA07,0x41E72355),
+    MK_64(0x15B5E511,0xAC73E00C),
+    MK_64(0x5180E5AE,0xBAF2C4F0),
+    MK_64(0x03BD41D3,0xFCBCAFAF),
+    MK_64(0x1CAEC6FD,0x1983A898),
+    MK_64(0x6E510B8B,0xCDD0589F),
+    MK_64(0x77E2BDFD,0xC6394ADA),
+    MK_64(0xC11E1DB5,0x24DCB0A3),
+    MK_64(0xD6D14AF9,0xC6329AB5),
+    MK_64(0x6A9B0BFC,0x6EB67E0D),
+    MK_64(0x9243C60D,0xCCFF1332),
+    MK_64(0x1A1F1DDE,0x743F02D4),
+    MK_64(0x0996753C,0x10ED0BB8),
+    MK_64(0x6572DD22,0xF2B4969A),
+    MK_64(0x61FD3062,0xD00A579A),
+    MK_64(0x1DE0536E,0x8682E539)
+    };
+
+#endif /* _SKEIN_IV_H_ */
diff --git a/Optimized_32bit/skein_port.h b/Optimized_32bit/skein_port.h
new file mode 100644
index 0000000000000..653302de7467b
--- /dev/null
+++ b/Optimized_32bit/skein_port.h
@@ -0,0 +1,124 @@
+#ifndef _SKEIN_PORT_H_
+#define _SKEIN_PORT_H_
+/*******************************************************************
+**
+** Platform-specific definitions for Skein hash function.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+** Many thanks to Brian Gladman for his portable header files.
+**
+** To port Skein to an "unsupported" platform, change the definitions
+** in this file appropriately.
+** 
+********************************************************************/
+
+#include "brg_types.h"                      /* get integer type definitions */
+
+typedef unsigned int    uint_t;             /* native unsigned integer */
+typedef uint_8t         u08b_t;             /*  8-bit unsigned integer */
+typedef uint_64t        u64b_t;             /* 64-bit unsigned integer */
+
+#ifndef RotL_64     /* an earlier definition of RotL_64, if any, takes precedence */
+#define RotL_64(x,N)    (((x) << (N)) | ((x) >> (64-(N))))  /* rotate left; N==0 would shift right by 64 */
+#endif
+
+/*
+ * Skein is "natively" little-endian (unlike SHA-xxx), for optimal
+ * performance on x86 CPUs.  The Skein code requires the following
+ * definitions for dealing with endianness:
+ *
+ *    SKEIN_NEED_SWAP:  0 for little-endian, 1 for big-endian
+ *    Skein_Put64_LSB_First
+ *    Skein_Get64_LSB_First
+ *    Skein_Swap64
+ *
+ * If SKEIN_NEED_SWAP is defined at compile time, it is used here
+ * along with the portable versions of Put64/Get64/Swap64, which 
+ * are slow in general.
+ *
+ * Otherwise, an "auto-detect" of endianness is attempted below.
+ * If the default handling doesn't work well, the user may insert
+ * platform-specific code instead (e.g., for big-endian CPUs).
+ *
+ */
+#ifndef SKEIN_NEED_SWAP /* compile-time "override" for endianness? */
+
+#include "brg_endian.h"                     /* get endianness selection */
+#if   PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
+    /* here for big-endian CPUs */
+#define SKEIN_NEED_SWAP   (1)
+#elif PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    /* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */
+#define SKEIN_NEED_SWAP   (0)
+#if   PLATFORM_MUST_ALIGN == 0              /* ok to use "fast" versions? */
+#define Skein_Put64_LSB_First(dst08,src64,bCnt) memcpy(dst08,src64,bCnt)
+#define Skein_Get64_LSB_First(dst64,src08,wCnt) memcpy(dst64,src08,8*(wCnt))
+#endif
+#else
+#error "Skein needs endianness setting!"
+#endif
+
+#endif /* ifndef SKEIN_NEED_SWAP */
+
+/*
+ ******************************************************************
+ *      Provide any definitions still needed.
+ ******************************************************************
+ */
+#ifndef Skein_Swap64  /* swap for big-endian, nop for little-endian */
+#if     SKEIN_NEED_SWAP /* reverse the 8 bytes; note that w64 is expanded eight times */
+#define Skein_Swap64(w64)                       \
+  ( (( ((u64b_t)(w64))       & 0xFF) << 56) |   \
+    (((((u64b_t)(w64)) >> 8) & 0xFF) << 48) |   \
+    (((((u64b_t)(w64)) >>16) & 0xFF) << 40) |   \
+    (((((u64b_t)(w64)) >>24) & 0xFF) << 32) |   \
+    (((((u64b_t)(w64)) >>32) & 0xFF) << 24) |   \
+    (((((u64b_t)(w64)) >>40) & 0xFF) << 16) |   \
+    (((((u64b_t)(w64)) >>48) & 0xFF) <<  8) |   \
+    (((((u64b_t)(w64)) >>56) & 0xFF)      ) )
+#else
+#define Skein_Swap64(w64)  (w64)
+#endif
+#endif  /* ifndef Skein_Swap64 */
+
+
+#ifndef Skein_Put64_LSB_First
+void    Skein_Put64_LSB_First(u08b_t *dst,const u64b_t *src,size_t bCnt)
+#ifdef  SKEIN_PORT_CODE /* compile the body only where SKEIN_PORT_CODE is defined */
+    { /* endian-neutral byte extraction: works on any host, but slowly */
+    size_t i;
+
+    for (i=0;i != bCnt;i++)     /* byte i is bits 8*(i%8).. of word i/8 */
+        dst[i] = (u08b_t) (src[i/8] >> (8*(i%8)));
+    }
+#else
+    ;    /* no body requested: this is just the declaration */
+#endif
+#endif   /* ifndef Skein_Put64_LSB_First */
+
+
+#ifndef Skein_Get64_LSB_First
+void    Skein_Get64_LSB_First(u64b_t *dst,const u08b_t *src,size_t wCnt)
+#ifdef  SKEIN_PORT_CODE /* compile the body only where SKEIN_PORT_CODE is defined */
+    { /* endian-neutral: build each word from 8 bytes, least significant first; slow */
+    size_t n;
+
+    for (n=0;n<8*wCnt;n+=8)
+        {
+        const u08b_t *p = src + n;      /* the 8 bytes of word n/8 */
+        /* the shifted bytes occupy disjoint bit ranges, so OR gives the same value as a sum */
+        dst[n/8] = ((u64b_t) p[0]      ) | ((u64b_t) p[1] <<  8) |
+                   ((u64b_t) p[2] << 16) | ((u64b_t) p[3] << 24) |
+                   ((u64b_t) p[4] << 32) | ((u64b_t) p[5] << 40) |
+                   ((u64b_t) p[6] << 48) | ((u64b_t) p[7] << 56);
+        }
+    }
+#else
+    ;    /* no body requested: this is just the declaration */
+#endif
+#endif   /* ifndef Skein_Get64_LSB_First */
+
+#endif   /* ifndef _SKEIN_PORT_H_ */
diff --git a/Optimized_64bit/SHA3api_ref.c b/Optimized_64bit/SHA3api_ref.c
new file mode 100644
index 0000000000000..6861a3e4bffb2
--- /dev/null
+++ b/Optimized_64bit/SHA3api_ref.c
@@ -0,0 +1,115 @@
+/***********************************************************************
+**
+** Implementation of the AHS API using the Skein hash function.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+** 
+************************************************************************/
+
+#include <string.h>     /* get the memcpy/memset functions */
+#include "skein.h"      /* get the Skein API definitions   */
+#include "SHA3api_ref.h"/* get the  AHS  API definitions   */
+
+/******************************************************************/
+/*     AHS API code                                               */
+/******************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* select the context size from hashbitlen and init the matching context */
+HashReturn Init(hashState *state, int hashbitlen)
+    {
+    Skein_Assert(hashbitlen > 0,BAD_HASHLEN);       /* check before the size_t casts below */
+#if SKEIN_256_NIST_MAX_HASHBITS                     /* nonzero enables the Skein-256 branch */
+    if (hashbitlen <= SKEIN_256_NIST_MAX_HASHBITS)
+        {
+        state->statebits = 64*SKEIN_256_STATE_WORDS;
+        return Skein_256_Init(&state->u.ctx_256,(size_t) hashbitlen);
+        }
+#endif
+    if (hashbitlen <= SKEIN_512_NIST_MAX_HASHBITS)
+        {
+        state->statebits = 64*SKEIN_512_STATE_WORDS;
+        return Skein_512_Init(&state->u.ctx_512,(size_t) hashbitlen);
+        }
+    else
+        {
+        state->statebits = 64*SKEIN1024_STATE_WORDS;
+        return Skein1024_Init(&state->u.ctx1024,(size_t) hashbitlen);
+        }
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* process data to be hashed (databitlen is a bit count; it is shifted down to bytes below) */
+HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen)
+    {
+    /* only the final Update() call is allowed to do partial bytes, else assert an error */
+    Skein_Assert((state->u.h.T[1] & SKEIN_T1_FLAG_BIT_PAD) == 0 || databitlen == 0, FAIL);
+
+    Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,FAIL);
+    if ((databitlen & 7) == 0)  /* whole bytes only? */
+        {
+        switch ((state->statebits >> 8) & 3)    /* statebits/256 mod 4:  1 -> 256, 2 -> 512, 0 -> 1024 */
+            {
+            case 2:  return Skein_512_Update(&state->u.ctx_512,data,databitlen >> 3);
+            case 1:  return Skein_256_Update(&state->u.ctx_256,data,databitlen >> 3);
+            case 0:  return Skein1024_Update(&state->u.ctx1024,data,databitlen >> 3);
+            default: return FAIL;
+            }
+        }
+    else
+        {   /* handle partial final byte */
+        size_t bCnt = (databitlen >> 3) + 1;                  /* number of bytes to handle (nonzero here!) */
+        u08b_t b,mask;
+
+        mask = (u08b_t) (1u << (7 - (databitlen & 7)));       /* the bit just below the valid (high) data bits */
+        b    = (u08b_t) ((data[bCnt-1] & (0-mask)) | mask);   /* keep data bits, set pad bit, clear the rest */
+
+        switch ((state->statebits >> 8) & 3)
+            {
+            case 2:  Skein_512_Update(&state->u.ctx_512,data,bCnt-1); /* process all but the final byte    */
+                     Skein_512_Update(&state->u.ctx_512,&b  ,  1   ); /* process the (masked) partial byte */
+                     break;
+            case 1:  Skein_256_Update(&state->u.ctx_256,data,bCnt-1); /* process all but the final byte    */
+                     Skein_256_Update(&state->u.ctx_256,&b  ,  1   ); /* process the (masked) partial byte */
+                     break;
+            case 0:  Skein1024_Update(&state->u.ctx1024,data,bCnt-1); /* process all but the final byte    */
+                     Skein1024_Update(&state->u.ctx1024,&b  ,  1   ); /* process the (masked) partial byte */
+                     break;
+            default: return FAIL;
+            }
+        Skein_Set_Bit_Pad_Flag(state->u.h);                    /* set tweak flag for the final call */
+        
+        return SUCCESS;     /* NOTE: the results of the Update calls above are not checked */
+        }
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* complete the hash and store the result (hashbitlen bits) in hashval */
+HashReturn Final(hashState *state, BitSequence *hashval)
+    {
+    uint_t sel;     /* statebits/256 mod 4:  1 -> 256, 2 -> 512, 0 -> 1024 */
+
+    Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,FAIL);
+    sel = (state->statebits >> 8) & 3;
+    if (sel == 2) return Skein_512_Final(&state->u.ctx_512,hashval);
+    if (sel == 1) return Skein_256_Final(&state->u.ctx_256,hashval);
+    if (sel == 0) return Skein1024_Final(&state->u.ctx1024,hashval);
+    return FAIL;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* all-in-one hash function: Init, Update, Final; returns the first non-SUCCESS status */
+HashReturn Hash(int hashbitlen, const BitSequence *data, /* all-in-one call */
+                DataLength databitlen,BitSequence *hashval)
+    {
+    hashState  state;
+    HashReturn r = Init(&state,hashbitlen);
+    if (r == SUCCESS)
+        { /* stop at the first failing step; hashval is written only if Update succeeded */
+        r = Update(&state,data,databitlen);
+        if (r == SUCCESS) r = Final(&state,hashval);
+        }
+    return r;
+    }
diff --git a/Optimized_64bit/SHA3api_ref.h b/Optimized_64bit/SHA3api_ref.h
new file mode 100644
index 0000000000000..6d62304e59b7e
--- /dev/null
+++ b/Optimized_64bit/SHA3api_ref.h
@@ -0,0 +1,66 @@
+#ifndef _AHS_API_H_
+#define _AHS_API_H_
+
+/***********************************************************************
+**
+** Interface declarations of the AHS API using the Skein hash function.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+** 
+************************************************************************/
+
+#include "skein.h"
+
+/* Status codes returned by the API functions declared below.  Each value is */
+/* an alias for the SKEIN_* code of the same meaning.                         */
+typedef enum
+    {
+    SUCCESS     = SKEIN_SUCCESS,
+    FAIL        = SKEIN_FAIL,
+    BAD_HASHLEN = SKEIN_BAD_HASHLEN
+    }
+    HashReturn;
+
+/* Scalar types used in the API signatures below.  Note that DataLength */
+/* counts bits, not bytes.                                              */
+typedef size_t   DataLength;                /* bit count  type */
+typedef u08b_t   BitSequence;               /* bit stream type */
+
+/* Context for the incremental API: the Skein state size in use, plus the  */
+/* context for that size.  The three contexts share one union, and u.h     */
+/* gives access to the header they have in common.                         */
+typedef struct
+    {
+    uint_t  statebits;                      /* 256, 512, or 1024 */
+    union
+        {
+        Skein_Ctxt_Hdr_t h;                 /* common header "overlay" */
+        Skein_256_Ctxt_t ctx_256;           /* used when statebits ==  256 */
+        Skein_512_Ctxt_t ctx_512;           /* used when statebits ==  512 */
+        Skein1024_Ctxt_t ctx1024;           /* used when statebits == 1024 */
+        } u;
+    }
+    hashState;
+
+/* "incremental" hashing API */
+/*   Init   - set up *state for a hash result of hashbitlen bits             */
+/*   Update - process databitlen BITS of message data                        */
+/*   Final  - finalize the computation and write the result to hashval       */
+/* Every function returns a HashReturn status code.                          */
+HashReturn Init  (hashState *state, int hashbitlen);
+HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen);
+HashReturn Final (hashState *state,       BitSequence *hashval);
+
+/* "all-in-one" call */
+/*   Runs Init, Update and Final on a private hashState for a single buffer. */
+HashReturn Hash  (int hashbitlen,   const BitSequence *data, 
+                  DataLength databitlen,  BitSequence *hashval);
+
+
+/*
+** Re-define the compile-time constants below to change the selection
+** of the Skein state size in the Init() function in SHA3api_ref.c.
+**
+** That is, the NIST API does not allow for explicit selection of the
+** Skein block size, so it must be done implicitly in the Init() function.
+** The selection is controlled by these constants.
+*/
+/* Both constants may be pre-defined by the build; the values below are only */
+/* defaults.                                                                  */
+/* NOTE(review): presumably the largest hashbitlen that Init() serves with   */
+/* the 256-bit state, so the default of 0 would never select it -- confirm   */
+/* against Init() in SHA3api_ref.c.                                           */
+#ifndef SKEIN_256_NIST_MAX_HASHBITS
+#define SKEIN_256_NIST_MAX_HASHBITS (0)
+#endif
+
+/* NOTE(review): presumably the largest hashbitlen served by the 512-bit     */
+/* state, with larger requests going to Skein-1024 -- confirm against Init(). */
+#ifndef SKEIN_512_NIST_MAX_HASHBITS
+#define SKEIN_512_NIST_MAX_HASHBITS (512)
+#endif
+
+#endif  /* ifndef _AHS_API_H_ */
diff --git a/Optimized_64bit/brg_endian.h b/Optimized_64bit/brg_endian.h
new file mode 100644
index 0000000000000..978eb33f08cf1
--- /dev/null
+++ b/Optimized_64bit/brg_endian.h
@@ -0,0 +1,148 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue 20/10/2006
+*/
+
+#ifndef BRG_ENDIAN_H
+#define BRG_ENDIAN_H
+
+#define IS_BIG_ENDIAN      4321 /* PLATFORM_BYTE_ORDER value: byte 0 is most significant (mc68k) */
+#define IS_LITTLE_ENDIAN   1234 /* PLATFORM_BYTE_ORDER value: byte 0 is least significant (i386) */
+
+/* Include the system header(s) where endian defines and byteswap functions may reside */
+#if defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
+#  include <sys/endian.h>
+#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
+      defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
+#  include <machine/endian.h>
+#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
+#  if !defined( __MINGW32__ ) && !defined(AVR)   /* MinGW and AVR builds include neither header */
+#    include <endian.h>
+#    if !defined( __BEOS__ )                      /* BeOS gets endian.h only */
+#      include <byteswap.h>
+#    endif
+#  endif
+#endif  /* no match: nothing is included, detection relies on predefined macros */
+
+/* Now attempt to set the define for platform byte order using any  */
+/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which  */
+/* seem to encompass most endian symbol definitions                 */
+
+#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )   /* both named: BYTE_ORDER must pick one */
+#  if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif                        /* no match: PLATFORM_BYTE_ORDER is left as it was */
+#elif defined( BIG_ENDIAN )     /* only one of the pair is defined: take it as the answer */
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif  /* SYMBOL form */
+
+#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )   /* both named: _BYTE_ORDER must pick one */
+#  if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif                         /* no match: PLATFORM_BYTE_ORDER is left as it was */
+#elif defined( _BIG_ENDIAN )     /* only one of the pair is defined: take it as the answer */
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( _LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif  /* _SYMBOL form */
+
+#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )   /* both named: __BYTE_ORDER must pick one */
+#  if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif                          /* no match: PLATFORM_BYTE_ORDER is left as it was */
+#elif defined( __BIG_ENDIAN )     /* only one of the pair is defined: take it as the answer */
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif  /* __SYMBOL form */
+
+#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )   /* both named: __BYTE_ORDER__ must pick one */
+#  if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif                            /* no match: PLATFORM_BYTE_ORDER is left as it was */
+#elif defined( __BIG_ENDIAN__ )     /* only one of the pair is defined: take it as the answer */
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN__ )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif  /* __SYMBOL__ form */
+
+/*  if the platform byte order could not be determined, then try to */
+/*  set this define using common machine defines                    */
+#if !defined(PLATFORM_BYTE_ORDER)
+
+#if   defined( __alpha__ ) || defined( __alpha ) || defined( i386 )       || \
+      defined( __i386__ )  || defined( _M_I86 )  || defined( _M_IX86 )    || \
+      defined( __OS2__ )   || defined( sun386 )  || defined( __TURBOC__ ) || \
+      defined( vax )       || defined( vms )     || defined( VMS )        || \
+      defined( __VMS )     || defined( _M_X64 )  || defined( AVR )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+
+#elif defined( AMIGA )   || defined( applec )    || defined( __AS400__ )  || \
+      defined( _CRAY )   || defined( __hppa )    || defined( __hp9000 )   || \
+      defined( ibm370 )  || defined( mc68000 )   || defined( m68k )       || \
+      defined( __MRC__ ) || defined( __MVS__ )   || defined( __MWERKS__ ) || \
+      defined( sparc )   || defined( __sparc)    || defined( SYMANTEC_C ) || \
+      defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM )   || \
+      defined( THINK_C ) || defined( __VMCMS__ )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+
+#elif 0     /* **** EDIT HERE IF NECESSARY **** */
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#elif 0     /* **** EDIT HERE IF NECESSARY **** */
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#else
+#  error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order
+#endif
+#endif
+
+/* special handler for IA64, which may be either endianness (?)  */
+/* here we assume little-endian, but this may need to be changed */
+/* IA64 is also the only target in this file that sets           */
+/* PLATFORM_MUST_ALIGN to 1.                                     */
+/* NOTE(review): the fallback block above ends in #error when no */
+/* byte order was found, so PLATFORM_BYTE_ORDER appears to be    */
+/* always defined by the time the #ifndef below is evaluated.    */
+#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
+#  define PLATFORM_MUST_ALIGN (1)
+#ifndef PLATFORM_BYTE_ORDER      /* keep any byte order detected above */
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+#endif
+
+/* Default for every platform that did not set PLATFORM_MUST_ALIGN above */
+/* (or on the compiler command line): 0.                                  */
+#ifndef   PLATFORM_MUST_ALIGN
+#  define PLATFORM_MUST_ALIGN (0)
+#endif
+
+#endif  /* ifndef BRG_ENDIAN_H */
diff --git a/Optimized_64bit/brg_types.h b/Optimized_64bit/brg_types.h
new file mode 100644
index 0000000000000..d6d6cdab9fbfd
--- /dev/null
+++ b/Optimized_64bit/brg_types.h
@@ -0,0 +1,188 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue 09/09/2006
+
+ The unsigned integer types defined here are of the form uint_<nn>t where
+ <nn> is the length of the type; for example, the unsigned 32-bit type is
+ 'uint_32t'.  These are NOT the same as the 'C99 integer types' that are
+ defined in the inttypes.h and stdint.h headers since attempts to use these
+ types have shown that support for them is still highly variable.  However,
+ since the latter are of the form uint<nn>_t, a regular expression search
+ and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t')
+ can be used to convert the types used here to the C99 standard types.
+*/
+
+#ifndef BRG_TYPES_H
+#define BRG_TYPES_H
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include <limits.h>
+
+/* uint_8t: unsigned char, accepted only when it is exactly 8 bits wide. */
+/* The whole block is skipped if BRG_UI8 is already defined.             */
+#ifndef BRG_UI8
+#  define BRG_UI8
+#  if UCHAR_MAX == 255u
+     typedef unsigned char uint_8t;
+#  else
+#    error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h
+#  endif
+#endif
+
+/* uint_16t: unsigned short, accepted only when it is exactly 16 bits wide. */
+/* The whole block is skipped if BRG_UI16 is already defined.               */
+#ifndef BRG_UI16
+#  define BRG_UI16
+#  if USHRT_MAX == 65535u
+     typedef unsigned short uint_16t;
+#  else
+#    error Please define uint_16t as a 16-bit unsigned short type in brg_types.h
+#  endif
+#endif
+
+/* uint_32t: unsigned int when that is exactly 32 bits wide, otherwise      */
+/* unsigned long when that is; anything else is a compile-time error.       */
+/* li_32(h) pastes the hex digits h into a literal carrying the integer     */
+/* suffix that matches the chosen type.                                      */
+/* The whole block is skipped if BRG_UI32 is already defined.               */
+#ifndef BRG_UI32
+#  define BRG_UI32
+#  if UINT_MAX == 4294967295u
+#    define li_32(h) 0x##h##u
+     typedef unsigned int uint_32t;
+#  elif ULONG_MAX == 4294967295u
+#    define li_32(h) 0x##h##ul
+     typedef unsigned long uint_32t;
+#  elif defined( _CRAY )
+#    error This code needs 32-bit data types, which Cray machines do not provide
+#  else
+#    error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h
+#  endif
+#endif
+
+/* uint_64t and li_64(h): pick a 64-bit unsigned type and the matching       */
+/* literal suffix.  Compiler-specific cases are tried first, then the        */
+/* <limits.h> maxima in the order int, long, long long, and finally any      */
+/* GCC-compatible compiler.                                                   */
+/* Unlike the 8/16/32-bit blocks, BRG_UI64 is defined only when a type was   */
+/* found, and no error is raised here if none was.  Note also that once a    */
+/* xxx_MAX branch is entered (the maximum exceeds 32 bits) but is not        */
+/* exactly 2^64-1, nothing is defined and the later branches are not tried.  */
+#ifndef BRG_UI64
+#  if defined( __BORLANDC__ ) && !defined( __MSDOS__ )
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ui64
+     typedef unsigned __int64 uint_64t;
+#  elif defined( _MSC_VER ) && ( _MSC_VER < 1300 )    /* 1300 == VC++ 7.0 */
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ui64
+     typedef unsigned __int64 uint_64t;
+#  elif defined( __sun ) && defined(ULONG_MAX) && ULONG_MAX == 0xfffffffful
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ull
+     typedef unsigned long long uint_64t;
+#  elif defined( UINT_MAX ) && UINT_MAX > 4294967295u
+#    if UINT_MAX == 18446744073709551615u
+#      define BRG_UI64
+#      define li_64(h) 0x##h##u
+       typedef unsigned int uint_64t;
+#    endif
+#  elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u
+#    if ULONG_MAX == 18446744073709551615ul
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ul
+       typedef unsigned long uint_64t;
+#    endif
+#  elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u
+#    if ULLONG_MAX == 18446744073709551615ull
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ull
+       typedef unsigned long long uint_64t;
+#    endif
+#  elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u
+#    if ULONG_LONG_MAX == 18446744073709551615ull
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ull
+       typedef unsigned long long uint_64t;
+#    endif
+#  elif defined(__GNUC__)  /* DLW: avoid mingw problem with -ansi */
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ull
+       typedef unsigned long long uint_64t;
+#  endif
+#endif
+
+/* A missing 64-bit type is an error only for code that asked for one by */
+/* defining NEED_UINT_64T before including this header.                  */
+#if defined( NEED_UINT_64T ) && !defined( BRG_UI64 )
+#  error Please define uint_64t as an unsigned 64 bit type in brg_types.h
+#endif
+
+/* VOID_RETURN / INT_RETURN: the text to write in place of a plain 'void'  */
+/* or 'int' return type.  Depending on DLL_EXPORT, DLL_IMPORT and the      */
+/* compiler, they add __declspec export/import and/or a calling            */
+/* convention; with none of those they reduce to plain void / int.         */
+/* The whole block is skipped if RETURN_VALUES is already defined.         */
+#ifndef RETURN_VALUES
+#  define RETURN_VALUES
+#  if defined( DLL_EXPORT )
+#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+#      define VOID_RETURN    __declspec( dllexport ) void __stdcall
+#      define INT_RETURN     __declspec( dllexport ) int  __stdcall
+#    elif defined( __GNUC__ )
+#      define VOID_RETURN    __declspec( __dllexport__ ) void
+#      define INT_RETURN     __declspec( __dllexport__ ) int
+#    else
+#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#    endif
+#  elif defined( DLL_IMPORT )
+#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+#      define VOID_RETURN    __declspec( dllimport ) void __stdcall
+#      define INT_RETURN     __declspec( dllimport ) int  __stdcall
+#    elif defined( __GNUC__ )
+#      define VOID_RETURN    __declspec( __dllimport__ ) void
+#      define INT_RETURN     __declspec( __dllimport__ ) int
+#    else
+#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#    endif
+#  elif defined( __WATCOMC__ )
+#    define VOID_RETURN  void __cdecl
+#    define INT_RETURN   int  __cdecl
+#  else
+#    define VOID_RETURN  void
+#    define INT_RETURN   int
+#  endif
+#endif
+
+/*  These defines are used to declare buffers in a way that allows
+    faster operations on longer variables to be used.  In all these
+    defines 'size' must be a power of 2 and >= 8, written as a plain
+    literal (8, 16, 32 or 64) because it is pasted into the type name
+    uint_<size>t
+
+    dec_unit_type(size,x)       declares a variable 'x' of length 
+                                'size' bits
+
+    dec_bufr_type(size,bsize,x) declares a buffer 'x' of length 'bsize' 
+                                bytes defined as an array of variables
+                                each of 'size' bits (bsize must be a 
+                                multiple of size / 8); 'bsize' may be
+                                any constant expression
+
+    ptr_cast(x,size)            casts a pointer to a pointer to a 
+                                variable of length 'size' bits
+*/
+
+#define ui_type(size)               uint_##size##t
+#define dec_unit_type(size,x)       typedef ui_type(size) x
+#define dec_bufr_type(size,bsize,x) typedef ui_type(size) x[(bsize) / ((size) >> 3)]
+#define ptr_cast(x,size)            ((ui_type(size)*)(x))
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/Optimized_64bit/skein.c b/Optimized_64bit/skein.c
new file mode 100644
index 0000000000000..c9289cd49e8ef
--- /dev/null
+++ b/Optimized_64bit/skein.c
@@ -0,0 +1,753 @@
+/***********************************************************************
+**
+** Implementation of the Skein hash function.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+** 
+************************************************************************/
+
+#define  SKEIN_PORT_CODE /* instantiate any code in skein_port.h */
+
+#include <string.h>      /* get the memcpy/memset functions */
+#include "skein.h"       /* get the Skein API definitions   */
+#include "skein_iv.h"    /* get precomputed IVs */
+
+/*****************************************************************/
+/* External function to process blkCnt (nonzero) full block(s) of data. */
+/*   ctx        - hashing context to update                              */
+/*   blkPtr     - first byte of the block(s) to process                  */
+/*   blkCnt     - number of consecutive blocks at blkPtr                 */
+/*   byteCntAdd - NOTE(review): presumably the byte count credited to    */
+/*                the tweak per block; callers in this file pass the     */
+/*                block size, SKEIN_CFG_STR_LEN, the buffered byte count */
+/*                or sizeof(u64b_t) -- confirm in skein_block.c          */
+/* These functions are only declared here, not defined in this file.     */
+void    Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd);
+void    Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd);
+void    Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd);
+
+/*****************************************************************/
+/*     256-bit Skein                                             */
+/*****************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Set up ctx for straight hashing with a result of hashBitLen bits.        */
+/* The chaining variables come from a precomputed IV when one exists for    */
+/* hashBitLen, otherwise from processing a freshly built config block.      */
+/* Returns SKEIN_SUCCESS (SKEIN_BAD_HASHLEN via Skein_Assert if             */
+/* hashBitLen is 0).                                                         */
+int Skein_256_Init(Skein_256_Ctxt_t *ctx, size_t hashBitLen)
+    {
+    union
+        {
+        u08b_t  b[SKEIN_256_STATE_BYTES];
+        u64b_t  w[SKEIN_256_STATE_WORDS];
+        } cfg;                              /* config block */
+    const void *iv = NULL;                  /* precomputed IV for hashBitLen, if any */
+
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    ctx->h.hashBitLen = hashBitLen;         /* output hash bit count */
+
+#ifndef SKEIN_NO_PRECOMP
+    /* look for a pre-computed value */
+    if      (hashBitLen == 256) iv = SKEIN_256_IV_256;
+    else if (hashBitLen == 224) iv = SKEIN_256_IV_224;
+    else if (hashBitLen == 160) iv = SKEIN_256_IV_160;
+    else if (hashBitLen == 128) iv = SKEIN_256_IV_128;
+#endif
+
+    if (iv != NULL)
+        {
+        memcpy(ctx->X,iv,sizeof(ctx->X));
+        }
+    else
+        {
+        /* no precomputed IV: derive it from the config block, type == CONFIG */
+        Skein_Start_New_Type(ctx,CFG_FINAL);        /* set tweaks: T0=0; T1=CFG | FINAL */
+
+        memset(&cfg,0,sizeof(cfg));                 /* everything past w[2] stays zero */
+        cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);  /* schema, version */
+        cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+        cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+
+        /* start from all-zero chaining variables and absorb the config block */
+        memset(ctx->X,0,sizeof(ctx->X));
+        Skein_256_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+        }
+
+    /* ctx->X now holds the chaining vars for hashBitLen; get ready for message data */
+    Skein_Start_New_Type(ctx,MSG);              /* T0=0, T1= MSG type */
+
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a MAC and/or tree hash operation */
+/* [identical to Skein_256_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */
+/*   ctx        - context to initialize                                        */
+/*   hashBitLen - output length in bits, must be > 0                           */
+/*   treeInfo   - tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL)    */
+/*   key        - MAC key; may be NULL only when keyBytes == 0                 */
+/*   keyBytes   - key length in bytes; 0 means "no key"                        */
+/* Returns SKEIN_SUCCESS; the Skein_Assert checks report SKEIN_BAD_HASHLEN    */
+/* or SKEIN_FAIL for bad arguments.                                            */
+int Skein_256_InitExt(Skein_256_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes)
+    {
+    union
+        {
+        u08b_t  b[SKEIN_256_STATE_BYTES];
+        u64b_t  w[SKEIN_256_STATE_WORDS];
+        } cfg;                              /* config block */
+        
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL);
+
+    /* compute the initial chaining values ctx->X[], based on key */
+    if (keyBytes == 0)                          /* is there a key? */
+        {                                   
+        memset(ctx->X,0,sizeof(ctx->X));        /* no key: use all zeroes as key for config block */
+        }
+    else                                        /* here to pre-process a key */
+        {
+        Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X));
+        /* do a mini-Init right here */
+        ctx->h.hashBitLen=8*sizeof(ctx->X);     /* set output hash bit count = state size */
+        Skein_Start_New_Type(ctx,KEY);          /* set tweaks: T0 = 0; T1 = KEY type */
+        memset(ctx->X,0,sizeof(ctx->X));        /* zero the initial chaining variables */
+        Skein_256_Update(ctx,key,keyBytes);     /* hash the key */
+        Skein_256_Final_Pad(ctx,cfg.b);         /* put result into cfg.b[] */
+        /* copy over into ctx->X[]: bound the copy by the destination, since   */
+        /* the assertion above only guarantees that cfg.b is at least as large */
+        memcpy(ctx->X,cfg.b,sizeof(ctx->X));
+#if SKEIN_NEED_SWAP
+        {
+        uint_t i;
+        for (i=0;i<SKEIN_256_STATE_WORDS;i++)   /* convert key bytes to context words */
+            ctx->X[i] = Skein_Swap64(ctx->X[i]);
+        }
+#endif
+        }
+    /* build/process the config block, type == CONFIG (could be precomputed for each key) */
+    ctx->h.hashBitLen = hashBitLen;             /* output hash bit count */
+    Skein_Start_New_Type(ctx,CFG_FINAL);
+
+    memset(&cfg.w,0,sizeof(cfg.w));             /* pre-pad cfg.w[] with zeroes */
+    cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+    cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+    cfg.w[2] = Skein_Swap64(treeInfo);          /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+
+    Skein_Show_Key(256,&ctx->h,key,keyBytes);
+
+    /* compute the initial chaining values from config block */
+    Skein_256_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+
+    /* The chaining vars ctx->X are now initialized */
+    /* Set up to process the data message portion of the hash (default) */
+    ctx->h.bCnt = 0;                            /* buffer b[] starts out empty */
+    Skein_Start_New_Type(ctx,MSG);
+    
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Absorb msgByteCnt bytes from msg into the hash state.                      */
+/* Input is staged in ctx->b[]; a block is processed only once more input is  */
+/* known to follow it, so the most recent (possibly full) block always stays  */
+/* buffered.  Returns SKEIN_SUCCESS.                                           */
+int Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
+    {
+    size_t room;                            /* free bytes left in b[] */
+    size_t blocks;                          /* whole blocks hashed directly from msg */
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    if (msgByteCnt + ctx->h.bCnt > SKEIN_256_BLOCK_BYTES)
+        {   /* more than one block's worth in total: at least one block can go */
+        if (ctx->h.bCnt != 0)
+            {   /* top up the buffered data to a full block and flush it */
+            room = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt;
+            if (room != 0)
+                {
+                Skein_assert(room < msgByteCnt);      /* check on our logic here */
+                memcpy(ctx->b + ctx->h.bCnt,msg,room);
+                msg         += room;
+                msgByteCnt  -= room;
+                ctx->h.bCnt += room;
+                }
+            Skein_assert(ctx->h.bCnt == SKEIN_256_BLOCK_BYTES);
+            Skein_256_Process_Block(ctx,ctx->b,1,SKEIN_256_BLOCK_BYTES);
+            ctx->h.bCnt = 0;
+            }
+        if (msgByteCnt > SKEIN_256_BLOCK_BYTES)
+            {   /* hash full blocks straight from the input, holding the last one back */
+            blocks = (msgByteCnt-1) / SKEIN_256_BLOCK_BYTES;
+            Skein_256_Process_Block(ctx,msg,blocks,SKEIN_256_BLOCK_BYTES);
+            msg        += blocks * SKEIN_256_BLOCK_BYTES;
+            msgByteCnt -= blocks * SKEIN_256_BLOCK_BYTES;
+            }
+        Skein_assert(ctx->h.bCnt == 0);
+        }
+
+    if (msgByteCnt != 0)
+        {   /* stash whatever is left in b[] */
+        Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES);
+        memcpy(ctx->b + ctx->h.bCnt,msg,msgByteCnt);
+        ctx->h.bCnt += msgByteCnt;
+        }
+
+    return SKEIN_SUCCESS;
+    }
+   
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finalize the hash computation and output the result */
+/*   ctx     - context holding the buffered tail of the message             */
+/*   hashVal - receives (ctx->h.hashBitLen + 7) / 8 bytes of output         */
+/* Returns SKEIN_SUCCESS.  ctx->b[] is overwritten with the output counter, */
+/* so the buffered message data is gone after this call.                    */
+int Skein_256_Final(Skein_256_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    size_t i,n,byteCnt;
+    u64b_t X[SKEIN_256_STATE_WORDS];
+    Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;                 /* tag as the final block */
+    if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES)            /* zero pad b[] if necessary */
+        memset(&ctx->b[ctx->h.bCnt],0,SKEIN_256_BLOCK_BYTES - ctx->h.bCnt);
+
+    Skein_256_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);  /* process the final block */
+    
+    /* now output the result */
+    byteCnt = (ctx->h.hashBitLen + 7) >> 3;             /* total number of output bytes */
+
+    /* run Threefish in "counter mode" to generate output */
+    memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
+    memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
+    for (i=0;i*SKEIN_256_BLOCK_BYTES < byteCnt;i++)    /* one pass per output block */
+        {
+        /* the counter is stored through a u64b_t view of the byte buffer b[] */
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+        Skein_Start_New_Type(ctx,OUT_FINAL);
+        Skein_256_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+        n = byteCnt - i*SKEIN_256_BLOCK_BYTES;   /* number of output bytes left to go */
+        if (n >= SKEIN_256_BLOCK_BYTES)          /* emit at most one block per pass */
+            n  = SKEIN_256_BLOCK_BYTES;
+        Skein_Put64_LSB_First(hashVal+i*SKEIN_256_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
+        Skein_Show_Final(256,&ctx->h,n,hashVal+i*SKEIN_256_BLOCK_BYTES);
+        memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+        }
+    return SKEIN_SUCCESS;
+    }
+
+/* Built only for code-size / performance measurement builds.                 */
+/* Returns the byte distance from the address of Skein_256_Init to the        */
+/* address of this function.                                                  */
+/* NOTE(review): this is a code-size figure for the 256-bit API only if the   */
+/* toolchain places these functions contiguously in source order; casting     */
+/* function pointers to u08b_t* is also outside what ISO C guarantees.        */
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t Skein_256_API_CodeSize(void)
+    {
+    return ((u08b_t *) Skein_256_API_CodeSize) -
+           ((u08b_t *) Skein_256_Init);
+    }
+#endif
+
+/*****************************************************************/
+/*     512-bit Skein                                             */
+/*****************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Set up ctx for straight hashing with a result of hashBitLen bits.        */
+/* The chaining variables come from a precomputed IV when one exists for    */
+/* hashBitLen, otherwise from processing a freshly built config block.      */
+/* Returns SKEIN_SUCCESS (SKEIN_BAD_HASHLEN via Skein_Assert if             */
+/* hashBitLen is 0).                                                         */
+int Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen)
+    {
+    union
+        {
+        u08b_t  b[SKEIN_512_STATE_BYTES];
+        u64b_t  w[SKEIN_512_STATE_WORDS];
+        } cfg;                              /* config block */
+    const void *iv = NULL;                  /* precomputed IV for hashBitLen, if any */
+
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    ctx->h.hashBitLen = hashBitLen;         /* output hash bit count */
+
+#ifndef SKEIN_NO_PRECOMP
+    /* look for a pre-computed value */
+    if      (hashBitLen == 512) iv = SKEIN_512_IV_512;
+    else if (hashBitLen == 384) iv = SKEIN_512_IV_384;
+    else if (hashBitLen == 256) iv = SKEIN_512_IV_256;
+    else if (hashBitLen == 224) iv = SKEIN_512_IV_224;
+#endif
+
+    if (iv != NULL)
+        {
+        memcpy(ctx->X,iv,sizeof(ctx->X));
+        }
+    else
+        {
+        /* no precomputed IV: derive it from the config block, type == CONFIG */
+        Skein_Start_New_Type(ctx,CFG_FINAL);        /* set tweaks: T0=0; T1=CFG | FINAL */
+
+        memset(&cfg,0,sizeof(cfg));                 /* everything past w[2] stays zero */
+        cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);  /* schema, version */
+        cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+        cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+
+        /* start from all-zero chaining variables and absorb the config block */
+        memset(ctx->X,0,sizeof(ctx->X));
+        Skein_512_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+        }
+
+    /* ctx->X now holds the chaining vars for hashBitLen; get ready for message data */
+    Skein_Start_New_Type(ctx,MSG);              /* T0=0, T1= MSG type */
+
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a MAC and/or tree hash operation */
+/* [identical to Skein_512_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */
+/*   ctx        - context to initialize                                        */
+/*   hashBitLen - output length in bits, must be > 0                           */
+/*   treeInfo   - tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL)    */
+/*   key        - MAC key; may be NULL only when keyBytes == 0                 */
+/*   keyBytes   - key length in bytes; 0 means "no key"                        */
+/* Returns SKEIN_SUCCESS; the Skein_Assert checks report SKEIN_BAD_HASHLEN    */
+/* or SKEIN_FAIL for bad arguments.                                            */
+int Skein_512_InitExt(Skein_512_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes)
+    {
+    union
+        {
+        u08b_t  b[SKEIN_512_STATE_BYTES];
+        u64b_t  w[SKEIN_512_STATE_WORDS];
+        } cfg;                              /* config block */
+        
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL);
+
+    /* compute the initial chaining values ctx->X[], based on key */
+    if (keyBytes == 0)                          /* is there a key? */
+        {                                   
+        memset(ctx->X,0,sizeof(ctx->X));        /* no key: use all zeroes as key for config block */
+        }
+    else                                        /* here to pre-process a key */
+        {
+        Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X));
+        /* do a mini-Init right here */
+        ctx->h.hashBitLen=8*sizeof(ctx->X);     /* set output hash bit count = state size */
+        Skein_Start_New_Type(ctx,KEY);          /* set tweaks: T0 = 0; T1 = KEY type */
+        memset(ctx->X,0,sizeof(ctx->X));        /* zero the initial chaining variables */
+        Skein_512_Update(ctx,key,keyBytes);     /* hash the key */
+        Skein_512_Final_Pad(ctx,cfg.b);         /* put result into cfg.b[] */
+        /* copy over into ctx->X[]: bound the copy by the destination, since   */
+        /* the assertion above only guarantees that cfg.b is at least as large */
+        memcpy(ctx->X,cfg.b,sizeof(ctx->X));
+#if SKEIN_NEED_SWAP
+        {
+        uint_t i;
+        for (i=0;i<SKEIN_512_STATE_WORDS;i++)   /* convert key bytes to context words */
+            ctx->X[i] = Skein_Swap64(ctx->X[i]);
+        }
+#endif
+        }
+    /* build/process the config block, type == CONFIG (could be precomputed for each key) */
+    ctx->h.hashBitLen = hashBitLen;             /* output hash bit count */
+    Skein_Start_New_Type(ctx,CFG_FINAL);
+
+    memset(&cfg.w,0,sizeof(cfg.w));             /* pre-pad cfg.w[] with zeroes */
+    cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+    cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+    cfg.w[2] = Skein_Swap64(treeInfo);          /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+
+    Skein_Show_Key(512,&ctx->h,key,keyBytes);
+
+    /* compute the initial chaining values from config block */
+    Skein_512_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+
+    /* The chaining vars ctx->X are now initialized */
+    /* Set up to process the data message portion of the hash (default) */
+    ctx->h.bCnt = 0;                            /* buffer b[] starts out empty */
+    Skein_Start_New_Type(ctx,MSG);
+    
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Absorb msgByteCnt bytes from msg into the hash state.                      */
+/* Input is staged in ctx->b[]; a block is processed only once more input is  */
+/* known to follow it, so the most recent (possibly full) block always stays  */
+/* buffered.  Returns SKEIN_SUCCESS.                                           */
+int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
+    {
+    size_t room;                            /* free bytes left in b[] */
+    size_t blocks;                          /* whole blocks hashed directly from msg */
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES)
+        {   /* more than one block's worth in total: at least one block can go */
+        if (ctx->h.bCnt != 0)
+            {   /* top up the buffered data to a full block and flush it */
+            room = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;
+            if (room != 0)
+                {
+                Skein_assert(room < msgByteCnt);      /* check on our logic here */
+                memcpy(ctx->b + ctx->h.bCnt,msg,room);
+                msg         += room;
+                msgByteCnt  -= room;
+                ctx->h.bCnt += room;
+                }
+            Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES);
+            Skein_512_Process_Block(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES);
+            ctx->h.bCnt = 0;
+            }
+        if (msgByteCnt > SKEIN_512_BLOCK_BYTES)
+            {   /* hash full blocks straight from the input, holding the last one back */
+            blocks = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES;
+            Skein_512_Process_Block(ctx,msg,blocks,SKEIN_512_BLOCK_BYTES);
+            msg        += blocks * SKEIN_512_BLOCK_BYTES;
+            msgByteCnt -= blocks * SKEIN_512_BLOCK_BYTES;
+            }
+        Skein_assert(ctx->h.bCnt == 0);
+        }
+
+    if (msgByteCnt != 0)
+        {   /* stash whatever is left in b[] */
+        Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES);
+        memcpy(ctx->b + ctx->h.bCnt,msg,msgByteCnt);
+        ctx->h.bCnt += msgByteCnt;
+        }
+
+    return SKEIN_SUCCESS;
+    }
+   
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Skein_512_Final: process the last buffered block with the FINAL flag set,
+ * then write (hashBitLen+7)/8 bytes to hashVal, generated block by block in
+ * "counter mode" from the resulting chaining value.  Returns SKEIN_SUCCESS. */
+int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
+{
+    size_t blkIdx,outOff,chunk,outBytes;
+    u64b_t keyCopy[SKEIN_512_STATE_WORDS];      /* chaining value reused for every output block */
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* last message block: flag it, zero the unused tail of b[], process it */
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+    if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) {
+        memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+    }
+    Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);
+
+    outBytes = (ctx->h.hashBitLen + 7) >> 3;    /* bit length rounded up to bytes */
+    memset(ctx->b,0,sizeof(ctx->b));            /* b[] now carries only the block counter */
+    memcpy(keyCopy,ctx->X,sizeof(keyCopy));
+    for (blkIdx=0,outOff=0; outOff < outBytes; blkIdx++,outOff += SKEIN_512_BLOCK_BYTES) {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) blkIdx);
+        Skein_Start_New_Type(ctx,OUT_FINAL);
+        Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t));
+        chunk = outBytes - outOff;               /* bytes still owed to the caller */
+        if (chunk > SKEIN_512_BLOCK_BYTES)
+            chunk = SKEIN_512_BLOCK_BYTES;
+        Skein_Put64_LSB_First(hashVal+outOff,ctx->X,chunk);
+        Skein_Show_Final(512,&ctx->h,chunk,hashVal+outOff);
+        memcpy(ctx->X,keyCopy,sizeof(keyCopy));  /* undo the state change before the next block */
+    }
+    return SKEIN_SUCCESS;
+}
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t Skein_512_API_CodeSize(void)
+{   /* address distance in bytes from Skein_512_Init to this function; NOTE(review): presumably assumes source-order code layout */
+    const u08b_t *apiStart = (const u08b_t *) Skein_512_Init;
+    return (size_t) (((const u08b_t *) Skein_512_API_CodeSize) - apiStart);
+}
+#endif
+
+/*****************************************************************/
+/*    1024-bit Skein                                             */
+/*****************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Skein1024_Init: prepare ctx for plain hashing (sequential tree info, no
+ * key) with a hashBitLen-bit result.
+ * The chaining value ctx->X comes from a precomputed IV table for 384, 512
+ * and 1024 bits (unless SKEIN_NO_PRECOMP is defined); otherwise it is derived
+ * by processing a config block.  Returns SKEIN_SUCCESS; hashBitLen == 0 is
+ * rejected with SKEIN_BAD_HASHLEN when Skein_Assert returns error codes. */
+int Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen)
+{
+    union {
+        u08b_t  b[SKEIN1024_STATE_BYTES];
+        u64b_t  w[SKEIN1024_STATE_WORDS];
+    } cfgBlk;                                   /* config block: byte view and word view */
+
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    ctx->h.hashBitLen = hashBitLen;
+
+    switch (hashBitLen) {
+#ifndef SKEIN_NO_PRECOMP
+        case  384: memcpy(ctx->X,SKEIN1024_IV_384 ,sizeof(ctx->X)); break;
+        case  512: memcpy(ctx->X,SKEIN1024_IV_512 ,sizeof(ctx->X)); break;
+        case 1024: memcpy(ctx->X,SKEIN1024_IV_1024,sizeof(ctx->X)); break;
+#endif
+        default:
+            /* no table entry: run a config block (T0=0, T1=FIRST|CFG|FINAL)
+             * through the block function, starting from all-zero X */
+            Skein_Start_New_Type(ctx,CFG_FINAL);
+
+            memset(&cfgBlk,0,sizeof(cfgBlk));   /* words 3.. stay zero */
+            cfgBlk.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+            cfgBlk.w[1] = Skein_Swap64(hashBitLen);
+            cfgBlk.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+
+            memset(ctx->X,0,sizeof(ctx->X));
+            Skein1024_Process_Block(ctx,cfgBlk.b,1,SKEIN_CFG_STR_LEN);
+            break;
+    }
+
+    Skein_Start_New_Type(ctx,MSG);              /* ready for message data: T0=0, T1=FIRST|MSG, bCnt=0 */
+
+    return SKEIN_SUCCESS;
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a MAC and/or tree hash operation */
+/* [identical to Skein1024_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */
+/*   hashBitLen : size of the hash result in bits; must be > 0                */
+/*   treeInfo   : value stored in word 2 of the config block                  */
+/*   key        : MAC key, keyBytes long; may be NULL only if keyBytes == 0   */
+/* returns SKEIN_SUCCESS, or (error-returning Skein_Assert only) SKEIN_BAD_HASHLEN / SKEIN_FAIL */
+int Skein1024_InitExt(Skein1024_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes)
+    {
+    union
+        {
+        u08b_t  b[SKEIN1024_STATE_BYTES];
+        u64b_t  w[SKEIN1024_STATE_WORDS];
+        } cfg;                              /* config block */
+
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL);
+
+    /* compute the initial chaining values ctx->X[], based on key */
+    if (keyBytes == 0)                          /* is there a key? */
+        {
+        memset(ctx->X,0,sizeof(ctx->X));        /* no key: use all zeroes as key for config block */
+        }
+    else                                        /* here to pre-process a key */
+        {
+        Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X));
+        /* do a mini-Init right here */
+        ctx->h.hashBitLen=8*sizeof(ctx->X);     /* set output hash bit count = state size */
+        Skein_Start_New_Type(ctx,KEY);          /* set tweaks: T0 = 0; T1 = KEY type */
+        memset(ctx->X,0,sizeof(ctx->X));        /* zero the initial chaining variables */
+        Skein1024_Update(ctx,key,keyBytes);     /* hash the key */
+        Skein1024_Final_Pad(ctx,cfg.b);         /* put result into cfg.b[] */
+        /* bound the copy by the destination: the assert above only promises cfg.b is at least as large as ctx->X */
+        memcpy(ctx->X,cfg.b,sizeof(ctx->X));    /* copy over into ctx->X[] */
+#if SKEIN_NEED_SWAP
+        {
+        uint_t i;
+        for (i=0;i<SKEIN1024_STATE_WORDS;i++)   /* convert key bytes to context words */
+            ctx->X[i] = Skein_Swap64(ctx->X[i]);
+        }
+#endif
+        }
+    /* build/process the config block, type == CONFIG (could be precomputed for each key) */
+    ctx->h.hashBitLen = hashBitLen;             /* output hash bit count */
+    Skein_Start_New_Type(ctx,CFG_FINAL);
+    memset(&cfg.w,0,sizeof(cfg.w));             /* pre-pad cfg.w[] with zeroes; this also wipes the key-derived bytes */
+    cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+    cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+    cfg.w[2] = Skein_Swap64(treeInfo);          /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+    Skein_Show_Key(1024,&ctx->h,key,keyBytes);
+
+    /* compute the initial chaining values from config block */
+    Skein1024_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+    /* ctx->X is now initialized; set up to process the data message portion of the hash (default) */
+    ctx->h.bCnt = 0;                            /* buffer b[] starts out empty */
+    Skein_Start_New_Type(ctx,MSG);
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Skein1024_Update: feed msgByteCnt bytes of msg into the hash.  Input is
+ * consumed in whole blocks, but the trailing bytes (up to one full block) are
+ * left buffered in ctx->b[] rather than processed here.  Returns SKEIN_SUCCESS,
+ * or SKEIN_FAIL from the bCnt check when Skein_Assert returns error codes. */
+int Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
+{
+    size_t room;                /* unused bytes at the end of ctx->b[] */
+    size_t blkCnt;              /* whole blocks consumed directly from msg */
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* only process when buffered + new data exceeds one block */
+    if (msgByteCnt + ctx->h.bCnt > SKEIN1024_BLOCK_BYTES) {
+        if (ctx->h.bCnt != 0) {
+            /* top up the partially filled buffer, then process it */
+            room = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt;
+            if (room != 0) {
+                Skein_assert(room < msgByteCnt);
+                memcpy(&ctx->b[ctx->h.bCnt],msg,room);
+                msg         += room;
+                msgByteCnt  -= room;
+                ctx->h.bCnt += room;
+            }
+            Skein_assert(ctx->h.bCnt == SKEIN1024_BLOCK_BYTES);
+            Skein1024_Process_Block(ctx,ctx->b,1,SKEIN1024_BLOCK_BYTES);
+            ctx->h.bCnt = 0;
+        }
+        if (msgByteCnt > SKEIN1024_BLOCK_BYTES) {
+            /* "-1": an exact multiple still leaves one full block behind */
+            blkCnt = (msgByteCnt-1) / SKEIN1024_BLOCK_BYTES;
+            Skein1024_Process_Block(ctx,msg,blkCnt,SKEIN1024_BLOCK_BYTES);
+            msg        += blkCnt * SKEIN1024_BLOCK_BYTES;
+            msgByteCnt -= blkCnt * SKEIN1024_BLOCK_BYTES;
+        }
+        Skein_assert(ctx->h.bCnt == 0);
+    }
+
+    /* whatever is left (at most one block) waits in b[] */
+    if (msgByteCnt != 0) {
+        Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES);
+        memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
+        ctx->h.bCnt += msgByteCnt;
+    }
+
+    return SKEIN_SUCCESS;
+}
+   
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Skein1024_Final: process the last buffered block with the FINAL flag set,
+ * then write (hashBitLen+7)/8 bytes to hashVal, generated block by block in
+ * "counter mode" from the resulting chaining value.  Returns SKEIN_SUCCESS. */
+int Skein1024_Final(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
+{
+    size_t blkIdx,outOff,chunk,outBytes;
+    u64b_t keyCopy[SKEIN1024_STATE_WORDS];      /* chaining value reused for every output block */
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* last message block: flag it, zero the unused tail of b[], process it */
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+    if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES) {
+        memset(&ctx->b[ctx->h.bCnt],0,SKEIN1024_BLOCK_BYTES - ctx->h.bCnt);
+    }
+    Skein1024_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);
+
+    outBytes = (ctx->h.hashBitLen + 7) >> 3;    /* bit length rounded up to bytes */
+    memset(ctx->b,0,sizeof(ctx->b));            /* b[] now carries only the block counter */
+    memcpy(keyCopy,ctx->X,sizeof(keyCopy));
+    for (blkIdx=0,outOff=0; outOff < outBytes; blkIdx++,outOff += SKEIN1024_BLOCK_BYTES) {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) blkIdx);
+        Skein_Start_New_Type(ctx,OUT_FINAL);
+        Skein1024_Process_Block(ctx,ctx->b,1,sizeof(u64b_t));
+        chunk = outBytes - outOff;               /* bytes still owed to the caller */
+        if (chunk > SKEIN1024_BLOCK_BYTES)
+            chunk = SKEIN1024_BLOCK_BYTES;
+        Skein_Put64_LSB_First(hashVal+outOff,ctx->X,chunk);
+        Skein_Show_Final(1024,&ctx->h,chunk,hashVal+outOff);
+        memcpy(ctx->X,keyCopy,sizeof(keyCopy));  /* undo the state change before the next block */
+    }
+    return SKEIN_SUCCESS;
+}
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t Skein1024_API_CodeSize(void)
+{   /* address distance in bytes from Skein1024_Init to this function; NOTE(review): presumably assumes source-order code layout */
+    const u08b_t *apiStart = (const u08b_t *) Skein1024_Init;
+    return (size_t) (((const u08b_t *) Skein1024_API_CodeSize) - apiStart);
+}
+#endif
+
+/**************** Functions to support MAC/tree hashing ***************/
+/*   (this code is identical for Optimized and Reference versions)    */
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Skein_256_Final_Pad: zero-pad and process the buffered block with the FINAL flag set, then
+ * output the chaining words ctx->X (SKEIN_256_BLOCK_BYTES bytes) to hashVal; no OUTPUT stage is run. */
+int Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, u08b_t *hashVal)
+{
+    const size_t used = ctx->h.bCnt;            /* bytes currently held in b[] */
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+    if (used < SKEIN_256_BLOCK_BYTES)
+        memset(ctx->b + used,0,SKEIN_256_BLOCK_BYTES - used);
+    Skein_256_Process_Block(ctx,ctx->b,1,used);
+    Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN_256_BLOCK_BYTES);
+    return SKEIN_SUCCESS;
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Skein_512_Final_Pad: zero-pad and process the buffered block with the FINAL flag set, then
+ * output the chaining words ctx->X (SKEIN_512_BLOCK_BYTES bytes) to hashVal; no OUTPUT stage is run. */
+int Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
+{
+    const size_t used = ctx->h.bCnt;            /* bytes currently held in b[] */
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+    if (used < SKEIN_512_BLOCK_BYTES)
+        memset(ctx->b + used,0,SKEIN_512_BLOCK_BYTES - used);
+    Skein_512_Process_Block(ctx,ctx->b,1,used);
+    Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN_512_BLOCK_BYTES);
+    return SKEIN_SUCCESS;
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Skein1024_Final_Pad: zero-pad and process the buffered block with the FINAL flag set, then
+ * output the chaining words ctx->X (SKEIN1024_BLOCK_BYTES bytes) to hashVal; no OUTPUT stage is run. */
+int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
+{
+    const size_t used = ctx->h.bCnt;            /* bytes currently held in b[] */
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+    if (used < SKEIN1024_BLOCK_BYTES)
+        memset(ctx->b + used,0,SKEIN1024_BLOCK_BYTES - used);
+    Skein1024_Process_Block(ctx,ctx->b,1,used);
+    Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN1024_BLOCK_BYTES);
+    return SKEIN_SUCCESS;
+}
+
+#if SKEIN_TREE_HASH
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Skein_256_Output: run only the OUTPUT stage.  Using the current ctx->X as the
+ * counter-mode "key", write (hashBitLen+7)/8 bytes to hashVal.  ctx->X is
+ * restored after every block; ctx->b and the tweak words are overwritten.
+ * Returns SKEIN_SUCCESS. */
+int Skein_256_Output(Skein_256_Ctxt_t *ctx, u08b_t *hashVal)
+{
+    size_t blkIdx,outOff,chunk,outBytes;
+    u64b_t keyCopy[SKEIN_256_STATE_WORDS];      /* chaining value reused for every output block */
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    outBytes = (ctx->h.hashBitLen + 7) >> 3;    /* bit length rounded up to bytes */
+    memset(ctx->b,0,sizeof(ctx->b));            /* b[] now carries only the block counter */
+    memcpy(keyCopy,ctx->X,sizeof(keyCopy));
+    for (blkIdx=0,outOff=0; outOff < outBytes; blkIdx++,outOff += SKEIN_256_BLOCK_BYTES) {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) blkIdx);
+        Skein_Start_New_Type(ctx,OUT_FINAL);
+        Skein_256_Process_Block(ctx,ctx->b,1,sizeof(u64b_t));
+        chunk = outBytes - outOff;               /* bytes still owed to the caller */
+        if (chunk > SKEIN_256_BLOCK_BYTES)
+            chunk = SKEIN_256_BLOCK_BYTES;
+        Skein_Put64_LSB_First(hashVal+outOff,ctx->X,chunk);
+        Skein_Show_Final(256,&ctx->h,chunk,hashVal+outOff);
+        memcpy(ctx->X,keyCopy,sizeof(keyCopy));  /* undo the state change before the next block */
+    }
+    return SKEIN_SUCCESS;
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* just do the OUTPUT stage: write (hashBitLen+7)/8 bytes to hashVal, produced  */
+/* in "counter mode" from the current ctx->X (which is restored after each      */
+/* block).  ctx->b and the tweak words are overwritten.  Returns SKEIN_SUCCESS. */
+int Skein_512_Output(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    size_t i,n,byteCnt;
+    u64b_t X[SKEIN_512_STATE_WORDS];
+    Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    byteCnt = (ctx->h.hashBitLen + 7) >> 3;    /* total number of output bytes */
+    /* run Threefish in "counter mode" to generate output */
+    memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
+    memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
+    for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++)
+        {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+        Skein_Start_New_Type(ctx,OUT_FINAL);
+        Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+        n = byteCnt - i*SKEIN_512_BLOCK_BYTES;   /* number of output bytes left to go */
+        if (n >= SKEIN_512_BLOCK_BYTES)
+            n  = SKEIN_512_BLOCK_BYTES;
+        Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
+        Skein_Show_Final(512,&ctx->h,n,hashVal+i*SKEIN_512_BLOCK_BYTES);   /* debug callout: pass this variant's size (512), as Skein_512_Final does */
+        memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+        }
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* just do the OUTPUT stage: write (hashBitLen+7)/8 bytes to hashVal, produced  */
+/* in "counter mode" from the current ctx->X (which is restored after each      */
+/* block).  ctx->b and the tweak words are overwritten.  Returns SKEIN_SUCCESS. */
+int Skein1024_Output(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    size_t i,n,byteCnt;
+    u64b_t X[SKEIN1024_STATE_WORDS];
+    Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    byteCnt = (ctx->h.hashBitLen + 7) >> 3;    /* total number of output bytes */
+    /* run Threefish in "counter mode" to generate output */
+    memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
+    memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
+    for (i=0;i*SKEIN1024_BLOCK_BYTES < byteCnt;i++)
+        {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+        Skein_Start_New_Type(ctx,OUT_FINAL);
+        Skein1024_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+        n = byteCnt - i*SKEIN1024_BLOCK_BYTES;   /* number of output bytes left to go */
+        if (n >= SKEIN1024_BLOCK_BYTES)
+            n  = SKEIN1024_BLOCK_BYTES;
+        Skein_Put64_LSB_First(hashVal+i*SKEIN1024_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
+        Skein_Show_Final(1024,&ctx->h,n,hashVal+i*SKEIN1024_BLOCK_BYTES);  /* debug callout: pass this variant's size (1024), as Skein1024_Final does */
+        memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+        }
+    return SKEIN_SUCCESS;
+    }
+#endif
diff --git a/Optimized_64bit/skein.h b/Optimized_64bit/skein.h
new file mode 100644
index 0000000000000..721c9bc9ce0db
--- /dev/null
+++ b/Optimized_64bit/skein.h
@@ -0,0 +1,327 @@
+#ifndef _SKEIN_H_
+#define _SKEIN_H_     1
+/**************************************************************************
+**
+** Interface declarations and internal definitions for Skein hashing.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+***************************************************************************
+** 
+** The following compile-time switches may be defined to control some
+** tradeoffs between speed, code size, error checking, and security.
+**
+** The "default" note explains what happens when the switch is not defined.
+**
+**  SKEIN_DEBUG            -- make callouts from inside Skein code
+**                            to examine/display intermediate values.
+**                            [default: no callouts (no overhead)]
+**
+**  SKEIN_ERR_CHECK        -- how error checking is handled inside Skein
+**                            code. If not defined, most error checking 
+**                            is disabled (for performance). If defined, 
+**                            the behavior depends on SKEIN_ASSERT:
+**                                defined:     use assert() to flag all errors
+**                                not defined: return a code on caller errors, assert() on internal ones
+**
+***************************************************************************/
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include <stddef.h>                          /* get size_t definition */
+#include "skein_port.h"                      /* get platform-specific definitions */
+
+enum
+    {
+    SKEIN_SUCCESS         =      0,          /* return codes from Skein calls */
+    SKEIN_FAIL            =      1,          /* failed Skein_Assert: e.g. out-of-range bCnt or a missing key */
+    SKEIN_BAD_HASHLEN     =      2           /* failed Skein_Assert: hashBitLen of 0 passed to Init/InitExt */
+    };
+
+#define  SKEIN_MODIFIER_WORDS  ( 2)          /* number of modifier (tweak) words */
+
+#define  SKEIN_256_STATE_WORDS ( 4)
+#define  SKEIN_512_STATE_WORDS ( 8)
+#define  SKEIN1024_STATE_WORDS (16)
+#define  SKEIN_MAX_STATE_WORDS (16)
+
+#define  SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
+#define  SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
+#define  SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)
+
+#define  SKEIN_256_STATE_BITS  (64*SKEIN_256_STATE_WORDS)
+#define  SKEIN_512_STATE_BITS  (64*SKEIN_512_STATE_WORDS)
+#define  SKEIN1024_STATE_BITS  (64*SKEIN1024_STATE_WORDS)
+
+#define  SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
+#define  SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
+#define  SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)
+
+typedef struct                               /* header embedded as member h of all three context structures */
+    {
+    size_t  hashBitLen;                      /* size of hash result, in bits */
+    size_t  bCnt;                            /* current byte count in buffer b[] */
+    u64b_t  T[SKEIN_MODIFIER_WORDS];         /* tweak words: T[0]=byte cnt, T[1]=flags, block type and tree level */
+    } Skein_Ctxt_Hdr_t;
+
+typedef struct                               /*  256-bit Skein hash context structure */
+    {
+    Skein_Ctxt_Hdr_t h;                      /* common header context variables */
+    u64b_t  X[SKEIN_256_STATE_WORDS];        /* chaining variables; written out by Final_Pad, saved/restored by Output */
+    u08b_t  b[SKEIN_256_BLOCK_BYTES];        /* partial block buffer (8-byte aligned); Output stores a u64b_t counter in it */
+    } Skein_256_Ctxt_t;
+
+typedef struct                               /*  512-bit Skein hash context structure */
+    {
+    Skein_Ctxt_Hdr_t h;                      /* common header context variables */
+    u64b_t  X[SKEIN_512_STATE_WORDS];        /* chaining variables; written out by Final_Pad, saved/restored by Final/Output */
+    u08b_t  b[SKEIN_512_BLOCK_BYTES];        /* partial block buffer (8-byte aligned); Final/Output store a u64b_t counter in it */
+    } Skein_512_Ctxt_t;
+
+typedef struct                               /* 1024-bit Skein hash context structure */
+    {
+    Skein_Ctxt_Hdr_t h;                      /* common header context variables */
+    u64b_t  X[SKEIN1024_STATE_WORDS];        /* chaining variables; written out by Final_Pad, saved/restored by Final/Output */
+    u08b_t  b[SKEIN1024_BLOCK_BYTES];        /* partial block buffer (8-byte aligned); Final/Output store a u64b_t counter in it */
+    } Skein1024_Ctxt_t;
+
+/*   Skein APIs for (incremental) "straight hashing" */
+int  Skein_256_Init  (Skein_256_Ctxt_t *ctx, size_t hashBitLen);
+int  Skein_512_Init  (Skein_512_Ctxt_t *ctx, size_t hashBitLen);
+int  Skein1024_Init  (Skein1024_Ctxt_t *ctx, size_t hashBitLen);
+
+int  Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
+int  Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
+int  Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
+
+int  Skein_256_Final (Skein_256_Ctxt_t *ctx, u08b_t * hashVal);
+int  Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
+int  Skein1024_Final (Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
+
+/*
+**   Skein APIs for "extended" initialization: MAC keys, tree hashing.
+**   After an InitExt() call, just use Update/Final calls as with Init().
+**
+**   Notes: Same parameters as _Init() calls, plus treeInfo/key/keyBytes.
+**          When keyBytes == 0 and treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL, 
+**              the results of InitExt() are identical to calling Init().
+**          The function Init() may be called once to "precompute" the IV for
+**              a given hashBitLen value, then by saving a copy of the context
+**              the IV computation may be avoided in later calls.
+**          Similarly, the function InitExt() may be called once per MAC key 
+**              to precompute the MAC IV, then a copy of the context saved and
+**              reused for each new MAC computation.
+**/
+int  Skein_256_InitExt(Skein_256_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes);
+int  Skein_512_InitExt(Skein_512_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes);
+int  Skein1024_InitExt(Skein1024_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes);
+
+/*
+**   Skein APIs for MAC and tree hash:
+**      Final_Pad:  pad, do final block, but no OUTPUT type
+**      Output:     do just the output stage
+*/
+int  Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, u08b_t * hashVal);
+int  Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
+int  Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
+
+#ifndef SKEIN_TREE_HASH
+#define SKEIN_TREE_HASH (1)
+#endif
+#if  SKEIN_TREE_HASH
+int  Skein_256_Output   (Skein_256_Ctxt_t *ctx, u08b_t * hashVal);
+int  Skein_512_Output   (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
+int  Skein1024_Output   (Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
+#endif
+
+/*****************************************************************
+** "Internal" Skein definitions
+**    -- not needed for sequential hashing API, but will be 
+**           helpful for other uses of Skein (e.g., tree hash mode).
+**    -- included here so that they can be shared between
+**           reference and optimized code.
+******************************************************************/
+
+/* tweak word T[1]: bit field starting positions */
+#define SKEIN_T1_BIT(BIT)       ((BIT) - 64)            /* offset 64 because it's the second word  */
+                                
+#define SKEIN_T1_POS_TREE_LVL   SKEIN_T1_BIT(112)       /* bits 112..118: level in hash tree       */
+#define SKEIN_T1_POS_BIT_PAD    SKEIN_T1_BIT(119)       /* bit  119     : partial final input byte */
+#define SKEIN_T1_POS_BLK_TYPE   SKEIN_T1_BIT(120)       /* bits 120..125: type field               */
+#define SKEIN_T1_POS_FIRST      SKEIN_T1_BIT(126)       /* bit  126     : first block flag         */
+#define SKEIN_T1_POS_FINAL      SKEIN_T1_BIT(127)       /* bit  127     : final block flag         */
+                                
+/* tweak word T[1]: flag bit definition(s) */
+#define SKEIN_T1_FLAG_FIRST     (((u64b_t)  1 ) << SKEIN_T1_POS_FIRST)
+#define SKEIN_T1_FLAG_FINAL     (((u64b_t)  1 ) << SKEIN_T1_POS_FINAL)
+#define SKEIN_T1_FLAG_BIT_PAD   (((u64b_t)  1 ) << SKEIN_T1_POS_BIT_PAD)
+                                
+/* tweak word T[1]: tree level bit field mask */
+#define SKEIN_T1_TREE_LVL_MASK  (((u64b_t)0x7F) << SKEIN_T1_POS_TREE_LVL)
+#define SKEIN_T1_TREE_LEVEL(n)  (((u64b_t) (n)) << SKEIN_T1_POS_TREE_LVL)
+
+/* tweak word T[1]: block type field */
+#define SKEIN_BLK_TYPE_KEY      ( 0)                    /* key, for MAC and KDF */
+#define SKEIN_BLK_TYPE_CFG      ( 4)                    /* configuration block */
+#define SKEIN_BLK_TYPE_PERS     ( 8)                    /* personalization string */
+#define SKEIN_BLK_TYPE_PK       (12)                    /* public key (for digital signature hashing) */
+#define SKEIN_BLK_TYPE_KDF      (16)                    /* key identifier for KDF */
+#define SKEIN_BLK_TYPE_NONCE    (20)                    /* nonce for PRNG */
+#define SKEIN_BLK_TYPE_MSG      (48)                    /* message processing */
+#define SKEIN_BLK_TYPE_OUT      (63)                    /* output stage */
+#define SKEIN_BLK_TYPE_MASK     (63)                    /* bit field mask */
+
+#define SKEIN_T1_BLK_TYPE(T)   (((u64b_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
+#define SKEIN_T1_BLK_TYPE_KEY   SKEIN_T1_BLK_TYPE(KEY)  /* key, for MAC and KDF */
+#define SKEIN_T1_BLK_TYPE_CFG   SKEIN_T1_BLK_TYPE(CFG)  /* configuration block */
+#define SKEIN_T1_BLK_TYPE_PERS  SKEIN_T1_BLK_TYPE(PERS) /* personalization string */
+#define SKEIN_T1_BLK_TYPE_PK    SKEIN_T1_BLK_TYPE(PK)   /* public key (for digital signature hashing) */
+#define SKEIN_T1_BLK_TYPE_KDF   SKEIN_T1_BLK_TYPE(KDF)  /* key identifier for KDF */
+#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE)/* nonce for PRNG */
+#define SKEIN_T1_BLK_TYPE_MSG   SKEIN_T1_BLK_TYPE(MSG)  /* message processing */
+#define SKEIN_T1_BLK_TYPE_OUT   SKEIN_T1_BLK_TYPE(OUT)  /* output stage */
+#define SKEIN_T1_BLK_TYPE_MASK  SKEIN_T1_BLK_TYPE(MASK) /* field bit mask */
+
+#define SKEIN_T1_BLK_TYPE_CFG_FINAL       (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL)
+#define SKEIN_T1_BLK_TYPE_OUT_FINAL       (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
+
+#define SKEIN_VERSION           (1)
+
+#ifndef SKEIN_ID_STRING_LE      /* allow compile-time personalization */
+#define SKEIN_ID_STRING_LE      (0x33414853)            /* "SHA3" (little-endian)*/
+#endif
+
+#define SKEIN_MK_64(hi32,lo32)  ((lo32) + (((u64b_t) (hi32)) << 32))
+#define SKEIN_SCHEMA_VER        SKEIN_MK_64(SKEIN_VERSION,SKEIN_ID_STRING_LE)
+#define SKEIN_KS_PARITY         SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
+
+#define SKEIN_CFG_STR_LEN       (4*8)
+
+/* bit field definitions in config block treeInfo word */
+#define SKEIN_CFG_TREE_LEAF_SIZE_POS  ( 0)
+#define SKEIN_CFG_TREE_NODE_SIZE_POS  ( 8)
+#define SKEIN_CFG_TREE_MAX_LEVEL_POS  (16)
+
+#define SKEIN_CFG_TREE_LEAF_SIZE_MSK  (((u64b_t) 0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS)
+#define SKEIN_CFG_TREE_NODE_SIZE_MSK  (((u64b_t) 0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS)
+#define SKEIN_CFG_TREE_MAX_LEVEL_MSK  (((u64b_t) 0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS)
+
+#define SKEIN_CFG_TREE_INFO(leaf,node,maxLvl)                   \
+    ( (((u64b_t)(leaf  )) << SKEIN_CFG_TREE_LEAF_SIZE_POS) |    \
+      (((u64b_t)(node  )) << SKEIN_CFG_TREE_NODE_SIZE_POS) |    \
+      (((u64b_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS) )
+
+#define SKEIN_CFG_TREE_INFO_SEQUENTIAL SKEIN_CFG_TREE_INFO(0,0,0) /* use as treeInfo in InitExt() call for sequential processing */
+
+/*
+**   Skein macros for getting/setting tweak words, etc.
+**   These are useful for partial input bytes, hash tree init/update, etc.
+**/
+#define Skein_Get_Tweak(ctxPtr,TWK_NUM)         ((ctxPtr)->h.T[TWK_NUM])
+#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal)    {(ctxPtr)->h.T[TWK_NUM] = (tVal);}
+
+#define Skein_Get_T0(ctxPtr)    Skein_Get_Tweak(ctxPtr,0)
+#define Skein_Get_T1(ctxPtr)    Skein_Get_Tweak(ctxPtr,1)
+#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0)
+#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1)
+
+/* set both tweak words at once */
+#define Skein_Set_T0_T1(ctxPtr,T0,T1)           \
+    {                                           \
+    Skein_Set_T0(ctxPtr,(T0));                  \
+    Skein_Set_T1(ctxPtr,(T1));                  \
+    }
+
+#define Skein_Set_Type(ctxPtr,BLK_TYPE)         \
+    Skein_Set_T1(ctxPtr,SKEIN_T1_BLK_TYPE_##BLK_TYPE)
+
+/* set up for starting with a new type: h.T[0]=0; h.T[1] = FIRST flag | NEW_TYPE; h.bCnt=0; expands to a { } block, so keep it out of unbraced if/else bodies */
+#define Skein_Start_New_Type(ctxPtr,BLK_TYPE)   \
+    { Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; }
+
+#define Skein_Clear_First_Flag(hdr)      { (hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST;       }
+#define Skein_Set_Bit_Pad_Flag(hdr)      { (hdr).T[1] |=  SKEIN_T1_FLAG_BIT_PAD;     }
+
+#define Skein_Set_Tree_Level(hdr,height) { (hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height);}
+
+/*****************************************************************
+** "Internal" Skein definitions for debugging and error checking
+******************************************************************/
+#ifdef  SKEIN_DEBUG             /* examine/display intermediate values? */
+#include "skein_debug.h"        /* presumably supplies the real Skein_Show_* callouts -- not visible here */
+#else                           /* default is no callouts: each macro below expands to nothing */
+#define Skein_Show_Block(bits,ctx,X,blkPtr,wPtr,ksEvenPtr,ksOddPtr)
+#define Skein_Show_Round(bits,ctx,r,X)
+#define Skein_Show_R_Ptr(bits,ctx,r,X_ptr)
+#define Skein_Show_Final(bits,ctx,cnt,outPtr)
+#define Skein_Show_Key(bits,ctx,key,keyBytes)
+#endif
+
+#ifndef SKEIN_ERR_CHECK        /* run-time checks (e.g., bad params, uninitialized context)? */
+#define Skein_Assert(x,retCode)/* default: ignore all Asserts, for performance */
+#define Skein_assert(x)        /* likewise compiled out: x is never evaluated */
+#elif   defined(SKEIN_ASSERT)  /* checks on, and caller errors also go through assert() */
+#include <assert.h>     
+#define Skein_Assert(x,retCode) assert(x) /* retCode is ignored in this mode */
+#define Skein_assert(x)         assert(x) 
+#else                          /* checks on: a caller error returns retCode from the enclosing function */
+#include <assert.h>     
+#define Skein_Assert(x,retCode) { if (!(x)) return retCode; } /*  caller  error */
+#define Skein_assert(x)         assert(x)                     /* internal error */
+#endif
+
+/*****************************************************************
+** Skein block function constants (shared across Ref and Opt code)
+******************************************************************/
+enum    
+    {   
+        /* Skein_256 round rotation constants, named R_256_<round mod 8>_<mix index within the round> */
+    R_256_0_0=14, R_256_0_1=16,
+    R_256_1_0=52, R_256_1_1=57,
+    R_256_2_0=23, R_256_2_1=40,
+    R_256_3_0= 5, R_256_3_1=37,
+    R_256_4_0=25, R_256_4_1=33,
+    R_256_5_0=46, R_256_5_1=12,
+    R_256_6_0=58, R_256_6_1=22,
+    R_256_7_0=32, R_256_7_1=32,
+
+        /* Skein_512 round rotation constants, named R_512_<round mod 8>_<mix index within the round> */
+    R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
+    R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
+    R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39,
+    R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56,
+    R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24,
+    R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17,
+    R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43,
+    R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22,
+
+        /* Skein1024 round rotation constants, named R1024_<round mod 8>_<mix index within the round> */
+    R1024_0_0=24, R1024_0_1=13, R1024_0_2= 8, R1024_0_3=47, R1024_0_4= 8, R1024_0_5=17, R1024_0_6=22, R1024_0_7=37,
+    R1024_1_0=38, R1024_1_1=19, R1024_1_2=10, R1024_1_3=55, R1024_1_4=49, R1024_1_5=18, R1024_1_6=23, R1024_1_7=52,
+    R1024_2_0=33, R1024_2_1= 4, R1024_2_2=51, R1024_2_3=13, R1024_2_4=34, R1024_2_5=41, R1024_2_6=59, R1024_2_7=17,
+    R1024_3_0= 5, R1024_3_1=20, R1024_3_2=48, R1024_3_3=41, R1024_3_4=47, R1024_3_5=28, R1024_3_6=16, R1024_3_7=25,
+    R1024_4_0=41, R1024_4_1= 9, R1024_4_2=37, R1024_4_3=31, R1024_4_4=12, R1024_4_5=47, R1024_4_6=44, R1024_4_7=30,
+    R1024_5_0=16, R1024_5_1=34, R1024_5_2=56, R1024_5_3=51, R1024_5_4= 4, R1024_5_5=53, R1024_5_6=42, R1024_5_7=41,
+    R1024_6_0=31, R1024_6_1=44, R1024_6_2=47, R1024_6_3=46, R1024_6_4=19, R1024_6_5=42, R1024_6_6=44, R1024_6_7=25,
+    R1024_7_0= 9, R1024_7_1=48, R1024_7_2=35, R1024_7_3=52, R1024_7_4=23, R1024_7_5=31, R1024_7_6=37, R1024_7_7=20
+    };
+
+#ifndef SKEIN_ROUNDS                         /* SKEIN_ROUNDS unset: use the fixed round counts below */
+#define SKEIN_256_ROUNDS_TOTAL (72)          /* number of rounds for the different block sizes */
+#define SKEIN_512_ROUNDS_TOTAL (72)
+#define SKEIN1024_ROUNDS_TOTAL (80)
+#else                                        /* allow command-line define in range 8*(5..14): one decimal digit per block size */
+#define SKEIN_256_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/100) + 5) % 10) + 5)) /* hundreds digit d: 5->40, ..., 9->72, 0->80, ..., 4->112 */
+#define SKEIN_512_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/ 10) + 5) % 10) + 5)) /* tens digit, same mapping */
+#define SKEIN1024_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS    ) + 5) % 10) + 5)) /* units digit, same mapping */
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* ifndef _SKEIN_H_ */
diff --git a/Optimized_64bit/skein_block.c b/Optimized_64bit/skein_block.c
new file mode 100644
index 0000000000000..bfd29d1eee2d8
--- /dev/null
+++ b/Optimized_64bit/skein_block.c
@@ -0,0 +1,689 @@
+/***********************************************************************
+**
+** Implementation of the Skein block functions.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+** Compile-time switches:
+**
+**  SKEIN_USE_ASM  -- set bits (256/512/1024) to select which
+**                    versions use ASM code for block processing
+**                    [default: use C for all block sizes]
+**
+************************************************************************/
+
+#include <string.h>
+#include "skein.h"
+
+#ifndef SKEIN_USE_ASM
+#define SKEIN_USE_ASM   (0)                     /* default is all C code (no ASM) */
+#endif
+
+#ifndef SKEIN_LOOP                              /* NOTE(review): a literal with a leading zero is octal in C (010 would mean 8) */
+#define SKEIN_LOOP 001                          /* decimal digits = unroll count for 256/512/1024, 0 = fully unrolled. Default: unroll 256 and 512, but not 1024 */
+#endif
+
+#define BLK_BITS        (WCNT*64)               /* some useful definitions for code here */
+#define KW_TWK_BASE     (0)                     /* tweak words start at kw[0] */
+#define KW_KEY_BASE     (3)                     /* key schedule words start at kw[3], right after ts[0..2] */
+#define ks              (kw + KW_KEY_BASE)      /* ks and ts are views into the same function-local array kw[] */
+#define ts              (kw + KW_TWK_BASE)
+
+#ifdef SKEIN_DEBUG
+#define DebugSaveTweak(ctx) { ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; } /* debug builds copy the working tweak back into the context */
+#else
+#define DebugSaveTweak(ctx)
+#endif
+
+/*****************************  Skein_256 ******************************/
+#if !(SKEIN_USE_ASM & 256)
+void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
+    { /* C version: compress blkCnt consecutive blocks at blkPtr into ctx->X; byteCntAdd is added to T[0] for every block */
+    enum
+        {
+        WCNT = SKEIN_256_STATE_WORDS            /* 64-bit words per block: sizes w[] and part of kw[] */
+        };
+#undef  RCNT
+#define RCNT  (SKEIN_256_ROUNDS_TOTAL/8)        /* number of 8-round groups */
+
+#ifdef  SKEIN_LOOP                              /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_256 (((SKEIN_LOOP)/100)%10) /* hundreds digit of SKEIN_LOOP */
+#else                                           /* not reached: SKEIN_LOOP is given a default earlier in this file */
+#define SKEIN_UNROLL_256 (0)
+#endif
+
+#if SKEIN_UNROLL_256                            /* non-zero: looping version, needs loop index r and a longer kw[] */
+#if (RCNT % SKEIN_UNROLL_256)
+#error "Invalid SKEIN_UNROLL_256"               /* sanity check on unroll count */
+#endif
+    size_t  r;                                  /* key-injection index; odd at the top of every loop pass */
+    u64b_t  kw[WCNT+4+RCNT*2];                  /* key schedule words : chaining vars + tweak + "rotation"*/
+#else
+    u64b_t  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
+#endif
+    u64b_t  X0,X1,X2,X3;                        /* local copy of context vars, for speed */
+    u64b_t  w [WCNT];                           /* local copy of input block */
+#ifdef SKEIN_DEBUG
+    const u64b_t *Xptr[4];                      /* use for debugging (help compiler put Xn in registers) */
+    Xptr[0] = &X0;  Xptr[1] = &X1;  Xptr[2] = &X2;  Xptr[3] = &X3;
+#endif
+    Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! */
+    ts[0] = ctx->h.T[0];                        /* work on a local copy of the tweak; written back at the end */
+    ts[1] = ctx->h.T[1];
+    do  {
+        /* this implementation only supports 2**64 input bytes (no carry out here) */
+        ts[0] += byteCntAdd;                    /* update processed length */
+
+        /* precompute the key schedule for this block */
+        ks[0] = ctx->X[0];     
+        ks[1] = ctx->X[1];
+        ks[2] = ctx->X[2];
+        ks[3] = ctx->X[3];
+        ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;   /* extra key word: xor of all key words and the parity constant */
+
+        ts[2] = ts[0] ^ ts[1];                  /* third tweak word is the xor of the first two */
+
+        Skein_Get64_LSB_First(w,blkPtr,WCNT);   /* get input block in little-endian format */
+        DebugSaveTweak(ctx);
+        Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);
+
+        X0 = w[0] + ks[0];                      /* do the first full key injection */
+        X1 = w[1] + ks[1] + ts[0];
+        X2 = w[2] + ks[2] + ts[1];
+        X3 = w[3] + ks[3];
+
+        Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr);    /* show starting state values */
+
+        blkPtr += SKEIN_256_BLOCK_BYTES;        /* input already copied into w[]: advance to the next block */
+
+        /* run the rounds */
+
+#define Round256(p0,p1,p2,p3,ROT,rNum)                              \
+    X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \
+    X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \
+
+#if SKEIN_UNROLL_256 == 0                       
+#define R256(p0,p1,p2,p3,ROT,rNum)           /* fully unrolled */   \
+    Round256(p0,p1,p2,p3,ROT,rNum)                                  \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr);
+
+#define I256(R)                                                     \
+    X0   += ks[((R)+1) % 5];    /* inject the key schedule value */ \
+    X1   += ks[((R)+2) % 5] + ts[((R)+1) % 3];                      \
+    X2   += ks[((R)+3) % 5] + ts[((R)+2) % 3];                      \
+    X3   += ks[((R)+4) % 5] +     (R)+1;                            \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
+#else                                       /* looping version */
+#define R256(p0,p1,p2,p3,ROT,rNum)                                  \
+    Round256(p0,p1,p2,p3,ROT,rNum)                                  \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr);
+
+#define I256(R)                                                     \
+    X0   += ks[r+(R)+0];        /* inject the key schedule value */ \
+    X1   += ks[r+(R)+1] + ts[r+(R)+0];                              \
+    X2   += ks[r+(R)+2] + ts[r+(R)+1];                              \
+    X3   += ks[r+(R)+3] +    r+(R)   ;                              \
+    ks[r + (R)+4    ]   = ks[r+(R)-1];     /* rotate key schedule */\
+    ts[r + (R)+2    ]   = ts[r+(R)-1];                              \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
+
+    for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_256)  /* each pass runs SKEIN_UNROLL_256 groups of 8 rounds (2 key injections per group) */
+#endif  
+        {    /* loop body in the looping version; a plain block when fully unrolled */
+#define R256_8_rounds(R)                  \
+        R256(0,1,2,3,R_256_0,8*(R) + 1);  \
+        R256(0,3,2,1,R_256_1,8*(R) + 2);  \
+        R256(0,1,2,3,R_256_2,8*(R) + 3);  \
+        R256(0,3,2,1,R_256_3,8*(R) + 4);  \
+        I256(2*(R));                      \
+        R256(0,1,2,3,R_256_4,8*(R) + 5);  \
+        R256(0,3,2,1,R_256_5,8*(R) + 6);  \
+        R256(0,1,2,3,R_256_6,8*(R) + 7);  \
+        R256(0,3,2,1,R_256_7,8*(R) + 8);  \
+        I256(2*(R)+1);
+
+        R256_8_rounds( 0);                /* group 0 is always emitted; groups 1..14 below are conditional */
+
+#define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_256 > (NN)))
+
+  #if   R256_Unroll_R( 1)
+        R256_8_rounds( 1);
+  #endif
+  #if   R256_Unroll_R( 2)
+        R256_8_rounds( 2);
+  #endif
+  #if   R256_Unroll_R( 3)
+        R256_8_rounds( 3);
+  #endif
+  #if   R256_Unroll_R( 4)
+        R256_8_rounds( 4);
+  #endif
+  #if   R256_Unroll_R( 5)
+        R256_8_rounds( 5);
+  #endif
+  #if   R256_Unroll_R( 6)
+        R256_8_rounds( 6);
+  #endif
+  #if   R256_Unroll_R( 7)
+        R256_8_rounds( 7);
+  #endif
+  #if   R256_Unroll_R( 8)
+        R256_8_rounds( 8);
+  #endif
+  #if   R256_Unroll_R( 9)
+        R256_8_rounds( 9);
+  #endif
+  #if   R256_Unroll_R(10)
+        R256_8_rounds(10);
+  #endif
+  #if   R256_Unroll_R(11)
+        R256_8_rounds(11);
+  #endif
+  #if   R256_Unroll_R(12)
+        R256_8_rounds(12);
+  #endif
+  #if   R256_Unroll_R(13)
+        R256_8_rounds(13);
+  #endif
+  #if   R256_Unroll_R(14)
+        R256_8_rounds(14);
+  #endif
+  #if  (SKEIN_UNROLL_256 > 14)
+#error  "need more unrolling in Skein_256_Process_Block"
+  #endif
+        }
+        /* do the final "feedforward" xor, update context chaining vars */
+        ctx->X[0] = X0 ^ w[0];
+        ctx->X[1] = X1 ^ w[1];
+        ctx->X[2] = X2 ^ w[2];
+        ctx->X[3] = X3 ^ w[3];
+
+        Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
+
+        ts[1] &= ~SKEIN_T1_FLAG_FIRST;          /* only the first block of this call can carry the FIRST flag */
+        }
+    while (--blkCnt);                           /* pre-decrement: blkCnt == 0 on entry would wrap around, hence the assert above */
+    ctx->h.T[0] = ts[0];                        /* store the updated tweak back into the context */
+    ctx->h.T[1] = ts[1];
+    }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t Skein_256_Process_Block_CodeSize(void)
+    { /* byte distance from the entry of Skein_256_Process_Block to the entry of this function */
+    u08b_t *const blockEntry = (u08b_t *) Skein_256_Process_Block;
+    return ((u08b_t *) Skein_256_Process_Block_CodeSize) - blockEntry;
+    }
+uint_t Skein_256_Unroll_Cnt(void)
+    { /* report the unroll setting this file was compiled with (0 = fully unrolled) */
+    return (uint_t) (SKEIN_UNROLL_256);
+    }
+#endif
+#endif
+
+/*****************************  Skein_512 ******************************/
+#if !(SKEIN_USE_ASM & 512)
+void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
+    { /* C version: compress blkCnt consecutive blocks at blkPtr into ctx->X; byteCntAdd is added to T[0] for every block */
+    enum
+        {
+        WCNT = SKEIN_512_STATE_WORDS            /* 64-bit words per block: sizes w[] and part of kw[] */
+        };
+#undef  RCNT
+#define RCNT  (SKEIN_512_ROUNDS_TOTAL/8)        /* number of 8-round groups */
+
+#ifdef  SKEIN_LOOP                              /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_512 (((SKEIN_LOOP)/10)%10) /* tens digit of SKEIN_LOOP */
+#else                                           /* not reached: SKEIN_LOOP is given a default earlier in this file */
+#define SKEIN_UNROLL_512 (0)
+#endif
+
+#if SKEIN_UNROLL_512                            /* non-zero: looping version, needs loop index r and a longer kw[] */
+#if (RCNT % SKEIN_UNROLL_512)
+#error "Invalid SKEIN_UNROLL_512"               /* sanity check on unroll count */
+#endif
+    size_t  r;                                  /* key-injection index; odd at the top of every loop pass */
+    u64b_t  kw[WCNT+4+RCNT*2];                  /* key schedule words : chaining vars + tweak + "rotation"*/
+#else
+    u64b_t  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
+#endif
+    u64b_t  X0,X1,X2,X3,X4,X5,X6,X7;            /* local copy of vars, for speed */
+    u64b_t  w [WCNT];                           /* local copy of input block */
+#ifdef SKEIN_DEBUG
+    const u64b_t *Xptr[8];                      /* use for debugging (help compiler put Xn in registers) */
+    Xptr[0] = &X0;  Xptr[1] = &X1;  Xptr[2] = &X2;  Xptr[3] = &X3;
+    Xptr[4] = &X4;  Xptr[5] = &X5;  Xptr[6] = &X6;  Xptr[7] = &X7;
+#endif
+
+    Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! */
+    ts[0] = ctx->h.T[0];                        /* work on a local copy of the tweak; written back at the end */
+    ts[1] = ctx->h.T[1];
+    do  {
+        /* this implementation only supports 2**64 input bytes (no carry out here) */
+        ts[0] += byteCntAdd;                    /* update processed length */
+
+        /* precompute the key schedule for this block */
+        ks[0] = ctx->X[0];
+        ks[1] = ctx->X[1];
+        ks[2] = ctx->X[2];
+        ks[3] = ctx->X[3];
+        ks[4] = ctx->X[4];
+        ks[5] = ctx->X[5];
+        ks[6] = ctx->X[6];
+        ks[7] = ctx->X[7];
+        ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ 
+                ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;   /* extra key word: xor of all key words and the parity constant */
+
+        ts[2] = ts[0] ^ ts[1];                  /* third tweak word is the xor of the first two */
+
+        Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */
+        DebugSaveTweak(ctx);
+        Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);
+
+        X0   = w[0] + ks[0];                    /* do the first full key injection */
+        X1   = w[1] + ks[1];
+        X2   = w[2] + ks[2];
+        X3   = w[3] + ks[3];
+        X4   = w[4] + ks[4];
+        X5   = w[5] + ks[5] + ts[0];
+        X6   = w[6] + ks[6] + ts[1];
+        X7   = w[7] + ks[7];
+
+        blkPtr += SKEIN_512_BLOCK_BYTES;        /* input already copied into w[]: advance to the next block */
+
+        Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr);
+        /* run the rounds */
+#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                  \
+    X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \
+    X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \
+    X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \
+    X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \
+
+#if SKEIN_UNROLL_512 == 0                       
+#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)      /* unrolled */  \
+    Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr);
+
+#define I512(R)                                                     \
+    X0   += ks[((R)+1) % 9];   /* inject the key schedule value */  \
+    X1   += ks[((R)+2) % 9];                                        \
+    X2   += ks[((R)+3) % 9];                                        \
+    X3   += ks[((R)+4) % 9];                                        \
+    X4   += ks[((R)+5) % 9];                                        \
+    X5   += ks[((R)+6) % 9] + ts[((R)+1) % 3];                      \
+    X6   += ks[((R)+7) % 9] + ts[((R)+2) % 3];                      \
+    X7   += ks[((R)+8) % 9] +     (R)+1;                            \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
+#else                                       /* looping version */
+#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
+    Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr);
+
+#define I512(R)                                                     \
+    X0   += ks[r+(R)+0];        /* inject the key schedule value */ \
+    X1   += ks[r+(R)+1];                                            \
+    X2   += ks[r+(R)+2];                                            \
+    X3   += ks[r+(R)+3];                                            \
+    X4   += ks[r+(R)+4];                                            \
+    X5   += ks[r+(R)+5] + ts[r+(R)+0];                              \
+    X6   += ks[r+(R)+6] + ts[r+(R)+1];                              \
+    X7   += ks[r+(R)+7] +    r+(R)   ;                              \
+    ks[r +       (R)+8] = ks[r+(R)-1];  /* rotate key schedule */   \
+    ts[r +       (R)+2] = ts[r+(R)-1];                              \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
+
+    for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_512)   /* each pass runs SKEIN_UNROLL_512 groups of 8 rounds (2 key injections per group) */
+#endif                         /* end of looped code definitions */
+        { /* loop body in the looping version; a plain block when fully unrolled */
+#define R512_8_rounds(R)  /* do 8 full rounds */  \
+        R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1);   \
+        R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2);   \
+        R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3);   \
+        R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4);   \
+        I512(2*(R));                              \
+        R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5);   \
+        R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6);   \
+        R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7);   \
+        R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8);   \
+        I512(2*(R)+1);        /* and key injection */
+
+        R512_8_rounds( 0);                /* group 0 is always emitted; groups 1..14 below are conditional */
+
+#define R512_Unroll_R(NN) ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_512 > (NN)))
+
+  #if   R512_Unroll_R( 1)
+        R512_8_rounds( 1);
+  #endif
+  #if   R512_Unroll_R( 2)
+        R512_8_rounds( 2);
+  #endif
+  #if   R512_Unroll_R( 3)
+        R512_8_rounds( 3);
+  #endif
+  #if   R512_Unroll_R( 4)
+        R512_8_rounds( 4);
+  #endif
+  #if   R512_Unroll_R( 5)
+        R512_8_rounds( 5);
+  #endif
+  #if   R512_Unroll_R( 6)
+        R512_8_rounds( 6);
+  #endif
+  #if   R512_Unroll_R( 7)
+        R512_8_rounds( 7);
+  #endif
+  #if   R512_Unroll_R( 8)
+        R512_8_rounds( 8);
+  #endif
+  #if   R512_Unroll_R( 9)
+        R512_8_rounds( 9);
+  #endif
+  #if   R512_Unroll_R(10)
+        R512_8_rounds(10);
+  #endif
+  #if   R512_Unroll_R(11)
+        R512_8_rounds(11);
+  #endif
+  #if   R512_Unroll_R(12)
+        R512_8_rounds(12);
+  #endif
+  #if   R512_Unroll_R(13)
+        R512_8_rounds(13);
+  #endif
+  #if   R512_Unroll_R(14)
+        R512_8_rounds(14);
+  #endif
+  #if  (SKEIN_UNROLL_512 > 14)
+#error  "need more unrolling in Skein_512_Process_Block"
+  #endif
+        }
+
+        /* do the final "feedforward" xor, update context chaining vars */
+        ctx->X[0] = X0 ^ w[0];
+        ctx->X[1] = X1 ^ w[1];
+        ctx->X[2] = X2 ^ w[2];
+        ctx->X[3] = X3 ^ w[3];
+        ctx->X[4] = X4 ^ w[4];
+        ctx->X[5] = X5 ^ w[5];
+        ctx->X[6] = X6 ^ w[6];
+        ctx->X[7] = X7 ^ w[7];
+        Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
+
+        ts[1] &= ~SKEIN_T1_FLAG_FIRST;          /* only the first block of this call can carry the FIRST flag */
+        }
+    while (--blkCnt);                           /* pre-decrement: blkCnt == 0 on entry would wrap around, hence the assert above */
+    ctx->h.T[0] = ts[0];                        /* store the updated tweak back into the context */
+    ctx->h.T[1] = ts[1];
+    }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t Skein_512_Process_Block_CodeSize(void)
+    { /* byte distance from the entry of Skein_512_Process_Block to the entry of this function */
+    u08b_t *const blockEntry = (u08b_t *) Skein_512_Process_Block;
+    return ((u08b_t *) Skein_512_Process_Block_CodeSize) - blockEntry;
+    }
+uint_t Skein_512_Unroll_Cnt(void)
+    { /* report the unroll setting this file was compiled with (0 = fully unrolled) */
+    return (uint_t) (SKEIN_UNROLL_512);
+    }
+#endif
+#endif
+
+/*****************************  Skein1024 ******************************/
+#if !(SKEIN_USE_ASM & 1024)
+void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
+    { /* do it in C; loops under the default SKEIN_LOOP (unrolled is bigger AND slower!), fully unrolled only if its units digit is 0 */
+    enum
+        {
+        WCNT = SKEIN1024_STATE_WORDS            /* 64-bit words per block: sizes w[] and part of kw[] */
+        };
+#undef  RCNT
+#define RCNT  (SKEIN1024_ROUNDS_TOTAL/8)        /* number of 8-round groups */
+
+#ifdef  SKEIN_LOOP                              /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10)     /* units digit of SKEIN_LOOP */
+#else                                           /* not reached: SKEIN_LOOP is given a default earlier in this file */
+#define SKEIN_UNROLL_1024 (0)
+#endif
+
+#if (SKEIN_UNROLL_1024 != 0)                    /* looping version, needs loop index r and a longer kw[] */
+#if (RCNT % SKEIN_UNROLL_1024)
+#error "Invalid SKEIN_UNROLL_1024"              /* sanity check on unroll count */
+#endif
+    size_t  r;                                  /* key-injection index; odd at the top of every loop pass */
+    u64b_t  kw[WCNT+4+RCNT*2];                  /* key schedule words : chaining vars + tweak + "rotation"*/
+#else
+    u64b_t  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
+#endif
+
+    u64b_t  X00,X01,X02,X03,X04,X05,X06,X07,    /* local copy of vars, for speed */
+            X08,X09,X10,X11,X12,X13,X14,X15;
+    u64b_t  w [WCNT];                           /* local copy of input block */
+#ifdef SKEIN_DEBUG
+    const u64b_t *Xptr[16];                     /* use for debugging (help compiler put Xn in registers) */
+    Xptr[ 0] = &X00;  Xptr[ 1] = &X01;  Xptr[ 2] = &X02;  Xptr[ 3] = &X03;
+    Xptr[ 4] = &X04;  Xptr[ 5] = &X05;  Xptr[ 6] = &X06;  Xptr[ 7] = &X07;
+    Xptr[ 8] = &X08;  Xptr[ 9] = &X09;  Xptr[10] = &X10;  Xptr[11] = &X11;
+    Xptr[12] = &X12;  Xptr[13] = &X13;  Xptr[14] = &X14;  Xptr[15] = &X15;
+#endif
+
+    Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! */
+    ts[0] = ctx->h.T[0];                        /* work on a local copy of the tweak; written back at the end */
+    ts[1] = ctx->h.T[1];
+    do  {
+        /* this implementation only supports 2**64 input bytes (no carry out here) */
+        ts[0] += byteCntAdd;                    /* update processed length */
+
+        /* precompute the key schedule for this block */
+        ks[ 0] = ctx->X[ 0];
+        ks[ 1] = ctx->X[ 1];
+        ks[ 2] = ctx->X[ 2];
+        ks[ 3] = ctx->X[ 3];
+        ks[ 4] = ctx->X[ 4];
+        ks[ 5] = ctx->X[ 5];
+        ks[ 6] = ctx->X[ 6];
+        ks[ 7] = ctx->X[ 7];
+        ks[ 8] = ctx->X[ 8];
+        ks[ 9] = ctx->X[ 9];
+        ks[10] = ctx->X[10];
+        ks[11] = ctx->X[11];
+        ks[12] = ctx->X[12];
+        ks[13] = ctx->X[13];
+        ks[14] = ctx->X[14];
+        ks[15] = ctx->X[15];
+        ks[16] = ks[ 0] ^ ks[ 1] ^ ks[ 2] ^ ks[ 3] ^
+                 ks[ 4] ^ ks[ 5] ^ ks[ 6] ^ ks[ 7] ^
+                 ks[ 8] ^ ks[ 9] ^ ks[10] ^ ks[11] ^
+                 ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY;   /* extra key word: xor of all key words and the parity constant */
+
+        ts[2]  = ts[0] ^ ts[1];                 /* third tweak word is the xor of the first two */
+
+        Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */
+        DebugSaveTweak(ctx);
+        Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);
+
+        X00    = w[ 0] + ks[ 0];                 /* do the first full key injection */
+        X01    = w[ 1] + ks[ 1];
+        X02    = w[ 2] + ks[ 2];
+        X03    = w[ 3] + ks[ 3];
+        X04    = w[ 4] + ks[ 4];
+        X05    = w[ 5] + ks[ 5];
+        X06    = w[ 6] + ks[ 6];
+        X07    = w[ 7] + ks[ 7];
+        X08    = w[ 8] + ks[ 8];
+        X09    = w[ 9] + ks[ 9];
+        X10    = w[10] + ks[10];
+        X11    = w[11] + ks[11];
+        X12    = w[12] + ks[12];
+        X13    = w[13] + ks[13] + ts[0];
+        X14    = w[14] + ks[14] + ts[1];
+        X15    = w[15] + ks[15];
+
+        Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr);
+
+#define Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rNum) \
+    X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0;   \
+    X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2;   \
+    X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4;   \
+    X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6;   \
+    X##p8 += X##p9; X##p9 = RotL_64(X##p9,ROT##_4); X##p9 ^= X##p8;   \
+    X##pA += X##pB; X##pB = RotL_64(X##pB,ROT##_5); X##pB ^= X##pA;   \
+    X##pC += X##pD; X##pD = RotL_64(X##pD,ROT##_6); X##pD ^= X##pC;   \
+    X##pE += X##pF; X##pF = RotL_64(X##pF,ROT##_7); X##pF ^= X##pE;   \
+
+#if SKEIN_UNROLL_1024 == 0                      
+#define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
+    Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rn,Xptr);
+
+#define I1024(R)                                                      \
+    X00   += ks[((R)+ 1) % 17]; /* inject the key schedule value */   \
+    X01   += ks[((R)+ 2) % 17];                                       \
+    X02   += ks[((R)+ 3) % 17];                                       \
+    X03   += ks[((R)+ 4) % 17];                                       \
+    X04   += ks[((R)+ 5) % 17];                                       \
+    X05   += ks[((R)+ 6) % 17];                                       \
+    X06   += ks[((R)+ 7) % 17];                                       \
+    X07   += ks[((R)+ 8) % 17];                                       \
+    X08   += ks[((R)+ 9) % 17];                                       \
+    X09   += ks[((R)+10) % 17];                                       \
+    X10   += ks[((R)+11) % 17];                                       \
+    X11   += ks[((R)+12) % 17];                                       \
+    X12   += ks[((R)+13) % 17];                                       \
+    X13   += ks[((R)+14) % 17] + ts[((R)+1) % 3];                     \
+    X14   += ks[((R)+15) % 17] + ts[((R)+2) % 3];                     \
+    X15   += ks[((R)+16) % 17] +     (R)+1;                           \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); 
+#else                                       /* looping version */
+#define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
+    Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rn,Xptr);
+
+#define I1024(R)                                                      \
+    X00   += ks[r+(R)+ 0];    /* inject the key schedule value */     \
+    X01   += ks[r+(R)+ 1];                                            \
+    X02   += ks[r+(R)+ 2];                                            \
+    X03   += ks[r+(R)+ 3];                                            \
+    X04   += ks[r+(R)+ 4];                                            \
+    X05   += ks[r+(R)+ 5];                                            \
+    X06   += ks[r+(R)+ 6];                                            \
+    X07   += ks[r+(R)+ 7];                                            \
+    X08   += ks[r+(R)+ 8];                                            \
+    X09   += ks[r+(R)+ 9];                                            \
+    X10   += ks[r+(R)+10];                                            \
+    X11   += ks[r+(R)+11];                                            \
+    X12   += ks[r+(R)+12];                                            \
+    X13   += ks[r+(R)+13] + ts[r+(R)+0];                              \
+    X14   += ks[r+(R)+14] + ts[r+(R)+1];                              \
+    X15   += ks[r+(R)+15] +    r+(R)   ;                              \
+    ks[r  +       (R)+16] = ks[r+(R)-1];  /* rotate key schedule */   \
+    ts[r  +       (R)+ 2] = ts[r+(R)-1];                              \
+    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
+
+    for (r=1;r <= 2*RCNT;r+=2*SKEIN_UNROLL_1024)    /* r stays odd and 2*RCNT is even, so "<=" here stops at the same pass as "<" would */
+#endif  
+        { /* loop body in the looping version; a plain block when fully unrolled */
+#define R1024_8_rounds(R)    /* do 8 full rounds */                               \
+        R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_0,8*(R) + 1); \
+        R1024(00,09,02,13,06,11,04,15,10,07,12,03,14,05,08,01,R1024_1,8*(R) + 2); \
+        R1024(00,07,02,05,04,03,06,01,12,15,14,13,08,11,10,09,R1024_2,8*(R) + 3); \
+        R1024(00,15,02,11,06,13,04,09,14,01,08,05,10,03,12,07,R1024_3,8*(R) + 4); \
+        I1024(2*(R));                                                             \
+        R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_4,8*(R) + 5); \
+        R1024(00,09,02,13,06,11,04,15,10,07,12,03,14,05,08,01,R1024_5,8*(R) + 6); \
+        R1024(00,07,02,05,04,03,06,01,12,15,14,13,08,11,10,09,R1024_6,8*(R) + 7); \
+        R1024(00,15,02,11,06,13,04,09,14,01,08,05,10,03,12,07,R1024_7,8*(R) + 8); \
+        I1024(2*(R)+1);
+
+        R1024_8_rounds( 0);               /* group 0 is always emitted; groups 1..14 below are conditional */
+
+#define R1024_Unroll_R(NN) ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_1024 > (NN)))
+
+  #if   R1024_Unroll_R( 1)
+        R1024_8_rounds( 1);
+  #endif
+  #if   R1024_Unroll_R( 2)
+        R1024_8_rounds( 2);
+  #endif
+  #if   R1024_Unroll_R( 3)
+        R1024_8_rounds( 3);
+  #endif
+  #if   R1024_Unroll_R( 4)
+        R1024_8_rounds( 4);
+  #endif
+  #if   R1024_Unroll_R( 5)
+        R1024_8_rounds( 5);
+  #endif
+  #if   R1024_Unroll_R( 6)
+        R1024_8_rounds( 6);
+  #endif
+  #if   R1024_Unroll_R( 7)
+        R1024_8_rounds( 7);
+  #endif
+  #if   R1024_Unroll_R( 8)
+        R1024_8_rounds( 8);
+  #endif
+  #if   R1024_Unroll_R( 9)
+        R1024_8_rounds( 9);
+  #endif
+  #if   R1024_Unroll_R(10)
+        R1024_8_rounds(10);
+  #endif
+  #if   R1024_Unroll_R(11)
+        R1024_8_rounds(11);
+  #endif
+  #if   R1024_Unroll_R(12)
+        R1024_8_rounds(12);
+  #endif
+  #if   R1024_Unroll_R(13)
+        R1024_8_rounds(13);
+  #endif
+  #if   R1024_Unroll_R(14)
+        R1024_8_rounds(14);
+  #endif
+  #if  (SKEIN_UNROLL_1024 > 14)
+#error  "need more unrolling in Skein_1024_Process_Block"
+  #endif
+        }
+        /* do the final "feedforward" xor, update context chaining vars */
+
+        ctx->X[ 0] = X00 ^ w[ 0];
+        ctx->X[ 1] = X01 ^ w[ 1];
+        ctx->X[ 2] = X02 ^ w[ 2];
+        ctx->X[ 3] = X03 ^ w[ 3];
+        ctx->X[ 4] = X04 ^ w[ 4];
+        ctx->X[ 5] = X05 ^ w[ 5];
+        ctx->X[ 6] = X06 ^ w[ 6];
+        ctx->X[ 7] = X07 ^ w[ 7];
+        ctx->X[ 8] = X08 ^ w[ 8];
+        ctx->X[ 9] = X09 ^ w[ 9];
+        ctx->X[10] = X10 ^ w[10];
+        ctx->X[11] = X11 ^ w[11];
+        ctx->X[12] = X12 ^ w[12];
+        ctx->X[13] = X13 ^ w[13];
+        ctx->X[14] = X14 ^ w[14];
+        ctx->X[15] = X15 ^ w[15];
+
+        Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
+        
+        ts[1] &= ~SKEIN_T1_FLAG_FIRST;          /* only the first block of this call can carry the FIRST flag */
+        blkPtr += SKEIN1024_BLOCK_BYTES;        /* advance to the next input block */
+        }
+    while (--blkCnt);                           /* pre-decrement: blkCnt == 0 on entry would wrap around, hence the assert above */
+    ctx->h.T[0] = ts[0];                        /* store the updated tweak back into the context */
+    ctx->h.T[1] = ts[1];
+    }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t Skein1024_Process_Block_CodeSize(void)
+    { /* byte distance from the entry of Skein1024_Process_Block to the entry of this function */
+    u08b_t *const blockEntry = (u08b_t *) Skein1024_Process_Block;
+    return ((u08b_t *) Skein1024_Process_Block_CodeSize) - blockEntry;
+    }
+uint_t Skein1024_Unroll_Cnt(void)
+    { /* report the unroll setting this file was compiled with (0 = fully unrolled) */
+    return (uint_t) (SKEIN_UNROLL_1024);
+    }
+#endif
+#endif
diff --git a/Optimized_64bit/skein_debug.c b/Optimized_64bit/skein_debug.c
new file mode 100644
index 0000000000000..fac5038598ea5
--- /dev/null
+++ b/Optimized_64bit/skein_debug.c
@@ -0,0 +1,247 @@
+/***********************************************************************
+**
+** Debug output functions for Skein hashing.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+************************************************************************/
+#include <stdio.h>
+
+#ifdef SKEIN_DEBUG  /* only instantiate this code if SKEIN_DEBUG is on */
+#include "skein.h"
+
+static const char INDENT[] =  "    ";  /* how much to indent on new line */
+
+uint_t skein_DebugFlag = 0;  /* off by default. Must be set externally */
+
+static void Show64_step(size_t cnt,const u64b_t *X,size_t step)
+    {   /* print cnt words X[0],X[step],X[2*step],... four per line, as hi32.lo32 hex */
+    size_t i,j;
+    for (i=j=0;i < cnt;i++,j+=step)
+        {
+        if (i % 4 ==  0) printf("%s",INDENT);
+        printf(" %08lX.%08lX ",(unsigned long)(X[j] >> 32),(unsigned long)(X[j] & 0xFFFFFFFFu)); /* %lX: no assumption on int width */
+        if (i % 4 ==  3 || i==cnt-1) printf("\n");
+        fflush(stdout);
+        }
+    }
+
+#define Show64(cnt,X) Show64_step(cnt,X,1)  /* consecutive words: step of 1 */
+
+static void Show64_flag(size_t cnt,const u64b_t *X)
+    {   /* bit 0 of the pointer value is a flag: when set, show every 2nd word */
+    const size_t addr = (size_t) X;
+    if ((addr & 1) == 0)
+        {
+        Show64_step(cnt,X,1);
+        return;
+        }
+    Show64_step(cnt,(const u64b_t *) (addr & ~1),2);   /* flag bit cleared */
+    }
+
+static void Show08(size_t cnt,const u08b_t *b)
+    {   /* hex-dump cnt bytes: 16 per line, extra gap before each group of 4 */
+    size_t n;
+    for (n=0;n != cnt;n++)
+        {
+        if      ((n & 15) == 0) printf(INDENT);     /* start of a new line  */
+        else if ((n &  3) == 0) printf(" ");        /* start of a new group */
+        printf(" %02X",b[n]);
+        if ((n & 15) == 15 || n+1 == cnt) printf("\n");
+        fflush(stdout);
+        }
+    }
+
+static const char *AlgoHeader(uint_t bits)
+    {   /* name tag for debug output; never NULL, since callers hand it to printf("%s") */
+    if (skein_DebugFlag & SKEIN_DEBUG_THREEFISH)
+        switch (bits)
+            {
+            case  256:  return ":Threefish-256: ";
+            case  512:  return ":Threefish-512: ";
+            case 1024:  return ":Threefish-1024:";
+            }
+    else
+        switch (bits)
+            {
+            case  256:  return ":Skein-256: ";
+            case  512:  return ":Skein-512: ";
+            case 1024:  return ":Skein-1024:";
+            }
+    return (skein_DebugFlag & SKEIN_DEBUG_THREEFISH) ? ":Threefish-????:" : ":Skein-????:";  /* unsupported size */
+    }
+
+void Skein_Show_Final(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t cnt,const u08b_t *outPtr)
+    {   /* print the cnt output bytes at outPtr, when SKEIN_DEBUG_FINAL is enabled */
+    if (!(skein_DebugFlag & SKEIN_DEBUG_CONFIG) &&
+        (h->T[1] & SKEIN_T1_BLK_TYPE_MASK) == SKEIN_T1_BLK_TYPE_CFG)
+        return;                     /* config-block output is shown only on request */
+    if (!(skein_DebugFlag & SKEIN_DEBUG_FINAL)) return;
+    printf("\n%s Final output=\n",AlgoHeader(bits));
+    Show08(cnt,outPtr);
+    printf("    ++++++++++\n");
+    fflush(stdout);
+    }
+
+/* show state after a round (or "pseudo-round") */
+void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t r,const u64b_t *X)
+    {   /* r < SKEIN_RND_SPECIAL is a round number; otherwise r is a SKEIN_RND_* marker */
+    static uint_t injectNum=0;  /* key injection counter; not multi-thread safe! */
+
+    if (skein_DebugFlag & SKEIN_DEBUG_CONFIG || ((h->T[1] & SKEIN_T1_BLK_TYPE_MASK) != SKEIN_T1_BLK_TYPE_CFG))
+    if (skein_DebugFlag)
+        {
+        if (r >= SKEIN_RND_SPECIAL)
+            {       /* a key injection (or feedforward) point */
+            injectNum = (r == SKEIN_RND_KEY_INITIAL) ? 0 : injectNum+1;
+            if (  skein_DebugFlag & SKEIN_DEBUG_INJECT ||
+                ((skein_DebugFlag & SKEIN_DEBUG_FINAL) && r == SKEIN_RND_FEED_FWD))
+                {
+                printf("\n%s",AlgoHeader(bits));
+                switch (r)
+                    {
+                    case SKEIN_RND_KEY_INITIAL:
+                        printf(" [state after initial key injection]");
+                        break;
+                    case SKEIN_RND_KEY_INJECT:
+                        printf(" [state after key injection #%02u]",injectNum);  /* %u: injectNum is unsigned */
+                        break;
+                    case SKEIN_RND_FEED_FWD:
+                        printf(" [state after plaintext feedforward]");
+                        injectNum = 0;
+                        break;
+                    }
+                printf("=\n");
+                Show64(bits/64,X);
+                if (r== SKEIN_RND_FEED_FWD)
+                    printf("    ----------\n");
+                }
+            }
+        else if (skein_DebugFlag & SKEIN_DEBUG_ROUNDS)
+            {
+            uint_t j;
+            u64b_t p[SKEIN_MAX_STATE_WORDS];
+            const u08b_t *perm;
+            static const u08b_t PERM_256 [4][ 4] = { { 0,1,2,3 }, { 0,3,2,1 }, { 0,1,2,3 }, { 0,3,2,1 } };
+            static const u08b_t PERM_512 [4][ 8] = { { 0,1,2,3,4,5,6,7 },
+                                                     { 2,1,4,7,6,5,0,3 },
+                                                     { 4,1,6,3,0,5,2,7 },
+                                                     { 6,1,0,7,2,5,4,3 }
+                                                   };
+            static const u08b_t PERM_1024[4][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 },
+                                                     { 0, 9, 2,13, 6,11, 4,15,10, 7,12, 3,14, 5, 8, 1 },
+                                                     { 0, 7, 2, 5, 4, 3, 6, 1,12,15,14,13, 8,11,10, 9 },
+                                                     { 0,15, 2,11, 6,13, 4, 9,14, 1, 8, 5,10, 3,12, 7 }
+                                                   };
+
+            if ((skein_DebugFlag & SKEIN_DEBUG_PERMUTE) && (r & 3))
+                {
+                printf("\n%s [state after round %2d (permuted)]=\n",AlgoHeader(bits),(int)r);
+                switch (bits)
+                    {
+                    case  256: perm = PERM_256 [r&3];   break;
+                    case  512: perm = PERM_512 [r&3];   break;
+                    default:   perm = PERM_1024[r&3];   break;
+                    }
+                for (j=0;j<bits/64;j++)
+                    p[j] = X[perm[j]];
+                Show64(bits/64,p);
+                }
+            else
+                {
+                printf("\n%s [state after round %2d]=\n",AlgoHeader(bits),(int)r);
+                Show64(bits/64,X);
+                }
+            }
+        }
+    }
+
+/* show state after a round (or "pseudo-round"), given a list of pointers */
+void Skein_Show_R_Ptr(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t r,const u64b_t *X_ptr[])
+    {   /* gather the bits/64 pointed-to words into one array, then show it */
+    u64b_t gathered[SKEIN_MAX_STATE_WORDS];
+    const uint_t wordCnt = bits/64;
+    uint_t w = 0;
+    while (w < wordCnt)
+        { gathered[w] = *X_ptr[w]; w++; }
+    Skein_Show_Round(bits,h,r,gathered);
+    }
+
+
+/* show the state at the start of a block */
+void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,const u08b_t *blkPtr,
+                      const u64b_t *wPtr, const u64b_t *ksPtr, const u64b_t *tsPtr)
+    {   /* each section below is printed only if its skein_DebugFlag bit is set */
+    uint_t n;
+    if (skein_DebugFlag & SKEIN_DEBUG_CONFIG || ((h->T[1] & SKEIN_T1_BLK_TYPE_MASK) != SKEIN_T1_BLK_TYPE_CFG))
+    if (skein_DebugFlag)
+        {
+        if (skein_DebugFlag & SKEIN_DEBUG_HDR)
+            {   /* %4u: the argument is cast to uint_t, so use an unsigned conversion */
+            printf("\n%s Block: outBits=%4u. T0=%06X.",AlgoHeader(bits),(uint_t) h->hashBitLen,(uint_t)h->T[0]);
+            printf(" Type=");
+            n = (uint_t) ((h->T[1] & SKEIN_T1_BLK_TYPE_MASK) >> SKEIN_T1_POS_BLK_TYPE);
+            switch (n)
+                {
+                case SKEIN_BLK_TYPE_KEY:  printf("KEY. ");  break;
+                case SKEIN_BLK_TYPE_CFG:  printf("CFG. ");  break;
+                case SKEIN_BLK_TYPE_PERS: printf("PERS.");  break;
+                case SKEIN_BLK_TYPE_PK :  printf("PK.  ");  break;
+                case SKEIN_BLK_TYPE_KDF:  printf("KDF. ");  break;
+                case SKEIN_BLK_TYPE_MSG:  printf("MSG. ");  break;
+                case SKEIN_BLK_TYPE_OUT:  printf("OUT. ");  break;
+                default:    printf("0x%02X.",n); break;
+                }
+            printf(" Flags=");
+            printf((h->T[1] & SKEIN_T1_FLAG_FIRST)   ? " First":"      ");
+            printf((h->T[1] & SKEIN_T1_FLAG_FINAL)   ? " Final":"      ");
+            printf((h->T[1] & SKEIN_T1_FLAG_BIT_PAD) ? " Pad"  :"    ");
+            n = (uint_t) ((h->T[1] & SKEIN_T1_TREE_LVL_MASK) >> SKEIN_T1_POS_TREE_LVL);
+            if (n)
+                printf("  TreeLevel = %02X",n);
+            printf("\n");
+            fflush(stdout);
+            }
+        if (skein_DebugFlag & SKEIN_DEBUG_TWEAK)
+            {
+            printf("  Tweak:\n");
+            Show64(2,h->T);
+            }
+        if (skein_DebugFlag & SKEIN_DEBUG_STATE)
+            {
+            printf("  %s words:\n",(skein_DebugFlag & SKEIN_DEBUG_THREEFISH)?"Key":"State");
+            Show64(bits/64,X);
+            }
+        if (skein_DebugFlag & SKEIN_DEBUG_KEYSCHED)
+            {   /* Show64_flag: bit 0 of these pointers selects a word stride of 2 */
+            printf("  Tweak schedule:\n");
+            Show64_flag(3,tsPtr);
+            printf("  Key   schedule:\n");
+            Show64_flag((bits/64)+1,ksPtr);
+            }
+        if (skein_DebugFlag & SKEIN_DEBUG_INPUT_64)
+            {
+            printf("  Input block (words):\n");
+            Show64(bits/64,wPtr);
+            }
+        if (skein_DebugFlag & SKEIN_DEBUG_INPUT_08)
+            {
+            printf("  Input block (bytes):\n");
+            Show08(bits/8,blkPtr);
+            }
+        }
+    }
+
+void Skein_Show_Key(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u08b_t *key,size_t keyBytes)
+    {   /* print a non-empty MAC key, when SKEIN_DEBUG_KEY is enabled */
+    if (keyBytes == 0) return;
+    if (!(skein_DebugFlag & SKEIN_DEBUG_CONFIG) &&
+        (h->T[1] & SKEIN_T1_BLK_TYPE_MASK) == SKEIN_T1_BLK_TYPE_CFG)
+        return;                     /* config-block output is shown only on request */
+    if (!(skein_DebugFlag & SKEIN_DEBUG_KEY)) return;
+    printf("\n%s MAC key = %4u bytes\n",AlgoHeader(bits),(unsigned) keyBytes);
+    Show08(keyBytes,key);
+    }
+#endif
diff --git a/Optimized_64bit/skein_debug.h b/Optimized_64bit/skein_debug.h
new file mode 100644
index 0000000000000..7775c0165c0ac
--- /dev/null
+++ b/Optimized_64bit/skein_debug.h
@@ -0,0 +1,48 @@
+#ifndef _SKEIN_DEBUG_H_
+#define _SKEIN_DEBUG_H_
+/***********************************************************************
+**
+** Interface definitions for Skein hashing debug output.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+************************************************************************/
+
+#ifdef  SKEIN_DEBUG
+/* callout functions used inside Skein code; the definitions are in skein_debug.c */
+void    Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,const u08b_t *blkPtr,
+                         const u64b_t *wPtr,const u64b_t *ksPtr,const u64b_t *tsPtr);
+void    Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t r,const u64b_t *X);
+void    Skein_Show_R_Ptr(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t r,const u64b_t *X_ptr[]);
+void    Skein_Show_Final(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t cnt,const u08b_t *outPtr);
+void    Skein_Show_Key  (uint_t bits,const Skein_Ctxt_Hdr_t *h,const u08b_t *key,size_t keyBytes);
+
+extern  uint_t skein_DebugFlag;            /* flags to control debug output (0 --> none) */
+
+#define SKEIN_RND_SPECIAL       (1000u)    /* "round" values >= this are markers, not round numbers */
+#define SKEIN_RND_KEY_INITIAL   (SKEIN_RND_SPECIAL+0u)  /* state after initial key injection */
+#define SKEIN_RND_KEY_INJECT    (SKEIN_RND_SPECIAL+1u)  /* state after a numbered key injection */
+#define SKEIN_RND_FEED_FWD      (SKEIN_RND_SPECIAL+2u)  /* state after plaintext feedforward */
+
+/* flag bits:  skein_DebugFlag */
+#define SKEIN_DEBUG_KEY         (1u << 1)  /* show MAC key */
+#define SKEIN_DEBUG_CONFIG      (1u << 2)  /* show config block processing */
+#define SKEIN_DEBUG_STATE       (1u << 3)  /* show input state during Show_Block() */
+#define SKEIN_DEBUG_TWEAK       (1u << 4)  /* show tweak words during Show_Block() */
+#define SKEIN_DEBUG_KEYSCHED    (1u << 5)  /* show expanded key schedule */
+#define SKEIN_DEBUG_INPUT_64    (1u << 6)  /* show input block as 64-bit words */
+#define SKEIN_DEBUG_INPUT_08    (1u << 7)  /* show input block as  8-bit bytes */
+#define SKEIN_DEBUG_INJECT      (1u << 8)  /* show state after key injection & feedforward points */
+#define SKEIN_DEBUG_ROUNDS      (1u << 9)  /* show state after all rounds */
+#define SKEIN_DEBUG_FINAL       (1u <<10)  /* show final output of Skein */
+#define SKEIN_DEBUG_HDR         (1u <<11)  /* show block header */
+#define SKEIN_DEBUG_THREEFISH   (1u <<12)  /* use Threefish name instead of Skein */
+#define SKEIN_DEBUG_PERMUTE     (1u <<13)  /* use word permutations */
+#define SKEIN_DEBUG_ALL         ((~0u) & ~(SKEIN_DEBUG_THREEFISH | SKEIN_DEBUG_PERMUTE))  /* every bit except those two */
+#define THREEFISH_DEBUG_ALL     (SKEIN_DEBUG_ALL | SKEIN_DEBUG_THREEFISH)  /* same, with Threefish names */
+
+#endif /*  SKEIN_DEBUG    */
+
+#endif /* _SKEIN_DEBUG_H_ */
diff --git a/Optimized_64bit/skein_iv.h b/Optimized_64bit/skein_iv.h
new file mode 100644
index 0000000000000..a8f54a41d3450
--- /dev/null
+++ b/Optimized_64bit/skein_iv.h
@@ -0,0 +1,199 @@
+#ifndef _SKEIN_IV_H_
+#define _SKEIN_IV_H_
+
+#include "skein.h"    /* get Skein macros and types */
+
+/*
+***************** Pre-computed Skein IVs *******************
+**
+** NOTE: these values are not "magic" constants, but
+** are generated using the Threefish block function.
+** They are pre-computed here only for speed; i.e., to
+** avoid the need for a Threefish call during Init().
+**
+** The IV for any fixed hash length may be pre-computed.
+** Only the most common values are included here.
+**
+************************************************************
+**/
+
+#define MK_64 SKEIN_MK_64   /* short alias, used by the IV tables in this header */
+
+/* blkSize =  256 bits. hashSize =  128 bits */
+const u64b_t SKEIN_256_IV_128[] =
+    {
+    MK_64(0xE1111906,0x964D7260),
+    MK_64(0x883DAAA7,0x7C8D811C),
+    MK_64(0x10080DF4,0x91960F7A),
+    MK_64(0xCCF7DDE5,0xB45BC1C2)
+    };
+
+/* blkSize =  256 bits. hashSize =  160 bits */
+const u64b_t SKEIN_256_IV_160[] =
+    {
+    MK_64(0x14202314,0x72825E98),
+    MK_64(0x2AC4E9A2,0x5A77E590),
+    MK_64(0xD47A5856,0x8838D63E),
+    MK_64(0x2DD2E496,0x8586AB7D)
+    };
+
+/* blkSize =  256 bits. hashSize =  224 bits */
+const u64b_t SKEIN_256_IV_224[] =
+    {
+    MK_64(0xC6098A8C,0x9AE5EA0B),
+    MK_64(0x876D5686,0x08C5191C),
+    MK_64(0x99CB88D7,0xD7F53884),
+    MK_64(0x384BDDB1,0xAEDDB5DE)
+    };
+
+/* blkSize =  256 bits. hashSize =  256 bits */
+const u64b_t SKEIN_256_IV_256[] =
+    {
+    MK_64(0xFC9DA860,0xD048B449),
+    MK_64(0x2FCA6647,0x9FA7D833),
+    MK_64(0xB33BC389,0x6656840F),
+    MK_64(0x6A54E920,0xFDE8DA69)
+    };
+
+/* blkSize =  512 bits. hashSize =  128 bits */
+const u64b_t SKEIN_512_IV_128[] =
+    {
+    MK_64(0xA8BC7BF3,0x6FBF9F52),
+    MK_64(0x1E9872CE,0xBD1AF0AA),
+    MK_64(0x309B1790,0xB32190D3),
+    MK_64(0xBCFBB854,0x3F94805C),
+    MK_64(0x0DA61BCD,0x6E31B11B),
+    MK_64(0x1A18EBEA,0xD46A32E3),
+    MK_64(0xA2CC5B18,0xCE84AA82),
+    MK_64(0x6982AB28,0x9D46982D)
+    };
+
+/* blkSize =  512 bits. hashSize =  160 bits */
+const u64b_t SKEIN_512_IV_160[] =
+    {
+    MK_64(0x28B81A2A,0xE013BD91),
+    MK_64(0xC2F11668,0xB5BDF78F),
+    MK_64(0x1760D8F3,0xF6A56F12),
+    MK_64(0x4FB74758,0x8239904F),
+    MK_64(0x21EDE07F,0x7EAF5056),
+    MK_64(0xD908922E,0x63ED70B8),
+    MK_64(0xB8EC76FF,0xECCB52FA),
+    MK_64(0x01A47BB8,0xA3F27A6E)
+    };
+
+/* blkSize =  512 bits. hashSize =  224 bits */
+const u64b_t SKEIN_512_IV_224[] =
+    {
+    MK_64(0xCCD06162,0x48677224),
+    MK_64(0xCBA65CF3,0xA92339EF),
+    MK_64(0x8CCD69D6,0x52FF4B64),
+    MK_64(0x398AED7B,0x3AB890B4),
+    MK_64(0x0F59D1B1,0x457D2BD0),
+    MK_64(0x6776FE65,0x75D4EB3D),
+    MK_64(0x99FBC70E,0x997413E9),
+    MK_64(0x9E2CFCCF,0xE1C41EF7)
+    };
+
+/* blkSize =  512 bits. hashSize =  256 bits */
+const u64b_t SKEIN_512_IV_256[] =
+    {
+    MK_64(0xCCD044A1,0x2FDB3E13),
+    MK_64(0xE8359030,0x1A79A9EB),
+    MK_64(0x55AEA061,0x4F816E6F),
+    MK_64(0x2A2767A4,0xAE9B94DB),
+    MK_64(0xEC06025E,0x74DD7683),
+    MK_64(0xE7A436CD,0xC4746251),
+    MK_64(0xC36FBAF9,0x393AD185),
+    MK_64(0x3EEDBA18,0x33EDFC13)
+    };
+
+/* blkSize =  512 bits. hashSize =  384 bits */
+const u64b_t SKEIN_512_IV_384[] =
+    {
+    MK_64(0xA3F6C6BF,0x3A75EF5F),
+    MK_64(0xB0FEF9CC,0xFD84FAA4),
+    MK_64(0x9D77DD66,0x3D770CFE),
+    MK_64(0xD798CBF3,0xB468FDDA),
+    MK_64(0x1BC4A666,0x8A0E4465),
+    MK_64(0x7ED7D434,0xE5807407),
+    MK_64(0x548FC1AC,0xD4EC44D6),
+    MK_64(0x266E1754,0x6AA18FF8)
+    };
+
+/* blkSize =  512 bits. hashSize =  512 bits */
+const u64b_t SKEIN_512_IV_512[] =
+    {
+    MK_64(0x4903ADFF,0x749C51CE),
+    MK_64(0x0D95DE39,0x9746DF03),
+    MK_64(0x8FD19341,0x27C79BCE),
+    MK_64(0x9A255629,0xFF352CB1),
+    MK_64(0x5DB62599,0xDF6CA7B0),
+    MK_64(0xEABE394C,0xA9D5C3F4),
+    MK_64(0x991112C7,0x1A75B523),
+    MK_64(0xAE18A40B,0x660FCC33)
+    };
+
+/* blkSize = 1024 bits. hashSize =  384 bits */
+const u64b_t SKEIN1024_IV_384[] =
+    {
+    MK_64(0x5102B6B8,0xC1894A35),
+    MK_64(0xFEEBC9E3,0xFE8AF11A),
+    MK_64(0x0C807F06,0xE32BED71),
+    MK_64(0x60C13A52,0xB41A91F6),
+    MK_64(0x9716D35D,0xD4917C38),
+    MK_64(0xE780DF12,0x6FD31D3A),
+    MK_64(0x797846B6,0xC898303A),
+    MK_64(0xB172C2A8,0xB3572A3B),
+    MK_64(0xC9BC8203,0xA6104A6C),
+    MK_64(0x65909338,0xD75624F4),
+    MK_64(0x94BCC568,0x4B3F81A0),
+    MK_64(0x3EBBF51E,0x10ECFD46),
+    MK_64(0x2DF50F0B,0xEEB08542),
+    MK_64(0x3B5A6530,0x0DBC6516),
+    MK_64(0x484B9CD2,0x167BBCE1),
+    MK_64(0x2D136947,0xD4CBAFEA)
+    };
+
+/* blkSize = 1024 bits. hashSize =  512 bits */
+const u64b_t SKEIN1024_IV_512[] =
+    {
+    MK_64(0xCAEC0E5D,0x7C1B1B18),
+    MK_64(0xA01B0E04,0x5F03E802),
+    MK_64(0x33840451,0xED912885),
+    MK_64(0x374AFB04,0xEAEC2E1C),
+    MK_64(0xDF25A0E2,0x813581F7),
+    MK_64(0xE4004093,0x8B12F9D2),
+    MK_64(0xA662D539,0xC2ED39B6),
+    MK_64(0xFA8B85CF,0x45D8C75A),
+    MK_64(0x8316ED8E,0x29EDE796),
+    MK_64(0x053289C0,0x2E9F91B8),
+    MK_64(0xC3F8EF1D,0x6D518B73),
+    MK_64(0xBDCEC3C4,0xD5EF332E),
+    MK_64(0x549A7E52,0x22974487),
+    MK_64(0x67070872,0x5B749816),
+    MK_64(0xB9CD28FB,0xF0581BD1),
+    MK_64(0x0E2940B8,0x15804974)
+    };
+
+/* blkSize = 1024 bits. hashSize = 1024 bits */
+const u64b_t SKEIN1024_IV_1024[] =
+    {
+    MK_64(0xD593DA07,0x41E72355),
+    MK_64(0x15B5E511,0xAC73E00C),
+    MK_64(0x5180E5AE,0xBAF2C4F0),
+    MK_64(0x03BD41D3,0xFCBCAFAF),
+    MK_64(0x1CAEC6FD,0x1983A898),
+    MK_64(0x6E510B8B,0xCDD0589F),
+    MK_64(0x77E2BDFD,0xC6394ADA),
+    MK_64(0xC11E1DB5,0x24DCB0A3),
+    MK_64(0xD6D14AF9,0xC6329AB5),
+    MK_64(0x6A9B0BFC,0x6EB67E0D),
+    MK_64(0x9243C60D,0xCCFF1332),
+    MK_64(0x1A1F1DDE,0x743F02D4),
+    MK_64(0x0996753C,0x10ED0BB8),
+    MK_64(0x6572DD22,0xF2B4969A),
+    MK_64(0x61FD3062,0xD00A579A),
+    MK_64(0x1DE0536E,0x8682E539)
+    };
+
+#endif /* _SKEIN_IV_H_ */
diff --git a/Optimized_64bit/skein_port.h b/Optimized_64bit/skein_port.h
new file mode 100644
index 0000000000000..653302de7467b
--- /dev/null
+++ b/Optimized_64bit/skein_port.h
@@ -0,0 +1,124 @@
+#ifndef _SKEIN_PORT_H_
+#define _SKEIN_PORT_H_
+/*******************************************************************
+**
+** Platform-specific definitions for Skein hash function.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+** Many thanks to Brian Gladman for his portable header files.
+**
+** To port Skein to an "unsupported" platform, change the definitions
+** in this file appropriately.
+** 
+********************************************************************/
+
+#include "brg_types.h"                      /* get integer type definitions */
+
+typedef unsigned int    uint_t;             /* native unsigned integer */
+typedef uint_8t         u08b_t;             /*  8-bit unsigned integer */
+typedef uint_64t        u64b_t;             /* 64-bit unsigned integer */
+
+#ifndef RotL_64     /* rotate 64-bit x left by N; counts are masked so N==0 never shifts by 64 */
+#define RotL_64(x,N)    (((x) << ((N) & 63)) | ((x) >> ((64-(N)) & 63)))
+#endif
+
+/*
+ * Skein is "natively" little-endian (unlike SHA-xxx), for optimal
+ * performance on x86 CPUs.  The Skein code requires the following
+ * definitions for dealing with endianness:
+ *
+ *    SKEIN_NEED_SWAP:  0 for little-endian, 1 for big-endian
+ *    Skein_Put64_LSB_First
+ *    Skein_Get64_LSB_First
+ *    Skein_Swap64
+ *
+ * If SKEIN_NEED_SWAP is defined at compile time, it is used here
+ * along with the portable versions of Put64/Get64/Swap64, which 
+ * are slow in general.
+ *
+ * Otherwise, an "auto-detect" of endianness is attempted below.
+ * If the default handling doesn't work well, the user may insert
+ * platform-specific code instead (e.g., for big-endian CPUs).
+ *
+ */
+#ifndef SKEIN_NEED_SWAP /* compile-time "override" for endianness? */
+
+#include "brg_endian.h"                     /* get endianness selection */
+#if   PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
+    /* here for big-endian CPUs */
+#define SKEIN_NEED_SWAP   (1)
+#elif PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    /* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */
+#define SKEIN_NEED_SWAP   (0)
+#if   PLATFORM_MUST_ALIGN == 0              /* ok to use "fast" versions? */
+#define Skein_Put64_LSB_First(dst08,src64,bCnt) memcpy(dst08,src64,bCnt)
+#define Skein_Get64_LSB_First(dst64,src08,wCnt) memcpy(dst64,src08,8*(wCnt))
+#endif
+#else
+#error "Skein needs endianness setting!"
+#endif
+
+#endif /* ifndef SKEIN_NEED_SWAP */
+
+/*
+ ******************************************************************
+ *      Provide any definitions still needed.
+ ******************************************************************
+ */
+#ifndef Skein_Swap64  /* swap for big-endian, nop for little-endian */
+#if     SKEIN_NEED_SWAP
+#define Skein_Swap64(w64)                       \
+  ( (( ((u64b_t)(w64))       & 0xFF) << 56) |   \
+    (((((u64b_t)(w64)) >> 8) & 0xFF) << 48) |   \
+    (((((u64b_t)(w64)) >>16) & 0xFF) << 40) |   \
+    (((((u64b_t)(w64)) >>24) & 0xFF) << 32) |   \
+    (((((u64b_t)(w64)) >>32) & 0xFF) << 24) |   \
+    (((((u64b_t)(w64)) >>40) & 0xFF) << 16) |   \
+    (((((u64b_t)(w64)) >>48) & 0xFF) <<  8) |   \
+    (((((u64b_t)(w64)) >>56) & 0xFF)      ) )
+#else
+#define Skein_Swap64(w64)  (w64)
+#endif
+#endif  /* ifndef Skein_Swap64 */
+
+
+#ifndef Skein_Put64_LSB_First
+void    Skein_Put64_LSB_First(u08b_t *dst,const u64b_t *src,size_t bCnt)
+#ifdef  SKEIN_PORT_CODE /* emit the function body in this translation unit? */
+    { /* works on any endianness, but slow: one output byte per iteration */
+    size_t byteIx;
+
+    for (byteIx=0;byteIx != bCnt;byteIx++)  /* word byteIx/8, byte lane byteIx%8 */
+        dst[byteIx] = (u08b_t) (src[byteIx/8] >> (8*(byteIx%8)));
+    }
+#else
+    ;    /* declaration only: no body unless SKEIN_PORT_CODE is defined */
+#endif
+#endif   /* ifndef Skein_Put64_LSB_First */
+
+
+#ifndef Skein_Get64_LSB_First
+void    Skein_Get64_LSB_First(u64b_t *dst,const u08b_t *src,size_t wCnt)
+#ifdef  SKEIN_PORT_CODE /* emit the function body in this translation unit? */
+    { /* works on any endianness, but slow: assembles each word byte by byte */
+    size_t ofs;                             /* byte offset of the current word */
+    uint_t lane;
+
+    for (ofs=0;ofs<8*wCnt;ofs+=8)
+        {
+        u64b_t word = 0;
+        /* fold in bytes 7 down to 0, so that src[ofs] ends up least significant */
+        for (lane=8;lane-- != 0; )
+            word = (word << 8) | src[ofs+lane];
+        dst[ofs/8] = word;
+        }
+    }
+#else
+    ;    /* declaration only: no body unless SKEIN_PORT_CODE is defined */
+#endif
+#endif   /* ifndef Skein_Get64_LSB_First */
+
+#endif   /* ifndef _SKEIN_PORT_H_ */
diff --git a/README/readme.txt b/README/readme.txt
new file mode 100644
index 0000000000000..c827482e8098b
--- /dev/null
+++ b/README/readme.txt
@@ -0,0 +1,166 @@
+Below is a list of Skein files included on the NIST submission CD, along 
+with a very brief description of each file. In both the reference and 
+optimized directories, all C files should be compiled to generate a 
+SHA3 NIST API "library" for Skein.
+
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+The following files are identical and common between the reference and optimized 
+versions of the code:
+
+File Name           Description
+--------------------------------------------------------------------------------
+brg_endian.h        Brian Gladman's header file to auto-detect CPU endianness
+                        (with a few extensions for handling various platforms/compilers)
+
+
+brg_types.h         Brian Gladman's header file to auto-detect integer types
+                        (with a few extensions for handling various platforms/compilers)
+
+
+SHA3api_ref.h       API definitions for SHA3 API, implemented in SHA3api_ref.c
+
+
+SHA3api_ref.c       "Wrapper" code that implements the NIST SHA3 API on top of the
+                    Skein API.
+
+
+skein_debug.h       Header for the routines used internally by Skein routines for
+                    generating debug i/o (e.g., round-by-round intermediate values)
+                    If SKEIN_DEBUG is not defined at compile time, these interface
+                    declarations instead become "dummy" macros so that there is
+                    no performance impact.
+
+
+skein_debug.c       Debug i/o routines called by Skein functions.
+
+
+skein.h             Function prototypes, data structures, and constant definitions
+                    for Skein. The Skein API is more general than the NIST API 
+                    (e.g., MAC functions). 
+
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+The following files are different for the reference and optimized versions
+of the code. Note that the source files in Optimized_32bit and Optimized_64bit 
+directories are identical.
+
+File Name           Description
+--------------------------------------------------------------------------------
+skein_port.h        Definitions that might need to be changed to port Skein to 
+                    a different CPU platform (e.g., big-endian). The Skein code
+                    should run on most CPU platforms, but the macros/functions here
+                    may be helpful in making the code run more efficiently
+
+skein.c             The main Skein interface functions: Init, Update, and Final, for
+                    all three Skein block sizes. Additionally, the InitExt() function
+                    allows for MAC and other extended functionality.
+
+skein_block.c       The Skein block processing function, based on the Threefish block
+                    cipher. This module contains the most performance-sensitive code
+                    and can be replaced by the assembly modules for slight speedups
+                    on some platforms. The functions here are only for internal use
+                    inside "skein.c" and are not intended for external APIs.
+                    
+skein_iv.h          Initial values for various Skein hash functions. Note that these
+                    values are NOT "magic constants", as they are computed using
+                    the initial Skein "configuration" block.  These values are used 
+                    only by the optimized code, in order to speed up the hash 
+                    computations.
+
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+The following files are included in the Additional_Implementations directory:
+
+File Name           Description
+--------------------------------------------------------------------------------
+skein_test.c        The Skein test module, used to measure performance and generate
+                    KAT vectors for testing. This module should be compiled together
+                    with the Skein source files (i.e., from the Reference or the
+                    Optimized directories) to generate an executable, skein_test.exe.
+                    This program is used internally to test/validate/compare different
+                    implementations (e.g., Reference, Optimized, Assembly).
+
+skein_block_x64.asm This is the 64-bit assembly language version of skein_block.c. 
+                    It may be used to replace that file in the Optimized_64bit 
+                    directory to improve performance on 64-bit Intel/AMD systems. 
+                    It should be assembled with ml64.exe.
+
+skein_block_x86.asm This is the 32-bit assembly language version of skein_block.c. 
+                    It may be used to replace that file in the Optimized_32bit 
+                    directory to improve performance on 32-bit Intel/AMD systems. 
+                    It should be assembled with ml.exe.
+
+skein_rot_search.c  This is the program that searches for the Threefish rotation 
+                    constants. It has many different command-line switches, but by
+                    default it generates the constants used in the Skein paper.
+                    This file is a stand-alone C file. To run it, simply re-direct
+                    the output to a text file:  "skein_rot_search > srs_log.txt".
+                    Note that it takes nearly 3 DAYS on a Core 2 Duo to complete
+                    program execution in this case. Alternately, to generate individual
+                    files, run the following command lines:
+                        skein_rot_search -b256  > srs_256.txt
+                        skein_rot_search -b512  > srs_512.txt
+                        skein_rot_search -b1024 > srs_1024.txt
+
+srs_256.txt         These three files contain the results of running skein_rot_search.exe
+srs_512.txt         for the three different Skein block sizes. They are rather large.
+srs_1024.txt        At the end of each file, the "finalists" are re-graded with different
+                    numbers of random samples.
+
+Atmel_AVR.c         This file was used to compile on the Atmel AVR 8-bit CPU.
+                    It includes the optimized versions of skein.c and skein_block.c
+                    with compile-time settings to only implement one at a time.
+                    This was compiled with the free AVR tool set from Atmel
+                    and simulated to give the 8-bit C performance numbers.
+
+skein_8bit_estimates.xls
+                    This file is a spreadsheet used to generate the estimates for
+                    code size and speed of assembly versions of Skein on the Atmel
+                    8-bit CPU family. Note that this is MUCH faster than the C
+                    versions, since it uses static variables, with optimized loading
+                    and rotations.  No attempt is made here to minimize code size by 
+                    sharing code using calls, although the code size could be shrunk 
+                    significantly using calls, at some cost in performance.
+
+skein_perf_core2.txt
+                    This file contains code size and performance data running on
+                    an Intel Core 2 Duo CPU under Windows Vista 64-bit, using the
+                    Microsoft and other compilers and assemblers. It includes
+                    results for both 32-bit and 64-bit code. 
+
+skein_MSC_v9_perf.txt
+                    This file contains a subset of the skein_perf_core2.txt file,
+                    including only results from the MSVC 2008 compiler, with message
+                    sizes that are powers of 10.
+
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+The following files are included in the KAT_MCT directory, in addition to the 
+KAT/MCT files required by NIST:
+
+genKAT.c            NIST-supplied source file for generating KAT_MCT vectors.
+                    This module should be compiled together with the Skein source 
+                    files (i.e., from the Reference or the Optimized directories) 
+                    to generate an executable genKAT.exe, which can generate the
+                    KAT_MCT vectors.
+                    [FWIW, compiling this source file under gcc gives several nasty compiler warnings!]
+                    
+skein_golden_kat.txt
+                    The "golden" KAT file generated using "skein_test.exe -k". This 
+                    file tries to cover various block sizes, message sizes, and output 
+                    sizes, as well as MAC modes. It is used for testing compliance of
+                    a Skein implementation, using skein_test.c
+
+skein_golden_kat_internals.txt
+                    The KAT file generated using "skein_test.exe -k -dc". It covers
+                    the same tests as "skein_golden_kat.txt", but also prints out
+                    intermediate (round-by-round) values. The file is very large, but
+                    it is quite useful in debugging when porting Skein to a new
+                    CPU platform and/or programming language.
+
+skein_golden_kat_short.txt
+                    This is a shorter version (subset) of skein_golden_kat.txt
+
+skein_golden_kat_short_internals.txt
+                    This is a shorter version (subset) of skein_golden_kat_internals.txt
diff --git a/Reference_Implementation/SHA3api_ref.c b/Reference_Implementation/SHA3api_ref.c
new file mode 100644
index 0000000000000..6861a3e4bffb2
--- /dev/null
+++ b/Reference_Implementation/SHA3api_ref.c
@@ -0,0 +1,115 @@
+/***********************************************************************
+**
+** Implementation of the AHS API using the Skein hash function.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+** 
+************************************************************************/
+
+#include <string.h>     /* get the memcpy/memset functions */
+#include "skein.h"      /* get the Skein API definitions   */
+#include "SHA3api_ref.h"/* get the  AHS  API definitions   */
+
+/******************************************************************/
+/*     AHS API code                                               */
+/******************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Init: pick the Skein variant from hashbitlen, record its state size in state->statebits, and init that context */
+HashReturn Init(hashState *state, int hashbitlen)
+    {   /* returns the selected Skein*_Init() status (BAD_HASHLEN for hashbitlen <= 0 when Skein_Assert checking is compiled in) */
+    Skein_Assert(hashbitlen > 0,BAD_HASHLEN);   /* checked for every variant, before a negative int reaches a (size_t) cast */
+#if SKEIN_256_NIST_MAX_HASHBITS                 /* the name SHA3api_ref.h defines; a zero limit compiles Skein-256 out */
+    if (hashbitlen <= SKEIN_256_NIST_MAX_HASHBITS)
+        {
+        state->statebits = 64*SKEIN_256_STATE_WORDS;
+        return Skein_256_Init(&state->u.ctx_256,(size_t) hashbitlen);
+        }
+#endif
+    if (hashbitlen <= SKEIN_512_NIST_MAX_HASHBITS)
+        {
+        state->statebits = 64*SKEIN_512_STATE_WORDS;
+        return Skein_512_Init(&state->u.ctx_512,(size_t) hashbitlen);
+        }
+    else
+        {
+        state->statebits = 64*SKEIN1024_STATE_WORDS;
+        return Skein1024_Init(&state->u.ctx1024,(size_t) hashbitlen);
+        }
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Update: absorb databitlen bits of data into the Skein context selected by state->statebits */
+HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen)
+    {
+    /* only the final Update() call is allowed to do partial bytes, else assert an error */
+    Skein_Assert((state->u.h.T[1] & SKEIN_T1_FLAG_BIT_PAD) == 0 || databitlen == 0, FAIL);
+
+    Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,FAIL);
+    if ((databitlen & 7) == 0)  /* whole bytes only (no partial final byte)? */
+        {
+        switch ((state->statebits >> 8) & 3)   /* statebits/256 mod 4: 512 -> 2, 256 -> 1, 1024 -> 0 */
+            {
+            case 2:  return Skein_512_Update(&state->u.ctx_512,data,databitlen >> 3);
+            case 1:  return Skein_256_Update(&state->u.ctx_256,data,databitlen >> 3);
+            case 0:  return Skein1024_Update(&state->u.ctx1024,data,databitlen >> 3);
+            default: return FAIL;
+            }
+        }
+    else
+        {   /* handle partial final byte */
+        size_t bCnt = (databitlen >> 3) + 1;                  /* number of bytes to handle (nonzero here!) */
+        u08b_t b,mask;
+
+        mask = (u08b_t) (1u << (7 - (databitlen & 7)));       /* single bit just below the last valid message bit */
+        b    = (u08b_t) ((data[bCnt-1] & (0-mask)) | mask);   /* keep the bits above mask, set the mask bit, clear the bits below it */
+
+        switch ((state->statebits >> 8) & 3)   /* same variant selector as in the whole-byte path */
+            {
+            case 2:  Skein_512_Update(&state->u.ctx_512,data,bCnt-1); /* process all but the final byte    */
+                     Skein_512_Update(&state->u.ctx_512,&b  ,  1   ); /* process the (masked) partial byte */
+                     break;
+            case 1:  Skein_256_Update(&state->u.ctx_256,data,bCnt-1); /* process all but the final byte    */
+                     Skein_256_Update(&state->u.ctx_256,&b  ,  1   ); /* process the (masked) partial byte */
+                     break;
+            case 0:  Skein1024_Update(&state->u.ctx1024,data,bCnt-1); /* process all but the final byte    */
+                     Skein1024_Update(&state->u.ctx1024,&b  ,  1   ); /* process the (masked) partial byte */
+                     break;
+            default: return FAIL;
+            }
+        Skein_Set_Bit_Pad_Flag(state->u.h);                    /* set tweak flag for the final call; the first assert above tests this flag */
+        
+        return SUCCESS;   /* NOTE(review): the Skein*_Update() results in this branch are not checked */
+        }
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Final: finish the hash in whichever Skein context state->statebits selects and write the digest to hashval */
+HashReturn Final(hashState *state, BitSequence *hashval)
+    {
+    uint_t sel;                                 /* variant selector: 2 => 512, 1 => 256, 0 => 1024 */
+    Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,FAIL);
+    sel = (state->statebits >> 8) & 3;
+    if (sel == 2) return Skein_512_Final(&state->u.ctx_512,hashval);
+    if (sel == 1) return Skein_256_Final(&state->u.ctx_256,hashval);
+    if (sel == 0) return Skein1024_Final(&state->u.ctx1024,hashval);
+
+    return FAIL;                                /* sel == 3 matches no state size */
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Hash: one-shot Init + Update + Final over databitlen bits of data; returns the first non-SUCCESS status, else SUCCESS */
+HashReturn Hash(int hashbitlen, const BitSequence *data, /* all-in-one call */
+                DataLength databitlen,BitSequence *hashval)
+    {
+    hashState  state;
+    HashReturn r = Init(&state,hashbitlen);
+    if (r == SUCCESS)
+        r = Update(&state,data,databitlen);
+    if (r == SUCCESS)                           /* do not finalize a context whose Init/Update failed */
+        r = Final(&state,hashval);              /* and report Final's own status instead of dropping it */
+
+    return r;
+    }
diff --git a/Reference_Implementation/SHA3api_ref.h b/Reference_Implementation/SHA3api_ref.h
new file mode 100644
index 0000000000000..6d62304e59b7e
--- /dev/null
+++ b/Reference_Implementation/SHA3api_ref.h
@@ -0,0 +1,66 @@
+#ifndef _AHS_API_H_
+#define _AHS_API_H_
+
+/***********************************************************************
+**
+** Interface declarations of the AHS API using the Skein hash function.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+** 
+************************************************************************/
+
+#include "skein.h"
+
+typedef enum
+    {
+    SUCCESS     = SKEIN_SUCCESS,            /* each status code is defined as the matching  */
+    FAIL        = SKEIN_FAIL,               /* Skein API value, so a Skein return code can  */
+    BAD_HASHLEN = SKEIN_BAD_HASHLEN         /* be returned directly as a HashReturn         */
+    }
+    HashReturn;
+
+typedef size_t   DataLength;                /* bit count  type (message lengths are given in bits) */
+typedef u08b_t   BitSequence;               /* bit stream type (one byte of a message/digest buffer) */
+
+typedef struct
+    {
+    uint_t  statebits;                      /* 256, 512, or 1024: tells which union member is in use */
+    union
+        {
+        Skein_Ctxt_Hdr_t h;                 /* common header "overlay" */
+        Skein_256_Ctxt_t ctx_256;           /* context when statebits ==  256 */
+        Skein_512_Ctxt_t ctx_512;           /* context when statebits ==  512 */
+        Skein1024_Ctxt_t ctx1024;           /* context when statebits == 1024 */
+        } u;
+    }
+    hashState;
+
+/* "incremental" hashing API: Init the state, feed data with Update, then Final writes the digest */
+HashReturn Init  (hashState *state, int hashbitlen);                                   /* hashbitlen = digest length in bits */
+HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen);   /* databitlen = input length in bits  */
+HashReturn Final (hashState *state,       BitSequence *hashval);
+
+/* "all-in-one" call: Init + Update + Final on a local hashState */
+HashReturn Hash  (int hashbitlen,   const BitSequence *data, 
+                  DataLength databitlen,  BitSequence *hashval);
+
+
+/*
+** Re-define the compile-time constants below to change the selection
+** of the Skein state size in the Init() function in SHA3api_ref.c.
+**
+** That is, the NIST API does not allow for explicit selection of the
+** Skein block size, so it must be done implicitly in the Init() function.
+** The selection is controlled by these constants (each is an inclusive upper limit on hashbitlen).
+*/
+#ifndef SKEIN_256_NIST_MAX_HASHBITS
+#define SKEIN_256_NIST_MAX_HASHBITS (0)
+#endif
+
+#ifndef SKEIN_512_NIST_MAX_HASHBITS
+#define SKEIN_512_NIST_MAX_HASHBITS (512)
+#endif
+
+#endif  /* ifndef _AHS_API_H_ */
diff --git a/Reference_Implementation/brg_endian.h b/Reference_Implementation/brg_endian.h
new file mode 100644
index 0000000000000..978eb33f08cf1
--- /dev/null
+++ b/Reference_Implementation/brg_endian.h
@@ -0,0 +1,148 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue 20/10/2006
+*/
+
+#ifndef BRG_ENDIAN_H
+#define BRG_ENDIAN_H
+
+#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
+#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */
+
+/* Include the system header(s), chosen by OS/compiler macros, where endian defines and byteswap functions may reside */
+#if defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
+#  include <sys/endian.h>
+#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
+      defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
+#  include <machine/endian.h>
+#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
+#  if !defined( __MINGW32__ ) && !defined(AVR)   /* these two targets get no endian header from this block */
+#    include <endian.h>
+#    if !defined( __BEOS__ )
+#      include <byteswap.h>
+#    endif
+#  endif
+#endif
+
+/* Now attempt to set PLATFORM_BYTE_ORDER from any of the four forms    */
+/* SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__: when both endian names are   */
+/* defined the matching BYTE_ORDER form decides, else the lone one wins */
+
+#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
+#  if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
+#  if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( _BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( _LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
+#  if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( __BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
+#  if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( __BIG_ENDIAN__ )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN__ )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+/*  if no endian symbol above settled the byte order, fall back to   */
+/*  common machine/compiler defines; anything unknown hits the #error */
+#if !defined(PLATFORM_BYTE_ORDER)
+
+#if   defined( __alpha__ ) || defined( __alpha ) || defined( i386 )       || \
+      defined( __i386__ )  || defined( _M_I86 )  || defined( _M_IX86 )    || \
+      defined( __OS2__ )   || defined( sun386 )  || defined( __TURBOC__ ) || \
+      defined( vax )       || defined( vms )     || defined( VMS )        || \
+      defined( __VMS )     || defined( _M_X64 )  || defined( AVR )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+
+#elif defined( AMIGA )   || defined( applec )    || defined( __AS400__ )  || \
+      defined( _CRAY )   || defined( __hppa )    || defined( __hp9000 )   || \
+      defined( ibm370 )  || defined( mc68000 )   || defined( m68k )       || \
+      defined( __MRC__ ) || defined( __MVS__ )   || defined( __MWERKS__ ) || \
+      defined( sparc )   || defined( __sparc)    || defined( SYMANTEC_C ) || \
+      defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM )   || \
+      defined( THINK_C ) || defined( __VMCMS__ )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+
+#elif 0     /* **** EDIT HERE IF NECESSARY **** (change 0 to 1 to force little-endian) */
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#elif 0     /* **** EDIT HERE IF NECESSARY **** (change 0 to 1 to force big-endian) */
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#else
+#  error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order
+#endif
+#endif
+
+/* special handler for IA64, which may be either endianness (?): it always gets    */
+/* PLATFORM_MUST_ALIGN (1); little-endian is assumed only if the order is still unset */
+#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
+#  define PLATFORM_MUST_ALIGN (1)
+#ifndef PLATFORM_BYTE_ORDER   /* NOTE(review): the #error in the fallback block above fires first when the order is unknown, so this default looks unreachable -- confirm */
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+#endif
+
+#ifndef   PLATFORM_MUST_ALIGN   /* every other platform: default to (0) */
+#  define PLATFORM_MUST_ALIGN (0)
+#endif
+
+#endif  /* ifndef BRG_ENDIAN_H */
diff --git a/Reference_Implementation/brg_types.h b/Reference_Implementation/brg_types.h
new file mode 100644
index 0000000000000..d6d6cdab9fbfd
--- /dev/null
+++ b/Reference_Implementation/brg_types.h
@@ -0,0 +1,188 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue 09/09/2006
+
+ The unsigned integer types defined here are of the form uint_<nn>t where
+ <nn> is the length of the type; for example, the unsigned 32-bit type is
+ 'uint_32t'.  These are NOT the same as the 'C99 integer types' that are
+ defined in the inttypes.h and stdint.h headers since attempts to use these
+ types have shown that support for them is still highly variable.  However,
+ since the latter are of the form uint<nn>_t, a regular expression search
+ and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t')
+ can be used to convert the types used here to the C99 standard types.
+*/
+
+#ifndef BRG_TYPES_H
+#define BRG_TYPES_H
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include <limits.h>
+
+#ifndef BRG_UI8    /* skipped when BRG_UI8 is already defined (presumably by code supplying its own uint_8t) */
+#  define BRG_UI8
+#  if UCHAR_MAX == 255u
+     typedef unsigned char uint_8t;
+#  else
+#    error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h
+#  endif
+#endif
+
+#ifndef BRG_UI16   /* same scheme for the 16-bit type */
+#  define BRG_UI16
+#  if USHRT_MAX == 65535u
+     typedef unsigned short uint_16t;
+#  else
+#    error Please define uint_16t as a 16-bit unsigned short type in brg_types.h
+#  endif
+#endif
+
+#ifndef BRG_UI32   /* 32-bit type, plus li_32(h): pastes 0x and the right suffix around hex digits h */
+#  define BRG_UI32
+#  if UINT_MAX == 4294967295u
+#    define li_32(h) 0x##h##u
+     typedef unsigned int uint_32t;
+#  elif ULONG_MAX == 4294967295u
+#    define li_32(h) 0x##h##ul
+     typedef unsigned long uint_32t;
+#  elif defined( _CRAY )
+#    error This code needs 32-bit data types, which Cray machines do not provide
+#  else
+#    error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h
+#  endif
+#endif
+
+#ifndef BRG_UI64   /* 64-bit type and li_64(h) literal macro; BRG_UI64 is defined only by a branch that finds a type */
+#  if defined( __BORLANDC__ ) && !defined( __MSDOS__ )
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ui64
+     typedef unsigned __int64 uint_64t;
+#  elif defined( _MSC_VER ) && ( _MSC_VER < 1300 )    /* 1300 == VC++ 7.0 */
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ui64
+     typedef unsigned __int64 uint_64t;
+#  elif defined( __sun ) && defined(ULONG_MAX) && ULONG_MAX == 0xfffffffful
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ull
+     typedef unsigned long long uint_64t;
+#  elif defined( UINT_MAX ) && UINT_MAX > 4294967295u
+#    if UINT_MAX == 18446744073709551615u
+#      define BRG_UI64
+#      define li_64(h) 0x##h##u
+       typedef unsigned int uint_64t;
+#    endif
+#  elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u
+#    if ULONG_MAX == 18446744073709551615ul
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ul
+       typedef unsigned long uint_64t;
+#    endif
+#  elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u
+#    if ULLONG_MAX == 18446744073709551615ull
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ull
+       typedef unsigned long long uint_64t;
+#    endif
+#  elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u
+#    if ULONG_LONG_MAX == 18446744073709551615ull
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ull
+       typedef unsigned long long uint_64t;
+#    endif
+#  elif defined(__GNUC__)  /* DLW: avoid mingw problem with -ansi */
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ull
+       typedef unsigned long long uint_64t;
+#  endif
+#endif
+
+#if defined( NEED_UINT_64T ) && !defined( BRG_UI64 )   /* a missing 64-bit type is an error only if the includer asked for one */
+#  error Please define uint_64t as an unsigned 64 bit type in brg_types.h
+#endif
+
+#ifndef RETURN_VALUES   /* VOID_RETURN / INT_RETURN: void/int return type plus any DLL export/import or calling-convention decoration */
+#  define RETURN_VALUES
+#  if defined( DLL_EXPORT )
+#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+#      define VOID_RETURN    __declspec( dllexport ) void __stdcall
+#      define INT_RETURN     __declspec( dllexport ) int  __stdcall
+#    elif defined( __GNUC__ )
+#      define VOID_RETURN    __declspec( __dllexport__ ) void
+#      define INT_RETURN     __declspec( __dllexport__ ) int
+#    else
+#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#    endif
+#  elif defined( DLL_IMPORT )
+#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+#      define VOID_RETURN    __declspec( dllimport ) void __stdcall
+#      define INT_RETURN     __declspec( dllimport ) int  __stdcall
+#    elif defined( __GNUC__ )
+#      define VOID_RETURN    __declspec( __dllimport__ ) void
+#      define INT_RETURN     __declspec( __dllimport__ ) int
+#    else
+#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#    endif
+#  elif defined( __WATCOMC__ )
+#    define VOID_RETURN  void __cdecl
+#    define INT_RETURN   int  __cdecl
+#  else
+#    define VOID_RETURN  void
+#    define INT_RETURN   int
+#  endif
+#endif
+
+/*  These defines are used to declare buffers in a way that allows
+    faster operations on longer variables to be used.  In all these
+    defines 'size' must be a power of 2 and >= 8
+
+    dec_unit_type(size,x)       declares a type 'x' (via typedef) of
+                                length 'size' bits
+
+    dec_bufr_type(size,bsize,x) declares a buffer type 'x' of length 'bsize' 
+                                bytes defined as an array of variables
+                                each of 'size' bits (bsize must be a 
+                                multiple of size / 8)
+
+    ptr_cast(x,size)            casts a pointer to a pointer to a 
+                                variable of length 'size' bits
+*/
+
+#define ui_type(size)               uint_##size##t
+#define dec_unit_type(size,x)       typedef ui_type(size) x
+#define dec_bufr_type(size,bsize,x) typedef ui_type(size) x[bsize / (size >> 3)]
+#define ptr_cast(x,size)            ((ui_type(size)*)(x))
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/Reference_Implementation/skein.c b/Reference_Implementation/skein.c
new file mode 100644
index 0000000000000..945baa7b9f782
--- /dev/null
+++ b/Reference_Implementation/skein.c
@@ -0,0 +1,747 @@
+/***********************************************************************
+**
+** Implementation of the Skein hash function.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+** 
+************************************************************************/
+
+#include <string.h>      /* get the memcpy/memset functions */
+#include "skein.h"       /* get the Skein API definitions   */
+
+/*****************************************************************/
+/* External functions to process blkCnt (nonzero) full block(s) of data; callers in this file pass as byteCntAdd the count of real (unpadded) bytes per block. */
+void    Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd);
+void    Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd);
+void    Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd);
+
+/*****************************************************************/
+/*     Portable (i.e., slow) endianness conversion functions     */
+u64b_t Skein_Swap64(u64b_t w64)
+    {    /* convert a word between native and little-endian byte order (the operation is its own inverse) */
+    static const u64b_t ONE = 1;              /* probe word for the run-time endianness test */
+    u64b_t rev = 0;                           /* receives w64 with its eight bytes reversed */
+    int    k;
+
+    /* lowest-addressed byte holds the low-order bits: little-endian host, nothing to swap */
+    if (((const u08b_t *) &ONE)[0] == 1)
+        return w64;
+    /* big-endian host: peel bytes off the bottom of w64 and push them onto rev */
+    for (k=0;k<8;k++)
+        {
+        rev   = (rev << 8) | (w64 & 0xFF);
+        w64 >>= 8;
+        }
+    return rev;
+    }
+
+void    Skein_Put64_LSB_First(u08b_t *dst,const u64b_t *src,size_t bCnt)
+    { /* endian-neutral (but slow): write bCnt bytes, low-order byte of each source word first */
+    size_t i,shift;                           /* output byte index; bit offset of that byte inside its word */
+
+    for (i=0;i!=bCnt;++i)
+        { shift = 8*(i%8); dst[i] = (u08b_t) (src[i/8] >> shift); }
+    }
+
+void    Skein_Get64_LSB_First(u64b_t *dst,const u08b_t *src,size_t wCnt)
+    { /* endian-neutral (but slow): build wCnt 64-bit words from groups of 8 little-endian bytes */
+    size_t w;                                 /* index of the word being assembled */
+    size_t k;                                 /* counts down over the 8 bytes of that word */
+    u64b_t acc;
+
+    for (w=0;w<wCnt;w++)
+        {
+        acc = 0;
+        for (k=8;k>0;k--)                     /* highest-order byte first, so src[8*w] lands in the low 8 bits */
+            acc = (acc << 8) | src[8*w + k-1];
+        dst[w] = acc;
+        }
+    }
+
+/*****************************************************************/
+/*     256-bit Skein                                             */
+/*****************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a straight hashing operation (sequential tree info); hashBitLen is the output length in bits */
+int Skein_256_Init(Skein_256_Ctxt_t *ctx, size_t hashBitLen)
+    {
+    union
+        {
+        u08b_t  b[SKEIN_256_STATE_BYTES];
+        u64b_t  w[SKEIN_256_STATE_WORDS];
+        } cfg;                                  /* config block */
+        
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+
+    /* build/process config block for hashing */
+    ctx->h.hashBitLen = hashBitLen;             /* output hash bit count */
+    Skein_Start_New_Type(ctx,CFG_FINAL);        /* set tweaks: T0=0; T1=CFG | FINAL */
+
+    memset(&cfg.w,0,sizeof(cfg.w));             /* pre-pad cfg.w[] with zeroes */
+    cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);  /* set the schema, version */
+    cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+    cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+
+    /* compute the initial chaining values from config block */
+    memset(ctx->X,0,sizeof(ctx->X));            /* zero the chaining variables */
+    Skein_256_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+
+    /* The chaining vars ctx->X are now initialized for the given hashBitLen. */
+    /* Set up to process the data message portion of the hash (default) */
+    Skein_Start_New_Type(ctx,MSG);              /* T0=0, T1= MSG type, h.bCnt=0 */
+
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a MAC and/or tree hash operation: optional key (keyBytes long) and caller-chosen treeInfo */
+/* [identical to Skein_256_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */
+int Skein_256_InitExt(Skein_256_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes)
+    {
+    uint_t i;
+    union
+        {
+        u08b_t  b[SKEIN_256_STATE_BYTES];
+        u64b_t  w[SKEIN_256_STATE_WORDS];
+        } cfg;                                  /* config block */
+        
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL);
+
+    /* compute the initial chaining values ctx->X[], based on key */
+    if (keyBytes == 0)                          /* is there a key? */
+        {                                   
+        memset(ctx->X,0,sizeof(ctx->X));        /* no key: use all zeroes as key for config block */
+        }
+    else                                        /* here to pre-process a key */
+        {
+        Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X));
+        /* do a mini-Init right here */
+        ctx->h.hashBitLen=8*sizeof(ctx->X);     /* set output hash bit count = state size */
+        Skein_Start_New_Type(ctx,KEY);          /* set tweaks: T0 = 0; T1 = KEY type */
+        memset(ctx->X,0,sizeof(ctx->X));        /* zero the initial chaining variables */
+        Skein_256_Update(ctx,key,keyBytes);     /* hash the key */
+        Skein_256_Final_Pad(ctx,cfg.b);         /* put result into cfg.b[] */
+        memcpy(ctx->X,cfg.b,sizeof(ctx->X));    /* copy over into ctx->X[]; sized by the destination, which the assert above says cfg.b can fill */
+        for (i=0;i<SKEIN_256_STATE_WORDS;i++)   /* convert key bytes to context words */
+            ctx->X[i] = Skein_Swap64(ctx->X[i]);
+        }
+
+    /* build/process the config block, type == CONFIG (could be precomputed for each key) */
+    ctx->h.hashBitLen = hashBitLen;             /* output hash bit count */
+    Skein_Start_New_Type(ctx,CFG_FINAL);
+
+    memset(&cfg.w,0,sizeof(cfg.w));             /* pre-pad cfg.w[] with zeroes */
+    cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+    cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+    cfg.w[2] = Skein_Swap64(treeInfo);          /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+
+    Skein_Show_Key(256,&ctx->h,key,keyBytes);
+
+    /* compute the initial chaining values from config block */
+    Skein_256_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+
+    /* The chaining vars ctx->X are now initialized */
+    /* Set up to process the data message portion of the hash */
+    Skein_Start_New_Type(ctx,MSG);              /* T0=0, T1= MSG type, h.bCnt=0 */
+
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* process the input bytes: a block is consumed only once more data follows it, so the tail (up to one full block) stays in ctx->b[] */
+int Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
+    {
+    size_t n;
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL);     /* catch uninitialized context */
+
+    /* process full blocks, if any (only when buffered + new data exceed one block) */
+    if (msgByteCnt + ctx->h.bCnt > SKEIN_256_BLOCK_BYTES)
+        {
+        if (ctx->h.bCnt)                              /* finish up any buffered message data */
+            {
+            n = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt;  /* # bytes free in buffer b[] */
+            if (n)
+                {
+                Skein_assert(n < msgByteCnt);         /* check on our logic here */
+                memcpy(&ctx->b[ctx->h.bCnt],msg,n);
+                msgByteCnt  -= n;
+                msg         += n;
+                ctx->h.bCnt += n;
+                }
+            Skein_assert(ctx->h.bCnt == SKEIN_256_BLOCK_BYTES);
+            Skein_256_Process_Block(ctx,ctx->b,1,SKEIN_256_BLOCK_BYTES);
+            ctx->h.bCnt = 0;
+            }
+        /* now process any remaining full blocks, directly from input message data */
+        if (msgByteCnt > SKEIN_256_BLOCK_BYTES)
+            {
+            n = (msgByteCnt-1) / SKEIN_256_BLOCK_BYTES;   /* number of full blocks to process (the -1 holds back the last block even on an exact multiple) */
+            Skein_256_Process_Block(ctx,msg,n,SKEIN_256_BLOCK_BYTES);
+            msgByteCnt -= n * SKEIN_256_BLOCK_BYTES;
+            msg        += n * SKEIN_256_BLOCK_BYTES;
+            }
+        Skein_assert(ctx->h.bCnt == 0);
+        }
+
+    /* copy any remaining source message data bytes into b[] */
+    if (msgByteCnt)
+        {
+        Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES);
+        memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
+        ctx->h.bCnt += msgByteCnt;
+        }
+
+    return SKEIN_SUCCESS;
+    }
+   
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finalize the hash computation and output the result: (hashBitLen+7)/8 bytes are written to hashVal */
+int Skein_256_Final(Skein_256_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    size_t i,n,byteCnt;
+    u64b_t X[SKEIN_256_STATE_WORDS],ctr;       /* saved counter mode "key"; current counter word */
+    Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;        /* tag as the final block */
+    if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES)   /* zero pad b[] if necessary */
+        memset(&ctx->b[ctx->h.bCnt],0,SKEIN_256_BLOCK_BYTES - ctx->h.bCnt);
+    Skein_256_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);    /* process the final block */
+    
+    /* now output the result */
+    byteCnt = (ctx->h.hashBitLen + 7) >> 3;    /* total number of output bytes */
+
+    /* run Threefish in "counter mode" to generate output */
+    memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
+    memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
+    for (i=0;i*SKEIN_256_BLOCK_BYTES < byteCnt;i++)
+        {
+        ctr = Skein_Swap64((u64b_t) i);          /* counter value, laid out little-endian in memory */
+        memcpy(ctx->b,&ctr,sizeof(ctr));         /* build the counter block (byte copy: no u64b_t store through the u08b_t buffer) */
+        Skein_Start_New_Type(ctx,OUT_FINAL);
+        Skein_256_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+        n = byteCnt - i*SKEIN_256_BLOCK_BYTES;   /* number of output bytes left to go */
+        if (n >= SKEIN_256_BLOCK_BYTES) n = SKEIN_256_BLOCK_BYTES;   /* at most one block per pass */
+        Skein_Put64_LSB_First(hashVal+i*SKEIN_256_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
+        Skein_Show_Final(256,&ctx->h,n,hashVal+i*SKEIN_256_BLOCK_BYTES);
+        memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+        }
+    return SKEIN_SUCCESS;
+    }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t Skein_256_API_CodeSize(void)
+    {   /* byte distance from Skein_256_Init to this function; NOTE(review): presumably the 256-bit API code size, which assumes functions are laid out in source order -- confirm */
+    return ((u08b_t *) Skein_256_API_CodeSize) -
+           ((u08b_t *) Skein_256_Init);
+    }
+#endif
+
+/*****************************************************************/
+/*     512-bit Skein                                             */
+/*****************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a straight hashing operation: derives the starting chaining values for hashBitLen and returns SKEIN_SUCCESS */
+int Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen)
+    {
+    union
+        {
+        u08b_t  b[SKEIN_512_STATE_BYTES];       /* byte view, handed to Skein_512_Process_Block */
+        u64b_t  w[SKEIN_512_STATE_WORDS];       /* word view, used to fill in the fields */
+        } cfg;                                  /* config block */
+        
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);   /* hashBitLen must be non-zero */
+
+    /* build/process config block for hashing */
+    ctx->h.hashBitLen = hashBitLen;             /* output hash bit count */
+    Skein_Start_New_Type(ctx,CFG_FINAL);        /* set tweaks: T0=0; T1=CFG | FINAL */
+
+    memset(&cfg.w,0,sizeof(cfg.w));             /* pre-pad cfg.w[] with zeroes */
+    cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);  /* set the schema, version */
+    cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+    cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);   /* plain sequential hashing, no tree */
+
+    /* compute the initial chaining values from config block */
+    memset(ctx->X,0,sizeof(ctx->X));            /* zero the chaining variables */
+    Skein_512_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);   /* one block; SKEIN_CFG_STR_LEN is passed as byteCntAdd */
+
+    /* The chaining vars ctx->X are now initialized for the given hashBitLen. */
+    /* Set up to process the data message portion of the hash (default) */
+    Skein_Start_New_Type(ctx,MSG);              /* T0=0, T1= MSG type, h.bCnt=0 */
+
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a MAC and/or tree hash operation: an optional key is hashed into the starting state, then the config block is processed */
+/* [identical to Skein_512_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */
+int Skein_512_InitExt(Skein_512_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes)
+    {
+    uint_t i;
+    union
+        {
+        u08b_t  b[SKEIN_512_STATE_BYTES];
+        u64b_t  w[SKEIN_512_STATE_WORDS];
+        } cfg;                                  /* config block; also receives the hashed key */
+        
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);          /* hashBitLen must be non-zero */
+    Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL);   /* a non-empty key needs a valid pointer */
+
+    /* compute the initial chaining values ctx->X[], based on key */
+    if (keyBytes == 0)                          /* is there a key? */
+        {
+        memset(ctx->X,0,sizeof(ctx->X));        /* no key: use all zeroes as key for config block */
+        }
+    else                                        /* here to pre-process a key */
+        {
+        Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X));   /* cfg.b must be able to supply a full state */
+        /* do a mini-Init right here */
+        ctx->h.hashBitLen=8*sizeof(ctx->X);     /* set output hash bit count = state size */
+        Skein_Start_New_Type(ctx,KEY);          /* set tweaks: T0 = 0; T1 = KEY type */
+        memset(ctx->X,0,sizeof(ctx->X));        /* zero the initial chaining variables */
+        Skein_512_Update(ctx,key,keyBytes);     /* hash the key */
+        Skein_512_Final_Pad(ctx,cfg.b);         /* put result into cfg.b[] */
+        memcpy(ctx->X,cfg.b,sizeof(ctx->X));    /* copy over into ctx->X[]; sized by the destination so X[] cannot be overrun */
+        for (i=0;i<SKEIN_512_STATE_WORDS;i++)   /* convert key bytes to context words */
+            ctx->X[i] = Skein_Swap64(ctx->X[i]);
+        }
+
+    /* build/process the config block, type == CONFIG (could be precomputed for each key) */
+    ctx->h.hashBitLen = hashBitLen;             /* output hash bit count */
+    Skein_Start_New_Type(ctx,CFG_FINAL);        /* set tweaks: T0=0; T1=CFG | FINAL */
+
+    memset(&cfg.w,0,sizeof(cfg.w));             /* pre-pad cfg.w[] with zeroes */
+    cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);  /* set the schema, version */
+    cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+    cfg.w[2] = Skein_Swap64(treeInfo);          /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+
+    Skein_Show_Key(512,&ctx->h,key,keyBytes);   /* presumably a SKEIN_DEBUG display callout; defined elsewhere */
+
+    /* compute the initial chaining values from config block */
+    Skein_512_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+
+    /* The chaining vars ctx->X are now initialized */
+    /* Set up to process the data message portion of the hash */
+    Skein_Start_New_Type(ctx,MSG);              /* T0=0, T1= MSG type, h.bCnt=0 */
+
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* process the input bytes: absorbs msgByteCnt bytes from msg, buffering what does not yet need processing; returns SKEIN_SUCCESS */
+int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
+    {
+    size_t n;                                         /* scratch: free bytes in b[], then count of full blocks */
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);     /* catch uninitialized context */
+
+    /* process full blocks, if any (strict '>': input that only just fills b[] stays buffered) */
+    if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES)
+        {
+        if (ctx->h.bCnt)                              /* finish up any buffered message data */
+            {
+            n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;  /* # bytes free in buffer b[] */
+            if (n)                                    /* n == 0 means b[] was already full */
+                {
+                Skein_assert(n < msgByteCnt);         /* check on our logic here */
+                memcpy(&ctx->b[ctx->h.bCnt],msg,n);   /* top up b[] to a whole block */
+                msgByteCnt  -= n;
+                msg         += n;
+                ctx->h.bCnt += n;
+                }
+            Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES);
+            Skein_512_Process_Block(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES);   /* process the now-full buffer */
+            ctx->h.bCnt = 0;
+            }
+        /* now process any remaining full blocks, directly from input message data */
+        if (msgByteCnt > SKEIN_512_BLOCK_BYTES)
+            {
+            n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES;   /* number of full blocks to process; the -1 keeps 1..BLOCK_BYTES bytes back for b[] */
+            Skein_512_Process_Block(ctx,msg,n,SKEIN_512_BLOCK_BYTES);
+            msgByteCnt -= n * SKEIN_512_BLOCK_BYTES;
+            msg        += n * SKEIN_512_BLOCK_BYTES;
+            }
+        Skein_assert(ctx->h.bCnt == 0);
+        }
+
+    /* copy any remaining source message data bytes into b[] */
+    if (msgByteCnt)
+        {
+        Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES);
+        memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
+        ctx->h.bCnt += msgByteCnt;
+        }
+
+    return SKEIN_SUCCESS;
+    }
+   
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finalize the hash computation and output the result: writes (hashBitLen+7)/8 bytes to hashVal and returns SKEIN_SUCCESS */
+int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    size_t i,n,byteCnt;                        /* i: output block counter, n: bytes emitted in the current pass */
+    u64b_t X[SKEIN_512_STATE_WORDS];           /* saved chaining state, reused as the key for every output block */
+    Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+    /* NOTE(review): Skein_Assert is defined elsewhere; per the skein.h notes its effect depends on SKEIN_ERR_CHECK */
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;                 /* tag as the final block */
+    if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)   /* zero pad b[] if necessary */
+        memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+    /* bCnt (not the padded size) is passed as byteCntAdd */
+    Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);  /* process the final block */
+    
+    /* now output the result */
+    byteCnt = (ctx->h.hashBitLen + 7) >> 3;             /* total number of output bytes (bit length rounded up) */
+    /* hashVal must have room for byteCnt bytes: the loop below writes exactly that many */
+    /* run Threefish in "counter mode" to generate the output */
+    memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
+    memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
+    for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++)   /* one pass per (possibly partial) output block */
+        {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+        Skein_Start_New_Type(ctx,OUT_FINAL);             /* set tweaks for an output block: OUT type | FINAL flag */
+        Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+        n = byteCnt - i*SKEIN_512_BLOCK_BYTES;   /* number of output bytes left to go */
+        if (n >= SKEIN_512_BLOCK_BYTES)          /* emit at most one block per pass */
+            n  = SKEIN_512_BLOCK_BYTES;
+        Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
+        Skein_Show_Final(512,&ctx->h,n,hashVal+i*SKEIN_512_BLOCK_BYTES);   /* presumably a SKEIN_DEBUG display callout; defined elsewhere */
+        memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+        }
+
+    return SKEIN_SUCCESS;
+    }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t Skein_512_API_CodeSize(void)   /* byte distance from Skein_512_Init to this function */
+    {
+    const u08b_t *startMark = (const u08b_t *) Skein_512_Init;   /* NOTE(review): presumably relies on functions being laid out in source order */
+    return (size_t) (((const u08b_t *) Skein_512_API_CodeSize) - startMark);
+    }
+#endif
+
+/*****************************************************************/
+/*    1024-bit Skein                                             */
+/*****************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a straight hashing operation: derives the starting chaining values for hashBitLen and returns SKEIN_SUCCESS */
+int Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen)
+    {
+    union
+        {
+        u08b_t  b[SKEIN1024_STATE_BYTES];       /* byte view, handed to Skein1024_Process_Block */
+        u64b_t  w[SKEIN1024_STATE_WORDS];       /* word view, used to fill in the fields */
+        } cfg;                                  /* config block */
+        
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);   /* hashBitLen must be non-zero */
+
+    /* build/process config block for hashing */
+    ctx->h.hashBitLen = hashBitLen;             /* output hash bit count */
+    Skein_Start_New_Type(ctx,CFG_FINAL);        /* set tweaks: T0=0; T1=CFG | FINAL */
+
+    memset(&cfg.w,0,sizeof(cfg.w));             /* pre-pad cfg.w[] with zeroes */
+    cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);  /* set the schema, version */
+    cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+    cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);   /* plain sequential hashing, no tree */
+
+    /* compute the initial chaining values from config block */
+    memset(ctx->X,0,sizeof(ctx->X));            /* zero the chaining variables */
+    Skein1024_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);   /* one block; SKEIN_CFG_STR_LEN is passed as byteCntAdd */
+
+    /* The chaining vars ctx->X are now initialized for the given hashBitLen. */
+    /* Set up to process the data message portion of the hash (default) */
+    Skein_Start_New_Type(ctx,MSG);              /* T0=0, T1= MSG type, h.bCnt=0 */
+
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a MAC and/or tree hash operation: an optional key is hashed into the starting state, then the config block is processed */
+/* [identical to Skein1024_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */
+int Skein1024_InitExt(Skein1024_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes)
+    {
+    uint_t i;
+    union
+        {
+        u08b_t  b[SKEIN1024_STATE_BYTES];
+        u64b_t  w[SKEIN1024_STATE_WORDS];
+        } cfg;                                  /* config block; also receives the hashed key */
+        
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);          /* hashBitLen must be non-zero */
+    Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL);   /* a non-empty key needs a valid pointer */
+
+    /* compute the initial chaining values ctx->X[], based on key */
+    if (keyBytes == 0)                          /* is there a key? */
+        {
+        memset(ctx->X,0,sizeof(ctx->X));        /* no key: use all zeroes as key for config block */
+        }
+    else                                        /* here to pre-process a key */
+        {
+        Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X));   /* cfg.b must be able to supply a full state */
+        /* do a mini-Init right here */
+        ctx->h.hashBitLen=8*sizeof(ctx->X);     /* set output hash bit count = state size */
+        Skein_Start_New_Type(ctx,KEY);          /* set tweaks: T0 = 0; T1 = KEY type */
+        memset(ctx->X,0,sizeof(ctx->X));        /* zero the initial chaining variables */
+        Skein1024_Update(ctx,key,keyBytes);     /* hash the key */
+        Skein1024_Final_Pad(ctx,cfg.b);         /* put result into cfg.b[] */
+        memcpy(ctx->X,cfg.b,sizeof(ctx->X));    /* copy over into ctx->X[]; sized by the destination so X[] cannot be overrun */
+        for (i=0;i<SKEIN1024_STATE_WORDS;i++)   /* convert key bytes to context words */
+            ctx->X[i] = Skein_Swap64(ctx->X[i]);
+        }
+
+    /* build/process the config block, type == CONFIG (could be precomputed for each key) */
+    ctx->h.hashBitLen = hashBitLen;             /* output hash bit count */
+    Skein_Start_New_Type(ctx,CFG_FINAL);        /* set tweaks: T0=0; T1=CFG | FINAL */
+
+    memset(&cfg.w,0,sizeof(cfg.w));             /* pre-pad cfg.w[] with zeroes */
+    cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);  /* set the schema, version */
+    cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+    cfg.w[2] = Skein_Swap64(treeInfo);          /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+
+    Skein_Show_Key(1024,&ctx->h,key,keyBytes);  /* presumably a SKEIN_DEBUG display callout; defined elsewhere */
+
+    /* compute the initial chaining values from config block */
+    Skein1024_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+
+    /* The chaining vars ctx->X are now initialized */
+    /* Set up to process the data message portion of the hash */
+    Skein_Start_New_Type(ctx,MSG);              /* T0=0, T1= MSG type, h.bCnt=0 */
+
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* process the input bytes: absorbs msgByteCnt bytes from msg, buffering what does not yet need processing; returns SKEIN_SUCCESS */
+int Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
+    {
+    size_t n;                                         /* scratch: free bytes in b[], then count of full blocks */
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL);     /* catch uninitialized context */
+
+    /* process full blocks, if any (strict '>': input that only just fills b[] stays buffered) */
+    if (msgByteCnt + ctx->h.bCnt > SKEIN1024_BLOCK_BYTES)
+        {
+        if (ctx->h.bCnt)                              /* finish up any buffered message data */
+            {
+            n = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt;  /* # bytes free in buffer b[] */
+            if (n)                                    /* n == 0 means b[] was already full */
+                {
+                Skein_assert(n < msgByteCnt);         /* check on our logic here */
+                memcpy(&ctx->b[ctx->h.bCnt],msg,n);   /* top up b[] to a whole block */
+                msgByteCnt  -= n;
+                msg         += n;
+                ctx->h.bCnt += n;
+                }
+            Skein_assert(ctx->h.bCnt == SKEIN1024_BLOCK_BYTES);
+            Skein1024_Process_Block(ctx,ctx->b,1,SKEIN1024_BLOCK_BYTES);   /* process the now-full buffer */
+            ctx->h.bCnt = 0;
+            }
+        /* now process any remaining full blocks, directly from input message data */
+        if (msgByteCnt > SKEIN1024_BLOCK_BYTES)
+            {
+            n = (msgByteCnt-1) / SKEIN1024_BLOCK_BYTES;   /* number of full blocks to process; the -1 keeps 1..BLOCK_BYTES bytes back for b[] */
+            Skein1024_Process_Block(ctx,msg,n,SKEIN1024_BLOCK_BYTES);
+            msgByteCnt -= n * SKEIN1024_BLOCK_BYTES;
+            msg        += n * SKEIN1024_BLOCK_BYTES;
+            }
+        Skein_assert(ctx->h.bCnt == 0);
+        }
+
+    /* copy any remaining source message data bytes into b[] */
+    if (msgByteCnt)
+        {
+        Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES);
+        memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
+        ctx->h.bCnt += msgByteCnt;
+        }
+
+    return SKEIN_SUCCESS;
+    }
+   
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finalize the hash computation and output the result: writes (hashBitLen+7)/8 bytes to hashVal and returns SKEIN_SUCCESS */
+int Skein1024_Final(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    size_t i,n,byteCnt;                        /* i: output block counter, n: bytes emitted in the current pass */
+    u64b_t X[SKEIN1024_STATE_WORDS];           /* saved chaining state, reused as the key for every output block */
+    Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+    /* NOTE(review): Skein_Assert is defined elsewhere; per the skein.h notes its effect depends on SKEIN_ERR_CHECK */
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;                 /* tag as the final block */
+    if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES)   /* zero pad b[] if necessary */
+        memset(&ctx->b[ctx->h.bCnt],0,SKEIN1024_BLOCK_BYTES - ctx->h.bCnt);
+    /* bCnt (not the padded size) is passed as byteCntAdd */
+    Skein1024_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);  /* process the final block */
+    
+    /* now output the result */
+    byteCnt = (ctx->h.hashBitLen + 7) >> 3;    /* total number of output bytes (bit length rounded up) */
+    /* hashVal must have room for byteCnt bytes: the loop below writes exactly that many */
+    /* run Threefish in "counter mode" to generate output */
+    memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
+    memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
+    for (i=0;i*SKEIN1024_BLOCK_BYTES < byteCnt;i++)   /* one pass per (possibly partial) output block */
+        {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+        Skein_Start_New_Type(ctx,OUT_FINAL);             /* set tweaks for an output block: OUT type | FINAL flag */
+        Skein1024_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+        n = byteCnt - i*SKEIN1024_BLOCK_BYTES;   /* number of output bytes left to go */
+        if (n >= SKEIN1024_BLOCK_BYTES)          /* emit at most one block per pass */
+            n  = SKEIN1024_BLOCK_BYTES;
+        Skein_Put64_LSB_First(hashVal+i*SKEIN1024_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
+        Skein_Show_Final(1024,&ctx->h,n,hashVal+i*SKEIN1024_BLOCK_BYTES);  /* presumably a SKEIN_DEBUG display callout; defined elsewhere */
+        memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+        }
+    return SKEIN_SUCCESS;
+    }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t Skein1024_API_CodeSize(void)   /* byte distance from Skein1024_Init to this function */
+    {
+    const u08b_t *startMark = (const u08b_t *) Skein1024_Init;   /* NOTE(review): presumably relies on functions being laid out in source order */
+    return (size_t) (((const u08b_t *) Skein1024_API_CodeSize) - startMark);
+    }
+#endif
+
+/**************** Functions to support MAC/tree hashing ***************/
+/*   (this code is identical for Optimized and Reference versions)    */
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* MAC/tree helper: pad and process the last buffered block, then copy the chaining state to hashVal (no OUTPUT stage) */
+int Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    const size_t used = ctx->h.bCnt;                          /* message bytes currently held in b[] */
+    Skein_Assert(used <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL);   /* guard against an uninitialized context */
+
+    if (used < SKEIN_256_BLOCK_BYTES)                         /* partial block: clear the unused tail of b[] */
+        memset(ctx->b + used,0,SKEIN_256_BLOCK_BYTES - used);
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;                       /* mark this block as the last one */
+    Skein_256_Process_Block(ctx,ctx->b,1,used);               /* absorb it; only the real bytes are counted */
+    
+    Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN_256_BLOCK_BYTES);   /* hand back one block's worth of state bytes */
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* MAC/tree helper: pad and process the last buffered block, then copy the chaining state to hashVal (no OUTPUT stage) */
+int Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    const size_t used = ctx->h.bCnt;                          /* message bytes currently held in b[] */
+    Skein_Assert(used <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);   /* guard against an uninitialized context */
+
+    if (used < SKEIN_512_BLOCK_BYTES)                         /* partial block: clear the unused tail of b[] */
+        memset(ctx->b + used,0,SKEIN_512_BLOCK_BYTES - used);
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;                       /* mark this block as the last one */
+    Skein_512_Process_Block(ctx,ctx->b,1,used);               /* absorb it; only the real bytes are counted */
+    
+    Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN_512_BLOCK_BYTES);   /* hand back one block's worth of state bytes */
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* MAC/tree helper: pad and process the last buffered block, then copy the chaining state to hashVal (no OUTPUT stage) */
+int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    const size_t used = ctx->h.bCnt;                          /* message bytes currently held in b[] */
+    Skein_Assert(used <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL);   /* guard against an uninitialized context */
+
+    if (used < SKEIN1024_BLOCK_BYTES)                         /* partial block: clear the unused tail of b[] */
+        memset(ctx->b + used,0,SKEIN1024_BLOCK_BYTES - used);
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;                       /* mark this block as the last one */
+    Skein1024_Process_Block(ctx,ctx->b,1,used);               /* absorb it; only the real bytes are counted */
+    
+    Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN1024_BLOCK_BYTES);   /* hand back one block's worth of state bytes */
+    return SKEIN_SUCCESS;
+    }
+
+#if SKEIN_TREE_HASH
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* just do the OUTPUT stage: writes (hashBitLen+7)/8 bytes derived from the current ctx->X to hashVal; returns SKEIN_SUCCESS */
+int Skein_256_Output(Skein_256_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    size_t i,n,byteCnt;                        /* i: output block counter, n: bytes emitted in the current pass */
+    u64b_t X[SKEIN_256_STATE_WORDS];           /* saved chaining state, reused as the key for every output block */
+    Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+    /* unlike Skein_256_Final, no final message block is padded or processed here */
+    /* now output the result */
+    byteCnt = (ctx->h.hashBitLen + 7) >> 3;    /* total number of output bytes (bit length rounded up) */
+    /* hashVal must have room for byteCnt bytes: the loop below writes exactly that many */
+    /* run Threefish in "counter mode" to generate output */
+    memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
+    memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
+    for (i=0;i*SKEIN_256_BLOCK_BYTES < byteCnt;i++)   /* one pass per (possibly partial) output block */
+        {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+        Skein_Start_New_Type(ctx,OUT_FINAL);             /* set tweaks for an output block: OUT type | FINAL flag */
+        Skein_256_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+        n = byteCnt - i*SKEIN_256_BLOCK_BYTES;   /* number of output bytes left to go */
+        if (n >= SKEIN_256_BLOCK_BYTES)          /* emit at most one block per pass */
+            n  = SKEIN_256_BLOCK_BYTES;
+        Skein_Put64_LSB_First(hashVal+i*SKEIN_256_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
+        Skein_Show_Final(256,&ctx->h,n,hashVal+i*SKEIN_256_BLOCK_BYTES);   /* presumably a SKEIN_DEBUG display callout; defined elsewhere */
+        memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+        }
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* just do the OUTPUT stage: writes (hashBitLen+7)/8 bytes derived from the current ctx->X to hashVal; returns SKEIN_SUCCESS */
+int Skein_512_Output(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    size_t i,n,byteCnt;                        /* i: output block counter, n: bytes emitted in the current pass */
+    u64b_t X[SKEIN_512_STATE_WORDS];           /* saved chaining state, reused as the key for every output block */
+    Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+    /* unlike Skein_512_Final, no final message block is padded or processed here */
+    /* now output the result */
+    byteCnt = (ctx->h.hashBitLen + 7) >> 3;    /* total number of output bytes (bit length rounded up) */
+    /* hashVal must have room for byteCnt bytes: the loop below writes exactly that many */
+    /* run Threefish in "counter mode" to generate output */
+    memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
+    memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
+    for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++)   /* one pass per (possibly partial) output block */
+        {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+        Skein_Start_New_Type(ctx,OUT_FINAL);             /* set tweaks for an output block: OUT type | FINAL flag */
+        Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+        n = byteCnt - i*SKEIN_512_BLOCK_BYTES;   /* number of output bytes left to go */
+        if (n >= SKEIN_512_BLOCK_BYTES)          /* emit at most one block per pass */
+            n  = SKEIN_512_BLOCK_BYTES;
+        Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
+        Skein_Show_Final(512,&ctx->h,n,hashVal+i*SKEIN_512_BLOCK_BYTES);   /* width argument matches Skein_512_Final (was 256) */
+        memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+        }
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* just do the OUTPUT stage: writes (hashBitLen+7)/8 bytes derived from the current ctx->X to hashVal; returns SKEIN_SUCCESS */
+int Skein1024_Output(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
+    {
+    size_t i,n,byteCnt;                        /* i: output block counter, n: bytes emitted in the current pass */
+    u64b_t X[SKEIN1024_STATE_WORDS];           /* saved chaining state, reused as the key for every output block */
+    Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+    /* unlike Skein1024_Final, no final message block is padded or processed here */
+    /* now output the result */
+    byteCnt = (ctx->h.hashBitLen + 7) >> 3;    /* total number of output bytes (bit length rounded up) */
+    /* hashVal must have room for byteCnt bytes: the loop below writes exactly that many */
+    /* run Threefish in "counter mode" to generate output */
+    memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
+    memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
+    for (i=0;i*SKEIN1024_BLOCK_BYTES < byteCnt;i++)   /* one pass per (possibly partial) output block */
+        {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+        Skein_Start_New_Type(ctx,OUT_FINAL);             /* set tweaks for an output block: OUT type | FINAL flag */
+        Skein1024_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+        n = byteCnt - i*SKEIN1024_BLOCK_BYTES;   /* number of output bytes left to go */
+        if (n >= SKEIN1024_BLOCK_BYTES)          /* emit at most one block per pass */
+            n  = SKEIN1024_BLOCK_BYTES;
+        Skein_Put64_LSB_First(hashVal+i*SKEIN1024_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
+        Skein_Show_Final(1024,&ctx->h,n,hashVal+i*SKEIN1024_BLOCK_BYTES);  /* width argument matches Skein1024_Final (was 256) */
+        memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+        }
+    return SKEIN_SUCCESS;
+    }
+#endif
diff --git a/Reference_Implementation/skein.h b/Reference_Implementation/skein.h
new file mode 100644
index 0000000000000..721c9bc9ce0db
--- /dev/null
+++ b/Reference_Implementation/skein.h
@@ -0,0 +1,327 @@
+#ifndef _SKEIN_H_
+#define _SKEIN_H_     1
+/**************************************************************************
+**
+** Interface declarations and internal definitions for Skein hashing.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+***************************************************************************
+** 
+** The following compile-time switches may be defined to control some
+** tradeoffs between speed, code size, error checking, and security.
+**
+** The "default" note explains what happens when the switch is not defined.
+**
+**  SKEIN_DEBUG            -- make callouts from inside Skein code
+**                            to examine/display intermediate values.
+**                            [default: no callouts (no overhead)]
+**
+**  SKEIN_ERR_CHECK        -- how error checking is handled inside Skein
+**                            code. If not defined, most error checking 
+**                            is disabled (for performance). Otherwise, 
+**                            the switch value is interpreted as:
+**                                0: use assert()      to flag errors
+**                                1: return SKEIN_FAIL to flag errors
+**
+***************************************************************************/
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include <stddef.h>                          /* get size_t definition */
+#include "skein_port.h"                      /* get platform-specific definitions */
+
+enum                                         /* status codes returned by the Skein API functions */
+    {
+    SKEIN_SUCCESS         =      0,          /* return codes from Skein calls: operation completed */
+    SKEIN_FAIL            =      1,          /* code passed to the context and key-pointer checks */
+    SKEIN_BAD_HASHLEN     =      2           /* code passed to the Init routines' hashBitLen > 0 check */
+    };
+
+#define  SKEIN_MODIFIER_WORDS  ( 2)          /* number of modifier (tweak) words */
+
+#define  SKEIN_256_STATE_WORDS ( 4)          /* state size in 64-bit words, per variant */
+#define  SKEIN_512_STATE_WORDS ( 8)
+#define  SKEIN1024_STATE_WORDS (16)
+#define  SKEIN_MAX_STATE_WORDS (16)          /* largest of the three word counts above */
+
+#define  SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)   /* state size in bytes */
+#define  SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
+#define  SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)
+
+#define  SKEIN_256_STATE_BITS  (64*SKEIN_256_STATE_WORDS)   /* state size in bits */
+#define  SKEIN_512_STATE_BITS  (64*SKEIN_512_STATE_WORDS)
+#define  SKEIN1024_STATE_BITS  (64*SKEIN1024_STATE_WORDS)
+
+#define  SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)   /* message block size in bytes; same value as the state size */
+#define  SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
+#define  SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)
+
+typedef struct                               /* header embedded as member h in all three context structures */
+    {
+    size_t  hashBitLen;                      /* size of hash result, in bits */
+    size_t  bCnt;                            /* current byte count in buffer b[] */
+    u64b_t  T[SKEIN_MODIFIER_WORDS];         /* tweak words: T[0]=byte cnt, T[1]=flags */
+    } Skein_Ctxt_Hdr_t;
+
+typedef struct                               /*  256-bit Skein hash context structure */
+    {
+    Skein_Ctxt_Hdr_t h;                      /* common header context variables */
+    u64b_t  X[SKEIN_256_STATE_WORDS];        /* chaining variables (4 words) */
+    u08b_t  b[SKEIN_256_BLOCK_BYTES];        /* partial block buffer (8-byte aligned); reused as the counter block during output */
+    } Skein_256_Ctxt_t;
+
+typedef struct                               /*  512-bit Skein hash context structure */
+    {
+    Skein_Ctxt_Hdr_t h;                      /* common header context variables */
+    u64b_t  X[SKEIN_512_STATE_WORDS];        /* chaining variables (8 words) */
+    u08b_t  b[SKEIN_512_BLOCK_BYTES];        /* partial block buffer (8-byte aligned); reused as the counter block during output */
+    } Skein_512_Ctxt_t;
+
+typedef struct                               /* 1024-bit Skein hash context structure */
+    {
+    Skein_Ctxt_Hdr_t h;                      /* common header context variables */
+    u64b_t  X[SKEIN1024_STATE_WORDS];        /* chaining variables (16 words) */
+    u08b_t  b[SKEIN1024_BLOCK_BYTES];        /* partial block buffer (8-byte aligned); reused as the counter block during output */
+    } Skein1024_Ctxt_t;
+
+/*   Skein APIs for (incremental) "straight hashing": Init once, Update any number of times, then Final */
+int  Skein_256_Init  (Skein_256_Ctxt_t *ctx, size_t hashBitLen);   /* hashBitLen: output length in bits, checked to be > 0 */
+int  Skein_512_Init  (Skein_512_Ctxt_t *ctx, size_t hashBitLen);
+int  Skein1024_Init  (Skein1024_Ctxt_t *ctx, size_t hashBitLen);
+
+int  Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);   /* absorb msgByteCnt bytes from msg */
+int  Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
+int  Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
+
+int  Skein_256_Final (Skein_256_Ctxt_t *ctx, u08b_t * hashVal);   /* hashVal receives (hashBitLen+7)/8 bytes */
+int  Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
+int  Skein1024_Final (Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
+
+/*
+**   Skein APIs for "extended" initialization: MAC keys, tree hashing.
+**   After an InitExt() call, just use Update/Final calls as with Init().
+**
+**   Notes: Same parameters as _Init() calls, plus treeInfo/key/keyBytes.
+**          When keyBytes == 0 and treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL,
+**              the results of InitExt() are identical to calling Init().
+**          The function Init() may be called once to "precompute" the IV for
+**              a given hashBitLen value, then by saving a copy of the context
+**              the IV computation may be avoided in later calls.
+**          Similarly, the function InitExt() may be called once per MAC key 
+**              to precompute the MAC IV, then a copy of the context saved and
+**              reused for each new MAC computation.
+**/
+int  Skein_256_InitExt(Skein_256_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes);   /* key may be NULL only when keyBytes == 0 */
+int  Skein_512_InitExt(Skein_512_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes);
+int  Skein1024_InitExt(Skein1024_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes);
+
+/*
+**   Skein APIs for MAC and tree hash:
+**      Final_Pad:  pad, do final block, but no OUTPUT type (hashVal receives one block of state bytes)
+**      Output:     do just the output stage (hashVal receives (hashBitLen+7)/8 bytes)
+*/
+int  Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, u08b_t * hashVal);
+int  Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
+int  Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
+
+#ifndef SKEIN_TREE_HASH
+#define SKEIN_TREE_HASH (1)                  /* default: the Output routines are declared and compiled */
+#endif
+#if  SKEIN_TREE_HASH
+int  Skein_256_Output   (Skein_256_Ctxt_t *ctx, u08b_t * hashVal);
+int  Skein_512_Output   (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
+int  Skein1024_Output   (Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
+#endif
+
+/*****************************************************************
+** "Internal" Skein definitions
+**    -- not needed for sequential hashing API, but will be 
+**           helpful for other uses of Skein (e.g., tree hash mode).
+**    -- included here so that they can be shared between
+**           reference and optimized code.
+******************************************************************/
+
+/* tweak word T[1]: bit field starting positions (bit numbers in the comments count across both tweak words) */
+#define SKEIN_T1_BIT(BIT)       ((BIT) - 64)            /* offset 64 because it's the second word  */
+                                
+#define SKEIN_T1_POS_TREE_LVL   SKEIN_T1_BIT(112)       /* bits 112..118: level in hash tree       */
+#define SKEIN_T1_POS_BIT_PAD    SKEIN_T1_BIT(119)       /* bit  119     : partial final input byte */
+#define SKEIN_T1_POS_BLK_TYPE   SKEIN_T1_BIT(120)       /* bits 120..125: type field               */
+#define SKEIN_T1_POS_FIRST      SKEIN_T1_BIT(126)       /* bit  126     : first block flag         */
+#define SKEIN_T1_POS_FINAL      SKEIN_T1_BIT(127)       /* bit  127     : final block flag         */
+                                
+/* tweak word T[1]: flag bit definition(s) -- each is a single-bit u64b_t mask */
+#define SKEIN_T1_FLAG_FIRST     (((u64b_t)  1 ) << SKEIN_T1_POS_FIRST)
+#define SKEIN_T1_FLAG_FINAL     (((u64b_t)  1 ) << SKEIN_T1_POS_FINAL)
+#define SKEIN_T1_FLAG_BIT_PAD   (((u64b_t)  1 ) << SKEIN_T1_POS_BIT_PAD)
+                                
+/* tweak word T[1]: tree level bit field mask (7 bits wide) and value placement */
+#define SKEIN_T1_TREE_LVL_MASK  (((u64b_t)0x7F) << SKEIN_T1_POS_TREE_LVL)
+#define SKEIN_T1_TREE_LEVEL(n)  (((u64b_t) (n)) << SKEIN_T1_POS_TREE_LVL)  /* n is shifted unmasked: caller keeps it within 7 bits */
+
+/* tweak word T[1]: block type field -- 6-bit codes, before being shifted into position */
+#define SKEIN_BLK_TYPE_KEY      ( 0)                    /* key, for MAC and KDF */
+#define SKEIN_BLK_TYPE_CFG      ( 4)                    /* configuration block */
+#define SKEIN_BLK_TYPE_PERS     ( 8)                    /* personalization string */
+#define SKEIN_BLK_TYPE_PK       (12)                    /* public key (for digital signature hashing) */
+#define SKEIN_BLK_TYPE_KDF      (16)                    /* key identifier for KDF */
+#define SKEIN_BLK_TYPE_NONCE    (20)                    /* nonce for PRNG */
+#define SKEIN_BLK_TYPE_MSG      (48)                    /* message processing */
+#define SKEIN_BLK_TYPE_OUT      (63)                    /* output stage */
+#define SKEIN_BLK_TYPE_MASK     (63)                    /* bit field mask */
+
+#define SKEIN_T1_BLK_TYPE(T)   (((u64b_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)  /* T is a suffix (KEY, CFG, ...) pasted onto SKEIN_BLK_TYPE_ */
+#define SKEIN_T1_BLK_TYPE_KEY   SKEIN_T1_BLK_TYPE(KEY)  /* key, for MAC and KDF */
+#define SKEIN_T1_BLK_TYPE_CFG   SKEIN_T1_BLK_TYPE(CFG)  /* configuration block */
+#define SKEIN_T1_BLK_TYPE_PERS  SKEIN_T1_BLK_TYPE(PERS) /* personalization string */
+#define SKEIN_T1_BLK_TYPE_PK    SKEIN_T1_BLK_TYPE(PK)   /* public key (for digital signature hashing) */
+#define SKEIN_T1_BLK_TYPE_KDF   SKEIN_T1_BLK_TYPE(KDF)  /* key identifier for KDF */
+#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE)/* nonce for PRNG */
+#define SKEIN_T1_BLK_TYPE_MSG   SKEIN_T1_BLK_TYPE(MSG)  /* message processing */
+#define SKEIN_T1_BLK_TYPE_OUT   SKEIN_T1_BLK_TYPE(OUT)  /* output stage */
+#define SKEIN_T1_BLK_TYPE_MASK  SKEIN_T1_BLK_TYPE(MASK) /* field bit mask */
+
+#define SKEIN_T1_BLK_TYPE_CFG_FINAL       (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL)  /* config type with the final-block flag set */
+#define SKEIN_T1_BLK_TYPE_OUT_FINAL       (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)  /* output type with the final-block flag set */
+
+#define SKEIN_VERSION           (1)                     /* becomes the high 32 bits of SKEIN_SCHEMA_VER */
+
+#ifndef SKEIN_ID_STRING_LE      /* allow compile-time personalization */
+#define SKEIN_ID_STRING_LE      (0x33414853)            /* "SHA3" (little-endian)*/
+#endif
+
+#define SKEIN_MK_64(hi32,lo32)  ((lo32) + (((u64b_t) (hi32)) << 32))  /* combine two 32-bit halves into one u64b_t */
+#define SKEIN_SCHEMA_VER        SKEIN_MK_64(SKEIN_VERSION,SKEIN_ID_STRING_LE)  /* version in the high half, ID string in the low half */
+#define SKEIN_KS_PARITY         SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)  /* 0x1BD11BDAA9FC1A22: start value of the extra key-schedule (parity) word */
+
+#define SKEIN_CFG_STR_LEN       (4*8)                   /* = 32; presumably the config string size in bytes -- confirm against its users */
+
+/* bit field definitions in config block treeInfo word: three 8-bit fields at bits 0, 8 and 16 */
+#define SKEIN_CFG_TREE_LEAF_SIZE_POS  ( 0)
+#define SKEIN_CFG_TREE_NODE_SIZE_POS  ( 8)
+#define SKEIN_CFG_TREE_MAX_LEVEL_POS  (16)
+
+#define SKEIN_CFG_TREE_LEAF_SIZE_MSK  (((u64b_t) 0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS)
+#define SKEIN_CFG_TREE_NODE_SIZE_MSK  (((u64b_t) 0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS)
+#define SKEIN_CFG_TREE_MAX_LEVEL_MSK  (((u64b_t) 0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS)
+
+#define SKEIN_CFG_TREE_INFO(leaf,node,maxLvl)                   \
+    ( (((u64b_t)(leaf  )) << SKEIN_CFG_TREE_LEAF_SIZE_POS) |    \
+      (((u64b_t)(node  )) << SKEIN_CFG_TREE_NODE_SIZE_POS) |    \
+      (((u64b_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS) )
+
+#define SKEIN_CFG_TREE_INFO_SEQUENTIAL SKEIN_CFG_TREE_INFO(0,0,0) /* use as treeInfo in InitExt() call for sequential processing */
+
+/*
+**   Skein macros for getting/setting tweak words, etc. (useful for partial input bytes, hash tree init/update, etc.)
+**   The setter-style macros expand to a brace block { ... }, not an expression: use each one as a whole statement.
+**/
+#define Skein_Get_Tweak(ctxPtr,TWK_NUM)         ((ctxPtr)->h.T[TWK_NUM])              /* usable as an expression */
+#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal)    {(ctxPtr)->h.T[TWK_NUM] = (tVal);}
+
+#define Skein_Get_T0(ctxPtr)    Skein_Get_Tweak(ctxPtr,0)
+#define Skein_Get_T1(ctxPtr)    Skein_Get_Tweak(ctxPtr,1)
+#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0)
+#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1)
+
+/* set both tweak words at once (T0 first, then T1) */
+#define Skein_Set_T0_T1(ctxPtr,T0,T1)           \
+    {                                           \
+    Skein_Set_T0(ctxPtr,(T0));                  \
+    Skein_Set_T1(ctxPtr,(T1));                  \
+    }
+
+#define Skein_Set_Type(ctxPtr,BLK_TYPE)         \
+    Skein_Set_T1(ctxPtr,SKEIN_T1_BLK_TYPE_##BLK_TYPE)  /* BLK_TYPE is a pasted suffix; this overwrites all of T[1], flags included */
+
+/* set up for starting with a new type: h.T[0]=0; h.T[1] = FIRST flag | NEW_TYPE; h.bCnt=0; */
+#define Skein_Start_New_Type(ctxPtr,BLK_TYPE)   \
+    { Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; }
+
+#define Skein_Clear_First_Flag(hdr)      { (hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST;       }  /* hdr is the header struct itself, not a pointer */
+#define Skein_Set_Bit_Pad_Flag(hdr)      { (hdr).T[1] |=  SKEIN_T1_FLAG_BIT_PAD;     }
+
+#define Skein_Set_Tree_Level(hdr,height) { (hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height);}  /* ORs in the level; earlier level bits are not cleared */
+
+/*****************************************************************
+** "Internal" Skein definitions for debugging and error checking
+******************************************************************/
+#ifdef  SKEIN_DEBUG             /* examine/display intermediate values? */
+#include "skein_debug.h"        /* presumably declares the real Skein_Show_* routines -- header not visible here */
+#else                           /* default is no callouts: every Skein_Show_* use expands to nothing */
+#define Skein_Show_Block(bits,ctx,X,blkPtr,wPtr,ksEvenPtr,ksOddPtr)
+#define Skein_Show_Round(bits,ctx,r,X)
+#define Skein_Show_R_Ptr(bits,ctx,r,X_ptr)
+#define Skein_Show_Final(bits,ctx,cnt,outPtr)
+#define Skein_Show_Key(bits,ctx,key,keyBytes)
+#endif
+
+#ifndef SKEIN_ERR_CHECK        /* run-time checks (e.g., bad params, uninitialized context)? */
+#define Skein_Assert(x,retCode)/* default: ignore all Asserts, for performance */
+#define Skein_assert(x)        /* default: internal checks compile to nothing as well */
+#elif   defined(SKEIN_ASSERT)  /* checks on, and caller errors also go through assert() */
+#include <assert.h>     
+#define Skein_Assert(x,retCode) assert(x) 
+#define Skein_assert(x)         assert(x) 
+#else                          /* checks on: caller errors return retCode, internal errors assert */
+#include <assert.h>     
+#define Skein_Assert(x,retCode) { if (!(x)) return retCode; } /*  caller  error: expands to a return, so use only inside a function */
+#define Skein_assert(x)         assert(x)                     /* internal error */
+#endif
+
+/*****************************************************************
+** Skein block function constants (shared across Ref and Opt code)
+******************************************************************/
+enum    
+    {   
+        /* Skein_256 round rotation constants, named R_256_<round mod 8>_<mix index within the round> */
+    R_256_0_0=14, R_256_0_1=16,
+    R_256_1_0=52, R_256_1_1=57,
+    R_256_2_0=23, R_256_2_1=40,
+    R_256_3_0= 5, R_256_3_1=37,
+    R_256_4_0=25, R_256_4_1=33,
+    R_256_5_0=46, R_256_5_1=12,
+    R_256_6_0=58, R_256_6_1=22,
+    R_256_7_0=32, R_256_7_1=32,
+
+        /* Skein_512 round rotation constants (same naming, 4 mixes per round) */
+    R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
+    R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
+    R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39,
+    R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56,
+    R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24,
+    R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17,
+    R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43,
+    R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22,
+
+        /* Skein1024 round rotation constants (same naming, 8 mixes per round) */
+    R1024_0_0=24, R1024_0_1=13, R1024_0_2= 8, R1024_0_3=47, R1024_0_4= 8, R1024_0_5=17, R1024_0_6=22, R1024_0_7=37,
+    R1024_1_0=38, R1024_1_1=19, R1024_1_2=10, R1024_1_3=55, R1024_1_4=49, R1024_1_5=18, R1024_1_6=23, R1024_1_7=52,
+    R1024_2_0=33, R1024_2_1= 4, R1024_2_2=51, R1024_2_3=13, R1024_2_4=34, R1024_2_5=41, R1024_2_6=59, R1024_2_7=17,
+    R1024_3_0= 5, R1024_3_1=20, R1024_3_2=48, R1024_3_3=41, R1024_3_4=47, R1024_3_5=28, R1024_3_6=16, R1024_3_7=25,
+    R1024_4_0=41, R1024_4_1= 9, R1024_4_2=37, R1024_4_3=31, R1024_4_4=12, R1024_4_5=47, R1024_4_6=44, R1024_4_7=30,
+    R1024_5_0=16, R1024_5_1=34, R1024_5_2=56, R1024_5_3=51, R1024_5_4= 4, R1024_5_5=53, R1024_5_6=42, R1024_5_7=41,
+    R1024_6_0=31, R1024_6_1=44, R1024_6_2=47, R1024_6_3=46, R1024_6_4=19, R1024_6_5=42, R1024_6_6=44, R1024_6_7=25,
+    R1024_7_0= 9, R1024_7_1=48, R1024_7_2=35, R1024_7_3=52, R1024_7_4=23, R1024_7_5=31, R1024_7_6=37, R1024_7_7=20
+    };
+
+#ifndef SKEIN_ROUNDS
+#define SKEIN_256_ROUNDS_TOTAL (72)          /* number of rounds for the different block sizes */
+#define SKEIN_512_ROUNDS_TOTAL (72)
+#define SKEIN1024_ROUNDS_TOTAL (80)
+#else                                        /* allow command-line define in range 8*(5..14): one decimal digit d per size, giving 8*(((d+5)%10)+5) */
+#define SKEIN_256_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/100) + 5) % 10) + 5))  /* hundreds digit: 0..4 -> 80..112, 5..9 -> 40..72 */
+#define SKEIN_512_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/ 10) + 5) % 10) + 5))  /* tens digit, same mapping */
+#define SKEIN1024_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS    ) + 5) % 10) + 5))  /* units digit, same mapping */
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* ifndef _SKEIN_H_ */
diff --git a/Reference_Implementation/skein_block.c b/Reference_Implementation/skein_block.c
new file mode 100644
index 0000000000000..48d499813642d
--- /dev/null
+++ b/Reference_Implementation/skein_block.c
@@ -0,0 +1,369 @@
+/***********************************************************************
+**
+** Implementation of the Skein block functions.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+** Compile-time switches:
+**
+**  SKEIN_USE_ASM             -- set bits (256/512/1024) to select which
+**                               versions use ASM code for block processing
+**                               [default: use C for all block sizes]
+**
+************************************************************************/
+
+#include <string.h>
+#include "skein.h"
+
+/* rotate the 64-bit word x left by N bit positions; both shift counts are masked to 0..63 */
+u64b_t RotL_64(u64b_t x,uint_t N)
+    {
+    return (x >> ((64-N) & 63)) | (x << (N & 63));
+    }
+
+#define BLK_BITS    (WCNT*64)
+
+/* macro to perform a key injection (same for all block sizes) */
+#define InjectKey(r)                                                \
+    for (i=0;i < WCNT;i++)                                          \
+         X[i] += ks[((r)+i) % (WCNT+1)];                            \
+    X[WCNT-3] += ts[((r)+0) % 3];                                   \
+    X[WCNT-2] += ts[((r)+1) % 3];                                   \
+    X[WCNT-1] += (r);                    /* avoid slide attacks */  \
+    Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,X);
+
+void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
+    { /* do it in C: fold blkCnt consecutive SKEIN_256_BLOCK_BYTES-byte blocks at blkPtr into ctx->X, adding byteCntAdd to T[0] per block */
+    enum
+        {
+        WCNT     = SKEIN_256_STATE_WORDS      /* 64-bit words handled per block */
+        };
+    size_t  i,r;                              /* i: word index; r: counter over groups of 8 rounds */
+    u64b_t  ts[3];                            /* key schedule: tweak */
+    u64b_t  ks[WCNT+1];                       /* key schedule: chaining vars */
+    u64b_t  X [WCNT];                         /* local copy of context vars */
+    u64b_t  w [WCNT];                         /* local copy of input block */
+
+    Skein_assert(blkCnt != 0);                /* never call with blkCnt == 0! (the do/while below would wrap --blkCnt) */
+    do  {
+        /* this implementation only supports 2**64 input bytes (no carry out here) */
+        ctx->h.T[0] += byteCntAdd;            /* update processed length */
+
+        /* precompute the key schedule for this block */
+        ks[WCNT] = SKEIN_KS_PARITY;
+        for (i=0;i < WCNT; i++)
+            {
+            ks[i]     = ctx->X[i];
+            ks[WCNT] ^= ctx->X[i];            /* compute overall parity */
+            }
+        ts[0] = ctx->h.T[0];
+        ts[1] = ctx->h.T[1];
+        ts[2] = ts[0] ^ ts[1];                /* third tweak word is the xor of the first two */
+
+        Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */
+        Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);
+        for (i=0;i < WCNT; i++)               /* do the first full key injection */
+            {
+            X[i]  = w[i] + ks[i];
+            }
+        X[WCNT-3] += ts[0];                   /* the tweak words take part in the first injection too */
+        X[WCNT-2] += ts[1];
+
+        Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,X);     /* show starting state values */
+
+        for (r=1;r <= SKEIN_256_ROUNDS_TOTAL/8; r++)
+            { /* unroll 8 rounds */
+            X[0] += X[1]; X[1] = RotL_64(X[1],R_256_0_0); X[1] ^= X[0];
+            X[2] += X[3]; X[3] = RotL_64(X[3],R_256_0_1); X[3] ^= X[2];  Skein_Show_Round(BLK_BITS,&ctx->h,8*r-7,X);
+
+            X[0] += X[3]; X[3] = RotL_64(X[3],R_256_1_0); X[3] ^= X[0];
+            X[2] += X[1]; X[1] = RotL_64(X[1],R_256_1_1); X[1] ^= X[2];  Skein_Show_Round(BLK_BITS,&ctx->h,8*r-6,X);
+
+            X[0] += X[1]; X[1] = RotL_64(X[1],R_256_2_0); X[1] ^= X[0];
+            X[2] += X[3]; X[3] = RotL_64(X[3],R_256_2_1); X[3] ^= X[2];  Skein_Show_Round(BLK_BITS,&ctx->h,8*r-5,X);
+
+            X[0] += X[3]; X[3] = RotL_64(X[3],R_256_3_0); X[3] ^= X[0];
+            X[2] += X[1]; X[1] = RotL_64(X[1],R_256_3_1); X[1] ^= X[2];  Skein_Show_Round(BLK_BITS,&ctx->h,8*r-4,X);
+            InjectKey(2*r-1);                 /* odd-numbered injection, after the first 4 rounds of the group */
+
+            X[0] += X[1]; X[1] = RotL_64(X[1],R_256_4_0); X[1] ^= X[0];
+            X[2] += X[3]; X[3] = RotL_64(X[3],R_256_4_1); X[3] ^= X[2];  Skein_Show_Round(BLK_BITS,&ctx->h,8*r-3,X);
+
+            X[0] += X[3]; X[3] = RotL_64(X[3],R_256_5_0); X[3] ^= X[0];
+            X[2] += X[1]; X[1] = RotL_64(X[1],R_256_5_1); X[1] ^= X[2];  Skein_Show_Round(BLK_BITS,&ctx->h,8*r-2,X);
+
+            X[0] += X[1]; X[1] = RotL_64(X[1],R_256_6_0); X[1] ^= X[0];
+            X[2] += X[3]; X[3] = RotL_64(X[3],R_256_6_1); X[3] ^= X[2];  Skein_Show_Round(BLK_BITS,&ctx->h,8*r-1,X);
+
+            X[0] += X[3]; X[3] = RotL_64(X[3],R_256_7_0); X[3] ^= X[0];
+            X[2] += X[1]; X[1] = RotL_64(X[1],R_256_7_1); X[1] ^= X[2];  Skein_Show_Round(BLK_BITS,&ctx->h,8*r  ,X);
+            InjectKey(2*r);                   /* even-numbered injection, after all 8 rounds of the group */
+            }
+        /* do the final "feedforward" xor, update context chaining vars */
+        for (i=0;i < WCNT;i++)
+            ctx->X[i] = X[i] ^ w[i];
+        Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
+
+		Skein_Clear_First_Flag(ctx->h);		/* clear the start bit */
+        blkPtr += SKEIN_256_BLOCK_BYTES;      /* advance to the next input block */
+        }
+    while (--blkCnt);                         /* repeat until every requested block is consumed */
+    }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)  /* measurement helpers, built only on request */
+size_t Skein_256_Process_Block_CodeSize(void)
+    { /* address gap from Skein_256_Process_Block to this function */
+    return ((u08b_t *) Skein_256_Process_Block_CodeSize) -
+           ((u08b_t *) Skein_256_Process_Block);   /* NOTE(review): function-to-object pointer casts are not defined by ISO C; result is meaningful only if the toolchain places the two functions adjacently in this order */
+    }
+uint_t Skein_256_Unroll_Cnt(void)
+    { /* always reports 1 for this implementation */
+    return 1;
+    }
+#endif
+
+void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
+    { /* do it in C: fold blkCnt consecutive SKEIN_512_BLOCK_BYTES-byte blocks at blkPtr into ctx->X, adding byteCntAdd to T[0] per block */
+    enum
+        {
+        WCNT = SKEIN_512_STATE_WORDS          /* 64-bit words handled per block */
+        };
+
+    size_t  i,r;                              /* i: word index; r: counter over groups of 8 rounds */
+    u64b_t  ts[3];                            /* key schedule: tweak */
+    u64b_t  ks[WCNT+1];                       /* key schedule: chaining vars */
+    u64b_t  X [WCNT];                         /* local copy of vars */
+    u64b_t  w [WCNT];                         /* local copy of input block */
+
+    Skein_assert(blkCnt != 0);                /* never call with blkCnt == 0! (the do/while below would wrap --blkCnt) */
+    do  {
+        /* this implementation only supports 2**64 input bytes (no carry out here) */
+        ctx->h.T[0] += byteCntAdd;            /* update processed length */
+
+        /* precompute the key schedule for this block */
+        ks[WCNT] = SKEIN_KS_PARITY;
+        for (i=0;i < WCNT; i++)
+            {
+            ks[i]     = ctx->X[i];
+            ks[WCNT] ^= ctx->X[i];            /* compute overall parity */
+            }
+        ts[0] = ctx->h.T[0];
+        ts[1] = ctx->h.T[1];
+        ts[2] = ts[0] ^ ts[1];                /* third tweak word is the xor of the first two */
+
+        Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */
+        Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);
+        for (i=0;i < WCNT; i++)               /* do the first full key injection */
+            {
+            X[i]  = w[i] + ks[i];
+            }
+        X[WCNT-3] += ts[0];                   /* the tweak words take part in the first injection too */
+        X[WCNT-2] += ts[1];
+
+        Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,X);
+        for (r=1;r <= SKEIN_512_ROUNDS_TOTAL/8; r++)
+            { /* unroll 8 rounds */
+            X[0] += X[1]; X[1] = RotL_64(X[1],R_512_0_0); X[1] ^= X[0];
+            X[2] += X[3]; X[3] = RotL_64(X[3],R_512_0_1); X[3] ^= X[2];
+            X[4] += X[5]; X[5] = RotL_64(X[5],R_512_0_2); X[5] ^= X[4];
+            X[6] += X[7]; X[7] = RotL_64(X[7],R_512_0_3); X[7] ^= X[6];  Skein_Show_Round(BLK_BITS,&ctx->h,8*r-7,X);
+
+            X[2] += X[1]; X[1] = RotL_64(X[1],R_512_1_0); X[1] ^= X[2];
+            X[4] += X[7]; X[7] = RotL_64(X[7],R_512_1_1); X[7] ^= X[4];
+            X[6] += X[5]; X[5] = RotL_64(X[5],R_512_1_2); X[5] ^= X[6];
+            X[0] += X[3]; X[3] = RotL_64(X[3],R_512_1_3); X[3] ^= X[0];  Skein_Show_Round(BLK_BITS,&ctx->h,8*r-6,X);
+
+            X[4] += X[1]; X[1] = RotL_64(X[1],R_512_2_0); X[1] ^= X[4];
+            X[6] += X[3]; X[3] = RotL_64(X[3],R_512_2_1); X[3] ^= X[6];
+            X[0] += X[5]; X[5] = RotL_64(X[5],R_512_2_2); X[5] ^= X[0];
+            X[2] += X[7]; X[7] = RotL_64(X[7],R_512_2_3); X[7] ^= X[2];  Skein_Show_Round(BLK_BITS,&ctx->h,8*r-5,X);
+
+            X[6] += X[1]; X[1] = RotL_64(X[1],R_512_3_0); X[1] ^= X[6];
+            X[0] += X[7]; X[7] = RotL_64(X[7],R_512_3_1); X[7] ^= X[0];
+            X[2] += X[5]; X[5] = RotL_64(X[5],R_512_3_2); X[5] ^= X[2];
+            X[4] += X[3]; X[3] = RotL_64(X[3],R_512_3_3); X[3] ^= X[4];  Skein_Show_Round(BLK_BITS,&ctx->h,8*r-4,X);
+            InjectKey(2*r-1);                 /* odd-numbered injection, after the first 4 rounds of the group */
+
+            X[0] += X[1]; X[1] = RotL_64(X[1],R_512_4_0); X[1] ^= X[0];
+            X[2] += X[3]; X[3] = RotL_64(X[3],R_512_4_1); X[3] ^= X[2];
+            X[4] += X[5]; X[5] = RotL_64(X[5],R_512_4_2); X[5] ^= X[4];
+            X[6] += X[7]; X[7] = RotL_64(X[7],R_512_4_3); X[7] ^= X[6];  Skein_Show_Round(BLK_BITS,&ctx->h,8*r-3,X);
+
+            X[2] += X[1]; X[1] = RotL_64(X[1],R_512_5_0); X[1] ^= X[2];
+            X[4] += X[7]; X[7] = RotL_64(X[7],R_512_5_1); X[7] ^= X[4];
+            X[6] += X[5]; X[5] = RotL_64(X[5],R_512_5_2); X[5] ^= X[6];
+            X[0] += X[3]; X[3] = RotL_64(X[3],R_512_5_3); X[3] ^= X[0];  Skein_Show_Round(BLK_BITS,&ctx->h,8*r-2,X);
+
+            X[4] += X[1]; X[1] = RotL_64(X[1],R_512_6_0); X[1] ^= X[4];
+            X[6] += X[3]; X[3] = RotL_64(X[3],R_512_6_1); X[3] ^= X[6];
+            X[0] += X[5]; X[5] = RotL_64(X[5],R_512_6_2); X[5] ^= X[0];
+            X[2] += X[7]; X[7] = RotL_64(X[7],R_512_6_3); X[7] ^= X[2];  Skein_Show_Round(BLK_BITS,&ctx->h,8*r-1,X);
+
+            X[6] += X[1]; X[1] = RotL_64(X[1],R_512_7_0); X[1] ^= X[6];
+            X[0] += X[7]; X[7] = RotL_64(X[7],R_512_7_1); X[7] ^= X[0];
+            X[2] += X[5]; X[5] = RotL_64(X[5],R_512_7_2); X[5] ^= X[2];
+            X[4] += X[3]; X[3] = RotL_64(X[3],R_512_7_3); X[3] ^= X[4];  Skein_Show_Round(BLK_BITS,&ctx->h,8*r  ,X);
+            InjectKey(2*r);                   /* even-numbered injection, after all 8 rounds of the group */
+            }
+        /* do the final "feedforward" xor, update context chaining vars */
+        for (i=0;i < WCNT;i++)
+            ctx->X[i] = X[i] ^ w[i];
+        Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
+
+		Skein_Clear_First_Flag(ctx->h);		/* clear the start bit */
+        blkPtr += SKEIN_512_BLOCK_BYTES;      /* advance to the next input block */
+        }
+    while (--blkCnt);                         /* repeat until every requested block is consumed */
+    }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)  /* measurement helpers, built only on request */
+size_t Skein_512_Process_Block_CodeSize(void)
+    { /* address gap from Skein_512_Process_Block to this function */
+    return ((u08b_t *) Skein_512_Process_Block_CodeSize) -
+           ((u08b_t *) Skein_512_Process_Block);   /* NOTE(review): function-to-object pointer casts are not defined by ISO C; result is meaningful only if the toolchain places the two functions adjacently in this order */
+    }
+uint_t Skein_512_Unroll_Cnt(void)
+    { /* always reports 1 for this implementation */
+    return 1;
+    }
+#endif
+
+void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
+    { /* do it in C: fold blkCnt consecutive SKEIN1024_BLOCK_BYTES-byte blocks at blkPtr into ctx->X, adding byteCntAdd to T[0] per block */
+    enum
+        {
+        WCNT = SKEIN1024_STATE_WORDS          /* 64-bit words handled per block */
+        };
+
+    size_t  i,r;                              /* i: word index; r: counter over groups of 8 rounds */
+    u64b_t  ts[3];                            /* key schedule: tweak */
+    u64b_t  ks[WCNT+1];                       /* key schedule: chaining vars */
+    u64b_t  X [WCNT];                         /* local copy of vars */
+    u64b_t  w [WCNT];                         /* local copy of input block */
+
+    Skein_assert(blkCnt != 0);                /* never call with blkCnt == 0! (the do/while below would wrap --blkCnt) */
+    do  {
+        /* this implementation only supports 2**64 input bytes (no carry out here) */
+        ctx->h.T[0] += byteCntAdd;            /* update processed length */
+
+        /* precompute the key schedule for this block */
+        ks[WCNT] = SKEIN_KS_PARITY;
+        for (i=0;i < WCNT; i++)
+            {
+            ks[i]     = ctx->X[i];
+            ks[WCNT] ^= ctx->X[i];            /* compute overall parity */
+            }
+        ts[0] = ctx->h.T[0];
+        ts[1] = ctx->h.T[1];
+        ts[2] = ts[0] ^ ts[1];                /* third tweak word is the xor of the first two */
+
+        Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */
+        Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);
+        for (i=0;i < WCNT; i++)               /* do the first full key injection */
+            {
+            X[i]  = w[i] + ks[i];
+            }
+        X[WCNT-3] += ts[0];                   /* the tweak words take part in the first injection too */
+        X[WCNT-2] += ts[1];
+
+        Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,X);   /* show starting state values */
+        for (r=1;r <= SKEIN1024_ROUNDS_TOTAL/8; r++)
+            { /* unroll 8 rounds */
+            X[ 0] += X[ 1]; X[ 1] = RotL_64(X[ 1],R1024_0_0); X[ 1] ^= X[ 0];
+            X[ 2] += X[ 3]; X[ 3] = RotL_64(X[ 3],R1024_0_1); X[ 3] ^= X[ 2];
+            X[ 4] += X[ 5]; X[ 5] = RotL_64(X[ 5],R1024_0_2); X[ 5] ^= X[ 4];
+            X[ 6] += X[ 7]; X[ 7] = RotL_64(X[ 7],R1024_0_3); X[ 7] ^= X[ 6];
+            X[ 8] += X[ 9]; X[ 9] = RotL_64(X[ 9],R1024_0_4); X[ 9] ^= X[ 8];
+            X[10] += X[11]; X[11] = RotL_64(X[11],R1024_0_5); X[11] ^= X[10];
+            X[12] += X[13]; X[13] = RotL_64(X[13],R1024_0_6); X[13] ^= X[12];
+            X[14] += X[15]; X[15] = RotL_64(X[15],R1024_0_7); X[15] ^= X[14];    Skein_Show_Round(BLK_BITS,&ctx->h,8*r-7,X);
+
+            X[ 0] += X[ 9]; X[ 9] = RotL_64(X[ 9],R1024_1_0); X[ 9] ^= X[ 0];
+            X[ 2] += X[13]; X[13] = RotL_64(X[13],R1024_1_1); X[13] ^= X[ 2];
+            X[ 6] += X[11]; X[11] = RotL_64(X[11],R1024_1_2); X[11] ^= X[ 6];
+            X[ 4] += X[15]; X[15] = RotL_64(X[15],R1024_1_3); X[15] ^= X[ 4];
+            X[10] += X[ 7]; X[ 7] = RotL_64(X[ 7],R1024_1_4); X[ 7] ^= X[10];
+            X[12] += X[ 3]; X[ 3] = RotL_64(X[ 3],R1024_1_5); X[ 3] ^= X[12];
+            X[14] += X[ 5]; X[ 5] = RotL_64(X[ 5],R1024_1_6); X[ 5] ^= X[14];
+            X[ 8] += X[ 1]; X[ 1] = RotL_64(X[ 1],R1024_1_7); X[ 1] ^= X[ 8];    Skein_Show_Round(BLK_BITS,&ctx->h,8*r-6,X);
+
+            X[ 0] += X[ 7]; X[ 7] = RotL_64(X[ 7],R1024_2_0); X[ 7] ^= X[ 0];
+            X[ 2] += X[ 5]; X[ 5] = RotL_64(X[ 5],R1024_2_1); X[ 5] ^= X[ 2];
+            X[ 4] += X[ 3]; X[ 3] = RotL_64(X[ 3],R1024_2_2); X[ 3] ^= X[ 4];
+            X[ 6] += X[ 1]; X[ 1] = RotL_64(X[ 1],R1024_2_3); X[ 1] ^= X[ 6];
+            X[12] += X[15]; X[15] = RotL_64(X[15],R1024_2_4); X[15] ^= X[12];
+            X[14] += X[13]; X[13] = RotL_64(X[13],R1024_2_5); X[13] ^= X[14];
+            X[ 8] += X[11]; X[11] = RotL_64(X[11],R1024_2_6); X[11] ^= X[ 8];
+            X[10] += X[ 9]; X[ 9] = RotL_64(X[ 9],R1024_2_7); X[ 9] ^= X[10];    Skein_Show_Round(BLK_BITS,&ctx->h,8*r-5,X);
+                                                                            
+            X[ 0] += X[15]; X[15] = RotL_64(X[15],R1024_3_0); X[15] ^= X[ 0];
+            X[ 2] += X[11]; X[11] = RotL_64(X[11],R1024_3_1); X[11] ^= X[ 2];
+            X[ 6] += X[13]; X[13] = RotL_64(X[13],R1024_3_2); X[13] ^= X[ 6];
+            X[ 4] += X[ 9]; X[ 9] = RotL_64(X[ 9],R1024_3_3); X[ 9] ^= X[ 4];
+            X[14] += X[ 1]; X[ 1] = RotL_64(X[ 1],R1024_3_4); X[ 1] ^= X[14];
+            X[ 8] += X[ 5]; X[ 5] = RotL_64(X[ 5],R1024_3_5); X[ 5] ^= X[ 8];
+            X[10] += X[ 3]; X[ 3] = RotL_64(X[ 3],R1024_3_6); X[ 3] ^= X[10];
+            X[12] += X[ 7]; X[ 7] = RotL_64(X[ 7],R1024_3_7); X[ 7] ^= X[12];    Skein_Show_Round(BLK_BITS,&ctx->h,8*r-4,X);
+            InjectKey(2*r-1);                 /* odd-numbered injection, after the first 4 rounds of the group */
+
+            X[ 0] += X[ 1]; X[ 1] = RotL_64(X[ 1],R1024_4_0); X[ 1] ^= X[ 0];
+            X[ 2] += X[ 3]; X[ 3] = RotL_64(X[ 3],R1024_4_1); X[ 3] ^= X[ 2];
+            X[ 4] += X[ 5]; X[ 5] = RotL_64(X[ 5],R1024_4_2); X[ 5] ^= X[ 4];
+            X[ 6] += X[ 7]; X[ 7] = RotL_64(X[ 7],R1024_4_3); X[ 7] ^= X[ 6];
+            X[ 8] += X[ 9]; X[ 9] = RotL_64(X[ 9],R1024_4_4); X[ 9] ^= X[ 8];
+            X[10] += X[11]; X[11] = RotL_64(X[11],R1024_4_5); X[11] ^= X[10];
+            X[12] += X[13]; X[13] = RotL_64(X[13],R1024_4_6); X[13] ^= X[12];
+            X[14] += X[15]; X[15] = RotL_64(X[15],R1024_4_7); X[15] ^= X[14];    Skein_Show_Round(BLK_BITS,&ctx->h,8*r-3,X);
+
+            X[ 0] += X[ 9]; X[ 9] = RotL_64(X[ 9],R1024_5_0); X[ 9] ^= X[ 0];
+            X[ 2] += X[13]; X[13] = RotL_64(X[13],R1024_5_1); X[13] ^= X[ 2];
+            X[ 6] += X[11]; X[11] = RotL_64(X[11],R1024_5_2); X[11] ^= X[ 6];
+            X[ 4] += X[15]; X[15] = RotL_64(X[15],R1024_5_3); X[15] ^= X[ 4];
+            X[10] += X[ 7]; X[ 7] = RotL_64(X[ 7],R1024_5_4); X[ 7] ^= X[10];
+            X[12] += X[ 3]; X[ 3] = RotL_64(X[ 3],R1024_5_5); X[ 3] ^= X[12];
+            X[14] += X[ 5]; X[ 5] = RotL_64(X[ 5],R1024_5_6); X[ 5] ^= X[14];
+            X[ 8] += X[ 1]; X[ 1] = RotL_64(X[ 1],R1024_5_7); X[ 1] ^= X[ 8];    Skein_Show_Round(BLK_BITS,&ctx->h,8*r-2,X);
+
+            X[ 0] += X[ 7]; X[ 7] = RotL_64(X[ 7],R1024_6_0); X[ 7] ^= X[ 0];
+            X[ 2] += X[ 5]; X[ 5] = RotL_64(X[ 5],R1024_6_1); X[ 5] ^= X[ 2];
+            X[ 4] += X[ 3]; X[ 3] = RotL_64(X[ 3],R1024_6_2); X[ 3] ^= X[ 4];
+            X[ 6] += X[ 1]; X[ 1] = RotL_64(X[ 1],R1024_6_3); X[ 1] ^= X[ 6];
+            X[12] += X[15]; X[15] = RotL_64(X[15],R1024_6_4); X[15] ^= X[12];
+            X[14] += X[13]; X[13] = RotL_64(X[13],R1024_6_5); X[13] ^= X[14];
+            X[ 8] += X[11]; X[11] = RotL_64(X[11],R1024_6_6); X[11] ^= X[ 8];
+            X[10] += X[ 9]; X[ 9] = RotL_64(X[ 9],R1024_6_7); X[ 9] ^= X[10];    Skein_Show_Round(BLK_BITS,&ctx->h,8*r-1,X);
+                                                                            
+            X[ 0] += X[15]; X[15] = RotL_64(X[15],R1024_7_0); X[15] ^= X[ 0];
+            X[ 2] += X[11]; X[11] = RotL_64(X[11],R1024_7_1); X[11] ^= X[ 2];
+            X[ 6] += X[13]; X[13] = RotL_64(X[13],R1024_7_2); X[13] ^= X[ 6];
+            X[ 4] += X[ 9]; X[ 9] = RotL_64(X[ 9],R1024_7_3); X[ 9] ^= X[ 4];
+            X[14] += X[ 1]; X[ 1] = RotL_64(X[ 1],R1024_7_4); X[ 1] ^= X[14];
+            X[ 8] += X[ 5]; X[ 5] = RotL_64(X[ 5],R1024_7_5); X[ 5] ^= X[ 8];
+            X[10] += X[ 3]; X[ 3] = RotL_64(X[ 3],R1024_7_6); X[ 3] ^= X[10];
+            X[12] += X[ 7]; X[ 7] = RotL_64(X[ 7],R1024_7_7); X[ 7] ^= X[12];    Skein_Show_Round(BLK_BITS,&ctx->h,8*r  ,X);
+            InjectKey(2*r);                   /* even-numbered injection, after all 8 rounds of the group */
+            }
+        /* do the final "feedforward" xor, update context chaining vars */
+        for (i=0;i<WCNT;i++)
+            ctx->X[i] = X[i] ^ w[i];
+        Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
+        
+		Skein_Clear_First_Flag(ctx->h);		/* clear the start bit */
+        blkPtr += SKEIN1024_BLOCK_BYTES;      /* advance to the next input block */
+        }
+    while (--blkCnt);                         /* repeat until every requested block is consumed */
+    }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)  /* measurement helpers, built only on request */
+size_t Skein1024_Process_Block_CodeSize(void)
+    { /* address gap from Skein1024_Process_Block to this function */
+    return ((u08b_t *) Skein1024_Process_Block_CodeSize) -
+           ((u08b_t *) Skein1024_Process_Block);   /* NOTE(review): function-to-object pointer casts are not defined by ISO C; result is meaningful only if the toolchain places the two functions adjacently in this order */
+    }
+uint_t Skein1024_Unroll_Cnt(void)
+    { /* always reports 1 for this implementation */
+    return 1;
+    }
+#endif
diff --git a/Reference_Implementation/skein_debug.c b/Reference_Implementation/skein_debug.c
new file mode 100644
index 0000000000000..fac5038598ea5
--- /dev/null
+++ b/Reference_Implementation/skein_debug.c
@@ -0,0 +1,247 @@
+/***********************************************************************
+**
+** Debug output functions for Skein hashing.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+************************************************************************/
+#include <stdio.h>
+
+#ifdef SKEIN_DEBUG  /* only instantiate this code if SKEIN_DEBUG is on */
+#include "skein.h"
+
+static const char INDENT[] =  "    ";  /* how much to indent on new line: printed at the start of each dump row */
+
+uint_t skein_DebugFlag = 0;  /* bit mask tested against the SKEIN_DEBUG_* flags below; off by default. Must be set externally */
+
+static void Show64_step(size_t cnt,const u64b_t *X,size_t step)
+    {   /* print cnt 64-bit words X[0], X[step], X[2*step], ... as HHHHHHHH.LLLLLLLL, four per indented line */
+    size_t i,j;                                  /* i: words printed so far; j: index into X, advancing by step */
+    for (i=j=0;i < cnt;i++,j+=step)
+        {
+        if (i % 4 ==  0) fputs(INDENT,stdout);   /* INDENT is data, so it must not be used as the format string */
+        printf(" %08lX.%08lX ",(unsigned long)(uint_32t)(X[j] >> 32),(unsigned long)(uint_32t)X[j]);  /* %lX matches the cast however uint_32t is defined */
+        if (i % 4 ==  3 || i==cnt-1) printf("\n");   /* end the row after four words or after the last word */
+        fflush(stdout);
+        }
+    }
+
+#define Show64(cnt,X) Show64_step(cnt,X,1)       /* dump cnt contiguous words */
+
+static void Show64_flag(size_t cnt,const u64b_t *X)
+    {   /* like Show64, but an odd pointer value means "use every second word": the low bit is a flag, not part of the address */
+    const size_t addr   = (size_t) X;
+    const size_t tagged = addr & 1;
+    if (tagged)
+        {
+        X = (const u64b_t *) (addr & ~1);    /* strip the flag bit to recover the real pointer */
+        }
+    Show64_step(cnt,X,tagged ? 2 : 1);
+    }
+
+static void Show08(size_t cnt,const u08b_t *b)
+    {   /* hex-dump cnt bytes: 16 per indented line, with an extra space before each further group of 4 */
+    size_t i;
+    for (i=0;i < cnt;i++)
+        {
+        if (i %16 ==  0) fputs(INDENT,stdout);   /* INDENT is data, so it must not be used as the format string */
+        else if (i % 4 == 0) printf(" ");
+        printf(" %02X",(unsigned) b[i]);         /* %X takes unsigned int; make the promotion explicit */
+        if (i %16 == 15 || i==cnt-1) printf("\n");   /* end the row after 16 bytes or after the last byte */
+        fflush(stdout);
+        }
+    }
+
+static const char *AlgoHeader(uint_t bits)
+    {   /* label for debug output: ":Threefish-N:" when SKEIN_DEBUG_THREEFISH is set, else ":Skein-N:"; never returns NULL */
+    if (skein_DebugFlag & SKEIN_DEBUG_THREEFISH)
+        switch (bits)
+            {
+            case  256:  return ":Threefish-256: ";
+            case  512:  return ":Threefish-512: ";
+            case 1024:  return ":Threefish-1024:";
+            }
+    else
+        switch (bits)
+            {
+            case  256:  return ":Skein-256: ";
+            case  512:  return ":Skein-512: ";
+            case 1024:  return ":Skein-1024:";
+            }
+    return (skein_DebugFlag & SKEIN_DEBUG_THREEFISH) ? ":Threefish-???:" : ":Skein-???:";  /* unknown size: the result is fed to printf("%s"), so NULL is not safe */
+    }
+
+void Skein_Show_Final(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t cnt,const u08b_t *outPtr)
+    {   /* dump the cnt output bytes at outPtr when SKEIN_DEBUG_FINAL is set; CFG-type blocks only if SKEIN_DEBUG_CONFIG is set too */
+    if (((skein_DebugFlag & SKEIN_DEBUG_CONFIG) || ((h->T[1] & SKEIN_T1_BLK_TYPE_MASK) != SKEIN_T1_BLK_TYPE_CFG)) &&
+         (skein_DebugFlag & SKEIN_DEBUG_FINAL))
+        {
+        printf("\n%s Final output=\n",AlgoHeader(bits));
+        Show08(cnt,outPtr);
+        printf("    ++++++++++\n");
+        fflush(stdout);
+        }
+    }
+
+/* show state after a round (or "pseudo-round"): r is a round number, or a SKEIN_RND_* code (>= SKEIN_RND_SPECIAL) */
+void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t r,const u64b_t *X)
+    {
+    static uint_t injectNum=0;  /* not multi-thread safe! counts key injections since the last initial injection */
+
+    if (skein_DebugFlag & SKEIN_DEBUG_CONFIG || ((h->T[1] & SKEIN_T1_BLK_TYPE_MASK) != SKEIN_T1_BLK_TYPE_CFG))  /* CFG-type blocks only when asked for */
+    if (skein_DebugFlag)
+        {
+        if (r >= SKEIN_RND_SPECIAL) 
+            {       /* a key injection (or feedforward) point */
+            injectNum = (r == SKEIN_RND_KEY_INITIAL) ? 0 : injectNum+1;
+            if (  skein_DebugFlag & SKEIN_DEBUG_INJECT ||
+                ((skein_DebugFlag & SKEIN_DEBUG_FINAL) && r == SKEIN_RND_FEED_FWD))
+                {
+                printf("\n%s",AlgoHeader(bits));
+                switch (r)
+                    {
+                    case SKEIN_RND_KEY_INITIAL:
+                        printf(" [state after initial key injection]");
+                        break;
+                    case SKEIN_RND_KEY_INJECT:
+                        printf(" [state after key injection #%02u]",(unsigned) injectNum);  /* unsigned counter: %u, not %d */
+                        break;
+                    case SKEIN_RND_FEED_FWD:
+                        printf(" [state after plaintext feedforward]");
+                        injectNum = 0;
+                        break;
+                    }
+                printf("=\n");
+                Show64(bits/64,X);
+                if (r== SKEIN_RND_FEED_FWD)
+                    printf("    ----------\n");
+                }
+            }
+        else if (skein_DebugFlag & SKEIN_DEBUG_ROUNDS)
+            {       /* an ordinary round: optionally print the words in permuted order */
+            uint_t j;
+            u64b_t p[SKEIN_MAX_STATE_WORDS];
+            const u08b_t *perm;
+            static const u08b_t PERM_256 [4][ 4] = { { 0,1,2,3 }, { 0,3,2,1 }, { 0,1,2,3 }, { 0,3,2,1 } };  /* row = r & 3 */
+            static const u08b_t PERM_512 [4][ 8] = { { 0,1,2,3,4,5,6,7 },
+                                                     { 2,1,4,7,6,5,0,3 },
+                                                     { 4,1,6,3,0,5,2,7 },
+                                                     { 6,1,0,7,2,5,4,3 }
+                                                   };
+            static const u08b_t PERM_1024[4][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 },
+                                                     { 0, 9, 2,13, 6,11, 4,15,10, 7,12, 3,14, 5, 8, 1 },
+                                                     { 0, 7, 2, 5, 4, 3, 6, 1,12,15,14,13, 8,11,10, 9 },
+                                                     { 0,15, 2,11, 6,13, 4, 9,14, 1, 8, 5,10, 3,12, 7 }
+                                                   };
+                    
+            if ((skein_DebugFlag & SKEIN_DEBUG_PERMUTE) && (r & 3))  /* rounds with r%4 == 0 use the identity row, so print them directly */
+                {
+                printf("\n%s [state after round %2d (permuted)]=\n",AlgoHeader(bits),(int)r);
+                switch (bits)
+                    {
+                    case  256: perm = PERM_256 [r&3];   break;
+                    case  512: perm = PERM_512 [r&3];   break;
+                    default:   perm = PERM_1024[r&3];   break;
+                    }
+                for (j=0;j<bits/64;j++)
+                    p[j] = X[perm[j]];
+                Show64(bits/64,p);
+                }
+            else
+                {
+                printf("\n%s [state after round %2d]=\n",AlgoHeader(bits),(int)r);
+                Show64(bits/64,X);
+                }
+            }
+        }
+    }
+
+/*
+** Show the state after a round (or "pseudo-round") when the caller holds
+** the state as a list of pointers to words: the words are gathered into a
+** local array, which is then handed to Skein_Show_Round().
+*/
+void Skein_Show_R_Ptr(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t r,const u64b_t *X_ptr[])
+    {
+    u64b_t state[SKEIN_MAX_STATE_WORDS];
+    const uint_t wordCnt = bits/64;     /* number of state words to gather */
+    uint_t w;
+
+    for (w=0; w<wordCnt; w++)
+        state[w] = *X_ptr[w];           /* the word each pointer refers to */
+    Skein_Show_Round(bits,h,r,state);
+    }
+
+
+/*
+** Show the state at the start of a block.
+**
+**   bits    state size in bits (bits/64 words, bits/8 bytes are displayed)
+**   h       context header: supplies hashBitLen and the tweak words T[0..1]
+**   X       chaining state ("Key" words when SKEIN_DEBUG_THREEFISH is set)
+**   blkPtr  input block as bytes   (shown for SKEIN_DEBUG_INPUT_08)
+**   wPtr    input block as words   (shown for SKEIN_DEBUG_INPUT_64)
+**   ksPtr   key schedule, (bits/64)+1 words (shown for SKEIN_DEBUG_KEYSCHED)
+**   tsPtr   tweak schedule, 3 words         (shown for SKEIN_DEBUG_KEYSCHED)
+**
+** Nothing is printed when skein_DebugFlag is zero, and config blocks are
+** skipped unless SKEIN_DEBUG_CONFIG is set.
+*/
+void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,const u08b_t *blkPtr,
+                      const u64b_t *wPtr, const u64b_t *ksPtr, const u64b_t *tsPtr)
+    {
+    uint_t n;
+
+    if (skein_DebugFlag == 0)
+        return;                 /* debugging is off: h is not even examined */
+    if (!(skein_DebugFlag & SKEIN_DEBUG_CONFIG) &&
+        (h->T[1] & SKEIN_T1_BLK_TYPE_MASK) == SKEIN_T1_BLK_TYPE_CFG)
+        return;                 /* config blocks are shown only on request */
+
+    if (skein_DebugFlag & SKEIN_DEBUG_HDR)
+        {
+        /* both values are cast to uint_t, so use unsigned conversions */
+        printf("\n%s Block: outBits=%4u. T0=%06X.",AlgoHeader(bits),(uint_t) h->hashBitLen,(uint_t)h->T[0]);
+        printf(" Type=");
+        n = (uint_t) ((h->T[1] & SKEIN_T1_BLK_TYPE_MASK) >> SKEIN_T1_POS_BLK_TYPE);
+        switch (n)
+            {
+            case SKEIN_BLK_TYPE_KEY:  printf("KEY. ");  break;
+            case SKEIN_BLK_TYPE_CFG:  printf("CFG. ");  break;
+            case SKEIN_BLK_TYPE_PERS: printf("PERS.");  break;
+            case SKEIN_BLK_TYPE_PK :  printf("PK.  ");  break;
+            case SKEIN_BLK_TYPE_KDF:  printf("KDF. ");  break;
+            case SKEIN_BLK_TYPE_MSG:  printf("MSG. ");  break;
+            case SKEIN_BLK_TYPE_OUT:  printf("OUT. ");  break;
+            default:    printf("0x%02X.",n); break;
+            }
+        printf(" Flags=");
+        /* fputs: the selected text is data, not a printf format string */
+        fputs((h->T[1] & SKEIN_T1_FLAG_FIRST)   ? " First":"      ",stdout);
+        fputs((h->T[1] & SKEIN_T1_FLAG_FINAL)   ? " Final":"      ",stdout);
+        fputs((h->T[1] & SKEIN_T1_FLAG_BIT_PAD) ? " Pad"  :"    ",stdout);
+        n = (uint_t) ((h->T[1] & SKEIN_T1_TREE_LVL_MASK) >> SKEIN_T1_POS_TREE_LVL);
+        if (n)
+            printf("  TreeLevel = %02X",n);
+        printf("\n");
+        fflush(stdout);
+        }
+    if (skein_DebugFlag & SKEIN_DEBUG_TWEAK)
+        {
+        printf("  Tweak:\n");
+        Show64(2,h->T);
+        }
+    if (skein_DebugFlag & SKEIN_DEBUG_STATE)
+        {
+        printf("  %s words:\n",(skein_DebugFlag & SKEIN_DEBUG_THREEFISH)?"Key":"State");
+        Show64(bits/64,X);
+        }
+    if (skein_DebugFlag & SKEIN_DEBUG_KEYSCHED)
+        {
+        printf("  Tweak schedule:\n");
+        Show64_flag(3,tsPtr);
+        printf("  Key   schedule:\n");
+        Show64_flag((bits/64)+1,ksPtr);
+        }
+    if (skein_DebugFlag & SKEIN_DEBUG_INPUT_64)
+        {
+        printf("  Input block (words):\n");
+        Show64(bits/64,wPtr);
+        }
+    if (skein_DebugFlag & SKEIN_DEBUG_INPUT_08)
+        {
+        printf("  Input block (bytes):\n");
+        Show08(bits/8,blkPtr);
+        }
+    }
+
+/*
+** Show the MAC key.  The key is printed only if it is non-empty, the block
+** is not a config block (or SKEIN_DEBUG_CONFIG is set), and SKEIN_DEBUG_KEY
+** is set in skein_DebugFlag.
+*/
+void Skein_Show_Key(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u08b_t *key,size_t keyBytes)
+    {
+    if (keyBytes == 0)
+        return;                 /* no key supplied */
+    if (!(skein_DebugFlag & SKEIN_DEBUG_CONFIG) &&
+        (h->T[1] & SKEIN_T1_BLK_TYPE_MASK) == SKEIN_T1_BLK_TYPE_CFG)
+        return;                 /* config blocks are shown only on request */
+    if (!(skein_DebugFlag & SKEIN_DEBUG_KEY))
+        return;                 /* key display not requested */
+
+    printf("\n%s MAC key = %4u bytes\n",AlgoHeader(bits),(unsigned) keyBytes);
+    Show08(keyBytes,key);
+    }
+#endif
diff --git a/Reference_Implementation/skein_debug.h b/Reference_Implementation/skein_debug.h
new file mode 100644
index 0000000000000..7775c0165c0ac
--- /dev/null
+++ b/Reference_Implementation/skein_debug.h
@@ -0,0 +1,48 @@
+#ifndef _SKEIN_DEBUG_H_
+#define _SKEIN_DEBUG_H_
+/***********************************************************************
+**
+** Interface definitions for Skein hashing debug output.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+************************************************************************/
+
+#ifdef  SKEIN_DEBUG
+/* debug callout functions used inside Skein code; output is selected by skein_DebugFlag */
+void    Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,const u08b_t *blkPtr,
+                         const u64b_t *wPtr,const u64b_t *ksPtr,const u64b_t *tsPtr);  /* state at the start of a block */
+void    Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t r,const u64b_t *X);  /* state after a round (or "pseudo-round") */
+void    Skein_Show_R_Ptr(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t r,const u64b_t *X_ptr[]);  /* same, given a list of word pointers */
+void    Skein_Show_Final(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t cnt,const u08b_t *outPtr);  /* final output (SKEIN_DEBUG_FINAL) */
+void    Skein_Show_Key  (uint_t bits,const Skein_Ctxt_Hdr_t *h,const u08b_t *key,size_t keyBytes);  /* MAC key (SKEIN_DEBUG_KEY) */
+
+extern  uint_t skein_DebugFlag;            /* flags to control debug output (0 --> none) */
+
+#define SKEIN_RND_SPECIAL       (1000u)                 /* r >= this: key injection / feedforward "pseudo-round" */
+#define SKEIN_RND_KEY_INITIAL   (SKEIN_RND_SPECIAL+0u)  /* state after initial key injection */
+#define SKEIN_RND_KEY_INJECT    (SKEIN_RND_SPECIAL+1u)  /* state after a subsequent key injection */
+#define SKEIN_RND_FEED_FWD      (SKEIN_RND_SPECIAL+2u)  /* state after plaintext feedforward */
+
+/* flag bits:  skein_DebugFlag */
+#define SKEIN_DEBUG_KEY         (1u << 1)  /* show MAC key */
+#define SKEIN_DEBUG_CONFIG      (1u << 2)  /* show config block processing */
+#define SKEIN_DEBUG_STATE       (1u << 3)  /* show input state during Show_Block() */
+#define SKEIN_DEBUG_TWEAK       (1u << 4)  /* show tweak words during Show_Block() */
+#define SKEIN_DEBUG_KEYSCHED    (1u << 5)  /* show expanded key schedule */
+#define SKEIN_DEBUG_INPUT_64    (1u << 6)  /* show input block as 64-bit words */
+#define SKEIN_DEBUG_INPUT_08    (1u << 7)  /* show input block as  8-bit bytes */
+#define SKEIN_DEBUG_INJECT      (1u << 8)  /* show state after key injection & feedforward points */
+#define SKEIN_DEBUG_ROUNDS      (1u << 9)  /* show state after all rounds */
+#define SKEIN_DEBUG_FINAL       (1u <<10)  /* show final output of Skein */
+#define SKEIN_DEBUG_HDR         (1u <<11)  /* show block header */
+#define SKEIN_DEBUG_THREEFISH   (1u <<12)  /* use Threefish name instead of Skein */
+#define SKEIN_DEBUG_PERMUTE     (1u <<13)  /* use word permutations */
+#define SKEIN_DEBUG_ALL         ((~0u) & ~(SKEIN_DEBUG_THREEFISH | SKEIN_DEBUG_PERMUTE))  /* every bit except THREEFISH and PERMUTE */
+#define THREEFISH_DEBUG_ALL     (SKEIN_DEBUG_ALL | SKEIN_DEBUG_THREEFISH)  /* SKEIN_DEBUG_ALL plus Threefish naming */
+
+#endif /*  SKEIN_DEBUG    */
+
+#endif /* _SKEIN_DEBUG_H_ */
diff --git a/Reference_Implementation/skein_port.h b/Reference_Implementation/skein_port.h
new file mode 100644
index 0000000000000..e0dcc85bdc624
--- /dev/null
+++ b/Reference_Implementation/skein_port.h
@@ -0,0 +1,44 @@
+#ifndef _SKEIN_PORT_H_
+#define _SKEIN_PORT_H_
+/*******************************************************************
+**
+** Platform-specific definitions for Skein hash function.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+** Many thanks to Brian Gladman for his portable header files, which
+** have been modified slightly here, to handle a few more platforms.
+**
+** To port Skein to an "unsupported" platform, change the definitions
+** in this file appropriately.
+** 
+********************************************************************/
+
+#include "brg_types.h"                      /* get integer type definitions */
+
+typedef unsigned int    uint_t;             /* native unsigned integer */
+typedef uint_8t         u08b_t;             /*  8-bit unsigned integer */
+typedef uint_64t        u64b_t;             /* 64-bit unsigned integer */
+
+/*
+ * Skein is "natively" little-endian (unlike SHA-xxx), for optimal
+ * performance on x86 CPUs.  The Skein code requires the following
+ * definitions for dealing with endianness:
+ *
+ *    Skein_Put64_LSB_First
+ *    Skein_Get64_LSB_First
+ *    Skein_Swap64
+ *
+ * In the reference code, these functions are implemented in a
+ * very portable (and thus slow) fashion, for clarity. See the file
+ * "skein_port.h" in the Optimized_Code directory for ways to make
+ * these functions fast(er) on x86 platforms.
+ */
+
+u64b_t Skein_Swap64(u64b_t w64);  /* defined elsewhere; presumably returns w64 with its byte order reversed -- confirm */
+void   Skein_Put64_LSB_First(u08b_t *dst,const u64b_t *src,size_t bCnt);  /* NOTE(review): bCnt looks like a byte count -- confirm against the definition */
+void   Skein_Get64_LSB_First(u64b_t *dst,const u08b_t *src,size_t wCnt);  /* NOTE(review): wCnt looks like a 64-bit word count -- confirm against the definition */
+
+#endif   /* ifndef _SKEIN_PORT_H_ */
diff --git a/Supporting_Documentation/Skein Cover Sheet.pdf b/Supporting_Documentation/Skein Cover Sheet.pdf
new file mode 100644
index 0000000000000..c08b0e11322df
--- /dev/null
+++ b/Supporting_Documentation/Skein Cover Sheet.pdf
diff --git a/Supporting_Documentation/Skein_Implementation_Statement.pdf b/Supporting_Documentation/Skein_Implementation_Statement.pdf
new file mode 100644
index 0000000000000..87366ab3d73e2
--- /dev/null
+++ b/Supporting_Documentation/Skein_Implementation_Statement.pdf
diff --git a/Supporting_Documentation/Skein_Submitter_Statement.pdf b/Supporting_Documentation/Skein_Submitter_Statement.pdf
new file mode 100644
index 0000000000000..cc96accf83684
--- /dev/null
+++ b/Supporting_Documentation/Skein_Submitter_Statement.pdf
diff --git a/Supporting_Documentation/skein1.3.pdf b/Supporting_Documentation/skein1.3.pdf
new file mode 100644
index 0000000000000..844ba9e925e9d
--- /dev/null
+++ b/Supporting_Documentation/skein1.3.pdf
diff --git a/Supporting_Documentation/skeinround3Mods.pdf b/Supporting_Documentation/skeinround3Mods.pdf
new file mode 100644
index 0000000000000..304dfe2df5ba4
--- /dev/null
+++ b/Supporting_Documentation/skeinround3Mods.pdf
diff --git a/Supporting_Documentation/tex/key_recover.pdf b/Supporting_Documentation/tex/key_recover.pdf
new file mode 100644
index 0000000000000..6b69970ad1363
--- /dev/null
+++ b/Supporting_Documentation/tex/key_recover.pdf
diff --git a/Supporting_Documentation/tex/reverserounds256.pdf b/Supporting_Documentation/tex/reverserounds256.pdf
new file mode 100644
index 0000000000000..37f8f48b3ab65
--- /dev/null
+++ b/Supporting_Documentation/tex/reverserounds256.pdf
diff --git a/Supporting_Documentation/tex/skein-21.mps b/Supporting_Documentation/tex/skein-21.mps
new file mode 100644
index 0000000000000..5da4a37b26aee
--- /dev/null
+++ b/Supporting_Documentation/tex/skein-21.mps
@@ -0,0 +1,161 @@
+%!PS
+%%BoundingBox: -46 -68 85 19 
+%%HiResBoundingBox: -45.20761 -67.94917 84.85632 18.1732 
+%%Creator: MetaPost 1.000
+%%CreationDate: 2008.08.28:0231
+%%Pages: 1
+%*Font: cmmi10 9.96265 9.96265 3c:800002
+%*Font: cmmi7 6.97385 6.97385 3b:80000000000201
+%%BeginProlog
+%%EndProlog
+%%Page: 1 1
+ 0 0 0 setrgbcolor
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth [] 0 setdash
+ 1 setlinecap 1 setlinejoin 10 setmiterlimit
+newpath 4 4 moveto
+4 4 lineto
+4 -4 lineto
+4 -4 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -4 4 moveto
+-4 4 lineto
+4 4 lineto
+4 4 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath -4 4 moveto
+-4 4 lineto
+-4 -4 lineto
+-4 -4 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -4 -4 moveto
+-4 -4 lineto
+4 -4 lineto
+4 -4 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 4 moveto
+0 -4 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 4 0 moveto
+-4 0 lineto stroke
+31.3464 -25.37865 moveto
+(<) cmmi10 9.96265 fshow
+35.7743 -25.37865 moveto
+(<) cmmi10 9.96265 fshow
+40.2023 -25.37865 moveto
+(<) cmmi10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 50.95102 -17.00784 moveto
+50.95102 -17.00784 lineto
+50.95102 -28.76813 lineto
+50.95102 -28.76813 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 28.3464 -17.00784 moveto
+28.3464 -17.00784 lineto
+50.95102 -17.00784 lineto
+50.95102 -17.00784 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 28.3464 -17.00784 moveto
+28.3464 -17.00784 lineto
+28.3464 -28.76813 lineto
+28.3464 -28.76813 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 28.3464 -28.76813 moveto
+28.3464 -28.76813 lineto
+50.95102 -28.76813 lineto
+50.95102 -28.76813 lineto stroke
+newpath 42.47728 -46.9474 moveto
+43.20105 -47.67117 43.64871 -48.67105 43.64871 -49.77548 curveto
+43.64871 -49.77646 lineto
+43.64871 -50.88089 43.20105 -51.88077 42.47728 -52.60454 curveto stroke
+newpath 36.82014 -46.9474 moveto
+37.54391 -46.22363 38.5438 -45.77597 39.64822 -45.77597 curveto
+39.6492 -45.77597 lineto
+40.75363 -45.77597 41.75351 -46.22363 42.47728 -46.9474 curveto stroke
+newpath 36.82014 -46.9474 moveto
+36.09637 -47.67117 35.64871 -48.67105 35.64871 -49.77548 curveto
+35.64871 -49.77646 lineto
+35.64871 -50.88089 36.09637 -51.88077 36.82014 -52.60454 curveto stroke
+newpath 36.82014 -52.60454 moveto
+37.54391 -53.32831 38.5438 -53.77597 39.64822 -53.77597 curveto
+39.6492 -53.77597 lineto
+40.75363 -53.77597 41.75351 -53.32831 42.47728 -52.60454 curveto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 39.64871 -45.77597 moveto
+39.64871 -53.77597 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 43.64871 -49.77597 moveto
+35.64871 -49.77597 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 18.1732 moveto
+0 4 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -1.53078 7.69554 moveto
+0 4 lineto
+1.53078 7.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 39.64871 18.1732 moveto
+39.64871 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 38.11801 -13.31248 moveto
+39.64871 -17.00784 lineto
+41.17941 -13.31248 lineto
+ closepath
+gsave fill grestore stroke
+newpath 39.64871 0 moveto
+4 0 lineto stroke
+newpath 7.69574 1.53087 moveto
+4 0 lineto
+7.69574 -1.53087 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 -4 moveto
+0 -67.94917 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -1.53067 -64.2539 moveto
+0 -67.94917 lineto
+1.53067 -64.2539 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 39.64871 -28.76813 moveto
+39.64871 -45.77597 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 38.11795 -42.08049 moveto
+39.64871 -45.77597 lineto
+41.17947 -42.08049 lineto
+ closepath
+gsave fill grestore stroke
+newpath 0 -49.77597 moveto
+35.64871 -49.77597 lineto stroke
+newpath 31.95297 -51.30684 moveto
+35.64871 -49.77597 lineto
+31.95297 -48.2451 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 39.64871 -53.77597 moveto
+39.64871 -67.94917 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 38.11794 -64.25363 moveto
+39.64871 -67.94917 lineto
+41.17949 -64.25363 lineto
+ closepath
+gsave fill grestore stroke
+newpath 65.12422 -22.88799 moveto
+50.95102 -22.88799 lineto stroke
+newpath 54.64656 -21.35721 moveto
+50.95102 -22.88799 lineto
+54.64656 -24.41876 lineto
+ closepath
+gsave fill grestore stroke
+68.12422 -24.86668 moveto
+(R) cmmi10 9.96265 fshow
+75.68872 -26.36108 moveto
+(r) cmmi7 6.97385 fshow
+79.17342 -26.36108 moveto
+(;i) cmmi7 6.97385 fshow
+showpage
+%%EOF
diff --git a/Supporting_Documentation/tex/skein-22.mps b/Supporting_Documentation/tex/skein-22.mps
new file mode 100644
index 0000000000000..7d14dfe24d84b
--- /dev/null
+++ b/Supporting_Documentation/tex/skein-22.mps
@@ -0,0 +1,832 @@
+%!PS
+%%BoundingBox: -59 -315 229 30 
+%%HiResBoundingBox: -58.56683 -314.45247 228.6453 29.76099 
+%%Creator: MetaPost 1.000
+%%CreationDate: 2008.08.28:0231
+%%Pages: 1
+%*Font: cmr10 9.96265 9.96265 30:c400000490006c5f3cc
+%%BeginProlog
+%%EndProlog
+%%Page: 1 1
+ 0 0 0 setrgbcolor
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth [] 0 setdash
+ 1 setlinecap 1 setlinejoin 10 setmiterlimit
+newpath 170.07843 0 moveto
+170.07843 0 lineto
+170.07843 -16 lineto
+170.07843 -16 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 0 0 moveto
+0 0 lineto
+170.07843 0 lineto
+170.07843 0 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 0 moveto
+0 0 lineto
+0 -16 lineto
+0 -16 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 0 -16 moveto
+0 -16 lineto
+170.07843 -16 lineto
+170.07843 -16 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 89.03922 -4 moveto
+89.03922 -4 lineto
+89.03922 -12 lineto
+89.03922 -12 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 81.03922 -4 moveto
+81.03922 -4 lineto
+89.03922 -4 lineto
+89.03922 -4 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 81.03922 -4 moveto
+81.03922 -4 lineto
+81.03922 -12 lineto
+81.03922 -12 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 81.03922 -12 moveto
+81.03922 -12 lineto
+89.03922 -12 lineto
+89.03922 -12 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 85.03922 -4 moveto
+85.03922 -12 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 89.03922 -8 moveto
+81.03922 -8 lineto stroke
+newpath -14.1732 -8 moveto
+0 -8 lineto stroke
+newpath -3.69554 -9.53078 moveto
+0 -8 lineto
+-3.69554 -6.46922 lineto
+ closepath
+gsave fill grestore stroke
+-56.7471 -10.49066 moveto
+(Subk) cmr10 9.96265 fshow
+-35.1613 -10.49066 moveto
+(ey) cmr10 9.96265 fshow
+-22.1545 -10.49066 moveto
+(0) cmr10 9.96265 fshow
+33.94067 -43.49834 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 77.95241 -30.1732 moveto
+77.95241 -30.1732 lineto
+77.95241 -50.01569 lineto
+77.95241 -50.01569 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -30.1732 moveto
+7.08684 -30.1732 lineto
+77.95241 -30.1732 lineto
+77.95241 -30.1732 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 7.08684 -30.1732 moveto
+7.08684 -30.1732 lineto
+7.08684 -50.01569 lineto
+7.08684 -50.01569 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -50.01569 moveto
+7.08684 -50.01569 lineto
+77.95241 -50.01569 lineto
+77.95241 -50.01569 lineto stroke
+118.97946 -43.49834 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 162.9912 -30.1732 moveto
+162.9912 -30.1732 lineto
+162.9912 -50.01569 lineto
+162.9912 -50.01569 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 92.12563 -30.1732 moveto
+92.12563 -30.1732 lineto
+162.9912 -30.1732 lineto
+162.9912 -30.1732 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 92.12563 -30.1732 moveto
+92.12563 -30.1732 lineto
+92.12563 -50.01569 lineto
+92.12563 -50.01569 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 92.12563 -50.01569 moveto
+92.12563 -50.01569 lineto
+162.9912 -50.01569 lineto
+162.9912 -50.01569 lineto stroke
+66.69112 -73.99669 moveto
+(P) cmr10 9.96265 fshow
+73.19452 -73.99669 moveto
+(erm) cmr10 9.96265 fshow
+89.54991 -73.99669 moveto
+(ute) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 162.9912 -64.18889 moveto
+162.9912 -64.18889 lineto
+162.9912 -76.99669 lineto
+162.9912 -76.99669 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -64.18889 moveto
+7.08684 -64.18889 lineto
+162.9912 -64.18889 lineto
+162.9912 -64.18889 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 7.08684 -64.18889 moveto
+7.08684 -64.18889 lineto
+7.08684 -76.99669 lineto
+7.08684 -76.99669 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -76.99669 moveto
+7.08684 -76.99669 lineto
+162.9912 -76.99669 lineto
+162.9912 -76.99669 lineto stroke
+33.94067 -104.49503 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 77.95241 -91.16989 moveto
+77.95241 -91.16989 lineto
+77.95241 -111.01237 lineto
+77.95241 -111.01237 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -91.16989 moveto
+7.08684 -91.16989 lineto
+77.95241 -91.16989 lineto
+77.95241 -91.16989 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 7.08684 -91.16989 moveto
+7.08684 -91.16989 lineto
+7.08684 -111.01237 lineto
+7.08684 -111.01237 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -111.01237 moveto
+7.08684 -111.01237 lineto
+77.95241 -111.01237 lineto
+77.95241 -111.01237 lineto stroke
+118.97945 -104.49503 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 162.9912 -91.16989 moveto
+162.9912 -91.16989 lineto
+162.9912 -111.01237 lineto
+162.9912 -111.01237 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 92.12561 -91.16989 moveto
+92.12561 -91.16989 lineto
+162.9912 -91.16989 lineto
+162.9912 -91.16989 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 92.12561 -91.16989 moveto
+92.12561 -91.16989 lineto
+92.12561 -111.01237 lineto
+92.12561 -111.01237 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 92.12561 -111.01237 moveto
+92.12561 -111.01237 lineto
+162.9912 -111.01237 lineto
+162.9912 -111.01237 lineto stroke
+66.69112 -134.99338 moveto
+(P) cmr10 9.96265 fshow
+73.19452 -134.99338 moveto
+(erm) cmr10 9.96265 fshow
+89.54991 -134.99338 moveto
+(ute) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 162.9912 -125.18558 moveto
+162.9912 -125.18558 lineto
+162.9912 -137.99338 lineto
+162.9912 -137.99338 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -125.18558 moveto
+7.08684 -125.18558 lineto
+162.9912 -125.18558 lineto
+162.9912 -125.18558 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 7.08684 -125.18558 moveto
+7.08684 -125.18558 lineto
+7.08684 -137.99338 lineto
+7.08684 -137.99338 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -137.99338 moveto
+7.08684 -137.99338 lineto
+162.9912 -137.99338 lineto
+162.9912 -137.99338 lineto stroke
+33.94067 -236.35773 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 77.95241 -223.0326 moveto
+77.95241 -223.0326 lineto
+77.95241 -242.87508 lineto
+77.95241 -242.87508 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -223.0326 moveto
+7.08684 -223.0326 lineto
+77.95241 -223.0326 lineto
+77.95241 -223.0326 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 7.08684 -223.0326 moveto
+7.08684 -223.0326 lineto
+7.08684 -242.87508 lineto
+7.08684 -242.87508 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -242.87508 moveto
+7.08684 -242.87508 lineto
+77.95241 -242.87508 lineto
+77.95241 -242.87508 lineto stroke
+118.97945 -236.35773 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 162.9912 -223.0326 moveto
+162.9912 -223.0326 lineto
+162.9912 -242.87508 lineto
+162.9912 -242.87508 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 92.12561 -223.0326 moveto
+92.12561 -223.0326 lineto
+162.9912 -223.0326 lineto
+162.9912 -223.0326 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 92.12561 -223.0326 moveto
+92.12561 -223.0326 lineto
+92.12561 -242.87508 lineto
+92.12561 -242.87508 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 92.12561 -242.87508 moveto
+92.12561 -242.87508 lineto
+162.9912 -242.87508 lineto
+162.9912 -242.87508 lineto stroke
+66.69112 -266.85608 moveto
+(P) cmr10 9.96265 fshow
+73.19452 -266.85608 moveto
+(erm) cmr10 9.96265 fshow
+89.54991 -266.85608 moveto
+(ute) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 162.9912 -257.04828 moveto
+162.9912 -257.04828 lineto
+162.9912 -269.85606 lineto
+162.9912 -269.85606 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -257.04828 moveto
+7.08684 -257.04828 lineto
+162.9912 -257.04828 lineto
+162.9912 -257.04828 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 7.08684 -257.04828 moveto
+7.08684 -257.04828 lineto
+7.08684 -269.85606 lineto
+7.08684 -269.85606 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -269.85606 moveto
+7.08684 -269.85606 lineto
+162.9912 -269.85606 lineto
+162.9912 -269.85606 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 170.07843 -284.02927 moveto
+170.07843 -284.02927 lineto
+170.07843 -300.02927 lineto
+170.07843 -300.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 0 -284.02927 moveto
+0 -284.02927 lineto
+170.07843 -284.02927 lineto
+170.07843 -284.02927 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 -284.02927 moveto
+0 -284.02927 lineto
+0 -300.02927 lineto
+0 -300.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 0 -300.02927 moveto
+0 -300.02927 lineto
+170.07843 -300.02927 lineto
+170.07843 -300.02927 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 89.03922 -288.02927 moveto
+89.03922 -288.02927 lineto
+89.03922 -296.02927 lineto
+89.03922 -296.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 81.03922 -288.02927 moveto
+81.03922 -288.02927 lineto
+89.03922 -288.02927 lineto
+89.03922 -288.02927 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 81.03922 -288.02927 moveto
+81.03922 -288.02927 lineto
+81.03922 -296.02927 lineto
+81.03922 -296.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 81.03922 -296.02927 moveto
+81.03922 -296.02927 lineto
+89.03922 -296.02927 lineto
+89.03922 -296.02927 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 85.03922 -288.02927 moveto
+85.03922 -296.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 89.03922 -292.02927 moveto
+81.03922 -292.02927 lineto stroke
+newpath -14.1732 -292.02927 moveto
+0 -292.02927 lineto stroke
+newpath -3.69554 -293.56004 moveto
+0 -292.02927 lineto
+-3.69554 -290.49849 lineto
+ closepath
+gsave fill grestore stroke
+-56.7471 -294.51993 moveto
+(Subk) cmr10 9.96265 fshow
+-35.1613 -294.51993 moveto
+(ey) cmr10 9.96265 fshow
+-22.1545 -294.51993 moveto
+(1) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 14.1732 moveto
+21.25981 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72903 3.69554 moveto
+21.25981 0 lineto
+22.79059 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 -16 moveto
+21.25981 -30.1732 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72903 -26.47766 moveto
+21.25981 -30.1732 lineto
+22.79059 -26.47766 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 -50.01569 moveto
+21.25981 -64.18889 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72903 -60.49335 moveto
+21.25981 -64.18889 lineto
+22.79059 -60.49335 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 -76.99669 moveto
+21.25981 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72903 -87.47435 moveto
+21.25981 -91.16989 lineto
+22.79059 -87.47435 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 -111.01237 moveto
+21.25981 -125.18558 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72903 -121.49004 moveto
+21.25981 -125.18558 lineto
+22.79059 -121.49004 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 -137.99338 moveto
+21.25981 -152.16658 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72903 -148.47104 moveto
+21.25981 -152.16658 lineto
+22.79059 -148.47104 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 -208.85939 moveto
+21.25981 -223.0326 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72903 -219.33705 moveto
+21.25981 -223.0326 lineto
+22.79059 -219.33705 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 -242.87508 moveto
+21.25981 -257.04828 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72903 -253.35274 moveto
+21.25981 -257.04828 lineto
+22.79059 -253.35274 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 -269.85606 moveto
+21.25981 -284.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72903 -280.33372 moveto
+21.25981 -284.02927 lineto
+22.79059 -280.33372 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 -300.02927 moveto
+21.25981 -314.20247 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72903 -310.50693 moveto
+21.25981 -314.20247 lineto
+22.79059 -310.50693 lineto
+ closepath
+gsave fill grestore stroke
+newpath 22.75421 -166.33978 moveto
+22.75421 -165.94345 22.59677 -165.56334 22.31651 -165.28308 curveto
+22.03625 -165.00282 21.65614 -164.84538 21.25981 -164.84538 curveto
+20.86348 -164.84538 20.48337 -165.00282 20.20311 -165.28308 curveto
+19.92285 -165.56334 19.76541 -165.94345 19.76541 -166.33978 curveto
+19.76541 -166.73611 19.92285 -167.11623 20.20311 -167.39648 curveto
+20.48337 -167.67674 20.86348 -167.83418 21.25981 -167.83418 curveto
+21.65614 -167.83418 22.03625 -167.67674 22.31651 -167.39648 curveto
+22.59677 -167.11623 22.75421 -166.73611 22.75421 -166.33978 curveto closepath
+ fill
+newpath 22.75421 -180.51299 moveto
+22.75421 -180.11665 22.59677 -179.73654 22.31651 -179.45628 curveto
+22.03625 -179.17603 21.65614 -179.01859 21.25981 -179.01859 curveto
+20.86348 -179.01859 20.48337 -179.17603 20.20311 -179.45628 curveto
+19.92285 -179.73654 19.76541 -180.11665 19.76541 -180.51299 curveto
+19.76541 -180.90932 19.92285 -181.28943 20.20311 -181.56969 curveto
+20.48337 -181.84995 20.86348 -182.00739 21.25981 -182.00739 curveto
+21.65614 -182.00739 22.03625 -181.84995 22.31651 -181.56969 curveto
+22.59677 -181.28943 22.75421 -180.90932 22.75421 -180.51299 curveto closepath
+ fill
+newpath 22.75421 -194.68619 moveto
+22.75421 -194.28986 22.59677 -193.90974 22.31651 -193.62949 curveto
+22.03625 -193.34923 21.65614 -193.19179 21.25981 -193.19179 curveto
+20.86348 -193.19179 20.48337 -193.34923 20.20311 -193.62949 curveto
+19.92285 -193.90974 19.76541 -194.28986 19.76541 -194.68619 curveto
+19.76541 -195.08252 19.92285 -195.46263 20.20311 -195.74289 curveto
+20.48337 -196.02315 20.86348 -196.18059 21.25981 -196.18059 curveto
+21.65614 -196.18059 22.03625 -196.02315 22.31651 -195.74289 curveto
+22.59677 -195.46263 22.75421 -195.08252 22.75421 -194.68619 curveto closepath
+ fill
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 14.1732 moveto
+63.77942 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24864 3.69554 moveto
+63.77942 0 lineto
+65.3102 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 -16 moveto
+63.77942 -30.1732 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24864 -26.47766 moveto
+63.77942 -30.1732 lineto
+65.3102 -26.47766 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 -50.01569 moveto
+63.77942 -64.18889 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24864 -60.49335 moveto
+63.77942 -64.18889 lineto
+65.3102 -60.49335 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 -76.99669 moveto
+63.77942 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24864 -87.47435 moveto
+63.77942 -91.16989 lineto
+65.3102 -87.47435 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 -111.01237 moveto
+63.77942 -125.18558 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24864 -121.49004 moveto
+63.77942 -125.18558 lineto
+65.3102 -121.49004 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 -137.99338 moveto
+63.77942 -152.16658 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24864 -148.47104 moveto
+63.77942 -152.16658 lineto
+65.3102 -148.47104 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 -208.85939 moveto
+63.77942 -223.0326 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24864 -219.33705 moveto
+63.77942 -223.0326 lineto
+65.3102 -219.33705 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 -242.87508 moveto
+63.77942 -257.04828 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24864 -253.35274 moveto
+63.77942 -257.04828 lineto
+65.3102 -253.35274 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 -269.85606 moveto
+63.77942 -284.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24864 -280.33372 moveto
+63.77942 -284.02927 lineto
+65.3102 -280.33372 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 -300.02927 moveto
+63.77942 -314.20247 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24864 -310.50693 moveto
+63.77942 -314.20247 lineto
+65.3102 -310.50693 lineto
+ closepath
+gsave fill grestore stroke
+newpath 65.27382 -166.33978 moveto
+65.27382 -165.94345 65.11638 -165.56334 64.83612 -165.28308 curveto
+64.55586 -165.00282 64.17575 -164.84538 63.77942 -164.84538 curveto
+63.38309 -164.84538 63.00298 -165.00282 62.72272 -165.28308 curveto
+62.44246 -165.56334 62.28502 -165.94345 62.28502 -166.33978 curveto
+62.28502 -166.73611 62.44246 -167.11623 62.72272 -167.39648 curveto
+63.00298 -167.67674 63.38309 -167.83418 63.77942 -167.83418 curveto
+64.17575 -167.83418 64.55586 -167.67674 64.83612 -167.39648 curveto
+65.11638 -167.11623 65.27382 -166.73611 65.27382 -166.33978 curveto closepath
+ fill
+newpath 65.27382 -180.51299 moveto
+65.27382 -180.11665 65.11638 -179.73654 64.83612 -179.45628 curveto
+64.55586 -179.17603 64.17575 -179.01859 63.77942 -179.01859 curveto
+63.38309 -179.01859 63.00298 -179.17603 62.72272 -179.45628 curveto
+62.44246 -179.73654 62.28502 -180.11665 62.28502 -180.51299 curveto
+62.28502 -180.90932 62.44246 -181.28943 62.72272 -181.56969 curveto
+63.00298 -181.84995 63.38309 -182.00739 63.77942 -182.00739 curveto
+64.17575 -182.00739 64.55586 -181.84995 64.83612 -181.56969 curveto
+65.11638 -181.28943 65.27382 -180.90932 65.27382 -180.51299 curveto closepath
+ fill
+newpath 65.27382 -194.68619 moveto
+65.27382 -194.28986 65.11638 -193.90974 64.83612 -193.62949 curveto
+64.55586 -193.34923 64.17575 -193.19179 63.77942 -193.19179 curveto
+63.38309 -193.19179 63.00298 -193.34923 62.72272 -193.62949 curveto
+62.44246 -193.90974 62.28502 -194.28986 62.28502 -194.68619 curveto
+62.28502 -195.08252 62.44246 -195.46263 62.72272 -195.74289 curveto
+63.00298 -196.02315 63.38309 -196.18059 63.77942 -196.18059 curveto
+64.17575 -196.18059 64.55586 -196.02315 64.83612 -195.74289 curveto
+65.11638 -195.46263 65.27382 -195.08252 65.27382 -194.68619 curveto closepath
+ fill
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 14.1732 moveto
+106.29903 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76825 3.69554 moveto
+106.29903 0 lineto
+107.8298 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 -16 moveto
+106.29903 -30.1732 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76825 -26.47766 moveto
+106.29903 -30.1732 lineto
+107.8298 -26.47766 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 -50.01569 moveto
+106.29903 -64.18889 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76825 -60.49335 moveto
+106.29903 -64.18889 lineto
+107.8298 -60.49335 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 -76.99669 moveto
+106.29903 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76825 -87.47435 moveto
+106.29903 -91.16989 lineto
+107.8298 -87.47435 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 -111.01237 moveto
+106.29903 -125.18558 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76825 -121.49004 moveto
+106.29903 -125.18558 lineto
+107.8298 -121.49004 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 -137.99338 moveto
+106.29903 -152.16658 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76825 -148.47104 moveto
+106.29903 -152.16658 lineto
+107.8298 -148.47104 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 -208.85939 moveto
+106.29903 -223.0326 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76825 -219.33705 moveto
+106.29903 -223.0326 lineto
+107.8298 -219.33705 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 -242.87508 moveto
+106.29903 -257.04828 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76825 -253.35274 moveto
+106.29903 -257.04828 lineto
+107.8298 -253.35274 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 -269.85606 moveto
+106.29903 -284.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76825 -280.33372 moveto
+106.29903 -284.02927 lineto
+107.8298 -280.33372 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 -300.02927 moveto
+106.29903 -314.20247 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76825 -310.50693 moveto
+106.29903 -314.20247 lineto
+107.8298 -310.50693 lineto
+ closepath
+gsave fill grestore stroke
+newpath 107.79343 -166.33978 moveto
+107.79343 -165.94345 107.63599 -165.56334 107.35573 -165.28308 curveto
+107.07547 -165.00282 106.69536 -164.84538 106.29903 -164.84538 curveto
+105.9027 -164.84538 105.52258 -165.00282 105.24232 -165.28308 curveto
+104.96207 -165.56334 104.80463 -165.94345 104.80463 -166.33978 curveto
+104.80463 -166.73611 104.96207 -167.11623 105.24232 -167.39648 curveto
+105.52258 -167.67674 105.9027 -167.83418 106.29903 -167.83418 curveto
+106.69536 -167.83418 107.07547 -167.67674 107.35573 -167.39648 curveto
+107.63599 -167.11623 107.79343 -166.73611 107.79343 -166.33978 curveto
+ closepath fill
+newpath 107.79343 -180.51299 moveto
+107.79343 -180.11665 107.63599 -179.73654 107.35573 -179.45628 curveto
+107.07547 -179.17603 106.69536 -179.01859 106.29903 -179.01859 curveto
+105.9027 -179.01859 105.52258 -179.17603 105.24232 -179.45628 curveto
+104.96207 -179.73654 104.80463 -180.11665 104.80463 -180.51299 curveto
+104.80463 -180.90932 104.96207 -181.28943 105.24232 -181.56969 curveto
+105.52258 -181.84995 105.9027 -182.00739 106.29903 -182.00739 curveto
+106.69536 -182.00739 107.07547 -181.84995 107.35573 -181.56969 curveto
+107.63599 -181.28943 107.79343 -180.90932 107.79343 -180.51299 curveto
+ closepath fill
+newpath 107.79343 -194.68619 moveto
+107.79343 -194.28986 107.63599 -193.90974 107.35573 -193.62949 curveto
+107.07547 -193.34923 106.69536 -193.19179 106.29903 -193.19179 curveto
+105.9027 -193.19179 105.52258 -193.34923 105.24232 -193.62949 curveto
+104.96207 -193.90974 104.80463 -194.28986 104.80463 -194.68619 curveto
+104.80463 -195.08252 104.96207 -195.46263 105.24232 -195.74289 curveto
+105.52258 -196.02315 105.9027 -196.18059 106.29903 -196.18059 curveto
+106.69536 -196.18059 107.07547 -196.02315 107.35573 -195.74289 curveto
+107.63599 -195.46263 107.79343 -195.08252 107.79343 -194.68619 curveto
+ closepath fill
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 14.1732 moveto
+148.81863 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28786 3.69554 moveto
+148.81863 0 lineto
+150.34941 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 -16 moveto
+148.81863 -30.1732 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28786 -26.47766 moveto
+148.81863 -30.1732 lineto
+150.34941 -26.47766 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 -50.01569 moveto
+148.81863 -64.18889 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28786 -60.49335 moveto
+148.81863 -64.18889 lineto
+150.34941 -60.49335 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 -76.99669 moveto
+148.81863 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28786 -87.47435 moveto
+148.81863 -91.16989 lineto
+150.34941 -87.47435 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 -111.01237 moveto
+148.81863 -125.18558 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28786 -121.49004 moveto
+148.81863 -125.18558 lineto
+150.34941 -121.49004 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 -137.99338 moveto
+148.81863 -152.16658 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28786 -148.47104 moveto
+148.81863 -152.16658 lineto
+150.34941 -148.47104 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 -208.85939 moveto
+148.81863 -223.0326 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28786 -219.33705 moveto
+148.81863 -223.0326 lineto
+150.34941 -219.33705 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 -242.87508 moveto
+148.81863 -257.04828 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28786 -253.35274 moveto
+148.81863 -257.04828 lineto
+150.34941 -253.35274 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 -269.85606 moveto
+148.81863 -284.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28786 -280.33372 moveto
+148.81863 -284.02927 lineto
+150.34941 -280.33372 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 -300.02927 moveto
+148.81863 -314.20247 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28786 -310.50693 moveto
+148.81863 -314.20247 lineto
+150.34941 -310.50693 lineto
+ closepath
+gsave fill grestore stroke
+newpath 150.31303 -166.33978 moveto
+150.31303 -165.94345 150.1556 -165.56334 149.87534 -165.28308 curveto
+149.59508 -165.00282 149.21497 -164.84538 148.81863 -164.84538 curveto
+148.4223 -164.84538 148.04219 -165.00282 147.76193 -165.28308 curveto
+147.48167 -165.56334 147.32423 -165.94345 147.32423 -166.33978 curveto
+147.32423 -166.73611 147.48167 -167.11623 147.76193 -167.39648 curveto
+148.04219 -167.67674 148.4223 -167.83418 148.81863 -167.83418 curveto
+149.21497 -167.83418 149.59508 -167.67674 149.87534 -167.39648 curveto
+150.1556 -167.11623 150.31303 -166.73611 150.31303 -166.33978 curveto closepath
+ fill
+newpath 150.31303 -180.51299 moveto
+150.31303 -180.11665 150.1556 -179.73654 149.87534 -179.45628 curveto
+149.59508 -179.17603 149.21497 -179.01859 148.81863 -179.01859 curveto
+148.4223 -179.01859 148.04219 -179.17603 147.76193 -179.45628 curveto
+147.48167 -179.73654 147.32423 -180.11665 147.32423 -180.51299 curveto
+147.32423 -180.90932 147.48167 -181.28943 147.76193 -181.56969 curveto
+148.04219 -181.84995 148.4223 -182.00739 148.81863 -182.00739 curveto
+149.21497 -182.00739 149.59508 -181.84995 149.87534 -181.56969 curveto
+150.1556 -181.28943 150.31303 -180.90932 150.31303 -180.51299 curveto closepath
+ fill
+newpath 150.31303 -194.68619 moveto
+150.31303 -194.28986 150.1556 -193.90974 149.87534 -193.62949 curveto
+149.59508 -193.34923 149.21497 -193.19179 148.81863 -193.19179 curveto
+148.4223 -193.19179 148.04219 -193.34923 147.76193 -193.62949 curveto
+147.48167 -193.90974 147.32423 -194.28986 147.32423 -194.68619 curveto
+147.32423 -195.08252 147.48167 -195.46263 147.76193 -195.74289 curveto
+148.04219 -196.02315 148.4223 -196.18059 148.81863 -196.18059 curveto
+149.21497 -196.18059 149.59508 -196.02315 149.87534 -195.74289 curveto
+150.1556 -195.46263 150.31303 -195.08252 150.31303 -194.68619 curveto closepath
+ fill
+65.04471 22.84248 moveto
+(Plain) cmr10 9.96265 fshow
+87.5991 22.84248 moveto
+(text) cmr10 9.96265 fshow
+165.9912 -183.97226 moveto
+(5) cmr10 9.96265 fshow
+174.2934 -183.97226 moveto
+(more) cmr10 9.96265 fshow
+199.22769 -183.97226 moveto
+(rounds) cmr10 9.96265 fshow
+showpage
+%%EOF
diff --git a/Supporting_Documentation/tex/skein-23.mps b/Supporting_Documentation/tex/skein-23.mps
new file mode 100644
index 0000000000000..bd31152d40b89
--- /dev/null
+++ b/Supporting_Documentation/tex/skein-23.mps
@@ -0,0 +1,327 @@
+%!PS
+%%BoundingBox: -54 -61 145 46 
+%%HiResBoundingBox: -53.12164 -60.92598 144.97418 45.3179 
+%%Creator: MetaPost 1.000
+%%CreationDate: 2008.08.28:0231
+%%Pages: 1
+%*Font: cmmi10 9.96265 9.96265 48:84
+%*Font: cmr10 9.96265 9.96265 0c:800000000fca000000000440a
+%*Font: cmr7 6.97385 6.97385 30:e
+%%BeginProlog
+%%EndProlog
+%%Page: 1 1
+ 0 0 0 setrgbcolor
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth [] 0 setdash
+ 1 setlinecap 1 setlinejoin 10 setmiterlimit
+newpath 1.66928 17.00784 moveto
+1.66928 17.00784 lineto
+1.66928 -17.00784 lineto
+1.66928 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -18.1732 17.00784 moveto
+-18.1732 17.00784 lineto
+1.66928 17.00784 lineto
+1.66928 17.00784 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath -18.1732 17.00784 moveto
+-18.1732 17.00784 lineto
+-18.1732 -17.00784 lineto
+-18.1732 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -18.1732 -17.00784 moveto
+-18.1732 -17.00784 lineto
+1.66928 -17.00784 lineto
+1.66928 -17.00784 lineto stroke
+newpath 17.00177 2.82857 moveto
+17.72554 2.1048 18.1732 1.10492 18.1732 0.00049 curveto
+18.1732 -0.00049 lineto
+18.1732 -1.10492 17.72554 -2.1048 17.00177 -2.82857 curveto stroke
+newpath 11.34464 2.82857 moveto
+12.0684 3.55234 13.06828 4 14.17271 4 curveto
+14.17369 4 lineto
+15.27812 4 16.278 3.55234 17.00177 2.82857 curveto stroke
+newpath 11.34464 2.82857 moveto
+10.62086 2.1048 10.1732 1.10492 10.1732 0.00049 curveto
+10.1732 -0.00049 lineto
+10.1732 -1.10492 10.62086 -2.1048 11.34464 -2.82857 curveto stroke
+newpath 11.34464 -2.82857 moveto
+12.0684 -3.55234 13.06828 -4 14.17271 -4 curveto
+14.17369 -4 lineto
+15.27812 -4 16.278 -3.55234 17.00177 -2.82857 curveto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 14.1732 4 moveto
+14.1732 -4 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 18.1732 0 moveto
+10.1732 0 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath -8.25197 34.01569 moveto
+-8.25197 17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -9.78273 20.70332 moveto
+-8.25197 17.00784 lineto
+-6.7212 20.70332 lineto
+ closepath
+gsave fill grestore stroke
+newpath -8.25197 25.51176 moveto
+14.1732 25.51176 lineto
+14.1732 4 lineto stroke
+newpath 12.64244 7.69548 moveto
+14.1732 4 lineto
+15.70396 7.69548 lineto
+ closepath
+gsave fill grestore stroke
+newpath 1.66928 0 moveto
+10.1732 0 lineto stroke
+newpath 6.47772 -1.53076 moveto
+10.1732 0 lineto
+6.47772 1.53076 lineto
+ closepath
+gsave fill grestore stroke
+newpath -23.84248 -34.01569 moveto
+-23.84248 -2.83464 lineto
+-18.1732 -2.83464 lineto stroke
+newpath -21.86865 -4.36539 moveto
+-18.1732 -2.83464 lineto
+-21.86865 -1.3039 lineto
+ closepath
+gsave fill grestore stroke
+newpath -18.1732 5.66928 moveto
+-12.50392 0 lineto
+-18.1732 -5.66928 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 57.85817 17.00784 moveto
+57.85817 17.00784 lineto
+57.85817 -17.00784 lineto
+57.85817 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 38.01569 17.00784 moveto
+38.01569 17.00784 lineto
+57.85817 17.00784 lineto
+57.85817 17.00784 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 38.01569 17.00784 moveto
+38.01569 17.00784 lineto
+38.01569 -17.00784 lineto
+38.01569 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 38.01569 -17.00784 moveto
+38.01569 -17.00784 lineto
+57.85817 -17.00784 lineto
+57.85817 -17.00784 lineto stroke
+newpath 73.19066 2.82857 moveto
+73.91443 2.1048 74.36209 1.10492 74.36209 0.00049 curveto
+74.36209 -0.00049 lineto
+74.36209 -1.10492 73.91443 -2.1048 73.19066 -2.82857 curveto stroke
+newpath 67.53352 2.82857 moveto
+68.2573 3.55234 69.25717 4 70.3616 4 curveto
+70.36258 4 lineto
+71.46701 4 72.46689 3.55234 73.19066 2.82857 curveto stroke
+newpath 67.53352 2.82857 moveto
+66.80975 2.1048 66.36209 1.10492 66.36209 0.00049 curveto
+66.36209 -0.00049 lineto
+66.36209 -1.10492 66.80975 -2.1048 67.53352 -2.82857 curveto stroke
+newpath 67.53352 -2.82857 moveto
+68.2573 -3.55234 69.25717 -4 70.3616 -4 curveto
+70.36258 -4 lineto
+71.46701 -4 72.46689 -3.55234 73.19066 -2.82857 curveto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 70.36209 4 moveto
+70.36209 -4 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 74.36209 0 moveto
+66.36209 0 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 47.93692 34.01569 moveto
+47.93692 17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 46.40616 20.70332 moveto
+47.93692 17.00784 lineto
+49.46768 20.70332 lineto
+ closepath
+gsave fill grestore stroke
+newpath 47.93692 25.51176 moveto
+70.36209 25.51176 lineto
+70.36209 4 lineto stroke
+newpath 68.83133 7.69548 moveto
+70.36209 4 lineto
+71.89285 7.69548 lineto
+ closepath
+gsave fill grestore stroke
+newpath 57.85817 0 moveto
+66.36209 0 lineto stroke
+newpath 62.66661 -1.53076 moveto
+66.36209 0 lineto
+62.66661 1.53076 lineto
+ closepath
+gsave fill grestore stroke
+newpath 32.3464 -34.01569 moveto
+32.3464 -2.83464 lineto
+38.01569 -2.83464 lineto stroke
+newpath 34.32024 -4.36539 moveto
+38.01569 -2.83464 lineto
+34.32024 -1.3039 lineto
+ closepath
+gsave fill grestore stroke
+newpath 38.01569 5.66928 moveto
+43.68497 0 lineto
+38.01569 -5.66928 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 114.04706 17.00784 moveto
+114.04706 17.00784 lineto
+114.04706 -17.00784 lineto
+114.04706 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.20457 17.00784 moveto
+94.20457 17.00784 lineto
+114.04706 17.00784 lineto
+114.04706 17.00784 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 94.20457 17.00784 moveto
+94.20457 17.00784 lineto
+94.20457 -17.00784 lineto
+94.20457 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.20457 -17.00784 moveto
+94.20457 -17.00784 lineto
+114.04706 -17.00784 lineto
+114.04706 -17.00784 lineto stroke
+newpath 129.37955 2.82857 moveto
+130.10332 2.1048 130.55098 1.10492 130.55098 0.00049 curveto
+130.55098 -0.00049 lineto
+130.55098 -1.10492 130.10332 -2.1048 129.37955 -2.82857 curveto stroke
+newpath 123.72241 2.82857 moveto
+124.44618 3.55234 125.44606 4 126.55049 4 curveto
+126.55147 4 lineto
+127.6559 4 128.65578 3.55234 129.37955 2.82857 curveto stroke
+newpath 123.72241 2.82857 moveto
+122.99864 2.1048 122.55098 1.10492 122.55098 0.00049 curveto
+122.55098 -0.00049 lineto
+122.55098 -1.10492 122.99864 -2.1048 123.72241 -2.82857 curveto stroke
+newpath 123.72241 -2.82857 moveto
+124.44618 -3.55234 125.44606 -4 126.55049 -4 curveto
+126.55147 -4 lineto
+127.6559 -4 128.65578 -3.55234 129.37955 -2.82857 curveto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 126.55098 4 moveto
+126.55098 -4 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 130.55098 0 moveto
+122.55098 0 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 104.12581 34.01569 moveto
+104.12581 17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 102.59505 20.70332 moveto
+104.12581 17.00784 lineto
+105.65657 20.70332 lineto
+ closepath
+gsave fill grestore stroke
+newpath 104.12581 25.51176 moveto
+126.55098 25.51176 lineto
+126.55098 4 lineto stroke
+newpath 125.02022 7.69548 moveto
+126.55098 4 lineto
+128.08174 7.69548 lineto
+ closepath
+gsave fill grestore stroke
+newpath 114.04706 0 moveto
+122.55098 0 lineto stroke
+newpath 118.8555 -1.53076 moveto
+122.55098 0 lineto
+118.8555 1.53076 lineto
+ closepath
+gsave fill grestore stroke
+newpath 88.5353 -34.01569 moveto
+88.5353 -2.83464 lineto
+94.20457 -2.83464 lineto stroke
+newpath 90.50912 -4.36539 moveto
+94.20457 -2.83464 lineto
+90.50912 -1.3039 lineto
+ closepath
+gsave fill grestore stroke
+newpath 94.20457 5.66928 moveto
+99.87386 0 lineto
+94.20457 -5.66928 lineto stroke
+newpath 18.1732 0 moveto
+38.01569 0 lineto stroke
+newpath 34.32025 -1.53073 moveto
+38.01569 0 lineto
+34.32025 1.53073 lineto
+ closepath
+gsave fill grestore stroke
+newpath 74.36209 0 moveto
+94.20457 0 lineto stroke
+newpath 90.50914 -1.53073 moveto
+94.20457 0 lineto
+90.50914 1.53073 lineto
+ closepath
+gsave fill grestore stroke
+newpath 130.55098 0 moveto
+144.72418 0 lineto stroke
+newpath 141.02864 -1.53078 moveto
+144.72418 0 lineto
+141.02864 1.53078 lineto
+ closepath
+gsave fill grestore stroke
+newpath -32.3464 0 moveto
+-18.1732 0 lineto stroke
+newpath -21.86874 -1.53078 moveto
+-18.1732 0 lineto
+-21.86874 1.53078 lineto
+ closepath
+gsave fill grestore stroke
+-48.0972 -2.65671 moveto
+(H) cmmi10 9.96265 fshow
+-39.8158 -4.15111 moveto
+(0) cmr7 6.97385 fshow
+-15.31923 38.5101 moveto
+(M) cmmi10 9.96265 fshow
+-5.65402 37.0157 moveto
+(0) cmr7 6.97385 fshow
+40.86966 38.5101 moveto
+(M) cmmi10 9.96265 fshow
+50.53487 37.0157 moveto
+(1) cmr7 6.97385 fshow
+97.05855 38.5101 moveto
+(M) cmmi10 9.96265 fshow
+106.72375 37.0157 moveto
+(2) cmr7 6.97385 fshow
+-38.95255 -45.48389 moveto
+(len) cmr10 9.96265 fshow
+-25.22615 -45.48389 moveto
+(:) cmr10 9.96265 fshow
+-21.46245 -45.48389 moveto
+(512) cmr10 9.96265 fshow
+-45.04085 -57.43909 moveto
+(\014nal) cmr10 9.96265 fshow
+-25.22615 -57.43909 moveto
+(:) cmr10 9.96265 fshow
+-21.46245 -57.43909 moveto
+(0) cmr10 9.96265 fshow
+17.23634 -45.48389 moveto
+(len) cmr10 9.96265 fshow
+30.96274 -45.48389 moveto
+(:) cmr10 9.96265 fshow
+34.72644 -45.48389 moveto
+(1024) cmr10 9.96265 fshow
+11.14804 -57.43909 moveto
+(\014nal) cmr10 9.96265 fshow
+30.96274 -57.43909 moveto
+(:) cmr10 9.96265 fshow
+34.72644 -57.43909 moveto
+(0) cmr10 9.96265 fshow
+73.42523 -45.48389 moveto
+(len) cmr10 9.96265 fshow
+87.15163 -45.48389 moveto
+(:) cmr10 9.96265 fshow
+90.91533 -45.48389 moveto
+(1328) cmr10 9.96265 fshow
+67.33693 -57.43909 moveto
+(\014nal) cmr10 9.96265 fshow
+87.15163 -57.43909 moveto
+(:) cmr10 9.96265 fshow
+90.91533 -57.43909 moveto
+(1) cmr10 9.96265 fshow
+showpage
+%%EOF
diff --git a/Supporting_Documentation/tex/skein-24.mps b/Supporting_Documentation/tex/skein-24.mps
new file mode 100644
index 0000000000000..b8d475df4ff7d
--- /dev/null
+++ b/Supporting_Documentation/tex/skein-24.mps
@@ -0,0 +1,398 @@
+%!PS
+%%BoundingBox: -25 -35 244 35 
+%%HiResBoundingBox: -24.09248 -34.26569 243.17876 34.26569 
+%%Creator: MetaPost 1.000
+%%CreationDate: 2008.08.28:0231
+%%Pages: 1
+%%BeginProlog
+%%EndProlog
+%%Page: 1 1
+ 0 0 0 setrgbcolor
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth [] 0 setdash
+ 1 setlinecap 1 setlinejoin 10 setmiterlimit
+newpath 1.66928 17.00784 moveto
+1.66928 17.00784 lineto
+1.66928 -17.00784 lineto
+1.66928 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -18.1732 17.00784 moveto
+-18.1732 17.00784 lineto
+1.66928 17.00784 lineto
+1.66928 17.00784 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath -18.1732 17.00784 moveto
+-18.1732 17.00784 lineto
+-18.1732 -17.00784 lineto
+-18.1732 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -18.1732 -17.00784 moveto
+-18.1732 -17.00784 lineto
+1.66928 -17.00784 lineto
+1.66928 -17.00784 lineto stroke
+newpath 17.00177 2.82857 moveto
+17.72554 2.1048 18.1732 1.10492 18.1732 0.00049 curveto
+18.1732 -0.00049 lineto
+18.1732 -1.10492 17.72554 -2.1048 17.00177 -2.82857 curveto stroke
+newpath 11.34464 2.82857 moveto
+12.0684 3.55234 13.06828 4 14.17271 4 curveto
+14.17369 4 lineto
+15.27812 4 16.278 3.55234 17.00177 2.82857 curveto stroke
+newpath 11.34464 2.82857 moveto
+10.62086 2.1048 10.1732 1.10492 10.1732 0.00049 curveto
+10.1732 -0.00049 lineto
+10.1732 -1.10492 10.62086 -2.1048 11.34464 -2.82857 curveto stroke
+newpath 11.34464 -2.82857 moveto
+12.0684 -3.55234 13.06828 -4 14.17271 -4 curveto
+14.17369 -4 lineto
+15.27812 -4 16.278 -3.55234 17.00177 -2.82857 curveto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 14.1732 4 moveto
+14.1732 -4 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 18.1732 0 moveto
+10.1732 0 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath -8.25197 34.01569 moveto
+-8.25197 17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -9.78273 20.70332 moveto
+-8.25197 17.00784 lineto
+-6.7212 20.70332 lineto
+ closepath
+gsave fill grestore stroke
+newpath -8.25197 25.51176 moveto
+14.1732 25.51176 lineto
+14.1732 4 lineto stroke
+newpath 12.64244 7.69548 moveto
+14.1732 4 lineto
+15.70396 7.69548 lineto
+ closepath
+gsave fill grestore stroke
+newpath 1.66928 0 moveto
+10.1732 0 lineto stroke
+newpath 6.47772 -1.53076 moveto
+10.1732 0 lineto
+6.47772 1.53076 lineto
+ closepath
+gsave fill grestore stroke
+newpath -23.84248 -34.01569 moveto
+-23.84248 -2.83464 lineto
+-18.1732 -2.83464 lineto stroke
+newpath -21.86865 -4.36539 moveto
+-18.1732 -2.83464 lineto
+-21.86865 -1.3039 lineto
+ closepath
+gsave fill grestore stroke
+newpath -18.1732 5.66928 moveto
+-12.50392 0 lineto
+-18.1732 -5.66928 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 57.85817 17.00784 moveto
+57.85817 17.00784 lineto
+57.85817 -17.00784 lineto
+57.85817 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 38.01569 17.00784 moveto
+38.01569 17.00784 lineto
+57.85817 17.00784 lineto
+57.85817 17.00784 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 38.01569 17.00784 moveto
+38.01569 17.00784 lineto
+38.01569 -17.00784 lineto
+38.01569 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 38.01569 -17.00784 moveto
+38.01569 -17.00784 lineto
+57.85817 -17.00784 lineto
+57.85817 -17.00784 lineto stroke
+newpath 73.19066 2.82857 moveto
+73.91443 2.1048 74.36209 1.10492 74.36209 0.00049 curveto
+74.36209 -0.00049 lineto
+74.36209 -1.10492 73.91443 -2.1048 73.19066 -2.82857 curveto stroke
+newpath 67.53352 2.82857 moveto
+68.2573 3.55234 69.25717 4 70.3616 4 curveto
+70.36258 4 lineto
+71.46701 4 72.46689 3.55234 73.19066 2.82857 curveto stroke
+newpath 67.53352 2.82857 moveto
+66.80975 2.1048 66.36209 1.10492 66.36209 0.00049 curveto
+66.36209 -0.00049 lineto
+66.36209 -1.10492 66.80975 -2.1048 67.53352 -2.82857 curveto stroke
+newpath 67.53352 -2.82857 moveto
+68.2573 -3.55234 69.25717 -4 70.3616 -4 curveto
+70.36258 -4 lineto
+71.46701 -4 72.46689 -3.55234 73.19066 -2.82857 curveto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 70.36209 4 moveto
+70.36209 -4 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 74.36209 0 moveto
+66.36209 0 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 47.93692 34.01569 moveto
+47.93692 17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 46.40616 20.70332 moveto
+47.93692 17.00784 lineto
+49.46768 20.70332 lineto
+ closepath
+gsave fill grestore stroke
+newpath 47.93692 25.51176 moveto
+70.36209 25.51176 lineto
+70.36209 4 lineto stroke
+newpath 68.83133 7.69548 moveto
+70.36209 4 lineto
+71.89285 7.69548 lineto
+ closepath
+gsave fill grestore stroke
+newpath 57.85817 0 moveto
+66.36209 0 lineto stroke
+newpath 62.66661 -1.53076 moveto
+66.36209 0 lineto
+62.66661 1.53076 lineto
+ closepath
+gsave fill grestore stroke
+newpath 32.3464 -34.01569 moveto
+32.3464 -2.83464 lineto
+38.01569 -2.83464 lineto stroke
+newpath 34.32024 -4.36539 moveto
+38.01569 -2.83464 lineto
+34.32024 -1.3039 lineto
+ closepath
+gsave fill grestore stroke
+newpath 38.01569 5.66928 moveto
+43.68497 0 lineto
+38.01569 -5.66928 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 114.04706 17.00784 moveto
+114.04706 17.00784 lineto
+114.04706 -17.00784 lineto
+114.04706 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.20457 17.00784 moveto
+94.20457 17.00784 lineto
+114.04706 17.00784 lineto
+114.04706 17.00784 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 94.20457 17.00784 moveto
+94.20457 17.00784 lineto
+94.20457 -17.00784 lineto
+94.20457 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.20457 -17.00784 moveto
+94.20457 -17.00784 lineto
+114.04706 -17.00784 lineto
+114.04706 -17.00784 lineto stroke
+newpath 129.37955 2.82857 moveto
+130.10332 2.1048 130.55098 1.10492 130.55098 0.00049 curveto
+130.55098 -0.00049 lineto
+130.55098 -1.10492 130.10332 -2.1048 129.37955 -2.82857 curveto stroke
+newpath 123.72241 2.82857 moveto
+124.44618 3.55234 125.44606 4 126.55049 4 curveto
+126.55147 4 lineto
+127.6559 4 128.65578 3.55234 129.37955 2.82857 curveto stroke
+newpath 123.72241 2.82857 moveto
+122.99864 2.1048 122.55098 1.10492 122.55098 0.00049 curveto
+122.55098 -0.00049 lineto
+122.55098 -1.10492 122.99864 -2.1048 123.72241 -2.82857 curveto stroke
+newpath 123.72241 -2.82857 moveto
+124.44618 -3.55234 125.44606 -4 126.55049 -4 curveto
+126.55147 -4 lineto
+127.6559 -4 128.65578 -3.55234 129.37955 -2.82857 curveto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 126.55098 4 moveto
+126.55098 -4 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 130.55098 0 moveto
+122.55098 0 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 104.12581 34.01569 moveto
+104.12581 17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 102.59505 20.70332 moveto
+104.12581 17.00784 lineto
+105.65657 20.70332 lineto
+ closepath
+gsave fill grestore stroke
+newpath 104.12581 25.51176 moveto
+126.55098 25.51176 lineto
+126.55098 4 lineto stroke
+newpath 125.02022 7.69548 moveto
+126.55098 4 lineto
+128.08174 7.69548 lineto
+ closepath
+gsave fill grestore stroke
+newpath 114.04706 0 moveto
+122.55098 0 lineto stroke
+newpath 118.8555 -1.53076 moveto
+122.55098 0 lineto
+118.8555 1.53076 lineto
+ closepath
+gsave fill grestore stroke
+newpath 88.5353 -34.01569 moveto
+88.5353 -2.83464 lineto
+94.20457 -2.83464 lineto stroke
+newpath 90.50912 -4.36539 moveto
+94.20457 -2.83464 lineto
+90.50912 -1.3039 lineto
+ closepath
+gsave fill grestore stroke
+newpath 94.20457 5.66928 moveto
+99.87386 0 lineto
+94.20457 -5.66928 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 170.23595 17.00784 moveto
+170.23595 17.00784 lineto
+170.23595 -17.00784 lineto
+170.23595 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 150.39346 17.00784 moveto
+150.39346 17.00784 lineto
+170.23595 17.00784 lineto
+170.23595 17.00784 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 150.39346 17.00784 moveto
+150.39346 17.00784 lineto
+150.39346 -17.00784 lineto
+150.39346 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 150.39346 -17.00784 moveto
+150.39346 -17.00784 lineto
+170.23595 -17.00784 lineto
+170.23595 -17.00784 lineto stroke
+newpath 185.56844 2.82857 moveto
+186.2922 2.1048 186.73987 1.10492 186.73987 0.00049 curveto
+186.73987 -0.00049 lineto
+186.73987 -1.10492 186.2922 -2.1048 185.56844 -2.82857 curveto stroke
+newpath 179.9113 2.82857 moveto
+180.63507 3.55234 181.63495 4 182.73938 4 curveto
+182.74036 4 lineto
+183.84479 4 184.84467 3.55234 185.56844 2.82857 curveto stroke
+newpath 179.9113 2.82857 moveto
+179.18753 2.1048 178.73987 1.10492 178.73987 0.00049 curveto
+178.73987 -0.00049 lineto
+178.73987 -1.10492 179.18753 -2.1048 179.9113 -2.82857 curveto stroke
+newpath 179.9113 -2.82857 moveto
+180.63507 -3.55234 181.63495 -4 182.73938 -4 curveto
+182.74036 -4 lineto
+183.84479 -4 184.84467 -3.55234 185.56844 -2.82857 curveto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 182.73987 4 moveto
+182.73987 -4 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 186.73987 0 moveto
+178.73987 0 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 160.3147 34.01569 moveto
+160.3147 17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 158.78394 20.70332 moveto
+160.3147 17.00784 lineto
+161.84546 20.70332 lineto
+ closepath
+gsave fill grestore stroke
+newpath 160.3147 25.51176 moveto
+182.73987 25.51176 lineto
+182.73987 4 lineto stroke
+newpath 181.2091 7.69548 moveto
+182.73987 4 lineto
+184.27063 7.69548 lineto
+ closepath
+gsave fill grestore stroke
+newpath 170.23595 0 moveto
+178.73987 0 lineto stroke
+newpath 175.04439 -1.53076 moveto
+178.73987 0 lineto
+175.04439 1.53076 lineto
+ closepath
+gsave fill grestore stroke
+newpath 144.72418 -34.01569 moveto
+144.72418 -2.83464 lineto
+150.39346 -2.83464 lineto stroke
+newpath 146.69801 -4.36539 moveto
+150.39346 -2.83464 lineto
+146.69801 -1.3039 lineto
+ closepath
+gsave fill grestore stroke
+newpath 150.39346 5.66928 moveto
+156.06274 0 lineto
+150.39346 -5.66928 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 226.42484 17.00784 moveto
+226.42484 17.00784 lineto
+226.42484 -17.00784 lineto
+226.42484 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 206.58235 17.00784 moveto
+206.58235 17.00784 lineto
+226.42484 17.00784 lineto
+226.42484 17.00784 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 206.58235 17.00784 moveto
+206.58235 17.00784 lineto
+206.58235 -17.00784 lineto
+206.58235 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 206.58235 -17.00784 moveto
+206.58235 -17.00784 lineto
+226.42484 -17.00784 lineto
+226.42484 -17.00784 lineto stroke
+newpath 241.75732 2.82857 moveto
+242.4811 2.1048 242.92876 1.10492 242.92876 0.00049 curveto
+242.92876 -0.00049 lineto
+242.92876 -1.10492 242.4811 -2.1048 241.75732 -2.82857 curveto stroke
+newpath 236.10019 2.82857 moveto
+236.82396 3.55234 237.82384 4 238.92827 4 curveto
+238.92924 4 lineto
+240.03368 4 241.03355 3.55234 241.75732 2.82857 curveto stroke
+newpath 236.10019 2.82857 moveto
+235.37642 2.1048 234.92876 1.10492 234.92876 0.00049 curveto
+234.92876 -0.00049 lineto
+234.92876 -1.10492 235.37642 -2.1048 236.10019 -2.82857 curveto stroke
+newpath 236.10019 -2.82857 moveto
+236.82396 -3.55234 237.82384 -4 238.92827 -4 curveto
+238.92924 -4 lineto
+240.03368 -4 241.03355 -3.55234 241.75732 -2.82857 curveto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 238.92876 4 moveto
+238.92876 -4 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 242.92876 0 moveto
+234.92876 0 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 216.50359 34.01569 moveto
+216.50359 17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 214.97282 20.70332 moveto
+216.50359 17.00784 lineto
+218.03435 20.70332 lineto
+ closepath
+gsave fill grestore stroke
+newpath 216.50359 25.51176 moveto
+238.92876 25.51176 lineto
+238.92876 4 lineto stroke
+newpath 237.398 7.69548 moveto
+238.92876 4 lineto
+240.45952 7.69548 lineto
+ closepath
+gsave fill grestore stroke
+newpath 226.42484 0 moveto
+234.92876 0 lineto stroke
+newpath 231.23328 -1.53076 moveto
+234.92876 0 lineto
+231.23328 1.53076 lineto
+ closepath
+gsave fill grestore stroke
+newpath 200.91307 -34.01569 moveto
+200.91307 -2.83464 lineto
+206.58235 -2.83464 lineto stroke
+newpath 202.8869 -4.36539 moveto
+206.58235 -2.83464 lineto
+202.8869 -1.3039 lineto
+ closepath
+gsave fill grestore stroke
+newpath 206.58235 5.66928 moveto
+212.25163 0 lineto
+206.58235 -5.66928 lineto stroke
+showpage
+%%EOF
diff --git a/Supporting_Documentation/tex/skein-25.mps b/Supporting_Documentation/tex/skein-25.mps
new file mode 100644
index 0000000000000..1519d259f91cb
--- /dev/null
+++ b/Supporting_Documentation/tex/skein-25.mps
@@ -0,0 +1,1440 @@
+%!PS
+%%BoundingBox: -63 -315 233 30 
+%%HiResBoundingBox: -62.10995 -314.45247 232.18839 29.76099 
+%%Creator: MetaPost 1.000
+%%CreationDate: 2008.08.28:0231
+%%Pages: 1
+%*Font: cmr10 9.96265 9.96265 30:c400000490006c5f3cc
+%%BeginProlog
+%%EndProlog
+%%Page: 1 1
+ 0 0 0 setrgbcolor
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth [] 0 setdash
+ 1 setlinecap 1 setlinejoin 10 setmiterlimit
+newpath 170.07843 0 moveto
+170.07843 0 lineto
+170.07843 -16 lineto
+170.07843 -16 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 0 0 moveto
+0 0 lineto
+170.07843 0 lineto
+170.07843 0 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 0 moveto
+0 0 lineto
+0 -16 lineto
+0 -16 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 0 -16 moveto
+0 -16 lineto
+170.07843 -16 lineto
+170.07843 -16 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 89.03922 -4 moveto
+89.03922 -4 lineto
+89.03922 -12 lineto
+89.03922 -12 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 81.03922 -4 moveto
+81.03922 -4 lineto
+89.03922 -4 lineto
+89.03922 -4 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 81.03922 -4 moveto
+81.03922 -4 lineto
+81.03922 -12 lineto
+81.03922 -12 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 81.03922 -12 moveto
+81.03922 -12 lineto
+89.03922 -12 lineto
+89.03922 -12 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 85.03922 -4 moveto
+85.03922 -12 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 89.03922 -8 moveto
+81.03922 -8 lineto stroke
+newpath -14.1732 -8 moveto
+0 -8 lineto stroke
+newpath -3.69554 -9.53078 moveto
+0 -8 lineto
+-3.69554 -6.46922 lineto
+ closepath
+gsave fill grestore stroke
+-56.7471 -10.49066 moveto
+(Subk) cmr10 9.96265 fshow
+-35.1613 -10.49066 moveto
+(ey) cmr10 9.96265 fshow
+-22.1545 -10.49066 moveto
+(0) cmr10 9.96265 fshow
+12.68088 -43.49834 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 38.9762 -30.1732 moveto
+38.9762 -30.1732 lineto
+38.9762 -50.01569 lineto
+38.9762 -50.01569 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -30.1732 moveto
+3.54344 -30.1732 lineto
+38.9762 -30.1732 lineto
+38.9762 -30.1732 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 3.54344 -30.1732 moveto
+3.54344 -30.1732 lineto
+3.54344 -50.01569 lineto
+3.54344 -50.01569 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -50.01569 moveto
+3.54344 -50.01569 lineto
+38.9762 -50.01569 lineto
+38.9762 -50.01569 lineto stroke
+55.20024 -43.49834 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 81.49556 -30.1732 moveto
+81.49556 -30.1732 lineto
+81.49556 -50.01569 lineto
+81.49556 -50.01569 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 46.0628 -30.1732 moveto
+46.0628 -30.1732 lineto
+81.49556 -30.1732 lineto
+81.49556 -30.1732 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 46.0628 -30.1732 moveto
+46.0628 -30.1732 lineto
+46.0628 -50.01569 lineto
+46.0628 -50.01569 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 46.0628 -50.01569 moveto
+46.0628 -50.01569 lineto
+81.49556 -50.01569 lineto
+81.49556 -50.01569 lineto stroke
+97.7196 -43.49834 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 124.01492 -30.1732 moveto
+124.01492 -30.1732 lineto
+124.01492 -50.01569 lineto
+124.01492 -50.01569 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 88.58217 -30.1732 moveto
+88.58217 -30.1732 lineto
+124.01492 -30.1732 lineto
+124.01492 -30.1732 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 88.58217 -30.1732 moveto
+88.58217 -30.1732 lineto
+88.58217 -50.01569 lineto
+88.58217 -50.01569 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 88.58217 -50.01569 moveto
+88.58217 -50.01569 lineto
+124.01492 -50.01569 lineto
+124.01492 -50.01569 lineto stroke
+140.23897 -43.49834 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 166.53429 -30.1732 moveto
+166.53429 -30.1732 lineto
+166.53429 -50.01569 lineto
+166.53429 -50.01569 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 131.10153 -30.1732 moveto
+131.10153 -30.1732 lineto
+166.53429 -30.1732 lineto
+166.53429 -30.1732 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 131.10153 -30.1732 moveto
+131.10153 -30.1732 lineto
+131.10153 -50.01569 lineto
+131.10153 -50.01569 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 131.10153 -50.01569 moveto
+131.10153 -50.01569 lineto
+166.53429 -50.01569 lineto
+166.53429 -50.01569 lineto stroke
+66.69096 -73.99669 moveto
+(P) cmr10 9.96265 fshow
+73.19437 -73.99669 moveto
+(erm) cmr10 9.96265 fshow
+89.54976 -73.99669 moveto
+(ute) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 166.53429 -64.18889 moveto
+166.53429 -64.18889 lineto
+166.53429 -76.99669 lineto
+166.53429 -76.99669 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -64.18889 moveto
+3.54344 -64.18889 lineto
+166.53429 -64.18889 lineto
+166.53429 -64.18889 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 3.54344 -64.18889 moveto
+3.54344 -64.18889 lineto
+3.54344 -76.99669 lineto
+3.54344 -76.99669 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -76.99669 moveto
+3.54344 -76.99669 lineto
+166.53429 -76.99669 lineto
+166.53429 -76.99669 lineto stroke
+12.68088 -104.49503 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 38.9762 -91.16989 moveto
+38.9762 -91.16989 lineto
+38.9762 -111.01237 lineto
+38.9762 -111.01237 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -91.16989 moveto
+3.54344 -91.16989 lineto
+38.9762 -91.16989 lineto
+38.9762 -91.16989 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 3.54344 -91.16989 moveto
+3.54344 -91.16989 lineto
+3.54344 -111.01237 lineto
+3.54344 -111.01237 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -111.01237 moveto
+3.54344 -111.01237 lineto
+38.9762 -111.01237 lineto
+38.9762 -111.01237 lineto stroke
+55.20024 -104.49503 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 81.49556 -91.16989 moveto
+81.49556 -91.16989 lineto
+81.49556 -111.01237 lineto
+81.49556 -111.01237 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 46.0628 -91.16989 moveto
+46.0628 -91.16989 lineto
+81.49556 -91.16989 lineto
+81.49556 -91.16989 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 46.0628 -91.16989 moveto
+46.0628 -91.16989 lineto
+46.0628 -111.01237 lineto
+46.0628 -111.01237 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 46.0628 -111.01237 moveto
+46.0628 -111.01237 lineto
+81.49556 -111.01237 lineto
+81.49556 -111.01237 lineto stroke
+97.7196 -104.49503 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 124.01492 -91.16989 moveto
+124.01492 -91.16989 lineto
+124.01492 -111.01237 lineto
+124.01492 -111.01237 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 88.58217 -91.16989 moveto
+88.58217 -91.16989 lineto
+124.01492 -91.16989 lineto
+124.01492 -91.16989 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 88.58217 -91.16989 moveto
+88.58217 -91.16989 lineto
+88.58217 -111.01237 lineto
+88.58217 -111.01237 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 88.58217 -111.01237 moveto
+88.58217 -111.01237 lineto
+124.01492 -111.01237 lineto
+124.01492 -111.01237 lineto stroke
+140.23897 -104.49503 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 166.53429 -91.16989 moveto
+166.53429 -91.16989 lineto
+166.53429 -111.01237 lineto
+166.53429 -111.01237 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 131.10153 -91.16989 moveto
+131.10153 -91.16989 lineto
+166.53429 -91.16989 lineto
+166.53429 -91.16989 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 131.10153 -91.16989 moveto
+131.10153 -91.16989 lineto
+131.10153 -111.01237 lineto
+131.10153 -111.01237 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 131.10153 -111.01237 moveto
+131.10153 -111.01237 lineto
+166.53429 -111.01237 lineto
+166.53429 -111.01237 lineto stroke
+66.69096 -134.99338 moveto
+(P) cmr10 9.96265 fshow
+73.19437 -134.99338 moveto
+(erm) cmr10 9.96265 fshow
+89.54976 -134.99338 moveto
+(ute) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 166.53429 -125.18558 moveto
+166.53429 -125.18558 lineto
+166.53429 -137.99338 lineto
+166.53429 -137.99338 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -125.18558 moveto
+3.54344 -125.18558 lineto
+166.53429 -125.18558 lineto
+166.53429 -125.18558 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 3.54344 -125.18558 moveto
+3.54344 -125.18558 lineto
+3.54344 -137.99338 lineto
+3.54344 -137.99338 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -137.99338 moveto
+3.54344 -137.99338 lineto
+166.53429 -137.99338 lineto
+166.53429 -137.99338 lineto stroke
+12.68088 -236.35773 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 38.9762 -223.0326 moveto
+38.9762 -223.0326 lineto
+38.9762 -242.87508 lineto
+38.9762 -242.87508 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -223.0326 moveto
+3.54344 -223.0326 lineto
+38.9762 -223.0326 lineto
+38.9762 -223.0326 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 3.54344 -223.0326 moveto
+3.54344 -223.0326 lineto
+3.54344 -242.87508 lineto
+3.54344 -242.87508 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -242.87508 moveto
+3.54344 -242.87508 lineto
+38.9762 -242.87508 lineto
+38.9762 -242.87508 lineto stroke
+55.20024 -236.35773 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 81.49556 -223.0326 moveto
+81.49556 -223.0326 lineto
+81.49556 -242.87508 lineto
+81.49556 -242.87508 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 46.0628 -223.0326 moveto
+46.0628 -223.0326 lineto
+81.49556 -223.0326 lineto
+81.49556 -223.0326 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 46.0628 -223.0326 moveto
+46.0628 -223.0326 lineto
+46.0628 -242.87508 lineto
+46.0628 -242.87508 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 46.0628 -242.87508 moveto
+46.0628 -242.87508 lineto
+81.49556 -242.87508 lineto
+81.49556 -242.87508 lineto stroke
+97.7196 -236.35773 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 124.01492 -223.0326 moveto
+124.01492 -223.0326 lineto
+124.01492 -242.87508 lineto
+124.01492 -242.87508 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 88.58217 -223.0326 moveto
+88.58217 -223.0326 lineto
+124.01492 -223.0326 lineto
+124.01492 -223.0326 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 88.58217 -223.0326 moveto
+88.58217 -223.0326 lineto
+88.58217 -242.87508 lineto
+88.58217 -242.87508 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 88.58217 -242.87508 moveto
+88.58217 -242.87508 lineto
+124.01492 -242.87508 lineto
+124.01492 -242.87508 lineto stroke
+140.23897 -236.35773 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 166.53429 -223.0326 moveto
+166.53429 -223.0326 lineto
+166.53429 -242.87508 lineto
+166.53429 -242.87508 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 131.10153 -223.0326 moveto
+131.10153 -223.0326 lineto
+166.53429 -223.0326 lineto
+166.53429 -223.0326 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 131.10153 -223.0326 moveto
+131.10153 -223.0326 lineto
+131.10153 -242.87508 lineto
+131.10153 -242.87508 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 131.10153 -242.87508 moveto
+131.10153 -242.87508 lineto
+166.53429 -242.87508 lineto
+166.53429 -242.87508 lineto stroke
+66.69096 -266.85608 moveto
+(P) cmr10 9.96265 fshow
+73.19437 -266.85608 moveto
+(erm) cmr10 9.96265 fshow
+89.54976 -266.85608 moveto
+(ute) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 166.53429 -257.04828 moveto
+166.53429 -257.04828 lineto
+166.53429 -269.85606 lineto
+166.53429 -269.85606 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -257.04828 moveto
+3.54344 -257.04828 lineto
+166.53429 -257.04828 lineto
+166.53429 -257.04828 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 3.54344 -257.04828 moveto
+3.54344 -257.04828 lineto
+3.54344 -269.85606 lineto
+3.54344 -269.85606 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -269.85606 moveto
+3.54344 -269.85606 lineto
+166.53429 -269.85606 lineto
+166.53429 -269.85606 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 170.07843 -284.02927 moveto
+170.07843 -284.02927 lineto
+170.07843 -300.02927 lineto
+170.07843 -300.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 0 -284.02927 moveto
+0 -284.02927 lineto
+170.07843 -284.02927 lineto
+170.07843 -284.02927 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 -284.02927 moveto
+0 -284.02927 lineto
+0 -300.02927 lineto
+0 -300.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 0 -300.02927 moveto
+0 -300.02927 lineto
+170.07843 -300.02927 lineto
+170.07843 -300.02927 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 89.03922 -288.02927 moveto
+89.03922 -288.02927 lineto
+89.03922 -296.02927 lineto
+89.03922 -296.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 81.03922 -288.02927 moveto
+81.03922 -288.02927 lineto
+89.03922 -288.02927 lineto
+89.03922 -288.02927 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 81.03922 -288.02927 moveto
+81.03922 -288.02927 lineto
+81.03922 -296.02927 lineto
+81.03922 -296.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 81.03922 -296.02927 moveto
+81.03922 -296.02927 lineto
+89.03922 -296.02927 lineto
+89.03922 -296.02927 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 85.03922 -288.02927 moveto
+85.03922 -296.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 89.03922 -292.02927 moveto
+81.03922 -292.02927 lineto stroke
+newpath -14.1732 -292.02927 moveto
+0 -292.02927 lineto stroke
+newpath -3.69554 -293.56004 moveto
+0 -292.02927 lineto
+-3.69554 -290.49849 lineto
+ closepath
+gsave fill grestore stroke
+-56.7471 -294.51993 moveto
+(Subk) cmr10 9.96265 fshow
+-35.1613 -294.51993 moveto
+(ey) cmr10 9.96265 fshow
+-22.1545 -294.51993 moveto
+(1) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 14.1732 moveto
+10.6299 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09912 3.69554 moveto
+10.6299 0 lineto
+12.16068 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 -16 moveto
+10.6299 -30.1732 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09912 -26.47766 moveto
+10.6299 -30.1732 lineto
+12.16068 -26.47766 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 -50.01569 moveto
+10.6299 -64.18889 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09912 -60.49335 moveto
+10.6299 -64.18889 lineto
+12.16068 -60.49335 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 -76.99669 moveto
+10.6299 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09912 -87.47435 moveto
+10.6299 -91.16989 lineto
+12.16068 -87.47435 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 -111.01237 moveto
+10.6299 -125.18558 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09912 -121.49004 moveto
+10.6299 -125.18558 lineto
+12.16068 -121.49004 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 -137.99338 moveto
+10.6299 -152.16658 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09912 -148.47104 moveto
+10.6299 -152.16658 lineto
+12.16068 -148.47104 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 -208.85939 moveto
+10.6299 -223.0326 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09912 -219.33705 moveto
+10.6299 -223.0326 lineto
+12.16068 -219.33705 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 -242.87508 moveto
+10.6299 -257.04828 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09912 -253.35274 moveto
+10.6299 -257.04828 lineto
+12.16068 -253.35274 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 -269.85606 moveto
+10.6299 -284.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09912 -280.33372 moveto
+10.6299 -284.02927 lineto
+12.16068 -280.33372 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 -300.02927 moveto
+10.6299 -314.20247 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09912 -310.50693 moveto
+10.6299 -314.20247 lineto
+12.16068 -310.50693 lineto
+ closepath
+gsave fill grestore stroke
+newpath 11.62616 -166.33978 moveto
+11.62616 -166.07556 11.5212 -165.82214 11.33437 -165.63531 curveto
+11.14754 -165.44849 10.89412 -165.34352 10.6299 -165.34352 curveto
+10.36568 -165.34352 10.11226 -165.44849 9.92543 -165.63531 curveto
+9.7386 -165.82214 9.63364 -166.07556 9.63364 -166.33978 curveto
+9.63364 -166.604 9.7386 -166.85742 9.92543 -167.04425 curveto
+10.11226 -167.23108 10.36568 -167.33604 10.6299 -167.33604 curveto
+10.89412 -167.33604 11.14754 -167.23108 11.33437 -167.04425 curveto
+11.5212 -166.85742 11.62616 -166.604 11.62616 -166.33978 curveto closepath fill
+newpath 11.62616 -180.51299 moveto
+11.62616 -180.24876 11.5212 -179.99535 11.33437 -179.80852 curveto
+11.14754 -179.62169 10.89412 -179.51672 10.6299 -179.51672 curveto
+10.36568 -179.51672 10.11226 -179.62169 9.92543 -179.80852 curveto
+9.7386 -179.99535 9.63364 -180.24876 9.63364 -180.51299 curveto
+9.63364 -180.7772 9.7386 -181.03062 9.92543 -181.21745 curveto
+10.11226 -181.40428 10.36568 -181.50925 10.6299 -181.50925 curveto
+10.89412 -181.50925 11.14754 -181.40428 11.33437 -181.21745 curveto
+11.5212 -181.03062 11.62616 -180.7772 11.62616 -180.51299 curveto closepath
+ fill
+newpath 11.62616 -194.68619 moveto
+11.62616 -194.42197 11.5212 -194.16855 11.33437 -193.98172 curveto
+11.14754 -193.79489 10.89412 -193.68993 10.6299 -193.68993 curveto
+10.36568 -193.68993 10.11226 -193.79489 9.92543 -193.98172 curveto
+9.7386 -194.16855 9.63364 -194.42197 9.63364 -194.68619 curveto
+9.63364 -194.95041 9.7386 -195.20383 9.92543 -195.39066 curveto
+10.11226 -195.57748 10.36568 -195.68245 10.6299 -195.68245 curveto
+10.89412 -195.68245 11.14754 -195.57748 11.33437 -195.39066 curveto
+11.5212 -195.20383 11.62616 -194.95041 11.62616 -194.68619 curveto closepath
+ fill
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 14.1732 moveto
+31.88971 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35893 3.69554 moveto
+31.88971 0 lineto
+33.42049 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 -16 moveto
+31.88971 -30.1732 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35893 -26.47766 moveto
+31.88971 -30.1732 lineto
+33.42049 -26.47766 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 -50.01569 moveto
+31.88971 -64.18889 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35893 -60.49335 moveto
+31.88971 -64.18889 lineto
+33.42049 -60.49335 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 -76.99669 moveto
+31.88971 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35893 -87.47435 moveto
+31.88971 -91.16989 lineto
+33.42049 -87.47435 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 -111.01237 moveto
+31.88971 -125.18558 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35893 -121.49004 moveto
+31.88971 -125.18558 lineto
+33.42049 -121.49004 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 -137.99338 moveto
+31.88971 -152.16658 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35893 -148.47104 moveto
+31.88971 -152.16658 lineto
+33.42049 -148.47104 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 -208.85939 moveto
+31.88971 -223.0326 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35893 -219.33705 moveto
+31.88971 -223.0326 lineto
+33.42049 -219.33705 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 -242.87508 moveto
+31.88971 -257.04828 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35893 -253.35274 moveto
+31.88971 -257.04828 lineto
+33.42049 -253.35274 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 -269.85606 moveto
+31.88971 -284.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35893 -280.33372 moveto
+31.88971 -284.02927 lineto
+33.42049 -280.33372 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 -300.02927 moveto
+31.88971 -314.20247 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35893 -310.50693 moveto
+31.88971 -314.20247 lineto
+33.42049 -310.50693 lineto
+ closepath
+gsave fill grestore stroke
+newpath 32.88597 -166.33978 moveto
+32.88597 -166.07556 32.781 -165.82214 32.59418 -165.63531 curveto
+32.40735 -165.44849 32.15393 -165.34352 31.88971 -165.34352 curveto
+31.62549 -165.34352 31.37207 -165.44849 31.18524 -165.63531 curveto
+30.99841 -165.82214 30.89345 -166.07556 30.89345 -166.33978 curveto
+30.89345 -166.604 30.99841 -166.85742 31.18524 -167.04425 curveto
+31.37207 -167.23108 31.62549 -167.33604 31.88971 -167.33604 curveto
+32.15393 -167.33604 32.40735 -167.23108 32.59418 -167.04425 curveto
+32.781 -166.85742 32.88597 -166.604 32.88597 -166.33978 curveto closepath fill
+newpath 32.88597 -180.51299 moveto
+32.88597 -180.24876 32.781 -179.99535 32.59418 -179.80852 curveto
+32.40735 -179.62169 32.15393 -179.51672 31.88971 -179.51672 curveto
+31.62549 -179.51672 31.37207 -179.62169 31.18524 -179.80852 curveto
+30.99841 -179.99535 30.89345 -180.24876 30.89345 -180.51299 curveto
+30.89345 -180.7772 30.99841 -181.03062 31.18524 -181.21745 curveto
+31.37207 -181.40428 31.62549 -181.50925 31.88971 -181.50925 curveto
+32.15393 -181.50925 32.40735 -181.40428 32.59418 -181.21745 curveto
+32.781 -181.03062 32.88597 -180.7772 32.88597 -180.51299 curveto closepath fill
+newpath 32.88597 -194.68619 moveto
+32.88597 -194.42197 32.781 -194.16855 32.59418 -193.98172 curveto
+32.40735 -193.79489 32.15393 -193.68993 31.88971 -193.68993 curveto
+31.62549 -193.68993 31.37207 -193.79489 31.18524 -193.98172 curveto
+30.99841 -194.16855 30.89345 -194.42197 30.89345 -194.68619 curveto
+30.89345 -194.95041 30.99841 -195.20383 31.18524 -195.39066 curveto
+31.37207 -195.57748 31.62549 -195.68245 31.88971 -195.68245 curveto
+32.15393 -195.68245 32.40735 -195.57748 32.59418 -195.39066 curveto
+32.781 -195.20383 32.88597 -194.95041 32.88597 -194.68619 curveto closepath
+ fill
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 14.1732 moveto
+53.1495 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61873 3.69554 moveto
+53.1495 0 lineto
+54.68028 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 -16 moveto
+53.1495 -30.1732 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61873 -26.47766 moveto
+53.1495 -30.1732 lineto
+54.68028 -26.47766 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 -50.01569 moveto
+53.1495 -64.18889 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61873 -60.49335 moveto
+53.1495 -64.18889 lineto
+54.68028 -60.49335 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 -76.99669 moveto
+53.1495 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61873 -87.47435 moveto
+53.1495 -91.16989 lineto
+54.68028 -87.47435 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 -111.01237 moveto
+53.1495 -125.18558 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61873 -121.49004 moveto
+53.1495 -125.18558 lineto
+54.68028 -121.49004 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 -137.99338 moveto
+53.1495 -152.16658 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61873 -148.47104 moveto
+53.1495 -152.16658 lineto
+54.68028 -148.47104 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 -208.85939 moveto
+53.1495 -223.0326 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61873 -219.33705 moveto
+53.1495 -223.0326 lineto
+54.68028 -219.33705 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 -242.87508 moveto
+53.1495 -257.04828 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61873 -253.35274 moveto
+53.1495 -257.04828 lineto
+54.68028 -253.35274 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 -269.85606 moveto
+53.1495 -284.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61873 -280.33372 moveto
+53.1495 -284.02927 lineto
+54.68028 -280.33372 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 -300.02927 moveto
+53.1495 -314.20247 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61873 -310.50693 moveto
+53.1495 -314.20247 lineto
+54.68028 -310.50693 lineto
+ closepath
+gsave fill grestore stroke
+newpath 54.14577 -166.33978 moveto
+54.14577 -166.07556 54.0408 -165.82214 53.85397 -165.63531 curveto
+53.66714 -165.44849 53.41373 -165.34352 53.1495 -165.34352 curveto
+52.88528 -165.34352 52.63187 -165.44849 52.44504 -165.63531 curveto
+52.25821 -165.82214 52.15324 -166.07556 52.15324 -166.33978 curveto
+52.15324 -166.604 52.25821 -166.85742 52.44504 -167.04425 curveto
+52.63187 -167.23108 52.88528 -167.33604 53.1495 -167.33604 curveto
+53.41373 -167.33604 53.66714 -167.23108 53.85397 -167.04425 curveto
+54.0408 -166.85742 54.14577 -166.604 54.14577 -166.33978 curveto closepath fill
+newpath 54.14577 -180.51299 moveto
+54.14577 -180.24876 54.0408 -179.99535 53.85397 -179.80852 curveto
+53.66714 -179.62169 53.41373 -179.51672 53.1495 -179.51672 curveto
+52.88528 -179.51672 52.63187 -179.62169 52.44504 -179.80852 curveto
+52.25821 -179.99535 52.15324 -180.24876 52.15324 -180.51299 curveto
+52.15324 -180.7772 52.25821 -181.03062 52.44504 -181.21745 curveto
+52.63187 -181.40428 52.88528 -181.50925 53.1495 -181.50925 curveto
+53.41373 -181.50925 53.66714 -181.40428 53.85397 -181.21745 curveto
+54.0408 -181.03062 54.14577 -180.7772 54.14577 -180.51299 curveto closepath
+ fill
+newpath 54.14577 -194.68619 moveto
+54.14577 -194.42197 54.0408 -194.16855 53.85397 -193.98172 curveto
+53.66714 -193.79489 53.41373 -193.68993 53.1495 -193.68993 curveto
+52.88528 -193.68993 52.63187 -193.79489 52.44504 -193.98172 curveto
+52.25821 -194.16855 52.15324 -194.42197 52.15324 -194.68619 curveto
+52.15324 -194.95041 52.25821 -195.20383 52.44504 -195.39066 curveto
+52.63187 -195.57748 52.88528 -195.68245 53.1495 -195.68245 curveto
+53.41373 -195.68245 53.66714 -195.57748 53.85397 -195.39066 curveto
+54.0408 -195.20383 54.14577 -194.95041 54.14577 -194.68619 curveto closepath
+ fill
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 14.1732 moveto
+74.40932 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87854 3.69554 moveto
+74.40932 0 lineto
+75.9401 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 -16 moveto
+74.40932 -30.1732 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87854 -26.47766 moveto
+74.40932 -30.1732 lineto
+75.9401 -26.47766 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 -50.01569 moveto
+74.40932 -64.18889 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87854 -60.49335 moveto
+74.40932 -64.18889 lineto
+75.9401 -60.49335 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 -76.99669 moveto
+74.40932 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87854 -87.47435 moveto
+74.40932 -91.16989 lineto
+75.9401 -87.47435 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 -111.01237 moveto
+74.40932 -125.18558 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87854 -121.49004 moveto
+74.40932 -125.18558 lineto
+75.9401 -121.49004 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 -137.99338 moveto
+74.40932 -152.16658 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87854 -148.47104 moveto
+74.40932 -152.16658 lineto
+75.9401 -148.47104 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 -208.85939 moveto
+74.40932 -223.0326 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87854 -219.33705 moveto
+74.40932 -223.0326 lineto
+75.9401 -219.33705 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 -242.87508 moveto
+74.40932 -257.04828 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87854 -253.35274 moveto
+74.40932 -257.04828 lineto
+75.9401 -253.35274 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 -269.85606 moveto
+74.40932 -284.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87854 -280.33372 moveto
+74.40932 -284.02927 lineto
+75.9401 -280.33372 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 -300.02927 moveto
+74.40932 -314.20247 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87854 -310.50693 moveto
+74.40932 -314.20247 lineto
+75.9401 -310.50693 lineto
+ closepath
+gsave fill grestore stroke
+newpath 75.40558 -166.33978 moveto
+75.40558 -166.07556 75.30061 -165.82214 75.11378 -165.63531 curveto
+74.92696 -165.44849 74.67354 -165.34352 74.40932 -165.34352 curveto
+74.1451 -165.34352 73.89168 -165.44849 73.70485 -165.63531 curveto
+73.51802 -165.82214 73.41306 -166.07556 73.41306 -166.33978 curveto
+73.41306 -166.604 73.51802 -166.85742 73.70485 -167.04425 curveto
+73.89168 -167.23108 74.1451 -167.33604 74.40932 -167.33604 curveto
+74.67354 -167.33604 74.92696 -167.23108 75.11378 -167.04425 curveto
+75.30061 -166.85742 75.40558 -166.604 75.40558 -166.33978 curveto closepath
+ fill
+newpath 75.40558 -180.51299 moveto
+75.40558 -180.24876 75.30061 -179.99535 75.11378 -179.80852 curveto
+74.92696 -179.62169 74.67354 -179.51672 74.40932 -179.51672 curveto
+74.1451 -179.51672 73.89168 -179.62169 73.70485 -179.80852 curveto
+73.51802 -179.99535 73.41306 -180.24876 73.41306 -180.51299 curveto
+73.41306 -180.7772 73.51802 -181.03062 73.70485 -181.21745 curveto
+73.89168 -181.40428 74.1451 -181.50925 74.40932 -181.50925 curveto
+74.67354 -181.50925 74.92696 -181.40428 75.11378 -181.21745 curveto
+75.30061 -181.03062 75.40558 -180.7772 75.40558 -180.51299 curveto closepath
+ fill
+newpath 75.40558 -194.68619 moveto
+75.40558 -194.42197 75.30061 -194.16855 75.11378 -193.98172 curveto
+74.92696 -193.79489 74.67354 -193.68993 74.40932 -193.68993 curveto
+74.1451 -193.68993 73.89168 -193.79489 73.70485 -193.98172 curveto
+73.51802 -194.16855 73.41306 -194.42197 73.41306 -194.68619 curveto
+73.41306 -194.95041 73.51802 -195.20383 73.70485 -195.39066 curveto
+73.89168 -195.57748 74.1451 -195.68245 74.40932 -195.68245 curveto
+74.67354 -195.68245 74.92696 -195.57748 75.11378 -195.39066 curveto
+75.30061 -195.20383 75.40558 -194.95041 75.40558 -194.68619 curveto closepath
+ fill
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 14.1732 moveto
+95.66911 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13834 3.69554 moveto
+95.66911 0 lineto
+97.19989 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 -16 moveto
+95.66911 -30.1732 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13834 -26.47766 moveto
+95.66911 -30.1732 lineto
+97.19989 -26.47766 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 -50.01569 moveto
+95.66911 -64.18889 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13834 -60.49335 moveto
+95.66911 -64.18889 lineto
+97.19989 -60.49335 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 -76.99669 moveto
+95.66911 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13834 -87.47435 moveto
+95.66911 -91.16989 lineto
+97.19989 -87.47435 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 -111.01237 moveto
+95.66911 -125.18558 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13834 -121.49004 moveto
+95.66911 -125.18558 lineto
+97.19989 -121.49004 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 -137.99338 moveto
+95.66911 -152.16658 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13834 -148.47104 moveto
+95.66911 -152.16658 lineto
+97.19989 -148.47104 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 -208.85939 moveto
+95.66911 -223.0326 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13834 -219.33705 moveto
+95.66911 -223.0326 lineto
+97.19989 -219.33705 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 -242.87508 moveto
+95.66911 -257.04828 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13834 -253.35274 moveto
+95.66911 -257.04828 lineto
+97.19989 -253.35274 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 -269.85606 moveto
+95.66911 -284.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13834 -280.33372 moveto
+95.66911 -284.02927 lineto
+97.19989 -280.33372 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 -300.02927 moveto
+95.66911 -314.20247 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13834 -310.50693 moveto
+95.66911 -314.20247 lineto
+97.19989 -310.50693 lineto
+ closepath
+gsave fill grestore stroke
+newpath 96.66537 -166.33978 moveto
+96.66537 -166.07556 96.56041 -165.82214 96.37358 -165.63531 curveto
+96.18675 -165.44849 95.93333 -165.34352 95.66911 -165.34352 curveto
+95.40489 -165.34352 95.15147 -165.44849 94.96465 -165.63531 curveto
+94.77782 -165.82214 94.67285 -166.07556 94.67285 -166.33978 curveto
+94.67285 -166.604 94.77782 -166.85742 94.96465 -167.04425 curveto
+95.15147 -167.23108 95.40489 -167.33604 95.66911 -167.33604 curveto
+95.93333 -167.33604 96.18675 -167.23108 96.37358 -167.04425 curveto
+96.56041 -166.85742 96.66537 -166.604 96.66537 -166.33978 curveto closepath
+ fill
+newpath 96.66537 -180.51299 moveto
+96.66537 -180.24876 96.56041 -179.99535 96.37358 -179.80852 curveto
+96.18675 -179.62169 95.93333 -179.51672 95.66911 -179.51672 curveto
+95.40489 -179.51672 95.15147 -179.62169 94.96465 -179.80852 curveto
+94.77782 -179.99535 94.67285 -180.24876 94.67285 -180.51299 curveto
+94.67285 -180.7772 94.77782 -181.03062 94.96465 -181.21745 curveto
+95.15147 -181.40428 95.40489 -181.50925 95.66911 -181.50925 curveto
+95.93333 -181.50925 96.18675 -181.40428 96.37358 -181.21745 curveto
+96.56041 -181.03062 96.66537 -180.7772 96.66537 -180.51299 curveto closepath
+ fill
+newpath 96.66537 -194.68619 moveto
+96.66537 -194.42197 96.56041 -194.16855 96.37358 -193.98172 curveto
+96.18675 -193.79489 95.93333 -193.68993 95.66911 -193.68993 curveto
+95.40489 -193.68993 95.15147 -193.79489 94.96465 -193.98172 curveto
+94.77782 -194.16855 94.67285 -194.42197 94.67285 -194.68619 curveto
+94.67285 -194.95041 94.77782 -195.20383 94.96465 -195.39066 curveto
+95.15147 -195.57748 95.40489 -195.68245 95.66911 -195.68245 curveto
+95.93333 -195.68245 96.18675 -195.57748 96.37358 -195.39066 curveto
+96.56041 -195.20383 96.66537 -194.95041 96.66537 -194.68619 curveto closepath
+ fill
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 14.1732 moveto
+116.92892 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39815 3.69554 moveto
+116.92892 0 lineto
+118.4597 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 -16 moveto
+116.92892 -30.1732 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39815 -26.47766 moveto
+116.92892 -30.1732 lineto
+118.4597 -26.47766 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 -50.01569 moveto
+116.92892 -64.18889 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39815 -60.49335 moveto
+116.92892 -64.18889 lineto
+118.4597 -60.49335 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 -76.99669 moveto
+116.92892 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39815 -87.47435 moveto
+116.92892 -91.16989 lineto
+118.4597 -87.47435 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 -111.01237 moveto
+116.92892 -125.18558 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39815 -121.49004 moveto
+116.92892 -125.18558 lineto
+118.4597 -121.49004 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 -137.99338 moveto
+116.92892 -152.16658 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39815 -148.47104 moveto
+116.92892 -152.16658 lineto
+118.4597 -148.47104 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 -208.85939 moveto
+116.92892 -223.0326 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39815 -219.33705 moveto
+116.92892 -223.0326 lineto
+118.4597 -219.33705 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 -242.87508 moveto
+116.92892 -257.04828 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39815 -253.35274 moveto
+116.92892 -257.04828 lineto
+118.4597 -253.35274 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 -269.85606 moveto
+116.92892 -284.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39815 -280.33372 moveto
+116.92892 -284.02927 lineto
+118.4597 -280.33372 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 -300.02927 moveto
+116.92892 -314.20247 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39815 -310.50693 moveto
+116.92892 -314.20247 lineto
+118.4597 -310.50693 lineto
+ closepath
+gsave fill grestore stroke
+newpath 117.92519 -166.33978 moveto
+117.92519 -166.07556 117.82022 -165.82214 117.63339 -165.63531 curveto
+117.44656 -165.44849 117.19315 -165.34352 116.92892 -165.34352 curveto
+116.6647 -165.34352 116.41129 -165.44849 116.22446 -165.63531 curveto
+116.03763 -165.82214 115.93266 -166.07556 115.93266 -166.33978 curveto
+115.93266 -166.604 116.03763 -166.85742 116.22446 -167.04425 curveto
+116.41129 -167.23108 116.6647 -167.33604 116.92892 -167.33604 curveto
+117.19315 -167.33604 117.44656 -167.23108 117.63339 -167.04425 curveto
+117.82022 -166.85742 117.92519 -166.604 117.92519 -166.33978 curveto closepath
+ fill
+newpath 117.92519 -180.51299 moveto
+117.92519 -180.24876 117.82022 -179.99535 117.63339 -179.80852 curveto
+117.44656 -179.62169 117.19315 -179.51672 116.92892 -179.51672 curveto
+116.6647 -179.51672 116.41129 -179.62169 116.22446 -179.80852 curveto
+116.03763 -179.99535 115.93266 -180.24876 115.93266 -180.51299 curveto
+115.93266 -180.7772 116.03763 -181.03062 116.22446 -181.21745 curveto
+116.41129 -181.40428 116.6647 -181.50925 116.92892 -181.50925 curveto
+117.19315 -181.50925 117.44656 -181.40428 117.63339 -181.21745 curveto
+117.82022 -181.03062 117.92519 -180.7772 117.92519 -180.51299 curveto closepath
+ fill
+newpath 117.92519 -194.68619 moveto
+117.92519 -194.42197 117.82022 -194.16855 117.63339 -193.98172 curveto
+117.44656 -193.79489 117.19315 -193.68993 116.92892 -193.68993 curveto
+116.6647 -193.68993 116.41129 -193.79489 116.22446 -193.98172 curveto
+116.03763 -194.16855 115.93266 -194.42197 115.93266 -194.68619 curveto
+115.93266 -194.95041 116.03763 -195.20383 116.22446 -195.39066 curveto
+116.41129 -195.57748 116.6647 -195.68245 116.92892 -195.68245 curveto
+117.19315 -195.68245 117.44656 -195.57748 117.63339 -195.39066 curveto
+117.82022 -195.20383 117.92519 -194.95041 117.92519 -194.68619 curveto
+ closepath fill
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 14.1732 moveto
+138.18872 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65794 3.69554 moveto
+138.18872 0 lineto
+139.7195 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 -16 moveto
+138.18872 -30.1732 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65794 -26.47766 moveto
+138.18872 -30.1732 lineto
+139.7195 -26.47766 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 -50.01569 moveto
+138.18872 -64.18889 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65794 -60.49335 moveto
+138.18872 -64.18889 lineto
+139.7195 -60.49335 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 -76.99669 moveto
+138.18872 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65794 -87.47435 moveto
+138.18872 -91.16989 lineto
+139.7195 -87.47435 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 -111.01237 moveto
+138.18872 -125.18558 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65794 -121.49004 moveto
+138.18872 -125.18558 lineto
+139.7195 -121.49004 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 -137.99338 moveto
+138.18872 -152.16658 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65794 -148.47104 moveto
+138.18872 -152.16658 lineto
+139.7195 -148.47104 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 -208.85939 moveto
+138.18872 -223.0326 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65794 -219.33705 moveto
+138.18872 -223.0326 lineto
+139.7195 -219.33705 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 -242.87508 moveto
+138.18872 -257.04828 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65794 -253.35274 moveto
+138.18872 -257.04828 lineto
+139.7195 -253.35274 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 -269.85606 moveto
+138.18872 -284.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65794 -280.33372 moveto
+138.18872 -284.02927 lineto
+139.7195 -280.33372 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 -300.02927 moveto
+138.18872 -314.20247 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65794 -310.50693 moveto
+138.18872 -314.20247 lineto
+139.7195 -310.50693 lineto
+ closepath
+gsave fill grestore stroke
+newpath 139.18498 -166.33978 moveto
+139.18498 -166.07556 139.08002 -165.82214 138.89319 -165.63531 curveto
+138.70636 -165.44849 138.45294 -165.34352 138.18872 -165.34352 curveto
+137.9245 -165.34352 137.67108 -165.44849 137.48425 -165.63531 curveto
+137.29742 -165.82214 137.19246 -166.07556 137.19246 -166.33978 curveto
+137.19246 -166.604 137.29742 -166.85742 137.48425 -167.04425 curveto
+137.67108 -167.23108 137.9245 -167.33604 138.18872 -167.33604 curveto
+138.45294 -167.33604 138.70636 -167.23108 138.89319 -167.04425 curveto
+139.08002 -166.85742 139.18498 -166.604 139.18498 -166.33978 curveto closepath
+ fill
+newpath 139.18498 -180.51299 moveto
+139.18498 -180.24876 139.08002 -179.99535 138.89319 -179.80852 curveto
+138.70636 -179.62169 138.45294 -179.51672 138.18872 -179.51672 curveto
+137.9245 -179.51672 137.67108 -179.62169 137.48425 -179.80852 curveto
+137.29742 -179.99535 137.19246 -180.24876 137.19246 -180.51299 curveto
+137.19246 -180.7772 137.29742 -181.03062 137.48425 -181.21745 curveto
+137.67108 -181.40428 137.9245 -181.50925 138.18872 -181.50925 curveto
+138.45294 -181.50925 138.70636 -181.40428 138.89319 -181.21745 curveto
+139.08002 -181.03062 139.18498 -180.7772 139.18498 -180.51299 curveto closepath
+ fill
+newpath 139.18498 -194.68619 moveto
+139.18498 -194.42197 139.08002 -194.16855 138.89319 -193.98172 curveto
+138.70636 -193.79489 138.45294 -193.68993 138.18872 -193.68993 curveto
+137.9245 -193.68993 137.67108 -193.79489 137.48425 -193.98172 curveto
+137.29742 -194.16855 137.19246 -194.42197 137.19246 -194.68619 curveto
+137.19246 -194.95041 137.29742 -195.20383 137.48425 -195.39066 curveto
+137.67108 -195.57748 137.9245 -195.68245 138.18872 -195.68245 curveto
+138.45294 -195.68245 138.70636 -195.57748 138.89319 -195.39066 curveto
+139.08002 -195.20383 139.18498 -194.95041 139.18498 -194.68619 curveto
+ closepath fill
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 14.1732 moveto
+159.44853 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91776 3.69554 moveto
+159.44853 0 lineto
+160.97931 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 -16 moveto
+159.44853 -30.1732 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91776 -26.47766 moveto
+159.44853 -30.1732 lineto
+160.97931 -26.47766 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 -50.01569 moveto
+159.44853 -64.18889 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91776 -60.49335 moveto
+159.44853 -64.18889 lineto
+160.97931 -60.49335 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 -76.99669 moveto
+159.44853 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91776 -87.47435 moveto
+159.44853 -91.16989 lineto
+160.97931 -87.47435 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 -111.01237 moveto
+159.44853 -125.18558 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91776 -121.49004 moveto
+159.44853 -125.18558 lineto
+160.97931 -121.49004 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 -137.99338 moveto
+159.44853 -152.16658 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91776 -148.47104 moveto
+159.44853 -152.16658 lineto
+160.97931 -148.47104 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 -208.85939 moveto
+159.44853 -223.0326 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91776 -219.33705 moveto
+159.44853 -223.0326 lineto
+160.97931 -219.33705 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 -242.87508 moveto
+159.44853 -257.04828 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91776 -253.35274 moveto
+159.44853 -257.04828 lineto
+160.97931 -253.35274 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 -269.85606 moveto
+159.44853 -284.02927 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91776 -280.33372 moveto
+159.44853 -284.02927 lineto
+160.97931 -280.33372 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 -300.02927 moveto
+159.44853 -314.20247 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91776 -310.50693 moveto
+159.44853 -314.20247 lineto
+160.97931 -310.50693 lineto
+ closepath
+gsave fill grestore stroke
+newpath 160.4448 -166.33978 moveto
+160.4448 -166.07556 160.33983 -165.82214 160.153 -165.63531 curveto
+159.96617 -165.44849 159.71275 -165.34352 159.44853 -165.34352 curveto
+159.18431 -165.34352 158.9309 -165.44849 158.74406 -165.63531 curveto
+158.55724 -165.82214 158.45227 -166.07556 158.45227 -166.33978 curveto
+158.45227 -166.604 158.55724 -166.85742 158.74406 -167.04425 curveto
+158.9309 -167.23108 159.18431 -167.33604 159.44853 -167.33604 curveto
+159.71275 -167.33604 159.96617 -167.23108 160.153 -167.04425 curveto
+160.33983 -166.85742 160.4448 -166.604 160.4448 -166.33978 curveto closepath
+ fill
+newpath 160.4448 -180.51299 moveto
+160.4448 -180.24876 160.33983 -179.99535 160.153 -179.80852 curveto
+159.96617 -179.62169 159.71275 -179.51672 159.44853 -179.51672 curveto
+159.18431 -179.51672 158.9309 -179.62169 158.74406 -179.80852 curveto
+158.55724 -179.99535 158.45227 -180.24876 158.45227 -180.51299 curveto
+158.45227 -180.7772 158.55724 -181.03062 158.74406 -181.21745 curveto
+158.9309 -181.40428 159.18431 -181.50925 159.44853 -181.50925 curveto
+159.71275 -181.50925 159.96617 -181.40428 160.153 -181.21745 curveto
+160.33983 -181.03062 160.4448 -180.7772 160.4448 -180.51299 curveto closepath
+ fill
+newpath 160.4448 -194.68619 moveto
+160.4448 -194.42197 160.33983 -194.16855 160.153 -193.98172 curveto
+159.96617 -193.79489 159.71275 -193.68993 159.44853 -193.68993 curveto
+159.18431 -193.68993 158.9309 -193.79489 158.74406 -193.98172 curveto
+158.55724 -194.16855 158.45227 -194.42197 158.45227 -194.68619 curveto
+158.45227 -194.95041 158.55724 -195.20383 158.74406 -195.39066 curveto
+158.9309 -195.57748 159.18431 -195.68245 159.44853 -195.68245 curveto
+159.71275 -195.68245 159.96617 -195.57748 160.153 -195.39066 curveto
+160.33983 -195.20383 160.4448 -194.95041 160.4448 -194.68619 curveto closepath
+ fill
+65.04471 22.84248 moveto
+(Plain) cmr10 9.96265 fshow
+87.5991 22.84248 moveto
+(text) cmr10 9.96265 fshow
+169.53429 -183.97226 moveto
+(5) cmr10 9.96265 fshow
+177.83649 -183.97226 moveto
+(more) cmr10 9.96265 fshow
+202.77078 -183.97226 moveto
+(rounds) cmr10 9.96265 fshow
+showpage
+%%EOF
diff --git a/Supporting_Documentation/tex/skein-31.mps b/Supporting_Documentation/tex/skein-31.mps
new file mode 100644
index 0000000000000..be892a8f4ae45
--- /dev/null
+++ b/Supporting_Documentation/tex/skein-31.mps
@@ -0,0 +1,161 @@
+%!PS
+%%BoundingBox: -46 -68 85 19 
+%%HiResBoundingBox: -45.20761 -67.94917 84.85632 18.1732 
+%%Creator: MetaPost 1.000
+%%CreationDate: 2008.10.19:0139
+%%Pages: 1
+%*Font: cmmi10 9.96265 9.96265 3c:800002
+%*Font: cmmi7 6.97385 6.97385 3b:80000000000201
+%%BeginProlog
+%%EndProlog
+%%Page: 1 1
+ 0 0 0 setrgbcolor
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth [] 0 setdash
+ 1 setlinecap 1 setlinejoin 10 setmiterlimit
+newpath 4 4 moveto
+4 4 lineto
+4 -4 lineto
+4 -4 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -4 4 moveto
+-4 4 lineto
+4 4 lineto
+4 4 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath -4 4 moveto
+-4 4 lineto
+-4 -4 lineto
+-4 -4 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -4 -4 moveto
+-4 -4 lineto
+4 -4 lineto
+4 -4 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 4 moveto
+0 -4 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 4 0 moveto
+-4 0 lineto stroke
+31.3464 -25.37865 moveto
+(<) cmmi10 9.96265 fshow
+35.7743 -25.37865 moveto
+(<) cmmi10 9.96265 fshow
+40.2023 -25.37865 moveto
+(<) cmmi10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 50.95102 -17.00784 moveto
+50.95102 -17.00784 lineto
+50.95102 -28.76813 lineto
+50.95102 -28.76813 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 28.3464 -17.00784 moveto
+28.3464 -17.00784 lineto
+50.95102 -17.00784 lineto
+50.95102 -17.00784 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 28.3464 -17.00784 moveto
+28.3464 -17.00784 lineto
+28.3464 -28.76813 lineto
+28.3464 -28.76813 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 28.3464 -28.76813 moveto
+28.3464 -28.76813 lineto
+50.95102 -28.76813 lineto
+50.95102 -28.76813 lineto stroke
+newpath 42.47728 -46.9474 moveto
+43.20105 -47.67117 43.64871 -48.67105 43.64871 -49.77548 curveto
+43.64871 -49.77646 lineto
+43.64871 -50.88089 43.20105 -51.88077 42.47728 -52.60454 curveto stroke
+newpath 36.82014 -46.9474 moveto
+37.54391 -46.22363 38.5438 -45.77597 39.64822 -45.77597 curveto
+39.6492 -45.77597 lineto
+40.75363 -45.77597 41.75351 -46.22363 42.47728 -46.9474 curveto stroke
+newpath 36.82014 -46.9474 moveto
+36.09637 -47.67117 35.64871 -48.67105 35.64871 -49.77548 curveto
+35.64871 -49.77646 lineto
+35.64871 -50.88089 36.09637 -51.88077 36.82014 -52.60454 curveto stroke
+newpath 36.82014 -52.60454 moveto
+37.54391 -53.32831 38.5438 -53.77597 39.64822 -53.77597 curveto
+39.6492 -53.77597 lineto
+40.75363 -53.77597 41.75351 -53.32831 42.47728 -52.60454 curveto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 39.64871 -45.77597 moveto
+39.64871 -53.77597 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 43.64871 -49.77597 moveto
+35.64871 -49.77597 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 18.1732 moveto
+0 4 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -1.53078 7.69554 moveto
+0 4 lineto
+1.53078 7.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 39.64871 18.1732 moveto
+39.64871 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 38.11801 -13.31248 moveto
+39.64871 -17.00784 lineto
+41.17941 -13.31248 lineto
+ closepath
+gsave fill grestore stroke
+newpath 39.64871 0 moveto
+4 0 lineto stroke
+newpath 7.69574 1.53087 moveto
+4 0 lineto
+7.69574 -1.53087 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 -4 moveto
+0 -67.94917 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -1.53067 -64.2539 moveto
+0 -67.94917 lineto
+1.53067 -64.2539 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 39.64871 -28.76813 moveto
+39.64871 -45.77597 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 38.11795 -42.08049 moveto
+39.64871 -45.77597 lineto
+41.17947 -42.08049 lineto
+ closepath
+gsave fill grestore stroke
+newpath 0 -49.77597 moveto
+35.64871 -49.77597 lineto stroke
+newpath 31.95297 -51.30684 moveto
+35.64871 -49.77597 lineto
+31.95297 -48.2451 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 39.64871 -53.77597 moveto
+39.64871 -67.94917 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 38.11794 -64.25363 moveto
+39.64871 -67.94917 lineto
+41.17949 -64.25363 lineto
+ closepath
+gsave fill grestore stroke
+newpath 65.12422 -22.88799 moveto
+50.95102 -22.88799 lineto stroke
+newpath 54.64656 -21.35721 moveto
+50.95102 -22.88799 lineto
+54.64656 -24.41876 lineto
+ closepath
+gsave fill grestore stroke
+68.12422 -24.86668 moveto
+(R) cmmi10 9.96265 fshow
+75.68872 -26.36108 moveto
+(r) cmmi7 6.97385 fshow
+79.17342 -26.36108 moveto
+(;i) cmmi7 6.97385 fshow
+showpage
+%%EOF
diff --git a/Supporting_Documentation/tex/skein-32.mps b/Supporting_Documentation/tex/skein-32.mps
new file mode 100644
index 0000000000000..9c247abb43599
--- /dev/null
+++ b/Supporting_Documentation/tex/skein-32.mps
@@ -0,0 +1,812 @@
+%!PS
+%%BoundingBox: -57 -231 227 30 
+%%HiResBoundingBox: -56.7471 -230.88246 226.82556 29.76099 
+%%Creator: MetaPost 1.000
+%%CreationDate: 2008.10.19:0139
+%%Pages: 1
+%*Font: cmr10 9.96265 9.96265 30:c00000049000645e2cc
+%%BeginProlog
+%%EndProlog
+%%Page: 1 1
+ 0 0 0 setrgbcolor
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth [] 0 setdash
+ 1 setlinecap 1 setlinejoin 10 setmiterlimit
+newpath 170.07843 0 moveto
+170.07843 0 lineto
+170.07843 -16 lineto
+170.07843 -16 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 0 0 moveto
+0 0 lineto
+170.07843 0 lineto
+170.07843 0 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 0 moveto
+0 0 lineto
+0 -16 lineto
+0 -16 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 0 -16 moveto
+0 -16 lineto
+170.07843 -16 lineto
+170.07843 -16 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 89.03922 -4 moveto
+89.03922 -4 lineto
+89.03922 -12 lineto
+89.03922 -12 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 81.03922 -4 moveto
+81.03922 -4 lineto
+89.03922 -4 lineto
+89.03922 -4 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 81.03922 -4 moveto
+81.03922 -4 lineto
+81.03922 -12 lineto
+81.03922 -12 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 81.03922 -12 moveto
+81.03922 -12 lineto
+89.03922 -12 lineto
+89.03922 -12 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 85.03922 -4 moveto
+85.03922 -12 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 89.03922 -8 moveto
+81.03922 -8 lineto stroke
+newpath -14.1732 -8 moveto
+0 -8 lineto stroke
+newpath -3.69554 -9.53078 moveto
+0 -8 lineto
+-3.69554 -6.46922 lineto
+ closepath
+gsave fill grestore stroke
+-56.7471 -10.49066 moveto
+(Subk) cmr10 9.96265 fshow
+-35.1613 -10.49066 moveto
+(ey) cmr10 9.96265 fshow
+-22.1545 -10.49066 moveto
+(0) cmr10 9.96265 fshow
+33.94067 -34.99442 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 77.95241 -24.50392 moveto
+77.95241 -24.50392 lineto
+77.95241 -38.67712 lineto
+77.95241 -38.67712 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -24.50392 moveto
+7.08684 -24.50392 lineto
+77.95241 -24.50392 lineto
+77.95241 -24.50392 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 7.08684 -24.50392 moveto
+7.08684 -24.50392 lineto
+7.08684 -38.67712 lineto
+7.08684 -38.67712 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -38.67712 moveto
+7.08684 -38.67712 lineto
+77.95241 -38.67712 lineto
+77.95241 -38.67712 lineto stroke
+118.97946 -34.99442 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 162.9912 -24.50392 moveto
+162.9912 -24.50392 lineto
+162.9912 -38.67712 lineto
+162.9912 -38.67712 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 92.12563 -24.50392 moveto
+92.12563 -24.50392 lineto
+162.9912 -24.50392 lineto
+162.9912 -24.50392 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 92.12563 -24.50392 moveto
+92.12563 -24.50392 lineto
+92.12563 -38.67712 lineto
+92.12563 -38.67712 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 92.12563 -38.67712 moveto
+92.12563 -38.67712 lineto
+162.9912 -38.67712 lineto
+162.9912 -38.67712 lineto stroke
+66.69112 -56.98885 moveto
+(P) cmr10 9.96265 fshow
+73.19452 -56.98885 moveto
+(erm) cmr10 9.96265 fshow
+89.54991 -56.98885 moveto
+(ute) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 162.9912 -47.18105 moveto
+162.9912 -47.18105 lineto
+162.9912 -59.98885 lineto
+162.9912 -59.98885 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -47.18105 moveto
+7.08684 -47.18105 lineto
+162.9912 -47.18105 lineto
+162.9912 -47.18105 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 7.08684 -47.18105 moveto
+7.08684 -47.18105 lineto
+7.08684 -59.98885 lineto
+7.08684 -59.98885 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -59.98885 moveto
+7.08684 -59.98885 lineto
+162.9912 -59.98885 lineto
+162.9912 -59.98885 lineto stroke
+33.94067 -78.98326 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 77.95241 -68.49277 moveto
+77.95241 -68.49277 lineto
+77.95241 -82.66597 lineto
+77.95241 -82.66597 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -68.49277 moveto
+7.08684 -68.49277 lineto
+77.95241 -68.49277 lineto
+77.95241 -68.49277 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 7.08684 -68.49277 moveto
+7.08684 -68.49277 lineto
+7.08684 -82.66597 lineto
+7.08684 -82.66597 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -82.66597 moveto
+7.08684 -82.66597 lineto
+77.95241 -82.66597 lineto
+77.95241 -82.66597 lineto stroke
+118.97945 -78.98326 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 162.9912 -68.49277 moveto
+162.9912 -68.49277 lineto
+162.9912 -82.66597 lineto
+162.9912 -82.66597 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 92.12561 -68.49277 moveto
+92.12561 -68.49277 lineto
+162.9912 -68.49277 lineto
+162.9912 -68.49277 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 92.12561 -68.49277 moveto
+92.12561 -68.49277 lineto
+92.12561 -82.66597 lineto
+92.12561 -82.66597 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 92.12561 -82.66597 moveto
+92.12561 -82.66597 lineto
+162.9912 -82.66597 lineto
+162.9912 -82.66597 lineto stroke
+66.69112 -100.97769 moveto
+(P) cmr10 9.96265 fshow
+73.19452 -100.97769 moveto
+(erm) cmr10 9.96265 fshow
+89.54991 -100.97769 moveto
+(ute) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 162.9912 -91.16989 moveto
+162.9912 -91.16989 lineto
+162.9912 -103.97768 lineto
+162.9912 -103.97768 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -91.16989 moveto
+7.08684 -91.16989 lineto
+162.9912 -91.16989 lineto
+162.9912 -91.16989 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 7.08684 -91.16989 moveto
+7.08684 -91.16989 lineto
+7.08684 -103.97768 lineto
+7.08684 -103.97768 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -103.97768 moveto
+7.08684 -103.97768 lineto
+162.9912 -103.97768 lineto
+162.9912 -103.97768 lineto stroke
+33.94067 -122.97209 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 77.95241 -112.4816 moveto
+77.95241 -112.4816 lineto
+77.95241 -126.6548 lineto
+77.95241 -126.6548 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -112.4816 moveto
+7.08684 -112.4816 lineto
+77.95241 -112.4816 lineto
+77.95241 -112.4816 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 7.08684 -112.4816 moveto
+7.08684 -112.4816 lineto
+7.08684 -126.6548 lineto
+7.08684 -126.6548 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -126.6548 moveto
+7.08684 -126.6548 lineto
+77.95241 -126.6548 lineto
+77.95241 -126.6548 lineto stroke
+118.97945 -122.97209 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 162.9912 -112.4816 moveto
+162.9912 -112.4816 lineto
+162.9912 -126.6548 lineto
+162.9912 -126.6548 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 92.12561 -112.4816 moveto
+92.12561 -112.4816 lineto
+162.9912 -112.4816 lineto
+162.9912 -112.4816 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 92.12561 -112.4816 moveto
+92.12561 -112.4816 lineto
+92.12561 -126.6548 lineto
+92.12561 -126.6548 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 92.12561 -126.6548 moveto
+92.12561 -126.6548 lineto
+162.9912 -126.6548 lineto
+162.9912 -126.6548 lineto stroke
+66.69112 -144.96652 moveto
+(P) cmr10 9.96265 fshow
+73.19452 -144.96652 moveto
+(erm) cmr10 9.96265 fshow
+89.54991 -144.96652 moveto
+(ute) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 162.9912 -135.15872 moveto
+162.9912 -135.15872 lineto
+162.9912 -147.9665 lineto
+162.9912 -147.9665 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -135.15872 moveto
+7.08684 -135.15872 lineto
+162.9912 -135.15872 lineto
+162.9912 -135.15872 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 7.08684 -135.15872 moveto
+7.08684 -135.15872 lineto
+7.08684 -147.9665 lineto
+7.08684 -147.9665 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -147.9665 moveto
+7.08684 -147.9665 lineto
+162.9912 -147.9665 lineto
+162.9912 -147.9665 lineto stroke
+33.94067 -166.96092 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 77.95241 -156.47043 moveto
+77.95241 -156.47043 lineto
+77.95241 -170.64363 lineto
+77.95241 -170.64363 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -156.47043 moveto
+7.08684 -156.47043 lineto
+77.95241 -156.47043 lineto
+77.95241 -156.47043 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 7.08684 -156.47043 moveto
+7.08684 -156.47043 lineto
+7.08684 -170.64363 lineto
+7.08684 -170.64363 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -170.64363 moveto
+7.08684 -170.64363 lineto
+77.95241 -170.64363 lineto
+77.95241 -170.64363 lineto stroke
+118.97945 -166.96092 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 162.9912 -156.47043 moveto
+162.9912 -156.47043 lineto
+162.9912 -170.64363 lineto
+162.9912 -170.64363 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 92.12561 -156.47043 moveto
+92.12561 -156.47043 lineto
+162.9912 -156.47043 lineto
+162.9912 -156.47043 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 92.12561 -156.47043 moveto
+92.12561 -156.47043 lineto
+92.12561 -170.64363 lineto
+92.12561 -170.64363 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 92.12561 -170.64363 moveto
+92.12561 -170.64363 lineto
+162.9912 -170.64363 lineto
+162.9912 -170.64363 lineto stroke
+66.69112 -188.95535 moveto
+(P) cmr10 9.96265 fshow
+73.19452 -188.95535 moveto
+(erm) cmr10 9.96265 fshow
+89.54991 -188.95535 moveto
+(ute) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 162.9912 -179.14755 moveto
+162.9912 -179.14755 lineto
+162.9912 -191.95534 lineto
+162.9912 -191.95534 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -179.14755 moveto
+7.08684 -179.14755 lineto
+162.9912 -179.14755 lineto
+162.9912 -179.14755 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 7.08684 -179.14755 moveto
+7.08684 -179.14755 lineto
+7.08684 -191.95534 lineto
+7.08684 -191.95534 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 7.08684 -191.95534 moveto
+7.08684 -191.95534 lineto
+162.9912 -191.95534 lineto
+162.9912 -191.95534 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 170.07843 -200.45926 moveto
+170.07843 -200.45926 lineto
+170.07843 -216.45926 lineto
+170.07843 -216.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 0 -200.45926 moveto
+0 -200.45926 lineto
+170.07843 -200.45926 lineto
+170.07843 -200.45926 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 -200.45926 moveto
+0 -200.45926 lineto
+0 -216.45926 lineto
+0 -216.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 0 -216.45926 moveto
+0 -216.45926 lineto
+170.07843 -216.45926 lineto
+170.07843 -216.45926 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 89.03922 -204.45926 moveto
+89.03922 -204.45926 lineto
+89.03922 -212.45926 lineto
+89.03922 -212.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 81.03922 -204.45926 moveto
+81.03922 -204.45926 lineto
+89.03922 -204.45926 lineto
+89.03922 -204.45926 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 81.03922 -204.45926 moveto
+81.03922 -204.45926 lineto
+81.03922 -212.45926 lineto
+81.03922 -212.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 81.03922 -212.45926 moveto
+81.03922 -212.45926 lineto
+89.03922 -212.45926 lineto
+89.03922 -212.45926 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 85.03922 -204.45926 moveto
+85.03922 -212.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 89.03922 -208.45926 moveto
+81.03922 -208.45926 lineto stroke
+newpath -14.1732 -208.45926 moveto
+0 -208.45926 lineto stroke
+newpath -3.69554 -209.99004 moveto
+0 -208.45926 lineto
+-3.69554 -206.92848 lineto
+ closepath
+gsave fill grestore stroke
+-56.7471 -210.94992 moveto
+(Subk) cmr10 9.96265 fshow
+-35.1613 -210.94992 moveto
+(ey) cmr10 9.96265 fshow
+-22.1545 -210.94992 moveto
+(1) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 14.1732 moveto
+21.25981 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72903 3.69554 moveto
+21.25981 0 lineto
+22.79059 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 -16 moveto
+21.25981 -24.50392 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72905 -20.80844 moveto
+21.25981 -24.50392 lineto
+22.79057 -20.80844 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 -38.67712 moveto
+21.25981 -47.18105 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72905 -43.48557 moveto
+21.25981 -47.18105 lineto
+22.79057 -43.48557 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 -59.98885 moveto
+21.25981 -68.49277 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72905 -64.79729 moveto
+21.25981 -68.49277 lineto
+22.79057 -64.79729 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 -82.66597 moveto
+21.25981 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72905 -87.47441 moveto
+21.25981 -91.16989 lineto
+22.79057 -87.47441 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 -103.97768 moveto
+21.25981 -112.4816 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72905 -108.78612 moveto
+21.25981 -112.4816 lineto
+22.79057 -108.78612 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 -126.6548 moveto
+21.25981 -135.15872 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72905 -131.46324 moveto
+21.25981 -135.15872 lineto
+22.79057 -131.46324 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 -147.9665 moveto
+21.25981 -156.47043 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72905 -152.77495 moveto
+21.25981 -156.47043 lineto
+22.79057 -152.77495 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 -170.64363 moveto
+21.25981 -179.14755 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72905 -175.45207 moveto
+21.25981 -179.14755 lineto
+22.79057 -175.45207 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 -191.95534 moveto
+21.25981 -200.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72905 -196.76378 moveto
+21.25981 -200.45926 lineto
+22.79057 -196.76378 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 21.25981 -216.45926 moveto
+21.25981 -230.63246 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 19.72903 -226.93692 moveto
+21.25981 -230.63246 lineto
+22.79059 -226.93692 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 14.1732 moveto
+63.77942 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24864 3.69554 moveto
+63.77942 0 lineto
+65.3102 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 -16 moveto
+63.77942 -24.50392 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24866 -20.80844 moveto
+63.77942 -24.50392 lineto
+65.31018 -20.80844 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 -38.67712 moveto
+63.77942 -47.18105 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24866 -43.48557 moveto
+63.77942 -47.18105 lineto
+65.31018 -43.48557 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 -59.98885 moveto
+63.77942 -68.49277 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24866 -64.79729 moveto
+63.77942 -68.49277 lineto
+65.31018 -64.79729 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 -82.66597 moveto
+63.77942 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24866 -87.47441 moveto
+63.77942 -91.16989 lineto
+65.31018 -87.47441 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 -103.97768 moveto
+63.77942 -112.4816 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24866 -108.78612 moveto
+63.77942 -112.4816 lineto
+65.31018 -108.78612 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 -126.6548 moveto
+63.77942 -135.15872 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24866 -131.46324 moveto
+63.77942 -135.15872 lineto
+65.31018 -131.46324 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 -147.9665 moveto
+63.77942 -156.47043 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24866 -152.77495 moveto
+63.77942 -156.47043 lineto
+65.31018 -152.77495 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 -170.64363 moveto
+63.77942 -179.14755 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24866 -175.45207 moveto
+63.77942 -179.14755 lineto
+65.31018 -175.45207 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 -191.95534 moveto
+63.77942 -200.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24866 -196.76378 moveto
+63.77942 -200.45926 lineto
+65.31018 -196.76378 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 63.77942 -216.45926 moveto
+63.77942 -230.63246 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 62.24864 -226.93692 moveto
+63.77942 -230.63246 lineto
+65.3102 -226.93692 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 14.1732 moveto
+106.29903 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76825 3.69554 moveto
+106.29903 0 lineto
+107.8298 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 -16 moveto
+106.29903 -24.50392 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76826 -20.80844 moveto
+106.29903 -24.50392 lineto
+107.82979 -20.80844 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 -38.67712 moveto
+106.29903 -47.18105 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76826 -43.48557 moveto
+106.29903 -47.18105 lineto
+107.82979 -43.48557 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 -59.98885 moveto
+106.29903 -68.49277 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76826 -64.79729 moveto
+106.29903 -68.49277 lineto
+107.82979 -64.79729 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 -82.66597 moveto
+106.29903 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76826 -87.47441 moveto
+106.29903 -91.16989 lineto
+107.82979 -87.47441 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 -103.97768 moveto
+106.29903 -112.4816 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76826 -108.78612 moveto
+106.29903 -112.4816 lineto
+107.82979 -108.78612 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 -126.6548 moveto
+106.29903 -135.15872 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76826 -131.46324 moveto
+106.29903 -135.15872 lineto
+107.82979 -131.46324 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 -147.9665 moveto
+106.29903 -156.47043 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76826 -152.77495 moveto
+106.29903 -156.47043 lineto
+107.82979 -152.77495 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 -170.64363 moveto
+106.29903 -179.14755 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76826 -175.45207 moveto
+106.29903 -179.14755 lineto
+107.82979 -175.45207 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 -191.95534 moveto
+106.29903 -200.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76826 -196.76378 moveto
+106.29903 -200.45926 lineto
+107.82979 -196.76378 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.29903 -216.45926 moveto
+106.29903 -230.63246 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 104.76825 -226.93692 moveto
+106.29903 -230.63246 lineto
+107.8298 -226.93692 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 14.1732 moveto
+148.81863 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28786 3.69554 moveto
+148.81863 0 lineto
+150.34941 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 -16 moveto
+148.81863 -24.50392 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28787 -20.80844 moveto
+148.81863 -24.50392 lineto
+150.3494 -20.80844 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 -38.67712 moveto
+148.81863 -47.18105 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28787 -43.48557 moveto
+148.81863 -47.18105 lineto
+150.3494 -43.48557 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 -59.98885 moveto
+148.81863 -68.49277 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28787 -64.79729 moveto
+148.81863 -68.49277 lineto
+150.3494 -64.79729 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 -82.66597 moveto
+148.81863 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28787 -87.47441 moveto
+148.81863 -91.16989 lineto
+150.3494 -87.47441 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 -103.97768 moveto
+148.81863 -112.4816 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28787 -108.78612 moveto
+148.81863 -112.4816 lineto
+150.3494 -108.78612 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 -126.6548 moveto
+148.81863 -135.15872 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28787 -131.46324 moveto
+148.81863 -135.15872 lineto
+150.3494 -131.46324 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 -147.9665 moveto
+148.81863 -156.47043 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28787 -152.77495 moveto
+148.81863 -156.47043 lineto
+150.3494 -152.77495 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 -170.64363 moveto
+148.81863 -179.14755 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28787 -175.45207 moveto
+148.81863 -179.14755 lineto
+150.3494 -175.45207 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 -191.95534 moveto
+148.81863 -200.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28787 -196.76378 moveto
+148.81863 -200.45926 lineto
+150.3494 -196.76378 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 148.81863 -216.45926 moveto
+148.81863 -230.63246 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 147.28786 -226.93692 moveto
+148.81863 -230.63246 lineto
+150.34941 -226.93692 lineto
+ closepath
+gsave fill grestore stroke
+65.04471 22.84248 moveto
+(Plain) cmr10 9.96265 fshow
+87.5991 22.84248 moveto
+(text) cmr10 9.96265 fshow
+showpage
+%%EOF
diff --git a/Supporting_Documentation/tex/skein-33.mps b/Supporting_Documentation/tex/skein-33.mps
new file mode 100644
index 0000000000000..9a577a6cd86d9
--- /dev/null
+++ b/Supporting_Documentation/tex/skein-33.mps
@@ -0,0 +1,1384 @@
+%!PS
+%%BoundingBox: -57 -231 227 30 
+%%HiResBoundingBox: -56.7471 -230.88246 226.82553 29.76099 
+%%Creator: MetaPost 1.000
+%%CreationDate: 2008.10.19:0139
+%%Pages: 1
+%*Font: cmr10 9.96265 9.96265 30:c00000049000645e2cc
+%%BeginProlog
+%%EndProlog
+%%Page: 1 1
+ 0 0 0 setrgbcolor
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth [] 0 setdash
+ 1 setlinecap 1 setlinejoin 10 setmiterlimit
+newpath 170.07843 0 moveto
+170.07843 0 lineto
+170.07843 -16 lineto
+170.07843 -16 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 0 0 moveto
+0 0 lineto
+170.07843 0 lineto
+170.07843 0 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 0 moveto
+0 0 lineto
+0 -16 lineto
+0 -16 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 0 -16 moveto
+0 -16 lineto
+170.07843 -16 lineto
+170.07843 -16 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 89.03922 -4 moveto
+89.03922 -4 lineto
+89.03922 -12 lineto
+89.03922 -12 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 81.03922 -4 moveto
+81.03922 -4 lineto
+89.03922 -4 lineto
+89.03922 -4 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 81.03922 -4 moveto
+81.03922 -4 lineto
+81.03922 -12 lineto
+81.03922 -12 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 81.03922 -12 moveto
+81.03922 -12 lineto
+89.03922 -12 lineto
+89.03922 -12 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 85.03922 -4 moveto
+85.03922 -12 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 89.03922 -8 moveto
+81.03922 -8 lineto stroke
+newpath -14.1732 -8 moveto
+0 -8 lineto stroke
+newpath -3.69554 -9.53078 moveto
+0 -8 lineto
+-3.69554 -6.46922 lineto
+ closepath
+gsave fill grestore stroke
+-56.7471 -10.49066 moveto
+(Subk) cmr10 9.96265 fshow
+-35.1613 -10.49066 moveto
+(ey) cmr10 9.96265 fshow
+-22.1545 -10.49066 moveto
+(0) cmr10 9.96265 fshow
+12.68088 -34.99442 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 38.9762 -24.50392 moveto
+38.9762 -24.50392 lineto
+38.9762 -38.67712 lineto
+38.9762 -38.67712 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -24.50392 moveto
+3.54344 -24.50392 lineto
+38.9762 -24.50392 lineto
+38.9762 -24.50392 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 3.54344 -24.50392 moveto
+3.54344 -24.50392 lineto
+3.54344 -38.67712 lineto
+3.54344 -38.67712 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -38.67712 moveto
+3.54344 -38.67712 lineto
+38.9762 -38.67712 lineto
+38.9762 -38.67712 lineto stroke
+55.20024 -34.99442 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 81.49556 -24.50392 moveto
+81.49556 -24.50392 lineto
+81.49556 -38.67712 lineto
+81.49556 -38.67712 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 46.0628 -24.50392 moveto
+46.0628 -24.50392 lineto
+81.49556 -24.50392 lineto
+81.49556 -24.50392 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 46.0628 -24.50392 moveto
+46.0628 -24.50392 lineto
+46.0628 -38.67712 lineto
+46.0628 -38.67712 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 46.0628 -38.67712 moveto
+46.0628 -38.67712 lineto
+81.49556 -38.67712 lineto
+81.49556 -38.67712 lineto stroke
+97.7196 -34.99442 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 124.01492 -24.50392 moveto
+124.01492 -24.50392 lineto
+124.01492 -38.67712 lineto
+124.01492 -38.67712 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 88.58217 -24.50392 moveto
+88.58217 -24.50392 lineto
+124.01492 -24.50392 lineto
+124.01492 -24.50392 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 88.58217 -24.50392 moveto
+88.58217 -24.50392 lineto
+88.58217 -38.67712 lineto
+88.58217 -38.67712 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 88.58217 -38.67712 moveto
+88.58217 -38.67712 lineto
+124.01492 -38.67712 lineto
+124.01492 -38.67712 lineto stroke
+140.23897 -34.99442 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 166.53429 -24.50392 moveto
+166.53429 -24.50392 lineto
+166.53429 -38.67712 lineto
+166.53429 -38.67712 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 131.10153 -24.50392 moveto
+131.10153 -24.50392 lineto
+166.53429 -24.50392 lineto
+166.53429 -24.50392 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 131.10153 -24.50392 moveto
+131.10153 -24.50392 lineto
+131.10153 -38.67712 lineto
+131.10153 -38.67712 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 131.10153 -38.67712 moveto
+131.10153 -38.67712 lineto
+166.53429 -38.67712 lineto
+166.53429 -38.67712 lineto stroke
+66.69096 -56.98885 moveto
+(P) cmr10 9.96265 fshow
+73.19437 -56.98885 moveto
+(erm) cmr10 9.96265 fshow
+89.54976 -56.98885 moveto
+(ute) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 166.53429 -47.18105 moveto
+166.53429 -47.18105 lineto
+166.53429 -59.98885 lineto
+166.53429 -59.98885 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -47.18105 moveto
+3.54344 -47.18105 lineto
+166.53429 -47.18105 lineto
+166.53429 -47.18105 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 3.54344 -47.18105 moveto
+3.54344 -47.18105 lineto
+3.54344 -59.98885 lineto
+3.54344 -59.98885 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -59.98885 moveto
+3.54344 -59.98885 lineto
+166.53429 -59.98885 lineto
+166.53429 -59.98885 lineto stroke
+12.68088 -78.98326 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 38.9762 -68.49277 moveto
+38.9762 -68.49277 lineto
+38.9762 -82.66597 lineto
+38.9762 -82.66597 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -68.49277 moveto
+3.54344 -68.49277 lineto
+38.9762 -68.49277 lineto
+38.9762 -68.49277 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 3.54344 -68.49277 moveto
+3.54344 -68.49277 lineto
+3.54344 -82.66597 lineto
+3.54344 -82.66597 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -82.66597 moveto
+3.54344 -82.66597 lineto
+38.9762 -82.66597 lineto
+38.9762 -82.66597 lineto stroke
+55.20024 -78.98326 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 81.49556 -68.49277 moveto
+81.49556 -68.49277 lineto
+81.49556 -82.66597 lineto
+81.49556 -82.66597 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 46.0628 -68.49277 moveto
+46.0628 -68.49277 lineto
+81.49556 -68.49277 lineto
+81.49556 -68.49277 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 46.0628 -68.49277 moveto
+46.0628 -68.49277 lineto
+46.0628 -82.66597 lineto
+46.0628 -82.66597 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 46.0628 -82.66597 moveto
+46.0628 -82.66597 lineto
+81.49556 -82.66597 lineto
+81.49556 -82.66597 lineto stroke
+97.7196 -78.98326 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 124.01492 -68.49277 moveto
+124.01492 -68.49277 lineto
+124.01492 -82.66597 lineto
+124.01492 -82.66597 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 88.58217 -68.49277 moveto
+88.58217 -68.49277 lineto
+124.01492 -68.49277 lineto
+124.01492 -68.49277 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 88.58217 -68.49277 moveto
+88.58217 -68.49277 lineto
+88.58217 -82.66597 lineto
+88.58217 -82.66597 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 88.58217 -82.66597 moveto
+88.58217 -82.66597 lineto
+124.01492 -82.66597 lineto
+124.01492 -82.66597 lineto stroke
+140.23897 -78.98326 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 166.53429 -68.49277 moveto
+166.53429 -68.49277 lineto
+166.53429 -82.66597 lineto
+166.53429 -82.66597 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 131.10153 -68.49277 moveto
+131.10153 -68.49277 lineto
+166.53429 -68.49277 lineto
+166.53429 -68.49277 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 131.10153 -68.49277 moveto
+131.10153 -68.49277 lineto
+131.10153 -82.66597 lineto
+131.10153 -82.66597 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 131.10153 -82.66597 moveto
+131.10153 -82.66597 lineto
+166.53429 -82.66597 lineto
+166.53429 -82.66597 lineto stroke
+66.69096 -100.97769 moveto
+(P) cmr10 9.96265 fshow
+73.19437 -100.97769 moveto
+(erm) cmr10 9.96265 fshow
+89.54976 -100.97769 moveto
+(ute) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 166.53429 -91.16989 moveto
+166.53429 -91.16989 lineto
+166.53429 -103.97768 lineto
+166.53429 -103.97768 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -91.16989 moveto
+3.54344 -91.16989 lineto
+166.53429 -91.16989 lineto
+166.53429 -91.16989 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 3.54344 -91.16989 moveto
+3.54344 -91.16989 lineto
+3.54344 -103.97768 lineto
+3.54344 -103.97768 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -103.97768 moveto
+3.54344 -103.97768 lineto
+166.53429 -103.97768 lineto
+166.53429 -103.97768 lineto stroke
+12.68088 -122.97209 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 38.9762 -112.4816 moveto
+38.9762 -112.4816 lineto
+38.9762 -126.6548 lineto
+38.9762 -126.6548 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -112.4816 moveto
+3.54344 -112.4816 lineto
+38.9762 -112.4816 lineto
+38.9762 -112.4816 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 3.54344 -112.4816 moveto
+3.54344 -112.4816 lineto
+3.54344 -126.6548 lineto
+3.54344 -126.6548 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -126.6548 moveto
+3.54344 -126.6548 lineto
+38.9762 -126.6548 lineto
+38.9762 -126.6548 lineto stroke
+55.20024 -122.97209 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 81.49556 -112.4816 moveto
+81.49556 -112.4816 lineto
+81.49556 -126.6548 lineto
+81.49556 -126.6548 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 46.0628 -112.4816 moveto
+46.0628 -112.4816 lineto
+81.49556 -112.4816 lineto
+81.49556 -112.4816 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 46.0628 -112.4816 moveto
+46.0628 -112.4816 lineto
+46.0628 -126.6548 lineto
+46.0628 -126.6548 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 46.0628 -126.6548 moveto
+46.0628 -126.6548 lineto
+81.49556 -126.6548 lineto
+81.49556 -126.6548 lineto stroke
+97.7196 -122.97209 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 124.01492 -112.4816 moveto
+124.01492 -112.4816 lineto
+124.01492 -126.6548 lineto
+124.01492 -126.6548 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 88.58217 -112.4816 moveto
+88.58217 -112.4816 lineto
+124.01492 -112.4816 lineto
+124.01492 -112.4816 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 88.58217 -112.4816 moveto
+88.58217 -112.4816 lineto
+88.58217 -126.6548 lineto
+88.58217 -126.6548 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 88.58217 -126.6548 moveto
+88.58217 -126.6548 lineto
+124.01492 -126.6548 lineto
+124.01492 -126.6548 lineto stroke
+140.23897 -122.97209 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 166.53429 -112.4816 moveto
+166.53429 -112.4816 lineto
+166.53429 -126.6548 lineto
+166.53429 -126.6548 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 131.10153 -112.4816 moveto
+131.10153 -112.4816 lineto
+166.53429 -112.4816 lineto
+166.53429 -112.4816 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 131.10153 -112.4816 moveto
+131.10153 -112.4816 lineto
+131.10153 -126.6548 lineto
+131.10153 -126.6548 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 131.10153 -126.6548 moveto
+131.10153 -126.6548 lineto
+166.53429 -126.6548 lineto
+166.53429 -126.6548 lineto stroke
+66.69096 -144.96652 moveto
+(P) cmr10 9.96265 fshow
+73.19437 -144.96652 moveto
+(erm) cmr10 9.96265 fshow
+89.54976 -144.96652 moveto
+(ute) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 166.53429 -135.15872 moveto
+166.53429 -135.15872 lineto
+166.53429 -147.9665 lineto
+166.53429 -147.9665 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -135.15872 moveto
+3.54344 -135.15872 lineto
+166.53429 -135.15872 lineto
+166.53429 -135.15872 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 3.54344 -135.15872 moveto
+3.54344 -135.15872 lineto
+3.54344 -147.9665 lineto
+3.54344 -147.9665 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -147.9665 moveto
+3.54344 -147.9665 lineto
+166.53429 -147.9665 lineto
+166.53429 -147.9665 lineto stroke
+12.68088 -166.96092 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 38.9762 -156.47043 moveto
+38.9762 -156.47043 lineto
+38.9762 -170.64363 lineto
+38.9762 -170.64363 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -156.47043 moveto
+3.54344 -156.47043 lineto
+38.9762 -156.47043 lineto
+38.9762 -156.47043 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 3.54344 -156.47043 moveto
+3.54344 -156.47043 lineto
+3.54344 -170.64363 lineto
+3.54344 -170.64363 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -170.64363 moveto
+3.54344 -170.64363 lineto
+38.9762 -170.64363 lineto
+38.9762 -170.64363 lineto stroke
+55.20024 -166.96092 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 81.49556 -156.47043 moveto
+81.49556 -156.47043 lineto
+81.49556 -170.64363 lineto
+81.49556 -170.64363 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 46.0628 -156.47043 moveto
+46.0628 -156.47043 lineto
+81.49556 -156.47043 lineto
+81.49556 -156.47043 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 46.0628 -156.47043 moveto
+46.0628 -156.47043 lineto
+46.0628 -170.64363 lineto
+46.0628 -170.64363 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 46.0628 -170.64363 moveto
+46.0628 -170.64363 lineto
+81.49556 -170.64363 lineto
+81.49556 -170.64363 lineto stroke
+97.7196 -166.96092 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 124.01492 -156.47043 moveto
+124.01492 -156.47043 lineto
+124.01492 -170.64363 lineto
+124.01492 -170.64363 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 88.58217 -156.47043 moveto
+88.58217 -156.47043 lineto
+124.01492 -156.47043 lineto
+124.01492 -156.47043 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 88.58217 -156.47043 moveto
+88.58217 -156.47043 lineto
+88.58217 -170.64363 lineto
+88.58217 -170.64363 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 88.58217 -170.64363 moveto
+88.58217 -170.64363 lineto
+124.01492 -170.64363 lineto
+124.01492 -170.64363 lineto stroke
+140.23897 -166.96092 moveto
+(Mix) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 166.53429 -156.47043 moveto
+166.53429 -156.47043 lineto
+166.53429 -170.64363 lineto
+166.53429 -170.64363 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 131.10153 -156.47043 moveto
+131.10153 -156.47043 lineto
+166.53429 -156.47043 lineto
+166.53429 -156.47043 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 131.10153 -156.47043 moveto
+131.10153 -156.47043 lineto
+131.10153 -170.64363 lineto
+131.10153 -170.64363 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 131.10153 -170.64363 moveto
+131.10153 -170.64363 lineto
+166.53429 -170.64363 lineto
+166.53429 -170.64363 lineto stroke
+66.69096 -188.95535 moveto
+(P) cmr10 9.96265 fshow
+73.19437 -188.95535 moveto
+(erm) cmr10 9.96265 fshow
+89.54976 -188.95535 moveto
+(ute) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 166.53429 -179.14755 moveto
+166.53429 -179.14755 lineto
+166.53429 -191.95534 lineto
+166.53429 -191.95534 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -179.14755 moveto
+3.54344 -179.14755 lineto
+166.53429 -179.14755 lineto
+166.53429 -179.14755 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 3.54344 -179.14755 moveto
+3.54344 -179.14755 lineto
+3.54344 -191.95534 lineto
+3.54344 -191.95534 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 3.54344 -191.95534 moveto
+3.54344 -191.95534 lineto
+166.53429 -191.95534 lineto
+166.53429 -191.95534 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 170.07843 -200.45926 moveto
+170.07843 -200.45926 lineto
+170.07843 -216.45926 lineto
+170.07843 -216.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 0 -200.45926 moveto
+0 -200.45926 lineto
+170.07843 -200.45926 lineto
+170.07843 -200.45926 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 -200.45926 moveto
+0 -200.45926 lineto
+0 -216.45926 lineto
+0 -216.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 0 -216.45926 moveto
+0 -216.45926 lineto
+170.07843 -216.45926 lineto
+170.07843 -216.45926 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 89.03922 -204.45926 moveto
+89.03922 -204.45926 lineto
+89.03922 -212.45926 lineto
+89.03922 -212.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 81.03922 -204.45926 moveto
+81.03922 -204.45926 lineto
+89.03922 -204.45926 lineto
+89.03922 -204.45926 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 81.03922 -204.45926 moveto
+81.03922 -204.45926 lineto
+81.03922 -212.45926 lineto
+81.03922 -212.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 81.03922 -212.45926 moveto
+81.03922 -212.45926 lineto
+89.03922 -212.45926 lineto
+89.03922 -212.45926 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 85.03922 -204.45926 moveto
+85.03922 -212.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 89.03922 -208.45926 moveto
+81.03922 -208.45926 lineto stroke
+newpath -14.1732 -208.45926 moveto
+0 -208.45926 lineto stroke
+newpath -3.69554 -209.99004 moveto
+0 -208.45926 lineto
+-3.69554 -206.92848 lineto
+ closepath
+gsave fill grestore stroke
+-56.7471 -210.94992 moveto
+(Subk) cmr10 9.96265 fshow
+-35.1613 -210.94992 moveto
+(ey) cmr10 9.96265 fshow
+-22.1545 -210.94992 moveto
+(1) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 14.1732 moveto
+10.6299 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09912 3.69554 moveto
+10.6299 0 lineto
+12.16068 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 -16 moveto
+10.6299 -24.50392 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09914 -20.80844 moveto
+10.6299 -24.50392 lineto
+12.16066 -20.80844 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 -38.67712 moveto
+10.6299 -47.18105 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09914 -43.48557 moveto
+10.6299 -47.18105 lineto
+12.16066 -43.48557 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 -59.98885 moveto
+10.6299 -68.49277 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09914 -64.79729 moveto
+10.6299 -68.49277 lineto
+12.16066 -64.79729 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 -82.66597 moveto
+10.6299 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09914 -87.47441 moveto
+10.6299 -91.16989 lineto
+12.16066 -87.47441 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 -103.97768 moveto
+10.6299 -112.4816 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09914 -108.78612 moveto
+10.6299 -112.4816 lineto
+12.16066 -108.78612 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 -126.6548 moveto
+10.6299 -135.15872 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09914 -131.46324 moveto
+10.6299 -135.15872 lineto
+12.16066 -131.46324 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 -147.9665 moveto
+10.6299 -156.47043 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09914 -152.77495 moveto
+10.6299 -156.47043 lineto
+12.16066 -152.77495 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 -170.64363 moveto
+10.6299 -179.14755 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09914 -175.45207 moveto
+10.6299 -179.14755 lineto
+12.16066 -175.45207 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 -191.95534 moveto
+10.6299 -200.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09914 -196.76378 moveto
+10.6299 -200.45926 lineto
+12.16066 -196.76378 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 10.6299 -216.45926 moveto
+10.6299 -230.63246 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 9.09912 -226.93692 moveto
+10.6299 -230.63246 lineto
+12.16068 -226.93692 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 14.1732 moveto
+31.88971 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35893 3.69554 moveto
+31.88971 0 lineto
+33.42049 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 -16 moveto
+31.88971 -24.50392 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35895 -20.80844 moveto
+31.88971 -24.50392 lineto
+33.42047 -20.80844 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 -38.67712 moveto
+31.88971 -47.18105 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35895 -43.48557 moveto
+31.88971 -47.18105 lineto
+33.42047 -43.48557 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 -59.98885 moveto
+31.88971 -68.49277 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35895 -64.79729 moveto
+31.88971 -68.49277 lineto
+33.42047 -64.79729 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 -82.66597 moveto
+31.88971 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35895 -87.47441 moveto
+31.88971 -91.16989 lineto
+33.42047 -87.47441 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 -103.97768 moveto
+31.88971 -112.4816 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35895 -108.78612 moveto
+31.88971 -112.4816 lineto
+33.42047 -108.78612 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 -126.6548 moveto
+31.88971 -135.15872 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35895 -131.46324 moveto
+31.88971 -135.15872 lineto
+33.42047 -131.46324 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 -147.9665 moveto
+31.88971 -156.47043 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35895 -152.77495 moveto
+31.88971 -156.47043 lineto
+33.42047 -152.77495 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 -170.64363 moveto
+31.88971 -179.14755 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35895 -175.45207 moveto
+31.88971 -179.14755 lineto
+33.42047 -175.45207 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 -191.95534 moveto
+31.88971 -200.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35895 -196.76378 moveto
+31.88971 -200.45926 lineto
+33.42047 -196.76378 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 31.88971 -216.45926 moveto
+31.88971 -230.63246 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.35893 -226.93692 moveto
+31.88971 -230.63246 lineto
+33.42049 -226.93692 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 14.1732 moveto
+53.1495 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61873 3.69554 moveto
+53.1495 0 lineto
+54.68028 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 -16 moveto
+53.1495 -24.50392 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61874 -20.80844 moveto
+53.1495 -24.50392 lineto
+54.68027 -20.80844 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 -38.67712 moveto
+53.1495 -47.18105 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61874 -43.48557 moveto
+53.1495 -47.18105 lineto
+54.68027 -43.48557 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 -59.98885 moveto
+53.1495 -68.49277 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61874 -64.79729 moveto
+53.1495 -68.49277 lineto
+54.68027 -64.79729 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 -82.66597 moveto
+53.1495 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61874 -87.47441 moveto
+53.1495 -91.16989 lineto
+54.68027 -87.47441 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 -103.97768 moveto
+53.1495 -112.4816 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61874 -108.78612 moveto
+53.1495 -112.4816 lineto
+54.68027 -108.78612 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 -126.6548 moveto
+53.1495 -135.15872 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61874 -131.46324 moveto
+53.1495 -135.15872 lineto
+54.68027 -131.46324 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 -147.9665 moveto
+53.1495 -156.47043 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61874 -152.77495 moveto
+53.1495 -156.47043 lineto
+54.68027 -152.77495 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 -170.64363 moveto
+53.1495 -179.14755 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61874 -175.45207 moveto
+53.1495 -179.14755 lineto
+54.68027 -175.45207 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 -191.95534 moveto
+53.1495 -200.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61874 -196.76378 moveto
+53.1495 -200.45926 lineto
+54.68027 -196.76378 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 53.1495 -216.45926 moveto
+53.1495 -230.63246 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 51.61873 -226.93692 moveto
+53.1495 -230.63246 lineto
+54.68028 -226.93692 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 14.1732 moveto
+74.40932 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87854 3.69554 moveto
+74.40932 0 lineto
+75.9401 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 -16 moveto
+74.40932 -24.50392 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87856 -20.80844 moveto
+74.40932 -24.50392 lineto
+75.94008 -20.80844 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 -38.67712 moveto
+74.40932 -47.18105 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87856 -43.48557 moveto
+74.40932 -47.18105 lineto
+75.94008 -43.48557 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 -59.98885 moveto
+74.40932 -68.49277 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87856 -64.79729 moveto
+74.40932 -68.49277 lineto
+75.94008 -64.79729 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 -82.66597 moveto
+74.40932 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87856 -87.47441 moveto
+74.40932 -91.16989 lineto
+75.94008 -87.47441 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 -103.97768 moveto
+74.40932 -112.4816 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87856 -108.78612 moveto
+74.40932 -112.4816 lineto
+75.94008 -108.78612 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 -126.6548 moveto
+74.40932 -135.15872 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87856 -131.46324 moveto
+74.40932 -135.15872 lineto
+75.94008 -131.46324 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 -147.9665 moveto
+74.40932 -156.47043 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87856 -152.77495 moveto
+74.40932 -156.47043 lineto
+75.94008 -152.77495 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 -170.64363 moveto
+74.40932 -179.14755 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87856 -175.45207 moveto
+74.40932 -179.14755 lineto
+75.94008 -175.45207 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 -191.95534 moveto
+74.40932 -200.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87856 -196.76378 moveto
+74.40932 -200.45926 lineto
+75.94008 -196.76378 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 74.40932 -216.45926 moveto
+74.40932 -230.63246 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.87854 -226.93692 moveto
+74.40932 -230.63246 lineto
+75.9401 -226.93692 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 14.1732 moveto
+95.66911 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13834 3.69554 moveto
+95.66911 0 lineto
+97.19989 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 -16 moveto
+95.66911 -24.50392 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13835 -20.80844 moveto
+95.66911 -24.50392 lineto
+97.19987 -20.80844 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 -38.67712 moveto
+95.66911 -47.18105 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13835 -43.48557 moveto
+95.66911 -47.18105 lineto
+97.19987 -43.48557 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 -59.98885 moveto
+95.66911 -68.49277 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13835 -64.79729 moveto
+95.66911 -68.49277 lineto
+97.19987 -64.79729 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 -82.66597 moveto
+95.66911 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13835 -87.47441 moveto
+95.66911 -91.16989 lineto
+97.19987 -87.47441 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 -103.97768 moveto
+95.66911 -112.4816 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13835 -108.78612 moveto
+95.66911 -112.4816 lineto
+97.19987 -108.78612 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 -126.6548 moveto
+95.66911 -135.15872 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13835 -131.46324 moveto
+95.66911 -135.15872 lineto
+97.19987 -131.46324 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 -147.9665 moveto
+95.66911 -156.47043 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13835 -152.77495 moveto
+95.66911 -156.47043 lineto
+97.19987 -152.77495 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 -170.64363 moveto
+95.66911 -179.14755 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13835 -175.45207 moveto
+95.66911 -179.14755 lineto
+97.19987 -175.45207 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 -191.95534 moveto
+95.66911 -200.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13835 -196.76378 moveto
+95.66911 -200.45926 lineto
+97.19987 -196.76378 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 95.66911 -216.45926 moveto
+95.66911 -230.63246 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 94.13834 -226.93692 moveto
+95.66911 -230.63246 lineto
+97.19989 -226.93692 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 14.1732 moveto
+116.92892 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39815 3.69554 moveto
+116.92892 0 lineto
+118.4597 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 -16 moveto
+116.92892 -24.50392 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39816 -20.80844 moveto
+116.92892 -24.50392 lineto
+118.45969 -20.80844 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 -38.67712 moveto
+116.92892 -47.18105 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39816 -43.48557 moveto
+116.92892 -47.18105 lineto
+118.45969 -43.48557 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 -59.98885 moveto
+116.92892 -68.49277 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39816 -64.79729 moveto
+116.92892 -68.49277 lineto
+118.45969 -64.79729 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 -82.66597 moveto
+116.92892 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39816 -87.47441 moveto
+116.92892 -91.16989 lineto
+118.45969 -87.47441 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 -103.97768 moveto
+116.92892 -112.4816 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39816 -108.78612 moveto
+116.92892 -112.4816 lineto
+118.45969 -108.78612 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 -126.6548 moveto
+116.92892 -135.15872 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39816 -131.46324 moveto
+116.92892 -135.15872 lineto
+118.45969 -131.46324 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 -147.9665 moveto
+116.92892 -156.47043 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39816 -152.77495 moveto
+116.92892 -156.47043 lineto
+118.45969 -152.77495 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 -170.64363 moveto
+116.92892 -179.14755 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39816 -175.45207 moveto
+116.92892 -179.14755 lineto
+118.45969 -175.45207 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 -191.95534 moveto
+116.92892 -200.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39816 -196.76378 moveto
+116.92892 -200.45926 lineto
+118.45969 -196.76378 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 116.92892 -216.45926 moveto
+116.92892 -230.63246 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.39815 -226.93692 moveto
+116.92892 -230.63246 lineto
+118.4597 -226.93692 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 14.1732 moveto
+138.18872 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65794 3.69554 moveto
+138.18872 0 lineto
+139.7195 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 -16 moveto
+138.18872 -24.50392 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65796 -20.80844 moveto
+138.18872 -24.50392 lineto
+139.71948 -20.80844 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 -38.67712 moveto
+138.18872 -47.18105 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65796 -43.48557 moveto
+138.18872 -47.18105 lineto
+139.71948 -43.48557 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 -59.98885 moveto
+138.18872 -68.49277 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65796 -64.79729 moveto
+138.18872 -68.49277 lineto
+139.71948 -64.79729 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 -82.66597 moveto
+138.18872 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65796 -87.47441 moveto
+138.18872 -91.16989 lineto
+139.71948 -87.47441 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 -103.97768 moveto
+138.18872 -112.4816 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65796 -108.78612 moveto
+138.18872 -112.4816 lineto
+139.71948 -108.78612 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 -126.6548 moveto
+138.18872 -135.15872 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65796 -131.46324 moveto
+138.18872 -135.15872 lineto
+139.71948 -131.46324 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 -147.9665 moveto
+138.18872 -156.47043 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65796 -152.77495 moveto
+138.18872 -156.47043 lineto
+139.71948 -152.77495 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 -170.64363 moveto
+138.18872 -179.14755 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65796 -175.45207 moveto
+138.18872 -179.14755 lineto
+139.71948 -175.45207 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 -191.95534 moveto
+138.18872 -200.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65796 -196.76378 moveto
+138.18872 -200.45926 lineto
+139.71948 -196.76378 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 138.18872 -216.45926 moveto
+138.18872 -230.63246 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 136.65794 -226.93692 moveto
+138.18872 -230.63246 lineto
+139.7195 -226.93692 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 14.1732 moveto
+159.44853 0 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91776 3.69554 moveto
+159.44853 0 lineto
+160.97931 3.69554 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 -16 moveto
+159.44853 -24.50392 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91777 -20.80844 moveto
+159.44853 -24.50392 lineto
+160.9793 -20.80844 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 -38.67712 moveto
+159.44853 -47.18105 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91777 -43.48557 moveto
+159.44853 -47.18105 lineto
+160.9793 -43.48557 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 -59.98885 moveto
+159.44853 -68.49277 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91777 -64.79729 moveto
+159.44853 -68.49277 lineto
+160.9793 -64.79729 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 -82.66597 moveto
+159.44853 -91.16989 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91777 -87.47441 moveto
+159.44853 -91.16989 lineto
+160.9793 -87.47441 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 -103.97768 moveto
+159.44853 -112.4816 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91777 -108.78612 moveto
+159.44853 -112.4816 lineto
+160.9793 -108.78612 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 -126.6548 moveto
+159.44853 -135.15872 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91777 -131.46324 moveto
+159.44853 -135.15872 lineto
+160.9793 -131.46324 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 -147.9665 moveto
+159.44853 -156.47043 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91777 -152.77495 moveto
+159.44853 -156.47043 lineto
+160.9793 -152.77495 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 -170.64363 moveto
+159.44853 -179.14755 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91777 -175.45207 moveto
+159.44853 -179.14755 lineto
+160.9793 -175.45207 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 -191.95534 moveto
+159.44853 -200.45926 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91777 -196.76378 moveto
+159.44853 -200.45926 lineto
+160.9793 -196.76378 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 159.44853 -216.45926 moveto
+159.44853 -230.63246 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 157.91776 -226.93692 moveto
+159.44853 -230.63246 lineto
+160.97931 -226.93692 lineto
+ closepath
+gsave fill grestore stroke
+65.04471 22.84248 moveto
+(Plain) cmr10 9.96265 fshow
+87.5991 22.84248 moveto
+(text) cmr10 9.96265 fshow
+showpage
+%%EOF
diff --git a/Supporting_Documentation/tex/skein-41.mps b/Supporting_Documentation/tex/skein-41.mps
new file mode 100644
index 0000000000000..5fbad2526a8ef
--- /dev/null
+++ b/Supporting_Documentation/tex/skein-41.mps
@@ -0,0 +1,349 @@
+%!PS
+%%BoundingBox: -52 -73 184 46 
+%%HiResBoundingBox: -51.70432 -72.88118 183.24184 45.3179 
+%%Creator: MetaPost 1.000
+%%CreationDate: 2008.10.19:0139
+%%Pages: 1
+%*Font: cmmi10 9.96265 9.96265 47:82
+%*Font: cmr10 9.96265 9.96265 0c:800000000eaa000000000440a38
+%*Font: cmr7 6.97385 6.97385 30:e
+%%BeginProlog
+%%EndProlog
+%%Page: 1 1
+ 0 0 0 setrgbcolor
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth [] 0 setdash
+ 1 setlinecap 1 setlinejoin 10 setmiterlimit
+newpath 3.08661 17.00784 moveto
+3.08661 17.00784 lineto
+3.08661 -17.00784 lineto
+3.08661 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -16.75587 17.00784 moveto
+-16.75587 17.00784 lineto
+3.08661 17.00784 lineto
+3.08661 17.00784 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath -16.75587 17.00784 moveto
+-16.75587 17.00784 lineto
+-16.75587 -17.00784 lineto
+-16.75587 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -16.75587 -17.00784 moveto
+-16.75587 -17.00784 lineto
+3.08661 -17.00784 lineto
+3.08661 -17.00784 lineto stroke
+newpath 15.58446 2.82857 moveto
+16.30823 2.1048 16.75589 1.10492 16.75589 0.00049 curveto
+16.75589 -0.00049 lineto
+16.75589 -1.10492 16.30823 -2.1048 15.58446 -2.82857 curveto stroke
+newpath 9.92732 2.82857 moveto
+10.6511 3.55234 11.65097 4 12.7554 4 curveto
+12.75638 4 lineto
+13.86081 4 14.86069 3.55234 15.58446 2.82857 curveto stroke
+newpath 9.92732 2.82857 moveto
+9.20355 2.1048 8.75589 1.10492 8.75589 0.00049 curveto
+8.75589 -0.00049 lineto
+8.75589 -1.10492 9.20355 -2.1048 9.92732 -2.82857 curveto stroke
+newpath 9.92732 -2.82857 moveto
+10.6511 -3.55234 11.65097 -4 12.7554 -4 curveto
+12.75638 -4 lineto
+13.86081 -4 14.86069 -3.55234 15.58446 -2.82857 curveto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 12.75589 4 moveto
+12.75589 -4 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 16.75589 0 moveto
+8.75589 0 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath -6.83464 34.01569 moveto
+-6.83464 17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -8.3654 20.70332 moveto
+-6.83464 17.00784 lineto
+-5.30388 20.70332 lineto
+ closepath
+gsave fill grestore stroke
+newpath -6.83464 22.67712 moveto
+12.75589 22.67712 lineto
+12.75589 4 lineto stroke
+newpath 11.22508 7.6956 moveto
+12.75589 4 lineto
+14.2867 7.6956 lineto
+ closepath
+gsave fill grestore stroke
+newpath -6.83464 -17.00784 moveto
+-6.83464 -22.67712 lineto
+12.75589 -22.67712 lineto
+12.75589 -4 lineto stroke
+newpath 14.2867 -7.6956 moveto
+12.75589 -4 lineto
+11.22508 -7.6956 lineto
+ closepath
+gsave fill grestore stroke
+newpath -22.42516 -34.01569 moveto
+-22.42516 -2.83464 lineto
+-16.75587 -2.83464 lineto stroke
+newpath -20.45132 -4.36539 moveto
+-16.75587 -2.83464 lineto
+-20.45132 -1.3039 lineto
+ closepath
+gsave fill grestore stroke
+newpath -16.75587 5.66928 moveto
+-11.0866 0 lineto
+-16.75587 -5.66928 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 79.11798 17.00784 moveto
+79.11798 17.00784 lineto
+79.11798 -17.00784 lineto
+79.11798 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 59.2755 17.00784 moveto
+59.2755 17.00784 lineto
+79.11798 17.00784 lineto
+79.11798 17.00784 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 59.2755 17.00784 moveto
+59.2755 17.00784 lineto
+59.2755 -17.00784 lineto
+59.2755 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 59.2755 -17.00784 moveto
+59.2755 -17.00784 lineto
+79.11798 -17.00784 lineto
+79.11798 -17.00784 lineto stroke
+newpath 91.61583 2.82857 moveto
+92.3396 2.1048 92.78726 1.10492 92.78726 0.00049 curveto
+92.78726 -0.00049 lineto
+92.78726 -1.10492 92.3396 -2.1048 91.61583 -2.82857 curveto stroke
+newpath 85.9587 2.82857 moveto
+86.68246 3.55234 87.68234 4 88.78677 4 curveto
+88.78775 4 lineto
+89.89218 4 90.89206 3.55234 91.61583 2.82857 curveto stroke
+newpath 85.9587 2.82857 moveto
+85.23492 2.1048 84.78726 1.10492 84.78726 0.00049 curveto
+84.78726 -0.00049 lineto
+84.78726 -1.10492 85.23492 -2.1048 85.9587 -2.82857 curveto stroke
+newpath 85.9587 -2.82857 moveto
+86.68246 -3.55234 87.68234 -4 88.78677 -4 curveto
+88.78775 -4 lineto
+89.89218 -4 90.89206 -3.55234 91.61583 -2.82857 curveto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 88.78726 4 moveto
+88.78726 -4 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 92.78726 0 moveto
+84.78726 0 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 69.19673 34.01569 moveto
+69.19673 17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 67.66597 20.70332 moveto
+69.19673 17.00784 lineto
+70.7275 20.70332 lineto
+ closepath
+gsave fill grestore stroke
+newpath 69.19673 22.67712 moveto
+88.78726 22.67712 lineto
+88.78726 4 lineto stroke
+newpath 87.25645 7.6956 moveto
+88.78726 4 lineto
+90.31807 7.6956 lineto
+ closepath
+gsave fill grestore stroke
+newpath 69.19673 -17.00784 moveto
+69.19673 -22.67712 lineto
+88.78726 -22.67712 lineto
+88.78726 -4 lineto stroke
+newpath 90.31807 -7.6956 moveto
+88.78726 -4 lineto
+87.25645 -7.6956 lineto
+ closepath
+gsave fill grestore stroke
+newpath 53.60622 -34.01569 moveto
+53.60622 -2.83464 lineto
+59.2755 -2.83464 lineto stroke
+newpath 55.58005 -4.36539 moveto
+59.2755 -2.83464 lineto
+55.58005 -1.3039 lineto
+ closepath
+gsave fill grestore stroke
+newpath 59.2755 5.66928 moveto
+64.94478 0 lineto
+59.2755 -5.66928 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 155.14935 17.00784 moveto
+155.14935 17.00784 lineto
+155.14935 -17.00784 lineto
+155.14935 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 135.30687 17.00784 moveto
+135.30687 17.00784 lineto
+155.14935 17.00784 lineto
+155.14935 17.00784 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 135.30687 17.00784 moveto
+135.30687 17.00784 lineto
+135.30687 -17.00784 lineto
+135.30687 -17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 135.30687 -17.00784 moveto
+135.30687 -17.00784 lineto
+155.14935 -17.00784 lineto
+155.14935 -17.00784 lineto stroke
+newpath 167.6472 2.82857 moveto
+168.37097 2.1048 168.81863 1.10492 168.81863 0.00049 curveto
+168.81863 -0.00049 lineto
+168.81863 -1.10492 168.37097 -2.1048 167.6472 -2.82857 curveto stroke
+newpath 161.99007 2.82857 moveto
+162.71384 3.55234 163.71371 4 164.81815 4 curveto
+164.81912 4 lineto
+165.92355 4 166.92343 3.55234 167.6472 2.82857 curveto stroke
+newpath 161.99007 2.82857 moveto
+161.2663 2.1048 160.81863 1.10492 160.81863 0.00049 curveto
+160.81863 -0.00049 lineto
+160.81863 -1.10492 161.2663 -2.1048 161.99007 -2.82857 curveto stroke
+newpath 161.99007 -2.82857 moveto
+162.71384 -3.55234 163.71371 -4 164.81815 -4 curveto
+164.81912 -4 lineto
+165.92355 -4 166.92343 -3.55234 167.6472 -2.82857 curveto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 164.81863 4 moveto
+164.81863 -4 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 168.81863 0 moveto
+160.81863 0 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 145.2281 34.01569 moveto
+145.2281 17.00784 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 143.69734 20.70332 moveto
+145.2281 17.00784 lineto
+146.75887 20.70332 lineto
+ closepath
+gsave fill grestore stroke
+newpath 145.2281 22.67712 moveto
+164.81863 22.67712 lineto
+164.81863 4 lineto stroke
+newpath 163.28783 7.6956 moveto
+164.81863 4 lineto
+166.34944 7.6956 lineto
+ closepath
+gsave fill grestore stroke
+newpath 145.2281 -17.00784 moveto
+145.2281 -22.67712 lineto
+164.81863 -22.67712 lineto
+164.81863 -4 lineto stroke
+newpath 166.34944 -7.6956 moveto
+164.81863 -4 lineto
+163.28783 -7.6956 lineto
+ closepath
+gsave fill grestore stroke
+newpath 129.63759 -34.01569 moveto
+129.63759 -2.83464 lineto
+135.30687 -2.83464 lineto stroke
+newpath 131.61142 -4.36539 moveto
+135.30687 -2.83464 lineto
+131.61142 -1.3039 lineto
+ closepath
+gsave fill grestore stroke
+newpath 135.30687 5.66928 moveto
+140.97615 0 lineto
+135.30687 -5.66928 lineto stroke
+newpath 16.75589 0 moveto
+59.2755 0 lineto stroke
+newpath 55.58014 -1.5307 moveto
+59.2755 0 lineto
+55.58014 1.5307 lineto
+ closepath
+gsave fill grestore stroke
+newpath 92.78726 0 moveto
+135.30687 0 lineto stroke
+newpath 131.61151 -1.5307 moveto
+135.30687 0 lineto
+131.61151 1.5307 lineto
+ closepath
+gsave fill grestore stroke
+newpath 168.81863 0 moveto
+182.99184 0 lineto stroke
+newpath 179.2963 -1.53078 moveto
+182.99184 0 lineto
+179.2963 1.53078 lineto
+ closepath
+gsave fill grestore stroke
+newpath -30.92908 0 moveto
+-16.75587 0 lineto stroke
+newpath -20.45142 -1.53078 moveto
+-16.75587 0 lineto
+-20.45142 1.53078 lineto
+ closepath
+gsave fill grestore stroke
+-41.76218 -3.4039 moveto
+(G) cmmi10 9.96265 fshow
+-13.9019 38.5101 moveto
+(M) cmmi10 9.96265 fshow
+-4.2367 37.0157 moveto
+(0) cmr7 6.97385 fshow
+62.12947 38.5101 moveto
+(M) cmmi10 9.96265 fshow
+71.79468 37.0157 moveto
+(1) cmr7 6.97385 fshow
+138.16084 38.5101 moveto
+(M) cmmi10 9.96265 fshow
+147.82605 37.0157 moveto
+(2) cmr7 6.97385 fshow
+-37.53522 -45.484 moveto
+(len) cmr10 9.96265 fshow
+-23.80882 -45.484 moveto
+(:) cmr10 9.96265 fshow
+-20.04512 -45.484 moveto
+(64) cmr10 9.96265 fshow
+-42.04611 -57.43909 moveto
+(\014rst) cmr10 9.96265 fshow
+-23.80882 -57.43909 moveto
+(:) cmr10 9.96265 fshow
+-20.04512 -57.43909 moveto
+(1) cmr10 9.96265 fshow
+-43.62352 -69.39429 moveto
+(\014nal) cmr10 9.96265 fshow
+-23.80882 -69.39429 moveto
+(:) cmr10 9.96265 fshow
+-20.04512 -69.39429 moveto
+(0) cmr10 9.96265 fshow
+38.49615 -45.484 moveto
+(len) cmr10 9.96265 fshow
+52.22255 -45.484 moveto
+(:) cmr10 9.96265 fshow
+55.98625 -45.484 moveto
+(128) cmr10 9.96265 fshow
+33.98526 -57.43909 moveto
+(\014rst) cmr10 9.96265 fshow
+52.22255 -57.43909 moveto
+(:) cmr10 9.96265 fshow
+55.98625 -57.43909 moveto
+(0) cmr10 9.96265 fshow
+32.40785 -69.39429 moveto
+(\014nal) cmr10 9.96265 fshow
+52.22255 -69.39429 moveto
+(:) cmr10 9.96265 fshow
+55.98625 -69.39429 moveto
+(0) cmr10 9.96265 fshow
+114.52753 -45.484 moveto
+(len) cmr10 9.96265 fshow
+128.25392 -45.484 moveto
+(:) cmr10 9.96265 fshow
+132.01762 -45.484 moveto
+(166) cmr10 9.96265 fshow
+110.01663 -57.43909 moveto
+(\014rst) cmr10 9.96265 fshow
+128.25392 -57.43909 moveto
+(:) cmr10 9.96265 fshow
+132.01762 -57.43909 moveto
+(0) cmr10 9.96265 fshow
+108.43922 -69.39429 moveto
+(\014nal) cmr10 9.96265 fshow
+128.25392 -69.39429 moveto
+(:) cmr10 9.96265 fshow
+132.01762 -69.39429 moveto
+(1) cmr10 9.96265 fshow
+showpage
+%%EOF
diff --git a/Supporting_Documentation/tex/skein-42.mps b/Supporting_Documentation/tex/skein-42.mps
new file mode 100644
index 0000000000000..bd1a4efc78217
--- /dev/null
+++ b/Supporting_Documentation/tex/skein-42.mps
@@ -0,0 +1,163 @@
+%!PS
+%%BoundingBox: -192 -64 192 15 
+%%HiResBoundingBox: -191.58823 -63.9553 191.58824 14.4232 
+%%Creator: MetaPost 1.000
+%%CreationDate: 2008.10.19:0139
+%%Pages: 1
+%*Font: cmr10 9.96265 9.96265 0d:800000001c24044022000fbb77ca
+%%BeginProlog
+%%EndProlog
+%%Page: 1 1
+ 0 0 0 setrgbcolor
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth [] 0 setdash
+ 1 setlinecap 1 setlinejoin 10 setmiterlimit
+newpath 191.33824 14.1732 moveto
+191.33824 14.1732 lineto
+191.33824 -14.1732 lineto
+191.33824 -14.1732 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -191.33823 14.1732 moveto
+-191.33823 14.1732 lineto
+191.33824 14.1732 lineto
+191.33824 14.1732 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath -191.33823 14.1732 moveto
+-191.33823 14.1732 lineto
+-191.33823 -14.1732 lineto
+-191.33823 -14.1732 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -191.33823 -14.1732 moveto
+-191.33823 -14.1732 lineto
+191.33824 -14.1732 lineto
+191.33824 -14.1732 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath -167.42094 14.1732 moveto
+-167.42094 11.3384 lineto stroke
+newpath -167.42094 -14.1732 moveto
+-167.42094 -11.3384 lineto stroke
+newpath -143.50366 14.1732 moveto
+-143.50366 11.3384 lineto stroke
+newpath -143.50366 -14.1732 moveto
+-143.50366 -11.3384 lineto stroke
+newpath -119.5864 14.1732 moveto
+-119.5864 11.3384 lineto stroke
+newpath -119.5864 -14.1732 moveto
+-119.5864 -11.3384 lineto stroke
+newpath -95.66911 14.1732 moveto
+-95.66911 11.3384 lineto stroke
+newpath -95.66911 -14.1732 moveto
+-95.66911 -11.3384 lineto stroke
+newpath -71.75183 14.1732 moveto
+-71.75183 11.3384 lineto stroke
+newpath -71.75183 -14.1732 moveto
+-71.75183 -11.3384 lineto stroke
+newpath -47.83455 14.1732 moveto
+-47.83455 11.3384 lineto stroke
+newpath -47.83455 -14.1732 moveto
+-47.83455 -11.3384 lineto stroke
+newpath -23.91727 14.1732 moveto
+-23.91727 11.3384 lineto stroke
+newpath -23.91727 -14.1732 moveto
+-23.91727 -11.3384 lineto stroke
+newpath 0.00002 14.1732 moveto
+0.00002 11.3384 lineto stroke
+newpath 0.00002 -14.1732 moveto
+0.00002 -11.3384 lineto stroke
+newpath 23.91728 14.1732 moveto
+23.91728 11.3384 lineto stroke
+newpath 23.91728 -14.1732 moveto
+23.91728 -11.3384 lineto stroke
+newpath 47.83456 14.1732 moveto
+47.83456 11.3384 lineto stroke
+newpath 47.83456 -14.1732 moveto
+47.83456 -11.3384 lineto stroke
+newpath 71.75185 14.1732 moveto
+71.75185 11.3384 lineto stroke
+newpath 71.75185 -14.1732 moveto
+71.75185 -11.3384 lineto stroke
+newpath 95.66913 14.1732 moveto
+95.66913 11.3384 lineto stroke
+newpath 95.66913 -14.1732 moveto
+95.66913 -11.3384 lineto stroke
+newpath 119.58641 14.1732 moveto
+119.58641 11.3384 lineto stroke
+newpath 119.58641 -14.1732 moveto
+119.58641 -11.3384 lineto stroke
+newpath 143.50368 14.1732 moveto
+143.50368 11.3384 lineto stroke
+newpath 143.50368 -14.1732 moveto
+143.50368 -11.3384 lineto stroke
+newpath 167.42096 14.1732 moveto
+167.42096 11.3384 lineto stroke
+newpath 167.42096 -14.1732 moveto
+167.42096 -11.3384 lineto stroke
+newpath 95.66913 14.1732 moveto
+95.66913 -14.1732 lineto stroke
+newpath 143.50368 14.1732 moveto
+143.50368 -14.1732 lineto stroke
+newpath 167.42096 14.1732 moveto
+167.42096 -14.1732 lineto stroke
+-54.1996 -3.45926 moveto
+(len) cmr10 9.96265 fshow
+101.81966 -3.45926 moveto
+(reserv) cmr10 9.96265 fshow
+127.39046 -3.45926 moveto
+(ed) cmr10 9.96265 fshow
+newpath 155.46233 0 moveto
+155.46233 -48.18889 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 152.62769 -48.18889 moveto
+155.46233 -48.18889 lineto stroke
+newpath 152.62769 -36.85033 moveto
+155.46233 -36.85033 lineto stroke
+newpath 152.62769 -25.51176 moveto
+155.46233 -25.51176 lineto stroke
+newpath 179.37961 0 moveto
+179.37961 -59.52745 lineto
+176.54497 -59.52745 lineto stroke
+80.58089 -28.00243 moveto
+(Final) cmr10 9.96265 fshow
+106.45619 -28.00243 moveto
+(\015ag:) cmr10 9.96265 fshow
+129.14888 -28.00243 moveto
+(bit) cmr10 9.96265 fshow
+144.6464 -28.00243 moveto
+(0) cmr10 9.96265 fshow
+38.37799 -39.34099 moveto
+(Odd) cmr10 9.96265 fshow
+60.51729 -39.34099 moveto
+(bit) cmr10 9.96265 fshow
+76.0147 -39.34099 moveto
+(length) cmr10 9.96265 fshow
+106.45619 -39.34099 moveto
+(\015ag:) cmr10 9.96265 fshow
+129.1489 -39.34099 moveto
+(bit) cmr10 9.96265 fshow
+144.6464 -39.34099 moveto
+(1) cmr10 9.96265 fshow
+65.08339 -50.67955 moveto
+(Blo) cmr10 9.96265 fshow
+80.16579 -50.67955 moveto
+(c) cmr10 9.96265 fshow
+84.3169 -50.67955 moveto
+(k) cmr10 9.96265 fshow
+92.89578 -50.67955 moveto
+(t) cmr10 9.96265 fshow
+96.49348 -50.67955 moveto
+(yp) cmr10 9.96265 fshow
+107.5631 -50.67955 moveto
+(e:) cmr10 9.96265 fshow
+119.18619 -50.67955 moveto
+(bit) cmr10 9.96265 fshow
+134.68369 -50.67955 moveto
+(2{7) cmr10 9.96265 fshow
+106.54607 -62.01811 moveto
+(Subk) cmr10 9.96265 fshow
+128.13187 -62.01811 moveto
+(ey) cmr10 9.96265 fshow
+141.13867 -62.01811 moveto
+(coun) cmr10 9.96265 fshow
+161.34067 -62.01811 moveto
+(ter) cmr10 9.96265 fshow
+showpage
+%%EOF
diff --git a/Supporting_Documentation/tex/skein-51.mps b/Supporting_Documentation/tex/skein-51.mps
new file mode 100644
index 0000000000000..af26532fd942f
--- /dev/null
+++ b/Supporting_Documentation/tex/skein-51.mps
@@ -0,0 +1,200 @@
+%!PS
+%%BoundingBox: -37 -41 143 38 
+%%HiResBoundingBox: -36.3277 -40.46686 142.66476 37.36746 
+%%Creator: MetaPost 1.000
+%%CreationDate: 2008.10.19:0139
+%%Pages: 1
+%*Font: cmr10 9.96265 9.96265 0c:80000000080203045040047039c4
+%%BeginProlog
+%%EndProlog
+%%Page: 1 1
+ 0 0 0 setrgbcolor
+-9.06326 -3.4039 moveto
+(UBI) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth [] 0 setdash
+ 1 setlinecap 1 setlinejoin 10 setmiterlimit
+newpath 14.1732 11.33856 moveto
+14.1732 11.33856 lineto
+14.1732 -11.33856 lineto
+14.1732 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -14.1732 11.33856 moveto
+-14.1732 11.33856 lineto
+14.1732 11.33856 lineto
+14.1732 11.33856 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath -14.1732 11.33856 moveto
+-14.1732 11.33856 lineto
+-14.1732 -11.33856 lineto
+-14.1732 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -14.1732 -11.33856 moveto
+-14.1732 -11.33856 lineto
+14.1732 -11.33856 lineto
+14.1732 -11.33856 lineto stroke
+47.62955 -3.4039 moveto
+(UBI) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 70.86601 11.33856 moveto
+70.86601 11.33856 lineto
+70.86601 -11.33856 lineto
+70.86601 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 42.5196 11.33856 moveto
+42.5196 11.33856 lineto
+70.86601 11.33856 lineto
+70.86601 11.33856 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 42.5196 11.33856 moveto
+42.5196 11.33856 lineto
+42.5196 -11.33856 lineto
+42.5196 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 42.5196 -11.33856 moveto
+42.5196 -11.33856 lineto
+70.86601 -11.33856 lineto
+70.86601 -11.33856 lineto stroke
+104.32236 -3.4039 moveto
+(UBI) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 127.55882 11.33856 moveto
+127.55882 11.33856 lineto
+127.55882 -11.33856 lineto
+127.55882 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 99.21242 11.33856 moveto
+99.21242 11.33856 lineto
+127.55882 11.33856 lineto
+127.55882 11.33856 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 99.21242 11.33856 moveto
+99.21242 11.33856 lineto
+99.21242 -11.33856 lineto
+99.21242 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 99.21242 -11.33856 moveto
+99.21242 -11.33856 lineto
+127.55882 -11.33856 lineto
+127.55882 -11.33856 lineto stroke
+newpath -28.3464 0 moveto
+-14.1732 0 lineto stroke
+newpath -17.86874 -1.53078 moveto
+-14.1732 0 lineto
+-17.86874 1.53078 lineto
+ closepath
+gsave fill grestore stroke
+newpath 14.1732 0 moveto
+42.5196 0 lineto stroke
+newpath 38.82405 -1.53079 moveto
+42.5196 0 lineto
+38.82405 1.53079 lineto
+ closepath
+gsave fill grestore stroke
+newpath 70.86601 0 moveto
+99.21242 0 lineto stroke
+newpath 95.51686 -1.53079 moveto
+99.21242 0 lineto
+95.51686 1.53079 lineto
+ closepath
+gsave fill grestore stroke
+newpath 127.55882 0 moveto
+141.73203 0 lineto stroke
+newpath 138.03648 -1.53078 moveto
+141.73203 0 lineto
+138.03648 1.53078 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 25.51176 moveto
+0 11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -1.53078 15.0341 moveto
+0 11.33856 lineto
+1.53078 15.0341 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 56.69281 25.51176 moveto
+56.69281 11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 55.16203 15.0341 moveto
+56.69281 11.33856 lineto
+58.22359 15.0341 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 113.38562 25.51176 moveto
+113.38562 11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 111.85484 15.0341 moveto
+113.38562 11.33856 lineto
+114.9164 15.0341 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 -25.51176 moveto
+0 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 1.53078 -15.0341 moveto
+0 -11.33856 lineto
+-1.53078 -15.0341 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 56.69281 -25.51176 moveto
+56.69281 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 58.22359 -15.0341 moveto
+56.69281 -11.33856 lineto
+55.16203 -15.0341 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 113.38562 -25.51176 moveto
+113.38562 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 114.9164 -15.0341 moveto
+113.38562 -11.33856 lineto
+111.85484 -15.0341 lineto
+ closepath
+gsave fill grestore stroke
+-36.3277 -3.2102 moveto
+(0) cmr10 9.96265 fshow
+-14.1138 30.44896 moveto
+(Con\014g) cmr10 9.96265 fshow
+38.7877 30.44896 moveto
+(Message) cmr10 9.96265 fshow
+110.89497 28.51176 moveto
+(0) cmr10 9.96265 fshow
+-21.47516 -36.97997 moveto
+(t) cmr10 9.96265 fshow
+-17.87756 -36.97997 moveto
+(yp) cmr10 9.96265 fshow
+-6.80786 -36.97997 moveto
+(e) cmr10 9.96265 fshow
+-1.38367 -36.97997 moveto
+(:) cmr10 9.96265 fshow
+2.38004 -36.97997 moveto
+(Cfg) cmr10 9.96265 fshow
+35.21765 -36.97997 moveto
+(t) cmr10 9.96265 fshow
+38.81525 -36.97997 moveto
+(yp) cmr10 9.96265 fshow
+49.88495 -36.97997 moveto
+(e) cmr10 9.96265 fshow
+55.30914 -36.97997 moveto
+(:) cmr10 9.96265 fshow
+59.07285 -36.97997 moveto
+(Msg) cmr10 9.96265 fshow
+91.91046 -36.97997 moveto
+(t) cmr10 9.96265 fshow
+95.50806 -36.97997 moveto
+(yp) cmr10 9.96265 fshow
+106.57776 -36.97997 moveto
+(e) cmr10 9.96265 fshow
+112.00195 -36.97997 moveto
+(:) cmr10 9.96265 fshow
+115.76566 -36.97997 moveto
+(Out) cmr10 9.96265 fshow
+showpage
+%%EOF
diff --git a/Supporting_Documentation/tex/skein-52.mps b/Supporting_Documentation/tex/skein-52.mps
new file mode 100644
index 0000000000000..5c5ad6a7137de
--- /dev/null
+++ b/Supporting_Documentation/tex/skein-52.mps
@@ -0,0 +1,334 @@
+%!PS
+%%BoundingBox: -37 -205 143 38 
+%%HiResBoundingBox: -36.3277 -204.876 142.66476 37.36746 
+%%Creator: MetaPost 1.000
+%%CreationDate: 2008.10.19:0139
+%%Pages: 1
+%*Font: cmr10 9.96265 9.96265 0c:800000000e0203045040047039c4
+%%BeginProlog
+%%EndProlog
+%%Page: 1 1
+ 0 0 0 setrgbcolor
+-9.06326 -3.4039 moveto
+(UBI) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth [] 0 setdash
+ 1 setlinecap 1 setlinejoin 10 setmiterlimit
+newpath 14.1732 11.33856 moveto
+14.1732 11.33856 lineto
+14.1732 -11.33856 lineto
+14.1732 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -14.1732 11.33856 moveto
+-14.1732 11.33856 lineto
+14.1732 11.33856 lineto
+14.1732 11.33856 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath -14.1732 11.33856 moveto
+-14.1732 11.33856 lineto
+-14.1732 -11.33856 lineto
+-14.1732 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -14.1732 -11.33856 moveto
+-14.1732 -11.33856 lineto
+14.1732 -11.33856 lineto
+14.1732 -11.33856 lineto stroke
+47.62955 -3.4039 moveto
+(UBI) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 70.86601 11.33856 moveto
+70.86601 11.33856 lineto
+70.86601 -11.33856 lineto
+70.86601 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 42.5196 11.33856 moveto
+42.5196 11.33856 lineto
+70.86601 11.33856 lineto
+70.86601 11.33856 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 42.5196 11.33856 moveto
+42.5196 11.33856 lineto
+42.5196 -11.33856 lineto
+42.5196 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 42.5196 -11.33856 moveto
+42.5196 -11.33856 lineto
+70.86601 -11.33856 lineto
+70.86601 -11.33856 lineto stroke
+104.32236 -3.4039 moveto
+(UBI) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 127.55882 11.33856 moveto
+127.55882 11.33856 lineto
+127.55882 -11.33856 lineto
+127.55882 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 99.21242 11.33856 moveto
+99.21242 11.33856 lineto
+127.55882 11.33856 lineto
+127.55882 11.33856 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 99.21242 11.33856 moveto
+99.21242 11.33856 lineto
+99.21242 -11.33856 lineto
+99.21242 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 99.21242 -11.33856 moveto
+99.21242 -11.33856 lineto
+127.55882 -11.33856 lineto
+127.55882 -11.33856 lineto stroke
+104.32236 -85.60847 moveto
+(UBI) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 127.55882 -70.86601 moveto
+127.55882 -70.86601 lineto
+127.55882 -93.54314 lineto
+127.55882 -93.54314 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 99.21242 -70.86601 moveto
+99.21242 -70.86601 lineto
+127.55882 -70.86601 lineto
+127.55882 -70.86601 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 99.21242 -70.86601 moveto
+99.21242 -70.86601 lineto
+99.21242 -93.54314 lineto
+99.21242 -93.54314 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 99.21242 -93.54314 moveto
+99.21242 -93.54314 lineto
+127.55882 -93.54314 lineto
+127.55882 -93.54314 lineto stroke
+104.32236 -167.81305 moveto
+(UBI) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 127.55882 -153.07059 moveto
+127.55882 -153.07059 lineto
+127.55882 -175.74771 lineto
+127.55882 -175.74771 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 99.21242 -153.07059 moveto
+99.21242 -153.07059 lineto
+127.55882 -153.07059 lineto
+127.55882 -153.07059 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 99.21242 -153.07059 moveto
+99.21242 -153.07059 lineto
+99.21242 -175.74771 lineto
+99.21242 -175.74771 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 99.21242 -175.74771 moveto
+99.21242 -175.74771 lineto
+127.55882 -175.74771 lineto
+127.55882 -175.74771 lineto stroke
+newpath -28.3464 0 moveto
+-14.1732 0 lineto stroke
+newpath -17.86874 -1.53078 moveto
+-14.1732 0 lineto
+-17.86874 1.53078 lineto
+ closepath
+gsave fill grestore stroke
+newpath 14.1732 0 moveto
+42.5196 0 lineto stroke
+newpath 38.82405 -1.53079 moveto
+42.5196 0 lineto
+38.82405 1.53079 lineto
+ closepath
+gsave fill grestore stroke
+newpath 70.86601 0 moveto
+99.21242 0 lineto stroke
+newpath 95.51686 -1.53079 moveto
+99.21242 0 lineto
+95.51686 1.53079 lineto
+ closepath
+gsave fill grestore stroke
+newpath 127.55882 0 moveto
+141.73203 0 lineto stroke
+newpath 138.03648 -1.53078 moveto
+141.73203 0 lineto
+138.03648 1.53078 lineto
+ closepath
+gsave fill grestore stroke
+newpath 127.55882 -82.20457 moveto
+141.73203 -82.20457 lineto stroke
+newpath 138.03648 -83.73535 moveto
+141.73203 -82.20457 lineto
+138.03648 -80.6738 lineto
+ closepath
+gsave fill grestore stroke
+newpath 127.55882 -164.40915 moveto
+141.73203 -164.40915 lineto stroke
+newpath 138.03648 -165.93993 moveto
+141.73203 -164.40915 lineto
+138.03648 -162.87837 lineto
+ closepath
+gsave fill grestore stroke
+newpath 85.03922 0 moveto
+85.03922 -82.20457 lineto
+99.21242 -82.20457 lineto stroke
+newpath 95.51688 -83.73535 moveto
+99.21242 -82.20457 lineto
+95.51688 -80.6738 lineto
+ closepath
+gsave fill grestore stroke
+newpath 85.03922 0 moveto
+85.03922 -164.40915 lineto
+99.21242 -164.40915 lineto stroke
+newpath 95.51688 -165.93993 moveto
+99.21242 -164.40915 lineto
+95.51688 -162.87837 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 25.51176 moveto
+0 11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -1.53078 15.0341 moveto
+0 11.33856 lineto
+1.53078 15.0341 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 56.69281 25.51176 moveto
+56.69281 11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 55.16203 15.0341 moveto
+56.69281 11.33856 lineto
+58.22359 15.0341 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 113.38562 25.51176 moveto
+113.38562 11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 111.85484 15.0341 moveto
+113.38562 11.33856 lineto
+114.9164 15.0341 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 113.38562 -56.69281 moveto
+113.38562 -70.86601 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 111.85484 -67.17047 moveto
+113.38562 -70.86601 lineto
+114.9164 -67.17047 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 113.38562 -138.89738 moveto
+113.38562 -153.07059 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 111.85484 -149.37505 moveto
+113.38562 -153.07059 lineto
+114.9164 -149.37505 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 -25.51176 moveto
+0 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 1.53078 -15.0341 moveto
+0 -11.33856 lineto
+-1.53078 -15.0341 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 56.69281 -25.51176 moveto
+56.69281 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 58.22359 -15.0341 moveto
+56.69281 -11.33856 lineto
+55.16203 -15.0341 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 113.38562 -25.51176 moveto
+113.38562 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 114.9164 -15.0341 moveto
+113.38562 -11.33856 lineto
+111.85484 -15.0341 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 113.38562 -107.71634 moveto
+113.38562 -93.54314 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 114.9164 -97.23868 moveto
+113.38562 -93.54314 lineto
+111.85484 -97.23868 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 113.38562 -189.92091 moveto
+113.38562 -175.74771 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 114.9164 -179.44325 moveto
+113.38562 -175.74771 lineto
+111.85484 -179.44325 lineto
+ closepath
+gsave fill grestore stroke
+-36.3277 -3.2102 moveto
+(0) cmr10 9.96265 fshow
+-14.1138 30.44896 moveto
+(Con\014g) cmr10 9.96265 fshow
+38.7877 30.44896 moveto
+(Message) cmr10 9.96265 fshow
+110.89497 28.51176 moveto
+(0) cmr10 9.96265 fshow
+110.89497 -53.69281 moveto
+(1) cmr10 9.96265 fshow
+110.89497 -135.89738 moveto
+(2) cmr10 9.96265 fshow
+-21.47516 -36.97997 moveto
+(t) cmr10 9.96265 fshow
+-17.87756 -36.97997 moveto
+(yp) cmr10 9.96265 fshow
+-6.80786 -36.97997 moveto
+(e) cmr10 9.96265 fshow
+-1.38367 -36.97997 moveto
+(:) cmr10 9.96265 fshow
+2.38004 -36.97997 moveto
+(Cfg) cmr10 9.96265 fshow
+35.21765 -36.97997 moveto
+(t) cmr10 9.96265 fshow
+38.81525 -36.97997 moveto
+(yp) cmr10 9.96265 fshow
+49.88495 -36.97997 moveto
+(e) cmr10 9.96265 fshow
+55.30914 -36.97997 moveto
+(:) cmr10 9.96265 fshow
+59.07285 -36.97997 moveto
+(Msg) cmr10 9.96265 fshow
+91.91046 -36.97997 moveto
+(t) cmr10 9.96265 fshow
+95.50806 -36.97997 moveto
+(yp) cmr10 9.96265 fshow
+106.57776 -36.97997 moveto
+(e) cmr10 9.96265 fshow
+112.00195 -36.97997 moveto
+(:) cmr10 9.96265 fshow
+115.76566 -36.97997 moveto
+(Out) cmr10 9.96265 fshow
+91.91046 -119.18454 moveto
+(t) cmr10 9.96265 fshow
+95.50806 -119.18454 moveto
+(yp) cmr10 9.96265 fshow
+106.57776 -119.18454 moveto
+(e) cmr10 9.96265 fshow
+112.00195 -119.18454 moveto
+(:) cmr10 9.96265 fshow
+115.76566 -119.18454 moveto
+(Out) cmr10 9.96265 fshow
+91.91046 -201.38911 moveto
+(t) cmr10 9.96265 fshow
+95.50806 -201.38911 moveto
+(yp) cmr10 9.96265 fshow
+106.57776 -201.38911 moveto
+(e) cmr10 9.96265 fshow
+112.00195 -201.38911 moveto
+(:) cmr10 9.96265 fshow
+115.76566 -201.38911 moveto
+(Out) cmr10 9.96265 fshow
+showpage
+%%EOF
diff --git a/Supporting_Documentation/tex/skein-53.mps b/Supporting_Documentation/tex/skein-53.mps
new file mode 100644
index 0000000000000..0c194db798fc5
--- /dev/null
+++ b/Supporting_Documentation/tex/skein-53.mps
@@ -0,0 +1,259 @@
+%!PS
+%%BoundingBox: -37 -41 200 38 
+%%HiResBoundingBox: -36.3277 -40.46686 199.35757 37.36746 
+%%Creator: MetaPost 1.000
+%%CreationDate: 2008.10.19:0139
+%%Pages: 1
+%*Font: cmr10 9.96265 9.96265 0c:80000000080203055040047039c4
+%%BeginProlog
+%%EndProlog
+%%Page: 1 1
+ 0 0 0 setrgbcolor
+-9.06326 -3.4039 moveto
+(UBI) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth [] 0 setdash
+ 1 setlinecap 1 setlinejoin 10 setmiterlimit
+newpath 14.1732 11.33856 moveto
+14.1732 11.33856 lineto
+14.1732 -11.33856 lineto
+14.1732 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -14.1732 11.33856 moveto
+-14.1732 11.33856 lineto
+14.1732 11.33856 lineto
+14.1732 11.33856 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath -14.1732 11.33856 moveto
+-14.1732 11.33856 lineto
+-14.1732 -11.33856 lineto
+-14.1732 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -14.1732 -11.33856 moveto
+-14.1732 -11.33856 lineto
+14.1732 -11.33856 lineto
+14.1732 -11.33856 lineto stroke
+47.62955 -3.4039 moveto
+(UBI) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 70.86601 11.33856 moveto
+70.86601 11.33856 lineto
+70.86601 -11.33856 lineto
+70.86601 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 42.5196 11.33856 moveto
+42.5196 11.33856 lineto
+70.86601 11.33856 lineto
+70.86601 11.33856 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 42.5196 11.33856 moveto
+42.5196 11.33856 lineto
+42.5196 -11.33856 lineto
+42.5196 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 42.5196 -11.33856 moveto
+42.5196 -11.33856 lineto
+70.86601 -11.33856 lineto
+70.86601 -11.33856 lineto stroke
+104.32236 -3.4039 moveto
+(UBI) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 127.55882 11.33856 moveto
+127.55882 11.33856 lineto
+127.55882 -11.33856 lineto
+127.55882 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 99.21242 11.33856 moveto
+99.21242 11.33856 lineto
+127.55882 11.33856 lineto
+127.55882 11.33856 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 99.21242 11.33856 moveto
+99.21242 11.33856 lineto
+99.21242 -11.33856 lineto
+99.21242 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 99.21242 -11.33856 moveto
+99.21242 -11.33856 lineto
+127.55882 -11.33856 lineto
+127.55882 -11.33856 lineto stroke
+161.01517 -3.4039 moveto
+(UBI) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 184.25163 11.33856 moveto
+184.25163 11.33856 lineto
+184.25163 -11.33856 lineto
+184.25163 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 155.90523 11.33856 moveto
+155.90523 11.33856 lineto
+184.25163 11.33856 lineto
+184.25163 11.33856 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 155.90523 11.33856 moveto
+155.90523 11.33856 lineto
+155.90523 -11.33856 lineto
+155.90523 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 155.90523 -11.33856 moveto
+155.90523 -11.33856 lineto
+184.25163 -11.33856 lineto
+184.25163 -11.33856 lineto stroke
+newpath -28.3464 0 moveto
+-14.1732 0 lineto stroke
+newpath -17.86874 -1.53078 moveto
+-14.1732 0 lineto
+-17.86874 1.53078 lineto
+ closepath
+gsave fill grestore stroke
+newpath 14.1732 0 moveto
+42.5196 0 lineto stroke
+newpath 38.82405 -1.53079 moveto
+42.5196 0 lineto
+38.82405 1.53079 lineto
+ closepath
+gsave fill grestore stroke
+newpath 70.86601 0 moveto
+99.21242 0 lineto stroke
+newpath 95.51686 -1.53079 moveto
+99.21242 0 lineto
+95.51686 1.53079 lineto
+ closepath
+gsave fill grestore stroke
+newpath 127.55882 0 moveto
+155.90523 0 lineto stroke
+newpath 152.20967 -1.53079 moveto
+155.90523 0 lineto
+152.20967 1.53079 lineto
+ closepath
+gsave fill grestore stroke
+newpath 184.25163 0 moveto
+198.42484 0 lineto stroke
+newpath 194.7293 -1.53078 moveto
+198.42484 0 lineto
+194.7293 1.53078 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 25.51176 moveto
+0 11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -1.53078 15.0341 moveto
+0 11.33856 lineto
+1.53078 15.0341 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 56.69281 25.51176 moveto
+56.69281 11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 55.16203 15.0341 moveto
+56.69281 11.33856 lineto
+58.22359 15.0341 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 113.38562 25.51176 moveto
+113.38562 11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 111.85484 15.0341 moveto
+113.38562 11.33856 lineto
+114.9164 15.0341 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 170.07843 25.51176 moveto
+170.07843 11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 168.54765 15.0341 moveto
+170.07843 11.33856 lineto
+171.6092 15.0341 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 -25.51176 moveto
+0 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 1.53078 -15.0341 moveto
+0 -11.33856 lineto
+-1.53078 -15.0341 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 56.69281 -25.51176 moveto
+56.69281 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 58.22359 -15.0341 moveto
+56.69281 -11.33856 lineto
+55.16203 -15.0341 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 113.38562 -25.51176 moveto
+113.38562 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 114.9164 -15.0341 moveto
+113.38562 -11.33856 lineto
+111.85484 -15.0341 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 170.07843 -25.51176 moveto
+170.07843 -11.33856 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 171.6092 -15.0341 moveto
+170.07843 -11.33856 lineto
+168.54765 -15.0341 lineto
+ closepath
+gsave fill grestore stroke
+-36.3277 -3.2102 moveto
+(0) cmr10 9.96265 fshow
+-8.71735 30.44896 moveto
+(Key) cmr10 9.96265 fshow
+42.57901 30.44896 moveto
+(Con\014g) cmr10 9.96265 fshow
+95.48051 30.44896 moveto
+(Message) cmr10 9.96265 fshow
+167.58778 28.51176 moveto
+(0) cmr10 9.96265 fshow
+-21.47516 -36.97997 moveto
+(t) cmr10 9.96265 fshow
+-17.87756 -36.97997 moveto
+(yp) cmr10 9.96265 fshow
+-6.80786 -36.97997 moveto
+(e) cmr10 9.96265 fshow
+-1.38367 -36.97997 moveto
+(:) cmr10 9.96265 fshow
+2.38004 -36.97997 moveto
+(Key) cmr10 9.96265 fshow
+35.21765 -36.97997 moveto
+(t) cmr10 9.96265 fshow
+38.81525 -36.97997 moveto
+(yp) cmr10 9.96265 fshow
+49.88495 -36.97997 moveto
+(e) cmr10 9.96265 fshow
+55.30914 -36.97997 moveto
+(:) cmr10 9.96265 fshow
+59.07285 -36.97997 moveto
+(Cfg) cmr10 9.96265 fshow
+91.91046 -36.97997 moveto
+(t) cmr10 9.96265 fshow
+95.50806 -36.97997 moveto
+(yp) cmr10 9.96265 fshow
+106.57776 -36.97997 moveto
+(e) cmr10 9.96265 fshow
+112.00195 -36.97997 moveto
+(:) cmr10 9.96265 fshow
+115.76566 -36.97997 moveto
+(Msg) cmr10 9.96265 fshow
+148.60327 -36.97997 moveto
+(t) cmr10 9.96265 fshow
+152.20087 -36.97997 moveto
+(yp) cmr10 9.96265 fshow
+163.27057 -36.97997 moveto
+(e) cmr10 9.96265 fshow
+168.69476 -36.97997 moveto
+(:) cmr10 9.96265 fshow
+172.45847 -36.97997 moveto
+(Out) cmr10 9.96265 fshow
+showpage
+%%EOF
diff --git a/Supporting_Documentation/tex/skein-61.mps b/Supporting_Documentation/tex/skein-61.mps
new file mode 100644
index 0000000000000..c24eb158bfd2e
--- /dev/null
+++ b/Supporting_Documentation/tex/skein-61.mps
@@ -0,0 +1,247 @@
+%!PS
+%%BoundingBox: -2 -62 317 12 
+%%HiResBoundingBox: -1.7809 -61.60425 316.41519 11.8557 
+%%Creator: MetaPost 1.000
+%%CreationDate: 2008.10.19:0139
+%%Pages: 1
+%*Font: cmmi10 9.96265 9.96265 6e:8
+%*Font: cmr10 9.96265 9.96265 23:80010000000000036099ee
+%%BeginProlog
+%%EndProlog
+%%Page: 1 1
+ 0 0 0 setrgbcolor
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth [] 0 setdash
+ 1 setlinecap 1 setlinejoin 10 setmiterlimit
+newpath 72.03137 -14.1732 moveto
+72.03137 -14.1732 lineto
+72.03137 -22.1732 lineto
+72.03137 -22.1732 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 64.03137 -14.1732 moveto
+64.03137 -14.1732 lineto
+72.03137 -14.1732 lineto
+72.03137 -14.1732 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 64.03137 -14.1732 moveto
+64.03137 -14.1732 lineto
+64.03137 -22.1732 lineto
+64.03137 -22.1732 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 64.03137 -22.1732 moveto
+64.03137 -22.1732 lineto
+72.03137 -22.1732 lineto
+72.03137 -22.1732 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 68.03137 -14.1732 moveto
+68.03137 -22.1732 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.03137 -18.1732 moveto
+64.03137 -18.1732 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 106.04706 -26.67712 moveto
+106.04706 -26.67712 lineto
+106.04706 -34.67712 lineto
+106.04706 -34.67712 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 98.04706 -26.67712 moveto
+98.04706 -26.67712 lineto
+106.04706 -26.67712 lineto
+106.04706 -26.67712 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 98.04706 -26.67712 moveto
+98.04706 -26.67712 lineto
+98.04706 -34.67712 lineto
+98.04706 -34.67712 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 98.04706 -34.67712 moveto
+98.04706 -34.67712 lineto
+106.04706 -34.67712 lineto
+106.04706 -34.67712 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 102.04706 -26.67712 moveto
+102.04706 -34.67712 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 106.04706 -30.67712 moveto
+98.04706 -30.67712 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 140.06274 -39.18105 moveto
+140.06274 -39.18105 lineto
+140.06274 -47.18105 lineto
+140.06274 -47.18105 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 132.06274 -39.18105 moveto
+132.06274 -39.18105 lineto
+140.06274 -39.18105 lineto
+140.06274 -39.18105 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 132.06274 -39.18105 moveto
+132.06274 -39.18105 lineto
+132.06274 -47.18105 lineto
+132.06274 -47.18105 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 132.06274 -47.18105 moveto
+132.06274 -47.18105 lineto
+140.06274 -47.18105 lineto
+140.06274 -47.18105 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 136.06274 -39.18105 moveto
+136.06274 -47.18105 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 140.06274 -43.18105 moveto
+132.06274 -43.18105 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 0 moveto
+0 -61.35425 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -1.5309 -57.65843 moveto
+0 -61.35425 lineto
+1.5309 -57.65843 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 68.03137 0 moveto
+68.03137 -14.1732 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 66.5006 -10.47766 moveto
+68.03137 -14.1732 lineto
+69.56215 -10.47766 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 102.04706 0 moveto
+102.04706 -26.67712 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 100.51624 -22.98148 moveto
+102.04706 -26.67712 lineto
+103.57788 -22.98148 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 136.06274 0 moveto
+136.06274 -39.18105 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 134.53188 -35.4853 moveto
+136.06274 -39.18105 lineto
+137.59361 -35.4853 lineto
+ closepath
+gsave fill grestore stroke
+newpath 204.09412 0 moveto
+204.09412 -18.1732 lineto
+72.03137 -18.1732 lineto stroke
+newpath 75.72688 -16.64244 moveto
+72.03137 -18.1732 lineto
+75.72688 -19.70396 lineto
+ closepath
+gsave fill grestore stroke
+newpath 238.1098 0 moveto
+238.1098 -30.67712 lineto
+106.04706 -30.67712 lineto stroke
+newpath 109.74257 -29.14636 moveto
+106.04706 -30.67712 lineto
+109.74257 -32.20789 lineto
+ closepath
+gsave fill grestore stroke
+newpath 272.12549 -43.18105 moveto
+140.06274 -43.18105 lineto stroke
+newpath 143.75826 -41.65028 moveto
+140.06274 -43.18105 lineto
+143.75826 -44.7118 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 68.03137 -22.1732 moveto
+68.03137 -61.35425 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 66.5005 -57.65851 moveto
+68.03137 -61.35425 lineto
+69.56224 -57.65851 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 102.04706 -34.67712 moveto
+102.04706 -61.35425 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 100.51624 -57.6586 moveto
+102.04706 -61.35425 lineto
+103.57788 -57.6586 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 136.06274 -47.18105 moveto
+136.06274 -61.35425 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 134.53197 -57.6587 moveto
+136.06274 -61.35425 lineto
+137.59352 -57.6587 lineto
+ closepath
+gsave fill grestore stroke
+newpath 14.60233 -30.67712 moveto
+14.60233 -30.4129 14.49736 -30.15948 14.31053 -29.97266 curveto
+14.1237 -29.78583 13.87029 -29.68086 13.60606 -29.68086 curveto
+13.34184 -29.68086 13.08842 -29.78583 12.9016 -29.97266 curveto
+12.71477 -30.15948 12.6098 -30.4129 12.6098 -30.67712 curveto
+12.6098 -30.94135 12.71477 -31.19476 12.9016 -31.38159 curveto
+13.08842 -31.56842 13.34184 -31.67339 13.60606 -31.67339 curveto
+13.87029 -31.67339 14.1237 -31.56842 14.31053 -31.38159 curveto
+14.49736 -31.19476 14.60233 -30.94135 14.60233 -30.67712 curveto closepath fill
+newpath 28.20839 -30.67712 moveto
+28.20839 -30.4129 28.10342 -30.15948 27.9166 -29.97266 curveto
+27.72977 -29.78583 27.47635 -29.68086 27.21213 -29.68086 curveto
+26.9479 -29.68086 26.69449 -29.78583 26.50766 -29.97266 curveto
+26.32083 -30.15948 26.21587 -30.4129 26.21587 -30.67712 curveto
+26.21587 -30.94135 26.32083 -31.19476 26.50766 -31.38159 curveto
+26.69449 -31.56842 26.9479 -31.67339 27.21213 -31.67339 curveto
+27.47635 -31.67339 27.72977 -31.56842 27.9166 -31.38159 curveto
+28.10342 -31.19476 28.20839 -30.94135 28.20839 -30.67712 curveto closepath fill
+newpath 41.8155 -30.67712 moveto
+41.8155 -30.4129 41.71054 -30.15948 41.52371 -29.97266 curveto
+41.33688 -29.78583 41.08347 -29.68086 40.81924 -29.68086 curveto
+40.55502 -29.68086 40.3016 -29.78583 40.11478 -29.97266 curveto
+39.92795 -30.15948 39.82298 -30.4129 39.82298 -30.67712 curveto
+39.82298 -30.94135 39.92795 -31.19476 40.11478 -31.38159 curveto
+40.3016 -31.56842 40.55502 -31.67339 40.81924 -31.67339 curveto
+41.08347 -31.67339 41.33688 -31.56842 41.52371 -31.38159 curveto
+41.71054 -31.19476 41.8155 -30.94135 41.8155 -30.67712 curveto closepath fill
+newpath 55.42157 -30.67712 moveto
+55.42157 -30.4129 55.3166 -30.15948 55.12978 -29.97266 curveto
+54.94295 -29.78583 54.68953 -29.68086 54.42531 -29.68086 curveto
+54.16109 -29.68086 53.90767 -29.78583 53.72084 -29.97266 curveto
+53.53401 -30.15948 53.42905 -30.4129 53.42905 -30.67712 curveto
+53.42905 -30.94135 53.53401 -31.19476 53.72084 -31.38159 curveto
+53.90767 -31.56842 54.16109 -31.67339 54.42531 -31.67339 curveto
+54.68953 -31.67339 54.94295 -31.56842 55.12978 -31.38159 curveto
+55.3166 -31.19476 55.42157 -30.94135 55.42157 -30.67712 curveto closepath fill
+20.58307 4.9372 moveto
+(n) cmmi10 9.96265 fshow
+29.88387 4.9372 moveto
+(extended) cmr10 9.96265 fshow
+72.22517 4.9372 moveto
+(k) cmr10 9.96265 fshow
+77.20647 4.9372 moveto
+(ey) cmr10 9.96265 fshow
+90.21327 4.9372 moveto
+(w) cmr10 9.96265 fshow
+97.13177 4.9372 moveto
+(ords) cmr10 9.96265 fshow
+168.89485 3 moveto
+(2) cmr10 9.96265 fshow
+177.19705 3 moveto
+(extended) cmr10 9.96265 fshow
+219.53835 3 moveto
+(t) cmr10 9.96265 fshow
+223.13596 3 moveto
+(w) cmr10 9.96265 fshow
+230.05446 3 moveto
+(eak) cmr10 9.96265 fshow
+248.04265 3 moveto
+(w) cmr10 9.96265 fshow
+254.96115 3 moveto
+(ords) cmr10 9.96265 fshow
+275.12549 -45.6717 moveto
+(subk) cmr10 9.96265 fshow
+295.10619 -45.6717 moveto
+(ey) cmr10 9.96265 fshow
+308.11299 -45.6717 moveto
+(#) cmr10 9.96265 fshow
+showpage
+%%EOF
diff --git a/Supporting_Documentation/tex/skein-71.mps b/Supporting_Documentation/tex/skein-71.mps
new file mode 100644
index 0000000000000..136250fc68ab8
--- /dev/null
+++ b/Supporting_Documentation/tex/skein-71.mps
@@ -0,0 +1,90 @@
+%!PS
+%%BoundingBox: -218 -38 215 22 
+%%HiResBoundingBox: -217.82841 -37.47495 214.34148 21.66747 
+%%Creator: MetaPost 1.000
+%%CreationDate: 2008.10.19:0139
+%%Pages: 1
+%*Font: cmr10 9.96265 9.96265 42:88222001312ee9
+%*Font: cmr10 6.97382 9.96265 30:e2c
+%%BeginProlog
+%%EndProlog
+%%Page: 1 1
+ 0 0 0 setrgbcolor 0.29887
+ 0 dtransform exch truncate exch idtransform pop setlinewidth [] 0 setdash
+ 1 setlinecap 1 setlinejoin 10 setmiterlimit
+newpath 0 -14.1732 moveto
+0 14.1732 lineto stroke
+newpath -106.29903 -14.1732 moveto
+-106.29903 14.1732 lineto stroke
+newpath -152.80482 -14.1732 moveto
+-152.80482 14.1732 lineto stroke
+newpath -159.44853 -14.1732 moveto
+-159.44853 14.1732 lineto stroke
+newpath -199.31065 -14.1732 moveto
+-199.31065 14.1732 lineto stroke
+newpath -205.95436 -14.1732 moveto
+-205.95436 14.1732 lineto stroke
+88.62912 -3.4039 moveto
+(P) cmr10 9.96265 fshow
+95.13252 -3.4039 moveto
+(osition) cmr10 9.96265 fshow
+-70.91628 -3.45926 moveto
+(reserv) cmr10 9.96265 fshow
+-45.34547 -3.45926 moveto
+(ed) cmr10 9.96265 fshow
+-150.52882 -3.45926 moveto
+(T) cmr10 9.96265 fshow
+-144.16382 -3.45926 moveto
+(reeLev) cmr10 9.96265 fshow
+-115.77022 -3.45926 moveto
+(el) cmr10 9.96265 fshow
+ 0 0.29887 dtransform truncate idtransform setlinewidth pop
+newpath -156.12668 0 moveto
+-156.12668 -22.67712 lineto
+-146.16115 -22.67712 lineto stroke
+-143.16115 -26.13638 moveto
+(BitP) cmr10 9.96265 fshow
+-122.95905 -26.13638 moveto
+(ad) cmr10 9.96265 fshow
+-190.5876 -2.4353 moveto
+(T) cmr10 9.96265 fshow
+-183.6691 -2.4353 moveto
+(yp) cmr10 9.96265 fshow
+-172.5995 -2.4353 moveto
+(e) cmr10 9.96265 fshow
+newpath -202.6325 0 moveto
+-202.6325 -22.67712 lineto
+-192.66698 -22.67712 lineto stroke
+-189.66698 -26.08102 moveto
+(First) cmr10 9.96265 fshow
+newpath -209.2762 0 moveto
+-209.2762 -34.01569 lineto
+-199.31065 -34.01569 lineto stroke
+-196.31065 -37.47495 moveto
+(Final) cmr10 9.96265 fshow
+210.85458 17.1732 moveto
+(0) cmr10 6.97382 fshow
+-3.48694 17.1732 moveto
+(96) cmr10 6.97382 fshow
+-111.5294 17.1732 moveto
+(112) cmr10 6.97382 fshow
+-164.67891 17.1732 moveto
+(120) cmr10 6.97382 fshow
+-217.82841 17.1732 moveto
+(128) cmr10 6.97382 fshow
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath -212.59804 14.1732 moveto
+204.53703 14.1732 lineto
+205.95436 19.84248 lineto
+205.95436 8.50392 lineto
+207.37169 14.1732 lineto
+212.59804 14.1732 lineto
+212.59804 -14.1732 lineto
+207.37169 -14.1732 lineto
+205.95436 -19.84248 lineto
+205.95436 -8.50392 lineto
+204.53703 -14.1732 lineto
+-212.59804 -14.1732 lineto
+-212.59804 14.1732 lineto stroke
+showpage
+%%EOF
diff --git a/Supporting_Documentation/tex/skein-81.mps b/Supporting_Documentation/tex/skein-81.mps
new file mode 100644
index 0000000000000..714768dffa241
--- /dev/null
+++ b/Supporting_Documentation/tex/skein-81.mps
@@ -0,0 +1,279 @@
+%!PS
+%%BoundingBox: -1 -15 310 161 
+%%HiResBoundingBox: -0.25 -14.89598 309.95052 160.56291 
+%%Creator: MetaPost 1.000
+%%CreationDate: 2008.10.19:0139
+%%Pages: 1
+%*Font: cmr10 9.96265 9.96265 42:8100100114104
+%%BeginProlog
+%%EndProlog
+%%Page: 1 1
+ 0 0 0 setrgbcolor 0 0.5 dtransform truncate idtransform setlinewidth pop
+ [] 0 setdash 1 setlinecap 1 setlinejoin 10 setmiterlimit
+newpath 0 0 moveto
+276.37746 0 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 0 2.83464 moveto
+0 -2.83464 lineto stroke
+newpath 85.03922 2.83464 moveto
+85.03922 -2.83464 lineto stroke
+newpath 170.07843 2.83464 moveto
+170.07843 -2.83464 lineto stroke
+newpath 255.11765 2.83464 moveto
+255.11765 -2.83464 lineto stroke
+newpath 276.37746 2.83464 moveto
+276.37746 -2.83464 lineto stroke
+33.45634 31.3464 moveto
+(UBI) cmr10 9.96265 fshow
+newpath 54.58287 41.1542 moveto
+54.58287 41.1542 lineto
+54.58287 28.3464 lineto
+54.58287 28.3464 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.45634 41.1542 moveto
+30.45634 41.1542 lineto
+54.58287 41.1542 lineto
+54.58287 41.1542 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 30.45634 41.1542 moveto
+30.45634 41.1542 lineto
+30.45634 28.3464 lineto
+30.45634 28.3464 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 30.45634 28.3464 moveto
+30.45634 28.3464 lineto
+54.58287 28.3464 lineto
+54.58287 28.3464 lineto stroke
+118.49556 31.3464 moveto
+(UBI) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 139.62209 41.1542 moveto
+139.62209 41.1542 lineto
+139.62209 28.3464 lineto
+139.62209 28.3464 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.49556 41.1542 moveto
+115.49556 41.1542 lineto
+139.62209 41.1542 lineto
+139.62209 41.1542 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 115.49556 41.1542 moveto
+115.49556 41.1542 lineto
+115.49556 28.3464 lineto
+115.49556 28.3464 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 115.49556 28.3464 moveto
+115.49556 28.3464 lineto
+139.62209 28.3464 lineto
+139.62209 28.3464 lineto stroke
+203.53477 31.3464 moveto
+(UBI) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 224.6613 41.1542 moveto
+224.6613 41.1542 lineto
+224.6613 28.3464 lineto
+224.6613 28.3464 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 200.53477 41.1542 moveto
+200.53477 41.1542 lineto
+224.6613 41.1542 lineto
+224.6613 41.1542 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 200.53477 41.1542 moveto
+200.53477 41.1542 lineto
+200.53477 28.3464 lineto
+200.53477 28.3464 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 200.53477 28.3464 moveto
+200.53477 28.3464 lineto
+224.6613 28.3464 lineto
+224.6613 28.3464 lineto stroke
+288.57399 31.3464 moveto
+(UBI) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 309.70052 41.1542 moveto
+309.70052 41.1542 lineto
+309.70052 28.3464 lineto
+309.70052 28.3464 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 285.57399 41.1542 moveto
+285.57399 41.1542 lineto
+309.70052 41.1542 lineto
+309.70052 41.1542 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 285.57399 41.1542 moveto
+285.57399 41.1542 lineto
+285.57399 28.3464 lineto
+285.57399 28.3464 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 285.57399 28.3464 moveto
+285.57399 28.3464 lineto
+309.70052 28.3464 lineto
+309.70052 28.3464 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 42.5196 2.83464 moveto
+42.5196 28.3464 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 44.05031 24.65105 moveto
+42.5196 28.3464 lineto
+40.9889 24.65105 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 127.55882 2.83464 moveto
+127.55882 28.3464 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 129.08952 24.65105 moveto
+127.55882 28.3464 lineto
+126.02812 24.65105 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 212.59804 2.83464 moveto
+212.59804 28.3464 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 214.12874 24.65105 moveto
+212.59804 28.3464 lineto
+211.06734 24.65105 lineto
+ closepath
+gsave fill grestore stroke
+newpath 265.74754 2.83464 moveto
+297.63725 28.3464 lineto stroke
+newpath 295.70784 24.84251 moveto
+297.63725 28.3464 lineto
+293.7953 27.23314 lineto
+ closepath
+gsave fill grestore stroke
+75.97595 86.6738 moveto
+(UBI) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 97.10248 96.4816 moveto
+97.10248 96.4816 lineto
+97.10248 83.67381 lineto
+97.10248 83.67381 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.97595 96.4816 moveto
+72.97595 96.4816 lineto
+97.10248 96.4816 lineto
+97.10248 96.4816 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 72.97595 96.4816 moveto
+72.97595 96.4816 lineto
+72.97595 83.67381 lineto
+72.97595 83.67381 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 72.97595 83.67381 moveto
+72.97595 83.67381 lineto
+97.10248 83.67381 lineto
+97.10248 83.67381 lineto stroke
+246.05438 86.6738 moveto
+(UBI) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 267.18091 96.4816 moveto
+267.18091 96.4816 lineto
+267.18091 83.67381 lineto
+267.18091 83.67381 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 243.05438 96.4816 moveto
+243.05438 96.4816 lineto
+267.18091 96.4816 lineto
+267.18091 96.4816 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 243.05438 96.4816 moveto
+243.05438 96.4816 lineto
+243.05438 83.67381 lineto
+243.05438 83.67381 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 243.05438 83.67381 moveto
+243.05438 83.67381 lineto
+267.18091 83.67381 lineto
+267.18091 83.67381 lineto stroke
+newpath 42.5196 41.1542 moveto
+42.5196 43.98885 lineto
+83.62189 78.00453 lineto
+83.62189 83.67381 lineto stroke
+newpath 85.15263 79.97836 moveto
+83.62189 83.67381 lineto
+82.09114 79.97836 lineto
+ closepath
+gsave fill grestore stroke
+newpath 127.55882 41.1542 moveto
+127.55882 43.98885 lineto
+86.45654 78.00453 lineto
+86.45654 83.67381 lineto stroke
+newpath 87.98729 79.97836 moveto
+86.45654 83.67381 lineto
+84.9258 79.97836 lineto
+ closepath
+gsave fill grestore stroke
+newpath 212.59804 41.1542 moveto
+212.59804 43.98885 lineto
+253.70032 78.00453 lineto
+253.70032 83.67381 lineto stroke
+newpath 255.23106 79.97836 moveto
+253.70032 83.67381 lineto
+252.16957 79.97836 lineto
+ closepath
+gsave fill grestore stroke
+newpath 297.63725 41.1542 moveto
+297.63725 43.98885 lineto
+256.53497 78.00453 lineto
+256.53497 83.67381 lineto stroke
+newpath 258.06572 79.97836 moveto
+256.53497 83.67381 lineto
+255.00423 79.97836 lineto
+ closepath
+gsave fill grestore stroke
+161.01517 142.00119 moveto
+(UBI) cmr10 9.96265 fshow
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 182.1417 151.80899 moveto
+182.1417 151.80899 lineto
+182.1417 139.0012 lineto
+182.1417 139.0012 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 158.01517 151.80899 moveto
+158.01517 151.80899 lineto
+182.1417 151.80899 lineto
+182.1417 151.80899 lineto stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 158.01517 151.80899 moveto
+158.01517 151.80899 lineto
+158.01517 139.0012 lineto
+158.01517 139.0012 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 158.01517 139.0012 moveto
+158.01517 139.0012 lineto
+182.1417 139.0012 lineto
+182.1417 139.0012 lineto stroke
+newpath 85.03922 96.4816 moveto
+85.03922 99.31624 lineto
+168.6611 133.33192 lineto
+168.6611 139.0012 lineto stroke
+newpath 170.19185 135.30576 moveto
+168.6611 139.0012 lineto
+167.13036 135.30576 lineto
+ closepath
+gsave fill grestore stroke
+newpath 255.11765 96.4816 moveto
+255.11765 99.31624 lineto
+171.49576 133.33192 lineto
+171.49576 139.0012 lineto stroke
+newpath 173.0265 135.30576 moveto
+171.49576 139.0012 lineto
+169.96501 135.30576 lineto
+ closepath
+gsave fill grestore stroke
+ 0.5 0 dtransform exch truncate exch idtransform pop setlinewidth
+newpath 170.07843 151.80899 moveto
+170.07843 160.31291 lineto stroke
+ 0 0.5 dtransform truncate idtransform setlinewidth pop
+newpath 171.60919 156.61743 moveto
+170.07843 160.31291 lineto
+168.54767 156.61743 lineto
+ closepath
+gsave fill grestore stroke
+120.69873 -12.95879 moveto
+(message) cmr10 9.96265 fshow
+showpage
+%%EOF
diff --git a/Supporting_Documentation/tex/skein1.3.tex b/Supporting_Documentation/tex/skein1.3.tex
new file mode 100644
index 0000000000000..db70fd8eaf167
--- /dev/null
+++ b/Supporting_Documentation/tex/skein1.3.tex
@@ -0,0 +1,4025 @@
+\documentclass[11pt,twoside]{article}
+\usepackage{amsmath}
+\usepackage{amssymb}
+%\usepackage{graphicx}
+\usepackage{xspace}
+\usepackage{url}
+\usepackage{graphicx}
+\usepackage{tabularx}
+%\usepackage{xrefbib}
+\usepackage{longtable}
+
+\hyphenation{two-fish}
+\hyphenation{three-fish}
+
+\renewcommand{\topfraction}{0.8}     % max fraction of floating figures at the top of a page
+\renewcommand{\bottomfraction}{0.8}  % idem for bottom
+\setcounter{topnumber}{4}            % max figures at top of page
+\setcounter{bottomnumber}{4}
+\setcounter{totalnumber}{4}
+\renewcommand{\textfraction}{0.2}    % minimum fraction of page that is text
+\renewcommand{\floatpagefraction}{0.5}
+
+\setlength{\textheight}{9in}
+\setlength{\textwidth}{6.5in}
+\setlength{\topmargin}{0.0in}
+\setlength{\oddsidemargin}{0in}
+\setlength{\evensidemargin}{0in}
+\setlength{\footskip}{0.5in}
+\setlength{\headheight}{0in}
+\setlength{\headsep}{0in}
+\renewcommand{\baselinestretch}{1}
+
+\newcommand{\xor}{\oplus}
+\newcommand{\purl}{\protect\url}
+\newcommand{\asgn}{\leftarrow}
+\newcommand{\len}{\ell}
+
+\newcommand{\TheConst}{C_{240}} % symbol for the key-schedule constant 
+                                % 0x1BD11BDAA9FC1A22=AES-256_0(240)
+
+%Concatenation symbol that behaves like a binary math operator
+\newcommand{\concat}{\mathbin{|\hspace{-1pt}|}}
+\newcommand{\dotdot}{\mathbin{..}}
+\newcommand{\rol}{\mathbin{\lll}}
+
+\def\symdef#1{\label{symdef:#1}}
+\def\symdefref#1{[Page~\pageref{symdef:#1}]}
+
+\newcommand{\mix}{\text{MIX}}
+\newcommand{\UBI}{\text{UBI}}
+\newcommand{\ToInt}{\text{ToInt}\xspace}
+\newcommand{\ToBytes}{\text{ToBytes}\xspace}
+\newcommand{\BytesToWords}{\text{BytesToWords}\xspace}
+\newcommand{\WordsToBytes}{\text{WordsToBytes}\xspace}
+\newcommand{\Output}{\text{Output}\xspace}
+
+\newcommand{\TypeSymbol}[1]{\text{$T_\text{#1}$}\xspace}
+\newcommand{\TypeKey}{\TypeSymbol{key}}
+\newcommand{\TypeConfig}{\TypeSymbol{cfg}}
+\newcommand{\TypePers}{\TypeSymbol{prs}}
+\newcommand{\TypePK}{\TypeSymbol{PK}}
+\newcommand{\TypeKDF}{\TypeSymbol{kdf}}
+\newcommand{\TypeNonce}{\TypeSymbol{non}}
+\newcommand{\TypeMsg}{\TypeSymbol{msg}}
+\newcommand{\TypeOut}{\TypeSymbol{out}}
+
+\newcommand{\comment}[1]{}
+
+\newcommand{\parameterlabel}[1]{$#1$\hfill}
+
+
+\newenvironment{parameters}{\begin{list}{?}{%
+\parsep = 0pt
+\leftmargin = 17 mm
+\rightmargin = 0cm
+\listparindent = \parindent
+\labelsep = 2 mm
+\labelwidth = 10 mm
+\let\makelabel\parameterlabel
+}%
+}{\end{list}}
+
+%%%%%
+%%%%% Theorem definitions
+%%%%%
+
+\newtheorem{theorem}{Theorem}[section]
+\newtheorem{lemma}[theorem]{Lemma}
+\newtheorem{proposition}[theorem]{Proposition}
+\newtheorem{corollary}[theorem]{Corollary}
+
+\newenvironment{proof}[1][Proof]{\begin{trivlist}
+\item[\hskip \labelsep {\bfseries #1}]}{\end{trivlist}}
+\newenvironment{definition}[1][Definition]{\begin{trivlist}
+\item[\hskip \labelsep {\bfseries #1}]}{\end{trivlist}}
+\newenvironment{example}[1][Example]{\begin{trivlist}
+\item[\hskip \labelsep {\bfseries #1}]}{\end{trivlist}}
+\newenvironment{remark}[1][Remark]{\begin{trivlist}
+\item[\hskip \labelsep {\bfseries #1}]}{\end{trivlist}}
+
+\newcommand{\qed}{\nobreak \ifvmode \relax \else
+      \ifdim\lastskip<1.5em \hskip-\lastskip
+      \hskip1.5em plus0em minus0.5em \fi \nobreak
+      \vrule height0.75em width0.5em depth0.25em\fi}
+
+%%%%%
+%%%%%
+%%%%%
+
+
+
+
+\parindent 0pt
+\parskip 1ex
+
+
+%%% Proof commands
+% =========================================================================
+
+\newcommand{\term}[1]{\emph{#1}}
+\newcommand{\textxor}{\textsc{xor}}
+
+\newcommand{\secref}[1]{\mbox{Section~\ref{#1}}}
+\newcommand{\apref}[1]{\mbox{Appendix~\ref{#1}}}
+\newcommand{\thref}[1]{\mbox{Theorem~\ref{#1}}}
+\newcommand{\defref}[1]{\mbox{Definition~\ref{#1}}}
+\newcommand{\corref}[1]{\mbox{Corollary~\ref{#1}}}
+\newcommand{\lemref}[1]{\mbox{Lemma~\ref{#1}}}
+\newcommand{\clref}[1]{\mbox{Claim~\ref{#1}}}
+\newcommand{\propref}[1]{\mbox{Proposition~\ref{#1}}}
+\newcommand{\remref}[1]{\mbox{Remark~\ref{#1}}}
+\newcommand{\consref}[1]{\mbox{Construction~\ref{#1}}}
+\newcommand{\figref}[1]{\mbox{Figure~\ref{#1}}}
+\renewcommand{\eqref}[1]{\mbox{Equation~(\ref{#1})}}
+
+
+\def\next{\:;\:}
+
+
+\newcommand{\Succ}{\mathbf{Succ}}
+\newcommand{\Adv}{\mathbf{Adv}}
+\newcommand{\Exp}[2]{\mathbf{Exp}_{#1}(#2)}
+\newcommand{\GameSZero}[2]{\mathbf{GameS0}_{#1}(#2)}
+\newcommand{\GameSOne}[2]{\mathbf{GameS1}_{#1}(#2)}
+
+
+
+\newlength{\savejot}
+\setlength{\jot}{4pt}
+\setlength{\savejot}{\jot}
+
+\newenvironment{newmath}{\begin{displaymath}%
+\setlength{\abovedisplayskip}{4pt}%
+\setlength{\belowdisplayskip}{4pt}%
+\setlength{\abovedisplayshortskip}{6pt}%
+\setlength{\belowdisplayshortskip}{6pt} }{\end{displaymath}}
+
+\newenvironment{newequation}{\begin{equation}%
+\setlength{\abovedisplayskip}{4pt}%
+\setlength{\belowdisplayskip}{4pt}%
+\setlength{\abovedisplayshortskip}{6pt}%
+\setlength{\belowdisplayshortskip}{6pt} }{\end{equation}}
+
+\newcommand{\headingg}[1]{{\sc{#1}}}
+\newcommand{\heading}[1]{{\vspace{5pt}\noindent\sc{#1}}}
+
+\newcommand{\find}{\mathsf{find}}
+\newcommand{\guess}{\mathsf{guess}}
+\newcommand{\AskR}{\mathsf{AskR}}
+\newcommand{\AskY}{\mathsf{AskY}}
+
+
+\newcommand{\tuple}[1]{(#1)}
+\newcommand{\birep}[1]{\langle#1\rangle}
+\renewcommand{\Sigma}{\{0,1\}}
+\def\bits{\Sigma}
+% \def\emptystring{\lambda}
+\def\cross{\times}
+%\newcommand{\concat}{\,,\,}
+\newcommand{\smidge}{{\kern .05em}}
+\newcommand{\Colon}{{\smidge\colon\;\:}}
+\newcommand{\norm}[1]{\|#1\|}
+\newcommand{\astrut}{\rule{0em}{15pt}}
+\newcommand{\bstrut}{\rule{0em}{12pt}}
+\def\poly{\mathop{\rm poly}\nolimits}
+\def\emptystring{\varepsilon}
+\newcommand{\calC}{{\cal C}}
+\newcommand{\calF}{{\cal F}}
+\newcommand{\calO}{{\cal O}}
+\newcommand{\calR}{{\cal R}}
+\newcommand{\N}{{{\sf N}}}
+% \newcommand{\Z}{{{\sf Z}}}
+\newcommand{\Z}{{\mathbb{Z}}}
+\newcommand{\Y}{{{\sf Y}}}
+\newcommand{\R}{{{\rm\bf R}}}
+\newcommand{\calG}{{\cal G}}
+\newcommand{\calE}{{\cal E}}
+\newcommand{\goesto}{{\rightarrow}}
+\newcommand{\eqdef}{\stackrel{\rm def}{=}}
+\def\union{\cup}
+\def\bigunion{\bigcup}
+\def\suchthatt{\: :\:}
+%\def\next{\hspace{12pt};\hspace{12pt}}
+\def\nextt{\hspace{3pt};\hspace{6pt}}
+\newcommand{\set}[2]{\{\:#1 \suchthatt #2\:\}}
+\newcommand{\card}[1]{|#1|}
+\def\leqq{\;\leq\;}
+\def\eqq{\;=\;}
+\def\geqq{\;\geq\;}
+\def\lst{\;<\;}
+\def\gst{\;>\;}
+\def\prn#1{\left(#1\right)}
+%\def\getsr{\stackrel{{\scriptscriptstyle R}}{\leftarrow}}
+\def\getsr{\stackrel{{\scriptscriptstyle \$}}{\leftarrow}}
+\newcommand\getsrd[1]{\stackrel{{\scriptscriptstyle \$},#1}{\leftarrow}}
+\newcommand{\replyto}{\Leftarrow}
+\newcommand{\getfrom}{\Rightarrow}
+\renewcommand{\choose}[2]{{{#1}\atopwithdelims(){#2}}}
+\newcommand{\abs}[1]{{\displaystyle \left| {#1} \right| }}
+
+\newcommand{\authnote}[2]{\begin{quote}\textbf{#1 says:} #2\end{quote}}
+
+
+% Skein
+\newcommand{\prefix}{\leq}
+\newcommand{\E}{\mathsf{E}}
+\newcommand{\key}{\mathsf{K}}
+\newcommand{\tweak}{\mathsf{T}}
+
+\newcommand{\finalbit}{\mathsf{fin}}
+\newcommand{\firstbit}{\mathsf{fir}}
+\newcommand{\type}{\mathsf{type}}
+\newcommand{\bitpad}{\mathsf{bitpad}}
+\newcommand{\treelevel}{\mathsf{lvl}}
+\newcommand{\length}{\mathsf{L}}
+\newcommand{\outlen}{\mathsf{N}}
+\newcommand{\runninglength}{\mathsf{l}}
+
+
+\newcommand{\treeleafsize}{\mathsf{lsize}}
+\newcommand{\treenodesize}{\mathsf{nsize}}
+\newcommand{\maxtreeheight}{\mathsf{maxh}}
+
+\newcommand{\msg}{\mathsf{M}}
+\newcommand{\padmsg}{\overline{\mathsf{M}}}
+\newcommand{\chainval}{\mathsf{h}}
+\newcommand{\startinglen}{\mathsf{startl}}
+
+\newcommand{\C}{\mathsf{TComp}}
+\newcommand{\ordinaryC}{\mathsf{Comp}}
+\newcommand{\setX}{\mathcal{X}}
+\newcommand{\inttostr}[2]{\mathsf{IntToStr}_{#2}(#1)}
+\newcommand{\F}{\mathsf{f}}
+\newcommand{\Fstar}{\mathsf{Fserial}}
+\newcommand{\Fstartree}{\mathsf{Ftree}}
+\newcommand{\Fcombined}{\mathsf{Fskein}}
+\newcommand{\Tree}{\mathsf{Tree}}
+\newcommand{\Ycfg}{\mathsf{TyCfg}}
+\newcommand{\Yout}{\mathsf{TyOut}}
+\newcommand{\Ymsg}{\mathsf{TyMsg}}
+\renewcommand{\H}{\mathsf{H}}
+\newcommand{\MkTw}{\mathsf{MkTw}}
+\newcommand{\MkConfig}{\mathsf{MkConfig}}
+\newcommand{\pad}{\mathsf{pad}}
+\newcommand{\Encode}{\mathsf{Encode}}
+\newcommand{\ProofOutput}{\mathsf{Output}}
+\newcommand{\PFEncode}{\mathsf{PFEncode}}
+\newcommand{\MsgSp}{\mathsf{MsgSp}}
+\newcommand{\TreeParamSp}{\mathsf{TreeParamSp}}
+\newcommand{\TypeSp}{\mathsf{TypeSp}}
+\newcommand{\OutLens}{\mathsf{OutLens}}
+\newcommand{\TypeMsgPairSp}{\mathsf{TypeMsgPairSp}}
+\newcommand{\IV}{\mathsf{IV}}
+\newcommand{\textprf}{\ensuremath{\text{\rm prf}}}
+\newcommand{\textcr}{\ensuremath{\text{\rm cr}}}
+
+\newcommand{\EE}[1]{{\E\left[{#1}\right]}}
+\newcommand{\EEE}[2]{{\E_{#1}\left[{#2}\right]}}
+\newcommand{\Prob}[1]{\Pr\left[\: #1 \:\right]}
+\newcommand{\ProbExp}[2]{\Pr\left[\: #1 \: : \: #2\: \right]}
+\newcommand{\Probb}[2]{{\Pr}_{#1}\left[\: #2 \:\right]}
+\newcommand{\Probc}[2]{\Pr\left[\: #1 \:{\left|\right.}\:#2\:\right]}
+\newcommand{\Probcc}[3]{{\Pr}_{#1}\left[\: #2 \:\left|\right.\:#3\:\right]}
+\newcommand{\CondProb}[2]{{\Pr}\left[\: #1\:\left|\right.\:#2\:\right]}
+\newcommand{\suchthat}{{\mbox{s.t.\ }}}
+\newcommand{\qquadd}{{\quad}}
+\def\d{{\delta}}
+\def\e{{\epsilon}}
+\newcommand{\ceiling}[1]{\lceil #1\rceil}
+\newcommand{\sfrac}[2]{{\textstyle \frac{#1}{#2}}}
+\newcommand{\ssum}[2]{{\textstyle \sum_{\,#1}^{\,#2}\,}}
+\newcommand{\sprod}[2]{{\textstyle \prod_{\,#1}^{\,#2}\,}}
+\def\smax{{\textstyle \max}}
+\def\N{{\sf N}}
+\def\R{{\sf R}}
+\newcommand{\blockindex}[2]{{\langle#1\rangle}_{#2}}
+\def\chv{\raisebox{2pt}{$\chi$}}
+
+\newcommand{\cclass}[1]{{\rm #1}}
+\def\P{\cclass{P}}
+\def\NP{\cclass{NP}}
+\def\BPP{\cclass{BPP}}
+\def\coRP{\cclass{coRP}}
+\def\NEXP{\cclass{NEXP}}
+\def\DES{\mbox{\rm DES}}
+\newcommand{\md}{\textsf{md5}}
+\newcommand{\MD}{\mbox{\rm MD5}}
+\newcommand{\sha}{\textsf{sha-1}}
+\newcommand{\ripemd}{\textsf{ripemd-160}}
+
+% Skip page numbers on empty pages
+\let\mycleardoublepage=\cleardoublepage
+\def\cleardoublepage{\clearpage\pagestyle{empty}\mycleardoublepage\pagestyle{plain}}
+
+\begin{document}
+% \maketitle
+\thispagestyle{empty}
+\begin{center}
+{\Large\bf The Skein Hash Function Family} \\[15pt]
+{\small Version 1.3 --- 1 Oct 2010}
+\end{center}
+
+\vspace*{.5in}
+
+\begin{center}
+\begin{tabular}{ll}
+    \bf{Niels Ferguson} & {Microsoft Corp., \purl{niels@microsoft.com}} \\
+    \bf{Stefan Lucks} & {Bauhaus-Universit\"at Weimar, \purl{stefan.lucks@uni-weimar.de}}  \\
+    \bf{Bruce Schneier} & {BT Group plc, \purl{schneier@schneier.com}}  \\
+    \bf{Doug Whiting} & {Hifn, Inc., \purl{dwhiting@exar.com}}  \\
+    \bf{Mihir Bellare} & {University of California San Diego, \purl{mihir@cs.ucsd.edu}}  \\
+    \bf{Tadayoshi Kohno} & {University of Washington, \purl{yoshi@cs.washington.edu}}  \\
+    \bf{Jon Callas} & {PGP Corp., \purl{jon@pgp.com}} \\
+    \bf{Jesse Walker} & {Intel Corp., \purl{jesse.walker@intel.com}}
+\end{tabular}
+\end{center}
+\date{}
+
+\cleardoublepage
+
+\setcounter{page}{1}
+\pagenumbering{roman}
+
+\section*{Executive Summary}
+
+Skein is a new family of cryptographic hash functions.  Its design combines speed, security, simplicity, and a great deal of flexibility in a modular package that is easy to analyze.
+
+Skein is fast.  Skein-512---our primary proposal---hashes data at 6.1 clock cycles per byte on a 64-bit CPU.  This means that on a 3.1 GHz x64 Core 2 Duo CPU, Skein hashes data at 500 MBytes/second per core---almost twice as fast as SHA-512 and three times faster than SHA-256.  An optional hash-tree mode speeds up parallelizable implementations even more.  Skein is fast for short messages, too; Skein-512 hashes short messages in about 1000 clock cycles.
+
+Skein is secure.  Its conservative design is based on the Threefish
+block cipher.  The current best attack on the tweaked Threefish-512 is
+on 35 of 72 rounds, for a safety factor of just over 2.0. For
+comparison, at a similar stage in the standardization process, the AES
+encryption algorithm had an attack on 6 of 10 rounds, for a safety
+factor of only 1.7.  Additionally, Skein has a number of provably
+secure properties, greatly increasing confidence in the
+algorithm.
+
+Skein is simple.  Using only three primitive operations, the Skein compression function can be easily understood and remembered.  The rest of the algorithm is a straightforward iteration of this function.
+
+Skein is flexible.  Skein is defined for three different internal state sizes---256 bits, 512 bits, and 1024 bits---and any output size.  This allows Skein to be a drop-in replacement for the entire SHA family of hash functions.  A completely optional and extendable argument system makes Skein an efficient tool to use for a very large number of functions: PRNG, stream cipher, key derivation function, authentication without the overhead of HMAC, and personalization capability.  All these features can be implemented with very low overhead.  Together with the Threefish large-block cipher at Skein's core, this design provides a full set of symmetric cryptographic primitives suitable for most modern applications.
+
+Skein is efficient on a variety of platforms, both hardware and software.  Skein-512 can be implemented in about 200 bytes of state.  Small devices, such as 8-bit smart cards, can implement Skein-256 using about 100 bytes of memory.  Larger devices can implement the larger versions of Skein to achieve faster speeds.
+
+Skein was designed by a team of highly experienced cryptographic experts from academia and industry, with expertise in cryptography, security analysis, software, chip design, and implementation of real-world cryptographic systems.  This breadth of knowledge allowed them to create a balanced design that works well in all environments.
+
+\cleardoublepage
+
+\tableofcontents
+
+\cleardoublepage
+\pagenumbering{arabic}
+
+\section{Introduction}
+
+Cryptographic hash functions are the workhorses of cryptography, and can be found everywhere. Originally created to make digital signatures more efficient, they are now used to secure the very fundamentals of our information infrastructure: in password logins, secure web connections, encryption key management, virus- and malware-scanning, and almost every cryptographic protocol in current use.  Without hash functions, the Internet would simply not work.
+
+The most commonly used hash functions are those of the SHA family: SHA-0 \cite{SHA}, SHA-1 \cite{SHA-1}, SHA-256, and SHA-512 \cite{SHA-2}, all based on MD4 \cite{MD4} and MD5 \cite{MD5}.  These SHA variants were all developed by the National Security Agency (NSA) and certified by the National Institute of Standards and Technology (NIST) \cite{SHA,SHA-1,SHA-2}, and are part of several NIST standards \cite{DSS,HMAC2,ECC,RNG} and many Internet standards.
+
+Over the past few years, cryptanalysis of these functions has found serious weaknesses.  Practical collisions have been demonstrated in MD4 \cite{D98,WFLY04,KBPL05,WLFCY05}, MD5 \cite{WFLY04,WY05,KBPL05,K05a,K05b,K06,S06}, and SHA-0 \cite{CJ98,WFLY04,WYY05}.  Known collision attacks against SHA-1 are not yet practical, but they are still more than 10,000 times faster than what was expected \cite{WYY05}.  To date, no flaws have been found in SHA-256 and SHA-512 \cite{HPR04}, but the common heritage and design principles of all these functions make them suspect. More seriously, if SHA-256 and SHA-512 were to be broken, the industry would be left without any generally accepted hash functions.
+
+To address this undesirable situation, NIST created a design competition for the next generation of hash functions \cite{SHA-3a}.  NIST has asked for proposals \cite{SHA-3b} and will likely select one as the new SHA-3 hash algorithm sometime in the year 2012.  While there is no immediate need to migrate to this new standard, it is assumed that SHA-3 will see widespread use world-wide as applications and standards start using it.
+
+This document introduces Skein\footnote{A ``skein''---pronounced $\backslash$sk\={a}n$\backslash$ and rhymes with ``rain''---is a loosely coiled length of yarn or thread wound on a reel.}, our submission to the SHA-3 competition.
+
+\section{Skein}
+
+\subsection{Overview}
+
+Skein is a family of hash functions with three different internal state sizes: 256, 512, and 1024 bits.
+
+\begin{itemize}
+\item Skein-512 is our primary proposal.  It can safely be used for all current hashing applications, and should remain secure for the foreseeable future.
+
+\item Skein-1024 is our ultra-conservative variant.  Because it has twice the internal-state size of Skein-512, it is failure friendly; even if some future attack managed to break Skein-512, it is quite likely that Skein-1024 would remain secure.  Skein-1024 can also run nearly twice as fast as Skein-512 in dedicated hardware implementations.
+
+\item Skein-256 is our low-memory variant.  It can be implemented using about 100 bytes of RAM.
+\end{itemize}
+
+Each of these state sizes can support any output size.  When a drop-in replacement is required for MD5 or one of the existing SHA hash functions, we recommend one of the configurations in Table~\ref{tab:versions}.
+%
+\begin{table}
+  \begin{center}
+    \begin{tabular}{|llrr|}
+    \hline
+    & & State & Output \\
+    Replace & With & Size & Size \\
+    \hline
+    MD5 & Skein-256-128 & 256 & 128\\
+        & Skein-512-128 & 512 & 128 \\
+    \hline
+    SHA-1 & Skein-256-160 & 256 & 160 \\
+          & Skein-512-160 & 512 & 160 \\
+    \hline
+    SHA-224 & Skein-256-224 & 256 & 224 \\
+            & Skein-512-224 & 512 & 224 \\
+    \hline
+    SHA-256 & Skein-256-256 & 256 & 256 \\
+            & Skein-512-256 & 512 & 256 \\
+    \hline
+    SHA-384 & Skein-512-384 & 512 & 384 \\
+            & Skein-1024-384 & 1024 & 384 \\
+    \hline
+    SHA-512 & Skein-512-512 & 512 & 512 \\
+            & Skein-1024-512 & 1024 & 512 \\
+    \hline
+    \end{tabular}
+  \end{center}\caption{Drop-in replacements for MD5, SHA-1 and SHA-2.}
+  \label{tab:versions}
+\end{table}
+
+Skein's novel idea is to build a hash function out of a tweakable block cipher.  The use of a tweakable block cipher allows Skein to hash configuration data along with the input text in every block, and make every instance of the compression function unique.  This property directly addresses many attacks on hash functions, and greatly improves Skein's flexibility.
+
+More specifically, Skein is built from these three new components:
+
+\begin{itemize}
+\item {\bf Threefish.}  Threefish is the tweakable block cipher at the core of Skein, defined with a 256-, 512-, and 1024-bit block size.
+\item {\bf Unique Block Iteration (UBI).} UBI is a chaining mode that uses Threefish to build a compression function that maps an arbitrary input size to a fixed output size.
+\item {\bf Optional Argument System.} This allows Skein to support a variety of optional features without imposing any overhead on implementations and applications that do not use the features.
+\end{itemize}
+
+Dividing up our design in this way makes Skein easier to understand, analyze, and prove properties about.  The underlying Threefish algorithm draws upon years of knowledge of block cipher design and analysis.  UBI is provably secure and can be used with \emph{any} tweakable cipher.  The optional argument system allows Skein to be tailored for different purposes.  These three components are independent, and are usable on their own, but it's their combination that provides real advantages.  And every aspect of Skein was designed to optimize those advantages.
+
+In the following subsections, we describe each component of Skein.  While this description is comprehensive enough for a reader to understand how Skein works, many details are either hidden or glossed over.  For a complete description of Skein, see the full specification in Section~\ref{sec:full}.
+
+\subsection{The Threefish Block Cipher}
+
+Threefish is a large, tweakable block cipher \cite{LRW02}.  It is defined for three different block sizes: 256 bits, 512 bits, and 1024 bits.  The key is the same size as the block, and the tweak value is 128 bits for all block sizes.
+
+The core design principle of Threefish is that a larger number of simple rounds is more secure than fewer complex rounds.  Threefish uses only three mathematical operations---exclusive-or (XOR), addition, and constant rotations---on 64-bit words, and is very fast on modern 64-bit CPUs.
+
+Figure~\ref{fig:mixingfunction} illustrates the core of Threefish: a simple non-linear mixing function, called MIX, that operates on two 64-bit words.  Each MIX function consists of a single addition, a rotation by a constant, and an XOR.
+%
+\begin{figure}[htbp]
+\begin{center}
+\includegraphics{skein-31.mps}
+\end{center}
+\caption{The MIX function.} \label{fig:mixingfunction}
+\end{figure}
+
+Figure~\ref{fig:Threefish512} shows how MIX functions are used to build Threefish-512.  Each of Threefish-512's 72 rounds consists of four MIX functions followed by a permutation of the eight 64-bit words.  A subkey is injected every four rounds.  The word permutation, ``Permute,'' is the same for every round; the rotation constants are chosen to maximize diffusion and repeat every eight rounds.
+%
+\begin{figure}[htbp]
+\begin{center}
+\includegraphics{skein-33.mps}
+\end{center}
+\caption{Four of the 72 rounds of the Threefish-512 block cipher.} \label{fig:Threefish512}
+\end{figure}
+
+The key schedule generates the subkeys from the key and the tweak.  Each subkey consists of three contributions: key words, tweak words, and a counter value.  To create the key schedule, the key and tweak are each extended with one extra parity word that is the XOR of all the other words.  Each subkey is a combination of all but one of the extended key words, two of the three extended tweak words, and the subkey number as shown in Figure~\ref{fig:ThreefishSubkey}.  Between subkeys, both the extended key and extended tweak are rotated by one word position. (For more details, see Section~\ref{sec:keyschedule}.)  The entire key schedule can be computed in just a few CPU cycles, which minimizes the cost of using a new key---a critical consideration when using a block cipher in a hash function.
+%
+\begin{figure}[htbp]
+\begin{center}
+\includegraphics{skein-61.mps}
+\end{center}
+\caption{Constructing a Threefish subkey.} \label{fig:ThreefishSubkey}
+\end{figure}
+
+Figure~\ref{fig:Threefish256} shows Threefish-256.  Threefish-1024 is similar, except that it has eight MIX functions per round and 80 rounds total.  The rotation constants and round permutations are different for each Threefish version, and were selected to maximize diffusion across the entire Threefish block.  (See Section~\ref{sec:threefishdesign} for details on how the rotation constants and permutations were chosen.)
+%
+\begin{figure}[htbp]
+\begin{center}
+\includegraphics{skein-32.mps}
+\end{center}
+\caption{Four of the 72 rounds of the Threefish-256 block cipher.} \label{fig:Threefish256}
+\end{figure}
+
+The nonlinearity in Threefish comes from the carry bits in the additions, each of which is a majority function of two input bits and another carry bit.  The MIX/permute structure has been designed to provide full diffusion in 9 rounds for Threefish-256, 10 rounds for Threefish-512, and 11 rounds for Threefish-1024.  At 72 and 80 rounds, Threefish has more full diffusions than most other block ciphers.
+
+\subsection{The UBI Chaining Mode}
+
+The Unique Block Iteration (UBI) chaining mode combines an input chaining value with an arbitrary length input string and produces a fixed-size output.  The easiest way to explain this is with an example.  Figure~\ref{fig:UBI} shows a UBI computation for Skein-512 on a 166-byte (three-block) input, which uses three calls to Threefish-512.
+%
+\begin{figure}[htbp]
+\begin{center}
+\includegraphics{skein-41.mps}
+\end{center}
+\caption{Hashing a three-block message using UBI mode.} \label{fig:UBI}
+\end{figure}
+
+Message blocks $M_0$ and $M_1$ contain 64 bytes of data each, and $M_2$ is the padded final block containing 38 bytes of data. The tweak value for each block encodes how many bytes have been processed so far, and whether this is the first and/or last block of the UBI computation.  The tweak also encodes a ``type'' field---not shown in the figure---that is used to distinguish different uses of the UBI mode from each other.
+
+The tweak is the heart of UBI.  By using a tweakable cipher, UBI chaining mode ensures that every block is processed with a unique variant of the compression function.  This stops a large variety of cut-and-paste attacks; a message piece that produces one result in one location will produce a different result in a different location.
+
+UBI is a variant of the Matyas-Meyer-Oseas \cite{MMO85} hash mode.  Unlike many other modes, the message input to the hash function is the same as the plaintext input to the block cipher.  Since the attacker has the greatest control over the message input, this provides an additional level of security.
+
+\subsection{Skein Hashing}
+
+Skein is built on multiple invocations of UBI.  Figure~\ref{fig:SkeinSerialHashing} shows Skein as a straightforward hash function.  Starting with a chaining value of 0, there are three UBI invocations: one each for the configuration block, the message (up to $2^{96}-1$ bytes long), and the output transform.
+%
+\begin{figure}[htbp]
+\begin{center}
+\includegraphics{skein-51.mps}
+\end{center}
+\caption{Skein in normal hashing mode.} \label{fig:SkeinSerialHashing}
+\end{figure}
+%
+
+The 32-byte configuration string encodes the desired output length and some parameters to support tree hashing.  If Skein is used as a standard hash function---a fixed output size and no tree hashing or MAC key---the result of the configuration block UBI computation is constant for all messages and can be precomputed as an IV.  A list of suitable precomputed chaining values is given in Appendix~\ref{sec:initialchainingvalues}.
+
+The output transform is required to achieve hashing-appropriate randomness.
+It also allows Skein to produce any size output up to $2^{64}$ bits. If a single output block is not enough, run the output transform several times, as shown in Figure~\ref{fig:SkeinLargeOutput}.
+%
+\begin{figure}[htbp]
+\begin{center}
+\includegraphics{skein-52.mps}
+\end{center}
+\caption{Skein with larger output size.} \label{fig:SkeinLargeOutput}
+\end{figure}
+%
+The chaining input to all output transforms is the same, and the data field consists of an 8-byte counter.  Essentially, this uses Threefish in counter mode.  Producing large outputs is often convenient, but---of course---the security of Skein is limited by the internal state size.
+
+\subsection{Optional Arguments}
+
+In order to increase the flexibility of Skein, several optional inputs can be enabled as needed.  These options are all driven by real-world applications we have worked on.
+
+\begin{itemize}
+\item {\bf Key} (Optional)  A key that turns Skein into a MAC or KDF function. The key is always processed first to support some of our security proofs.
+\item {\bf Configuration} (Required)  The configuration block discussed above.
+\item {\bf Personalization}  (Optional)  A string that applications can use to create different functions for different uses.
+\item {\bf Public Key}  (Optional)  Used to hash the public key when hashing a message for signing. This ties the signature hash to the public key.  Thus, this feature ensures that the same message generates different hashes for different public keys.
+\item {\bf Key Derivation Identifier} (Optional) Used for key derivation. To derive a key, provide the master key as the key input, and the identifier of the requested derived key here.
+\item {\bf Nonce}  (Optional)  Nonce value for use in stream cipher mode and randomized hashing.
+\item {\bf Message}  (Optional)  The normal message input of the hash function.
+\item {\bf Output} (Required)  The output transform.
+\end{itemize}
+
+A Skein computation consists of processing these options in order, using UBI.  Each input has a different ``type'' value for the tweak, ensuring that inputs are not interchangeable.
+
+None of these impact the performance and complexity of the basic hash function in any way; different implementations can choose which options to implement and which to ignore.
+
+Obviously, Skein can be extended with other optional arguments.  These can be added at any time, even when the function has already been standardized, as adding new optional arguments is backwards-compatible.  We welcome suggestions for other optional arguments.
+
+\subsection{Skein-MAC}
+
+The standard way to use a hash function for authentication is to use the HMAC construction \cite{HMAC1,HMAC2}.  Skein can---of course---be used with HMAC, but this requires at least two hash computations for every authentication, which is inefficient for short messages.  Skein has zero per-message overhead when used as a MAC function.
+%
+\begin{figure}[htbp]
+\begin{center}
+\includegraphics{skein-53.mps}
+\end{center}
+\caption{Skein-MAC.} \label{fig:SkeinMAC}
+\end{figure}
+
+Turning Skein into a MAC is simple, as illustrated in Figure~\ref{fig:SkeinMAC}.  Instead of starting with zero and processing the configuration block, start with zero, process the key, and then the configuration block.  Or, looking at it the other way, Skein hashing is simply Skein-MAC with a null key.  And just as Skein's output of the configuration block is a precomputable constant for a given state and output size, Skein-MAC's output of the configuration block can be precomputed for a given key.  Since the most common way to use a MAC is to authenticate multiple messages with a single key, this considerably increases performance for short messages.
+
+\subsection{Tree Hashing with Skein}
+
+When hashing very large amounts of data, the linear structure of a classical linear hash function becomes a limitation; it prevents a multi-core CPU from using multiple cores at the same time. Also, a common use of hash functions is to verify the integrity of a large amount of data.  With a linear hash function, all the data has to be verified at the same time.  This can be very inefficient, as it is often desirable to verify the integrity of only a small part of the data.
+
+A hash tree \cite{Mer87,Mer89} solves both these problems.  Rather than hashing the data as one large string, the data is cut into pieces.  Each piece is hashed, and the resulting hashes are treated as a new message.  This procedure can be applied recursively until the result is a single hash value.
+
+Skein includes an optional hash tree mode to support these types of applications.  As different applications have different requirements, there are three parameters that the application can choose among to optimize the hash tree for its particular use: the leaf node size, the tree fan-out, and the maximum tree height.  This structure is explained more fully in Section~\ref{sec:treehash}.
+
+\section{A Full Specification of Skein}\label{sec:full}
+
+This section provides a complete specification of Skein. Readers not interested in technical details might want to skip to the ``Using Skein'' section on page~\pageref{sec:using}.
+
+\subsection{Strings}
+
+When we talk about a ``string of X's,'' we mean a sequence of zero or more values, each of which has type X. For example: a string of bytes is a sequence of zero or more bytes. We write strings as comma-separated lists, and typically number the items starting at zero; for example, a string $t$ of 7 values is written:
+\[
+t = t_0, t_1, \ldots, t_6
+\]
+
+The concatenation operator $\concat$ denotes concatenation of strings. We use $0^n$ to denote a string of $n$ zeroes, where the type of zeroes (bits or bytes) will be clear from the context.
+
+\subsection{Bit and Byte Order}
+
+The order of bits and bytes is a common source of confusion in cryptographic algorithms. In short: Skein always uses the least-significant-byte-first convention. But to ensure there are no misunderstandings, we give formal definitions of our data type conversions.
+
+The basic universal data type in modern CPUs is a string of bytes. Each byte has a value in the range 0..255. A byte is also often viewed as a sequence of 8 bits $b_7, b_6, \ldots, b_0$, where each $b_i$ is either 0 or 1 and the byte value $b$ is given by:
+\[
+b := \sum_{i=0}^7 b_i \cdot 2^i
+\]
+Value $b_i$ is often referred to as ``bit $i$'' of $b$.
+
+A string of bits is stored as a string of bytes. For the hash function competition, NIST specifies a particular mapping from a string of bits to a string of bytes. Every group of 8 bits is encoded in a byte; the first bit goes into bit 7 of the byte, the next into bit 6 of the byte, etc. If the length of the bit string is not a multiple of 8, the last byte is only partially used, with the lower bit positions going unused.
+
+To convert from a sequence of bytes to an integer, we use the least-significant-byte-first convention. Let $b_0, \ldots, b_{n-1}$ be a string of $n$ bytes. We define: \symdef{ToInt}
+\begin{align*}
+  \ToInt( b_0, b_1, \ldots, b_{n-1} ) := \sum_{i=0}^{n-1} b_i \cdot 256^i
+\end{align*}
+%
+The reverse mapping is provided by the \ToBytes function: \symdef{ToBytes}
+\begin{align*}
+  \ToBytes( v, n )&:= b_0, b_1, \ldots, b_{n-1} \qquad
+  \text{where } b_i := \left\lfloor \frac{v}{256^i} \right\rfloor \bmod 256
+\end{align*}
+This function is only applied when $0 \leq v < 256^n$ so that the bytes fully encode the value $v$.
+
+We often convert between a string of $8n$ bytes and a string of $n$ 64-bit words and back. Let $b_0, \ldots, b_{8n-1}$ be the bytes. We define: \symdef{BytesToWords}
+\begin{align*}
+    \BytesToWords( b_0, \ldots, b_{8n-1} ) &:= w_0, \ldots, w_{n-1}
+    \qquad \text{where }  w_i := \ToInt( b_{8i}, b_{8i+1}, \ldots, b_{8i+7} )
+\end{align*}
+The reverse mapping is given by: \symdef{WordsToBytes}
+\begin{align*}
+  \WordsToBytes( w_0, \ldots, w_{n-1} ) := &\ToBytes( w_0, 8 ) \concat
+  \ToBytes( w_1, 8 ) \concat
+  \cdots \concat
+  \ToBytes( w_{n-1}, 8 )
+\end{align*}
+
+\subsection{A Full Specification of Threefish}
+
+Threefish is a tweakable block cipher with a block size of 256, 512, or 1024 bits. The tweak input is always 128 bits.
+
+The encryption function $E( K, T, P )$ takes the following arguments:
+\begin{parameters}
+  \item[K] Block cipher key; a string of 32, 64, or 128 bytes (256, 512, or 1024 bits).\symdef{Threefish-K}
+  \item[T] Tweak, a string of 16 bytes (128 bits).
+  \item[P] Plaintext, a string of bytes of length equal to the key.\symdef{P}
+\end{parameters}
+Threefish operates entirely on unsigned 64-bit words (i.e., values in the range $0..2^{64}-1$). All inputs are converted to strings of 64-bit words. Let $N_w$ be the number of words in the key (and thus also in the plaintext)\symdef{N_w}. The key $K$ is interpreted as key words $(k_0, k_1, \ldots, k_{N_w - 1})$, the tweak $T$ is interpreted as words $(t_0, t_1)$, and the plaintext $P$ as $(p_0, p_1, \ldots, p_{N_w - 1})$. \symdef{k_i}\symdef{t_i}\symdef{p_i}
+\begin{align*}
+  k_0, \ldots, k_{N_w-1} &:= \BytesToWords( K )\\
+  t_0, t_1 &:= \BytesToWords( T ) \\
+  p_0, \ldots, p_{N_w-1} &:=\BytesToWords( P )
+\end{align*}
+
+The number of rounds, $N_r$, is a function of the block size as shown in Table~\ref{tab:rounds}\symdef{N_r}.
+%
+\begin{table}[htbp]
+  \begin{center}
+    \begin{tabular}{|rrr|}
+    \hline
+    Block/Key & \# Words & \# Rounds\\
+    Size      & $N_w$    & $N_r$ \\
+    \hline
+     256 &  4 & 72 \\
+     512 &  8 & 72 \\
+    1024 & 16 & 80 \\
+    \hline
+    \end{tabular}
+  \end{center}\caption{Number of rounds for different block sizes.}
+  \label{tab:rounds}
+\end{table}
+%
+
+The key schedule (documented below) turns the key and tweak into a sequence of $N_r/4 + 1$ subkeys, each of which consists of $N_w$ words. We denote the words of subkey $s$ by $(k_{s,0}, \ldots, k_{s,N_w-1})$. \symdef{s}
+
+Let $v_{d,i}$ be the value of the $i$th word of the encryption state after $d$ rounds. \symdef{v_di} We start out with:
+\[
+v_{0,i} := p_i \quad \text{for $i=0,\ldots,N_w-1$}
+\]
+and then apply $N_r$ rounds numbered $d = 0, \ldots, N_r-1$. \symdef{d}
+
+For each round, we add a subkey if $d \bmod 4 = 0$. For $i=0, \ldots, N_w-1$ we have: \symdef{e_di}
+\begin{align*}
+  e_{d,i} &:=
+  \begin{cases}
+    (v_{d,i} + k_{d/4,i}) \bmod 2^{64} & \text{if $d \bmod 4 = 0$}\\
+     v_{d,i} & \text{otherwise}
+            \end{cases}\\
+%\intertext{If $d \bmod 8 \neq 0$ no key is added}
+%  e_{d,i} &:= v_{d,i} &&\text{for $i=0,\ldots,N_w-1$}
+\end{align*}
+The mixing and word permutations are defined by: \symdef{f_di}
+\begin{align*}
+  (f_{d,2j}, f_{d,2j+1}) &:= \mix_{d,j}(e_{d,2j}, e_{d,2j+1})&&\text{for $j=0,\ldots,N_w/2-1$}\\
+  v_{d+1,i} &:= f_{d,\pi(i)} &&\text{for $i=0,\ldots,N_w-1$}
+\end{align*}
+The $f_{d,i}$ values are the results of the MIX functions (defined below); and the output of the word permutation is the output of the round. The permutation $\pi()$ is given in Table~\ref{tab:wordPermutations}.
+\begin{table}
+  \begin{center}
+    \setlength{\tabcolsep}{4.5pt}
+    \begin{tabular}{@{}|r@{\,\,}r|rrrrrrrrrrrrrrrr|@{}}
+    \hline
+    &&\multicolumn{16}{c|}{$i=$}\\
+    &&\phantom{0}0&\phantom{0}1&\phantom{0}2&\phantom{0}3&\phantom{0}4&\phantom{0}5&\phantom{0}6&\phantom{0}7&\phantom{0}8&\phantom{0}9&10&11&12&13&14&15\\
+    \hline
+            & 4& 0 & 3 & 2 &  1 &&&&&&&&&&&&\\
+    $N_w =$ & 8& 2 & 1 & 4 &  7 & 6 &  5 & 0 &  3 &&&&&&&&\\
+            &16& 0 & 9 & 2 & 13 & 6 & 11 & 4 & 15 & 10 & 7 & 12 & 3 & 14 & 5 & 8 & 1 \\
+    \hline
+    \end{tabular}
+  \end{center}\caption{Values for the word permutation $\pi(i)$.}\symdef{pi}
+  \label{tab:wordPermutations}
+\end{table}
+
+The ciphertext $C$ is given by: \symdef{c_i}\symdef{C}
+\begin{align*}
+  c_i &:= (v_{N_r,i} + k_{N_r/4,i}) \bmod 2^{64} & \text{for $i=0, \ldots, N_w-1$}\\
+  C &:= \WordsToBytes( c_0, \ldots, c_{N_w-1} )
+\end{align*}
+
+\subsubsection{MIX Functions}
+
+Function $\mix_{d,j}$ has two input words $(x_0, x_1)$\symdef{x0x1} and produces two output words $(y_0, y_1)$\symdef{y0y1} using the following relations:
+\begin{align*}
+  y_0 &:= (x_0 + x_1) \bmod 2^{64}\\
+  y_1 &:= (x_1 \rol R_{(d \bmod 8),j}) \xor y_0
+\end{align*}
+where $\rol$ is the rotate-left operator. The constants $R_{d,j}$ are shown in Table~\ref{tab:rotations}.  (These constants were changed in version 1.2 of the paper.  See Appendix~\ref{sec:tweakrotconst} for details.)
+%
+\begin{table}[htbp]
+  \begin{center}
+    \begin{tabular}{|cc|rr|rrrr|rrrrrrrr|}
+    \hline
+    \multicolumn{2}{|c|}{$N_w$}&\multicolumn{2}{c|}{4}&\multicolumn{4}{c|}{8}&\multicolumn{8}{c|}{16}\\
+    \hline
+    \multicolumn{2}{|c|}{$j$}& 0 & 1 & 0 & 1 & 2 & 3 & 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7\\
+    \hline
+    \hline
+            & 0 &  14 & 16 &   46 & 36 & 19 & 37 &    24 & 13 &  8 & 47 &  8 & 17 & 22 & 37 \\
+            & 1 &  52 & 57 &   33 & 27 & 14 & 42 &    38 & 19 & 10 & 55 & 49 & 18 & 23 & 52 \\
+            & 2 &  23 & 40 &   17 & 49 & 36 & 39 &    33 &  4 & 51 & 13 & 34 & 41 & 59 & 17 \\
+    $d =$   & 3 &   5 & 37 &   44 &  9 & 54 & 56 &     5 & 20 & 48 & 41 & 47 & 28 & 16 & 25 \\
+            & 4 &  25 & 33 &   39 & 30 & 34 & 24 &    41 &  9 & 37 & 31 & 12 & 47 & 44 & 30 \\
+            & 5 &  46 & 12 &   13 & 50 & 10 & 17 &    16 & 34 & 56 & 51 &  4 & 53 & 42 & 41 \\
+            & 6 &  58 & 22 &   25 & 29 & 39 & 43 &    31 & 44 & 47 & 46 & 19 & 42 & 44 & 25 \\
+            & 7 &  32 & 32 &    8 & 35 & 56 & 22 &     9 & 48 & 35 & 52 & 23 & 31 & 37 & 20 \\
+       \hline
+    \end{tabular}
+  \end{center}\caption{Rotation constants $R_{d,j}$ for each $N_w$.}\symdef{R_dj}
+  \label{tab:rotations}
+\end{table}
+%
+
+\subsubsection{The Key Schedule}\label{sec:keyschedule}
+
+The key schedule starts by defining two additional words $k_{N_w}$ and $t_2$ by:
+\begin{align*}
+k_{N_w} &:= \TheConst \xor \bigoplus_{i=0}^{N_w-1} k_i  &\text{and}&&
+t_2 &:= t_0 \xor t_1
+\end{align*}
+
+The constant $\TheConst=\texttt{0x1BD11BDAA9FC1A22}$ ensures that the extended
+key cannot be all zeroes. Section~\ref{sec:threefishdesign} explains
+how the value of $\TheConst$ was chosen.
+
+The key schedule is now defined by: \symdef{k_si}
+\begin{align*}
+  k_{s,i} &:= k_{(s+i) \bmod (N_w+1)}                     &&\text{for $i=0,\ldots,N_w - 4$}\\
+  k_{s,i} &:= k_{(s+i) \bmod (N_w+1)} + t_{s \bmod 3}     &&\text{for $i=N_w-3$}\\
+  k_{s,i} &:= k_{(s+i) \bmod (N_w+1)} + t_{(s+1) \bmod 3} &&\text{for $i=N_w-2$}\\
+  k_{s,i} &:= k_{(s+i) \bmod (N_w+1)} + s                 &&\text{for $i=N_w-1$}
+\end{align*}
+where the additions are all modulo $2^{64}$.
+
+\subsubsection{Decryption}
+
+The Threefish decryption operation is the obvious inverse of the encryption operation. Subkeys are used in reverse order and each round consists of applying the inverse word permutation followed by the inverse MIX functions.
+
+\subsection{A Full Specification of UBI}
+
+The UBI chaining mode is built on a tweakable block cipher with a block size and key size of $N_b$ bytes\symdef{N_b}, and a tweak size of 16 bytes. The function $\UBI(G, M, T_s)$ has inputs:
+\begin{parameters}
+    \item[G] a starting value of $N_b$ bytes.
+    \item[M] a message string of arbitrary bit length up to $2^{99}-8$ bits, encoded in a string of bytes.
+    \item[T_s] a 128-bit integer that is the starting value for the tweak. (See below for some restrictions on the value of $T_s$.)\symdef{T_s}
+\end{parameters}
+
+UBI processes the message in blocks using a unique tweak value for each block. The fields in the tweak are shown in Figure~\ref{fig:tweaklayout} and Table~\ref{tab:tweaklayout}.
+%
+\begin{figure}[tbph]
+\begin{center}
+%\epsfxsize \textwidth
+\includegraphics{skein-71.mps}
+\end{center}
+\caption{The fields in the tweak value.}\label{fig:tweaklayout}
+\end{figure}
+%
+%
+\begin{table}[tbph]
+  \begin{center}
+    \begin{tabular}{|llp{250pt}|}
+    \hline
+    Name & \multicolumn{1}{c}{Bits} & Description \\
+    \hline
+    Position & \phantom{00}0--\phantom{0}95 & The number of bytes in the string processed so far (including this block)\\
+    reserved &\phantom{0}96--111 & Reserved for future use, must be zero \\
+    TreeLevel & 112--118 & Level in the hash tree, zero for non-tree computations.\\
+    BitPad & 119 & Set if this block contains the last byte of an input whose length was not an
+    integral number of bytes. 0 otherwise. \\
+    Type & 120--125 & Type of the field (config, message, output, etc.)\\
+    First & 126 & Set for the first block of a UBI compression.\\
+    Final & 127 & Set for the last block of a UBI compression.\\
+    \hline
+    \end{tabular}
+    \caption{The fields in the tweak value.}\label{tab:tweaklayout}\symdef{T}
+  \end{center}
+\end{table}
+%
+To avoid having many different parameters, we treat the tweak as a single 128-bit value. This simplifies our notation but it imposes some restrictions on the value $T_s$ can have. The BitPad, First, and Final fields must be zero; the Position field must have a value such that the sum of the Position field plus the length of $M$ in bytes does not exceed $2^{96}-1$.
+
+
+If the number of bits in the data $M$ is a multiple of 8, we define $B:=0$ and $M' := M$. If the number of bits in $M$ is not a multiple of 8, the last byte is only partially used. The most significant bit positions of the last byte contain data. We pad the last byte by setting the most significant unused bit to 1 and the remaining unused bits (if any) to zero. We define $B:=1$ and let $M'$ be $M$ with the bit-padding applied.
+
+Let $N_M$ be the number of bytes in $M'$. The input is restricted to $N_M < 2^{96}$.
+
+We pad $M'$ with $p$ zero bytes until the length is a multiple of the block size, ensuring that we get at least one whole block.
+\begin{align*}
+  p &:= \begin{cases}
+            N_b & \text{if $N_M=0$}\\
+            (-N_M) \bmod N_b &\text{otherwise}
+        \end{cases}\\
+  M'' &:= M' \concat 0^p
+\end{align*}
+
+We split $M''$ into $k$ message blocks $M_0, \ldots, M_{k-1}$, each of $N_b$ bytes. The UBI result is computed as\symdef{H_i}
+\begin{align*}
+  H_0 &:= G\\
+  H_{i+1} &:= E( H_{i}, \ToBytes( T_s + \min( N_M, (i+1)N_b) + a_i 2^{126} + b_i(B2^{119} + 2^{127}), 16), M_i ) \xor M_i
+\end{align*}
+where $a_0 = b_{k-1} = 1$, all other $a_i$ and $b_i$ values are 0, $E()$ is the tweakable block cipher encryption function, and $H_k$ is the result of the UBI chaining mode.
+
+The tweak value for each block is constructed by the addition
+\[
+T_s + \min( N_M, (i+1)N_b) + a_i 2^{126} + b_i(B2^{119} + 2^{127})
+\]
+The first term is $T_s$, which specifies the TreeLevel and Type fields, and optionally provides an offset for the Position field. The $\min( N_M, (i+1)N_b)$ term modifies only the Position field. For each block, the Position field is the number of bytes processed so far, including all the bytes in the current block, plus the offset from $T_s$. The $T_s$ restrictions above ensure there is never a carry out of the Position field from this addition that could modify another field. The $a_i 2^{126}$ term sets the First flag, but only in the first block of a UBI computation. The $b_i (B 2^{119} + 2^{127})$ term does two things. For any block except the last one, $b_i = 0$ so this term does nothing. In the last block, the Final flag is set (bit position 127) and if any bit padding was applied, then the BitPad flag is set (bit position 119).
+
+\subsection{A Full Specification of Skein}\label{sec:fullSkein}
+
+\subsubsection{Type Values}
+
+Skein has many possible parameters. Each parameter, whether optional or mandatory, has its own unique type identifier and value. Type values are in the range 0..63. Skein processes the parameters in numerically increasing order of type value, as listed in Table~\ref{tab:types}.
+%
+\begin{table}[tbh]
+\begin{center}
+\begin{tabular}{|rrl|}
+\hline
+  Symbol & Value & Description \\
+\hline
+  \TypeKey & 0 & Key (for MAC and KDF)\\
+  \TypeConfig & 4 & Configuration block \\
+  \TypePers & 8 & Personalization string \\
+  \TypePK & 12 & Public key (for digital signature hashing)\\
+  \TypeKDF & 16 & Key identifier (for KDF)\\
+  \TypeNonce & 20 & Nonce (for stream cipher or randomized hashing)\\
+  \TypeMsg & 48 & Message \\
+  \TypeOut & 63 & Output\\
+\hline
+\end{tabular}
+\caption{Values for the type field.}\label{tab:types}\symdef{T_xxx}
+\end{center}
+\end{table}
+%
+
+\subsubsection{The Configuration String}\label{sec:configString}
+
+The configuration string contains the following data:
+\begin{itemize}
+  \item A schema identifier. This is a literal constant. If some other standardization body wants to define an entirely different function based on UBI and Threefish, it can choose a different schema identifier and ensure that its function is different from Skein.
+  \item A version number, to support future extensions.
+  \item $N_o$: the output length of the computation, in bits. This ensures that two Skein computations that differ only in the number of output bits give unrelated results.\symdef{N_o}
+  \item $Y_l$: Tree leaf size encoding. Set to 0 if tree hashing is not used.
+  \item $Y_f$: Tree fan-out encoding. Set to 0 if tree hashing is not used.
+  \item $Y_m$: Max tree height. Set to 0 if tree hashing is not used.
+\end{itemize}
+The values for the tree parameters are detailed in Section~\ref{sec:treehash}. The layout of the 32-byte configuration string $C$ is given in Table~\ref{tab:configlayout}.
+%
+\begin{table}[tbh]
+\begin{center}
+\begin{tabular}{|rrlp{250pt}|}
+  \hline
+  & Size in & &\\
+  Offset & Bytes & Name & Description \\
+  \hline
+   0 & 4 & Schema identifier & The ASCII string ``SHA3'' \\
+     &   &    & = (0x53, 0x48, 0x41, 0x33), \\
+     &   &    & or ToBytes(0x33414853,4) \\
+   4 & 2 & Version number & Currently set to 1: $\ToBytes( 1, 2 )$\\
+   6 & 2 & & Reserved, set to 0\\
+   8 & 8 & Output length & $\ToBytes( N_o, 8 )$ \\
+  16 & 1 & Tree leaf size enc.& $Y_l$\\
+  17 & 1 & Tree fan-out enc.& $Y_f$ \\
+  18 & 1 & Max. tree height & $Y_m$ \\
+  19 & 13 & & Reserved, set to 0\\
+  \hline
+\end{tabular}
+\caption{The fields in the configuration value.}\label{tab:configlayout}\symdef{Conf}
+\end{center}
+\end{table}
+
+The reserved fields are present to support future extensions in a backward-compatible way.
+
+\subsubsection{The Output Function}
+
+The function $\Output( G, N_o )$ takes the following parameters:
+\begin{parameters}
+  \item[G] the chaining value.
+  \item[N_o] the number of output bits required.
+\end{parameters}
+and produces $N_o$ bits of output.
+
+The result consists of the leading $\lceil N_o/8 \rceil$ bytes of:
+\begin{align*}
+  O := &\UBI( G, \ToBytes( 0, 8 ), \TypeOut 2^{120} ) \concat \\
+        &\UBI( G, \ToBytes( 1, 8), \TypeOut 2^{120} ) \concat \\
+        &\UBI( G, \ToBytes( 2, 8), \TypeOut 2^{120} ) \concat \\
+        &\cdots
+\end{align*}
+If $N_o \bmod 8 = 0$ the output is an integral number of bytes. If $N_o \bmod 8 \neq 0$ the last byte is only partially used.
+
+\subsubsection{Simple Hashing}
+
+A simple Skein hash computation has the following inputs:
+\begin{parameters}
+  \item[N_b] The internal state size, in bytes. Must be 32, 64, or 128.
+  \item[N_o] The output size, in bits.
+  \item[M] The message to be hashed, a string of up to $2^{99}-8$ bits ($2^{96}-1$ bytes).
+\end{parameters}
+
+Let $C$ be the configuration string defined in Section~\ref{sec:configString} with $Y_l = Y_f = Y_m = 0$.
+
+We define:
+\begin{align*}
+  K' &:= 0^{N_b} &\text{a string of $N_b$ zero bytes}\\
+  G_0 &:= \UBI( K', C, \TypeConfig 2^{120} )\\
+  G_1 &:= \UBI( G_0, M, \TypeMsg 2^{120} )\\
+  H &:= \Output( G_1, N_o )
+\end{align*}
+where $H$ is the result of the hash.
+
+\subsubsection{Full Skein}
+In its full general form, a Skein computation has the following inputs:
+\begin{parameters}
+  \item[N_b] The internal state size, in bytes. Must be 32, 64, or 128.
+  \item[N_o] The output size, in bits.
+  \item[K] A key of $N_k$ bytes. Set to the empty string ($N_k=0$) if no key is desired. \symdef{Skein-K}
+  \item[Y_l] Tree hash leaf size encoding.
+  \item[Y_f] Tree hash fan-out encoding.
+  \item[Y_m] Maximum tree height.
+  \item[L] List of $t$ tuples $(T_i,M_i)$ where $T_i$ is a type value and $M_i$ is a string of bits encoded in a string of bytes.
+\end{parameters}
+
+We have:
+\[
+L := (T_0,M_0), \ldots, (T_{t-1},M_{t-1})
+\]
+We require that $\TypeConfig < T_0$, $T_i < T_{i+1}$ for all $i$, and $T_{t-1} < \TypeOut$. An empty list $L$ is allowed. Each $M_i$ can be at most $2^{99}-8$ bits ($= 2^{96} - 1$ bytes) long.
+
+The first step is to process the key. If $N_k = 0$, the starting value consists of all zeroes. \symdef{Kprime}
+\begin{align*}
+K' &:= 0 ^ {N_b}
+\intertext{If $N_k \neq 0$ we compress the key using UBI to get our starting value:}
+K' &:= \UBI( 0^{N_b}, K, \TypeKey 2^{120} )
+\end{align*}
+
+Let $C$ be the configuration string defined in Section~\ref{sec:configString}. We compute:
+\begin{align*}
+G_0 := \UBI( K', C, \TypeConfig 2^{120} )
+\end{align*}
+
+The parameters are then processed in order:\symdef{G_i}
+\begin{align*}
+  G_{i+1} &:= \UBI( G_i, M_i, T_i 2^{120} ) \qquad \text{for $i=0, \ldots, t-1$}
+\end{align*}
+with one exception: if the tree parameters $Y_l$, $Y_f$, and $Y_m$ are not all zero, then an input tuple with $T_i = \TypeMsg$ is processed as defined in Section~\ref{sec:treehash}, rather than with straight UBI.
+
+And the final Skein result is given by:
+\begin{align*}
+  H := \Output( G_t, N_o )
+\end{align*}
+
+\subsubsection{Tree Processing}\label{sec:treehash}
+
+The message input (type \TypeMsg) is special and can be processed as a tree. Figure~\ref{fig:treeHashing} gives an example of how tree hashing works.
+%
+\begin{figure}[tbhp]
+\begin{center}
+\includegraphics{skein-81.mps}
+\end{center}
+\caption{An overview of tree hashing.}\label{fig:treeHashing}
+\end{figure}
+%
+Tree processing is controlled by the three tree parameters $Y_l$, $Y_f$, and $Y_m$ in the config block. Normally (for non-tree hashing), these are all zero. If they are not all zero, the normal UBI function that processes the \TypeMsg field is replaced by a tree hashing construction; this is a drop-in replacement of that one UBI function; all other parts of Skein are unchanged.
+
+The tree hashing uses the following input parameters:
+\begin{parameters}
+  \item[Y_l] The leaf size encoding. The size of each leaf of the tree is $N_b 2^{Y_l}$ bytes with $Y_l \geq 1$.\symdef{Y_l}
+  \item[Y_f] The fan-out encoding. The fan-out of a tree node is $2^{Y_f}$ with $Y_f \geq 1$.\symdef{Y_f}
+  \item[Y_m] The maximum tree height; $Y_m \geq 2$. (If the height of the tree is not limited, this parameter is set to 255.)\symdef{Y_m}
+  \item[G] The input chaining value. This is the $G$ input of the UBI call that the tree hashing replaces, and the output of the previous UBI function in the Skein computation.
+  \item[M] The message data.
+\end{parameters}
+We define the leaf size $N_l := N_b 2^{Y_l}$ and the node size $N_n := N_b 2^{Y_f}$.
+
+The message data $M$ is a string of bits encoded in a string of bytes. We first split $M$ into one or more message blocks $M_{0,0}, M_{0,1}, M_{0,2}, \ldots, M_{0,k-1}$. If $M$ is the empty string, the split results in a single message block $M_{0,0}$ that is itself the empty bit string. If $M$ is not the empty string, then blocks $M_{0,0}, \ldots, M_{0,k-2}$ all contain $8N_l$ bits and block $M_{0,k-1}$ contains between 1 and $8N_l$ bits.
+
+We now define the first level of tree hashing:
+\begin{align*}
+M_1 := \mathop{\Big|\Big|}\limits_{i=0}^{k-1} \UBI( G, M_{0,i}, iN_l + 1 \cdot 2^{112} + \TypeMsg 2^{120} )
+\end{align*}
+Note that in the tweak, the tree level field is set to one and the Position field is given an offset equal to the starting offset (in bytes) of the message block.
+
+The rest of the tree is defined iteratively. For any level $l =1, 2, \ldots $ we use the following rules.
+
+If $M_l$ has length $N_b$ then the result $G_o$ is defined by $G_o := M_l$.
+
+If $M_l$ is longer than $N_b$ bytes and $l = Y_m - 1$ then we have almost reached the maximum tree height. The result is defined by: \[
+G_o := \UBI( G, M_l, Y_m \cdot 2^{112} + \TypeMsg 2^{120} )
+\]
+
+If neither of these conditions holds, we create the next tree level. We split $M_l$ into blocks $M_{l,0}$, $M_{l,1}, \ldots, M_{l,k-1}$ where all blocks but the last one are $N_n$ bytes long and the last block is between $N_b$ and $N_n$ bytes long. We then define:
+\[
+M_{l+1} := \mathop{\Big|\Big|}\limits_{i=0}^{k-1} \UBI( G, M_{l,i}, iN_n + (l+1) 2^{112} + \TypeMsg 2^{120} )
+\]
+and apply the above rules to $M_{l+1}$ again.
+
+The result $G_o$ is the output of the tree hashing. It becomes the chaining input to the next UBI function in Skein. (Currently there are no types defined between \TypeMsg and \TypeOut, so $G_o$ becomes the chaining input to the output transformation.)
+
+As $Y_f \geq 1$ each node of the tree has a fan-out of at least 2, so the height of the tree grows logarithmically in the size of the message input.
+
+\section{Using Skein}\label{sec:using}
+
+In this section we describe some of the many ways in which Skein can be used, and which arguments are used for what data.  All Skein computations contain a configuration block and end with an output transform---so we will not mention them for every use---but there is also a wealth of different options.
+
+\subsection{Skein as a Hash Function}
+
+When used as a hash function, the message type is the only optional input type used. The output of configuration UBI becomes a precomputed initial chaining value.  This is the simplest use of Skein. With the variable output size it becomes a drop-in replacement for almost any existing hash function.
+
+\subsection{Tree Hashing with Skein}
+
+Implementers of tree hashing have a number of decisions to make. There are three parameters to choose: the leaf node size, the fan-out, and the maximum tree height. For efficiency, a larger leaf node size and fan-out is better; it reduces the number of nodes and thus the overhead. But large leaf nodes and high fan-out make some uses less efficient.
+
+An implementer that needs the hash function to process data at a very high data rate can use a leaf node size of a few kilobytes and a maximum tree height of 2. This allows multiple processors to each work on its own leaf node, with one processor doing the second level of the tree. Increasing the leaf node size makes this more efficient, but it increases the amount of memory needed for buffering, and will tend to increase latency.
+
+Limiting the tree height is useful when memory-limited devices are involved. When computing a tree hash incrementally, the implementation must store data for each level of the tree. Limiting the tree height allows a fixed allocation of memory for small devices.
+
+Tree hashes can also be used to create a locally verifiable and/or updatable hash. In this type of application, the message data is typically stored, as well as all the nodes of the tree hash. To verify a part of the message, only that part of the message and the tree nodes that cover it have to be verified. To modify a part of the message, the tree nodes that cover the modified data have to be recomputed. This is most efficient if the leaf node size is relatively small, and the tree fan-out is low.
+
+\subsection{Skein as a MAC}
+
+To compute a MAC, the key is used as the key input, and the message as the message input.
+
+One useful property of Skein-MAC is that a 32-bit MAC on a particular key/message pair is completely unrelated to the 64-bit MAC on the same key/message pair. This addresses a class of attacks where the attacker interferes with the algorithm negotiation between two parties, and convinces one to use a 32-bit MAC and the other to use a 64-bit MAC.  If the shorter MAC were merely the truncation of the longer MAC, the attacker might be able to divide the keyspace in half and break the 64-bit MAC. Of course, a good algorithm negotiation protocol does not allow this attack, but we've seen this type of attack work against a number of proprietary protocols that we have analyzed in the past.
+
+\subsection{HMAC}
+
+HMAC~\cite{B06,HMAC1} represents one of the most common usages of hash functions.  Skein can easily be used in HMAC mode, which will use it directly as a hash function as specified by~\cite{HMAC2}.
+
+\subsection{Randomized Hashing}
+
+To use randomized hashing \cite{HK06,D08}, use the Nonce input to specify a differentiator for every hash computation.
+
+\subsection{Skein as a Hash Function for Digital Signatures}
+
+For digital signatures, Skein allows the option of hashing the public key as well.  The message is processed into the message input and the public key into the public key input.  This forces message hashes to depend on the public key, and proves that someone with access to the actual document intended to have it signed by that key. This can be relevant in systems that process signatures on documents separately from the documents. An attacker that only sees a signature cannot extract the hash and sign the document himself. Depending on the application and situation such phantom signatures might be a problem; for example, they might allow an attacker to convince an arbitrator that he was involved in developing a document because his signatures are in the audit trail. When the public key is included in the hash, the attacker needs access to the original document to sign it, or must convince someone who has access to the document to hash it for his public key.
+
+The presence of the public key in the input to the hash also serves to slow down the rate of digital signature compromise in the case of the discovery of a collision-finding attack on the hash function.  The attacker has to reinvest effort for every public key that it wants to attack.  In contrast, when the public key is not an input to the hash, discovery of a single collision for the hash function can be used to quickly compromise a large number of signing keys.
+
+\subsection{Skein as a Key Derivation Function (KDF)}
+
+Skein can be used as a KDF \cite{Kal00,Dodis,ECC,Che08}.  To perform a key derivation, the master key is provided as the key input, and the identifier for the derived key is provided as the KDF input.  The desired key size is the output size, $N_o$, which is part of the configuration block.
+
+\subsection{Skein as a Password-Based Key Derivation Function (PBKDF)}
+
+A Password-Based Key Derivation Function is used to derive cryptographic keys from relatively low-entropy passwords. The application stores a random seed $S$, asks the user for a password $P$, and then performs a long computation to combine $S$ and $P$. This computation is deliberately inefficient, often taking something like 100 ms of CPU time. This is acceptable if a user is logging into a computer system, but an attacker that tries to guess the password has to perform 100 ms worth of computations for every password he tries. The seed $S$ ensures that the attacker cannot precompute a table of common passwords and their results; the table would have to be recomputed for every $S$ value.
+
+The most commonly used PBKDFs \cite{Kal00,ECC} use repeated hash function computations. Of course, Skein can be used in any of these constructions.
+
+Skein also provides an alternative method for PBKDFs. The password $P$ is provided as the key input. The seed $S$ is repeated a very large number of times and becomes the message input. The PBKDF result is then computed using Skein with tree parameters $Y_l=1$, $Y_f=1$, $Y_m=255$. The total size of the message input determines the speed of the PBKDF and can be chosen appropriately. (Existing PBKDFs typically have an iteration count of some sort that has the same function.)
+
+This approach is not ideal with a linear hash function; the long computation on the repeated $S$ can lose entropy with regard to the original password. The tree hashing keeps the individual UBI chains short and avoids this problem.
+
+An even simpler PBKDF is to simply create a very long repetition of $S$ and $P$; e.g., $S \concat P \concat S \concat P \concat S \cdots$, and hash that using Skein. (Any other optional data can also be included in the repetition.) This approach is not ideal with a normal hash function, as the computation could fall into a loop.  But in Skein, every block has a different tweak and is thus processed differently.
+
+\subsection{Skein as a PRNG}
+
+Skein can be used as a PRNG with the same security properties as the SP 800-90 PRNGs \cite{RNG} (as well as Yarrow \cite{KSF99} and Fortuna \cite{FS03}): After generating data from the PRNG, the state no longer contains the necessary information to recover that data.
+
+The Skein-PRNG state $S$ consists of $N_b$ bytes.  If an application requests $N$ random bytes, the PRNG computes the Skein output function using the state $S$ as the chaining input and produces $N + N_b$ bytes of output. The first $N_b$ bytes of output become the next state for the next request; the rest of the output bytes are given to the application. Once this function completes and the old $S$ state is overwritten, the PRNG can no longer recover the random bytes produced for the application.
+
+To reseed the PRNG with seed data $D$, set the state to the Skein hash of $S \concat D$ (using the natural output size).  The initial seeding of the PRNG is done by setting the state to all zeroes and performing a reseed with the provided seed data.
+
+Skein-PRNG is fast; it can produce random data at the same speed that it hashes data. For small requests, Skein-PRNG has to process a minimum of two Threefish encryptions; it is more efficient to get larger blocks of random bytes in one request and buffer the result.
+
+\subsection{Skein as a Stream Cipher}
+
+To use Skein as a stream cipher, supply the key to the key input and the nonce (that selects the key stream to generate) to the nonce input.  By convention, since the length of the desired key stream is not known in advance, set the output size in the configuration value (see Table~\ref{tab:configlayout}) to $2^{64}-1$.  Implementations can then compute any part of the key stream as desired. For encryption and decryption, the key stream is XORed with the plaintext or ciphertext.
+
+There is a fundamental difference between Skein-PRNG and using Skein as a stream cipher to generate random bits.  The outputs of a PRNG are typically not reproducible.  Skein-PRNG actually does work to ensure that once an output has been produced, the PRNG state no longer contains the necessary information to reconstruct the output.  A stream cipher creates reproducible random data.  Depending on the application, one or the other might be desirable.
+
+An application that needs random access to a large random string can use the Skein stream cipher mode in two ways.  It can use a single nonce and selectively generate output blocks, or it can include a counter in the nonce and generate a fixed size block for each nonce value.  In general, we recommend the second approach as it does not require a new API for selectively generating parts of the output string, and thus is easier to implement using an existing Skein implementation.
+
+\subsection{Personalization}
+
+All Skein applications (except the PRNG output production) can be personalized with the personalization input.  We recommend that all application designers seriously consider doing this; we have seen many protocols where a hash that is computed in one part of the protocol can be used in an entirely different part because two hash computations were done on similar or related data, and the attacker can force the application to make the hash inputs the same \cite{KSW98,FS99}. Personalizing each hash function used in the protocol summarily stops this type of attack.
+
+When using the personalization input, we recommend that applications use a unique string that starts with a date followed by an email address. The date consists of 8 digits in YYYYMMDD format (Gregorian calendar); this is immediately followed by a space, an email address owned by the creator of the application on the date specified, and a space.  After the space, the creator of the application can use any data to distinguish different applications and uses.
+
+For example, the personalization string for the application FOO might be the UTF8 Unicode string:
+\begin{center}
+\texttt{20081031 somebody@example.com FOO/bar}
+\end{center}
+where ``bar'' is the personalization within the application.
+
+This convention allows anybody to generate unique personalization strings that are distinct from all other personalization strings.  To support all languages, the string is a UTF8-encoded Unicode string\footnote{For readers unfamiliar with UTF8 and Unicode: an ASCII string with all characters $<128$ is a valid UTF8-encoded Unicode string.}.
+
+Alternatively, implementors can generate a 16-byte random value using a high-quality random number generator, and start all their personalization strings with that fixed random value.
+
+\subsection{Choosing the Output Size}
+
+For any of these Skein applications, there can be situations in which the desired output size is not known in advance. This can be resolved in two ways. The simplest way is to compute the result using the natural output size and use this as key to the stream cipher mode to produce the desired output size. Alternatively, applications can set $N_o = 2^{64}-1$ and use only as many of the output bytes as they need. In general, we recommend against this second approach, as the leading bytes of different output sizes are the same. Furthermore, it requires a non-standard implementation that can produce only part of the specified output.
+
+\subsection{Threefish as a Block Cipher}
+
+Threefish can be used as a normal block cipher in any of the well-known block cipher modes \cite{FS03}.
+
+Threefish decryption is generally slower than encryption due to the MIX function having less parallelism in the decryption direction.  Some block cipher modes use both encryption and decryption (e.g., CBC) but others use only encryption (e.g., CFB and OFB).  Since in most applications decryption happens more often than encryption, when using Threefish as a standalone cipher in a mode that requires decryption, it might be useful to switch the encrypt/decrypt direction.  But this is a minor point, given the raw speed of Threefish.
+
+Several recent block cipher modes, such as Offset Codebook (OCB) \cite{RBB03}, turn a plain block cipher into something similar to a tweakable block cipher using a value added to both the plaintext and the ciphertext. We believe that a native tweakable block cipher, like Threefish, will lead to newer, more efficient modes, where the tweak value is used directly.
+
+Other modes of operation will likely benefit from the extended input space of tweakable block ciphers (plaintext or ciphertext \emph{plus} tweak), compared to conventional block ciphers (plaintext or ciphertext \emph{only}). For example, the Counter--Cipher Feedback (CCFB) mode \cite{L05} uses a conventional block cipher for authenticated encryption. We are working on a variant of that mode, providing more efficient authenticated encryption at the same level of security, when employing a tweakable block cipher instead of a conventional one.
+
+Most block cipher modes are only secure up to the birthday bound; thus, we can expect most uses of AES to start failing after processing $2^{64}$ blocks. In general, to achieve a security level of $n$ bits it would be nice to have a block cipher with a block size of $2n$ bits.  Threefish has a large enough block size to eliminate all collision-style attacks and provide high security even when processing large amounts of data.
+
+\section{Skein Performance}\label{sec:perf}
+
+\subsection{Software Performance}
+
+Skein is designed to be fast on 64-bit CPUs.  Table~\ref{tab:SkeinSpeedSummary} gives a summary of the speed measurements we have made for large messages, in 64- and 32-bit mode on an Intel Core 2 Duo CPU, in assembly language and C.
+
+\begin{table}[htb]
+  \begin{center}
+    \begin{tabular}{|l|rrr|}
+    \hline
+     & \multicolumn{3}{c|}{Skein-} \\
+                &  256  & 512 &  1024 \\
+    \hline
+    64-bit ASM  &  7.6 &  6.1 &  6.5 \\
+    64-bit C    &  9.2 &  6.5 & 12.3 \\
+    32-bit ASM  & 18.1 & 16.6 & 20.6 \\
+    32-bit C    & 35.8 & 40.1 & 49.0 \\
+    \hline
+    \end{tabular}
+   \end{center}\caption{Summary of Skein speeds (clocks/byte).}
+  \label{tab:SkeinSpeedSummary}
+\end{table}
+
+The following series of tables gives performance figures for Skein-256, Skein-512, and Skein-1024 with a variety of message sizes.  All measurements were taken on the NIST reference platform: an Intel Core 2 Duo CPU running Windows Vista, using the Microsoft Visual C Studio 2008 compiler.  There are several different levels of loop unrolling for each version of Skein, and each table lists the result from the fastest version of the code, which is not always the fully unrolled version.
+
+The times to hash 1 and 10 bytes are the same: each is less than one block for all block sizes, and Skein requires two Threefish calls to hash a one-block message.  A 100 byte message requires five Threefish calls for Skein-256 (four for the block and one for the output transform), three Threefish calls for Skein-512, and only two for Skein-1024.
+
+For longer message lengths---1000, 10,000, and 100,000 bytes---Skein is making many Threefish calls and the true performance of the algorithm can be measured. It should be noted that these powers of ten are not multiples of the native block size, so the ``rounding'' error there affects the results somewhat.
+
+{\bf 64-bit Implementations.}  Table~\ref{tab:SkeinASM64Speed} gives performance figures for Skein, hand-coded in assembly language.  Table~\ref{tab:SkeinC64Speed} gives preliminary performance figures for Skein, coded in C.
+%
+\begin{table}[htb]
+  \begin{center}
+    \begin{tabular}{|l|rrrrrr|}
+    \hline
+    & \multicolumn{6}{c|}{Message Length (bytes)}\\
+               &    1 &   10 &  100 & 1000 & 10,000 & 100,000 \\
+    \hline
+    Skein-256  &  666 &   65 & 14.3 &  8.2 &   7.6  &   7.6 \\
+    Skein-512  & 1068 &  107 & 15.0 &  7.0 &   6.2  &   6.1 \\
+    Skein-1024 & 1902 &  191 & 19.3 &  7.8 &   6.7  &   6.5 \\
+    \hline
+    \end{tabular}
+   \end{center}\caption{Skein speeds (clocks/byte) in ASM on a 64-bit CPU.}
+  \label{tab:SkeinASM64Speed}
+\end{table}
+
+\begin{table}[htb]
+  \begin{center}
+    \begin{tabular}{|l|rrrrrr|}
+    \hline
+    & \multicolumn{6}{c|}{Message Length (bytes)}\\
+               &    1 &   10 &  100 & 1000 & 10,000 & 100,000 \\
+    \hline
+    Skein-256  &  774 &   77 & 16.6 &  9.8 &    9.2 &   9.2 \\
+    Skein-512  & 1086 &  110 & 15.6 &  7.3 &    6.6 &   6.5 \\
+    Skein-1024 & 3295 &  330 & 33.2 & 14.2 &   12.3 &  12.3 \\
+    \hline
+    \end{tabular}
+   \end{center}\caption{Skein speeds (clocks/byte) in C on a 64-bit CPU.}
+  \label{tab:SkeinC64Speed}
+\end{table}
+
+For comparison, Table~\ref{tab:SHAC64Speed} lists the performance of the SHA family in C on an Intel Core 2 Duo CPU \cite{G08a,G08b}.  At 6.5 clocks/byte, Skein-512 is more than twice as fast as SHA-512's 13.3 clocks/byte on the NIST reference platform CPU.
+%
+\begin{table}[htb]
+  \begin{center}
+    \begin{tabular}{|l|rrrrrr|}
+    \hline
+    & \multicolumn{6}{c|}{Message Length (bytes)} \\
+      & 1 & 10 &  100 & 1000 & 10,000 & 100,000   \\
+    \hline
+    SHA-1   &  677 &  74.2 & 14.0 & 10.4 & 10.0 & 10.0 \\
+    SHA-224 & 1379 & 143.1 & 27.4 & 20.7 & 20.1 & 20.0 \\
+    SHA-256 & 1405 & 145.7 & 27.6 & 20.7 & 20.1 & 20.0 \\
+    SHA-384 & 1821 & 187.3 & 19.6 & 13.7 & 13.4 & 13.3 \\
+    SHA-512 & 1899 & 192.5 & 20.6 & 13.8 & 13.4 & 13.3 \\
+    \hline
+    \end{tabular}
+   \end{center}\caption{SHA speeds (clocks/byte) in C on a 64-bit CPU.}
+  \label{tab:SHAC64Speed}
+\end{table}
+
+All of these Skein numbers are based on an implementation of Skein optimized for speed.  It is possible to trade speed for code size, allowing Skein to run on platforms with limited memory, as shown in Table~\ref{tab:SizeSpeedTrade-offs} for assembly code. Similar trade-offs exist for the C code. The code size shown is in bytes, for the Skein block processing function. The speed is given in CPU clocks per byte, and the final column indicates how many rounds of the block cipher are unrolled in the Skein block processing function. In general, the looping versions of the code are all fairly close to the speed of the fully unrolled version, which is always the fastest. Among the looping versions, the speed difference between different amounts of unrolling is very minimal---typically not even visible when rounded to the nearest tenth of clocks/byte---so unrolling 8 rounds seems to be the best option when code size is critical.  The Skein block function could also be coded with even less
+memory by not unrolling the Threefish algorithm at all, and looping it 72 or 80 times.  We have not implemented that variant.
+%
+\begin{table}[htb]
+  \begin{center}
+    \begin{tabular}{|l|r|r|r|}
+    \hline
+                     &    Code &     & Unrolled \\
+                     &    Size &Speed& Rounds \\
+        \hline				
+          Skein-256  &	  2323 & 7.6 & 72 \\
+          Skein-256  &	  1288 & 7.8 & 24 \\
+          Skein-256  &	   664 & 7.8 &  8 \\
+        \hline			  		
+          Skein-512  &	  4733 & 6.1 & 72 \\
+          Skein-512  &	  2182 & 6.4 & 24 \\
+          Skein-512  &	  1074 & 6.4 &  8 \\
+        \hline		  		
+          Skein-1024 &	 11817 & 6.5 & 80 \\
+          Skein-1024 &	  7133 & 6.9 & 40 \\
+          Skein-1024 &	  3449 & 7.1 & 16 \\
+          Skein-1024 &	  2221 & 7.0 &  8 \\
+    \hline
+    \end{tabular}
+   \end{center}\caption{Code size/speed trade-offs on 64-bit CPUs in ASM.}
+  \label{tab:SizeSpeedTrade-offs}
+\end{table}
+
+The sizes of the API functions---Init, Update, and Final---are not included in Table~\ref{tab:SizeSpeedTrade-offs}, since they are all in C and do not have any significant speed/size trade-offs. The combined code size of these API functions is roughly 1000 bytes for each Skein block size, varying slightly depending on how much function inlining the compiler chooses to do.
+% API code size =  864 1200  864 bytes for 256/512/1024, respectively
+
+{\bf 32-bit Implementations.}  
+On a 32-bit CPU, performance is slower; see
+Table~\ref{tab:SkeinASM32Speed} and Table~\ref{tab:SkeinC32Speed}.
+It should be noted that in some cases, other compilers (e.g., GCC)
+give slightly faster results for 32-bit code. The assembler numbers
+in Table~\ref{tab:SkeinASM32Speed} use the SSE2 instructions.
+%
+\begin{table}[htb]
+  \begin{center}
+    \begin{tabular}{|l|rrrrrr|}
+    \hline
+    & \multicolumn{6}{c|}{Message Length (bytes)}\\
+               &     1 &   10 &  100 & 1000 & 10,000 & 100,000 \\
+    \hline
+    Skein-256  &  1415 &  142 & 31.7 & 19.4 &   18.2 & 18.1 \\
+    Skein-512  &  2560 &  256 & 37.0 & 19.0 &   17.4 & 16.6 \\
+    Skein-1024 &  5590 &  560 & 56.2 & 23.6 &   20.9 & 20.6 \\
+    \hline
+    \end{tabular}
+   \end{center}\caption{Skein speeds (clocks/byte) in ASM in 32-bit mode.}
+  \label{tab:SkeinASM32Speed}
+\end{table}
+%
+
+%
+\begin{table}[htb]
+  \begin{center}
+    \begin{tabular}{|l|rrrrrr|}
+    \hline
+    & \multicolumn{6}{c|}{Message Length (bytes)}\\
+               &    1  &   10 &  100 & 1000 & 10,000 & 100,000 \\
+    \hline
+    Skein-256  &  2544 &  257 & 60.0 & 38.1 &   35.8 & 35.8 \\
+    Skein-512  &  5508 &  549 & 81.2 & 44.3 &   40.1 & 40.1 \\
+    Skein-1024 & 12624 & 1262 & 126  & 55.4 &   49.0 & 49.0 \\
+    \hline
+    \end{tabular}
+   \end{center}\caption{Skein speeds (clocks/byte) in C in 32-bit mode.}
+  \label{tab:SkeinC32Speed}
+\end{table}
+%
+
+For comparison, Table~\ref{tab:SHAC32Speed} lists the performance of the SHA family in C in 32-bit mode \cite{G08a,G08b}.  SHA-1, SHA-224, and SHA-256 are optimized for 32-bit words, and are faster on this platform.  SHA-384, SHA-512, and Skein are optimized for 64-bit words, and are slower on a 32-bit CPU.  But Skein-512 is still faster than SHA-512.
+%
+\begin{table}[htb]
+  \begin{center}
+    \begin{tabular}{|l|rrrrrr|}
+    \hline
+    & \multicolumn{6}{c|}{Message Length (bytes)}\\
+    & 1  & 10 &  100 & 1000 & 10,000 & 100,000\\
+    \hline
+    SHA-1    &  716 &  71.6 & 15.1 & 10.4 & 10.0 &  9.9 \\
+    SHA-224  & 1522 & 152.2 & 29.1 & 21.6 & 20.1 & 20.9 \\
+    SHA-256  & 1522 & 153.5 & 29.5 & 21.6 & 20.9 & 20.9 \\
+    SHA-384  & 5747 & 574.7 & 58.8 & 42.9 & 41.9 & 41.4 \\
+    SHA-512  & 5851 & 586.4 & 60.2 & 43.0 & 41.9 & 41.4 \\
+    \hline
+    \end{tabular}
+   \end{center}\caption{SHA speeds (clocks/byte) in C in 32-bit mode.}
+  \label{tab:SHAC32Speed}
+\end{table}
+%
+
+{\bf 8-bit Implementations.}  Table~\ref{tab:SkeinC8Speed} gives Skein's speed, using compiled C code, on an Atmel AVR$^{\textregistered}$ 8-Bit RISC processor.  The implementation unrolls the code to 8 rounds.  These speed numbers are for long messages.
+%
+\begin{table}[htb]
+  \begin{center}
+    \begin{tabular}{|l|rrll|}
+    \hline
+               & code         & clocks/ & block time & large-message \\
+               & size (bytes) & block   & @ 16 MHz  & throughput \\
+    \hline
+    Skein-256  &  22,500 & 208k   & 13 ms    &  2.5 kB/s \\
+    Skein-512  &  46,300 & 341k   & 27 ms    &  2.4 kB/s \\
+    Skein-1024 &  91,500 & 940k   & 59 ms    &  2.2 kB/s \\
+    \hline
+    \end{tabular}
+  \end{center}
+  \caption{Skein speed in C on an 8-bit CPU.}
+  \label{tab:SkeinC8Speed}
+\end{table}
+
+Table~\ref{tab:SkeinASM8Speed} contains our ASM speed estimates on the same 8-bit CPU. The corresponding results are slightly more than ten times faster than the C versions, probably due to an inefficient implementation of the 64-bit rotation in the compiler's C library. These assembly estimates are optimized for speed, not for code size. It would also be possible to cut the code size in half (or better) by sacrificing some performance.  The last row is an implementation that exploits the fact that the 256-bit state fits entirely in the 32 registers of the AVR CPU.
+%
+\begin{table}[htb]
+  \begin{center}
+    \begin{tabular}{|l|rrll|}
+    \hline
+               & code         & clocks/ & block time & large-message \\
+               & size (bytes) & block   & @ 16 MHz  & throughput \\
+    \hline
+    Skein-256  &  4,800 & 19k & 1.2 ms    &  26 kB/s \\
+    Skein-512  &  8,300 & 37k & 2.3 ms    &  28 kB/s \\
+    Skein-1024 & 13,200 & 80k & 5.0 ms    &  26 kB/s \\
+    \hline
+    Skein-256  &         & 9.5k& 0.6 ms & 54 kB/s \\
+    \hline
+    \end{tabular}
+  \end{center}
+  \caption{Skein speed estimates in ASM on an 8-bit CPU.}
+  \label{tab:SkeinASM8Speed}
+\end{table}
+
+\subsection{Hardware Performance}
+
+{\bf ASIC Implementation.}  The Skein compression function consists of five steps:
+
+\begin{enumerate}
+\item Loading the key and plaintext,
+\item Building the Threefish key schedule,
+\item Executing 72 or 80 rounds for Skein, with key injections every 4 rounds,
+\item Doing the feed-forward step,
+\item Saving the result.
+\end{enumerate}
+
+This description allows us to estimate the gate cost and performance of the Skein compression function implemented by an ASIC. We provide this estimate for Skein-512 only. Estimates for Skein-256 and Skein-1024 are analogous.
+
+The gate count for any implementation is primarily determined by step 3, so we will estimate this first:  A Threefish-512 round consists of four parallel MIX operations and a permutation. A MIX operation consists of a 64-bit XOR, a 64-bit rotate, and a 64-bit add. A 64-bit XOR can be implemented in 192 gates. A 64-bit add can be implemented in about 800 gates. This means a MIX costs about 1000 gates. The delay through this circuit is conservatively about 1 nanosecond, using a 65 nm CMOS process.
+
+Threefish defines distinct rotation constants for eight rounds, with distinct rotation constants for each MIX. Hence, it is necessary to implement 32 different MIX circuits for Threefish and Skein. Since the permutations can be implemented by simply routing the internal state appropriately, this means that the Threefish round functions collectively require about 32K gates.
+
+Threefish-512 requires storage for its internal state and the feed-forward value. Each of these can be implemented with 512 bit flip-flops at about 5K gates each. The Threefish-512 key schedule requires 768 bits of storage, including the key (chaining variable), tweak, and overall parity words. This can be implemented using 768 bits of flip-flop, costing approximately 8K gates. The multiplexers for loading and shifting all these flip-flop bit values add about another 8K gates.
+
+The Threefish-512 subkey injection can be implemented using eleven 64-bit adders (eight adders for the key words, two for the tweak words, and one for the injection count), which we estimate at approximately 9K gates. Computing the parity over the key and tweak words requires 512 two-input XOR gates (2K gates), and we assume that the key schedule values are rotated using shift registers after each key injection.
+
+This gives an estimated gate count of roughly $32+5+5+8+8+9+2 = 69$K gates. The actual gate count will probably be somewhat higher due to additional routing area required by the fixed rotations, so the overall equivalent chip area might be closer to about 80K gates. The delay through the circuit would be 8 nanoseconds, which we round up to 10 nanoseconds (100 MHz) to be conservative.
+
+Skein-512 simply iterates its compression function to hash a string longer than one block, and would require 10 clocks per block (9 clocks for 72 rounds, plus one for setup), or 10M blocks/second. This gives a total throughput of roughly 5 Gb/s. It should be noted that a custom layout, particularly of the adders, could probably increase this performance by more than a factor of two.
+
+At the time of writing, the fastest Intel Core 2 CPU can be clocked at 3.4 GHz. At 6.1 cycles/byte, each core can hash data at around  500 MB/s or 4 Gb/s. Thus, ASIC hardware is not much faster than a fast CPU core, although it might be far cheaper and use far less power. At first glance, it is surprising that a software implementation would be that fast, but modern CPUs use highly specialized layouts and cutting-edge chip technologies, whereas ASICs are often made with standard cell libraries and older (cheaper) chip technologies.
+
+Obviously, a company like Intel could use the same chip technology found in CPUs to make faster Skein hardware, but we doubt that will ever happen.
+
+{\bf FPGA Implementation.}  We are building a reference FPGA Skein implementation; our technical report will be available before the NIST Hash Workshop in February 2009.
+
+\subsection{Threefish Software Performance}
+
+Table~\ref{tab:ThreefishSpeed} gives preliminary relative performance figures for Threefish---both encryption and decryption---in C, on the NIST Reference platform CPU in 64-bit mode. These numbers are for Skein using Threefish encryption versus Skein using Threefish decryption. That is, both operations include the plaintext feed-forward and the key schedule, so the encryption number here is identical to the Skein performance. The point is to show the relative slowdown of using decryption. Encryption-only and decryption-only versions would each be slightly faster.
+%
+\begin{table}[htb]
+  \begin{center}
+    \begin{tabular}{|l|rr|}
+    \hline
+    & \multicolumn{2}{c|}{Speed} \\
+    & Encrypt & Decrypt \\
+    \hline
+    Threefish-256  &  9.2 & 13.5 \\
+    Threefish-512  &  6.5 &  7.7 \\
+    Threefish-1024 & 12.3 & not implemented \\
+    \hline
+    \end{tabular}
+  \end{center}
+  \caption{Threefish speeds (clocks/byte) in C on an Intel Core 2 Duo CPU.}
+  \label{tab:ThreefishSpeed}
+\end{table}
+
+Of course, Threefish would be faster in ASM.
+
+\subsection{The Word Size As a Tunable Parameter}
+
+All versions of Skein are specified with 64-bit words. The word size can be seen as a tunable parameter; we can define a Skein variant with 32-bit words. This variant would run much faster on 32-bit CPUs, but significantly slower on 64-bit CPUs.
+
+At this point, we have not searched for rotation or permutation constants for a 32-bit variant, nor have we analyzed it to determine how many rounds would be required for security. However, given the knowledge obtained from the 64-bit variants, this would not be complicated.
+
+\section{Skein Security Claims}
+
+\subsection{Basic Security Claims for Skein}
+
+Skein has been developed to be secure for a wide range of applications, including but not limited to digital signatures, key derivation, pseudorandom number generation, and stream cipher usage. Skein supports personalized and randomized hashing. Under a secret key, Skein can be used for message authentication and as a pseudorandom function.
+
+Below, we write $n$ for the state size, and $m$ for the minimum of state and output size. We claim the following levels of security against standard attacks\footnote{Our claims regarding collision resistance, pseudo-collision resistance and corresponding near misses follow Rogaway's formalism \cite{Ro06}.}:
+
+\begin{itemize}
+\item First preimage resistance up to $2^m$.
+\item Second preimage resistance up to $2^{m}$.
+\item Collision resistance up to $2^{m/2}$.
+\item Resistance against $r$-collisions up to roughly $\min\{2^{n/2},2^{(r-1)m/r}\}$. (An $r$-collision consists of $r$ different messages $M_1$, \ldots, $M_r$ with $H(M_1)= \cdots= H(M_r)$.)
+\end{itemize}
+
+Furthermore, we make the following security claims for Skein:
+
+\begin{itemize}
+\item
+When used as a message authentication code (MAC) or as a pseudorandom function, either via the HMAC construction or by using Skein's native MAC/PRF support under a secret key, we claim resistance to key recovery, forgery, or distinguishing attacks up to $\min(2^{n/2},2^m)$.
+
+\item
+For randomized hashing, we claim security up to $2^m$ against the following eTCR attack scenario of~\cite{HK06}: The attacker chooses a message $M_1$ and receives $r_1$ and $H_{r_1}(M_1)$, the randomized hash of $M_1$. Here $r_1$ is an $n$-bit random value not chosen by the adversary. Now the attacker has to find
+an $r_2$ and a  message $M_2$ with
+$H_{r_2}(M_2)=H_{r_1}(M_1)$.
+
+\item
+Old Merkle-Damg{\aa}rd hash functions suffer from a length extension property: Given $H(M)$, without knowing anything about $M$ except for its length, it is feasible to compute an extension $E$ and the hash $H(M||E)$. This kind of attack succeeds with probability 1 for SHA-256 and SHA-512, for example.
+
+Skein's UBI mode defends against length extension. If the entropy of $M$ is sufficiently large, such that the adversary cannot guess $M$, the probability of success for a length extension attack is roughly $2^{-m}$.
+\end{itemize}
+
+In addition to exact collisions, preimages and second preimages for the hash function, near misses are also relevant. For example, a near-collision with Hamming-weight $h\ge 1$ consists of two messages $M \neq M'$ with $H(M) \neq H(M')$, where $n-h$ of the bits in $H(M)$ and $H(M')$ are the same, and $h$ bits differ.
+
+Computing a near miss may be simpler than computing an exact hit, but if it is too simple, this indicates a weakness. For Skein, we claim that finding a near miss (i.e., a near-collision, a near-preimage or a near-second-preimage) is no more than
+\[
+\binom{n}{h} =  \frac{n!}{h! \cdot (n-h)!}
+\]
+times faster than the corresponding exact hit.
+
+\subsection{The Security of Skein's Compression Function and the Threefish Block Cipher}
+
+We make the following claims about the block compression function inside Skein, as used by the UBI mode. Following an old tradition from cryptography, attacks which deal with the compression function rather than the hash function are marked by the prefix ``pseudo.''
+
+\begin{itemize}
+\item Pseudo first-preimage resistance of $2^n$, where $n$ is the size of the chaining value.
+\item Pseudo second-preimage resistance of $2^n$, where $n$ is the size of the chaining value.
+\item Pseudo-collision resistance of $2^{n/2}$, where $n$ is the size of the chaining value.
+\item Resistance against $r$-pseudo-collisions up to roughly $2^{(r-1)n/r}$.
+\end{itemize}
+
+For the collision resistance of UBI, we restrict ourselves to collisions in which the starting positions in the starting tweaks are identical, where $n$ is the size of the chaining value.  This provides an additional line of defense: We claim security against pseudo-collisions in general, but even the ability to find pseudo-collisions would not allow an adversary to break Skein, if these colliding inputs for the Skein compression function have different tweaks.
+
+Security against near misses for the compression function may degenerate by the same factor $\binom{n}{h}$ we claimed for near misses against the Skein hash function.
+
+Furthermore, we claim Threefish to be secure against all standard attacks against a tweakable block cipher: chosen-plaintext attacks, related-key attacks, chosen-tweak attacks, and so on.
+
+\subsection{Security Proofs}
+
+The claims made about Skein's security are backed by proofs \cite{BK09}.  Here we briefly explain what these proofs mean and provide.
+
+The base (also called atomic) primitives underlying Skein are the tweakable block cipher Threefish and its derived compression function.  Skein is built on top of these.  A proof that Skein possesses some security property S is a proof of a statement of the form: ``If the atomic primitive has security property A, then Skein is guaranteed to have security property S.'' The proof takes the form of a reduction that, given an attacker violating property S of Skein, constructs an attacker violating property A of the atomic primitive. We will be providing such proofs for various choices of S.
+
+It should be understood that a proof of security does not say that it would be impossible to find attacks violating security property S for Skein.  What it says is that it would be impossible to find such attacks without uncovering attacks violating security property A of the atomic primitive.  The proof transfers confidence from the atomic primitive to Skein.  It validates the mode of operation, meaning the higher-level design.  It says there are no flaws in this design.  The practical consequence is that cryptanalysis can be confined to the atomic primitives.  There is no need to attempt to attack Skein itself.  One might as well invest effort in attacking Threefish and the compression function.
+
+The first and most basic property about which we have proofs is collision resistance.  However, this isn't the only security property we support via proofs.  A look at the contemporary usage of hash functions makes it clear that they are used in ways that call for security properties well beyond, and different from, collision resistance.  In particular, hash functions are used for message authentication (e.g.~HMAC~\cite{HMAC1,B06}) and as pseudorandom functions (PRFs) in key derivation. (These usages refer to keyed versions of the hash function.) They are also used to instantiate random oracles in public-key cryptography schemes.  We believe this type of usage will continue, and modern hash functions should support it.  This is the design philosophy that underlies Skein.
+
+We approach providing provable support for these additional properties by showing that the mode of operation underlying Skein is MPP (Multi-Property Preserving) \cite{BR06}. This means that a number of different security attributes, if possessed by the atomic primitive, are guaranteed to be possessed by Skein. The first such property is collision resistance.  The second is pseudo-randomness, as a consequence of which we obtain provable support for the use of keyed Skein as a KDF and MAC.  The third is indifferentiability from a random oracle.
+
+One of the most widespread current usages of hash functions is for HMAC~\cite{HMAC1,HMAC2}.  This use is supported by proofs of security for the current generation of hash functions that use Merkle-Damg{\aa}rd mode~\cite{HMAC1,B06}.  We expect that any future hash function will continue to be utilized in HMAC mode and that such use should continue to be supported by proofs of security.  We supply these proofs.
+
+We also provide provable support for the use of Skein as a PRNG and as a stream cipher.
+
+Although the outcomes of proofs in this document are discussed in a qualitative sense, the theorems and proofs in \cite{BK09} provide concrete reductions; that is, a concrete quantitative analysis of the relations between the resources of an adversary, and the adversarial advantage. 
+
+Table~\ref{tab:provable-attributes} summarizes the provable security results regarding Skein.  For each property, we indicate the assumption on the atomic primitive under which it is established. We now discuss these items in more detail. The formal definitions, result statements, and proofs that back up the claims made below will be provided in a supporting document that will be available before the NIST Hash Workshop in February 2009 \cite{BK09}.
+%
+\renewcommand{\arraystretch}{1.3}
+\begin{table}[tbph]
+\begin{center}
+\begin{tabular}{|l|l|}
+\hline
+Skein Property / Mode & Assumption on Atomic Primitive  \\
+\hline
+Hash (collision resistance) & The compression function, $C$, \\
+                            & is collision resistant  \\
+PRF & Threefish is a (tweakable) PRP \\
+KDF & Threefish is a (tweakable) PRP \\
+MAC & Threefish is a (tweakable) PRP \\
+Indifferentiability from random oracle & Threefish is an ideal (tweakable) cipher  \\
+HMAC & Threefish is a (tweakable) PRP \\
+PRNG & Threefish is a (tweakable) PRP \\
+Stream cipher & Threefish is a (tweakable) PRP \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Summary of provable security attributes of Skein.}
+\label{tab:provable-attributes}
+\end{table}
+\renewcommand{\arraystretch}{1}
+
+{\bf Collision resistance.}  We prove that if the compression function is collision resistant, then so is Skein.  (Referring to the above discussion, here S is the collision resistance of Skein and A is the collision resistance of the compression function.) The implication is that it is no easier to find collisions for Skein than for its compression function.  Given that (strengthened) Merkle-Damg{\aa}rd~\cite{MD,Mer89a}, used in the SHA family, is backed by a similar security guarantee, such a guarantee would seem to be a necessary requirement for a new hash function.  We are asserting that we can provide this.
+
+{\bf PRF, MAC, and KDF.}  We prove that if Threefish is a tweakable PRP (pseudorandom permutation), then Skein is a PRF. It is important to understand that we are referring, in this context, to the keyed version of Skein.  The PRF property is that the input-output behavior of keyed Skein should look like that of a random function to an attacker \textit{who is not given the key.}  This proof supports the usage of keyed Skein for key derivation (KDF).  It also supports the use of keyed Skein as a MAC.  This is true because any PRF is a secure MAC~\cite{BKR94}.
+
+The PRF property reflects the increased versatility of Skein compared to the SHA family.  The functions in the latter family are not PRFs when keyed in the natural way; namely, via the initialization vector.  This is because of the extension attack.
+
+We highlight an attractive feature of the proof of PRF security.  Namely, the assumption made pertains to the (tweakable) block cipher rather than to the compression function.  Additionally, this is the standard assumption on a tweakable block cipher: that it is a PRP.  Indeed, in the case of other modes such as EMD~\cite{BR06} that are PRF preserving, the assumption is that the compression function is a PRF, which relies on the underlying block cipher being a PRF when keyed through the message rather than the key port.  The difference in Skein arises because the compression function runs the block cipher in Matyas-Meyer-Oseas mode.
+
+We emphasize that we provide provable support for the use of keyed Skein as a MAC. This is by dint of the fact that we show keyed Skein is a secure MAC, under the assumption that Threefish is a PRP. (This in turn is because, as indicated above, under this assumption, keyed Skein is a PRF, and any PRF is a secure MAC.)
+
+A novel feature of Skein in these modes is the variable output length.  The desired output length is one of the inputs to the hash function.  Skein has been designed so that its output values are independent for different values of this output length parameter, even if other inputs (such as the message) are the same.  This attribute of Skein is also supported by the security proofs.  We define the (new) concept of a VOL (Variable Output Length) PRF.  This is what the proofs show Skein to achieve, under the assumption that Threefish is a PRP.
+
+Keyed Skein is a fast alternative to HMAC-Skein with regard to providing a PRF and secure MAC.  To support legacy applications, however, we will also support HMAC-Skein via proofs.
+
+\iffalse
+We do not show that (keyed) Skein is MAC-preserving.  (This would mean showing that keyed Skein is a MAC assuming only that the block cipher or compression function is itself a MAC.)  This should not be taken to mean that we do not provide provable support for the use of keyed Skein as a MAC.  We do.  This is by dint of the fact that we show that keyed Skein is a secure MAC under the assumption that Threefish is a PRP. (This in turn is because, as indicated above, under this assumption, keyed Skein is a PRF, and any PRF is a secure MAC.) In other words, we do get MAC security, but under an assumption that is slightly stronger than the assumption that the block cipher or compression function is a MAC.  The difference is that  MAC-preservation
+\fi
+
+{\bf Indifferentiability from a random oracle.}  We prove that the Skein mode of operation preserves indifferentiability from a random oracle.  This has, since~\cite{C05,BR06}, become an important requirement for hash functions, due to their use for instantiating random oracles.
+
+What the results say is that if we replace Threefish with an ideal block cipher, the resulting hash function produced by the Skein mode of operation behaves like a random oracle.  Technically, it is indifferentiable from a random oracle.  Indifferentiability~\cite{M04a,C05} is a technical term underlain by a formal definition.  If a function is indifferentiable from a random oracle, it means we can securely replace a random oracle with this function in most (not all) usages of the random oracle.
+
+This can be viewed as saying the Skein mode of operation has no structural weaknesses.  It is evidence that attacks that differentiate it from a random oracle, such as the extension attack, won't work.
+
+We should, however, add a word of warning and explanation. The result pertains to the mode of operation, not to the block cipher.  In the proof, the latter has been replaced by an ideal block cipher.  The subtle point here is that there is no formal notion or assumption that we can state to capture ``Threefish is, or approximates, an ideal block cipher.''  This result is different from the other results discussed above.  It is, for example, perfectly meaningful to say that Threefish is a PRP. We emphasize that the subtleties associated with indifferentiability are not peculiar to our results, but instead are endemic to the notion as a whole.  They are, and will be, present for any hash function for which a proof of indifferentiability from a random oracle is supplied.
+
+All this notwithstanding, the general consensus in the community is that indifferentiability buys you something.  It is just difficult to \textit{formally} say exactly what.
+
+{\bf Support for HMAC mode.}  Current hash functions are used in HMAC mode to obtain a MAC or a PRF. The widespread standardization and use of HMAC means this represents a large and important domain of hash function usage.  (HMAC is standardized via an IETF RFC \cite{hmac-rfc}, a NIST FIPS \cite{HMAC2}, and ANSI X9.71 \cite{ansi-hmac}.  It is in IEEE 802.11. It is implemented in SSL, SSH, IPsec, and TLS, among other places.) It is thus important that any new hash function continue to support usage in HMAC mode.
+
+The issue this raises with regard to proofs is as follows.  For hash functions that use Merkle-Damg{\aa}rd~\cite{MD,Mer89a} mode (in particular the MD and SHA families), HMAC mode is supported by proofs~\cite{HMAC1,B06} that arguably played an important role in the widespread and continuing adoption of HMAC. Current support for HMAC in this domain is represented by~\cite{B06}, which showed that HMAC with a  Merkle-Damg{\aa}rd hash function is a secure PRF (and hence MAC), assuming that the compression function is itself a secure PRF. If Skein is to become a replacement for current hash functions, it is important that we provide a similar provable guarantee for its usage in HMAC mode.  But since our underlying iteration method is not Merkle-Damg{\aa}rd, the previous proofs do not apply.
+
+Our contribution in this regard is to supply new proofs. These show the analog of the above-mentioned result.  Namely, if the compression function is a PRF, then so is HMAC-Skein. This means that Skein has the same provable guarantees in HMAC mode as existing hash functions.
+
+As a result, there are two different modes of operation in which Skein can provide a PRF or MAC:~HMAC mode and Skein's native keyed mode as discussed above.  The latter is faster.  However, the former needs to be supported for legacy reasons.
+
+{\bf PRNG and stream cipher.}  The target security property for a stream cipher is that of~\cite{Blum-Micali, Yao}: given a random seed, the output should be computationally indistinguishable from random. The goal for the PRNG is that it should be forward-secure, as defined by~\cite{Bellare-Yee}.  We prove both these properties under the assumption that Threefish is a PRP.
+
+\subsection{Security Above the Birthday Bound}\label{sec:birthday}
+
+There has recently been significant attention drawn to new security models for hash functions, whereby additional properties are required to defend against attacks with greater complexity than $2^{(n/2)}$.  For example, Joux found that if an attacker can expend sufficient work to find a collision in the internal state of an MD hash function, the attacker could amplify that attack to find a large number of additional collisions.  Joux called this a ``multi-collision'' attack~\cite{J04}.
+
+Similarly, we found it is possible to exploit collisions on the internal state of a hash function to find second preimages faster than one might naively otherwise expect~\cite{KS05}, and we showed how to exploit collisions on the internal state of a hash function to mount what we call ``herding'' attacks~\cite{KK06}.
+
+These ``attacks above the birthday bound'' are unique for several reasons.  First, they target traditionally non-standard properties of the hash function.  For example, whereas previous research focused on measuring how hard it would be for an attacker to find a \emph{single} collision, these new works \emph{begin} with the assumption that an attacker can find one collision, and then ask what else an attacker might be able to do with it.  Second, given the nature of these attacks, we are currently forced to argue a hash function's resistance against them using \emph{ad hoc} means, rather than proofs of security.
+
+These attacks above the birthday bound are theoretically interesting, but unimportant in practice. Designers who desire $n$ bits of security should use a hash function with at least $2n$ bits of state. This is already common practice, and it pushes these types of attacks beyond the capabilities of any attacker. The Skein state sizes are large enough to achieve this for all commonly used security levels.
+
+\subsection{Tunable Security Parameter}
+
+Although the number of Threefish rounds is specified for all Skein variants, this represents a tunable security parameter.  It would be straightforward to increase or decrease the number of rounds by multiples of four. To increase or decrease the number of rounds by a number that is not a multiple of four, we would want to investigate changing the rotation constants and the word permutation as well.
+
+\section{Implementing Skein}
+
+\subsection{Software Implementations}
+
+\subsubsection{Threefish}
+
+In software, most of the work of Threefish is in the MIX function.  For that reason, we designed it to be relatively easy to implement.  MIX is optimized for 64-bit CPUs, and implementing the MIX function on those platforms is trivial.
+
+On a 32-bit CPU, the MIX function requires a 64-bit rotation and addition. The 64-bit rotations are typically built out of four 32-bit shifts and some mask/combine operations.  On the x86 architecture, the SHLD instruction implements half of a 64-bit rotation.  The 64-bit additions are typically built from two additions, the second one using the carry bit from the first one.
+
+On an 8-bit CPU, the 64-bit addition must be built from eight 8-bit additions. The rotation is harder; most 8-bit CPUs do not have a barrel shifter and are limited to 1-bit rotations.  A 64-bit rotation is typically implemented as a byte re-order and between zero and four 1-bit left or right rotations. Each 1-bit rotation can be implemented as eight or nine 8-bit rotate-through-carry instructions.
+
+The Threefish round functions can be rolled into a nested loop or a single loop, or they can be fully unrolled.  The smallest and slowest option is a double loop: the outer loop for the rounds and the inner loop for the MIX functions in a round.  For fast implementations, Threefish is typically unrolled to 8 rounds or fully unrolled.  Once 8 rounds are unrolled, the rotation constants become fixed---they repeat every 8 rounds---and can be embedded in the code itself.
+
+The key schedule can be implemented in several ways. The simplest way is to store the expanded key and tweak, and compute each subkey when needed.  When used in Skein, Threefish only uses a key to encrypt one block, so this is also efficient.  If the same key is used many times---if Threefish is encrypting a large block of text---the subkeys can be fully precomputed.  Note that different subkeys can use the same sum of a tweak word and a key word. Implementations can precompute those values, or store the results the first time they are computed.
+
+Small memory implementations might not want to store the entire tweak.  When Threefish is used in Skein for small messages (and without tree hashing or bit padding), most of the tweak is zero.  The first few bytes contain the message length so far, and the last byte is one of two or three values.  Storing just a few bytes is enough to reconstruct the tweak value, and the necessary tweak words can be computed on the fly when they are needed.
+
+When Threefish is used for data encryption rather than hashing, decryption is slower than encryption.  As data is typically decrypted more often than it is encrypted, implementations might want to swap the two directions: using what we describe as encryption for decryption and vice versa.  There are no security implications in making this change.
+
+\subsubsection{UBI}
+
+Unless specifically necessary, we recommend that implementations support only inputs that are an integral number of bytes.  In most circumstances, odd-bit-length inputs are not used, and including the option merely complicates the coding and testing.  It is easy to not support odd bit lengths; just ignore the issue.  There is no bit padding to apply, and the BitPad bit in the tweak is left at zero. We stress that this is not a security issue; an implementation for arbitrary bit lengths is as secure as an implementation supporting only integral numbers of bytes.
+
+Implementations that allow messages to be processed incrementally need to buffer one block's worth of data.  This is because a block cannot be processed until it is known whether it is the last block of the message.  High-speed implementations might want to create a single loop that processes multiple blocks of data.  This avoids the overhead of a function call for every block.
+
+To process a block, the implementation needs to store the following information:
+
+\begin{itemize}
+  \item The chaining value/Threefish key
+  \item The current state of the Threefish encryption
+  \item The message block to be XORed at the end
+  \item The tweak, or information to allow it to be constructed on the fly
+\end{itemize}
+
+Thus, UBI requires slightly more than $3N_b$ bytes of memory. Low-memory implementations should consider using Skein-256, as it can be implemented in approximately 100 bytes of RAM (assuming the messages are not too long).
+
+On modern operating systems, memory areas are frequently mapped in such a way that they are accessible from multiple contexts.  For example, a kernel mode function might read data from memory in a user mode process; another thread in that process could be modifying the memory at the same time that the kernel mode thread was reading the data.
+
+This opens up a possible line of attack. An implementer might be tempted not to buffer the message block but read it twice from memory: once to start the encryption and once for the feed-forward XOR. If another thread modifies the message block between these two operations, it can inject a chosen difference in the chaining state---something that is normally not possible.  We do not know whether this leads to an attack---it seems difficult to exploit in Skein---but it violates the properties that our security proofs depend upon.  As a rule of thumb, a cryptographic algorithm should only read its inputs once, which is how the Skein code provided to NIST operates.
+
+\subsubsection{Skein}
+
+Any implementer of Skein has to choose which options to enable.  The simplest implementations only implement straight hashing with a fixed output size.  After that, the most useful options to support are probably:
+\begin{itemize}
+  \item Variable output sizes (in byte increments) up to one block
+  \item Longer outputs
+  \item Key input for a MAC
+  \item PRNG
+  \item Personalization
+\end{itemize}
+%
+We expect that the public-key field, key derivation, and tree hashing will be used less frequently.
+
+Skein defines output sizes of arbitrary bit length, but we recommend that implementations restrict themselves to whole bytes.  There are specific uses for odd bit lengths (e.g., elliptic curves) and the odd bit length provides a symmetry with the arbitrary bit length of the inputs, but in practice, we rarely see arbitrary bit length values being used.
+
+\subsection{Hardware Implementations}
+
+\subsubsection{Threefish}
+
+The core of Threefish is the MIX function. In hardware, this is straightforward to implement. To achieve high performance it is important to use a fast-carry adder and not a ripple-carry adder. Ripple-carry adders are very slow in the worst case; the carry ripples from the least significant bit to the most significant bit, which limits the maximum clock frequency. There are well-known techniques for fast carry propagation in adders, and these should be used for speed-sensitive implementations.
+
+The rotations and word permutations do not require any gates, but they do take up routing space.
+
+The most natural way to implement Threefish is to either implement 8 rounds, or the full 72 or 80 rounds. An implementation that tries to implement only 1 or 4 rounds needs to accommodate different rotation constants in each MIX, leading to a number of multiplexers.
+
+The key schedule can be implemented in several ways. The simplest one is to store the extended key and extended tweak in two shift registers and clock the shift registers once for each subkey. Note that the final state of the shift registers can be directly computed, so implementations that want to perform decryption can efficiently generate the subkeys in reverse order.
+
+\subsubsection{UBI}
+
+In hardware, UBI is implemented like any other block chaining mode. There are no special considerations, other than the need to buffer the last input block until it is known whether this is the last block of the message or not.
+
+\subsubsection{Skein}
+
+For high-speed implementations, the output transform is a problem. If the core Threefish implementation can barely keep up with incoming data, there is no time to compute the output transform between two messages. Implementations have to ensure that the core is twice as fast as the maximum data rate, have two Threefish implementations (one for the data and one for the output transform), or reduce throughput when short messages are processed.
+
+\section{Skein Design}
+
+\subsection{Design Philosophy}
+
+There were several principles that we kept in mind throughout the design
+process.
+
+{\bf Simplicity.}  Simplicity is important in any cryptographic primitive: the easier an algorithm is to understand, the easier it is to analyze.  And the easier it is to analyze, the more confidence the cryptographic community has in its analysis.  Because of this, simplicity was one of our core design goals.  We wanted a design that could be easily explained and remembered.
+
+{\bf Security per clock cycle.}  In all our design trade-offs, security per clock cycle on a 64-bit CPU was the primary measure.  This is a method for evaluating algorithms that we developed previously \cite{SW97}, and have used in the design of Twofish \cite{Twofish}, Helix \cite{Helix}, and Phelix \cite{Phelix}.
+
+{\bf Implementability on a wide range of platforms.} Any standardized hash function ought to run on as many different platforms as possible. Most critical here are low-end platforms: smartcards, embedded systems, sensor network motes, RFID-tags, and so on.  To ensure implementability on these low-end systems, we avoided hardware-expensive operations---such as multiplications---and large constant tables.  We also ensured that Skein and Threefish could be implemented in very small code size and with very limited RAM.
+
+Of course, we did not just focus on low-end platforms. We wanted Skein to perform well on modern 64-bit CPUs. Skein employs simple 64-bit operations, which allow these modern CPUs to perform several operations in parallel. (Skein-512 and Skein-1024 are better at this than Skein-256.) To support multicore architectures and grid computing, Skein provides an optional mode for tree hashing. The memory requirements for tree hashing grow linearly with the tree height. To avoid excluding low-end systems, the user can define a maximum tree height $Y_m$. For the same reason, we made sequential hashing the default and tree hashing optional.
+
+{\bf Many simple rounds.}  We considered many complications to Threefish---additional MIX operations, a more complex key schedule, and so on---but in each case our analysis showed that additional simple rounds was the better alternative.  For example, consider a more complicated MIX function.  Going from three to five operations per MIX makes the algorithm more secure, but there's an additional 66\% cost in clock cycles.  We compared this change with increasing the number of three-operation MIX rounds by 66\%, and our analysis showed that adding additional smaller rounds provided more security than making the MIX operations more complicated.
+
+There are advantages to using many simple rounds.  The resultant algorithm is easier to understand and analyze.  Implementations can be chosen to be small and slow by iterating every round, large and fast by unrolling all rounds, or somewhere in between.  Cryptographically, specific design complications may protect against a particular type of attack---differential \cite{BS94}, related-key \cite{B94,KSW96,KSW97}, etc.---but adding more rounds has the advantage that it protects against almost all attacks and thus almost always adds security.  (Slide attacks \cite{BW99, BW00} are the exception.)  This general principle can be found again and again in block-cipher cryptanalysis: more rounds defeat attacks.
+
+{\bf Maximum diffusion.}  Looking back on the general trend in cryptanalytic attacks over the past couple of decades, one aspect jumps out: they take advantage of insufficient diffusion.  Differential attacks \cite{BS94}, linear attacks \cite{M93}, and correlation attacks \cite{DGV94} are all based on the fact that the diffusion across the algorithm is uneven and incomplete.  Similarly, the recent attacks against the MD and SHA family of hash functions have at their core methods of exploiting insufficient diffusion \cite{BC04,WFLY04,KBPL05,WLFCY05,WY05,WYY05,KBPL05,K05a,K05b,K06,S06}.
+
+We designed Skein to maximize diffusion at every level, and have defined the number of rounds to be high enough to allow for many full diffusions.  Each input bit position affects every output bit position in 10 rounds for Skein-512 (9 rounds for Skein-256 and 11 rounds for Skein-1024), so the algorithm is specified with 7--8 full diffusions.  By comparison, AES-128 and Twofish have only 5 full diffusions.
+
+{\bf Simple CPU operations.}  Modern CPUs are super-scalar and can execute multiple instructions in one clock cycle.  To maximize this capability, an algorithm should only use simple operations such as addition, XOR, rotation by a constant, and so on.  As an added benefit, these operations are also efficient on smaller CPUs.
+
+Skein does not use complex CPU operations such as multiplication, rotation by a variable number of bits, or any of the multimedia extension instructions in various CPUs.  These operations are often expensive to implement in hardware and on smaller CPUs that do not provide direct support for these operations.  For example, the AES submissions Mars \cite{MARS} and RC6 \cite{RC6} used 32-bit multiplication, which is efficient on large CPUs but quite expensive in hardware and on small CPUs.  We chose not to use the AES round function, which will be available as a hardware instruction on many high-end CPUs starting in 2009 \cite{IntelAES}, for the same reason (and because older CPUs would have to rely on table lookups---see below).
+
+{\bf No table lookups.}  Modern CPUs have multi-level memory cache systems that help the processor run faster.  Unfortunately, the current designs have a side-effect in that the memory access time that one processor thread experiences is dependent on the memory locations accessed by other threads, even if those other threads are in different processes.  This provides a side channel \cite{KSWH00}: one thread receives information about what another thread is doing.  There are practical attacks where one thread can determine the cipher key used by another thread \cite{Per05}.  This is a potential problem for an encryption algorithm running in software on a modern operating system.  For example, AES has been successfully cryptanalyzed using a side channel associated with its table lookups \cite{Ber05,BM06}.
+
+Skein solves the problem by not using any table lookups at all.\footnote{There are software techniques for doing table lookups with fixed memory access patterns, but these are so inefficient that they are very rarely used.}  Or more precisely, Skein has no table lookups whose address is not predictable in advance.  A thread that uses a table of rotation constants does not leak anything other than the fact that Skein is running.  And that fact is already known from the memory access pattern of the code itself.
+
+{\bf Minimal loads and stores on reference platform.}  If an algorithm's internal state fits entirely within the CPU's registers, the CPU can run at full speed.  If, on the other hand, the internal state exceeds the registers, any implementation has to perform loads and stores to move information between the registers and memory.  Memory accesses are relatively slow, and don't add any cryptographic strength.  Furthermore, in severe cases, they can provide a side channel to the attacker \cite{Koc96,Koc99,KSWH00}.
+
+An x64 CPU has 15 available 64-bit registers.  Threefish-256 and Threefish-512 fit comfortably within these registers.  Threefish-1024 requires 16 registers, so its performance suffers slightly because it needs a few loads and stores every round.
+
+{\bf Variable internal state.}  To be able to replace SHA-512, we needed a state size of at least 512 bits.  On the other hand, some people hold that a hash function requires $n$-bit security against collision attacks, which requires an internal state size of $2n$.
+
+There is a class of attacks that relies on internal collisions of the hash function (see Section~\ref{sec:birthday}).  For an $n$-bit state, these start to be relevant when the attacker can perform $2^{n/2}$ operations. At worst, they limit the security level of the hash function to $n/2$ bits.  For $n=512$, a generic collision attack requires $2^{n/2}=2^{256}$ time, which is safe enough for any foreseeable application.
+
+Note that if the internal state size is $n$ bits and the output size is $n$ bits, we have the following undesirable property: A collision $H(X)=H(Y)$ between two messages $X \neq Y$ of the same length can be extended to a collision between longer messages $(X||Z) \neq (Y||Z)$ by appending the same string $Z$ to both messages. This has been used in the past to turn random MD5 collisions into meaningful ones \cite{K04,DL05,M04b,LW05,GIS06,SLW07}. In a more general context, Joux \cite{J04} used the same property to create huge multi-collisions very cheaply: a $2^k$-collision just needs time $k \cdot 2^{n/2}$.
+
+The main defense against that kind of attack is collision resistance---the adversary should be unable to find any collision at all. An output size of $n \ge 512$ ought to be beyond hope for the adversary. But it still would be desirable to provide a second line of defense. Even if one day finding collisions turns out to be somehow feasible---as for MD5---exploiting that weakness for creating either multi-collisions or meaningful collisions should remain infeasible. This requires us to increase the internal state size, which is the core idea for the failure-friendly ``wide-pipe'' design \cite{L04}. Thus, if we want a 256-bit hash function to be failure-friendly, we need 512 bits of internal state, and if we want a failure-friendly 512-bit hash function, we need 1024 bits of internal state.
+
+In general, we regard the internal state size as the main security parameter for a hash function. All versions of Skein support variable-sized outputs.  We provide three different versions of Skein, supporting three different internal state sizes:
+\begin{itemize}
+\item Skein-256, the low-end version, which we consider more than adequately secure for typical applications, as one would expect from a well-designed plain  256-bit hash function.
+\item Skein-512, which we feel is sufficiently secure for essentially all applications.  One can view Skein-512 as a wide-pipe 256-bit hash function, or as a plain 512-bit hash function.
+\item Skein-1024, for users who specifically require an exceptionally high level of security assurance.
+\end{itemize}
+
+We considered having a parameterized state size, but that creates considerable extra complication for very little gain. For the same reason, we dismissed designing a variant with more than 1024 bits of internal state.
+
+{\bf Flexibility.}  Hash functions are used in a dizzying variety of applications: digital signatures, message authentication codes, key derivation, pseudo-random number generators, nonce generators, integrity checkers, cookie generation, and so on.  We wanted our hash function to have the flexibility to be securely used in these widely diverse ways.
+
+\subsection{General Design Decisions}
+
+These are the basic decisions we made in the design of Skein.
+
+{\bf Stream design vs. block design.} Roughly speaking, a stream design has a continuous churning of the internal state and mixes in the message a little bit at a time, while a block design divides the message into larger blocks and thoroughly mixes each block into the internal state in turn.
+
+The commonly used hash functions, like the MD \cite{MD4, MD5} and SHA \cite{SHA, SHA-1, SHA-2} families, are all block designs.  They have a block cipher at the core, and a mode of operation that turns the block cipher into a hash function.  Some of the newer hash function designs, such as RadioGat\'{u}n \cite{Radiogatun}, are stream designs.
+
+Block designs have the advantage of being easier to analyze than stream designs.  Cryptanalysts can leverage the knowledge, tools, and techniques they have developed over the years for analyzing block ciphers.  Analyzing stream constructions is harder. In the last decade, there have only been a few serious proposals for stream hash functions, and relatively little work has been done in analyzing them.  Several of the basic tools of block cipher analysis do not apply to streaming modes.  For example, block ciphers are almost always analyzed in reduced-round versions, and it is far harder to design cryptanalytically useful reduced-strength versions of stream designs.
+
+A stream-oriented hash function---such as one in the spirit of Helix \cite{Helix} and Phelix \cite{Phelix}---could perhaps be faster than a conventional hash function based on an internal block cipher. But the additional speed---if any---might well be due to optimistic design decisions, lacking cryptanalytic experience for stream designs. Perhaps new attack techniques are just waiting for their discovery? For example, slide attacks are a well-understood tool for the cryptanalysis of block ciphers. But until very recently, slide attacks had not been considered for the analysis of hash functions. The authors of Grindahl \cite{KRT07}, another  recent stream-oriented hash function, were not aware of potential slide attacks.  It turned out that Grindahl can be attacked that way \cite{GLP08}.
+
+Given the current state of cryptanalysis, we feel that a block-oriented design is more conservative and better suited for a new standard.
+
+{\bf Tweakable block cipher.}  Although block design is better understood, a number of attacks against block-cipher-based hash functions directly attack the way that the hash functions process message blocks.  While we shied away from a streaming design, we understand that ``streamingness'' is a desirable property.  This led us towards using a tweakable block cipher.  By directly constructing our underlying cipher so that each output block is different---that a message  block yields a different result no matter where it is fed into the hash function---we produce ``streamingness'' while still using a block cipher.  Our proofs of security are extensions of existing proofs about block design.  Someone familiar with existing block ciphers can easily understand Skein as well as the security claims.
+
+{\bf Padding vs. counter.}  Hash functions need to ensure that the message length is somehow encoded into the hash.  Typically, this is done by appending the message length to the message \cite{MD}.  Our design uses a block counter rather than padding. The counter provides the same security as the message
+length, but ensures that each message block is hashed in a unique way.
+
+\subsection{Threefish Design Decisions}\label{sec:threefishdesign}
+
+This section discusses the reasons for the decisions we made in the design of Threefish.
+
+{\bf SP network.} Threefish uses an SP network \cite{Feistel} like AES \cite{AES,AES2}, rather than a Feistel network \cite{Feistel} like DES \cite{DES} or Twofish \cite{Twofish}.  An SP network has the advantage of providing more inherent parallelism, which modern CPUs can exploit with their superscalar architecture.
+
+{\bf MIX function.} Threefish's MIX function is derived from Helix \cite{Helix} and Phelix \cite{Phelix}.  Initially, we had a more complex MIX function, with 2 adds, 2 XORs, and 4 rotations.  The advantage of a more complex mixing function is that x86 CPUs, which have only 7 usable 32-bit registers, can load all of the function's inputs into registers and execute the entire MIX function without loads or stores.  However, our cryptographic analysis showed that more rounds of a simpler mixing function are more secure, for a given number of CPU clock cycles.
+
+Another candidate design included a MIX function with 3 add/XOR operations and 2 rotations, but our performance measurements also showed that---contrary to what the chip's documentation suggests---the current generation of Intel CPUs can only perform one rotate operation per clock cycle.  This limitation causes a significant speed penalty on x64 CPUs, so we abandoned it, in keeping with the principle that additional rounds more than make up for the simpler MIX function.
+
+The current MIX function has 1 rotate and 2 add/XOR operations, which can be done in 1 clock cycle (amortized) on the current generation of Intel CPUs.
+
+The basic non-linearity comes from the mixing of addition modulo $2^{64}$ and XOR.  Add and XOR are very similar at low Hamming weights (or low Hamming weight differentials), but at average Hamming weights, they are very different. The good diffusion of our design ensures that low Hamming weight values or differentials quickly diffuse to average Hamming weights.  With enough rounds, our MIX function provides excellent nonlinearity and diffusion.
+
+{\bf Rotation constants.} Our goal was to choose rotation constants that maximized diffusion across the entire cipher.  A population of 1024 candidate sets of rotation constants evolved through multiple ``generations,'' as described below. In all cases, we rejected rotation constants with value $0$, $+1$, and $-1$, since the add and XOR operations in the MIX function already provided diffusion to adjacent bits.
+
+To fill the population initially, we selected candidate sets of rotation constants that maximized the Hamming weight of a simplified version of Threefish.  In this modified version, we replaced the addition and XOR operations in the Threefish MIX function with the logical OR operation.  We then generated a random set of rotation constants and, using an all-zero plaintext, injected a single input bit difference at each input bit location.  After $R$ rounds, we measured the minimum Hamming weight of each of the $N$ output words across all input difference locations.  If the Hamming weight value was less than a threshold $W$, we rejected the rotation set and randomly chose another.  If it was greater than or equal to $W$, we saved it for the next phase.
+
+We selected values of $R$ and $W$ empirically based on the block
+size.  The general idea was to choose values that were at the knee of
+the diffusion curve.  In other words, if we chose $R$ to be too
+small, all rotation sets looked alike.  If we chose $R$ to be too
+large, the minimum Hamming weight quickly reached 64 bits.
+Similarly, if we chose $W$ to be too small, all rotation sets passed;
+and if we chose $W$ to be too large, none passed.  After some experimentation, we settled on the $(R,W)$ sets of $(8,50)$, $(8,36)$, and $(9,40)$ for Threefish-256, -512, and -1024, respectively.
+
+This initialization mechanism is important because it is much faster than running the actual Threefish rounds, primarily because this metric is rotationally invariant. That is, we actually ran the diffusion test using only a single bit difference position per word, which sped up this phase by a factor of 64. We could also have used XOR instead of logical OR here, but the former would have included cancellations and hidden the true diffusion rate of a candidate set of rotation constants, so we felt that using OR was a better choice.
+
+With the initial population of 1024 valid candidates, we employed an evolutionary algorithm, suggested by Guillaume Sevestre \cite{Sev09}, to generate the final set of rotation constants.  The evolution from generation to generation proceeded as follows. For each set in the population, we selected $K$ random plaintexts and injected a one-bit difference in each possible input bit location, using the actual Threefish round function.  We chose $K$ to be 1024: small enough to run fairly quickly, but large enough to grade the rotation sets with reasonable probability. We generated a histogram for each output bit based on whether that bit changed after $R$ rounds for each input bit difference, ignoring the key injection.  For example, in Threefish-512, this meant the histogram had an array of 512x512 (256K) entries. 
+
+For a truly random function, the expected value for each histogram entry would be $K/2$ with a binomial distribution, with $p=0.5$.  Of course, with these small values of $R$, the function is not truly random, but the goal was to choose a reasonable metric with which to grade the sets of rotation constants. When averaged over the entire
+histogram, we expect the deviation to be very small, so we defined a metric that concentrates on the maximum deviation within a single bin, to smooth out the bit-to-bit variations as much as possible.  In particular, for each set of rotation constants, we computed the maximum deviation (or ``bias''), called $B_{max}$, from $K/2$ across all histogram entries, as the search metric. In addition, we computed the maximum bias using not only rounds $0..R-1$, but also rounds $4..R+3$, to maximize diffusion from any key injection point. 
+
+The 1024 candidates in the population were then sorted based on their maximum bias. The best 64 ones (i.e., those with the lowest bias) formed the ``keep set.''  The 640 worst ones were discarded and replaced by 10 copies of each of the candidates in the keep set.  Then each candidate \emph{not} in the keep set (i.e., 960 candidates, including the 640 replicated ones) was slightly modified, which involved changing a small number (1--8) of rotation values in each candidate set at random. The number of values to change, as well as what value to change them to, were all selected at random. 
+
+The algorithm was run for approximately two days for each block size on a 3.6 GHz CPU (5500 generations for 256 bits, 1600 generations for 512 bits, and 400 generations for 1024 bits), with the keep set slowly evolving toward smaller values of the bias.  We limited the search time to get some actual rotation constants in order to proceed with analysis, since in each case the best bias values in the keep set converged to their final values (a local minimum) well before the run was complete. As a sanity check, we later ran a six-day search, and the improvements in the metric over the two-day results were inconsequential, as we expected.
+
+At this point, the candidates in the final keep set were re-graded using larger values of $K$---4096, 8192, and 16,384---to minimize the expected statistical sampling error. For each bit position during the re-grading process, we also used an input difference pattern of up to three bits, with a nonzero difference in the first bit; i.e., the bit patterns 001, 011, 101, and 111. As we expected, the difference pattern 001 usually produced the worst case bias, but we nonetheless felt it was important to measure the maximum bias across multi-bit input differences.
+
+Based on the relative rankings of the rotation constant sets in this re-grade step, we chose the winner for each block size.  A copy of our search program, along with the resulting search log files, has been submitted to NIST and is also available on the Skein website, so anyone can duplicate and verify our work.
+
+{\bf Word Permutation.}  Threefish's word permutations---one for each block size---were chosen to have the following properties:
+
+\begin{itemize}
+\item Each input word difference can affect all output words after only $\lceil \log_2(N)\rceil$ rounds, where $N=4/8/16$.
+
+\item The period of the permutation must be a divisor of 8, so that the round function can be nicely looped after two key injections.
+
+In fact, all three word permutations have a period of 2 or 4.  This means that after four iterations of $\pi()$, all the words are back where they started.  Thus, software implementations that implement $\pi()$ by merely using different registers in each round can loop after four rounds without having to add the overhead of a word shuffle to the end of the loop.
+
+\item Even words are permuted with even words, and odd words with odd words.  Due to the asymmetry between even and odd words after only one mixing step, this property was found to maximize diffusion.  This means that there are $((N/2)!)^2$ possible permutations.
+\end{itemize}
+
+We believe that any permutation that satisfies these properties is suitable for Threefish.  We performed an exhaustive search---up to renaming the words---and found two permutations each for $N=4,8,16$.  Table~\ref{tab:wordPermutations} lists the ones we chose for Threefish, and Table~\ref{tab:altwordPermutations} lists the other---not chosen---permutations.
+%
+\begin{table}[htb]
+  \begin{center}
+    \begin{tabular}{|r@{\,\,}r|rrrrrrrrrrrrrrrr|}
+    \hline
+    &&\multicolumn{16}{c|}{$i=$}\\
+    &&0&1&2&3&4&5&6&7&8&9&10&11&12&13&14&15\\
+    \hline
+            & 4& 2 & 1 & 0 &  3 &&&&&&&&&&&&\\
+    $N_w =$ & 8& 0 & 5 & 2 & 7 &  6 & 3 &  4 & 1 &&&&&&&&\\
+            &16& 2 & 1 & 4 & 9 & 6 & 15 & 0 & 3 & 10 & 13 & 12 & 11 & 14 & 7 & 8 & 5 \\
+    \hline
+    \end{tabular}
+  \end{center}\caption{Alternate values for $\pi(i)$.}
+  \label{tab:altwordPermutations}
+\end{table}
+
+{\bf Rounds and cycles.}  Threefish has an unusual design feature: like Mars \cite{MARS}, it does not inject the key every round.  Key injections are on a separate schedule of a ``cycle'' of four rounds.
+
+Like other features of Threefish, this comes from our core principle that adding rounds is usually the best way to strengthen a cipher.  Hence, for Threefish-256 and -512, a variant with 60 rounds and a 2-round cycle would run approximately as fast as the 72 rounds and 4-round cycle we finally chose. Similarly, the 80 rounds and 4-round cycle for Threefish-1024 are approximately as fast as a variant with 66 rounds and a 2-round cycle would be.  This is a trade-off that we needed to make: number of full diffusions versus number of key injections.
+
+We examined cycle sizes of 4, 6, and 8 rounds (there were no suitable word permutations with period 5 or 10) and with a number of rounds from 64 to 96 total, always in some integral number of cycles.  Our two best options were 72 rounds and 4-round cycles, and 80 rounds and 8-round cycles.
+
+Various related-key attack methods typically get one or two cycles for free (with a zero differential) and attacks on 3--4 cycles are relatively easy to construct. Our own preliminary cryptanalysis in Section~\ref{sec:prelim-analysis} will illustrate this. The 8-round variant has only 10 cycles; this leads to attacks on a significant fraction of the cipher. The advantage of 4-round cycles is that related-key attacks get fewer free rounds and are not nearly as successful.
+
+We kept the rotation constants on their own cycle of 8 rounds because it comes at no performance cost, and iterative characteristics are harder to construct. 
+
+{\bf Number of rounds.}  The number of rounds represents a balance among several different considerations: the number of key injections, the number of full diffusions, and the ratio of input bits to output bits.  That last consideration may need some explanation.  Looking at Skein generally, the hash function uses Threefish as a compression function: plaintext bits plus key bits plus tweak bits compress into output bits.  The number of input bits determines the attacker's degrees of freedom, and the attacker also gets to control the output bits (at least for pseudo-collision, pseudo-preimage, and pseudo-second-preimage attacks).  A large ratio of input to output bits helps the attacker.  Threefish-256 has a 2.5-to-1 ratio, Threefish-512 has a 2.25-to-1 ratio, and Threefish-1024 has a 2.125-to-1 ratio.  This is why the number of rounds of Skein-256 isn't less than the number of rounds of Skein-512.  Skein-1024 has more rounds because full diffusion is one round slower.
+
+The current number of rounds is intentionally conservative.  We will continue to evaluate Threefish and Skein, and may revise the number of rounds either upwards or downwards, depending on the results of our analysis.
+
+{\bf Key schedule.}  Most key schedules are complicated, and require many clock cycles to set up.  This doesn't matter when encrypting large blocks of text, but hurts performance considerably when encrypting small messages, or when changing key for every block, as UBI does.  And, as always, a more complex key schedule means fewer rounds, from the security-per-clock-cycle principle.
+
+The Threefish key schedule was inspired by Skipjack \cite{Skipjack}.  The Skipjack key schedule uses the bytes of a 10-byte key in order, cyclically.  We found the simplicity very attractive.  The Threefish key schedule is slightly more complicated than that, but it is still very simple compared to other block ciphers.
+
+Our key schedule has the following properties:
+%
+\begin{itemize}
+\item Given any subkey, it is possible to extract the full key for a known tweak and subkey number.
+\item Given any subkey, it is possible to extract the full tweak for a known key and subkey number.
+\item Given any two consecutive subkeys, it is possible to extract the full key, tweak, and subkey number.
+\item In a differential related-key attack, the distance between zero subkey differences is at least seven subkeys.
+\item The subkey values do not repeat with low period.
+\item The minimal repeat period for subkey differences is three.
+\item The key schedule can be implemented in a loop efficiently, without special branches or case statements based on the subkey number.
+\end{itemize}
+
+Recovering the key, tweak, and subkey number from two consecutive subkeys is somewhat complicated. Given the redundancy in the extended key and tweak, it is possible to recover the least significant bit of each key word, tweak word, and the subkey number. This knowledge provides all the carries going into the next bit position, which allows the recovery of the next bit of each value.
+
+{\bf Subkey counter.} The subkey counter prevents slide attacks \cite{BW99,
+  BW00} and any other attacks based on identical subkeys. It also provides
+a defense against rotational cryptanalysis. Consider applying a word-wise
+rotation by some amount $i$ on inputs $X$ and $Y$: Both XOR and
+rotation preserve the rotational difference, i.e., $(X \rol i) \xor (Y \rol i)
+= (X \xor Y) \rol i$ and $(X \rol i) \rol j = (X \rol j) \rol i$. With some 
+probability $p$, the rotation is also preserved under addition, i.e., 
+$(X \rol i) + (Y \rol i) = (X + Y) \rol i$ (here, $p=0.375$ for $i=1$, 
+$p=0.3125$ for $i=2$, and $p$ converges to $0.25$ for large $i$
+\cite{DaumThesis}). Thus, rotational attacks are possible, where the attacker
+word-wise rotates all the key words and all the plaintext words. Adding the
+subkey counter destroys such a rotational property, though. 
+
+{\bf Key schedule constant $\mathbf{\TheConst}$.} The constant
+$\TheConst$ defends against generating extended keys which are all
+zero or almost zero. It also provides an additional defense against
+rotational attacks. $\TheConst$ is the AES encryption of the plaintext
+240 (in decimal) under the all-zero 256-bit key; i.e.,
+$\TheConst=\text{AES-256}_0(240)$. 
+
+Previous versions of Threefish/Skein used the constant 
+$C_5=\text{0x555\ldots5}$. Some external cryptanalysis \cite{KN10,RKN10} made use of
+the fact that $C_5 \xor (C_5 \rol 2) = 0$. Above, we argued that the subkey
+counter destroys word-wise rotational properties. Since the counter is a tiny 
+value (i.e., if one avoids round 68 and above, the counter fits into four bits),
+the cipher still behaves \emph{approximately} as if it would preserve
+rotational properties under rotation by two (and in fact by all multiples of
+two). \cite{KN10,RKN10} introduce ``corrections'' for the subkey 
+counter additions, which are put into both the message and the key. Each 
+correction manages to preserve the approximate rotational property with some
+positive probability. 
+
+Even though the attacks from \cite{KN10,RKN10} don't endanger the full Skein, 
+we consider it prudent and responsible to replace the constant $C_5$ by 
+a ``random'' $C$, where $C \neq (C \rol i)$ for all $i \in \{1, \ldots, 63\}$.
+This is an effective defense against rotational attacks, with neither
+a positive nor a negative effect on most other types of attack. 
+
+%In contrast to the rotation constants, we didn't try to optimise the new key
+%schedule constant, but we choose the constant at random. Firstly, if,
+%for a random constant, one could expect significant security issues which
+%would need a carefully optimised constant to disappear, then we would consider
+%the fundamental design of Threefish to be flawed. But we actually are
+%convinced that a well-chosen random constant is sufficient.  
+%Secondly, note that we did carefully optimise the rotation constants, even 
+%though we would expect almost the same security for random constants. But
+%the rotation constants have been chosen to maximise diffusion, and high
+%diffusion is a defense against almost all attacks. This is much different from
+%optimising something for resistance against a single type of attack. 
+
+We decided to choose a \emph{typical random constant}; i.e., excluding
+outliers with respect to certain metrics.  In particular, we defined ten
+functions (related to differential cryptanalysis) which we took as
+characteristic values to describe potential candidates for the new
+constant. 
+
+To find out which function values should be considered ``typical'',  
+we generated a large number of ``random'' constants (using Skein with
+$C_5$ in OFB mode) and, for each function, counted which value
+was the most frequent one---which we then used as the ``typical''
+outcome.
+
+Then we generated ``random'' words, using AES-256 in counter
+mode, under the all-zero key. The first ``random'' word whose ten 
+characteristic values were all 
+``typical'' was then chosen as the new constant
+\begin{center}
+  $\TheConst=\text{AES-256}_{\text{0x000\ldots000}}(\text{0x00\ldots0F0})
+            =\text{0x1BD11BDAA9FC1A22}$. 
+\end{center}
+Note that hexadecimal $\text{F0}$ is decimal $240$. 
+
+We stress that we deliberately avoided optimizing the constant for maximal
+resistance against rotational attacks. Applying the rotational corrections for
+4-bit values, as done in \cite{KN10,RKN10}, is quite costly, in terms of the
+attack's probability of success (or, alternatively, in terms of ``degrees of
+freedom''). Our new constant forces a rotational attacker to deal with 
+corrections for a 64-bit value, which suffices to defeat rotational
+attacks. 
+
+In general, we consider the approach of optimizing a cryptographic
+primitive to maximize resistance against a single class of attacks as
+questionable. It is easy to over-optimize and then aid other
+classes of attack.
+This contrasts with our effort in optimizing the rotation constants for 
+maximal diffusion. High diffusion is an important defense against 
+\emph{almost all} attacks, so one can hardly over-optimize there.
+
+{\bf Characteristic values for ``typical'' key schedule constants.}
+We considered two general metrics: 
+(1) the Hamming weight (i.e., the number of 1-bits) and
+(2) the ``run count'', i.e., the number of runs of consecutive all-1 bits. 
+Let $X$ be a 64-bit value. Below, we write $\mbox{hw}(X)$ for the Hamming
+weight of $X$ and $\mbox{rc}(X)$ for the run count of $X$.%
+    \footnote{E.g.,    0x010FFF000FFFF000 has three runs of consecutive ones,
+      namely 1, FFF, and FFFF, 
+      thus $\mbox{rc}$(0x010FFF000FFFF000)=3. Similarly:
+           $\mbox{rc}$(0x010888000FFFF000)=5,
+           $\mbox{rc}$(0x050555000FFFF000)=9, \ldots}
+Actually, the Hamming weight is less interesting for us than the deviation
+from the ``ideal'' Hamming weight, which is 32; thus we measure the 
+\emph{Hamming weight deviation} of $X$ from the
+``ideal'' value of 32, rather than the Hamming weight itself: 
+  \[ \mbox{hwd}(X) = \left|\mbox{hw}(X)-32\right|. \]
+
+The original constant $C_5=\text{0x555\ldots5}$ was chosen to get a typical
+Hamming weight; namely, $\mbox{hw}(C_5)=32$, or $\mbox{hwd}(C_5)=0$. However, the run
+count was very atypical; namely, $\mbox{rc}(C_5)=32$, the largest possible value. 
+When considering a new constant $C$ and 
+its resistance against rotational attacks, such as the attacks from 
+\cite{KN10,RKN10}, one deals not only with the Hamming weight
+and the run count of the constant itself, but also with the Hamming weights 
+and the run counts of 
+  \[ F(C,i) = C \oplus (C \rol i) \quad 
+    \mbox{for all}\  i \in \{1, \ldots, 63\}.
+  \]  
+We will go even further and even 
+consider some imaginable ``higher-dimensional'' rotational attacks and,
+accordingly, the values 
+\begin{eqnarray*}
+   F(C,i,j) = & C \oplus (C \rol i) \oplus (C \rol j) &
+    \mbox{for}\ 1 \le i < j \le 63,
+   \\  
+   F(C,i,j,k) = & C \oplus (C \rol i) \oplus (C \rol j) \oplus (C \rol k) &
+    \mbox{for}\ 1 \le i < j < k \le 63, \ \mbox{and}
+  \\
+    F(C,i,j,k,\ell) = & C \oplus (C \rol i) \oplus (C \rol j) 
+                      \oplus (C \rol k) \oplus (C \rol \ell) &
+    \mbox{for}\ 1 \le i < j < k <  \ell \le 63
+\end{eqnarray*}
+and their Hamming weight deviations and run
+counts. As a rule of thumb, a constant is good if the maximum Hamming weight
+deviation is not too large, and if the minimum run count is not too
+small. This made us consider 10 characteristic values for each candidate $C$:
+the maximum Hamming weight deviations (over all $i$, all $i,j$, all $i,j,k$,
+and all $i,j,k,\ell$) and the minimum run counts. 
+
+Now we had to find out which values are ``typical'' for this metric. We 
+generated 
+$2^{21}=2097152$ uniformly distributed pseudorandom 64-bit values, using Threefish
+with the old $C_5$ in OFB mode. 
+Our first requirement---inherited from the choice of $C_5$---was a Hamming
+weight of exactly 32. 
+We thus discarded all candidates $C$ with $\mbox{hw}(C) \neq 32$. This left 
+us with exactly $208308$ candidates (about 10\,\% of $2^{21}$). For these, 
+we computed the remaining 9 characteristic values and counted the ``typical'',
+i.e., most frequent outcome. 
+Tables \ref{tab:hammingweighthisto} and \ref{tab:runcounthisto} in 
+Appendix~\ref{sec:empiricalks} summarize our empirical data. 
+Table~\ref{tab:characteristicvalues} describes the final requirements the new
+constant had to meet. 
+
+Once the requirements had been fixed, we ran AES-256 in counter mode (under
+the all-zero key). The least significant half of each ciphertext was used as a 
+``random'' 64-bit word and checked to see if it met all ten
+requirements. Table \ref{tab:aescounter} describes the results we got
+until the first ``random'' word satisfied our requirements.
+
+See Appendix~\ref{sec:empiricalks} for more details on the search that
+resulted in the constant $\TheConst$.
+ 
+
+{\bf Back doors.}  Threefish and Skein have no back doors.  We understand that the super-paranoid might wonder if the rotation constants were selected so as to create a cipher with a back door in it.  To assuage those fears, we have made public the program that generated the rotation constants for anyone to run and verify.
+
+Ultimately, although there is no way to dispel this paranoia, we can offer the following comment. One of the things we know mathematically is that a block cipher with an invisible back door is equivalent to a public-key algorithm. If we had created a public-key encryption algorithm that had 512 bits of security and ran twice as fast as AES, we wouldn't be secretly using it as a block cipher.  Instead, we'd be revolutionizing public-key cryptography.
+
+\subsection{UBI Design}
+
+UBI is a variation of the cascade construction \cite{BCK96b} built upon a compression function constructed out of a tweakable block cipher.
+
+{\bf Matyas-Meyer-Oseas.} We chose Matyas-Meyer-Oseas \cite{MMO85} over Davies-Meyer \cite{MPW,QG} to simplify the mathematical security arguments.  We can prove that the compression function is a PRF, assuming that the block cipher is a PRP---a standard assumption on a block cipher.
+
+Less formally, Matyas-Meyer-Oseas is desirable because the primary attack model for a hash function allows the attacker to choose the data input.  In Davies-Meyer mode, this corresponds to a chosen-key attack on the block cipher.  In UBI, this corresponds to a chosen-plaintext attack.  The cryptographic community has a great deal of experience protecting block ciphers against chosen-plaintext attacks, but less experience in the area of chosen-key attacks.  For a new standard, it is always preferable to stay with what you know.
+
+This is even clearer when we look at attacks on the underlying block cipher. Differential attacks are probably the most important class of attacks to consider.  In UBI, a difference in a data block leads to a difference in each round.  With Davies-Meyer, a difference can be canceled out at one subkey and reintroduced at a subsequent one; it could even happen repeatedly in one block.  This gives the differential a free pass through some of the rounds, which is highly undesirable.  It also makes it much harder to provide a useful estimate for the upper bound of a differential characteristic.
+
+{\bf Tweak.}  The purpose of the tweak is to make each block operation in Skein unique.  Different Skein input fields use different field types in the modifier, and different blocks within one field use a different position value.
+
+{\bf First and final flags.}  These flags exist primarily to support our proofs of security and to simplify the security properties of UBI.  As defined, Skein would be secure without these flags.
+
+It is possible, however, to create a collision in UBI without the First flag: the hash of a two-block message, $M_1,M_2$, collides with a hash of $M_2$ and an appropriate tweak value.  This collision could not occur in Skein, as the tweak value is defined in such a way as to not permit it.  But UBI has potential applications outside of Skein, and we consider it safer to define it for more general security.
+
+{\bf Maximum message length.}  Skein is defined for messages up to $2^{96}-1$ bytes, or 64 kilo-tera-terabytes, long.  We consider this length to be long enough for the foreseeable future, and have reserved 16 bits of the tweak for future use, instead of increasing the maximum message length to $2^{112}-1$ bytes.
+
+\subsection{Optional Argument System}
+
+{\bf Configuration block.} The best way to think of the configuration block is as a method of computing the starting value for the chaining state.  Other hash function families do the same thing; for example, SHA-384 is identical to SHA-512, except that the starting value is different and the output is truncated.  Rather than define a large number of random-looking starting values, we compute them using the configuration block.
+
+{\bf Output transformation.} Originally we applied the output transformation only if the output size was larger than the state size. Unfortunately, without the output transform, you can construct two messages $M$ and $M'$ such that $H(M) \xor H(M')$ is the same as the XOR of the last blocks of $M$ and $M'$. (A similar property has recently been described for SHA-1 \cite{SS08}.)  This violates the requirement that the hash function behave like a random mapping.
+
+We chose the simplest solution to this problem: always apply the
+output transformation.  This both increases robustness and makes our
+security proofs easier, but it halves Skein's speed for hashing small
+messages. We looked at many other solutions,  such as applying a half-block fixed padding to the message. This solution made the obvious construction for the XOR-property not work, but it felt like a hack and we were not convinced that it addressed any still-undiscovered variations of that attack.  We decided to accept the performance penalty and chose a solution that addressed all our concerns.
+
+In most real-world applications, the application's own per-message overhead is already significant, and often larger than the cost of hashing a short message. Thus, the overhead of the output transformation does not decrease the practical throughput as much as one would think. The exceptions are applications like IPsec hardware, where short-message performance is very important.
+
+The output transformation is a one-way function, which isolates the output from the last point a user-chosen value affects the computation: the feed-forward of the last message block.
+
+{\bf Multiple optional arguments.}  Cryptosystems use hash functions for a plethora of purposes.  This agility requirement creates an added challenge for hash function design.  Developers will use the same hash function for radically different purposes, and---as time goes on---they will invent new ways to use that same hash function.  As cryptographers, we can caution developers to only use a hash function in certain specific ways, or not to use it for multiple purposes, but our experience shows that it doesn't work in practice.  A better alternative is to design a hash function assuming that it will be used and abused.
+
+Skein's system of optional arguments addresses this by letting the user specify the purpose of the hash function, and encoding that specification into the hash function itself, to make it unique for that purpose.  Thus, Skein-for-signatures is a slightly different hash function than Skein-for-key-derivation or Skein-for-MACs. The nonce argument also allows for building randomized hashing into the core of the hash function, which will be a boon for anyone using Skein for Tripwire-like data integrity systems \cite{KS94}.  A given host that computes file hashes can make those hashes unique for that host, something that makes the attacker's job that much harder. (Of course, the application could also use the MAC mode and use MACs rather than hashes to check the integrity of the data.) We also allow for these optional arguments to be combined.  A cryptosystem can directly use the nonce along with public-key specialization.
+
+We believe that this is an important innovation in Skein's design. We turn a source of unease about the way cryptographic engineers use hash functions into a strength.  Every purpose served by the hash function creates a unique hash function.  Additionally, engineers can trivially create their own personalized hash functions, and be assured of their cryptographic integrity.
+
+Skein can be generalized to allow the arguments in any order, or allow the same argument type to be used multiple times. Although interesting from a theoretical point of view, such flexibility is likely to lead to confusion and lack of interoperability between different implementations and applications of Skein. Furthermore, such generalizations would affect the security proofs, and require careful analysis.
+
+{\bf Key input.}  The most logical place for processing the key input would be somewhere after the configuration block. However, we chose to always process the key first to make our security proofs simpler.
+
+The security analysis is in two parts. The first UBI call maps the key into a chaining state. Assuming that UBI behaves like a random mapping (which we already require), this maps the key into a secret chaining state. From that point on, the chaining state is a key, and always goes into the key input of the Threefish block cipher. This uses the block cipher exactly as a normal block cipher is used: with a secret key and public plaintext. This simplifies the security proofs and allows them to use standard block cipher security assumptions.
+
+\section{Preliminary Cryptanalysis of Threefish and Skein}
+  \label{sec:prelim-analysis}
+
+Our Skein analysis concentrates on the security of the compression function---primarily, security against pseudo-collisions and pseudo-second-preimages---and on the security of the Threefish block cipher.  If it isn't possible to find a pseudo-collision for the compression function, it's likewise not possible to find a collision for the hash function.  Similarly, it's not possible to find preimages, second preimages, and near misses. 
+
+Furthermore, our security analysis focuses on XOR-differential characteristics.  Other algorithms that make use of Threefish's basic operations---for example, Helix \cite{Helix} and Phelix \cite{Phelix}---have proved vulnerable to differential cryptanalysis based on XOR differences \cite{HA1,HA2,HA3,PA}.
+
+We stress that the designers of a cryptosystem are not the best ones qualified to analyze their own cryptosystem for potential weaknesses. Furthermore, our own analysis has been guided by the need to decide on possible modifications of Threefish and Skein, including the number of rounds. As long as we were confident that our attacks would not extend to anything near the specified number of rounds, we did not try to push our attacks through another two or three rounds---we rather leave this to third-party cryptanalysis. In fact, by documenting our effort in analyzing Skein and Threefish\footnote{Of course, there was much more internal cryptanalysis on preliminary and alternate versions of Threefish, UBI, and Skein. While it was useful to guide our design decisions, most of it is irrelevant for the current version.}, we hope to inspire more third-party cryptanalysis.
+
+\subsection{Pseudo-Near-Collisions for the Skein-256 Compression Function 
+Reduced to Eight Rounds}
+
+Consider eight rounds (two cycles) of the Threefish-256 block cipher. Before the first round, after round~4, and after round~8, a subkey is added. Table~\ref{tab:ThreeSubkeys256} gives an overview of these three subkeys.
+%
+\begin{table}[ht]
+  \centering
+  \begin{tabular}{|ll|cccc|}
+  \hline
+    subkey & injected & word 0 & word 1 & word 2 & word 3 \\
+  \hline
+    first & \quad before round 1 &
+       $K_0$ & $K_1 + T_0$ & $K_2 + T_1$ &
+       $K_3 + \langle 0 \rangle$ \\
+  \hline
+    second & \quad after round 4 &
+       $K_1$ & $K_2 + T_1$ & $K_3 + T_{\oplus}$ &
+       $(K_{\oplus} \oplus \TheConst) + \langle 1 \rangle$ \\
+  \hline
+    third & \quad after round 8 &
+       $K_2$ & $K_3 + T_{\oplus}$ &
+         $(K_{\oplus} \oplus \TheConst) + T_0$ &
+         $K_0 + \langle 2 \rangle$\\
+  \hline
+  \end{tabular}
+  \caption{The first three subkeys of the Threefish-256 key schedule.}
+  \label{tab:ThreeSubkeys256}
+\end{table}
+%
+The values $K_i$ are the key words, and $T_i$ the tweak words. $K_{\xor}$ is the XOR of all the key words and similarly, $T_{\xor}$ is the XOR of both tweak words. $\TheConst$ is a fixed constant, and $\langle i \rangle$ is the current round constant.
+
+Assume we chose two key/tweak pairs:
+  \[ ((K_0, K_1, K_2, K_3),    (T_0,T_1))   \neq
+     ((K'_0, K'_1, K'_2, K'_3),(T'_0,T'_1))
+  \]
+such that there is no difference in the second subkey---the one added after round 4. This implies
+  \[ K_1 = K'_1, \ \ K_2 + T_1 = K'_2 + T'_1, \ \
+     K_3 + (T_0 \oplus T_1) =  K'_3 + (T'_0 \oplus T'_1),
+  \]
+and
+  \[
+     (K_0 \oplus K_1 \oplus K_2 \oplus K_3 \oplus \TheConst)     + 0\ldots0001 =
+     (K'_0 \oplus K'_1 \oplus K'_2 \oplus K'_3 \oplus \TheConst) + 0\ldots0001.
+  \]
+Now define $\delta=1000\ldots0$, i.e., the difference is isolated in the most significant bit. In this case, differences propagate under addition exactly as under XOR, i.e., in the context of a differential attack, the distinction between ``$+$'' and ``$\oplus$'' disappears. Set
+  \[
+     K_0 \oplus K'_0 = \delta, \ \ K_2 \oplus K'_2 = \delta, \ \
+     T_1 \oplus T'_1 = \delta, \ \ T_0 \oplus T'_0 = \delta,
+  \]
+and
+  \[ K_1 = K'_1, \ \ K_3 = K'_3. \]
+In this case, the difference in the first subkey is
+  $(\delta, \delta, 0, 0)$,
+and the difference in the third subkey is
+  $(\delta, 0, \delta, \delta)$.
+
+Choose a pair of messages with the same difference as in the first subkey; i.e., $(\delta, \delta, 0, 0)$. All the differences in message and subkey cancel out, so we have some kind of a \emph{local collision}, which propagates through rounds 1 to 4. After round~4, the second subkey is injected, with a zero difference of its own. Thus, the \emph{local collision} propagates further to round~8. Then, finally, a subkey with a nonzero difference is injected, and the local collision breaks apart, leaving a difference $(\delta, 0, \delta, \delta)$ in the state. This is the output of our block cipher (namely, Threefish-256, reduced to eight rounds). The chaining mode of Skein requires us to XOR the message to the final block cipher output
+  \( H_i := C(H_{i-1},T_i, M_i)
+          :=\mbox{block\_cipher}_{H_{i-1}, T_i}(M_i) \oplus M_i
+  \). 
+So the output difference of the compression function (using eight rounds of Threefish-256 as the underlying block cipher) is $(0, \delta, \delta, \delta)$. As $\delta=1000\ldots0$, all these differences appear with probability one. This gives the attacker a near-collision with Hamming difference three: all the output bits of our reduced-round compression function are the same, except for exactly three bits, which remain different.
+
+One can generalize this attack probabilistically, for some $\delta \neq 1000\ldots0$, as long as the Hamming weight of the 63 least significant bits of $\delta$ remains low.
+
+Additionally, we get another near pseudo-collision---actually an even  better one with Hamming difference 2---by setting
+  \[ K_2 \oplus K'_2 = \delta, \ \ T_1 \oplus T'_1 = \delta, \ \
+     K_3 \oplus K'_3 = \delta,
+  \]
+and
+  \[ K_1 = K'_1, \ \ T_0 = T'_0, \ \ K_0 = K'_0. \]
+In this case, the difference in the first subkey is
+  $(0, 0, 0, \delta)$,
+and the difference in the third subkey is
+  $(\delta, 0, 0, 0)$.
+This is the output difference after eight rounds of Threefish-256. Note that the Hamming weight of the difference is one, for $\delta=1000\ldots0$. Applying the chaining mode then doubles the Hamming weight; the difference is now 
+  $(\delta, 0, 0, \delta)$.
+
+Note that the above pseudo-near-collision attack did actually allow the adversary to arbitrarily choose two different triples  (Tweak, Chaining-Value, Message) and (Tweak$'$, Chaining-Value$'$, Message$'$) with a certain difference. The attack even works if one triple (Tweak, Chaining-Value, Message) has been fixed in advance. So this isn't just a near pseudo-collision, it even is a near pseudo-second-preimage.
+
+\subsection{Pseudo-Near-Collisions for Eight Rounds of the Skein-512 and 
+-1024 Compression Functions}
+
+It is straightforward to apply the same attack principles to the Skein-512 and Skein-1024 compression functions:
+\begin{itemize}
+\item Choose key and tweak differences such that there is a zero 
+  difference in the second subkey.
+\item Choose the difference of the first subkey as the message difference, 
+  to get a local collision for the first eight rounds, excluding the key 
+  addition. If our differences are in the most significant bit only, the 
+  local collision occurs with probability one.
+\end{itemize}
+After the key addition and the message addition, we get some near-collision, exactly as for Skein-256.
+
+Set $N=4$ for Skein-256, $N=8$ for Skein-512, and $N=16$ for Skein-1024. Set $\delta=1000\ldots0$. We can choose  
+ \[ K_{N-1} \oplus K'_{N-1}   = \delta, \ \ K_{N-2} \oplus K'_{N-2} = \delta,
+    \ \ \mbox{and} \ \  T_1 \oplus T'_1 = \delta, 
+ \]
+and
+ \[ K_i=K'_i \ \ \mbox{for } i \in \{0, \ldots, N-3\}, \ \ \mbox{and} \ \ 
+    T_0 = T'_0. 
+ \] 
+This gives the subkeys added before the first round the differences 
+  \[ (0, 0, 0, \delta), \ \ (0, 0, 0, 0, 0, 0, 0, \delta), \ \
+     (0,0,0,0,0,0,0,0,0,0,0,0,0, 0, 0, \delta),
+  \]
+for Skein-256, -512, and -1024, respectively. Similarly, the subkey 
+differences after round eight are 
+  \[ (\delta, 0, 0, 0), \ \ (0, 0, 0, 0, \delta, 0, 0, 0), \ \
+     (0,0,0,0,0,0,0,0,0,0,0,0, \delta, 0, 0, 0).
+  \]
+With message differences 
+  \[ (0, 0, 0, \delta), \ \ (0, 0, 0, 0, 0, 0, 0, \delta), \ \
+     (0,0,0,0,0,0,0,0,0,0,0,0,0, 0, 0, \delta),
+  \]
+we get an 8-round local collision, with probability one. This local collision is finally destroyed by the key addition after round 8. The output-difference after eight rounds of either Threefish-256, Threefish-512, or Threefish-1024 has Hamming weight 1, and after applying the chaining mode, the corresponding near-pseudo-collision for the compression function of Skein-256, -512, or -1024 has Hamming weight 2.
+
+In any case, this attack on the compression functions of Skein-512 and Skein-1024 is more than just a near-pseudo-collision attack: One can fix one triple (Tweak, Chaining-Value, Message) in advance, thus implying a near-pseudo-second-preimage attack.
+
+\subsection{Related-Key Attacks for the Threefish Block Cipher}
+  \label{sec:related-key-attacks}
+
+Now we consider the Threefish block cipher on its own, disregarding the chaining mode. Recall that by choosing appropriate differences in tweak, cipher key, and message, we were able to get an output difference with Hamming weight 1, after eight rounds of any variant of the Threefish block cipher, including the key addition.  In a related-key attack on tweakable block ciphers, the key is secret, but the adversary can choose tweak and message at will. In our case, by making just two related-key queries with the appropriate differences in tweak and message, the adversary can predict some ciphertext difference with high probability---even with probability one. 
+
+In contrast to attacking the compression function, attacking the block cipher itself extends nicely to a couple of additional rounds. Since we are able to predict a low-Hamming-weight difference after round eight of Threefish, we can probabilistically predict the differential behavior for a few more additional rounds, even under an unknown key. We will first consider distinguishing attacks, and then deal with key recovery. 
+
+In the context of this preliminary cryptanalysis, we focus on the bias of
+isolated ciphertext bits after executing a reduced-round Threefish block
+cipher. Of course, more advanced distinguishers are possible, but based on our
+simple distinguishers, we are very confident that no such attack could come
+close to penetrating the full number of rounds of either Threefish-256,
+Threefish-512, or Threefish-1024. This is confirmed by the third-party
+cryptanalysis of Threefish/Skein we have seen so far, see
+Section~\ref{sec:third-party-analysis}. 
+
+As we will describe below, we determined the bias of individual bits by
+generating a lot of random plaintext pairs with a fixed difference, and then
+empirically computing the probability of this bit to be flipped. Ideally, this
+probability should be 0.5 (bias 0.0). By the sample sizes we can afford (20 to
+50 million pairs), a bias of more than 0.001 is significant. I.e., below we
+claim a distinguisher if there is at least one bit with a bias exceeding
+0.001. Pushing this threshold to the next order of magnitude (bias $0.0001$)
+would require increasing the sample size by approximately two orders of
+magnitude. 
+
+Note that we ran our experiments for Threefish-256, Threefish-512, and
+Threefish-1024 with the revised rotation constants, but with the old and now
+deprecated key schedule constant $C_5$, instead of the new $\TheConst$. We 
+don't expect this modification to have any significant effect on the type of
+attack we consider here. 
+
+
+\subsubsection{Empirical Observations for Threefish-256}
+  \label{sec:empiric256}
+
+If we have a Hamming-weight-one difference after round eight, including the key addition, what will be the differences in the next few rounds? Consider the following experiment: Generate a pair of triples (Key, Tweak, Message), each pair consisting of 
+\begin{itemize}
+\item a random key $K$, a random tweak $T$, and
+\item a random message $M$
+\end{itemize}
+and
+\begin{itemize}
+\item a key $K'$ and a tweak $T'$ such that the difference to $(K,T)$ in the first subkey is $(0,0,0,\delta)$, the difference in the second subkey (added after round four) is $(0,0,0,0)$, and the difference in the third subkey is $(\delta,0,0,0)$, and
+\item a message $M'$ with the difference $(0,0,0,\delta)$ to $M$.
+\end{itemize}
+This is precisely the setting for the eight-round local collision and the difference  $(\delta,0,0,0)$ afterwards. We assume $\delta=1000\ldots0$. 
+
+Write $W^r_{w,b}\in \{0,1\}$ for the $b$-th bit in word $W^r_w$, where $(W^r_0, \ldots, W^r_3)$ is the output after $r$ rounds of encrypting $M$ under the key $K$ and the tweak $T$. Similarly, for the $r$-round encryption of $M'$ under $K'$ and $T'$, write $(W')^r_{w,b} \in \{0,1\}$. In any case, $b \in \{0,\ldots, 63\}$ and, for Threefish-256, $w \in \{0, 1,2,3\}$.
+
+For Threefish-256, we repeated the experiment fifty million times (50,000,000 $\approx 2^{25.6}$), thus generating fifty million random pairs
+  \[ \bigl(\, (W^r_0,W^r_1,W^r_2,W^r_3),\,
+              ((W')^r_0,(W')^r_1,(W')^r_2,(W')^r_3)
+     \,\bigr)
+  \]
+with the specified difference for each round. We then counted how often the individual bits in $W^r_{w,b}$ and $(W')^r_{w,b}$ were the same, thus estimating the probabilities 
+     \[ p^r_{w,b} \quad = \quad \mbox{Prob}[W^r_{w,b}=(W')^r_{w,b}] \]
+for each word $w \in \{0,1,2,3\}$ and each bit $b \in \{0, \ldots, 63 \}$. Note that if $r$ rounds of Threefish did behave like an ideal cipher (aka ``Shannon cipher''), all these probabilities would be 0.5. 
+
+We define the ``bias'' by 
+  \[ |p^r_{w,b}-0.5|. \]
+If $p^r_{w,b} \in \{0,1\}$, then the bias $|p^r_{w,b}-0.5|$ is exactly 0.5, or ``full.''
+Table~\ref{tab:ObservationsFor256} summarizes our results. 
+
+The first, leftmost column is the number of the round, after which we computed the bias (``after round 0'' means before the first round).  For each round $r$ (up to a certain upper bound, when nothing ``interesting'' can be seen any more), the table gives the number of bits with ``large'' bias for each round; i.e., the number of bits with full bias, and with a bias exceeding 10\%, 1\%, and 0.1\%, respectively.  The table also gives the average bias over all the 256 bits considered. By ``full bias,'' we actually mean $p^r_{w,b} \in \{0,1\}$; i.e., the number of bits which behave linearly. 
+
+\begin{table}[tbh]
+  \centering
+\begin{tabular}{|r|rrrr|r|}
+\hline
+round & \multicolumn{ 4}{c|}{\# bits with bias} & average\\
+$r$ & \quad full & $>0.1$ & $>0.01$ & $>$ 0.001 & bias \\  \hline
+ 0: &  256 &  256 &  256 &  256 & 0.50000  \\
+ 1--9: &  256 &  256 &  256 &  256 & 0.50000  \\
+ 10: &  256 &  256 &  256 &  256 & 0.50000  \\
+ 11: &  242 &  254 &  254 &  254 & 0.49225  \\
+ 12: &  120 &  242 &  242 &  242 & 0.43372  \\
+ 13: &  41 &  222 &  223 &  223 & 0.34853  \\
+ 14: &  9 &  168 &  189 &  189 & 0.19401  \\
+ 15: &  0 &  63 &  130 &  152 & 0.04981  \\
+ 16: &  0 &  3 &  40 &  61 & 0.00349  \\
+ 17: &  0 &  0 &  0 &  7 & 0.00010  \\
+ 18: &  0 &  0 &  0 &  0 & 0.00006  \\
+ 19: &  0 &  0 &  0 &  0 & 0.00006  \\
+ 20: &  0 &  0 &  0 &  0 & 0.00006  \\
+\hline
+\end{tabular}
+  \caption{Empirical results for Threefish-256, sample size 50,000,000 pairs.}
+  \label{tab:ObservationsFor256}
+\end{table}
+
+At the beginning, everything is deterministic---all the bits have bias 0.5; i.e., either $p^r_{w,b}=1.0$ or $p^r_{w,b}=0.0$. This continues through the end of round 10. From round 11 on, the number of highly biased bits quickly declines. After round 18, the statistical noise dominates the bias observed. 
+
+Thus, there is a very simple distinguisher for 17 rounds of Threefish-256, in the context of a related-key chosen-tweak chosen-plaintext attack: Determine the bit $W^{17}_{w,b}$ with the largest bias. Choose a few thousand input pairs with the appropriate differences. For each such pair, count how often bit~$b$ in word~$w$ of the two outputs is the same. If the result is close to 50\% of all pairs, we have a random permutation. If it is significantly divergent from 50\%, we have Threefish-256.  
+
+Instead of counting $W^r_{w,b} \oplus (W')^r_{w,b}$, for $r=17$ and some ``good'' $w,b$, we could search for correlations between $W^r_{w,b} \oplus (W')^r_{w,b}$ and $W^r_{w,b'} \oplus (W')^r_{w,b'}$ for some ``good'' $w,b,b'$. We did not study that approach in detail, but we would expect to get a distinguisher for 18 rounds of Threefish-256 that way.
+
+\subsubsection{Empirical Observations for Threefish-512 and Threefish-1024}
+  \label{sec:empiric512}
+
+For Threefish-512 and Threefish-1024, we can perform essentially the same experiment we did for Threefish-256. That is, we choose tweak, key, and message such that we get a local collision in the first eight rounds, \emph{ex}cluding the key addition. The key addition injects 
+  \[ \mbox{difference} \ \ (0, 0, 0, 0, \delta, 0, 0, 0) \ \
+         (\mbox{for Threefish-512}) \]
+and
+  \[ \mbox{difference} \ \
+     (0,0,0,0,0,0,0,0,0,0,0,0, \delta, 0, 0, 0)
+     \ \ (\mbox{for Threefish-1024}),
+  \]
+which then becomes the difference before round nine. As above, $\delta=1000\ldots0$, and everything in the first few rounds happens with probability one. As for Threefish-256, we repeated these experiments 50 million times each for Threefish-512 and Threefish-1024, and computed $p^r_{w,b}$ for rounds $r \in \{9, \ldots, 24\}$, $b \in \{0, \ldots, 63\}$ and $w \in \{0, \ldots, N-1\}$, with $N=8$ for Threefish-512 and $N=16$ for Threefish-1024. Tables~\ref{tab:ObservationsFor512} and \ref{tab:ObservationsFor1024} summarize our results. 
+
+\begin{table}[tbh]
+  \centering
+\begin{tabular}{|r|rrrr|r|}
+\hline
+round & \multicolumn{ 4}{c|}{\# bits with bias} & average\\
+$r$ & \quad full & $>0.1$ & $>0.01$ & $>$ 0.001 & bias \\  \hline
+ 0: &  512 &  512 &  512 &  512 & 0.50000  \\
+ 1--9: &  512 &  512 &  512 &  512 & 0.50000  \\
+ 10: &  512 &  512 &  512 &  512 & 0.50000  \\
+ 11: &  458 &  510 &  510 &  510 & 0.49609  \\
+ 12: &  270 &  494 &  494 &  494 & 0.45799  \\
+ 13: &  69 &  463 &  463 &  463 & 0.39152  \\
+ 14: &  29 &  385 &  400 &  403 & 0.24939  \\
+ 15: &  0 &  190 &  267 &  277 & 0.07153  \\
+ 16: &  0 &  14 &  64 &  99 & 0.00439  \\
+ 17: &  0 &  0 &  1 &  4 & 0.00008  \\
+ 18: &  0 &  0 &  0 &  0 & 0.00005  \\
+ 19: &  0 &  0 &  0 &  0 & 0.00006  \\
+ 20: &  0 &  0 &  0 &  0 & 0.00005  \\
+\hline
+\end{tabular}
+\caption{Empirical results for Threefish-512, sample size 50,000,000~pairs.}
+\label{tab:ObservationsFor512}
+\end{table}
+
+\begin{table}[tbh]
+  \centering
+\begin{tabular}{|r|rrrr|r|}
+\hline
+round & \multicolumn{ 4}{c|}{\# bits with bias} & average\\
+$r$ & \quad full & $>0.1$ & $>0.01$ & $>$ 0.001 & bias \\  \hline
+ 0: &  1024 &  1024 &  1024 &  1024 & 0.50000  \\
+ 1--9: &  1024 &  1024 &  1024 &  1024 & 0.50000  \\
+ 10: &  1024 &  1024 &  1024 &  1024 & 0.50000  \\
+ 11: &  972 &  1022 &  1022 &  1022 & 0.49805  \\
+ 12: &  743 &  1006 &  1006 &  1006 & 0.47936  \\
+ 13: &  405 &  975 &  975 &  975 & 0.44662  \\
+ 14: &  140 &  882 &  894 &  894 & 0.35347  \\
+ 15: &  16 &  650 &  723 &  728 & 0.19930  \\
+ 16: &  0 &  197 &  365 &  420 & 0.03414  \\
+ 17: &  0 &  7 &  44 &  93 & 0.00135  \\
+ 18: &  0 &  0 &  0 &  0 & 0.00006  \\
+ 19: &  0 &  0 &  0 &  0 & 0.00005  \\
+ 20: &  0 &  0 &  0 &  0 & 0.00006  \\
+\hline
+\end{tabular}
+\caption{Empirical results for Threefish-1024, sample size 50,000,000~pairs.}
+\label{tab:ObservationsFor1024}
+\end{table}
+
+These tables confirm that Threefish-512 diffuses slightly slower than Threefish-256, and Threefish-1024 diffuses slightly slower than Threefish-512. Nevertheless, for each of Threefish-512 and Threefish-1024, the statistical noise dominates the bias after round 18, as was the case for Threefish-256. We believe it is possible to penetrate one additional round by considering correlations between output bits instead of isolated bias of the bits. This would imply the same kind of plausible 18-round distinguishers for Threefish-512 and Threefish-1024 that we expect for Threefish-256. 
+
+One could try to counter the noise by greatly increasing the sample size, theoretically to almost $2^{X-1}$ pairs for Threefish-$X$. This could, perhaps, push the distinguisher a little further. We did not consider such huge sample sizes, however. First, this approach doesn't seem to scale well with our approach to key recovery attacks, which we will describe in Section~\ref{sec:key-recovery}. Second, once the number of bits with a large bias is small, it doesn't take many further rounds for the bias to disappear in the statistical noise. For example, if there are less than 20 bits with a bias of more than $0.1$, then at most 2 rounds later there isn't a single bit left with a bias $>0.001$, cf. Tables~\ref{tab:ObservationsFor256}--\ref{tab:ObservationsFor1024}. Thus, a huge increase of the sample size to handle somewhat smaller thresholds for the bias would provide a very limited gain.
+
+\subsubsection{Key Recovery Attacks}
+  \label{sec:key-recovery}
+
+The core idea for our key recovery attacks is as follows:
+\begin{enumerate}
+\item Assume a simple distinguisher for $r$ rounds of the block cipher.  Here, ``simple'' means that a certain property that allows us to distinguish $r$ rounds of the cipher from random, only depends on one or two bits of a single word $W^r_2$ of the output after round $r$. Using that property, we can make $t$ related-key chosen-tweak chosen-plaintext queries to distinguish $r$ rounds of our cipher from a random permutation.
+ \item Partial decryption: Attack $r+s$ rounds of the cipher, for $s$ as large as possible. Assume a key addition after $r+s$ rounds. For the attack, we guess $k$ bits of the final round key, and partially decrypt all the $2t$ ciphertexts, such that we get all those bits of word $W^r$, which are needed to apply the simple distinguisher.
+\item Apply the simple distinguisher. Sort out most of the false key guesses.
+\item Exhaustively search the remaining key space.
+\end{enumerate}
+
+Note that $t$ is the number of ciphertext pairs and $k$ is the number of round key bits to be guessed. Thus, the number of partial decryptions is $2t \cdot 2^k$. In our current context, $k$ will be close to the full key size, which implies that $t$ cannot be overly large.
+
+We start with 20 rounds of Threefish-256, assuming the simple distinguisher for 18 rounds, which we ``expected'' in Section~\ref{sec:empiric256}. For concreteness, assume our simple distinguisher after round 18 deals with, say, word $W^{18}_1$.\footnote{Observe that the word $W^{18}_1$ depends on the words $W^{20}_w$ and $K_w$ for $w\in \{0,2,3\}$ and on \textbf{X}. Hence, when given the intermediate value \textbf{X}, neither $W^{20}_1$ nor $K_1$ is needed to determine $W^{18}_1$.} See Figure~\ref{fig:key-recovery}.
+%
+\begin{figure}[htbp]
+ \centering
+  \includegraphics[width=0.45\textwidth]{key_recover.pdf}
+ \caption{Simplified representation of rounds 19 and 20 of Threefish-256, including the key addition after round 20.}
+ \label{fig:key-recovery}
+\end{figure}
+
+The $b$th bit $W^{18}_{1,b}$ of $W_1$ only depends on the key words $K_0, K_2, K_3$ and on the least significant $b$ bits of the intermediate variable \textbf{X}. For $i \in \{0, \ldots, b\}$, changing $X_{b-i}$ changes $W^{18}_{1,b}$ with probability $2^{-i}$. Similarly, changing $K_{1, (b-c-i-j)\bmod 64}$ only affects the bit $X_{b-i}$ with at most the probability $2^{-j}$. Thus, given $b$ and $c$, it is easy to decide which bits of $K_1$ are statistically relevant and must be guessed, and which bits of $K_1$ can safely be neglected.  Hence, we can employ our simple distinguisher to sort out most of the false guesses. This provides a key recovery attack for 20 rounds of Threefish-256.
+
+For related reasons, we also don't need to guess all the bits of $K_0$, $K_2$ or $K_3$. We anticipate that it suffices to guess between 50\% and 75\% of all the 256 round key bits.
+
+Threefish-512 and Threefish-1024 use longer keys, thus allowing an attack to spend more time without being slower than an exhaustive key search. We can exploit that to go beyond 20 rounds.
+
+To analyze attacking any variant of Threefish with $r$ rounds, where $r \bmod 4 \neq 0$, we require an additional key addition after the final round. Otherwise, the final $r \bmod 4$ rounds could be trivially inverted, without knowing the key, and we could effectively attack $r-(r \bmod 4)$ rounds.
+In the remainder of Section~\ref{sec:key-recovery}, we consider attacks on 21 rounds of Threefish-512 and 22 rounds of Threefish-1024, with a key addition after the final round. We assume that we can undo or neglect the ``regular'' key addition after round 20. Without this assumption, analyzing the partial decryption step becomes tricky.
+
+\begin{itemize}
+\item Threefish-512:  Assume the same kind of distinguisher as above, for 18 rounds.  Guess most of the final round key, which is added after round 21.  Partially decrypt rounds 21 to 19, and apply the distinguishing property to sort out false key guesses.
+
+\item Threefish-1024: Assume a distinguisher for 18 rounds of Threefish-1024, partially decrypt rounds 22 to 20, and apply that distinguisher.
+\end{itemize}
+
+\subsubsection{Pushing the Attack Further: Prepending Four Additional Rounds}\label{sec:PushAttack}
+
+To push the attack any further, we will look at the first few rounds of Threefish. In other words, we do the following:
+\begin{itemize}
+  \item Apply the above attack (on 20 rounds of Threefish-256, 21 rounds of 
+    Threefish-512, and 22 rounds of Threefish-1024). But instead of
+    starting with the first round, i.e., with round~0, we start with 
+    round~4 now.
+  \item To bridge the first four rounds, try an appropriate message
+    difference as the input for round 0, which will get the input difference
+    $D_4$ for round 4 that we need. (In our case, for any of the three
+    variants, this is $D_4 = (0, \ldots, 0, \delta)$ with $\delta=1000\ldots0
+    \in \{0,1\}^{64}$).
+  \item We cannot expect a probability-one approach here---even the best
+    plaintext difference $D_0$ would only turn into the required difference
+    $D_4$ with some probability $p_{0,\ldots,3}$. Thus, the values our
+    distinguisher sees will be much more noisy. To compensate for the
+    additional noise, we will have to increase the sample size by
+    approximately a factor of $1/p^2_{0,\ldots,3}$. That is, if we needed
+    $\sigma$ samples before, we now need $\sigma/p^2_{0,\ldots,3}$.
+\end{itemize}
+In other words, we need a good four-round differential characteristic with the output difference $D_4 = (0, \ldots, 0, \delta)$, which is then turned into an eight-round local collision from round 4 to round 11.
+
+Consider a single round of Threefish.  If we want a specific difference $D_i$ after round $i$, we can run round $i$ backwards; in other words, in decryption direction, to compute some difference $D_{i-1}$ before round $i$. To analyze this attack, we need to estimate
+\begin{itemize}
+  \item the probability $p(D_{i-1} \rightarrow D_i)$ that two random
+    inputs to round $i$ with difference $D_{i-1}$ produce any two outputs 
+    with difference $D_i$, and
+  \item the difference $D_{i-1}$ to maximize $p(D_{i-1} \rightarrow D_i)$.
+\end{itemize}
+We are only interested in a crude estimate of that probability. We will use the local Hamming weights to derive that estimate. Recall our MIX operation:
+  \[ \mbox{Mix}_c(A,B) = (A+B, (B<\!<\!<c) \oplus (A+B)). \]
+If the Hamming weight is low, a good heuristic is to assume that the addition behaves exactly like the XOR operation. Assume Mix$_c(A,B)=(X,Y)$, and write $a$, $b$, $x$, and $y$ for the Hamming weights associated to $A$, $B$, $X$, and $Y$, respectively. For our crude estimate, we will apply the following three rules:
+\begin{enumerate}
+  \item $a = y+2x$.
+  \item $b = x+y$.
+  \item The differential probability is $\approx 2^{-x-y}$.
+\end{enumerate}
+
+Below, we will focus on Threefish-256, but we believe that this approach gives the adversary an additional four rounds for any of the three variants of Threefish.
+
+Our target output difference is of the form $(0,0,0,\delta)$, with Hamming weight 1. The target outputs for the MIX operations in the final round are $(0,1)$ and $(0,0)$ (due to the permutation). Applying the above three rules provides an input difference with Hamming weights $(12,7,9,6)$, as depicted in Figure~\ref{fig:reverserounds256}. Applying the third of our three rules, to estimate the probabilities of this differential behavior in every round, gives a probability of $2^{-21}$.
+%
+\begin{figure}[htbp]
+  \centering
+   \includegraphics[width=0.45\textwidth]{reverserounds256.pdf}
+  \caption{A differential characteristic for the first four rounds of
+    Threefish-256.}
+  \label{fig:reverserounds256}
+\end{figure}
+
+This allows an attacker to push distinguishing attacks and key recovery attacks four rounds further, at the cost of increasing the sample size by a factor of more than $2^{40}$. Our attacks apply for 24~rounds of Threefish-256, 25~rounds of Threefish-512, and 26~rounds of Threefish-1024.
+
+Our probability estimate may be a bit too pessimistic, from the adversary's point of view. But the next logical step, namely, pushing the attack through another four rounds (one additional cycle), seems to require too large a sample size to be of any use for our key recovery attacks. 
+
+\subsection{An Attack on the Threefish Block Cipher that Doesn't Quite Work} 
+
+Our key schedule has been chosen with great care, such that the adversary cannot choose two different (tweak, cipher key) pairs with a zero difference in round $i$ and round $i+1$, or with a zero difference in round $i$ and $i+2$. In the first case, our local collision wouldn't break apart after eight rounds, but carry on for twelve rounds.
+
+The case of a zero difference in round $i$ and $i+2$, with a nonzero difference in round $i+1$, is a bit more complicated, but instructive. We can mount a boomerang attack.%, as visualized in figure\ref{fig:boomerang}.
+
+% Figure commented out for the time being.
+% If Niels has time he'll draw a figure for a boomerang attack, but as this is well known in literature
+% it is not of high importance.
+%
+%\begin{figure}[htbp]
+%  \centering
+%    \includegraphics[width=0.75\textwidth]{boomerang.pdf}
+%  \caption{A boomerang distinguisher on Threefish, reduced to 32 rounds and
+%    employing a weakened key schedule.}
+%  \label{fig:boomerang}
+%\end{figure}
+
+Assume two key/tweak pairs $(K,T) \neq (K',T')$ with a zero difference in the second and the fourth subkeys. Choose related keys/tweaks and messages $M$ and $M'$, such that a local collision occurs in the first eight rounds. That is, after eight rounds, before the key injection, we have the same intermediate value $I$, both when computing the encryption $E_{K,T}(M)$ and when computing the encryption $E_{K',T'}(M')$. After the key addition, we have a certain difference, and the next eight rounds can be expected to destroy any predictable difference. So we get two ciphertexts $C=E_{K,T}(M)$ and $C'=E_{K',T'}(M')$.
+
+Now we choose new ciphertexts $C''$ and $C'''$ with the appropriate
+differences, and decrypt $C''$ under $(K',T')$ and $C'''$ under $(K,T)$. In
+rounds 9 to 16 (or rather, in rounds 16 to 9, when decrypting chosen ciphertext
+queries), we get another local collision between $C''$ and $C$, and also a
+local collision between $C'''$ and $C'$. Decrypting further, we get two
+messages $M''=D_{K',T'}(C'')$ and $M'''=D_{K,T}(C''')$ with 
+  \[ M'' \oplus M''' = M \oplus M'. \]
+For an appropriately chosen difference, this holds with probability one.
+
+Hence, with just two chosen-plaintext queries and another two chosen-ciphertext queries, we could easily distinguish 16 rounds of Threefish from a random permutation, using a boomerang property with probability one.
+
+But recall that this attack requires a property that is not provided by our key schedule; namely, different key/tweak combinations with a zero difference in some subkey $i$ and a zero difference in subkey $i+2$.
+
+Could it help the attacker if we got some key/tweak combinations with a zero difference in some subkey $i$ and in subkey $i+3$ or $i+4$? The boomerang property with probability one breaks apart, but a lower probability might still do. For example, assume a zero difference in subkeys $i$ and $i+4$. We would get a local collision for rounds 1--8, and then try to follow the most probable differential path to round 12 (with some probability $p_1 < 1$). From round 13 to round 16, we could try to find another most probable differential path (with some probability $p_2 < 1$), in order to exploit another local collision from round 17 to round 24. If everything worked out that way, we would get a boomerang distinguisher with the probability $p_1^2 \cdot p_2^2$.
+
+To defend against that kind of boomerang attack, our key schedule ensures a distance between zero-difference subkeys of at least seven subkeys.
+
+\subsection{Third-Party Cryptanalysis}
+\label{sec:third-party-analysis}
+
+Since the initial publication of the first version of this paper, many
+researchers have spent time analyzing Threefish/Skein, and quite a few of
+them have presented their results to the public. We are very pleased with this
+development, but will not attempt to elaborate on all the results. 
+Instead, we focus on the two most successful attempts to cryptanalyze
+reduced-round Skein/Threefish, where ``success'' means maximizing the number
+of rounds that can be attacked.
+
+\subsubsection{Differential Cryptanalysis}
+
+
+In September 2009, a group of six researchers published several attacks on reduced-round versions of Threefish, focusing on Threefish-512 \cite{ACMOPV09}. They managed to turn our 8-round pseudo-near-collisions with Hamming-weight 1 into a 17-round pseudo-near-collision, albeit with a much larger Hamming weight.  They also improved our related-key distinguisher for 17 rounds to 21 rounds, and presented related-key key-recovery attacks on 25 and 26 rounds of Threefish-512. The 26-round attack needs time $2^{507.8}$, just slightly faster than exhaustive search.  They also found a 21-round related-key impossible differential, a 32-round related-key boomerang key recovery attack, a 34-round related-key boomerang distinguisher, and a 35-round known-related-key boomerang distinguisher.
+
+While these results specifically apply to the version of Threefish with the deprecated rotation
+constants and the deprecated key schedule constant $C_5$, it is straightforward to use most of their ideas to analyze the current version of Threefish, with its new constants.  Indeed, one can understand how the attacks work without knowing the constants. 
+
+The core of the 17-round pseudo-near-collision attack is the same eight-round local collision we used for our 8-round near-collision. To push the pseudo-near-collision from 8 to 17 rounds, the authors of \cite{ACMOPV09} prepend a 4-round differential trail with probability $2^{-33}$, and append a 5-round differential trail with probability $2^{-24}$. These trails were found by linearization; i.e., by treating the addition mod $2^{64}$ like an XOR and then computing the probability that it actually behaves like that. Adapting the attack to the constants just requires repeating the linearization step to find new differential trails, possibly with slightly different probabilities.
+
+We used the eight-round local collision as a tool to build a 17-round distinguisher. Similarly, the authors of \cite{ACMOPV09} used their 17-round near-collision to build a 21-round distinguisher. To find a good distinguishing property, they ran a frequency test, searching for some highly biased bit---somewhat similar to our experiments in Section~\ref{sec:empiric512}. To deal with the new constants, one would essentially have to run a new frequency test. 
+
+The paper's impossible differential employs a 13-round forward differential and a 7-round backward differential. For the forward differential, the first ten rounds do not depend on the constants, but the last three rounds do. Similarly, the last four rounds of the backward differential do not depend on the constants, but the first three do. It is not clear to us whether the impossible differential can be modified for the new rotation constants.
+
+The paper's key-recovery attack improves our own key-recovery attack by means of a careful analysis---we only provided a sketch---and providing an improved search strategy that identifies and exploits neutral key bits. We believe the same approach would work for the new constants much as it works for the deprecated ones. One would need to rerun the frequency test to identify some bits with a significant bias. Since the attack workload for the 26-round attack is already close to an exhaustive key search, a slightly lower bias could possibly imply that the adapted attack would no longer be applicable for 26 rounds of Threefish-512, but it certainly remains applicable for 25 rounds. 
+
+The paper also describes a boomerang property based on concatenating two differential trails. Each of these trails goes through some key-dependent rounds. In contrast to the trails for their local collision, the authors used a complex technique for finding good boomerang trails; namely, the Lipmaa-Moriai algorithms for finding good XOR-approximations for the addition modulo powers of two \cite{LiMo}. To deal with the new constants, one would have to run their software again, to calculate new differential trails. The probabilities of the new trails might vary slightly from the old trails' probabilities, but otherwise the boomerang property is the same. 
+
+Exploiting a boomerang property for key-recovery, related-key, and known-related-key distinguishing is straightforward, regardless of the constants. 
+
+The authors of \cite{ACMOPV09} presented some excellent ideas for analyzing
+Threefish. Their analysis used the now-deprecated rotation constants and the
+now-deprecated key schedule constant, but most of their attacks depend on the
+structure of Threefish and on the key schedule---with the possible exception
+of the impossible differential. We are confident one could easily adapt the
+attacks to the current version of Threefish-512 and its new constants.
+
+\subsubsection{Rotational Cryptanalysis}
+
+The attacks presented in this section are obsolete. This is not the
+cryptanalysts' fault; they did an excellent job. But since it turned
+out that a very marginal tweak can defend against the very type of
+attack that had been applied with the most success on Threefish/Skein, we
+took the opportunity to apply that tweak; i.e., to change the key schedule
+constant $C_5$ to $\TheConst$. The attacks seem to apply equally
+well to either the deprecated or the new set of rotation constants. 
+
+\medskip
+
+{\bf Related-Key Key-Recovery.}
+In early 2010, \cite{KN10} analyzed Threefish,
+exploiting word-wise rotations. The attack is a related-key attack; i.e., there
+is a secret key, unknown to the adversary, but the adversary can ask for the
+encryptions of the plaintext under \emph{related} keys---here, specifically,
+on keys where the same rotation is applied to each of the 64-bit words of the
+key. 
+%
+Consider a plaintext
+$P$, a key $K$, and a ciphertext $C=E_K(P)$, where $E$ is a reduced-round
+version of a block cipher. Now apply a word-wise rotation to
+each word of $P$ and $K$, to generate a new plaintext $\vec{P}$ and
+$\vec{K}$. We also apply the same word-wise rotation on $C$ and get
+$\vec{C}$. If there is any significant chance that 
+   \begin{equation}\label{eq:rotational} 
+     \vec{C} = E_{\vec{K}}(\vec{P}),
+   \end{equation}
+we can attack $E$ and even provide key recovery attacks for variants of the
+same block cipher with slightly more rounds. 
+
+Threefish is based on three operations: addition, XOR, and rotate.
+Neither XOR nor rotate has any effect on word-wise rotational properties:
+$\vec{X} \xor \vec{Y} = \vec{Z}$, where $Z= X \xor Y$, and for $Y=X \rol i$, 
+$\vec{Y} = \vec{X} \rol i$. And even addition preserves word-wise
+rotations with some significant probability. If $Z = X + Y$, then  
+$\vec{Z}=\vec{X}+\vec{Y}$ holds with a probability $p=0.375$ for rotations by
+1, $p=0.3125$ for rotations by 2, and $p \approx 0.25$ for rotations by large
+amounts \cite{DaumThesis}. So the probability for Equation~\ref{eq:rotational}
+to hold is $p^k$, where $k$ is the number of additions required
+to compute $E_{\cdot}(\cdot)$.\footnote{This isn't precisely correct, since the
+  probabilities that consecutive additions preserve a word-wise rotation don't need
+  to be independent.}
+
+Consider $E$\ =\ Threefish with the key schedule constant $C_5$. The adversary
+will apply word-wise rotations by 2, so $p=0.3125$. (Without $C_5$, the
+adversary could apply word-wise rotations by 1, thus improving $p$.)
+Assume a version of Threefish with reduced rounds \emph{and} without the
+subkey counter addition. In this case, the rotational attack is immediately
+applicable, and \cite{KN10} claims attacks slightly faster than exhaustive
+key search for 50 rounds of Threefish-256, Threefish-512, and
+Threefish-1024. 
+
+One of the purposes of the subkey counter addition was to prevent this type of
+attack. Even a related-key attacker cannot apply a word-wise
+rotate on the subkey counter. And if only one of two operands of an addition
+or an XOR is rotated, while the second operand remains the same, then the
+word-wise rotational property is destroyed. 
+The main contribution of \cite{KN10} is to generate ``corrections'' $c_K$ and
+$c_P$ for  $\vec{K}$ and $\vec{P}$; i.e., instead of
+Equation~\ref{eq:rotational}, they consider 
+  \[ \vec{C} = E_{\vec{K}\oplus c_K}(\vec{P} \oplus c_P). \]
+This approach depends heavily on the fact that the subkey counter is a tiny
+value (4 bits for \cite{KN10}; a fifth bit would be needed in round 68 above) 
+with a
+low Hamming weight. Even when merely dealing with the tiny subkey counter, the
+probability of success for the attack goes down considerably---as reflected by the
+maximal number of rounds, which goes down from 50 to 42 rounds for
+Threefish-512, from 50 to 39 rounds for Threefish-256, and from 50 to 44 rounds for
+Threefish-1024, where the key addition at the end of round 44 is omitted. 
+
+\medskip
+
+{\bf Known-Key Distinguisher.}
+Later in 2010, \cite{RKN10} extended this approach to a \emph{known-key
+  distinguisher}. That is, there is no longer a secret key---the adversary is free
+to choose both the key and the plaintext. Very informally, claiming a 
+distinguisher means proving some nonrandom constellation of cipher
+inputs/outputs, where generating a similar constellation for an ideal cipher
+would need more work. In \cite{RKN10}, this means 
+generating many equations of the form
+  \[ \vec{C_i}  =  E_{\vec{K_i}\oplus c_K}(\vec{P_i}), \]
+where $c_K$ is a ``correction'' and $C_i =E_{K_i}(P_i)$. 
+
+Since known-key distinguishers provide the adversary with more degrees of 
+freedom than related-key attacks, known-key distinguishers are often 
+applicable to more rounds of a cipher or a hash function. In
+the case of Threefish, \cite{RKN10} claim an attack for 53 rounds of
+Threefish-256 and 57 rounds of Threefish-512. The workload for the attacks
+is slightly below the exhaustive bound, which would be $2^{256}$ for
+Threefish-256 and $2^{512}$ for Threefish-512. (\cite{RKN10} did not
+consider Threefish-1024.)
+
+It seems possible to carry the approach from the Threefish block cipher to the
+Skein hash function, though the adversary has fewer degrees of freedom and the
+attack thus penetrates fewer rounds. 
+
+We found the approach from \cite{RKN10} scientifically interesting, but were unsure of its relevance for Skein. What does it mean to show that certain
+reduced-round variants of Threefish behave non-randomly in settings where all
+or most inputs can be chosen \cite{RKN10}? For block ciphers, there is
+currently no consensus in the cryptographic community. But for hash functions,
+the consensus is that  behaving like an ideal hash function (a random
+oracle) is vitally important. For this reason, 
+we took care to prove Skein to be \emph{indifferentiable from a random
+  oracle} if the compression function behaves randomly. In that light,
+demonstrating that a reduced-round variant of Threefish---the core primitive
+inside the Skein compression function---does not behave randomly is 
+significant. However, the
+indifferentiability proof for Skein (or rather, for the UBI mode) fails at the
+birthday bound ($2^{128}$ for Skein-256 and $2^{256}$ for Skein-512); and, in
+fact, there is a straightforward attack that only needs that amount of work.%
+\footnote{%
+  This is not a specific limitation of Skein---all hash functions 
+  can be distinguished from random once an adversary has generated internal
+  collisions, and generating such collisions for $s$ bits of state only needs 
+  $2^{s/2}$ units of work.}
+From our point of view, the really interesting question would be the
+following: 
+How many rounds can a known-key
+  distinguisher penetrate with a workload below the birthday bound?
+Unfortunately, the answer isn't provided in \cite{RKN10}.
+
+
+
+\medskip
+
+{\bf The effect of the tweak.}
+Replacing $C_5$ by $C_{240}$ forces the attacker to provide ``corrections''
+for 64-bit values, instead of for 4-bit values. It is still possible to
+apply rotational attacks on reduced-round variants of Skein and Threefish. 
+But the number of rounds must be reduced drastically. 
+
+
+\subsection{Empirical Observations for Threefish with Random Rotation Constants}
+\label{sec:random-constants}
+
+After empirically studying the diffusion properties of the standard set of rotation constants, it may be interesting to also consider alternative choices for these constants. What would happen if we used random rotation constants, instead of our optimized constants? In other words, how critical is the choice of rotation constants for the security of Threefish? 
+
+To explore this question, we performed the same empirical tests on several sets of random rotation constants that we did for our current set in Section~\ref{sec:related-key-attacks} and for the deprecated set in Section~\ref{sec:deprecatedrotconst}. For the sake of simplicity, we focused on Threefish-256. All experiments we describe in this section have been performed with a sample size of 25 million pairs. 
+
+Similarly to Section~\ref{sec:related-key-attacks}, we performed our 
+experiments using
+Threefish with the old and now deprecated key schedule constant $C_5$ instead
+of the new constant $\TheConst$. The research described in this section has
+been motivated by the question of how far we can go by choosing apparently bad
+sets of rotation constants until Threefish becomes insecure, and the new key
+schedule constant is not expected to have any significant effect on this. 
+
+Recall the following facts: 
+\begin{enumerate}
+ \item The rotation constants repeat every eight rounds.
+ \item Each round of Threefish-256 employs two mix-operations.
+ \item Each mix operation performs one addition, one XOR and one rotation. 
+       That is, each mix operation requires one rotation constant.
+\end{enumerate}
+Thus, we can write the rotation constant as an eight-tuple of pairs of integers between 0 and 63. The original set of rotation constants for Threefish-256 is
+    \[ \mbox{Threefish-256} := ((14,16), (52,57), (23,40), (5,37), 
+                                (25,33), (46,12), (58,22), (32,32)). \]
+
+Our experiments started with random rotation constants, derived from the decimal representation of $\pi$, $e$, and $\sqrt{2}$, respectively. We studied sets of rotation constants that repeat after eight rounds (as do the constants in Threefish) and four rounds.
+
+The random rotation constants were only marginally less secure than the standard rotation constants, by providing somewhat better 17-round distinguishers.  As Table~\ref{tab:ObservationsRandom256} illustrates, none of these sets of rotation constants allowed us to distinguish more than 17 rounds (i.e., we could not identify any bit with a bias $>0.001$ in round 18), even those which repeated after four rounds.
+
+\begin{table}[htbp]
+  \centering
+\begin{tabular}{|r|rrrr|r|}
+\hline
+round & \multicolumn{ 4}{c|}{\# bits with bias} & average\\
+$r$ & \quad full & $>0.1$ & $>0.01$ & $>$ 0.001 & bias \\  \hline \hline
+\multicolumn{6}{|l|}{Threefish-256 (standard constants)} \\ 
+ 16: &  0 &  3 &  41 &  63 & 0.00351  \\
+ 17: &  0 &  0 &  0 &  7 & 0.00012  \\
+ 18: &  0 &  0 &  0 &  0 & 0.00008  \\ \hline\hline
+\multicolumn{6}{|l|}{constants derived from $\pi$ (repeats every 8 rounds)} \\
+\multicolumn{6}{|l|}{((14,15), (35,32), (38,46), (26,43),
+                      (38,32), (50,28), (19,39), (10,58))} \\ 
+ 16: &  0 &  6 &  42 &  67 & 0.00525  \\
+ 17: &  0 &  0 &  0 &  10 & 0.00013  \\
+ 18: &  0 &  0 &  0 &  0 & 0.00008  \\ \hline\hline
+\multicolumn{6}{|l|}{constants derived from $e$ (repeats every 8 rounds)} \\
+\multicolumn{6}{|l|}{((59,04), (52,35), (36,02), (47,13),
+                      (52,24), (47,09), (36,62), (24,07))} \\
+ 16: &  0 &  20 &  67 &  95 & 0.01180  \\
+ 17: &  0 &  0 &  5 &  25 & 0.00039  \\
+ 18: &  0 &  0 &  0 &  0 & 0.00007  \\ \hline\hline
+\multicolumn{6}{|l|}{constants derived from $\sqrt{2}$ (repeats every 8 rounds)} \\ 
+\multicolumn{6}{|l|}{((41,42), (13,56), (23,09), (50,48),
+                      (16,42), (09,56), (53,17), (24,46))} \\
+ 16: &  0 &  6 &  53 &  82 & 0.00579  \\
+ 17: &  0 &  0 &  3 &  12 & 0.00027  \\
+ 18: &  0 &  0 &  0 &  0 & 0.00008  \\ \hline\hline
+\multicolumn{6}{|l|}{constants derived from $\pi$ (repeats every 4 rounds)} \\ 
+\multicolumn{6}{|l|}{((14,15), (35,32), (38,46), (26,43),
+                      (14,15), (35,32), (38,46), (26,43))} \\
+ 16: &  0 &  5 &  44 &  91 & 0.00524  \\
+ 17: &  0 &  0 &  0 &  10 & 0.00017  \\
+ 18: &  0 &  0 &  0 &  0 & 0.00008  \\ \hline\hline
+\multicolumn{6}{|l|}{constants derived from $e$ (repeats every 4 rounds)} \\ 
+\multicolumn{6}{|l|}{((59,04), (52,35), (36,02), (47,13),
+                      (59,04), (52,35), (36,02), (47,13))} \\
+ 16: &  0 &  32 &  74 &  113 & 0.02048  \\
+ 17: &  0 &  3 &  16 &  33 & 0.00210  \\
+ 18: &  0 &  0 &  0 &  0 & 0.00008  \\ \hline\hline
+\multicolumn{6}{|l|}{constants derived from $\sqrt{2}$ (repeats every 4 rounds)} \\ 
+\multicolumn{6}{|l|}{((41,42), (13,56), (23,09), (50,48),
+                      (41,42), (13,56), (23,09), (50,48))} \\
+ 16: &  0 &  10 &  55 &  93 & 0.00792  \\
+ 17: &  0 &  0 &  6 &  25 & 0.00052  \\
+ 18: &  0 &  0 &  0 &  0 & 0.00007  \\ \hline\hline
+\end{tabular}
+  \caption{Empirical results for variants of Threefish with random rotation constants,
+           sample size 25,000,000~pairs.}
+  \label{tab:ObservationsRandom256}
+\end{table}
+
+To improve the attacks, or rather, to weaken Threefish, we then considered random sets of rotation constants that repeated after two rounds. As shown in Table~\ref{tab:ObservationsTwoRepeat256}, this finally allowed us to improve the distinguisher to 18 and sometimes 19 rounds. One of our sets was ((17, 23), (32, 32), (17, 23), (32, 32),...), which we expected to perform worse than the others, as the effect of a rotation by 32 could cancel out two rounds later, when the next rotation by 32 takes place.%
+\footnote{Studying such sets of rotation constants can be of interest, since these could possibly improve the performance of Threefish on 32-bit machines.} 
+But even for this special case, we could not identify a distinguisher for more than 19 rounds. 
+
+\begin{table}[htbp]
+  \centering
+\begin{tabular}{|r|rrrr|r|}
+\hline
+round & \multicolumn{ 4}{c|}{\# bits with bias} & average\\
+$r$ & \quad full & $>0.1$ & $>0.01$ & $>$ 0.001 & bias \\  \hline \hline
+\multicolumn{6}{|l|}{Threefish-256 (standard constants)} \\ 
+ 16: &  0 &  3 &  41 &  63 & 0.00351  \\
+ 17: &  0 &  0 &  0 &  7 & 0.00012  \\
+ 18: &  0 &  0 &  0 &  0 & 0.00008  \\
+ \hline\hline
+\multicolumn{6}{|l|}{constants derived from $\pi$ (repeats every 2 rounds)} \\ 
+\multicolumn{6}{|l|}{((14,15), (35,32), (14,15), (35,32),
+                      (14,15), (35,32), (14,15), (35,32))} \\ 
+ 16: &  0 &  94 &  127 &  154 & 0.08354  \\
+ 17: &  0 &  49 &  83 &  107 & 0.02986  \\
+ 18: &  0 &  4 &  33 &  56 & 0.00357  \\
+ 19: &  0 &  0 &  5 &  18 & 0.00035  \\
+ 20: &  0 &  0 &  0 &  0 & 0.00008  \\
+ \hline\hline
+\multicolumn{6}{|l|}{constants derived from $e$ (repeats every 2 rounds)} \\
+\multicolumn{6}{|l|}{((59,04), (52,35), (59,04), (52,35),
+                      (59,04), (52,35), (59,04), (52,35))} \\
+ 16: &  0 &  31 &  80 &  119 & 0.02318  \\
+ 17: &  0 &  3 &  20 &  43 & 0.00288  \\
+ 18: &  0 &  0 &  0 &  1 & 0.00008  \\
+ 19: &  0 &  0 &  0 &  0 & 0.00008  \\
+ \hline\hline
+\multicolumn{6}{|l|}{constants derived from $\sqrt{2}$ (repeats every 2 rounds)} \\ 
+\multicolumn{6}{|l|}{((41,42), (13,56), (41,42), (13,56), 
+                      (41,42), (13,56), (41,42), (13,56))} \\ 
+ 16: &  0 &  41 &  99 &  125 & 0.02551  \\
+ 17: &  0 &  4 &  28 &  70 & 0.00366  \\
+ 18: &  0 &  0 &  1 &  9 & 0.00016  \\
+ 19: &  0 &  0 &  0 &  0 & 0.00008  \\
+ \hline\hline
+\multicolumn{6}{|l|}{((17,23), (32,32), (17,23), (32,32),
+                      (17,23), (32,32), (17,23), (32,32))} \\ 
+ 16: &  0 &  83 &  164 &  193 & 0.05918  \\
+ 17: &  0 &  26 &  73 &  130 & 0.01734  \\
+ 18: &  0 &  2 &  22 &  68 & 0.00193  \\
+ 19: &  0 &  0 &  0 &  8 & 0.00011  \\
+ 20: &  0 &  0 &  0 &  0 & 0.00007  \\
+ \hline\hline
+\end{tabular}
+  \caption{Empirical results for variants of Threefish with rotation constants repeating every two rounds, sample size 25,000,000~pairs.}
+  \label{tab:ObservationsTwoRepeat256}
+\end{table}
+
+In order to explore the limits of Threefish, we then tried to be even more malicious. 
+Could we get improved distinguishers if we sabotaged Threefish by simply repeating the same rotation constant again and again? The surprising answer is that the random constants we tried---9, 14, 23, and 59---did not weaken Threefish nearly as much as we had expected. As shown in Table~\ref{tab:ObservationsRepeatSame}, we only found distinguishers for at most 26 rounds. Specific choices were worse. For example, the constant 3 allowed for a 34-round distinguisher. And the constant 32 was as bad as we had anticipated: we could distinguish 40 rounds.
+
+\begin{table}[htbp]
+  \centering
+\begin{tabular}{|r|rrrr|r|}
+\hline
+round & \multicolumn{ 4}{c|}{\# bits with bias} & average\\
+$r$ & \quad full & $>0.1$ & $>0.01$ & $>$ 0.001 & bias \\ 
+\hline\hline 
+\multicolumn{6}{|l|}{constant 9} \\ 
+\multicolumn{6}{|l|}{((09,09), (09,09), (09,09), (09,09),
+                      (09,09), (09,09), (09,09), (09,09))} \\ 
+ 20: &  0 &  4 &  53 &  115 & 0.00448  \\
+ 21: &  0 &  0 &  11 &  66 & 0.00090  \\
+ 22: &  0 &  0 &  0 &  12 & 0.00014  \\
+ 23: &  0 &  0 &  0 &  0 & 0.00008  \\
+\hline\hline
+\multicolumn{6}{|l|}{constant 14} \\ 
+\multicolumn{6}{|l|}{((14,14), (14,14), (14,14), (14,14), 
+                      (14,14), (14,14), (14,14), (14,14))}\\
+ 20: &  0 &  12 &  65 &  112 & 0.00796  \\                                   
+ 21: &  0 &  1 &  25 &  68 & 0.00189  \\                                     
+ 22: &  0 &  0 &  3 &  25 & 0.00029  \\                                      
+ 23: &  0 &  0 &  0 &  0 & 0.00008  \\ 
+\hline\hline
+\multicolumn{6}{|l|}{constant 23} \\ 
+\multicolumn{6}{|l|}{((23,23), (23,23), (23,23), (23,23),
+	                    (23,23), (23,23), (23,23), (23,23))} \\ 
+ 22: &  0 &  4 &  40 &  74 & 0.00409  \\
+ 23: &  0 &  0 &  8 &  33 & 0.00057  \\
+ 24: &  0 &  0 &  0 &  13 & 0.00018  \\
+ 25: &  0 &  0 &  0 &  1 & 0.00008  \\
+ 26: &  0 &  0 &  0 &  0 & 0.00008  \\
+\hline\hline
+\multicolumn{6}{|l|}{constant 59} \\ 
+\multicolumn{6}{|l|}{((59,59), (59,59), (59,59), (59,59),
+                      (59,59), (59,59), (59,59), (59,59))} \\ 
+ 23: &  4 &  14 &  29 &  46 & 0.01614  \\
+ 24: &  0 &  4 &  23 &  37 & 0.00278  \\
+ 25: &  0 &  0 &  6 &  17 & 0.00041  \\
+ 26: &  0 &  0 &  0 &  2 & 0.00009  \\
+ 27: &  0 &  0 &  0 &  0 & 0.00008  \\
+ \hline\hline\hline
+\multicolumn{6}{|l|}{constant 3} \\ 
+\multicolumn{6}{|l|}{((03,03), (03,03), (03,03), (03,03),
+                      (03,03), (03,03), (03,03), (03,03))} \\ 
+ 31: &  0 &  12 &  32 &  58 & 0.00595  \\
+ 32: &  0 &  0 &  22 &  43 & 0.00167  \\
+ 33: &  0 &  0 &  4 &  29 & 0.00035  \\
+ 34: &  0 &  0 &  0 &  9 & 0.00011  \\
+ 35: &  0 &  0 &  0 &  0 & 0.00008  \\
+ \hline\hline
+\multicolumn{6}{|l|}{constant 32} \\ 
+\multicolumn{6}{|l|}{((32,32), (32,32), (32,32), (32,32),
+                      (32,32), (32,32), (32,32), (32,32))} \\ 
+ 33: &  0 &  4 &  52 &  86 & 0.00556  \\
+ \multicolumn{6}{ c }{$\vdots$\hfill$\vdots$} \\
+ 38: &  0 &  0 &  0 &  27 & 0.00024  \\
+ 39: &  0 &  0 &  0 &  15 & 0.00014  \\
+ 40: &  0 &  0 &  0 &  2 & 0.00010  \\
+ 41: &  0 &  0 &  0 &  0 & 0.00009  \\
+ \hline\hline
+\end{tabular}
+  \caption{Empirical results for variants of Threefish with a single rotation constant, sample size 25,000,000~pairs.}
+  \label{tab:ObservationsRepeatSame}
+\end{table}
+
+It is possible to do even worse. Constant 0 would be trivially weak, since the least significant bits in all words are linear. Given our related-key differential, constant 63 would need at least from round 11 to round 74 to propagate the difference from the most significant bits to the least significant bits. Since the full cipher only has 72 rounds, attacking that variant of Threefish would also be easy. Constant 1 is not quite as bad as 0 or 63, but diffusion would still be very slow. We conjecture that related-key attacks against a Threefish-256 variant with one-bit rotations and 72 rounds would be practical. 
+
+In summary, we searched for sets of rotation constants that would endanger or break Threefish.  The only such sets we could actually identify were \emph{obviously} malicious. Finding innocent-looking but dangerous sets of rotation constants for Threefish is still an open problem.
+
+\subsection{Cryptanalysis Summary}
+
+As our own cryptanalysis showed, it is feasible---in fact, quite easy---to create pseudo-near-collisions and pseudo-near-second-preimages for up to eight rounds of any variant of Skein; or rather, of the Skein compression function.  Here, ``near'' means Hamming-distance 2.  Using techniques similar to those from Section~\ref{sec:PushAttack}, one can push this from eight rounds to twelve rounds, at the cost of some significant but feasible amount of work. Assuming close to $2^n$ units of work, it may even be possible to find pseudo-near-second-preimages for up to sixteen rounds of the Skein-$n$ compression function, for all three versions of Skein.
+
+We stress that none of these attacks are applicable to reduced-round versions of the Skein hash function.  Our current attacks only deal with reduced-round versions of the compression function.  Due to Skein's output transformation, it remains an open problem how to create collisions or second preimages for the Skein hash function, even if one can create pseudo-collisions or pseudo-second-preimages for the compression function.
+
+We invite the reader to compare this to recent attacks on the security of the SHA-2 hash function family. The best implementable attacks on SHA-256 and SHA-512 we are aware of can generate collisions for up to 24 rounds of both SHA-256 and SHA-512 \cite{SS08a}. The time required for these attacks is between $2^{15.5}$ (for SHA-256, using a huge table for a speed-up trick) and $2^{32.5}$ (for SHA-512, without the huge table). As SHA-256 has 64 rounds and SHA-512 80 rounds, these attacks are a long way from actually endangering any member of the SHA-2 family. 
+
+Regarding the Threefish block cipher, we have discussed attacks for Threefish reduced to 24 to 26 rounds. Namely, the attacks were for 24 rounds of Threefish-256 (full cipher: 72 rounds), 25 rounds of Threefish-512 (full cipher: 72 rounds), and 26 rounds of Threefish-1024 (full cipher: 80 rounds). As cryptosystem designers, we are driven by reasonable pessimism. These attacks depend on certain optimistic assumptions by the adversary. 
+
+Additionally, we studied related-key boomerang attacks against Threefish, using a broken key schedule. For the broken variant, we described a probability-1 distinguisher for 16 rounds, and outlined how one might push this through a few more rounds when allowing smaller probabilities instead of probability 1. Because of the choice of our key schedule, one cannot actually apply these attacks to unmodified Threefish.
+
+The authors of \cite{KN10,RKN10} contributed rotational attacks. By changing our
+key schedule constant from $C_5$ to $\TheConst$, these attacks are no longer
+applicable to Threefish or Skein.
+
+The authors of \cite{ACMOPV09} presented some interesting ideas to improve the cryptanalysis of Threefish. Their best chosen-plaintext related-key key-recovery attack works for 26 rounds of Threefish-512, improving on our own conjectured attack by one round. Their best chosen plaintext/chosen ciphertext key-recovery attack is a boomerang attack on 32 rounds of Threefish-512. They extend this to a known-related-key distinguisher for 35 rounds. These attacks mostly exploit the Threefish structure and properties of the key schedule. One can adapt their observations and attacks to the new constants, with the possible exception of the impossible differential attack. In some cases, the adaptation may slightly change the number of rounds for which the attacks can be applied.  
+
+For comparison, consider the current state of cryptanalysis with the
+current state for the SHA-2 family.  As far as we know, the best
+attack on SHACAL-2, the block cipher inside SHA-256, penetrates 44
+rounds \cite{LK08}---more than two-thirds of the full 64 rounds.
+However, since it requires $2^{233}$ related-key chosen plaintexts and
+time $2^{497.2}$, the attack is entirely academic.  It is based on the
+related-key rectangle attack scenario, using a probability $2^{-460}$
+distinguisher for 35 rounds of SHACAL-2. Note that related-key
+rectangle attacks are close relatives to related-key boomerang
+attacks, considered by ourselves and in \cite{ACMOPV09}, for modified
+Threefish.
+
+
+\section{Skein Website}
+\label{sec:website}
+
+The Skein website is \purl{http://www.skein-hash.info/}.  In addition to the latest version of this paper, the website contains reference code, optimized code, and code to generate performance measurements, test vectors, and known answer tests.  We will continue to update the page with additional security proofs, cryptanalysis results, performance measurements, implementations, and so on.
+
+The website is always the source for the most up-to-date version of this paper, and the most up-to-date information about Skein.
+
+\section{Legal Disclaimer}
+
+To the best of our knowledge, neither the Skein hash function, the Threefish block cipher, the UBI hashing mode, nor our optional argument system, are encumbered by any patents.  We have not, and will not, apply for patents on any part of our design or anything in this document, and we are unaware of any other patents or patent filings that cover this work. The example source code---and all other code on the Skein website---is in the public domain and can be freely used.
+
+We make this submission to NIST's hash function competition solely as individuals.  Our respective employers neither endorse nor condemn this submission.
+
+\section{Acknowledgements}
+
+We would like to thank NIST for overseeing this competition, and our employers for allowing us the time to work on this submission.  We would also like to thank the external reviewers who analyzed our algorithm when it was in draft---Frederik Armknecht, Martin Cochran, Hal Finney, Gary Graunke, Susan Landau, Sascha M\"uller-Lobeck, Kerry McKay, and Ray Savarda---and the following students of Stefan Lucks: Ewan Fleischmann, Christian Forler, Michael Gorski, Dennis Hoppe, Martin Kausche, Stoyan Stoyanov, and Julian Seifert. We also thank Beth Friedman for editing the submission document with Sue Heim's valuable suggestions.  And finally, we would like to thank our respective families for putting up with all of the time and attention this project required.
+
+\section{About the Authors}
+
+The Skein team is essentially a group of friends. We've all worked on cryptography and cryptographic engineering for many years.  We've met and worked together many times; our team includes half of the Twofish team \cite{Twofish}. Our experiences are extensive and diverse, which was a great help in bringing all aspects of the design together. It also led to some very interesting discussions: a single e-mail thread might span mathematical proofs, PR, and political considerations, and discussions on how modern CPUs work. We had lots of fun.
+
+We realize our affiliations read like a powerful industry consortium, but we are not.  Our employers kindly agreed to let us do this work, but most of it was done on our own time.  Really, they only have the vaguest idea what we're doing.
+
+\appendix
+% I don't know why we need the \leavevmode, but the section heading gets swallowed otherwise
+\clearpage \leavevmode
+
+%
+% Ugly hack to put "Appendix" in front of each appendix letter in the text
+%
+\begingroup
+\makeatletter
+\renewcommand\section{\@startsection {section}{1}{\z@ Appendix }%
+                                   {-3.5ex \@plus -1ex \@minus -.2ex}%
+                                   {2.3ex \@plus.2ex}%
+                                   {\normalfont\Large\bfseries}}
+\makeatother
+
+\section{Overview of Symbols}
+
+This appendix gives an overview and index of the symbols used in the definition of Skein.
+
+\begin{list}{?}{%
+\parsep = \parskip
+\itemsep = 0.5ex plus .5ex minus 0.2 ex
+\leftmargin = 27 mm
+\rightmargin = 0cm
+\listparindent = \parindent
+\labelsep = 2 mm
+\labelwidth = 25 mm
+\renewcommand{\makelabel}[1]{$#1$\hfill}
+}
+
+\item[\BytesToWords] A function that converts a string of bytes to a string of 64-bit words.
+    \symdefref{BytesToWords}.
+
+\item[C]    The Threefish ciphertext \symdefref{C} or the configuration string \symdefref{Conf}.
+
+\item[c_i]  The words of ciphertext $C$. \symdefref{c_i}
+
+\item[e_{d,i}] The $i$th word of the result of the subkey addition (if any) in round $d$. \symdefref{e_di}
+
+\item[d]    The round number. \symdefref{d}
+
+\item[f_{d,i}] The $i$th word of the result of the MIX functions in round $d$. \symdefref{f_di}
+
+\item[G_i]  Chaining values between different UBI invocations. \symdefref{G_i}
+
+\item[H_i]  Chaining values used within a UBI computation. \symdefref{H_i}
+
+\item[K]    The key, either the Threefish key \symdefref{Threefish-K} or the Skein key. \symdefref{Skein-K}
+
+\item[K']    The processed key that starts the Skein UBI chain. \symdefref{Kprime}
+
+\item[k_i]  The words of the Threefish key $K$. \symdefref{k_i}
+
+\item[k_{s,i}] The words of subkey $s$. \symdefref{k_si}
+
+\item[M]    Used for various message strings.
+
+\item[M_i]  Block $i$ of message string $M$.
+
+\item[N_b]  The number of bytes in the state. \symdefref{N_b}
+
+\item[N_o]  The number of output bits of Skein. \symdefref{N_o}
+
+\item[N_r]  The number of rounds in Threefish. \symdefref{N_r}
+
+\item[N_w]  The number of words in the state. \symdefref{N_w}
+
+\item[P]    The plaintext input to Threefish. \symdefref{P}
+
+\item[p_i]  The words of plaintext $P$. \symdefref{p_i}
+
+\item[\pi(i)] The permutation applied to the state words in each round. \symdefref{pi}
+
+\item[R_{d,j}] The rotation constant for mix $j$ in round $d$. \symdefref{R_dj}
+
+\item[s]    The subkey number. \symdefref{s}
+
+\item[T]    The tweak value. \symdefref{T}
+
+\item[T_s]  The starting tweak value for UBI. \symdefref{T_s}
+
+\item[T_\text{xxx}] Various type value constants. \symdefref{T_xxx}
+
+\item[t_i]  The words of tweak $T$. \symdefref{t_i}
+
+\item[\ToBytes] A function that converts an integer to a string of bytes, LSB first. \symdefref{ToBytes}
+
+\item[\ToInt]   A function that converts a string of bytes to an integer, LSB first. \symdefref{ToInt}
+
+\item[v_{d,i}] The value of the $i$th word of the Threefish encryption state after $d$ rounds. \symdefref{v_di}
+
+\item[\WordsToBytes] A function that converts a string of 64-bit words to a string of bytes. \symdefref{WordsToBytes}
+
+\item[(x_0, x_1)] The inputs to a MIX function. \symdefref{x0x1}
+
+\item[Y_f]  Encoding of the fan-out for tree hashing. \symdefref{Y_f}
+
+\item[Y_l]  Encoding of the leaf node size for tree hashing. \symdefref{Y_l}
+
+\item[Y_m]  Maximum tree height for tree hashing. \symdefref{Y_m}
+
+\item[(y_0, y_1)] The outputs of a MIX function. \symdefref{y0y1}
+
+\end{list}
+
+
+\section{Initial Chaining Values}\label{sec:initialchainingvalues}
+
+These are the IV values for the configurations in Table~\ref{tab:versions}.  These constants are the output of the configuration UBI.  If you are using Skein as a normal hash function, you can use these IV values as constants and skip the configuration step entirely.  Note that these are 64-bit words, not byte strings.
+
+\subsection{Skein-256-128}
+\begin{verbatim}
+0xE1111906964D7260, 0x883DAAA77C8D811C, 0x10080DF491960F7A, 0xCCF7DDE5B45BC1C2
+\end{verbatim}
+
+\subsection{Skein-256-160}
+\begin{verbatim}
+0x1420231472825E98, 0x2AC4E9A25A77E590, 0xD47A58568838D63E, 0x2DD2E4968586AB7D
+\end{verbatim}
+
+\subsection{Skein-256-224}
+\begin{verbatim}
+0xC6098A8C9AE5EA0B, 0x876D568608C5191C, 0x99CB88D7D7F53884, 0x384BDDB1AEDDB5DE
+\end{verbatim}
+
+\subsection{Skein-256-256}
+\begin{verbatim}
+0xFC9DA860D048B449, 0x2FCA66479FA7D833, 0xB33BC3896656840F, 0x6A54E920FDE8DA69
+\end{verbatim}
+
+\subsection{Skein-512-128}
+\begin{verbatim}
+0xA8BC7BF36FBF9F52, 0x1E9872CEBD1AF0AA, 0x309B1790B32190D3, 0xBCFBB8543F94805C
+0x0DA61BCD6E31B11B, 0x1A18EBEAD46A32E3, 0xA2CC5B18CE84AA82, 0x6982AB289D46982D
+\end{verbatim}
+
+\subsection{Skein-512-160}
+\begin{verbatim}
+0x28B81A2AE013BD91, 0xC2F11668B5BDF78F, 0x1760D8F3F6A56F12, 0x4FB747588239904F
+0x21EDE07F7EAF5056, 0xD908922E63ED70B8, 0xB8EC76FFECCB52FA, 0x01A47BB8A3F27A6E
+\end{verbatim}
+
+\subsection{Skein-512-224}
+\begin{verbatim}
+0xCCD0616248677224, 0xCBA65CF3A92339EF, 0x8CCD69D652FF4B64, 0x398AED7B3AB890B4
+0x0F59D1B1457D2BD0, 0x6776FE6575D4EB3D, 0x99FBC70E997413E9, 0x9E2CFCCFE1C41EF7
+\end{verbatim}
+
+\subsection{Skein-512-256}
+\begin{verbatim}
+0xCCD044A12FDB3E13, 0xE83590301A79A9EB, 0x55AEA0614F816E6F, 0x2A2767A4AE9B94DB
+0xEC06025E74DD7683, 0xE7A436CDC4746251, 0xC36FBAF9393AD185, 0x3EEDBA1833EDFC13
+\end{verbatim}
+
+\subsection{Skein-512-384}
+\begin{verbatim}
+0xA3F6C6BF3A75EF5F, 0xB0FEF9CCFD84FAA4, 0x9D77DD663D770CFE, 0xD798CBF3B468FDDA
+0x1BC4A6668A0E4465, 0x7ED7D434E5807407, 0x548FC1ACD4EC44D6, 0x266E17546AA18FF8
+\end{verbatim}
+
+\subsection{Skein-512-512}
+\begin{verbatim}
+0x4903ADFF749C51CE, 0x0D95DE399746DF03, 0x8FD1934127C79BCE, 0x9A255629FF352CB1
+0x5DB62599DF6CA7B0, 0xEABE394CA9D5C3F4, 0x991112C71A75B523, 0xAE18A40B660FCC33
+\end{verbatim}
+
+\subsection{Skein-1024-384}
+\begin{verbatim}
+0x5102B6B8C1894A35, 0xFEEBC9E3FE8AF11A, 0x0C807F06E32BED71, 0x60C13A52B41A91F6
+0x9716D35DD4917C38, 0xE780DF126FD31D3A, 0x797846B6C898303A, 0xB172C2A8B3572A3B
+0xC9BC8203A6104A6C, 0x65909338D75624F4, 0x94BCC5684B3F81A0, 0x3EBBF51E10ECFD46
+0x2DF50F0BEEB08542, 0x3B5A65300DBC6516, 0x484B9CD2167BBCE1, 0x2D136947D4CBAFEA
+\end{verbatim}
+
+\subsection{Skein-1024-512}
+\begin{verbatim}
+0xCAEC0E5D7C1B1B18, 0xA01B0E045F03E802, 0x33840451ED912885, 0x374AFB04EAEC2E1C
+0xDF25A0E2813581F7, 0xE40040938B12F9D2, 0xA662D539C2ED39B6, 0xFA8B85CF45D8C75A
+0x8316ED8E29EDE796, 0x053289C02E9F91B8, 0xC3F8EF1D6D518B73, 0xBDCEC3C4D5EF332E
+0x549A7E5222974487, 0x670708725B749816, 0xB9CD28FBF0581BD1, 0x0E2940B815804974
+\end{verbatim}
+
+\subsection{Skein-1024-1024}
+\begin{verbatim}
+0xD593DA0741E72355, 0x15B5E511AC73E00C, 0x5180E5AEBAF2C4F0, 0x03BD41D3FCBCAFAF
+0x1CAEC6FD1983A898, 0x6E510B8BCDD0589F, 0x77E2BDFDC6394ADA, 0xC11E1DB524DCB0A3
+0xD6D14AF9C6329AB5, 0x6A9B0BFC6EB67E0D, 0x9243C60DCCFF1332, 0x1A1F1DDE743F02D4
+0x0996753C10ED0BB8, 0x6572DD22F2B4969A, 0x61FD3062D00A579A, 0x1DE0536E8682E539
+\end{verbatim}
+
+\section{Test Vectors}\label{sec:testvectors}
+
+\subsection{Skein-256-256}
+
+\begin{verbatim}
+Message data:
+     FF
+
+Result:
+     0B 98 DC D1 98 EA 0E 50 A7 A2 44 C4 44 E2 5C 23
+     DA 30 C1 0F C9 A1 F2 70 A6 63 7F 1F 34 E6 7E D2
+							 			 
+Message data:				 			 
+     FF FE FD FC FB FA F9 F8 F7 F6 F5 F4 F3 F2 F1 F0
+     EF EE ED EC EB EA E9 E8 E7 E6 E5 E4 E3 E2 E1 E0
+							 			 
+Result:						 			 
+     8D 0F A4 EF 77 7F D7 59 DF D4 04 4E 6F 6A 5A C3
+     C7 74 AE C9 43 DC FC 07 92 7B 72 3B 5D BF 40 8B
+							 			 
+Message data:				 			 
+     FF FE FD FC FB FA F9 F8 F7 F6 F5 F4 F3 F2 F1 F0
+     EF EE ED EC EB EA E9 E8 E7 E6 E5 E4 E3 E2 E1 E0
+     DF DE DD DC DB DA D9 D8 D7 D6 D5 D4 D3 D2 D1 D0
+     CF CE CD CC CB CA C9 C8 C7 C6 C5 C4 C3 C2 C1 C0
+
+Result:
+     DF 28 E9 16 63 0D 0B 44 C4 A8 49 DC 9A 02 F0 7A
+     07 CB 30 F7 32 31 82 56 B1 5D 86 5A C4 AE 16 2F
+\end{verbatim}
+
+\subsection{Skein-512-512}
+
+\begin{verbatim}
+
+Message data:
+     FF
+
+Result:
+     71 B7 BC E6 FE 64 52 22 7B 9C ED 60 14 24 9E 5B
+     F9 A9 75 4C 3A D6 18 CC C4 E0 AA E1 6B 31 6C C8
+     CA 69 8D 86 43 07 ED 3E 80 B6 EF 15 70 81 2A C5
+     27 2D C4 09 B5 A0 12 DF 2A 57 91 02 F3 40 61 7A
+
+Message data:
+     FF FE FD FC FB FA F9 F8 F7 F6 F5 F4 F3 F2 F1 F0
+     EF EE ED EC EB EA E9 E8 E7 E6 E5 E4 E3 E2 E1 E0
+     DF DE DD DC DB DA D9 D8 D7 D6 D5 D4 D3 D2 D1 D0
+     CF CE CD CC CB CA C9 C8 C7 C6 C5 C4 C3 C2 C1 C0
+
+Result:
+     45 86 3B A3 BE 0C 4D FC 27 E7 5D 35 84 96 F4 AC
+     9A 73 6A 50 5D 93 13 B4 2B 2F 5E AD A7 9F C1 7F
+     63 86 1E 94 7A FB 1D 05 6A A1 99 57 5A D3 F8 C9
+     A3 CC 17 80 B5 E5 FA 4C AE 05 0E 98 98 76 62 5B
+
+Message data:
+     FF FE FD FC FB FA F9 F8 F7 F6 F5 F4 F3 F2 F1 F0
+     EF EE ED EC EB EA E9 E8 E7 E6 E5 E4 E3 E2 E1 E0
+     DF DE DD DC DB DA D9 D8 D7 D6 D5 D4 D3 D2 D1 D0
+     CF CE CD CC CB CA C9 C8 C7 C6 C5 C4 C3 C2 C1 C0
+     BF BE BD BC BB BA B9 B8 B7 B6 B5 B4 B3 B2 B1 B0
+     AF AE AD AC AB AA A9 A8 A7 A6 A5 A4 A3 A2 A1 A0
+     9F 9E 9D 9C 9B 9A 99 98 97 96 95 94 93 92 91 90
+     8F 8E 8D 8C 8B 8A 89 88 87 86 85 84 83 82 81 80
+
+Result:
+     91 CC A5 10 C2 63 C4 DD D0 10 53 0A 33 07 33 09
+     62 86 31 F3 08 74 7E 1B CB AA 90 E4 51 CA B9 2E
+     51 88 08 7A F4 18 87 73 A3 32 30 3E 66 67 A7 A2
+     10 85 6F 74 21 39 00 00 71 F4 8E 8B A2 A5 AD B7
+
+\end{verbatim}
+
+\subsection{Skein-1024-1024}
+
+\begin{verbatim}
+
+Message data:
+     FF
+
+Result:
+     E6 2C 05 80 2E A0 15 24 07 CD D8 78 7F DA 9E 35
+     70 3D E8 62 A4 FB C1 19 CF F8 59 0A FE 79 25 0B
+     CC C8 B3 FA F1 BD 24 22 AB 5C 0D 26 3F B2 F8 AF
+     B3 F7 96 F0 48 00 03 81 53 1B 6F 00 D8 51 61 BC
+     0F FF 4B EF 24 86 B1 EB CD 37 73 FA BF 50 AD 4A
+     D5 63 9A F9 04 0E 3F 29 C6 C9 31 30 1B F7 98 32
+     E9 DA 09 85 7E 83 1E 82 EF 8B 46 91 C2 35 65 65
+     15 D4 37 D2 BD A3 3B CE C0 01 C6 7F FD E1 5B A8
+
+Message data:
+     FF FE FD FC FB FA F9 F8 F7 F6 F5 F4 F3 F2 F1 F0
+     EF EE ED EC EB EA E9 E8 E7 E6 E5 E4 E3 E2 E1 E0
+     DF DE DD DC DB DA D9 D8 D7 D6 D5 D4 D3 D2 D1 D0
+     CF CE CD CC CB CA C9 C8 C7 C6 C5 C4 C3 C2 C1 C0
+     BF BE BD BC BB BA B9 B8 B7 B6 B5 B4 B3 B2 B1 B0
+     AF AE AD AC AB AA A9 A8 A7 A6 A5 A4 A3 A2 A1 A0
+     9F 9E 9D 9C 9B 9A 99 98 97 96 95 94 93 92 91 90
+     8F 8E 8D 8C 8B 8A 89 88 87 86 85 84 83 82 81 80
+
+Result:
+     1F 3E 02 C4 6F B8 0A 3F CD 2D FB BC 7C 17 38 00
+     B4 0C 60 C2 35 4A F5 51 18 9E BF 43 3C 3D 85 F9
+     FF 18 03 E6 D9 20 49 31 79 ED 7A E7 FC E6 9C 35
+     81 A5 A2 F8 2D 3E 0C 7A 29 55 74 D0 CD 7D 21 7C
+     48 4D 2F 63 13 D5 9A 77 18 EA D0 7D 07 29 C2 48
+     51 D7 E7 D2 49 1B 90 2D 48 91 94 E6 B7 D3 69 DB
+     0A B7 AA 10 6F 0E E0 A3 9A 42 EF C5 4F 18 D9 37
+     76 08 09 85 F9 07 57 4F 99 5E C6 A3 71 53 A5 78
+
+Message data:
+     FF FE FD FC FB FA F9 F8 F7 F6 F5 F4 F3 F2 F1 F0
+     EF EE ED EC EB EA E9 E8 E7 E6 E5 E4 E3 E2 E1 E0
+     DF DE DD DC DB DA D9 D8 D7 D6 D5 D4 D3 D2 D1 D0
+     CF CE CD CC CB CA C9 C8 C7 C6 C5 C4 C3 C2 C1 C0
+     BF BE BD BC BB BA B9 B8 B7 B6 B5 B4 B3 B2 B1 B0
+     AF AE AD AC AB AA A9 A8 A7 A6 A5 A4 A3 A2 A1 A0
+     9F 9E 9D 9C 9B 9A 99 98 97 96 95 94 93 92 91 90
+     8F 8E 8D 8C 8B 8A 89 88 87 86 85 84 83 82 81 80
+     7F 7E 7D 7C 7B 7A 79 78 77 76 75 74 73 72 71 70
+     6F 6E 6D 6C 6B 6A 69 68 67 66 65 64 63 62 61 60
+     5F 5E 5D 5C 5B 5A 59 58 57 56 55 54 53 52 51 50
+     4F 4E 4D 4C 4B 4A 49 48 47 46 45 44 43 42 41 40
+     3F 3E 3D 3C 3B 3A 39 38 37 36 35 34 33 32 31 30
+     2F 2E 2D 2C 2B 2A 29 28 27 26 25 24 23 22 21 20
+     1F 1E 1D 1C 1B 1A 19 18 17 16 15 14 13 12 11 10
+     0F 0E 0D 0C 0B 0A 09 08 07 06 05 04 03 02 01 00
+
+Result:
+     84 2A 53 C9 9C 12 B0 CF 80 CF 69 49 1B E5 E2 F7
+     51 5D E8 73 3B 6E A9 42 2D FD 67 66 65 B5 FA 42
+     FF B3 A9 C4 8C 21 77 77 95 08 48 CE CD B4 8F 64
+     0F 81 FB 92 BE F6 F8 8F 7A 85 C1 F7 CD 14 46 C9
+     16 1C 0A FE 8F 25 AE 44 4F 40 D3 68 00 81 C3 5A
+     A4 3F 64 0F D5 FA 3C 3C 03 0B CC 06 AB AC 01 D0
+     98 BC C9 84 EB D8 32 27 12 92 1E 00 B1 BA 07 D6
+     D0 1F 26 90 70 50 25 5E F2 C8 E2 4F 71 6C 52 A5
+\end{verbatim}
+
+\section{NIST SHA-3 Round 2 Tweak: Rotation Constants}\label{sec:tweakrotconst}
+
+This specification of Skein includes a ``tweak'' for Round 2 of the NIST SHA-3 competition: the rotation constants shown in Table~\ref{tab:rotations}, which differ from those originally submitted to NIST in October 2008 \cite{Skein1}. All existing and future implementations of Skein \emph{must} now use these new rotation constants to be compliant. Changing the rotation constants required updating all the Skein test vectors
+(Appendix~\ref{sec:testvectors}) and the precomputed initial chaining values
+(Appendix~\ref{sec:initialchainingvalues}).
+
+Some comments are in order about the tweak. It is our belief that the structure of Threefish would be secure with almost any randomly chosen set of rotation constants (see Section~\ref{sec:random-constants}). Indeed, it would give us pause if a randomly generated set of rotation constants were to result in an attack.
+
+Nonetheless, given that we have an opportunity to select the rotation constants, it makes sense to maximize diffusion as much as possible. In the original NIST submission, we had limited time to design and run a search algorithm for rotation constants. Some time after the initial submission, Guillaume Sevestre contacted us \cite{Sev09} with a suggestion for a new search algorithm---the evolutionary algorithm described in Section~\ref{sec:threefishdesign}---which indeed produced significantly better results using the original diffusion metric. Many thanks are due to Guillaume for collaborating at length to educate us on the new search algorithm and how to optimize it.
+
+In addition, the metric used in the original search (see the definition of $H_{min}$ in Section~\ref{sec:deprecatedrotconst}) was not the most appropriate value to optimize.  This metric minimized the probability that a particular input bit flip would \emph{not} trigger a particular output bit flip, but it did not attempt to minimize the probability such a bit flip \emph{would} occur. A slightly different metric, minimizing the maximum bias from $K/2$ in the histogram, was more consistent with what we were trying to achieve. Not surprisingly, we found that some of the original rotation constant sets resulted in rather poor values of the maximum bias due to cases where the output bit almost always flipped for a given input bit difference.  Again, we do not expect that this bias using the original rotation constants would result in an attack, but as long as we were considering a tweak, it seemed best to use the bias as the new search metric. 
+
+When we ran the new search algorithm for two days using the improved bias metric, the gains in both diffusion metrics were generally impressive. A comparison is given in Table~\ref{tab:rotbiascomparison}, with the bias listed as the maximum deviation from $0.5$ of the value $x = h_i/K$ across the entire histogram, where $h_i$ is an entry in the histogram.  Thus, a smaller value in the table indicates better diffusion, and the worst possible value (i.e., 0.50) would indicate that at least one output bit location \emph{never} (or always) changed for a given input bit difference. Again, the original bias metric only took into account values of $x < 0.5$, while the new metric includes both positive and negative deviations, so bias values obtained using the old metric will never be larger than those using the new metric. Note that a later six-day search improved the metric by only about 0.002 for Skein-256, 0.007 for Skein-512, and 0.014 for Skein-1024 over the two-day results (i.e., those in Table~\ref{tab:rotations}), so we are comfortable using the latter.
+
+
+\begin{table}[htb]
+  \begin{center}
+    \begin{tabular}{|l|c|c|c|}
+    \hline
+    & \multicolumn{3}{c|}{Rotation Constants}  \\ \cline{2-4}
+    & \multicolumn{2}{c|}{Old Set} & New Set   \\ \cline{2-4}
+    & Old Metric & New Metric & New Metric     \\
+	\hline
+    Skein-256  & 0.10  &  0.24  & 0.09  \\
+    Skein-512  & 0.33  &  0.48  & 0.25  \\
+    Skein-1024 & 0.26  &  0.35  & 0.15  \\
+    \hline
+    \end{tabular}
+   \end{center}\caption{Maximum observed bias for $K=16384$}
+  \label{tab:rotbiascomparison}
+\end{table}
+
+Given these two results and the fact that NIST has allowed tweaks at this point in the competition, we feel it would be somewhat irresponsible \emph{not} to submit the improved rotation constant set. However, if NIST decides for some reason that changing the Skein rotation constants is not allowed as a tweak, then we will happily stay with the original Skein definition.
+
+If NIST accepts the tweak, we recommend that NIST consider as relevant any attack which uses either the old or the new rotation constants, so that all cryptanalysis efforts on the original rotation constants are still valid.
+
+\subsection{Deprecated Skein Rotation Constants}\label{sec:deprecatedrotconst}
+
+For historical reference, this section includes excerpts from the originally submitted Skein specification \cite{Skein1}, showing the (now deprecated) rotation constants in Table~\ref{tab:rotations_v1_0}, as well as the original description of the search algorithm used to generate them. 
+
+\begin{table}[htbp]
+  \begin{center}
+    \begin{tabular}{|cc|rr|rrrr|rrrrrrrr|}
+    \hline
+    \multicolumn{2}{|c|}{$N_w$}&\multicolumn{2}{c|}{4}&\multicolumn{4}{c|}{8}&\multicolumn{8}{c|}{16}\\
+    \hline
+    \multicolumn{2}{|c|}{$j$}& 0 & 1 & 0 & 1 & 2 & 3 & 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7\\
+    \hline
+    \hline
+            & 0 &   5 & 56 &   38 & 30 & 50 & 53 &   55 & 43 & 37 & 40 & 16 & 22 & 38 & 12  \\
+            & 1 &  36 & 28 &   48 & 20 & 43 & 31 &   25 & 25 & 46 & 13 & 14 & 13 & 52 & 57  \\
+            & 2 &  13 & 46 &   34 & 14 & 15 & 27 &   33 &  8 & 18 & 57 & 21 & 12 & 32 & 54  \\
+    $d =$   & 3 &  58 & 44 &   26 & 12 & 58 &  7 &   34 & 43 & 25 & 60 & 44 &  9 & 59 & 34  \\
+            & 4 &  26 & 20 &   33 & 49 &  8 & 42 &   28 &  7 & 47 & 48 & 51 &  9 & 35 & 41  \\
+            & 5 &  53 & 35 &   39 & 27 & 41 & 14 &   17 &  6 & 18 & 25 & 43 & 42 & 40 & 15  \\
+            & 6 &  11 & 42 &   29 & 26 & 11 &  9 &   58 &  7 & 32 & 45 & 19 & 18 &  2 & 56  \\
+            & 7 &  59 & 50 &   33 & 51 & 39 & 35 &   47 & 49 & 27 & 58 & 37 & 48 & 53 & 56  \\
+       \hline
+    \end{tabular}
+  \end{center}\caption{Deprecated Skein rotation constants $R_{d,j}$ for each $N_w$.}
+  \label{tab:rotations_v1_0}
+\end{table}
+
+Our goal was to choose rotation constants that maximized diffusion across the entire cipher.  We used a three-phase process to select the final set.
+
+In phase one, we selected candidate sets of rotation constants that maximized the Hamming weight of a simplified version of Threefish.  In this modified version, we replaced the addition and XOR operations in the Threefish MIX function with the logical OR operation.  We then generated a random set of rotation constants and, using an all-zero plaintext, injected a single input bit difference at each input bit location.  After $R$ rounds, we measured the minimum Hamming weight of each of the $N$ output words across all input difference locations.  If the Hamming weight value was less than a threshold $W$, we rejected the rotation set and randomly chose another.  If it was greater than or equal to $W$, we saved it for phase two.
+
+We selected values of $R$ and $W$ empirically based on the block size.  The general idea was to choose values that were at the knee of the diffusion curve.  In other words, if we chose $R$ to be too small, then all rotation sets looked alike.  If we chose $R$ to be too large, then the minimum Hamming weight quickly reached 64 bits.  Similarly, if we chose $W$ to be too small, then all rotation sets passed; and if we chose $W$ to be too large, none passed.  After some experimentation, we settled on the $(R,W)$ sets of $(8,61)$, $(8,47)$, and $(9,51)$ for Threefish-256, -512, and -1024, respectively.
+
+Our search algorithm used a hill-climbing algorithm, initially accepting rotation constant sets with Hamming weight metric $(W-4)$ and then trying to modify pairs of rotation constants in the set to walk up to the value $W$, and beyond, if possible. In our random selections, we rejected any rotation constants with value $0$, $+1$, and $-1$, since the add and XOR operations in the MIX function already provided diffusion to adjacent bits.
+
+Phase one was very useful as an initial filter because it was much faster than running the actual Threefish rounds, primarily because this metric is rotationally invariant. That is, we actually ran the diffusion test using only a single bit difference position per word, which sped up this phase by a factor of 64. We could also have used XOR instead of logical OR here, but the former would have included cancellations and hidden the true diffusion rate of a candidate set of rotation constants, so we felt that using OR was a better choice.
+
+In phase two, we took all the sets of rotation constants collected in the first phase.  We selected $K$ random plaintexts and injected a small difference pattern in each possible input bit location, using the actual Threefish round function.  We chose $K$ to be 1024: small enough to run fairly quickly, but large enough to grade the rotation sets with reasonable probability.
+
+At each bit position, we used an input difference pattern of up to three bits, with a nonzero difference in the first bit; i.e., the bit patterns 001, 011, 101, and 111.  We generated a histogram for each output bit as to whether that bit changed for each input difference, after $R$ rounds, ignoring the key injection.  For example, in Threefish-512 this meant that the histogram had an array of 512x512 (256K) entries. We generated separate histograms for each input difference bit pattern, for a total of four different histograms per rotation constant set.
+
+For a truly random function, the expected value for each histogram entry would be $K/2$ with a binomial distribution.  Of course, with these small values of $R$ the function is not truly random, but the goal was simply to choose a reasonable metric to grade the sets of rotation constants.  For each set of rotation constants, we computed the minimum value, called $H_{min}$, across all four histograms, for $K$ plaintexts.  We retained the $N_f$ rotation sets with maximum $H_{min}$ value as ``finalists'' to use in phase three, with $N_f=16$.
+
+For each set of rotation constants selected in the first phase, the set of rotation constants generated by scaling by any fixed odd integer (mod 64) also has the same Hamming weight properties in the simplified OR-only version of Threefish.  Therefore, in the second phase, we also tested all 32 such scaled versions for the best $H_{min}$ value.
+
+In phase three, we re-graded the $N_f$ finalist sets of rotation constants using larger values of $K$---4096, 8192, and 16,384---to minimize the expected statistical sampling error.  Based on the relative rankings of the rotation constant sets in phase three, we chose the winner. In the case of Threefish-256, choosing the winner was somewhat arbitrary, as there were several leading contenders with similar $H_{min}$ values, and the relative rankings changed slightly for different values of $K$.
+
+We ran this three-phase selection process for all three Threefish block sizes.  The overall run time for the search was 2--3 days on an Intel Core 2 Duo CPU running in 64-bit mode, though this was actually split up and run on separate CPUs for the separate block sizes, to minimize elapsed time.
+
+\subsection{Empirical Cryptanalysis on the Deprecated Rotation Constants}
+
+For historical reference, this section describes a cryptanalytic experiment presented in the originally submitted Skein specification \cite{Skein1}.  This analysis is updated for the new rotation constants in Section~\ref{sec:related-key-attacks}.
+
+In 2008, Martin Kausche \cite{MartinKausche2008} performed a number of experiments regarding related-key attacks on reduced-round Threefish. We cited his results in \cite{Skein1}, in the context of our own preliminary cryptanalysis of Threefish and Skein. 
+
+\begin{table}[tbh]
+  \centering
+\begin{tabular}{|c|ccc|cccc|c|}
+\hline
+round & \multicolumn{ 2}{c}{maximum bias at}& prob. & \multicolumn{ 4}{c|}{\# bits with bias} &
+average bias \\
+$r$ & word $w$ & bit $b$ & $p^r_{w,b}$ & $>0.1$ & $>0.05$ & $>0.01$ & $>$ 0.001 & \\  \hline
+ 9 &  0 &  0 &    1.00000 &  256 &  256 &  256 &  256 &    0.50000 \\ \hline
+ 10 &  0 &  0 &    1.00000 &  256 &  256 &  256 &  256 &    0.50000 \\ \hline
+ 11 &  0 &  0 &    1.00000 &  254 &  254 &  254 &  254 &    0.49218 \\ \hline
+ 12 &  0 &  0 &    1.00000 &  245 &  245 &  245 &  245 &    0.45322 \\ \hline
+ 13 &  0 &  0 &    1.00000 &  216 &  223 &  223 &  223 &    0.34278 \\ \hline
+ 14 &  2 &  2 &    0.00418 &  147 &  175 &  188 &  188 &    0.17837 \\ \hline
+ 15 &  2 &  1 &    0.97631 &  37 &  60 &  114 &  132 &    0.04378 \\ \hline
+ 16 &  0 &  1 &    0.38875 &  1 &  1 &  18 &  55 &    0.00285 \\ \hline
+ 17 &  2 &  0 &    0.52350 &  0 &  0 &  1 &  3 &    0.00020 \\ \hline
+ 18 &  3 &  17 &    0.49969 &  0 &  0 &  0 &  0 &    0.00009 \\ \hline
+ 19 &  3 &  35 &    0.49971 &  0 &  0 &  0 &  0 &    0.00009 \\ \hline
+ 20 &  0 &  43 &    0.49961 &  0 &  0 &  0 &  0 &    0.00009 \\ \hline
+\end{tabular}
+  \caption{Empirical results for Threefish-256 with deprecated rotation constants \cite{MartinKausche2008}, sample size 20,000,000 pairs.}
+  \label{tab:KauscheObservationsFor256}
+\end{table}
+
+\begin{table}[tbh]
+ \centering
+\begin{tabular}{|c|ccc|cccc|c|}
+\hline
+round & \multicolumn{ 2}{c}{maximum bias at}& prob. & \multicolumn{ 4}{c|}{\# bits with bias} &
+average bias \\
+$r$ & word $w$ & bit $b$ & $p^r_{w,b}$ & $>0.1$ & $>0.05$ & $>0.01$ & $>$ 0.001 & \\  \hline
+9 &  0 &  0 &    1.00000 &  512 &  512 &  512 &  512 &    0.50000 \\ \hline
+10 &  0 &  0 &    1.00000 &  512 &  512 &  512 &  512 &    0.50000 \\ \hline
+11 &  0 &  0 &    1.00000 &  510 &  510 &  510 &  510 &    0.49609 \\ \hline
+12 &  0 &  0 &    1.00000 &  501 &  501 &  501 &  501 &    0.47666 \\ \hline
+13 &  0 &  0 &    1.00000 &  462 &  466 &  466 &  466 &    0.39772 \\ \hline
+14 &  0 &  42 &    0.99999 &  366 &  389 &  402 &  403 &    0.25770 \\ \hline
+15 &  2 &  1 &    0.00963 &  141 &  197 &  256 &  278 &    0.07316 \\ \hline
+16 &  4 &  0 &    0.06533 &  7 &  21 &  65 &  100 &    0.00723 \\ \hline
+17 &  0 &  4 &    0.49655 &  0 &  0 &  0 &  4 &    0.00011 \\ \hline
+18 &  6 &  59 &    0.50036 &  0 &  0 &  0 &  0 &    0.00009 \\ \hline
+19 &  6 &  32 &    0.50031 &  0 &  0 &  0 &  0 &    0.00009 \\ \hline
+20 &  2 &  58 &    0.50036 &  0 &  0 &  0 &  0 &    0.00009 \\ \hline
+\end{tabular}
+\caption{Empirical results for Threefish-512 with deprecated rotation constants \cite{MartinKausche2008}, sample size 20,000,000~pairs.}
+\label{tab:KauscheObservationsFor512}
+\end{table}
+
+\begin{table}[tbh]
+\centering
+\begin{tabular}{|c|ccc|cccc|c|}
+\hline
+round & \multicolumn{ 2}{c}{maximum bias at}& prob. & \multicolumn{ 4}{c|}{\# bits with bias} &
+average bias \\
+$r$ & word $w$ & bit $b$ & $p^r_{w,b}$ & $>0.1$ & $>0.05$ & $>0.01$ & $>$
+0.001 & \\  \hline
+9 &  0 &  0 &    1.00000 &  1024 &  1024 &  1024 &  1024 &    0.50000 \\ \hline
+10 &  0 &  0 &    1.00000 &  1024 &  1024 &  1024 &  1024 &    0.50000 \\ \hline
+11 &  0 &  0 &    1.00000 &  1022 &  1022 &  1022 &  1022 &    0.49805 \\ \hline
+12 &  0 &  0 &    1.00000 &  1013 &  1013 &  1013 &  1013 &    0.48829 \\ \hline
+13 &  0 &  0 &    1.00000 &  981 &  981 &  981 &  981 &    0.45041 \\ \hline
+14 &  0 &  0 &    1.00000 &  869 &  900 &  907 &  914 &    0.35832 \\ \hline
+15 &  9 &  0 &    1.00000 &  598 &  670 &  743 &  772 &    0.20242 \\ \hline
+16 &  0 &  1 &    0.97589 &  148 &  232 &  381 &  461 &    0.04239 \\ \hline
+17 &  10 &  1 &    0.70448 &  5 &  8 &  31 &  87 &    0.00173 \\ \hline
+18 &  6 &  0 &    0.48980 &  0 &  0 &  1 &  2 &    0.00010 \\ \hline
+19 &  3 &  13 &    0.50040 &  0 &  0 &  0 &  0 &    0.00009 \\ \hline
+20 &  7 &  15 &    0.50040 &  0 &  0 &  0 &  0 &    0.00009 \\ \hline
+\end{tabular}
+\caption{Empirical results for Threefish-1024 with deprecated rotation constants \cite{MartinKausche2008}, sample size 20,000,000~pairs.}
+\label{tab:KauscheObservationsFor1024}
+\end{table} 
+
+For each of Threefish-256, -512, and -1024, twenty million random pairs (20,000,000 $\approx 2^{24.25}$) with the specified difference in plaintext, key, and tweak were generated. The probability that the bit is the same in both ciphertexts of a ciphertext pair is written as $p^r_{w,b}$, and the bias is $|p^r_{w,b}-0.5|$. Tables~\ref{tab:KauscheObservationsFor256}, \ref{tab:KauscheObservationsFor512}, and \ref{tab:KauscheObservationsFor1024} summarize the results. The results we described above confirm what we already pointed out in \cite{Skein1}: there is a significant bias for 17 rounds of both Threefish-256 and Threefish-512 that disappears in round 18 and later, and a significant bias for 18 rounds of Threefish-1024.
+
+For each round $r$, Tables~\ref{tab:KauscheObservationsFor256}--\ref{tab:KauscheObservationsFor1024} describe the coordinates $w$ and $b$ of a bit with maximum bias and its actual probability. (There may be other bits with the same bias.) The tables also give the number of bits with ``large'' bias for each round; i.e., the number of bits with a bias exceeding 10\%, 5\%, 1\%, and 0.1\%, respectively. Furthermore, the tables give the average bias over all the 256/512/1024 bits considered.  The tables focus on the ``interesting'' rounds. 
+
+\subsection{New Empirical Cryptanalysis on the Deprecated Rotation Constants}
+
+To provide additional comparison between the new and old sets of rotation constants, we ran the same sets of experiments for Threefish with the deprecated rotation constants as we did for Threefish in Section~\ref{sec:related-key-attacks} in Tables~\ref{tab:ObservationsFor256}--\ref{tab:ObservationsFor1024}.
+
+\begin{table}[tbh]
+  \centering
+\begin{tabular}{|r|rrrr|r|}
+\hline
+round & \multicolumn{ 4}{c|}{\# bits with bias} & average\\
+$r$ & \quad full & $>0.1$ & $>0.01$ & $>$ 0.001 & bias \\  \hline
+0--10: &  256 &  256 &  256 &  256 & 0.50000  \\
+ 11: &  204 &  254 &  254 &  254 & 0.49219  \\
+ 12: &  77 &  242 &  242 &  242 & 0.43052  \\
+ 13: &  21 &  223 &  223 &  223 & 0.34280  \\
+ 14: &  0 &  175 &  188 &  188 & 0.17840  \\
+ 15: &  0 &  60 &  121 &  132 & 0.04377  \\
+ 16: &  0 &  1 &  30 &  50 & 0.00305  \\
+ 17: &  0 &  0 &  1 &  3 & 0.00017  \\
+ 18: &  0 &  0 &  0 &  0 & 0.00005  \\
+ 19: &  0 &  0 &  0 &  0 & 0.00006  \\
+ 20: &  0 &  0 &  0 &  0 & 0.00006  \\
+\hline
+\end{tabular}
+\caption{Empirical results for Threefish-256 with deprecated rotation constants, sample size 50,000,000~pairs.}
+\label{tab:Observations-256-Old}
+\end{table}
+
+\begin{table}[tbh]
+  \centering
+\begin{tabular}{|r|rrrr|r|}
+\hline
+round & \multicolumn{ 4}{c|}{\# bits with bias} & average\\
+$r$ & \quad full & $>0.1$ & $>0.01$ & $>$ 0.001 & bias \\  \hline
+ 0--10: &  512 &  512 &  512 &  512 & 0.50000  \\
+ 11: &  470 &  510 &  510 &  510 & 0.49609  \\
+ 12: &  266 &  495 &  495 &  495 & 0.46008  \\
+ 13: &  64 &  466 &  466 &  466 & 0.39772  \\
+ 14: &  0 &  389 &  403 &  403 & 0.25769  \\
+ 15: &  0 &  197 &  269 &  280 & 0.07315  \\
+ 16: &  0 &  19 &  74 &  102 & 0.00678  \\
+ 17: &  0 &  0 &  0 &  5 & 0.00008  \\
+ 18: &  0 &  0 &  0 &  0 & 0.00006  \\
+ 19: &  0 &  0 &  0 &  0 & 0.00005  \\
+ 20: &  0 &  0 &  0 &  0 & 0.00005  \\
+\hline
+\end{tabular}
+\caption{Empirical results for Threefish-512 with deprecated rotation constants, sample size 50,000,000~pairs.}
+\label{tab:Observations-512-Old}
+\end{table}
+
+\begin{table}[tbh]
+  \centering
+\begin{tabular}{|r|rrrr|r|}
+\hline
+round & \multicolumn{ 4}{c|}{\# bits with bias} & average\\
+$r$ & \quad full & $>0.1$ & $>0.01$ & $>$ 0.001 & bias \\  \hline
+ 0--10: &  1024 &  1024 &  1024 &  1024 & 0.50000  \\
+ 11: &  976 &  1022 &  1022 &  1022 & 0.49805  \\
+ 12: &  735 &  1008 &  1008 &  1008 & 0.48120  \\
+ 13: &  403 &  981 &  981 &  981 & 0.45041  \\
+ 14: &  99 &  900 &  912 &  914 & 0.35832  \\
+ 15: &  8 &  670 &  753 &  773 & 0.20241  \\
+ 16: &  0 &  211 &  389 &  464 & 0.03858  \\
+ 17: &  0 &  8 &  47 &  105 & 0.00171  \\
+ 18: &  0 &  0 &  1 &  2 & 0.00007  \\
+ 19: &  0 &  0 &  0 &  0 & 0.00006  \\
+ 20: &  0 &  0 &  0 &  0 & 0.00005  \\
+\hline
+\end{tabular}
+\caption{Empirical results for Threefish-1024 with deprecated rotation constants, sample size 50,000,000~pairs.}
+\label{tab:Observations-1024-Old}
+\end{table}
+
+As it turned out, the deprecated and new 256-bit versions are basically the same: the new Skein-256 has a bit with bias $>0.01$ in round 16, but the deprecated version has more bits with a bias $>0.001$. The deprecated and new 512-bit versions are similarly the same.
+
+For Skein-1024, we are able to distinguish 18 rounds of Threefish-1024 with the deprecated rotation constants from random. There is even one bit with a bias exceeding $0.01$. Since we could not find a single bit with a bias exceeding $0.001$ for 18 rounds of Threefish-1024 with the new rotation constants, the new version is narrowly ahead of the deprecated version.
+
+\section{Empirical data for tweaking the key schedule constant}
+  \label{sec:empiricalks}
+
+\subsection{Defining the requirements for the new constant}
+
+Recall that we write $\mbox{hwd}(C)$ for the deviation of the Hamming
+weight of $C$ from the ideal value of 32, $\mbox{rc}(C)$ for the number of
+consecutive 1-bits, and 
+\begin{eqnarray*}
+   F(C,i)   = & C \oplus (C \rol i) & \mbox{for}\ 1 \le i \le 63, \\
+   F(C,i,j) = & C \oplus (C \rol i) \oplus (C \rol j) &
+    \mbox{for}\ 1 \le i < j \le 63,
+   \\  
+   F(C,i,j,k) = & C \oplus (C \rol i) \oplus (C \rol j) \oplus (C \rol k) &
+    \mbox{for}\ 1 \le i < j < k \le 63, \ \mbox{and}
+  \\
+    F(C,i,j,k,\ell) = & C \oplus (C \rol i) \oplus (C \rol j) 
+                      \oplus (C \rol k) \oplus (C \rol \ell) &
+    \mbox{for}\ 1 \le i < j < k <  \ell \le 63.
+\end{eqnarray*}
+
+\begin{table}[htbp]
+  \centering
+  \begin{tabular}{rrr}
+\hline
+H1=\\
+$\max_i(\mbox{hwd}(F(C,i)))$ & count & ratio \\
+\hline
+  4 &    211 &   0\% \\
+  6 &  17174 &   8\% \\
+  \textbf{8} &  85405 &  49\% \\
+ 10 &  63377 &  80\% \\
+ 12 &  33791 &  96\% \\
+ 14 &   5130 &  98\% \\
+ 16 &   2910 & 100\% \\
+ 18 &     84 & 100\% \\
+ 20 &    215 & 100\% \\
+ 24 &     11 & 100\% \\
+\\
+\hline
+H2=\\
+$\max_{i,j}(\mbox{hwd}(F(C,i,j)))$ & count & ratio \\
+\hline
+  8 &      8 &  0\%  \\
+ 10 &   7890 &  4\%  \\
+ \textbf{12} & 109760 & 56\%  \\
+ 14 &  76030 & 93\%  \\
+ 16 &  13229 & 99\%  \\
+ 18 &   1293 & 100\% \\
+ 20 &     95 & 100\% \\
+ 22 &      3 & 100\% \\
+  \end{tabular}
+  \quad
+  \begin{tabular}{rrr}
+\hline
+H3= \\
+$\max_{i,j,k}(\mbox{hwd}(F(C,i,j,k)))$ & count & ratio \\
+\hline
+ 12 &     65 &  0\% \\
+ 14 &  56010 & 27\% \\
+ \textbf{16} & 127434 & 88\% \\
+ 18 &  19266 & 97\% \\
+ 20 &   3773 & 99\% \\
+ 22 &     80 & 99\% \\
+ 24 &   1667 &100\% \\
+ 28 &      3 &100\% \\
+ 32 &     10 &100\% \\
+\\ \\ 
+\hline%
+H4=\\
+$\max_{i,j,k,\ell}(\mbox{hwd}(F(C,i,j,k,\ell)))$ & count & ratio \\
+\hline
+ 14 &      5 &  0\%  \\
+ 16 &  55625 & 27\%  \\
+ \textbf{18} & 134826 & 91\%  \\
+ 20 &  17029 & 100\% \\
+ 22 &    800 & 100\% \\
+ 24 &     23 & 100\% \\
+\\ \\     
+  \end{tabular}
+
+  \caption{Histogram of Hamming weights: 
+    The first column provides the maximal deviation, the second column the 
+    number
+    of candidates with that maximum, and the third the percentage of all
+    candidates with at most that maximum (rounded to the nearest percent). The most
+    frequent values, which we used to define a ``typical random constant'',
+    are printed in \textbf{bold}.}
+
+  \label{tab:hammingweighthisto}
+\end{table}
+
+\begin{table}[htbp]
+  \centering
+  \begin{tabular}{rrr}
+\hline
+R0=\\
+    $\mbox{rc}(C)$ & count & ratio\\
+\hline
+ 25 &      2 &  0\% \\
+ 24 &     31 &  0\% \\
+ 23 &    182 &  0\% \\
+ 22 &    947 &  1\% \\
+ 21 &   3390 &  2\% \\
+ 20 &   9240 &  7\% \\
+ 19 &  19245 & 16\% \\
+ 18 &  31561 & 31\% \\
+ \textbf{17} &  39917 & 50\% \\
+ \textbf{16} &  39683 & 69\% \\
+ 15 &  31074 & 84\% \\
+ 14 &  19077 & 93\% \\
+ 13 &   9319 & 98\% \\
+ 12 &   3393 & 99\% \\
+ 11 &    964 &100\% \\
+ 10 &    233 &100\% \\
+  9 &     41 &100\% \\
+  8 &      8 &100\% \\
+  7 &      1 &100\% \\
+\\ \\
+\hline
+R1=\\
+$\min_i(\mbox{rc}(F(C,i)))$  & count & ratio\\
+\hline
+ 15 &      7 &  0\% \\
+ 14 &   7873 &  4\% \\
+ 13 &  59616 & 32\% \\
+ \textbf{12} &  83101 & 72\% \\
+ 11 &  39438 & 91\% \\
+ 10 &  14689 & 98\% \\
+  9 &   2538 & 99\% \\
+  8 &    917 &100\% \\
+  7 &     64 &100\% \\
+  6 &     60 &100\% \\
+  5 &      5 &100\% \\
+  \end{tabular}
+\quad
+  \begin{tabular}{rrr}
+\hline
+R2=\\
+$\min_{i,j}(\mbox{rc}(F(C,i,j)))$ & count & ratio\\
+\hline
+ 13 &      9 &  0\% \\
+ 12 &    551 &  0\% \\
+ 11 &  46742 & 23\% \\
+ \textbf{10} & 112374 & 77\% \\
+  9 &  41321 &96\% \\
+  8 &   6603 &100\% \\
+  7 &    662 &100\% \\
+  6 &     45 &100\% \\
+  5 &     1 & 100\% \\
+\\ \\
+\hline
+R3=\\
+$\min_{i,j,k}(\mbox{rc}(F(C,i,j,k)))$ & count & ratio\\
+\hline
+ 10 &   3166 &  2\% \\
+  \textbf{9} & 106890 & 53\% \\
+  8 &  85787 & 94\% \\
+  7 &  10002 & 99\% \\
+  6 &   1659 &100\% \\
+  5 &    333 &100\% \\
+  4 &    456 &100\% \\
+  3 &      2 &100\% \\
+  2 &      3 &100\% \\
+  1 &      4 &100\% \\
+  0 &      6 &100\% \\
+\\ \\
+\hline
+R4=\\
+$\min_{i,j,k,\ell}(\mbox{rc}(F(C,i,j,k,\ell)))$ & count & ratio\\
+\hline
+  9 &    367 &  0\% \\
+  \textbf{8} & 110417 & 53\% \\
+  7 &  88671 & 96\% \\
+  6 &   8422 &100\% \\
+  5 &    419 &100\% \\
+  4 &     12 &100\% \\
+
+  \end{tabular}
+
+  \caption{Histogram of run counts: 
+    The first column provides the smallest run count, the second column the 
+    number
+    of candidates with that minimum, and the third the percentage of all
+    candidates with at least that minimum (rounded to the nearest percent). The most
+    frequent values, which we used to define a ``typical random constant'',
+    are printed in \textbf{bold}.
+    Note that, up to statistical noise, $\mbox{rc}(C)=16$ and
+    $\mbox{rc}(C)=17$ are equally common.}
+
+  \label{tab:runcounthisto}
+\end{table}
+
+
+
+We generated $208308$ random values with Hamming weight $32$. For each of
+these values $C$, we computed the ten characteristic values
+\begin{enumerate}
+  \item H0=$\mbox{hwd}(C)$,
+  \item H1=$\max_{i}(\mbox{hwd}(F(C,i)))$,
+  \item H2=$\max_{i,j}(\mbox{hwd}(F(C,i,j)))$,
+  \item H3=$\max_{i,j,k}(\mbox{hwd}(F(C,i,j,k)))$,
+  \item H4=$\max_{i,j,k,\ell}(\mbox{hwd}(F(C,i,j,k,\ell)))$,
+
+  \item R0=$\mbox{rc}(C)$,
+  \item R1=$\min_{i}(\mbox{rc}(F(C,i)))$,
+  \item R2=$\min_{i,j}(\mbox{rc}(F(C,i,j)))$,
+  \item R3=$\min_{i,j,k}(\mbox{rc}(F(C,i,j,k)))$,
+  \item R4=$\min_{i,j,k,\ell}(\mbox{rc}(F(C,i,j,k,\ell)))$
+\end{enumerate}
+and counted the outcomes for H1, \ldots, H4, R0, \ldots, R4. Note that by
+discarding all $C$ with a Hamming weight unequal to 32, we knew that all our
+values satisfied H0=0. The results are printed in
+Tables~\ref{tab:hammingweighthisto} and \ref{tab:runcounthisto}.
+
+
+
+
+Based on these results, we fixed the requirements for our new constant, as
+described in Table~\ref{tab:characteristicvalues}. 
+
+\begin{table}[htbp]
+  \centering
+  \begin{tabular}{c|lr}
+    Identifier & Formula & Required value \\
+\hline
+    H0 & $\mbox{hwd}(C)$ & 0\\
+    H1 & $\max_i(\mbox{hwd}(F(C,i)))$ & 8 \\
+    H2 & $\max_{i,j}(\mbox{hwd}(F(C,i,j)))$ & 12 \\
+    H3 & $\max_{i,j,k}(\mbox{hwd}(F(C,i,j,k)))$ & 16 \\
+    H4 & $\max_{i,j,k,\ell}(\mbox{hwd}(F(C,i,j,k,\ell)))$ & 18 \\
+\hline
+    R0 & $\mbox{rc}(C)$ & 16 or 17 \\
+    R1 & $\min_i(\mbox{rc}(F(C,i)))$ & 12\\
+    R2 & $\min_{i,j}(\mbox{rc}(F(C,i,j)))$ & 10 \\
+    R3 & $\min_{i,j,k}(\mbox{rc}(F(C,i,j,k)))$ & 9  \\
+    R4 & $\min_{i,j,k,\ell}(\mbox{rc}(F(C,i,j,k,\ell)))$ &8  \\
+  \end{tabular}
+  \caption{Characteristic Values and the outcome required for a ``typical''
+    random 64-bit word.}
+  \label{tab:characteristicvalues}
+\end{table}
+
+
+\subsection{Searching for the new constant}
+
+Once the requirements had been fixed, we ran AES-256, encrypting 0, 1, 2, 
+\ldots{} under the all-zero key. We used the least significant 64 bits of each
+ciphertext as a ``random'' word and computed the ten characteristic values of
+each such ``random'' word. The first word which met all ten requirements was 
+the encryption of 240 (decimal). All results are provided in
+Table~\ref{tab:aescounter}. 
+
+
+\newcommand{\Word}[1]{\texttt{#1}$\!\!\!$}
+
+\begin{table}[htbp]
+\centering
+\begin{tabular}{c|c|*{5}{r}|*{5}{r}}
+\hline
+    Plaintext & Ciphertext & %
+    \multicolumn{10}{c}{Characteristic values of the ``random'' word} \\
+   (counter) & (ignored) \quad  \quad  \quad \ ``random'' word &
+   {\tiny H0} & {\tiny H1} & {\tiny H2} & {\tiny H3} & {\tiny H4} & 
+   {\tiny R0} & {\tiny R1} & {\tiny R2} & {\tiny R3} & {\tiny R4} \\
+\hline
+    \Word{00\ldots000
+} & \Word{DC95C078A2408989 AD48A21492842087
+}    & 10& 12& 14& 16& 18    & 19& 11& 9& 9& 8\\
+
+    \Word{00\ldots001
+} & \Word{530F8AFBC74536B9 A963B4F1C4CB738B
+}    & 2& 12& 12& 16& 16    & 18& 11& 9& 9& 7\\
+
+    \Word{00\ldots002
+} & \Word{CEA7403D4D606B6E 074EC5D3BAF39D18
+}    & 2& 10& 14& 16& 18    & 15& 11& 9& 9& 7\\
+
+    \Word{00\ldots003
+} & \Word{726003CA37A62A74 D1A2F58E7506358E
+}    & 0& 16& 14& 16& 18    & 17& 13& 9& 9& 7\\
+
+    \Word{00\ldots004
+} & \Word{DD4AB1284D4AE17B 41E85924470C36F7
+}    & 3& 8& 11& 14& 17    & 15& 12& 10& 8& 7\\
+
+    \Word{00\ldots005
+} & \Word{4741CBE181BB7F30 617C1DE3AB0C3A1F
+}    & 1& 10& 13& 16& 17    & 13& 12& 11& 7& 7\\
+
+    \Word{00\ldots006
+} & \Word{D0C48F7321A82D37 6095ACE0419167A0
+}    & 7& 14& 13& 14& 17    & 15& 11& 11& 9& 8\\
+
+    \Word{00\ldots007
+} & \Word{BCAF49B0C0CEA62D E6BC1C66545E1DAD
+}    & 2& 6& 10& 14& 16    & 17& 13& 10& 9& 8\\
+
+\multicolumn{10}{ c }{$\vdots$\hfill$\vdots$} \\
+
+    \Word{00\ldots0EF
+} & \Word{0F22C348BE3D2B56 1140EA58ADC22A45
+}    & 7& 6& 13& 16& 19    & 19& 12& 10& 8& 8\\
+
+    \Word{00\ldots0F0
+} & \Word{9B0964231DC7E2C1 1BD11BDAA9FC1A22
+}    & 0& 8& 12& 16& 18    & 16& 12& 10& 9& 8\\
+\hline
+\end{tabular}
+\caption{The ``random'' words generated by running AES-256 under
+  the all-zero key in counter mode, until all ten characteristic values are as
+  required. Plaintexts and ciphertexts are represented as hexadecimal numbers.} 
+  \label{tab:aescounter}
+\end{table}
+
+\clearpage
+
+\endgroup %================================== of changes to \section to add the "Appendix" text
+
+\bibliographystyle{plain}
+
+{\sloppy\hbadness 2000
+\begin{thebibliography}{19}
+\addcontentsline{toc}{section}{References}
+
+\bibitem{ansi-hmac} American Bankers Association, ``Keyed Hash Message Authentication Code,'' ANSI X9.71, 2000.
+
+\bibitem{ACMOPV09} J.~Aumasson, C.~Calik, W.~Meier, O.~Ozen, R.~Phan, and K.~Varici, ``Improved Cryptanalysis of Skein,''
+\texttt{http://www.131002.net/papers.html}, submitted to the IACR eprint server, September 2009.
+
+\bibitem{ECC} E.~Barker, D.~Johnson, and M.~Smid, ``Recommendation for Pair-Wise Key Establishment Schemes Using Discrete Logarithm Cryptography (Revised),'' NIST Special Publication SP 800-56A, Mar 2007.
+
+\bibitem{RNG} E.~Barker and J.~Kelsey, ``Recommendation for Random Number Generation Using Deterministic Random Bit Generators,'' NIST Special Publication SP 800-90, Mar 2007.
+
+\bibitem{B06} M.~Bellare, ``New Proofs for NMAC and HMAC: Security without Collision-Resistance,'' {\it Advances in Cryptology---CRYPTO '06 Proceedings}, Springer-Verlag, 2006, pp. 602--619.
+
+\bibitem{HMAC1} M.~Bellare, R.~Canetti and H.~Krawczyk, ``Keying hash functions for message authentication,'' {\it Advances in Cryptology---CRYPTO '96 Proceedings}, Springer-Verlag, 1996, pp. 1--15.
+
+\bibitem{BCK96b} M.~Bellare, R.~Canetti, and H.~Krawczyk, ``Pseudorandom Functions Revisited: The Cascade Construction and its Concrete Security,'' {\it Proceedings of the 37th Symposium on Foundations of Computer Science}, IEEE Press, 1996, pp. 514--523.
+
+\bibitem{BKR94} M.~Bellare, J.~Kilian, and P.~Rogaway, ``The Security of Cipher Block Chaining,'' {\it Advances in Cryptology---CRYPTO '94 Proceedings}, Springer-Verlag, 1994, pp. 341--358.
+
+\bibitem{BK09} M.~Bellare, T.~Kohno, S.~Lucks, N.~Ferguson, B.~Schneier, D.~Whiting, J.~Callas, and J.~Walker, ``Provable Security Support for the Skein Hash Family,'' Version 1.0, Apr 2009, {\tt http://www.skein-hash.info/sites/default/files/skein-proofs.pdf}.
+
+\bibitem{BR06} M.~Bellare and T.~Ristenpart, ``Multi-Property-Preserving Hash Domain Extension and the EMD Transform,'' {\it Advances in Cryptology---ASIACRYPT '06 Proceedings}, Springer-Verlag, 2006, pp. 299--314.
+
+\bibitem{Bellare-Yee} M.~Bellare and B.~Yee, ``Forward Security in Private Key Cryptography,'' {\it Topics in Cryptology---CT-RSA}, Springer-Verlag, 2003, pp. 1--18.
+
+\bibitem{Ber05} D.J.~Bernstein, ``Cache-Timing Attacks on AES,'' April 2005, {\tt http://cr.yp.to/antiforgery/cachetiming-20050414.pdf}.
+
+\bibitem{Radiogatun} G.~Bertoni, J.~Daemen, M.~Peeters, G.~Van Assche, ``RadioGat\'{u}n, a Belt-and-Mill Hash Function,'' {\it Second NIST Cryptographic Hash Workshop}, Santa Barbara, USA, 24--25 Aug 2006.
+
+\bibitem{B94}  E.~Biham, ``New Types of Cryptanalytic Attacks using Related Keys,'' {\it Journal of Cryptology}, v. 7, 1994, pp. 229--246.
+
+\bibitem{BC04} E.~Biham and R.~Chen, ``Near-Collisions of SHA-0,'' {\it Advances in Cryptology - Crypto '04 Proceedings}, Springer-Verlag, 2004, pp. 290--305.
+
+\bibitem{BS94}  E.~Biham and A.~Shamir, {\it Differential Cryptanalysis of the Data Encryption Standard}, Springer Verlag, 1993.
+
+\bibitem{BW99} A.~Biryukov and D.~Wagner, ``Slide Attacks,'' {\it 6th International Workshop on Fast Software Encryption}, Springer-Verlag, 1999, pp. 245--259.
+
+\bibitem{BW00} A.~Biryukov and D.~Wagner, ``Advanced Slide Attacks,'' {\it Advances in Cryptology---EUROCRYPT '00 Proceedings}, Springer-Verlag, 2000, pp. 589--606.
+
+\bibitem{Blum-Micali} S.~Micali and M.~Blum, ``How to Generate Cryptographically Strong Sequences of Pseudo-random Bits,'' {\it Proceedings of the 23rd IEEE Symposium on Foundations of Computer Science (FOCS '82)}, IEEE, 1982, pp. 112--117.
+
+
+\bibitem{BM06} J.~Bonneau and I.~Mironov, ``Cache-Collision Timing Attacks Against AES,'' {\it Cryptographic Hardware and Embedded Systems--CHES 2006}, Springer-Verlag, 2006, pp. 201--215.
+
+\bibitem{MARS} C.~Burwick, D.~Coppersmith, E.~D'Avignon, R.~Gennaro, S.~Halevi, C.~Jutla, S.M.~Matyas, L.~O'Connor, M.~Peyravian, D.~Safford, and N.~Zunic, ``MARS---A Candidate Cipher for AES,'' NIST AES Proposal, Jun 1998.
+
+\bibitem{CJ98} F.~Chabaud and A. Joux, ``Differential Collisions in SHA-0,'' {\it Advances in Cryptology: Eurocrypt '98 Proceedings}, Springer-Verlag, 1998, pp. 56--71.
+
+\bibitem{Che08} L. Chen, ``Recommendation for Key Derivation Using Pseudorandom Functions,'' NIST Special Publication SP 800-108, Apr 2008.
+
+\bibitem{C05} J.~Coron, Y.~Dodis, C.~Malinaud, P.~Puniya, ``{Merkle--Damg{\aa}rd} Revisited: How to Construct a Hash Function,'' {\it Advances in Cryptology: CRYPTO 05 Proceedings}, Springer-Verlag, 2005, pp. 430--448.
+
+\bibitem{DGV94} J.~Daemen, R.~Govaerts, and J.~Vandewalle, ``Correlation Matrices,'' {\it Fast Software Encryption 1994}, Springer-Verlag, 1995, pp. 275--285.
+
+\bibitem{AES} J.~Daemen and V.~Rijmen, {\it The Design of Rijndael: AES---The Advanced Encryption Standard}, Springer-Verlag, 2002.
+
+\bibitem{MD} I.~Damg{\aa}rd, ``A Design Principle for Hash Functions,'' {\it Advances in Cryptology: Crypto '89 Proceedings}, Springer-Verlag, 1990, pp. 416--427.
+
+\bibitem{D08} Q.~Dang, ``Randomized Hashing for Digital Signatures,'' NIST Special Publication SP 800-106, Aug 2008.
+
+\bibitem{DaumThesis} M.~Daum, {\it Cryptanalysis of Hash Functions of the MD4
+    Family}, PhD thesis, Ruhr-Universit\"at Bochum, Germany, 2005. 
+
+\bibitem{DL05} M.~Daum and S. Lucks, ``The Story of Alice and her Boss,'' Eurocrypt 2005 rump session, 2005,  \texttt{http://th.informatik.uni-mannheim.de/people/lucks/HashCollisions/}.
+
+\bibitem{D98} H. Dobbertin, ``Cryptanalysis of MD4,'' {\it Journal of Cryptology}, v 11, n. 4, 1998, pp. 253--271.
+
+\bibitem{Dodis} Y.~Dodis, R.~Gennaro, J.~H{\aa}stad, H.~Krawczyk, and T.~Rabin, ``Randomness Extraction and Key Derivation Using the CBC, Cascade and HMAC Modes,'' {\it Advances in Cryptology: Crypto '04 Proceedings}, Springer-Verlag, 2004, pp. 494--510.
+
+\bibitem{Feistel} H.~Feistel, ``Cryptography and Computer Privacy,'' {\it Scientific American}, May 1973, pp. 15--23.
+
+\bibitem{Skein1} N.~Ferguson, S.~Lucks, B.~Schneier, D.~Whiting, M.~Bellare, T.~Kohno, J.~Callas, and J.~Walker, ``The Skein Hash Function Family,'' Version 1.1, Nov 2008.
+
+\bibitem{FS99} N.~Ferguson and B.~Schneier, ``A Cryptographic Evaluation of IPsec'', Counterpane Internet Security, 1999, \purl{http://www.schneier.com/paper-ipsec.pdf}.
+
+\bibitem{FS03} N.~Ferguson and B.~Schneier, {\it Practical Cryptography}, John Wiley \& Sons, 2003.
+
+\bibitem{Helix} N.~Ferguson, D.~Whiting, B.~Schneier, J.~Kelsey, S.~Lucks, and T.~Kohno, ``Helix: Fast Encryption and Authentication in a Single Cryptographic Primitive,'' {\it Fast Software Encryption 2003}, Springer-Verlag, 2003, pp. 330--346.
+
+\bibitem{GIS06} M.~Gebhardt, G.~Illies, and W.~Schindler, ``A Note on the Practical Value of Single Hash Collisions for Special File Formats,'' {\em Sicherheit~2006}, pp. 333--344.
+
+\bibitem{G08a} B.~Gladman, ``SHA1, SHA2, HMAC and Key Derivation in C,'' \purl{http://fp.gladman.plus.com/cryptography_technology/sha/index.htm}, accessed 27 Jun 2008.
+
+\bibitem{G08b} B.~Gladman, personal communication, Aug 2008.
+
+\bibitem{GLP08} M.~Gorski, S.~Lucks, and T.~Peyrin, ``Slide Attacks on a Class of Hash Functions,''  {\em Advances in Cryptology---ASIACRYPT '08 Proceedings}, Springer-Verlag, 2008,  pp. 143--160.
+
+\bibitem{IntelAES} S.~Gueron, ``Advanced Encryption Standard (AES) Instructions Set,'' Intel, \purl{http://softwarecommunity.intel.com/articles/eng/3788.htm}, accessed 25 Aug 2008.
+
+\bibitem{HK06} S.~Halevi and H.~Krawczyk, ``Strengthening Digital Signatures via Randomized Hashing,'' {\it Advances in Cryptology: CRYPTO '06 Proceedings}, Springer-Verlag, 2006, pp. 41--59.
+
+\bibitem{HPR04} P.~Hawkes, M.~Paddon, and G.~Rose, ``On Corrective Patterns for the SHA-2 Family,'' Cryptology ePrint Archive, Report 2004/207.
+
+\bibitem{J04} A.~Joux, ``Multicollisions in Iterated Hash Functions: Applications to Cascaded Constructions,'' {\it Advances in Cryptology: CRYPTO '04 Proceedings}, Springer-Verlag, 2004, pp. 306--316.
+
+\bibitem{Kal00} B.~Kaliski, ``PKCS \#5: Password-Based Cryptography Specification Version 2.0,'' RFC 2898, Sep 2000.
+
+\bibitem{K04}  D.~Kaminsky, ``MD5 to be Considered Harmful Someday,'' Dec.~2004, \url{http://www.doxpara.com/md5\_someday.pdf}.
+
+\bibitem{MartinKausche2008} M.~Kausche, {\it Master's Thesis}, Bauhaus-Universit\"at Weimar, 2008 (in preparation).
+
+\bibitem{KK06} J.~Kelsey and T.~Kohno, ``Herding Hash Functions and the Nostradamus Attack,'' {\it Advances in Cryptology: EUROCRYPT '06 Proceedings}, Springer-Verlag, 2006, pp. 183--200.
+
+\bibitem{KS05} J.~Kelsey and B.~Schneier, ``Second Preimages on $n$-bit Hash Functions for Much Less than $2^n$ Work,'' {\it Advances in Cryptology: EUROCRYPT 2005 Proceedings}, Springer-Verlag, 2005, pp. 474--490.
+
+\bibitem{KSF99} J.~Kelsey, B.~Schneier, and N.~Ferguson, ``Yarrow-160: Notes on the Design and Analysis of the Yarrow Cryptographic Pseudorandom Number Generator,'' {\it Sixth Annual Workshop on Selected Areas in Cryptography}, Springer Verlag, 1999, pp. 13--33.
+
+\bibitem{KSW96} J.~Kelsey, B.~Schneier, and D.~Wagner, ``Key-Schedule Cryptanalysis of 3-WAY, IDEA, G-DES, RC4, SAFER, and Triple-DES,'' {\it Advances in Cryptology--CRYPTO '96 Proceedings}, Springer-Verlag, 1996, pp. 237--251.
+
+\bibitem{KSW97} J.~Kelsey, B.~Schneier, and D.~Wagner, ``Related-Key Cryptanalysis of 3-WAY, Biham-DES, CAST, DES-X, NewDES, RC2, and TEA,'' {\it ICICS '97 Proceedings}, Springer-Verlag, November 1997, pp. 233--246.
+
+\bibitem{KSW98} J.~Kelsey, B.~Schneier, and D.~Wagner, ``Protocol Interactions and the Chosen Protocol Attack,'' {\it Security Protocols, 5th International Workshop April 1997 Proceedings}, Springer-Verlag, 1998, pp. 91--104.
+
+\bibitem{KSWH00} J.~Kelsey, B.~Schneier, D.~Wagner, and C.~Hall, ``Side Channel Cryptanalysis of Product Ciphers,'' {\it Journal of Computer Security}, v. 8, n. 2--3, 2000, pp. 141--158.
+
+\bibitem{KS94} G.~Kim and E.~Spafford, ``The Design and Implementation of Tripwire: a File System Integrity Checker,'' {\it Proceedings of the 2nd ACM Conference on Computer and Communications Security}, 1994, pp. 18--29.
+
+\bibitem{KBPL05} J.~Kim, A.~Biryukov, B.~Preneel, and S.~Lee, ``On the Security of Encryption Modes of MD4, MD5 and HAVAL,'' Cryptology ePrint Archive, report 2005/327.
+
+\bibitem{K05a} V.~Klima, ``Finding MD5 Collisions---a Toy For a Notebook,'' Cryptology ePrint Archive, Report 2005/075.
+
+\bibitem{K05b} V.~Klima, ``Finding MD5 Collisions on a Notebook PC Using Multi-message Modifications,'' Cryptology ePrint Archive, Report 2005/102.
+
+\bibitem{K06} V.~Klima, ``Tunnels in Hash Functions: MD5 Collisions Within a Minute,'' Cryptology ePrint Archive, Report 2006/105.
+
+\bibitem{KRT07} L.~Knudsen, C.~Rechberger, and S.~Thomsen, ``Grindahl---A Family of Hash Functions,''  {\it Fast Software Encryption 2007}, Springer-Verlag, 2007, pp. 39--57.
+
+\bibitem{Koc96} P.~Kocher, ``Timing Attacks on Implementations of Diffie-Hellman, RSA, DSS, and Other Systems,'' {\it Advances in Cryptology---CRYPTO '96 Proceedings}, Springer-Verlag, 1996, pp. 104--113.
+
+\bibitem{Koc99} P.~Kocher, J.~Jaffe, and B.~Jun, ``Differential Power Analysis,'' {\it Advances in Cryptology---CRYPTO '99 Proceedings}, Springer-Verlag, 1999, pp. 388--397.
+
+
+\bibitem{KN10} D.~Khovratovich, I.~Nikolic, ``Rotational Cryptanalysis of
+  ARX,'' Fast Software Encryption (FSE) 2010, pp.~333--346. 
+
+
+\bibitem{hmac-rfc} H.~Krawczyk, M.~Bellare, and R.~Canetti, ``{HMAC}: Keyed-hashing for Message Authentication,'' RFC 2104, 1997.
+
+\bibitem{LW05} A.~Lenstra, B.~de~Weger, ``On the Possibility of Constructing Meaningful Hash Collisions for Public Keys,'' {\it ACISP 2005}, pp. 267--279.
+
+\bibitem{LiMo} H.~Lipmaa, S.~Moriai, ``Efficient Algorithms for Computing Differential Properties
+of Addition'', {\it Fast Software Encryption---FSE 2001}, Springer-Verlag, pp. 336--350. 
+
+\bibitem{LRW02} M.~Liskov, R.~Rivest, and D.~Wagner, ``Tweakable Block Ciphers,'' {\it Advances in Cryptology---CRYPTO 2002 Proceedings}, Springer-Verlag, 2002, pp. 31--46.
+
+\bibitem{LK08} J.~Lu and J.~Kim, ``Attacking 44 Rounds of the SHACAL-2 Block Cipher Using Related-Key Rectangle Cryptanalysis,'' {\it IEICE Transactions on Fundamentals of Electronics, Communications and Computer Sciences 2008}, E91-A(9), pp. 2588--2596.
+
+\bibitem{L05}  S.~Lucks, ``Two-Pass Authenticated Encryption Faster Than Generic Composition,'' {\it Fast Software Encryption 2005}, Springer-Verlag, 2005, pp. 284--298.
+
+\bibitem{L04} S.~Lucks, ``A Failure-Friendly Design Principle for Hash Functions,'' {\it Advances in Cryptology: ASIACRYPT '05 Proceedings}, Springer-Verlag, 2005, pp. 474--494.
+
+\bibitem{MMO85} S.M.~Matyas, C.H.~Meyer, and J.~Oseas, ``Generating strong one-way functions with cryptographic algorithms,'' {\it IBM Technical Disclosure Bulletin}, Vol. 27, No. 10A, 1985, pp. 5658--5659.
+
+\bibitem{M93} M.~Matsui, ``Linear Cryptanalysis Method for DES Cipher,'' {\it Advances in Cryptology---EUROCRYPT '93 Proceedings}, Springer-Verlag, 1993, pp. 386--397.
+
+\bibitem{M04a} U.~Maurer, R.~Renner, and C.~Holenstein, ``Indifferentiability, Impossibility Results on Reductions, and Applications to the Random Oracle Methodology,'' Theory of Cryptography Conference (TCC), 2004.
+
+\bibitem{Mer87} R.~Merkle, ``A Digital Signature Based on Conventional Encryption Functions,'' {\it Advances in Cryptology---CRYPTO '87 Proceedings}, Springer-Verlag, 1988, pp. 369--378.
+
+\bibitem{Mer89} R.~Merkle, ``A Certified Digital Signature Scheme,'' {\it Advances in Cryptology---CRYPTO '89 Proceedings}, Springer-Verlag, 1990, pp. 218--238.
+
+\bibitem{Mer89a} R.~Merkle, ``One way hash functions and DES,'' {\it Advances in Cryptology---CRYPTO '89 Proceedings}, Springer-Verlag, 1990, pp. 428--446.
+
+\bibitem{M04b} O.~Mikle, ``Practical Attacks on Digital Signatures Using MD5 Message Digest,'' Cryptology eprint archive report 2004/356, \texttt{http://eprint.iacr.org/2004/356/}.
+
+\bibitem{MPW} C.~Mitchell, F.~Piper, and P.~Wild, ``Digital
+signatures,'' in {\it Contemporary Cryptology: The Science of Information Integrity}, G.J. Simmons, Ed., IEEE Press, 1991, pp. 325--378.
+
+\bibitem{HA1} F.~Muller, ``Differential Attacks against the Helix Stream Cipher,'' {\it Fast Software Encryption 2004}, Springer-Verlag, 2004, pp. 94--108.
+
+\bibitem{DES} National Bureau of Standards, NBS FIPS PUB 46, ``Data Encryption Standard,'' U.S. Department of Commerce, Jan 1977.
+
+\bibitem{SHA} National Institute of Standards and Technology, ``Secure Hash Standard,'' FIPS 180, 11 May 1993.
+
+\bibitem{SHA-1} National Institute of Standards and Technology, ``Announcing the Standard for Secure Hash Standard,'' FIPS 180-1, 17 Apr 1995.
+
+\bibitem{AES2} National Institute of Standards and Technology, ``Announcing the Advanced Encryption Standard,'' FIPS 197, 26 Nov 2001.
+
+\bibitem{SHA-2} National Institute of Standards and Technology, ``Specification for the Secure Hash Standard,'' FIPS 180-2, 1 Aug 2002.
+
+\bibitem{DSS} National Institute of Standards and Technology, ``Digital Signature Standard (DSS),'' FIPS 186-2, 27 Jan 2000.
+
+\bibitem{HMAC2} National Institute of Standards and Technology, ``The Keyed-Hash Message Authentication Code (HMAC),'' FIPS 198, 6 Mar 2002.
+
+\bibitem{SHA-3a} National Institute of Standards and Technology, ``Announcing The Development of New Hash Algorithm(s) for the Revision of Federal Information Processing Standard (FIPS) 180-2, Secure Hash Standard,'' {\it Federal Register}, v. 72, n. 14, 23 Jan 2007, pp. 2861--2863.
+
+\bibitem{SHA-3b} National Institute of Standards and Technology, ``Announcing Request for Candidate Algorithm Nominations for a New Cryptographic Hash Algorithm (SHA-3) Family,'' {\it Federal Register}, v. 72, n. 212, 2 Nov 2007, pp. 62212--62220.
+
+\bibitem{Skipjack} National Security Agency, ``Skipjack and KEA Algorithm Specification,'' Version 2.0, May 1998.
+
+\bibitem{HA2} S.~Paul and B.~Preneel, ``Solving Systems of Differential Equations of Addition,'' {\it Information Security and Privacy, 10th Australasian Conference, ACISP 2005}, Springer-Verlag, 2005, pp. 75--88.
+
+\bibitem{HA3} S.~Paul, B.~Preneel, ``Near Optimal Algorithms for Solving Differential Equations of Addition With Batch Queries,'' {\it Progress in Cryptology - INDOCRYPT 2005}, Springer-Verlag, 2005, pp. 75--88.
+
+\bibitem{Per05} C.~Percival, ``Cache Missing for Fun and Profit,'' BSDCan 2005, 2005, {\tt http://www.daemonology.net/papers/htt.pdf}.
+
+\bibitem{QG} J.~J.~Quisquater and M.~Girault, ``2n-bit Hash-Functions
+Using n-bit Symmetric Block Cipher Algorithms,'' {\it Advances in Cryptology: EUROCRYPT '89 Proceedings}, Springer-Verlag, 1990, pp. 102--109.
+
+\bibitem{RKN10} C.~Rechberger, D.~Khovratovich, I.~Nikolic, 
+  ``Rotational Rebound Attacks on Reduced Skein,'' 
+  Second SHA-3 Candidate Conference, August 23--24, Santa Barbara, 2010.
+
+\bibitem{MD4} R.~Rivest, ``The MD4 Message Digest Algorithm,'' {\it Advances in Cryptology: CRYPTO '90 Proceedings}, Springer-Verlag, 1990, pp. 303--311.
+
+\bibitem{MD5} R.~Rivest, ``The MD5 Message Digest Algorithm,'' RFC 1321, 1992.
+
+\bibitem{RC6} R.~Rivest, M.~Robshaw, R.~Sidney, and Y.L.~Yin, ``The RC6 Block Cipher,'' NIST AES Proposal, Jun 1998.
+
+\bibitem{Ro06} P.~Rogaway, ``Formalizing Human Ignorance,'' {\it VietCrypt 2006 Proceedings}, pp. 211--228.
+
+\bibitem{RBB03} P.~Rogaway, M.~Bellare, and J.~Black, ``OCB: A Block-Cipher Mode of Operation for Efficient Authenticated Encryption,'' {\it ACM Transactions on Information and System Security (TISSEC)}, v. 6, n. 3, Aug 2003, pp. 365--403.
+
+\bibitem{SS08} S.K.~Sanadhya and P.~Sarkar, ``Some Observations on Strengthening the SHA-2 Family,'' Cryptology ePrint Archive: Report 2008/272, 9 May 2008.
+
+\bibitem{SS08a} S.~Sanadhya and P.~Sarkar, ``New Collision attacks Against Up To 24-step SHA-2,'' Cryptology ePrint Archive: Report 2008/270, 22 Sep 2008.
+
+\bibitem{Twofish} B.~Schneier, J.~Kelsey, D.~Whiting, D.~Wagner, C.~Hall, and N.~Ferguson, {\it The Twofish Encryption Algorithm}, John Wiley and Sons, 1999.
+
+\bibitem{SW97} B.~Schneier and D.~Whiting, ``Fast Software Encryption: Designing Encryption Algorithms for Optimal Software Speed on the Intel Pentium Processor,'' {\it Fast Software Encryption, Fourth International Workshop Proceedings (January 1997)}, Springer-Verlag, 1997, pp. 242--259.
+
+\bibitem{Sev09} G.~Sevestre, private communication.
+
+\bibitem{S06} M.~Stevens, ``Fast Collision Attack on MD5,'' Cryptology ePrint Archive, report 2006/104.
+
+\bibitem{SLW07} M.~Stevens, A.~Lenstra, and B.~de~Weger, ``Predicting the Winner of the 2008 US Presidential Elections using a Sony PlayStation 3,'' Nov 2007, \url{http://www.win.tue.nl/hashclash/Nostradamus/}.
+
+\bibitem{Phelix} D.~Whiting, B.~Schneier, S.~Lucks, and F.~Muller, ``Phelix: Fast Encryption and Authentication in a Single Cryptographic Primitive,'' ECRYPT Stream Cipher Project Report 2005/027.
+
+\bibitem{WFLY04} X.~Wang, D.~Feng, X.~Lai, and H.~Yu, ``Collisions for Hash Functions MD4, MD5, HAVAL-128 and RIPEMD,'' Cryptology ePrint Archive, Report 2004/199.
+
+\bibitem{WLFCY05} X.~Wang, X.~Lai, D.~Feng, H.~Chen, and X.~Yu, ``Cryptanalysis of the Hash Functions MD4 and RIPEMD,'' {\it Advances in Cryptology---EUROCRYPT '05 Proceedings}, Springer-Verlag, 2005, pp. 1--18.
+
+\bibitem{WY05} X.~Wang and H.~Yu, ``How to Break MD5 and Other Hash Functions,'' {\it Advances in Cryptology---EUROCRYPT '05 Proceedings}, Springer-Verlag, 2005, pp. 19--35.
+
+\bibitem{WYY05} X.~Wang, Y.L.~Yin, and H.~Yu, ``Collision Search Attacks on SHA1,'' research summary, 2005.
+
+\bibitem{PA} H.~Wu and B.~Preneel, ``Differential-Linear Attacks against the Stream Cipher Phelix,'' {\it Proceedings of Fast Software Encryption 2007}, Springer-Verlag, 2007, pp. 87--100.
+
+\bibitem{Yao}  A.~Yao, ``Theory and Applications of Trapdoor Functions,'' {\it Proceedings of the 23rd IEEE Symposium on Foundations of Computer Science (FOCS '82)}, IEEE, 1982, pp. 80--91.
+
+\end{thebibliography}
+}
+
+\end{document}
+
+
+
diff --git a/Supporting_Documentation/tex/skeinround3Mods.tex b/Supporting_Documentation/tex/skeinround3Mods.tex
new file mode 100644
index 0000000000000..2da6d408909fa
--- /dev/null
+++ b/Supporting_Documentation/tex/skeinround3Mods.tex
@@ -0,0 +1,76 @@
+\documentclass[11pt,twoside]{article}
+\usepackage{amsmath}
+\usepackage{amssymb}
+\usepackage{xspace}
+\usepackage{url}
+\usepackage{graphicx}
+\usepackage{tabularx}
+
+\hyphenation{three-fish}
+
+\renewcommand{\topfraction}{0.8}     % max fraction of floating figures at the top of a page
+\renewcommand{\bottomfraction}{0.8}  % idem for bottom
+\setcounter{topnumber}{4}            % max figures at top of page
+\setcounter{bottomnumber}{4}
+\setcounter{totalnumber}{4}
+\renewcommand{\textfraction}{0.2}    % minimum fraction of page that is text
+\renewcommand{\floatpagefraction}{0.5}
+
+\setlength{\textheight}{9in}
+\setlength{\textwidth}{6.5in}
+\setlength{\topmargin}{0.0in}
+\setlength{\oddsidemargin}{0in}
+\setlength{\evensidemargin}{0in}
+\setlength{\footskip}{0.5in}
+\setlength{\headheight}{0in}
+\setlength{\headsep}{0in}
+\renewcommand{\baselinestretch}{1}
+
+\newcommand{\purl}{\protect\url}
+\newcommand{\comment}[1]{}
+
+\parindent 0pt
+\parskip 1ex
+
+\begin{document}
+\thispagestyle{empty}
+\begin{center}
+{\Large\bf The Skein Hash Function Family} \\
+\vspace*{.2in}
+{\Large\bf NIST Round 3 Tweak Description} \\
+\vspace*{.2in}
+{\small 25 Oct 2010}
+\end{center}
+
+\section*{Description of Changes}
+
+The only change to the Skein hash function is in the key schedule
+parity constant, found in Section~3.3.2 of the newly submitted 
+(``tweak'') version 1.3 of the Skein specification document. The
+old constant was the value 
+  $$C_5\ \  = \texttt{0x5555555555555555}.$$ 
+The new constant is 
+  $$C_{240} = \texttt{0x1BD11BDAA9FC1A22}.$$
+
+Further details and discussion of the tweak and its implications are found in 
+version 1.3 of the Skein specification document, as follows:
+\begin{itemize}
+\item Section~8.3 (``Key Schedule Constant'')
+\item Section~9.3 (``Related-Key Attacks for the Threefish Block Cipher'')
+\item Section~9.5.2 (``Rotational Cryptanalysis'')
+\item Section~9.6 (``Empirical Observations for Threefish with Random Rotation Constants'')
+\item Section~9.7 (``Cryptanalysis Summary'')
+\item Appendix~B (``Initial Chaining Values'')
+\item Appendix~C (``Test Vectors'')
+\item Appendix~E (``Empirical data for tweaking the key schedule constant'')
+\end{itemize}
+
+In addition, the following items have been updated in the Skein tweak submission package:
+\begin{itemize}
+\item Reference C source code
+\item Optimized C source code (32-bit and 64-bit)
+\item Assembly source code (32-bit and 64-bit)
+\item Test vectors (KAT\_MCT directory)
+\end{itemize}
+
+\end{document}
author	Allan Jude <allanjude@FreeBSD.org>	2016-05-27 02:42:46 +0000
committer	Allan Jude <allanjude@FreeBSD.org>	2016-05-27 02:42:46 +0000
commit	92f76dc624c277a7c731733a4e51997c0e9ad981 (patch)
tree	8fef288bf5f480e476a789ae0525520eeea04f99
download	src-test-vendor/skein.tar.gz src-test-vendor/skein.zip