Blame SOURCES/valgrind-3.17.0-ppc64-isa-3.1.patch

e7bd38
commit 3cc0232c46a5905b4a6c2fbd302b58bf5f90b3d5
e7bd38
Author: Carl Love <cel@us.ibm.com>
e7bd38
Date:   Mon Jan 11 16:00:57 2021 -0600
e7bd38
e7bd38
    PPC64: ISA 3.1 VSX PCV Generate Operations
e7bd38
    
e7bd38
    xxgenpcvbm VSX Vector Generate PCV from Byte Mask
e7bd38
    xxgenpcvdm VSX Vector Generate PCV from Doubleword Mask
e7bd38
    xxgenpcvhm VSX Vector Generate PCV from Halfword Mask
e7bd38
    xxgenpcvwm VSX Vector Generate PCV from Word Mask
e7bd38
e7bd38
diff --git a/VEX/priv/guest_ppc_defs.h b/VEX/priv/guest_ppc_defs.h
e7bd38
index deda4dfce..54ce923a9 100644
e7bd38
--- a/VEX/priv/guest_ppc_defs.h
e7bd38
+++ b/VEX/priv/guest_ppc_defs.h
e7bd38
@@ -169,6 +169,23 @@ void write_ACC_entry (VexGuestPPC64State* gst, UInt offset, UInt acc,
e7bd38
 void get_ACC_entry (VexGuestPPC64State* gst, UInt offset, UInt acc,
e7bd38
                     UInt reg, UInt *result);
e7bd38
 
e7bd38
+extern void vector_gen_pvc_byte_mask_dirty_helper( VexGuestPPC64State* gst,
e7bd38
+                                                   ULong src_hi,
e7bd38
+                                                   ULong src_lo,
e7bd38
+                                                   UInt rtn_val, UInt IMM );
e7bd38
+extern void vector_gen_pvc_hword_mask_dirty_helper( VexGuestPPC64State* gst,
e7bd38
+                                                    ULong src_hi,
e7bd38
+                                                    ULong src_lo,
e7bd38
+                                                    UInt rtn_val, UInt IMM );
e7bd38
+extern void vector_gen_pvc_word_mask_dirty_helper( VexGuestPPC64State* gst,
e7bd38
+                                                   ULong src_hi,
e7bd38
+                                                   ULong src_lo,
e7bd38
+                                                   UInt rtn_val, UInt IMM );
e7bd38
+extern void vector_gen_pvc_dword_mask_dirty_helper( VexGuestPPC64State* gst,
e7bd38
+                                                    ULong src_hi,
e7bd38
+                                                    ULong src_lo,
e7bd38
+                                                    UInt rtn_val, UInt IMM );
e7bd38
+
e7bd38
 /* 8-bit XO value from instruction description */
e7bd38
 #define XVI4GER8       0b00100011
e7bd38
 #define XVI4GER8PP     0b00100010
e7bd38
diff --git a/VEX/priv/guest_ppc_helpers.c b/VEX/priv/guest_ppc_helpers.c
e7bd38
index c24191ef3..75497abb9 100644
e7bd38
--- a/VEX/priv/guest_ppc_helpers.c
e7bd38
+++ b/VEX/priv/guest_ppc_helpers.c
e7bd38
@@ -701,6 +701,738 @@ ULong vector_evaluate64_helper( ULong srcA, ULong srcB, ULong srcC,
e7bd38
 #undef MAX_IMM_BITS
e7bd38
 }
e7bd38
 
e7bd38
+/*--------------------------------------------------*/
e7bd38
+/*---- VSX Vector Generate PCV from Mask helpers ---*/
e7bd38
+/*--------------------------------------------------*/
e7bd38
+static void write_VSX_entry (VexGuestPPC64State* gst, UInt reg_offset,
e7bd38
+                             ULong *vsx_entry)
e7bd38
+{
e7bd38
+   U128* pU128_dst;
e7bd38
+   pU128_dst = (U128*) (((UChar*) gst) + reg_offset);
e7bd38
+
e7bd38
+   /* The U128 type is defined as an array of unsigned integers.  */
e7bd38
+   /* Writing in LE order */
e7bd38
+   (*pU128_dst)[0] = (UInt)(vsx_entry[1] & 0xFFFFFFFF);
e7bd38
+   (*pU128_dst)[1] = (UInt)(vsx_entry[1] >> 32);
e7bd38
+   (*pU128_dst)[2] = (UInt)(vsx_entry[0] & 0xFFFFFFFF);
e7bd38
+   (*pU128_dst)[3] = (UInt)(vsx_entry[0] >> 32);
e7bd38
+   return;
e7bd38
+}
e7bd38
+
e7bd38
+/* CALLED FROM GENERATED CODE */
e7bd38
+void vector_gen_pvc_byte_mask_dirty_helper( VexGuestPPC64State* gst,
e7bd38
+                                            ULong src_hi, ULong src_lo,
e7bd38
+                                            UInt reg_offset, UInt imm ) {
e7bd38
+   /* The function computes the 128-bit result then writes it directly
e7bd38
+      into the guest state VSX register.  */
e7bd38
+
e7bd38
+   UInt  i, shift_by, sel_shift_by, half_sel;
e7bd38
+   ULong index, src, result[2];
e7bd38
+   ULong j;
e7bd38
+
e7bd38
+   result[0] = 0;
e7bd38
+   result[1] = 0;
e7bd38
+   j = 0;
e7bd38
+
e7bd38
+   /* The algorithm in the ISA is written with IBM numbering zero on left and
e7bd38
+      N-1 on right. The loop index is converted to "i" to match the algorithm
e7bd38
+      for clarity of matching the C code to the algorithm in the ISA.  */
e7bd38
+
e7bd38
+   if (imm == 0b00) {    // big endian expansion
e7bd38
+      for( index = 0; index < 16; index++) {
e7bd38
+         i = 15 - index;
e7bd38
+
e7bd38
+         shift_by = i*8;
e7bd38
+
e7bd38
+         if ( i >= 8) {
e7bd38
+            src = src_hi;
e7bd38
+            shift_by = shift_by - 64;
e7bd38
+            half_sel = 0;
e7bd38
+         } else {
e7bd38
+            src = src_lo;
e7bd38
+            half_sel = 1;
e7bd38
+         }
e7bd38
+
e7bd38
+         sel_shift_by = shift_by + 7;
e7bd38
+
e7bd38
+         if ( ((src >> sel_shift_by) & 0x1) == 1) {
e7bd38
+               result[half_sel] |= j << shift_by;
e7bd38
+            j++;
e7bd38
+         } else {
e7bd38
+            result[half_sel] |= (index + (unsigned long long)0x10) << shift_by;
e7bd38
+         }
e7bd38
+      }
e7bd38
+
e7bd38
+
e7bd38
+   } else if (imm == 0b01) {    // big endian compression
e7bd38
+      /* If IMM=0b00001, let pcv be the permute control vector required to
e7bd38
+         enable a left-indexed permute (vperm or xxperm) to implement a
e7bd38
+         compression of the sparse byte elements in a source vector specified
e7bd38
+         by the byte-element mask in VSR[VRB+32] into the leftmost byte
e7bd38
+         elements of a result vector.
e7bd38
+      */
e7bd38
+      for( index = 0; index < 16; index++) {
e7bd38
+         i = 15 - index;
e7bd38
+         shift_by = i*8;
e7bd38
+
e7bd38
+         if ( i >= 8) {
e7bd38
+            src = src_hi;
e7bd38
+            shift_by = shift_by - 64;
e7bd38
+            half_sel = 0;
e7bd38
+         } else {
e7bd38
+            src = src_lo;
e7bd38
+            half_sel = 1;
e7bd38
+         }
e7bd38
+
e7bd38
+         sel_shift_by = shift_by + 7;
e7bd38
+
e7bd38
+         if ( ((src >> sel_shift_by) & 0x1) == 1) {
e7bd38
+            if (j >= 8)
e7bd38
+               result[1] |= (index) << (15 - j)*8;
e7bd38
+            else
e7bd38
+               result[0] |= (index) << (7 - j)*8;
e7bd38
+            j++;
e7bd38
+         }
e7bd38
+      }
e7bd38
+      /* The algorithm says set to undefined, leave as 0
e7bd38
+      for( index = 3 - j; index < 4; index++) {
e7bd38
+         result |= (0 << (index*8));
e7bd38
+      }
e7bd38
+      */
e7bd38
+
e7bd38
+   } else if (imm == 0b10) {   //little-endian expansion
e7bd38
+      /* If IMM=0b00010, let pcv be the permute control vector required to
e7bd38
+         enable a right-indexed permute (vpermr or xxpermr) to implement an
e7bd38
+         expansion of the rightmost byte elements of a source vector into the
e7bd38
+         byte elements of a result vector specified by the byte-element mask
e7bd38
+         in VSR[VRB+32].  */
e7bd38
+      for( index = 0; index < 16; index++) {
e7bd38
+         i = index;
e7bd38
+
e7bd38
+         shift_by = i*8;
e7bd38
+
e7bd38
+         if ( i >= 8) {
e7bd38
+            src = src_hi;
e7bd38
+            shift_by = shift_by - 64;
e7bd38
+            half_sel = 0;
e7bd38
+         } else {
e7bd38
+            src = src_lo;
e7bd38
+            half_sel = 1;
e7bd38
+         }
e7bd38
+
e7bd38
+         sel_shift_by = shift_by + 7;
e7bd38
+
e7bd38
+         /* mod shift amount by 8 since src is either the upper or lower
e7bd38
+            64-bits.  */
e7bd38
+         if ( ((src >> sel_shift_by) & 0x1) == 1) {
e7bd38
+               result[half_sel] |= j << shift_by;
e7bd38
+            j++;
e7bd38
+         } else {
e7bd38
+            result[half_sel] |= (index + (unsigned long long)0x10) << shift_by;
e7bd38
+         }
e7bd38
+      }
e7bd38
+
e7bd38
+   } else if (imm == 0b11) {   //little-endian compression
e7bd38
+      /* If IMM=0b00011, let pcv be the permute control vector required to
e7bd38
+         enable a right-indexed permute (vpermr or xxpermr) to implement a
e7bd38
+         compression of the sparse byte elements in a source vector specified
e7bd38
+         by the byte-element mask in VSR[VRB+32] into the rightmost byte
e7bd38
+         elements of a result vector.  */
e7bd38
+
e7bd38
+      for( index = 0; index < 16; index++) {
e7bd38
+         i = index;
e7bd38
+
e7bd38
+         shift_by = i*8;
e7bd38
+
e7bd38
+         if ( i >= 8) {
e7bd38
+            src = src_hi;
e7bd38
+            shift_by = shift_by - 64;
e7bd38
+            half_sel = 0;
e7bd38
+         } else {
e7bd38
+            src = src_lo;
e7bd38
+            half_sel = 1;
e7bd38
+         }
e7bd38
+
e7bd38
+         sel_shift_by = shift_by + 7;
e7bd38
+
e7bd38
+         if ( ((src >> sel_shift_by) & 0x1) == 1) {
e7bd38
+            if (j >= 8)
e7bd38
+               result[0] |= (index) << (j-8)*8;
e7bd38
+            else
e7bd38
+               result[1] |= (index) << j*8;
e7bd38
+            j++;
e7bd38
+         }
e7bd38
+      }
e7bd38
+
e7bd38
+      /* The algorithm says set to undefined, leave as 0
e7bd38
+      for( index = 3 - j; index < 4; index++) {
e7bd38
+         result |= (0 << (index*8));
e7bd38
+      }
e7bd38
+      */
e7bd38
+
e7bd38
+   } else {
e7bd38
+      vex_printf("ERROR, vector_gen_pvc_byte_mask_dirty_helper, imm value %u not supported.\n",
e7bd38
+                 imm);
e7bd38
+      vassert(0);
e7bd38
+   }
e7bd38
+   write_VSX_entry( gst, reg_offset, result);
e7bd38
+}
e7bd38
+
e7bd38
+/* CALLED FROM GENERATED CODE */
e7bd38
+void vector_gen_pvc_hword_mask_dirty_helper( VexGuestPPC64State* gst,
e7bd38
+                                             ULong src_hi, ULong src_lo,
e7bd38
+                                             UInt reg_offset,
e7bd38
+                                             UInt imm ) {
e7bd38
+   /* The function computes the 128-bit result then writes it directly
e7bd38
+      into the guest state VSX register.  */
e7bd38
+   UInt  i, shift_by, sel_shift_by, half_sel;
e7bd38
+   ULong index, src, result[2];
e7bd38
+   ULong j;
e7bd38
+
e7bd38
+   result[0] = 0;
e7bd38
+   result[1] = 0;
e7bd38
+   j = 0;
e7bd38
+
e7bd38
+   /* The algorithm in the ISA is written with IBM numbering zero on left and
e7bd38
+      N-1 on right. The loop index is converted to "i" to match the algorithm
e7bd38
+      for clarity of matching the C code to the algorithm in the ISA.  */
e7bd38
+
e7bd38
+   if (imm == 0b00) {    // big endian expansion
e7bd38
+      /* If IMM=0b00000, let pcv be the permute control vector required to
e7bd38
+         enable a left-indexed permute (vperm or xxperm) to implement an
e7bd38
+         expansion of the leftmost halfword elements of a source vector into
e7bd38
+         the halfword elements of a result vector specified by the halfword-
e7bd38
+         element mask in VSR[VRB+32].
e7bd38
+      */
e7bd38
+      for( index = 0; index < 8; index++) {
e7bd38
+         i = 7 - index;
e7bd38
+
e7bd38
+         shift_by = i*16;
e7bd38
+
e7bd38
+         if ( i >= 4) {
e7bd38
+            src = src_hi;
e7bd38
+            shift_by = shift_by - 64;
e7bd38
+            half_sel = 0;
e7bd38
+         } else {
e7bd38
+            src = src_lo;
e7bd38
+            half_sel = 1;
e7bd38
+         }
e7bd38
+
e7bd38
+         sel_shift_by = shift_by + 15;
e7bd38
+
e7bd38
+         if ( ((src >> sel_shift_by) & 0x1) == 1) {
e7bd38
+            // half-word i, byte 0
e7bd38
+            result[half_sel] |= (2*j + 0x0) << (shift_by+8);
e7bd38
+            // half-word i, byte 1
e7bd38
+            result[half_sel] |= (2*j + 0x1) << shift_by;
e7bd38
+            j++;
e7bd38
+         } else {
e7bd38
+            result[half_sel] |= (2*index + 0x10) << (shift_by+8);
e7bd38
+            result[half_sel] |= (2*index + 0x11) << shift_by;
e7bd38
+         }
e7bd38
+      }
e7bd38
+
e7bd38
+   } else if (imm == 0b01) {    // big endian compression
e7bd38
+      /* If IMM=0b00001,let pcv be the permute control vector required to
e7bd38
+         enable a left-indexed permute (vperm or xxperm) to implement a
e7bd38
+         compression of the sparse halfword elements in a source vector
e7bd38
+         specified by the halfword-element mask in VSR[VRB+32] into the
e7bd38
+         leftmost halfword elements of a result vector.
e7bd38
+      */
e7bd38
+      for( index = 0; index < 8; index++) {
e7bd38
+         i = 7 - index;
e7bd38
+
e7bd38
+         shift_by = i*16;
e7bd38
+
e7bd38
+         if ( i >= 4) {
e7bd38
+            src = src_hi;
e7bd38
+            shift_by = shift_by - 64;
e7bd38
+            half_sel = 0;
e7bd38
+         } else {
e7bd38
+            src = src_lo;
e7bd38
+            half_sel = 1;
e7bd38
+         }
e7bd38
+
e7bd38
+         sel_shift_by = shift_by + 15;
e7bd38
+
e7bd38
+         if ( ((src >> sel_shift_by) & 0x1) == 1) {
e7bd38
+            if (j >= 4) {
e7bd38
+               // half-word i, byte 0
e7bd38
+               result[1] |= (2*index + 0x0) << ((7 - j)*16 + 8);
e7bd38
+               // half-word i, byte 1
e7bd38
+               result[1] |= (2*index + 0x1) << ((7 - j)*16);
e7bd38
+            } else {
e7bd38
+               // half-word i, byte 0
e7bd38
+               result[0] |= (2*index + 0x0) << ((3 - j)*16 + 8);
e7bd38
+               // half-word i, byte 1
e7bd38
+               result[0] |= (2*index + 0x1) << ((3 - j)*16);
e7bd38
+            }
e7bd38
+            j++;
e7bd38
+         }
e7bd38
+      }
e7bd38
+
e7bd38
+   } else if (imm == 0b10) {   //little-endian expansion
e7bd38
+      /* If IMM=0b00010, let pcv be the permute control vector required to
e7bd38
+         enable a right-indexed permute (vpermr or xxpermr) to implement an
e7bd38
+         expansion of the rightmost halfword elements of a source vector into
e7bd38
+         the halfword elements of a result vector specified by the halfword-
e7bd38
+         element mask in VSR[VRB+32].
e7bd38
+       */
e7bd38
+      for( index = 0; index < 8; index++) {
e7bd38
+         i = index;
e7bd38
+         shift_by = i*16;
e7bd38
+
e7bd38
+         if ( i >= 4) {
e7bd38
+            src = src_hi;
e7bd38
+            shift_by = shift_by - 64;
e7bd38
+            half_sel = 0;
e7bd38
+         } else {
e7bd38
+            src = src_lo;
e7bd38
+            half_sel = 1;
e7bd38
+         }
e7bd38
+
e7bd38
+         sel_shift_by = shift_by + 15;
e7bd38
+
e7bd38
+         if ( ((src >> sel_shift_by) & 0x1) == 1) {
e7bd38
+            // half-word i, byte 0
e7bd38
+            result[half_sel] |= (2*j + 0x00) << shift_by;
e7bd38
+            // half-word i, byte 1
e7bd38
+            result[half_sel] |= (2*j + 0x01) << (shift_by+8);
e7bd38
+            j++;
e7bd38
+
e7bd38
+         } else {
e7bd38
+            // half-word i, byte 0
e7bd38
+            result[half_sel] |= (2*index + 0x10) << shift_by;
e7bd38
+            // half-word i, byte 1
e7bd38
+            result[half_sel] |= (2*index + 0x11) << (shift_by+8);
e7bd38
+         }
e7bd38
+      }
e7bd38
+
e7bd38
+   } else if (imm == 0b11) {   //little-endian compression
e7bd38
+      /* If IMM=0b00011, let pcv be the permute control vector required to
e7bd38
+         enable a right-indexed permute (vpermr or xxpermr) to implement a
e7bd38
+         compression of the sparse halfword elements in a source vector
e7bd38
+         specified by the halfword-element mask in VSR[VRB+32] into the
e7bd38
+         rightmost halfword elements of a result vector.  */
e7bd38
+      for( index = 0; index < 8; index++) {
e7bd38
+         i = index;
e7bd38
+         shift_by = i*16;
e7bd38
+
e7bd38
+         if ( i >= 4) {
e7bd38
+            src = src_hi;
e7bd38
+            shift_by = shift_by - 64;
e7bd38
+            half_sel = 0;
e7bd38
+         } else {
e7bd38
+            src = src_lo;
e7bd38
+            half_sel = 1;
e7bd38
+         }
e7bd38
+
e7bd38
+         sel_shift_by = shift_by + 15;
e7bd38
+
e7bd38
+         if ( ((src >> sel_shift_by) & 0x1) == 1) {
e7bd38
+            if (j >= 4) {
e7bd38
+               // half-word j, byte 0
e7bd38
+               result[0] |= (2*index + 0x0) << ((j-4)*16);
e7bd38
+               // half-word j, byte 1
e7bd38
+               result[0] |= (2*index + 0x1) << ((j-4)*16+8);
e7bd38
+            } else {
e7bd38
+               // half-word j, byte 0
e7bd38
+               result[1] |= (2*index + 0x0) << (j*16);
e7bd38
+               // half-word j, byte 1
e7bd38
+               result[1] |= (2*index + 0x1) << ((j*16)+8);
e7bd38
+            }
e7bd38
+            j++;
e7bd38
+         }
e7bd38
+      }
e7bd38
+
e7bd38
+   } else {
e7bd38
+      vex_printf("ERROR, vector_gen_pvc_hword_mask_dirty_helper, imm value %u not supported.\n",
e7bd38
+                 imm);
e7bd38
+      vassert(0);
e7bd38
+   }
e7bd38
+   write_VSX_entry( gst, reg_offset, result);
e7bd38
+}
e7bd38
+
e7bd38
+/* CALLED FROM GENERATED CODE */
e7bd38
+void vector_gen_pvc_word_mask_dirty_helper( VexGuestPPC64State* gst,
e7bd38
+                                            ULong src_hi, ULong src_lo,
e7bd38
+                                            UInt reg_offset, UInt imm ) {
e7bd38
+   /* The function computes the 128-bit result then writes it directly
e7bd38
+      into the guest state VSX register.  */
e7bd38
+   UInt  i, shift_by, sel_shift_by, half_sel;
e7bd38
+   ULong index, src, result[2];
e7bd38
+   ULong j;
e7bd38
+
e7bd38
+   result[0] = 0;
e7bd38
+   result[1] = 0;
e7bd38
+   j = 0;
e7bd38
+
e7bd38
+   /* The algorithm in the ISA is written with IBM numbering zero on left and
e7bd38
+      N-1 on right. The loop index is converted to "i" to match the algorithm
e7bd38
+      for clarity of matching the C code to the algorithm in the ISA.  */
e7bd38
+
e7bd38
+   if (imm == 0b00) {    // big endian expansion
e7bd38
+      /* If IMM=0b00000, let pcv be the permute control vector required to
e7bd38
+         enable a left-indexed permute (vperm or xxperm) to implement an
e7bd38
+         expansion of the leftmost word elements of a source vector into the
e7bd38
+         word elements of a result vector specified by the word-element mask
e7bd38
+         in VSR[VRB+32].
e7bd38
+      */
e7bd38
+      for( index = 0; index < 4; index++) {
e7bd38
+         i = 3 - index;
e7bd38
+
e7bd38
+         shift_by = i*32;
e7bd38
+
e7bd38
+         if ( i >= 2) {
e7bd38
+            src = src_hi;
e7bd38
+            shift_by = shift_by - 64;
e7bd38
+            half_sel = 0;
e7bd38
+         } else {
e7bd38
+            src = src_lo;
e7bd38
+            half_sel = 1;
e7bd38
+         }
e7bd38
+
e7bd38
+         sel_shift_by = shift_by + 31;
e7bd38
+
e7bd38
+         if ( ((src >> sel_shift_by) & 0x1) == 1) {
e7bd38
+            result[half_sel] |= (4*j+0) << (shift_by+24);  // word i, byte 0
e7bd38
+            result[half_sel] |= (4*j+1) << (shift_by+16);  // word i, byte 1
e7bd38
+            result[half_sel] |= (4*j+2) << (shift_by+8);   // word i, byte 2
e7bd38
+            result[half_sel] |= (4*j+3) << shift_by;       // word i, byte 3
e7bd38
+            j++;
e7bd38
+         } else {
e7bd38
+            result[half_sel] |= (4*index + 0x10) << (shift_by+24);
e7bd38
+            result[half_sel] |= (4*index + 0x11) << (shift_by+16);
e7bd38
+            result[half_sel] |= (4*index + 0x12) << (shift_by+8);
e7bd38
+            result[half_sel] |= (4*index + 0x13) << shift_by;
e7bd38
+         }
e7bd38
+      }
e7bd38
+
e7bd38
+   } else if (imm == 0b01) {    // big endian compression
e7bd38
+      /* If IMM=0b00001, let pcv be the permute control vector required to
e7bd38
+         enable a left-indexed permute (vperm or xxperm) to implement a
e7bd38
+         compression of the sparse word elements in a source vector specified
e7bd38
+         by the word-element mask in VSR[VRB+32] into the leftmost word
e7bd38
+         elements of a result vector.
e7bd38
+      */
e7bd38
+      for( index = 0; index < 4; index++) {
e7bd38
+         i = 3 - index;
e7bd38
+
e7bd38
+         shift_by = i*32;
e7bd38
+
e7bd38
+         if ( i >= 2) {
e7bd38
+            src = src_hi;
e7bd38
+            shift_by = shift_by - 64;
e7bd38
+            half_sel = 0;
e7bd38
+         } else {
e7bd38
+            src = src_lo;
e7bd38
+            half_sel = 1;
e7bd38
+         }
e7bd38
+
e7bd38
+         sel_shift_by = shift_by + 31;
e7bd38
+
e7bd38
+         if (((src >> sel_shift_by) & 0x1) == 1) {
e7bd38
+            if (j >= 2) {
e7bd38
+               // word j, byte 0
e7bd38
+               result[1] |= (4*index+0) << ((3 - j)*32 + 24);
e7bd38
+               // word j, byte 1
e7bd38
+               result[1] |= (4*index+1) << ((3 - j)*32 + 16);
e7bd38
+               // word j, byte 2
e7bd38
+               result[1] |= (4*index+2) << ((3 - j)*32 + 8);
e7bd38
+               // word j, byte 3
e7bd38
+               result[1] |= (4*index+3) << ((3 - j)*32 + 0);
e7bd38
+            } else {
e7bd38
+               result[0] |= (4*index+0) << ((1 - j)*32 + 24);
e7bd38
+               result[0] |= (4*index+1) << ((1 - j)*32 + 16);
e7bd38
+               result[0] |= (4*index+2) << ((1 - j)*32 + 8);
e7bd38
+               result[0] |= (4*index+3) << ((1 - j)*32 + 0);
e7bd38
+            }
e7bd38
+            j++;
e7bd38
+         }
e7bd38
+      }
e7bd38
+
e7bd38
+   } else if (imm == 0b10) {   //little-endian expansion
e7bd38
+      /* If IMM=0b00010, let pcv be the permute control vector required to
e7bd38
+         enable a right-indexed permute (vpermr or xxpermr) to implement an
e7bd38
+         expansion of the rightmost word elements of a source vector into the
e7bd38
+         word elements of a result vector specified by the word-element mask
e7bd38
+         in VSR[VRB+32].
e7bd38
+       */
e7bd38
+      for( index = 0; index < 4; index++) {
e7bd38
+         i = index;
e7bd38
+
e7bd38
+         shift_by = i*32;
e7bd38
+
e7bd38
+         if ( i >= 2) {
e7bd38
+            src = src_hi;
e7bd38
+            shift_by = shift_by - 64;
e7bd38
+            half_sel = 0;
e7bd38
+         } else {
e7bd38
+            src = src_lo;
e7bd38
+            half_sel = 1;
e7bd38
+         }
e7bd38
+
e7bd38
+         sel_shift_by = shift_by + 31;
e7bd38
+
e7bd38
+         if (((src >> sel_shift_by) & 0x1) == 1) {
e7bd38
+            result[half_sel] |= (4*j+0) << (shift_by + 0);  // word j, byte 0
e7bd38
+            result[half_sel] |= (4*j+1) << (shift_by + 8);  // word j, byte 1
e7bd38
+            result[half_sel] |= (4*j+2) << (shift_by + 16); // word j, byte 2
e7bd38
+            result[half_sel] |= (4*j+3) << (shift_by + 24); // word j, byte 3
e7bd38
+            j++;
e7bd38
+         } else {
e7bd38
+            result[half_sel] |= (4*index + 0x10) << (shift_by + 0);
e7bd38
+            result[half_sel] |= (4*index + 0x11) << (shift_by + 8);
e7bd38
+            result[half_sel] |= (4*index + 0x12) << (shift_by + 16);
e7bd38
+            result[half_sel] |= (4*index + 0x13) << (shift_by + 24);
e7bd38
+         }
e7bd38
+      }
e7bd38
+
e7bd38
+   } else if (imm == 0b11) {   //little-endian compression
e7bd38
+      /* If IMM=0b00011, let pcv be the permute control vector required to
e7bd38
+         enable a right-indexed permute (vpermr or xxpermr) to implement a
e7bd38
+         compression of the sparse word elements in a source vector specified
e7bd38
+         by the word-element mask in VSR[VRB+32] into the rightmost word
e7bd38
+         elements of a result vector.  */
e7bd38
+      for( index = 0; index < 4; index++) {
e7bd38
+         i =index;
e7bd38
+
e7bd38
+         shift_by = i*32;
e7bd38
+
e7bd38
+         if ( i >= 2) {
e7bd38
+            src = src_hi;
e7bd38
+            shift_by = shift_by - 64;
e7bd38
+            half_sel = 0;
e7bd38
+         } else {
e7bd38
+            src = src_lo;
e7bd38
+            half_sel = 1;
e7bd38
+         }
e7bd38
+
e7bd38
+         sel_shift_by = shift_by + 31;
e7bd38
+
e7bd38
+         if (((src >> sel_shift_by) & 0x1) == 1) {
e7bd38
+            if (j >= 2){
e7bd38
+               // word j, byte 0
e7bd38
+               result[0] |= (4*index + 0x0) << ((j-2)*32+0);
e7bd38
+               // word j, byte 1
e7bd38
+               result[0] |= (4*index + 0x1) << ((j-2)*32+8);
e7bd38
+               // word j, byte 2
e7bd38
+               result[0] |= (4*index + 0x2) << ((j-2)*32+16);
e7bd38
+               // word j, byte 3
e7bd38
+               result[0] |= (4*index + 0x3) << ((j-2)*32+24);
e7bd38
+            } else {
e7bd38
+               result[1] |= (4*index + 0x0) << (j*32+0);
e7bd38
+               result[1] |= (4*index + 0x1) << (j*32+8);
e7bd38
+               result[1] |= (4*index + 0x2) << (j*32+16);
e7bd38
+               result[1] |= (4*index + 0x3) << (j*32+24);
e7bd38
+            }
e7bd38
+            j++;
e7bd38
+         }
e7bd38
+      }
e7bd38
+   } else {
e7bd38
+      vex_printf("ERROR, vector_gen_pvc_word_mask_dirty_helper, imm value %u not supported.\n",
e7bd38
+                 imm);
e7bd38
+      vassert(0);
e7bd38
+   }
e7bd38
+
e7bd38
+   write_VSX_entry( gst, reg_offset, result);
e7bd38
+}
e7bd38
+
e7bd38
+/* CALLED FROM GENERATED CODE */
e7bd38
+void vector_gen_pvc_dword_mask_dirty_helper( VexGuestPPC64State* gst,
e7bd38
+                                             ULong src_hi, ULong src_lo,
e7bd38
+                                             UInt reg_offset, UInt imm ) {
e7bd38
+   /* The function computes the 128-bit result then writes it directly
e7bd38
+      into the guest state VSX register.  */
e7bd38
+   UInt  sel_shift_by, half_sel;
e7bd38
+   ULong index, src, result[2];
e7bd38
+   ULong j, i;
e7bd38
+
e7bd38
+   result[0] = 0;
e7bd38
+   result[1] = 0;
e7bd38
+   j = 0;
e7bd38
+
e7bd38
+   /* The algorithm in the ISA is written with IBM numbering zero on left and
e7bd38
+      N-1 on right. The loop index is converted to "i" to match the algorithm
e7bd38
+      for clarity of matching the C code to the algorithm in the ISA.  */
e7bd38
+
e7bd38
+   if (imm == 0b00) {    // big endian expansion
e7bd38
+      /* If IMM=0b00000, let pcv be the permute control vector required to
e7bd38
+         enable a left-indexed permute (vperm or xxperm) to implement an
e7bd38
+         expansion of the leftmost doubleword elements of a source vector into
e7bd38
+         the doubleword elements of a result vector specified by the
e7bd38
+         doubleword-element mask in VSR[VRB+32].
e7bd38
+      */
e7bd38
+      for( index = 0; index < 2; index++) {
e7bd38
+         i = 1 - index;
e7bd38
+
e7bd38
+         if ( i == 1) {
e7bd38
+            src = src_hi;
e7bd38
+            half_sel = 0;
e7bd38
+         } else {
e7bd38
+            src = src_lo;
e7bd38
+            half_sel = 1;
e7bd38
+         }
e7bd38
+
e7bd38
+         sel_shift_by = 63;
e7bd38
+
e7bd38
+         if ( ((src >> sel_shift_by) & 0x1) == 1) {
e7bd38
+            result[half_sel] |= (8*j + 0x0) << 56; // dword i, byte 0
e7bd38
+            result[half_sel] |= (8*j + 0x1) << 48; // dword i, byte 1
e7bd38
+            result[half_sel] |= (8*j + 0x2) << 40; // dword i, byte 2
e7bd38
+            result[half_sel] |= (8*j + 0x3) << 32; // dword i, byte 3
e7bd38
+            result[half_sel] |= (8*j + 0x4) << 24; // dword i, byte 4
e7bd38
+            result[half_sel] |= (8*j + 0x5) << 16; // dword i, byte 5
e7bd38
+            result[half_sel] |= (8*j + 0x6) << 8;  // dword i, byte 6
e7bd38
+            result[half_sel] |= (8*j + 0x7) << 0;  // dword i, byte 7
e7bd38
+            j++;
e7bd38
+         } else {
e7bd38
+            result[half_sel] |= (8*index + 0x10) << 56;
e7bd38
+            result[half_sel] |= (8*index + 0x11) << 48;
e7bd38
+            result[half_sel] |= (8*index + 0x12) << 40;
e7bd38
+            result[half_sel] |= (8*index + 0x13) << 32;
e7bd38
+            result[half_sel] |= (8*index + 0x14) << 24;
e7bd38
+            result[half_sel] |= (8*index + 0x15) << 16;
e7bd38
+            result[half_sel] |= (8*index + 0x16) << 8;
e7bd38
+            result[half_sel] |= (8*index + 0x17) << 0;
e7bd38
+         }
e7bd38
+      }
e7bd38
+   } else if (imm == 0b01) {    // big endian compression
e7bd38
+      /* If IMM=0b00001, let pcv be the permute control vector required to
e7bd38
+         enable a left-indexed permute (vperm or xxperm) to implement a
e7bd38
+         compression of the sparse doubleword elements in a source vector
e7bd38
+         specified by the doubleword-element mask in VSR[VRB+32] into the
e7bd38
+         leftmost doubleword elements of a result vector.
e7bd38
+      */
e7bd38
+      for( index = 0; index < 2; index++) {
e7bd38
+         i = 1 - index;
e7bd38
+
e7bd38
+         if ( i == 1) {
e7bd38
+            src = src_hi;
e7bd38
+            half_sel = 0;
e7bd38
+         } else {
e7bd38
+            src = src_lo;
e7bd38
+            half_sel = 1;
e7bd38
+         }
e7bd38
+
e7bd38
+         sel_shift_by = 63;
e7bd38
+
e7bd38
+         if ( ((src >> sel_shift_by) & 0x1) == 1) {
e7bd38
+            if (j == 1) {
e7bd38
+               result[1] |= (8*index + 0x0) << 56;   // double-word j, byte 0
e7bd38
+               result[1] |= (8*index + 0x1) << 48;   // double-word j, byte 1
e7bd38
+               result[1] |= (8*index + 0x2) << 40;   // double-word j, byte 2
e7bd38
+               result[1] |= (8*index + 0x3) << 32;   // double-word j, byte 3
e7bd38
+               result[1] |= (8*index + 0x4) << 24;   // double-word j, byte 4
e7bd38
+               result[1] |= (8*index + 0x5) << 16;   // double-word j, byte 5
e7bd38
+               result[1] |= (8*index + 0x6) << 8;    // double-word j, byte 6
e7bd38
+               result[1] |= (8*index + 0x7) << 0;    // double-word j, byte 7
e7bd38
+            } else {
e7bd38
+               result[0] |= (8*index + 0x0) << 56;   // double-word j, byte 0
e7bd38
+               result[0] |= (8*index + 0x1) << 48;   // double-word j, byte 1
e7bd38
+               result[0] |= (8*index + 0x2) << 40;   // double-word j, byte 2
e7bd38
+               result[0] |= (8*index + 0x3) << 32;   // double-word j, byte 3
e7bd38
+               result[0] |= (8*index + 0x4) << 24;   // double-word j, byte 4
e7bd38
+               result[0] |= (8*index + 0x5) << 16;   // double-word j, byte 5
e7bd38
+               result[0] |= (8*index + 0x6) << 8;    // double-word j, byte 6
e7bd38
+               result[0] |= (8*index + 0x7) << 0;    // double-word j, byte 7
e7bd38
+            }
e7bd38
+            j++;
e7bd38
+         }
e7bd38
+      }
e7bd38
+   } else if (imm == 0b10) {   //little-endian expansion
e7bd38
+      /* If IMM=0b00010, let pcv be the permute control vector required to
e7bd38
+         enable a right-indexed permute (vpermr or xxpermr) to implement an
e7bd38
+         expansion of the rightmost doubleword elements of a source vector
e7bd38
+         into the doubleword elements of a result vector specified by the
e7bd38
+         doubleword-element mask in VSR[VRB+32].
e7bd38
+       */
e7bd38
+
e7bd38
+      for( index = 0; index < 2; index++) {
e7bd38
+         i = index;
e7bd38
+
e7bd38
+         if ( i == 1) {
e7bd38
+            src = src_hi;
e7bd38
+            half_sel = 0;
e7bd38
+         } else {
e7bd38
+            src = src_lo;
e7bd38
+            half_sel = 1;
e7bd38
+         }
e7bd38
+
e7bd38
+         sel_shift_by = 63;
e7bd38
+
e7bd38
+         if ( ((src >> sel_shift_by) & 0x1) == 1) {
e7bd38
+            result[half_sel] |= (8*j+0) << 0;  // double-word i, byte 0
e7bd38
+            result[half_sel] |= (8*j+1) << 8;  // double-word i, byte 1
e7bd38
+            result[half_sel] |= (8*j+2) << 16; // double-word i, byte 2
e7bd38
+            result[half_sel] |= (8*j+3) << 24; // double-word i, byte 3
e7bd38
+            result[half_sel] |= (8*j+4) << 32; // double-word i, byte 4
e7bd38
+            result[half_sel] |= (8*j+5) << 40; // double-word i, byte 5
e7bd38
+            result[half_sel] |= (8*j+6) << 48; // double-word i, byte 6
e7bd38
+            result[half_sel] |= (8*j+7) << 56; // double-word i, byte 7
e7bd38
+            j++;
e7bd38
+         } else {
e7bd38
+            result[half_sel] |= (8*index + 0x10) << 0;
e7bd38
+            result[half_sel] |= (8*index + 0x11) << 8;
e7bd38
+            result[half_sel] |= (8*index + 0x12) << 16;
e7bd38
+            result[half_sel] |= (8*index + 0x13) << 24;
e7bd38
+            result[half_sel] |= (8*index + 0x14) << 32;
e7bd38
+            result[half_sel] |= (8*index + 0x15) << 40;
e7bd38
+            result[half_sel] |= (8*index + 0x16) << 48;
e7bd38
+            result[half_sel] |= (8*index + 0x17) << 56;
e7bd38
+         }
e7bd38
+      }
e7bd38
+
e7bd38
+   } else if (imm == 0b11) {   //little-endian compression
e7bd38
+      /* If IMM=0b00011, let pcv be the permute control vector required to
e7bd38
+         enable a right-indexed permute (vpermr or xxpermr) to implement a
e7bd38
+         compression of the sparse doubleword elements in a source vector
e7bd38
+         specified by the doubleword-element mask in VSR[VRB+32] into the
e7bd38
+         rightmost doubleword elements of a result vector.  */
e7bd38
+      for( index = 0; index < 2; index++) {
e7bd38
+         i = index;
e7bd38
+
e7bd38
+         if ( i == 1) {
e7bd38
+            src = src_hi;
e7bd38
+            half_sel = 0;
e7bd38
+         } else {
e7bd38
+            src = src_lo;
e7bd38
+            half_sel = 1;
e7bd38
+         }
e7bd38
+
e7bd38
+         sel_shift_by = 63;
e7bd38
+
e7bd38
+         if (((src >> sel_shift_by) & 0x1) == 1) {
e7bd38
+            if (j == 1) {
e7bd38
+               result[0] |= (8*index + 0x0) << 0;    // double-word j, byte 0
e7bd38
+               result[0] |= (8*index + 0x1) << 8;    // double-word j, byte 1
e7bd38
+               result[0] |= (8*index + 0x2) << 16;   // double-word j, byte 2
e7bd38
+               result[0] |= (8*index + 0x3) << 24;   // double-word j, byte 3
e7bd38
+               result[0] |= (8*index + 0x4) << 32;   // double-word j, byte 4
e7bd38
+               result[0] |= (8*index + 0x5) << 40;   // double-word j, byte 5
e7bd38
+               result[0] |= (8*index + 0x6) << 48;   // double-word j, byte 6
e7bd38
+               result[0] |= (8*index + 0x7) << 56;   // double-word j, byte 7
e7bd38
+            } else {
e7bd38
+               result[1] |= (8*index + 0x0) << 0;
e7bd38
+               result[1] |= (8*index + 0x1) << 8;
e7bd38
+               result[1] |= (8*index + 0x2) << 16;
e7bd38
+               result[1] |= (8*index + 0x3) << 24;
e7bd38
+               result[1] |= (8*index + 0x4) << 32;
e7bd38
+               result[1] |= (8*index + 0x5) << 40;
e7bd38
+               result[1] |= (8*index + 0x6) << 48;
e7bd38
+               result[1] |= (8*index + 0x7) << 56;
e7bd38
+            }
e7bd38
+            j++;
e7bd38
+         }
e7bd38
+      }
e7bd38
+   } else {
e7bd38
+      vex_printf("ERROR, vector_gen_pvc_dword_mask_dirty_helper, imm value %u not supported.\n",
e7bd38
+                 imm);
e7bd38
+      vassert(0);
e7bd38
+   }
e7bd38
+
e7bd38
+   write_VSX_entry( gst, reg_offset, result);
e7bd38
+}
e7bd38
 
e7bd38
 /*------------------------------------------------*/
e7bd38
 /*---- VSX Matrix signed integer GER functions ---*/
e7bd38
diff --git a/VEX/priv/guest_ppc_toIR.c b/VEX/priv/guest_ppc_toIR.c
e7bd38
index bcabf69dd..354be6b53 100644
e7bd38
--- a/VEX/priv/guest_ppc_toIR.c
e7bd38
+++ b/VEX/priv/guest_ppc_toIR.c
e7bd38
@@ -3322,6 +3322,7 @@ static IRExpr * locate_vector_ele_eq ( IRTemp src, IRExpr *value,
e7bd38
 #define DFORM_IMMASK  0xffffffff
e7bd38
 #define DSFORM_IMMASK 0xfffffffc
e7bd38
 #define DQFORM_IMMASK 0xfffffff0
e7bd38
+#define DA8LSFORM_IMMASK 0x3fffffff   // Algebraic 8LS Dform
e7bd38
 
e7bd38
 #define ISA_3_1_PREFIX_CHECK if (prefix) {if (!allow_isa_3_1) goto decode_noIsa3_1;}
e7bd38
 
e7bd38
@@ -6109,6 +6110,87 @@ static void vsx_matrix_64bit_float_ger ( const VexAbiInfo* vbi,
e7bd38
    stmt( IRStmt_Dirty(d) );
e7bd38
 }
e7bd38
 
e7bd38
+static void vector_gen_pvc_mask ( const VexAbiInfo* vbi,
e7bd38
+                                   IRExpr *src, UInt IMM,
e7bd38
+                                   UInt opc2, UInt VSX_addr ) {
e7bd38
+   /* The function takes a 64-bit source and an immediate value.  The function
e7bd38
+      calls a helper to execute the xxgenpcvbm, xxgenpcvhm, xxgenpcvwm,
e7bd38
+      xxgenpcvdm instruction.  The instructions are not practical to do with
e7bd38
+      Iops.  The instruction is implemented with a dirty helper that
e7bd38
+      calculates the 128-bit result and writes it directly into the guest
e7bd38
+      state VSX register.
e7bd38
+  */
e7bd38
+   IRTemp src_hi = newTemp( Ity_I64);
e7bd38
+   IRTemp src_lo = newTemp( Ity_I64);
e7bd38
+
e7bd38
+   IRDirty* d;
e7bd38
+
e7bd38
+   vassert( (VSX_addr >= 0) && (VSX_addr < 64) );
e7bd38
+   UInt reg_offset = offsetofPPCGuestState( guest_VSR0 )
e7bd38
+      + sizeof(U128) * VSX_addr;
e7bd38
+
e7bd38
+   assign( src_hi, unop( Iop_V128HIto64, src ) );
e7bd38
+   assign( src_lo, unop( Iop_V128to64, src ) );
e7bd38
+
e7bd38
+   IRExpr** args = mkIRExprVec_5(
e7bd38
+      IRExpr_GSPTR(),
e7bd38
+      mkexpr( src_hi ),
e7bd38
+      mkexpr( src_lo ),
e7bd38
+      mkU32( reg_offset ),
e7bd38
+      mkU64( IMM ) );
e7bd38
+
e7bd38
+   switch( opc2 ) {
e7bd38
+   case 0x394: // xxgenpcvbm
e7bd38
+      d = unsafeIRDirty_0_N (
e7bd38
+         0 /*regparms*/,
e7bd38
+         "vector_gen_pvc_byte_mask_dirty_helper",
e7bd38
+         fnptr_to_fnentry( vbi,
e7bd38
+                           &vector_gen_pvc_byte_mask_dirty_helper ),
e7bd38
+         args);
e7bd38
+      break;
e7bd38
+
e7bd38
+   case 0x395: // xxgenpcvhm
e7bd38
+      d = unsafeIRDirty_0_N (
e7bd38
+         0 /*regparms*/,
e7bd38
+         "vector_gen_pvc_hword_mask_dirty_helper",
e7bd38
+         fnptr_to_fnentry( vbi,
e7bd38
+                           &vector_gen_pvc_hword_mask_dirty_helper ),
e7bd38
+         args);
e7bd38
+      break;
e7bd38
+
e7bd38
+   case 0x3B4: // xxgenpcvwm
e7bd38
+      d = unsafeIRDirty_0_N (
e7bd38
+         0 /*regparms*/,
e7bd38
+         "vector_gen_pvc_word_mask_dirty_helper",
e7bd38
+         fnptr_to_fnentry( vbi,
e7bd38
+                           &vector_gen_pvc_word_mask_dirty_helper ),
e7bd38
+         args);
e7bd38
+      break;
e7bd38
+
e7bd38
+   case 0x3B5: // xxgenpcvdm
e7bd38
+      d = unsafeIRDirty_0_N (
e7bd38
+         0 /*regparms*/,
e7bd38
+         "vector_gen_pvc_dword_mask_dirty_helper",
e7bd38
+         fnptr_to_fnentry( vbi,
e7bd38
+                           &vector_gen_pvc_dword_mask_dirty_helper ),
e7bd38
+         args);
e7bd38
+      break;
e7bd38
+   default:
e7bd38
+      vex_printf("ERROR: Unkown instruction = %u in vector_gen_pvc_mask()\n",
e7bd38
+                 opc2);
e7bd38
+      return;
e7bd38
+   }
e7bd38
+
e7bd38
+   d->nFxState = 1;
e7bd38
+   vex_bzero(&d->fxState, sizeof(d->fxState));
e7bd38
+   d->fxState[0].fx     = Ifx_Modify;
e7bd38
+   d->fxState[0].size   = sizeof(U128);
e7bd38
+   d->fxState[0].offset = reg_offset;
e7bd38
+
e7bd38
+   /* execute the dirty call, side-effecting guest state */
e7bd38
+   stmt( IRStmt_Dirty(d) );
e7bd38
+}
e7bd38
+
e7bd38
 static IRExpr * UNSIGNED_CMP_GT_V128 ( IRExpr *vA, IRExpr *vB ) {
e7bd38
    /* This function does an unsigned compare of two V128 values. The
e7bd38
     * function is for use in 32-bit mode only as it is expensive.  The
e7bd38
@@ -35227,6 +35309,54 @@ static Bool dis_vsx_accumulator_prefix ( UInt prefix, UInt theInstr,
e7bd38
    return True;
e7bd38
 }
e7bd38
 
e7bd38
+static Bool dis_vector_generate_pvc_from_mask ( UInt prefix,
e7bd38
+                                                UInt theInstr,
e7bd38
+                                                const VexAbiInfo* vbi )
e7bd38
+{
e7bd38
+   UChar XT_addr = ifieldRegXT(theInstr);
e7bd38
+   UChar vB_addr = ifieldRegB(theInstr);
e7bd38
+   IRTemp vB = newTemp( Ity_V128 );
e7bd38
+   UInt opc2 = ifieldOPClo10(theInstr);
e7bd38
+   UInt IMM = IFIELD(theInstr, (31-15), 5);    // bits[11:15]
e7bd38
+
e7bd38
+   assign( vB, getVReg( vB_addr ) );
e7bd38
+
e7bd38
+   switch( opc2 ) {
e7bd38
+   case 0x394:
e7bd38
+      DIP("xxgenpcvbm v%u,v%u,%u\n", XT_addr, vB_addr, IMM);
e7bd38
+      /* vector_gen_pvc_mask uses a dirty helper to calculate the result and
e7bd38
+         write it to the VSX result register.  */
e7bd38
+      vector_gen_pvc_mask( vbi, mkexpr( vB ), IMM, opc2, XT_addr );
e7bd38
+      break;
e7bd38
+
e7bd38
+   case 0x395:
e7bd38
+      DIP("xxgenpcvhm v%u,v%u,%u\n", XT_addr, vB_addr, IMM);
e7bd38
+      /* vector_gen_pvc_mask uses a dirty helper to calculate the result and
e7bd38
+         write it to the VSX result register.  */
e7bd38
+      vector_gen_pvc_mask( vbi, mkexpr( vB ), IMM, opc2, XT_addr );
e7bd38
+      break;
e7bd38
+
e7bd38
+   case 0x3B4:
e7bd38
+      DIP("xxgenpcvwm v%u,v%u,%u\n", XT_addr, vB_addr, IMM);
e7bd38
+      /* vector_gen_pvc_mask uses a dirty helper to calculate the result and
e7bd38
+         write it to the VSX result register.  */
e7bd38
+      vector_gen_pvc_mask( vbi, mkexpr( vB ), IMM, opc2, XT_addr );
e7bd38
+      break;
e7bd38
+
e7bd38
+   case 0x3B5:
e7bd38
+      DIP("xxgenpcvdm v%u,v%u,%u\n", XT_addr, vB_addr, IMM);
e7bd38
+      /* vector_gen_pvc_mask uses a dirty helper to calculate the result and
e7bd38
+         write it to the VSX result register.  */
e7bd38
+      vector_gen_pvc_mask( vbi, mkexpr( vB ), IMM, opc2, XT_addr );
e7bd38
+      break;
e7bd38
+
e7bd38
+   default:
e7bd38
+      return False;
e7bd38
+   }
e7bd38
+
e7bd38
+   return True;
e7bd38
+}
e7bd38
+
e7bd38
 static Int dis_nop_prefix ( UInt prefix, UInt theInstr )
e7bd38
 {
e7bd38
    Bool is_prefix   = prefix_instruction( prefix );
e7bd38
@@ -35748,14 +35878,9 @@ DisResult disInstr_PPC_WRK (
e7bd38
       }
e7bd38
       goto decode_failure;
e7bd38
 
e7bd38
-   case 0x31:   // lfsu, stxv
e7bd38
+   case 0x31:   // lfsu
e7bd38
       if (!allow_F) goto decode_noF;
e7bd38
-      if (prefix_instruction( prefix )) {  // stxv
e7bd38
-         if ( !(allow_isa_3_1) ) goto decode_noIsa3_1;
e7bd38
-         if (dis_fp_pair_prefix( prefix, theInstr )) goto decode_success;
e7bd38
-      } else {  // lfsu
e7bd38
-         if (dis_fp_load( prefix, theInstr )) goto decode_success;
e7bd38
-      }
e7bd38
+      if (dis_fp_load( prefix, theInstr )) goto decode_success;
e7bd38
       goto decode_failure;
e7bd38
 
e7bd38
    case 0x32:
e7bd38
@@ -35842,7 +35967,6 @@ DisResult disInstr_PPC_WRK (
e7bd38
    case 0x39:  // pld, lxsd, lxssp, lfdp
e7bd38
       {
e7bd38
          UInt opc2tmp = ifieldOPC0o2(theInstr);
e7bd38
-
e7bd38
          if (!allow_F) goto decode_noF;
e7bd38
          if (prefix_instruction( prefix )) {   // pld
e7bd38
             if ( !(allow_isa_3_1) ) goto decode_noIsa3_1;
e7bd38
@@ -36125,12 +36249,6 @@ DisResult disInstr_PPC_WRK (
e7bd38
             goto decode_failure;
e7bd38
       }
e7bd38
 
e7bd38
-      /* The vsxOpc2 returned is the "normalized" value, representing the
e7bd38
-       * instructions secondary opcode as taken from the standard secondary
e7bd38
-       * opcode field [21:30] (IBM notatition), even if the actual field
e7bd38
-       * is non-standard.  These normalized values are given in the opcode
e7bd38
-       * appendices of the ISA 2.06 document.
e7bd38
-       */
e7bd38
       if ( ( opc2 == 0x168 ) && ( IFIELD( theInstr, 19, 2 ) == 0 ) )// xxspltib
e7bd38
       {
e7bd38
          /* This is a special case of the XX1 form where the  RA, RB
e7bd38
@@ -36153,6 +36271,23 @@ DisResult disInstr_PPC_WRK (
e7bd38
          goto decode_failure;
e7bd38
       }
e7bd38
 
e7bd38
+      if ( ( opc2 == 0x394 ) ||         // xxgenpcvbm
e7bd38
+           ( opc2 == 0x395 ) ||         // xxgenpcvwm
e7bd38
+           ( opc2 == 0x3B4 ) ||         // xxgenpcvhm
e7bd38
+           ( opc2 == 0x3B5 ) ) {        // xxgenpcvdm
e7bd38
+         if ( !(allow_isa_3_1) ) goto decode_noIsa3_1;
e7bd38
+         if (dis_vector_generate_pvc_from_mask( prefix, theInstr,
e7bd38
+                                                abiinfo ))
e7bd38
+            goto decode_success;
e7bd38
+         goto decode_failure;
e7bd38
+      }
e7bd38
+
e7bd38
+      /* The vsxOpc2 returned is the "normalized" value, representing the
e7bd38
+       * instructions secondary opcode as taken from the standard secondary
e7bd38
+       * opcode field [21:30] (IBM notatition), even if the actual field
e7bd38
+       * is non-standard.  These normalized values are given in the opcode
e7bd38
+       * appendices of the ISA 2.06 document.
e7bd38
+       */
e7bd38
       vsxOpc2 = get_VSX60_opc2(opc2, theInstr);
e7bd38
 
e7bd38
       switch (vsxOpc2) {
e7bd38
commit 078f89e99b6f62e043f6138c6a7ae238befc1f2a
e7bd38
Author: Carl Love <cel@us.ibm.com>
e7bd38
Date:   Fri Feb 26 15:46:55 2021 -0600
e7bd38
e7bd38
    PPC64: Reduced-Precision - bfloat16 Outer Product & Format Conversion Operations
e7bd38
    
e7bd38
    Add support for:
e7bd38
    
e7bd38
    pmxvbf16ger2 Prefixed Masked VSX Vector bfloat16 GER (Rank-2 Update)
e7bd38
    pmxvbf16ger2pp Prefixed Masked VSX Vector bfloat16 GER (Rank-2 Update) Positive
e7bd38
      multiply, Positive accumulate
e7bd38
    pmxvbf16ger2pn Prefixed Masked VSX Vector bfloat16 GER (Rank-2 Update) Positive
e7bd38
      multiply, Negative accumulate
e7bd38
    pmxvbf16ger2np Prefixed Masked VSX Vector bfloat16 GER (Rank-2 Update) Negative
e7bd38
      multiply, Positive accumulate
e7bd38
    pmxvbf16ger2nn Prefixed Masked VSX Vector bfloat16 GER (Rank-2 Update) Negative
e7bd38
      multiply, Negative accumulate
e7bd38
    xvbf16ger2VSX Vector bfloat16 GER (Rank-2 Update)
e7bd38
    xvbf16ger2pp VSX Vector bfloat16 GER (Rank-2 Update) Positive multiply, Positive
e7bd38
      accumulate
e7bd38
    xvbf16ger2pn VSX Vector bfloat16 GER (Rank-2 Update) Positive multiply, Negative
e7bd38
      accumulate
e7bd38
    xvbf16ger2np VSX Vector bfloat16 GER (Rank-2 Update) Negative multiply, Positive
e7bd38
      accumulate
e7bd38
    xvbf16ger2nn VSX Vector bfloat16 GER (Rank-2 Update) Negative multiply, Negative
e7bd38
      accumulate
e7bd38
    xvcvbf16sp VSX Vector Convert bfloat16 to Single-Precision format
e7bd38
    xvcvspbf16 VSX Vector Convert with round Single-Precision to bfloat16 format
e7bd38
e7bd38
diff --git a/VEX/priv/guest_ppc_defs.h b/VEX/priv/guest_ppc_defs.h
e7bd38
index 54ce923a9..d36d6c07d 100644
e7bd38
--- a/VEX/priv/guest_ppc_defs.h
e7bd38
+++ b/VEX/priv/guest_ppc_defs.h
e7bd38
@@ -150,6 +150,8 @@ extern ULong convert_to_zoned_helper( ULong src_hi, ULong src_low,
e7bd38
                                       ULong return_upper );
e7bd38
 extern ULong convert_to_national_helper( ULong src, ULong return_upper );
e7bd38
 extern ULong convert_from_zoned_helper( ULong src_hi, ULong src_low );
e7bd38
+extern ULong convert_from_floattobf16_helper( ULong src );
e7bd38
+extern ULong convert_from_bf16tofloat_helper( ULong src );
e7bd38
 extern ULong convert_from_national_helper( ULong src_hi, ULong src_low );
e7bd38
 extern ULong generate_C_FPCC_helper( ULong size, ULong src_hi, ULong src );
e7bd38
 extern ULong extract_bits_under_mask_helper( ULong src, ULong mask,
e7bd38
@@ -201,6 +203,11 @@ extern void vector_gen_pvc_dword_mask_dirty_helper( VexGuestPPC64State* gst,
e7bd38
 #define XVF16GER2PN    0b10010010
e7bd38
 #define XVF16GER2NP    0b01010010
e7bd38
 #define XVF16GER2NN    0b11010010
e7bd38
+#define XVBF16GER2     0b00110011
e7bd38
+#define XVBF16GER2PP   0b00110010
e7bd38
+#define XVBF16GER2PN   0b10110010
e7bd38
+#define XVBF16GER2NP   0b01110010
e7bd38
+#define XVBF16GER2NN   0b11110010
e7bd38
 #define XVF32GER       0b00011011
e7bd38
 #define XVF32GERPP     0b00011010
e7bd38
 #define XVF32GERPN     0b10011010
e7bd38
diff --git a/VEX/priv/guest_ppc_helpers.c b/VEX/priv/guest_ppc_helpers.c
e7bd38
index 75497abb9..6bcee966d 100644
e7bd38
--- a/VEX/priv/guest_ppc_helpers.c
e7bd38
+++ b/VEX/priv/guest_ppc_helpers.c
e7bd38
@@ -1905,6 +1905,125 @@ static Double conv_f16_to_double( ULong input )
e7bd38
 #  endif
e7bd38
 }
e7bd38
 
e7bd38
+#define BF16_SIGN_MASK   0x8000
e7bd38
+#define BF16_EXP_MASK    0x7F80
e7bd38
+#define BF16_FRAC_MASK   0x007F
e7bd38
+#define BF16_BIAS        127
e7bd38
+#define BF16_MAX_UNBIASED_EXP 127
e7bd38
+#define BF16_MIN_UNBIASED_EXP -126
e7bd38
+#define FLOAT_SIGN_MASK  0x80000000
e7bd38
+#define FLOAT_EXP_MASK   0x7F800000
e7bd38
+#define FLOAT_FRAC_MASK  0x007FFFFF
e7bd38
+#define FLOAT_FRAC_BIT8  0x00008000
e7bd38
+#define FLOAT_BIAS       127
e7bd38
+
e7bd38
+static Float conv_bf16_to_float( UInt input )
e7bd38
+{
e7bd38
+  /* input is 16-bit bfloat.
e7bd38
+     bias +127, exponent 8-bits, fraction 7-bits
e7bd38
+
e7bd38
+     output is 32-bit float.
e7bd38
+     bias +127, exponent 8-bits, fraction 22-bits
e7bd38
+  */
e7bd38
+
e7bd38
+  UInt input_exp, input_fraction, unbiased_exp;
e7bd38
+  UInt output_exp, output_fraction;
e7bd38
+  UInt sign;
e7bd38
+  union convert_t conv;
e7bd38
+
e7bd38
+  sign = (UInt)(input & BF16_SIGN_MASK);
e7bd38
+  input_exp = input & BF16_EXP_MASK;
e7bd38
+  unbiased_exp = (input_exp >> 7) - (UInt)BF16_BIAS;
e7bd38
+  input_fraction = input & BF16_FRAC_MASK;
e7bd38
+
e7bd38
+  if (((input_exp & BF16_EXP_MASK) == BF16_EXP_MASK) &&
e7bd38
+      (input_fraction != 0)) {
e7bd38
+     /* input is NaN or SNaN, exp all 1's, fraction != 0 */
e7bd38
+     output_exp = FLOAT_EXP_MASK;
e7bd38
+     output_fraction = input_fraction;
e7bd38
+
e7bd38
+  } else if(((input_exp & BF16_EXP_MASK) == BF16_EXP_MASK) &&
e7bd38
+      ( input_fraction == 0)) {
e7bd38
+     /* input is infinity,  exp all 1's, fraction = 0  */
e7bd38
+     output_exp = FLOAT_EXP_MASK;
e7bd38
+     output_fraction = 0;
e7bd38
+
e7bd38
+  } else if((input_exp == 0) && (input_fraction == 0)) {
e7bd38
+     /* input is zero */
e7bd38
+     output_exp = 0;
e7bd38
+     output_fraction = 0;
e7bd38
+
e7bd38
+  } else if((input_exp == 0) && (input_fraction != 0)) {
e7bd38
+     /* input is denormal */
e7bd38
+     output_fraction = input_fraction;
e7bd38
+     output_exp = (-(Int)BF16_BIAS + (Int)FLOAT_BIAS ) << 23;
e7bd38
+
e7bd38
+  } else {
e7bd38
+     /* result is normal */
e7bd38
+     output_exp = (unbiased_exp + FLOAT_BIAS) << 23;
e7bd38
+     output_fraction = input_fraction;
e7bd38
+  }
e7bd38
+
e7bd38
+  conv.u32 = sign << (31 - 15) | output_exp | (output_fraction << (23-7));
e7bd38
+  return conv.f;
e7bd38
+}
e7bd38
+
e7bd38
+static UInt conv_float_to_bf16( UInt input )
e7bd38
+{
e7bd38
+   /* input is 32-bit float stored as unsigned 32-bit.
e7bd38
+      bias +127, exponent 8-bits, fraction 23-bits
e7bd38
+
e7bd38
+      output is 16-bit bfloat.
e7bd38
+      bias +127, exponent 8-bits, fraction 7-bits
e7bd38
+
e7bd38
+      If the unbiased exponent of the input is greater than the max floating
e7bd38
+      point unbiased exponent value, the result of the floating point 16-bit
e7bd38
+      value is infinity.
e7bd38
+   */
e7bd38
+
e7bd38
+   UInt input_exp, input_fraction;
e7bd38
+   UInt output_exp, output_fraction;
e7bd38
+   UInt result, sign;
e7bd38
+
e7bd38
+   sign = input & FLOAT_SIGN_MASK;
e7bd38
+   input_exp = input & FLOAT_EXP_MASK;
e7bd38
+   input_fraction = input & FLOAT_FRAC_MASK;
e7bd38
+
e7bd38
+   if (((input_exp & FLOAT_EXP_MASK) == FLOAT_EXP_MASK) &&
e7bd38
+       (input_fraction != 0)) {
e7bd38
+      /* input is NaN or SNaN, exp all 1's, fraction != 0 */
e7bd38
+      output_exp = BF16_EXP_MASK;
e7bd38
+      output_fraction = (ULong)input_fraction >> (23 - 7);
e7bd38
+   } else if (((input_exp & FLOAT_EXP_MASK) == FLOAT_EXP_MASK) &&
e7bd38
+              ( input_fraction == 0)) {
e7bd38
+      /* input is infinity,  exp all 1's, fraction = 0  */
e7bd38
+      output_exp = BF16_EXP_MASK;
e7bd38
+      output_fraction = 0;
e7bd38
+   } else if ((input_exp == 0) && (input_fraction == 0)) {
e7bd38
+      /* input is zero */
e7bd38
+      output_exp = 0;
e7bd38
+      output_fraction = 0;
e7bd38
+   } else if ((input_exp == 0) && (input_fraction != 0)) {
e7bd38
+      /* input is denormal */
e7bd38
+      output_exp = 0;
e7bd38
+      output_fraction = (ULong)input_fraction >> (23 - 7);
e7bd38
+   } else {
e7bd38
+      /* result is normal */
e7bd38
+      output_exp = (input_exp - BF16_BIAS + FLOAT_BIAS) >> (23 - 7);
e7bd38
+      output_fraction = (ULong)input_fraction >> (23 - 7);
e7bd38
+
e7bd38
+      /* Round result. Look at the 8th bit position of the 32-bit floating
e7bd38
+         pointt fraction.  The F16 fraction is only 7 bits wide so if the 8th
e7bd38
+         bit of the F32 is a 1 we need to round up by adding 1 to the output
e7bd38
+         fraction.  */
e7bd38
+      if ((input_fraction & FLOAT_FRAC_BIT8) == FLOAT_FRAC_BIT8)
e7bd38
+         /* Round the F16 fraction up by 1 */
e7bd38
+         output_fraction = output_fraction + 1;
e7bd38
+   }
e7bd38
+
e7bd38
+   result = sign >> (31 - 15) | output_exp | output_fraction;
e7bd38
+   return result;
e7bd38
+}
e7bd38
 
e7bd38
 static Float conv_double_to_float( Double src )
e7bd38
 {
e7bd38
@@ -1942,6 +2061,36 @@ static Float negate_float( Float input )
e7bd38
       return -input;
e7bd38
 }
e7bd38
 
e7bd38
+/* This C-helper takes a vector of two 32-bit floating point values
e7bd38
+ * and returns a vector containing two 16-bit bfloats.
e7bd38
+   input:    word0           word1
e7bd38
+   output  0x0   hword1   0x0    hword3
e7bd38
+   Called from generated code.
e7bd38
+ */
e7bd38
+ULong convert_from_floattobf16_helper( ULong src ) {
e7bd38
+   ULong resultHi, resultLo;
e7bd38
+
e7bd38
+   resultHi = (ULong)conv_float_to_bf16( (UInt)(src >> 32));
e7bd38
+   resultLo = (ULong)conv_float_to_bf16( (UInt)(src & 0xFFFFFFFF));
e7bd38
+   return (resultHi << 32) | resultLo;
e7bd38
+
e7bd38
+}
e7bd38
+
e7bd38
+/* This C-helper takes a vector of two 16-bit bfloating point values
e7bd38
+ * and returns a vector containing one 32-bit float.
e7bd38
+   input:   0x0   hword1   0x0    hword3
e7bd38
+   output:    word0           word1
e7bd38
+ */
e7bd38
+ULong convert_from_bf16tofloat_helper( ULong src ) {
e7bd38
+   ULong result;
e7bd38
+   union convert_t conv;
e7bd38
+   conv.f = conv_bf16_to_float( (UInt)(src >> 32) );
e7bd38
+   result = (ULong) conv.u32;
e7bd38
+   conv.f = conv_bf16_to_float( (UInt)(src & 0xFFFFFFFF));
e7bd38
+   result = (result << 32) | (ULong) conv.u32;
e7bd38
+   return result;
e7bd38
+ }
e7bd38
+
e7bd38
 void vsx_matrix_16bit_float_ger_dirty_helper( VexGuestPPC64State* gst,
e7bd38
                                               UInt offset_ACC,
e7bd38
                                               ULong srcA_hi, ULong srcA_lo,
e7bd38
@@ -2002,24 +2151,44 @@ void vsx_matrix_16bit_float_ger_dirty_helper( VexGuestPPC64State* gst,
e7bd38
          srcB_word[0][j] = (UInt)((srcB_lo >> (16-16*j)) & mask);
e7bd38
       }
e7bd38
 
e7bd38
+      /* Note the isa is not consistent in the src naming.  Will use the
e7bd38
+         naming src10, src11, src20, src21 used with xvf16ger2 instructions.
e7bd38
+      */
e7bd38
       for( j = 0; j < 4; j++) {
e7bd38
          if (((pmsk >> 1) & 0x1) == 0) {
e7bd38
             src10 = 0;
e7bd38
             src20 = 0;
e7bd38
          } else {
e7bd38
-            src10 = conv_f16_to_double((ULong)srcA_word[i][0]);
e7bd38
-            src20 = conv_f16_to_double((ULong)srcB_word[j][0]);
e7bd38
+            if (( inst  == XVF16GER2 ) || ( inst  == XVF16GER2PP )
e7bd38
+                || ( inst == XVF16GER2PN ) || ( inst  == XVF16GER2NP )
e7bd38
+                || ( inst == XVF16GER2NN )) {
e7bd38
+               src10 = conv_f16_to_double((ULong)srcA_word[i][0]);
e7bd38
+               src20 = conv_f16_to_double((ULong)srcB_word[j][0]);
e7bd38
+            } else {
e7bd38
+               /* Input is in bfloat format, result is stored in the
e7bd38
+                  "traditional" 64-bit float format. */
e7bd38
+               src10 = (double)conv_bf16_to_float((ULong)srcA_word[i][0]);
e7bd38
+               src20 = (double)conv_bf16_to_float((ULong)srcB_word[j][0]);
e7bd38
+            }
e7bd38
          }
e7bd38
 
e7bd38
          if ((pmsk & 0x1) == 0) {
e7bd38
             src11 = 0;
e7bd38
             src21 = 0;
e7bd38
          } else {
e7bd38
-            src11 = conv_f16_to_double((ULong)srcA_word[i][1]);
e7bd38
-            src21 = conv_f16_to_double((ULong)srcB_word[j][1]);
e7bd38
+            if (( inst  == XVF16GER2 ) || ( inst  == XVF16GER2PP )
e7bd38
+                || ( inst == XVF16GER2PN ) || ( inst  == XVF16GER2NP )
e7bd38
+                || ( inst == XVF16GER2NN )) {
e7bd38
+               src11 = conv_f16_to_double((ULong)srcA_word[i][1]);
e7bd38
+               src21 = conv_f16_to_double((ULong)srcB_word[j][1]);
e7bd38
+            } else {
e7bd38
+               /* Input is in bfloat format, result is stored in the
e7bd38
+                  "traditional" 64-bit float format. */
e7bd38
+               src11 = (double)conv_bf16_to_float((ULong)srcA_word[i][1]);
e7bd38
+               src21 = (double)conv_bf16_to_float((ULong)srcB_word[j][1]);
e7bd38
+            }
e7bd38
          }
e7bd38
 
e7bd38
-
e7bd38
          prod = src10 * src20;
e7bd38
          msum = prod + src11 * src21;
e7bd38
 
e7bd38
@@ -2027,26 +2196,26 @@ void vsx_matrix_16bit_float_ger_dirty_helper( VexGuestPPC64State* gst,
e7bd38
             /* Note, we do not track the exception handling bits
e7bd38
                ox, ux, xx, si, mz, vxsnan and vximz in the FPSCR.  */
e7bd38
 
e7bd38
-            if ( inst == XVF16GER2 )
e7bd38
+            if (( inst == XVF16GER2 ) || ( inst == XVBF16GER2 ) )
e7bd38
                result[j] = reinterpret_float_as_int(
e7bd38
                   conv_double_to_float(msum) );
e7bd38
 
e7bd38
-            else if ( inst == XVF16GER2PP )
e7bd38
+            else if (( inst == XVF16GER2PP ) ||  (inst == XVBF16GER2PP ))
e7bd38
                result[j] = reinterpret_float_as_int(
e7bd38
                   conv_double_to_float(msum)
e7bd38
                   + acc_word[j] );
e7bd38
 
e7bd38
-            else if ( inst == XVF16GER2PN )
e7bd38
+            else if (( inst == XVF16GER2PN ) || ( inst == XVBF16GER2PN ))
e7bd38
                result[j] = reinterpret_float_as_int(
e7bd38
                   conv_double_to_float(msum)
e7bd38
                   + negate_float( acc_word[j] ) );
e7bd38
 
e7bd38
-            else if ( inst == XVF16GER2NP )
e7bd38
+            else if (( inst == XVF16GER2NP ) || ( inst == XVBF16GER2NP ))
e7bd38
                result[j] = reinterpret_float_as_int(
e7bd38
                   conv_double_to_float( negate_double( msum ) )
e7bd38
                   + acc_word[j] );
e7bd38
 
e7bd38
-            else if ( inst == XVF16GER2NN )
e7bd38
+            else if (( inst == XVF16GER2NN ) || ( inst == XVBF16GER2NN ))
e7bd38
                result[j] = reinterpret_float_as_int(
e7bd38
                   conv_double_to_float( negate_double( msum ) )
e7bd38
                   + negate_float( acc_word[j] ) );
e7bd38
diff --git a/VEX/priv/guest_ppc_toIR.c b/VEX/priv/guest_ppc_toIR.c
e7bd38
index 354be6b53..20553a539 100644
e7bd38
--- a/VEX/priv/guest_ppc_toIR.c
e7bd38
+++ b/VEX/priv/guest_ppc_toIR.c
e7bd38
@@ -5688,6 +5688,57 @@ static IRExpr * convert_from_national ( const VexAbiInfo* vbi, IRExpr *src ) {
e7bd38
    return mkexpr( result );
e7bd38
 }
e7bd38
 
e7bd38
+static IRExpr * vector_convert_floattobf16 ( const VexAbiInfo* vbi,
e7bd38
+                                             IRExpr *src ) {
e7bd38
+   /* The function takes 128-bit value containing four 32-bit floats and
e7bd38
+      returns a 128-bit value containint four 16-bit bfloats in the lower
e7bd38
+      halfwords. */
e7bd38
+
e7bd38
+   IRTemp resultHi = newTemp( Ity_I64);
e7bd38
+   IRTemp resultLo = newTemp( Ity_I64);
e7bd38
+
e7bd38
+   assign( resultHi,
e7bd38
+           mkIRExprCCall( Ity_I64, 0 /*regparms*/,
e7bd38
+                          "vector_convert_floattobf16_helper",
e7bd38
+                          fnptr_to_fnentry( vbi,
e7bd38
+                                            &convert_from_floattobf16_helper ),
e7bd38
+                          mkIRExprVec_1( unop( Iop_V128HIto64, src ) ) ) );
e7bd38
+
e7bd38
+   assign( resultLo,
e7bd38
+           mkIRExprCCall( Ity_I64, 0 /*regparms*/,
e7bd38
+                          "vector_convert_floattobf16_helper",
e7bd38
+                          fnptr_to_fnentry( vbi,
e7bd38
+                                            &convert_from_floattobf16_helper ),
e7bd38
+                          mkIRExprVec_1( unop( Iop_V128to64, src ) ) ) );
e7bd38
+
e7bd38
+   return binop( Iop_64HLtoV128, mkexpr( resultHi ), mkexpr( resultLo ) );
e7bd38
+}
e7bd38
+
e7bd38
+static IRExpr * vector_convert_bf16tofloat ( const VexAbiInfo* vbi,
e7bd38
+                                             IRExpr *src ) {
e7bd38
+   /* The function takes 128-bit value containing four 16-bit bfloats in
e7bd38
+      the lower halfwords and returns a 128-bit value containint four
e7bd38
+      32-bit floats. */
e7bd38
+   IRTemp resultHi = newTemp( Ity_I64);
e7bd38
+   IRTemp resultLo = newTemp( Ity_I64);
e7bd38
+
e7bd38
+   assign( resultHi,
e7bd38
+           mkIRExprCCall( Ity_I64, 0 /*regparms*/,
e7bd38
+                          "vector_convert_bf16tofloat_helper",
e7bd38
+                          fnptr_to_fnentry( vbi,
e7bd38
+                                            &convert_from_bf16tofloat_helper ),
e7bd38
+                          mkIRExprVec_1( unop( Iop_V128HIto64, src ) ) ) );
e7bd38
+
e7bd38
+   assign( resultLo,
e7bd38
+           mkIRExprCCall( Ity_I64, 0 /*regparms*/,
e7bd38
+                          "vector_convert_bf16tofloat_helper",
e7bd38
+                          fnptr_to_fnentry( vbi,
e7bd38
+                                            &convert_from_bf16tofloat_helper ),
e7bd38
+                          mkIRExprVec_1( unop( Iop_V128to64, src ) ) ) );
e7bd38
+
e7bd38
+   return binop( Iop_64HLtoV128, mkexpr( resultHi ), mkexpr( resultLo ) );
e7bd38
+}
e7bd38
+
e7bd38
 static IRExpr * popcnt64 ( const VexAbiInfo* vbi,
e7bd38
                            IRExpr *src ){
e7bd38
    /* The function takes a 64-bit source and counts the number of bits in the
e7bd38
@@ -5936,6 +5987,7 @@ static void vsx_matrix_ger ( const VexAbiInfo* vbi,
e7bd38
    case XVI16GER2:
e7bd38
    case XVI16GER2S:
e7bd38
    case XVF16GER2:
e7bd38
+   case XVBF16GER2:
e7bd38
    case XVF32GER:
e7bd38
          AT_fx = Ifx_Write;
e7bd38
          break;
e7bd38
@@ -5943,6 +5995,10 @@ static void vsx_matrix_ger ( const VexAbiInfo* vbi,
e7bd38
    case XVI8GER4PP:
e7bd38
    case XVI16GER2PP:
e7bd38
    case XVI16GER2SPP:
e7bd38
+   case XVBF16GER2PP:
e7bd38
+   case XVBF16GER2PN:
e7bd38
+   case XVBF16GER2NP:
e7bd38
+   case XVBF16GER2NN:
e7bd38
    case XVF16GER2PP:
e7bd38
    case XVF16GER2PN:
e7bd38
    case XVF16GER2NP:
e7bd38
@@ -23899,6 +23955,24 @@ dis_vxs_misc( UInt prefix, UInt theInstr, const VexAbiInfo* vbi, UInt opc2,
e7bd38
                                     mkexpr( sub_element1 ),
e7bd38
                                     mkexpr( sub_element0 ) ) ) );
e7bd38
 
e7bd38
+         } else if ((inst_select == 16) && !prefix) {
e7bd38
+            IRTemp result = newTemp(Ity_V128);
e7bd38
+            UChar xT_addr = ifieldRegXT ( theInstr );
e7bd38
+            UChar xB_addr = ifieldRegXB ( theInstr );
e7bd38
+            /* Convert 16-bit bfloat to 32-bit float, not a prefix inst */
e7bd38
+            DIP("xvcvbf16sp v%u,v%u\n", xT_addr, xB_addr);
e7bd38
+            assign( result, vector_convert_bf16tofloat( vbi, mkexpr( vB ) ) );
e7bd38
+            putVSReg( XT, mkexpr( result) );
e7bd38
+
e7bd38
+         } else if ((inst_select == 17) && !prefix) {
e7bd38
+            IRTemp result = newTemp(Ity_V128);
e7bd38
+            UChar xT_addr = ifieldRegXT ( theInstr );
e7bd38
+            UChar xB_addr = ifieldRegXB ( theInstr );
e7bd38
+            /* Convert 32-bit float to 16-bit bfloat, not a prefix inst */
e7bd38
+            DIP("xvcvspbf16 v%u,v%u\n", xT_addr, xB_addr);
e7bd38
+            assign( result, vector_convert_floattobf16( vbi, mkexpr( vB ) ) );
e7bd38
+            putVSReg( XT, mkexpr( result) );
e7bd38
+
e7bd38
          } else if (inst_select == 23) {
e7bd38
             DIP("xxbrd v%u, v%u\n", (UInt)XT, (UInt)XB);
e7bd38
 
e7bd38
@@ -34956,6 +35030,41 @@ static Bool dis_vsx_accumulator_prefix ( UInt prefix, UInt theInstr,
e7bd38
                          getVSReg( rB_addr ), AT,
e7bd38
                          ( ( inst_prefix << 8 ) | XO ) );
e7bd38
          break;
e7bd38
+      case XVBF16GER2:
e7bd38
+         DIP("xvbf16ger2 %u,r%u, r%u\n", AT, rA_addr, rB_addr);
e7bd38
+         vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
e7bd38
+                         getVSReg( rA_addr ),
e7bd38
+                         getVSReg( rB_addr ), AT,
e7bd38
+                         ( ( inst_prefix << 8 ) | XO ) );
e7bd38
+         break;
e7bd38
+      case XVBF16GER2PP:
e7bd38
+         DIP("xvbf16ger2pp %u,r%u, r%u\n", AT, rA_addr, rB_addr);
e7bd38
+         vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
e7bd38
+                         getVSReg( rA_addr ),
e7bd38
+                         getVSReg( rB_addr ), AT,
e7bd38
+                         ( ( inst_prefix << 8 ) | XO ) );
e7bd38
+         break;
e7bd38
+      case XVBF16GER2PN:
e7bd38
+         DIP("xvbf16ger2pn %u,r%u, r%u\n", AT, rA_addr, rB_addr);
e7bd38
+         vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
e7bd38
+                         getVSReg( rA_addr ),
e7bd38
+                         getVSReg( rB_addr ), AT,
e7bd38
+                         ( ( inst_prefix << 8 ) | XO ) );
e7bd38
+         break;
e7bd38
+      case XVBF16GER2NP:
e7bd38
+         DIP("xvbf16ger2np %u,r%u, r%u\n", AT, rA_addr, rB_addr);
e7bd38
+         vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
e7bd38
+                         getVSReg( rA_addr ),
e7bd38
+                         getVSReg( rB_addr ), AT,
e7bd38
+                         ( ( inst_prefix << 8 ) | XO ) );
e7bd38
+         break;
e7bd38
+      case XVBF16GER2NN:
e7bd38
+         DIP("xvbf16ger2nn %u,r%u, r%u\n", AT, rA_addr, rB_addr);
e7bd38
+         vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
e7bd38
+                         getVSReg( rA_addr ),
e7bd38
+                         getVSReg( rB_addr ), AT,
e7bd38
+                         ( ( inst_prefix << 8 ) | XO ) );
e7bd38
+         break;
e7bd38
       case XVF32GER:
e7bd38
          DIP("xvf32ger %u,r%u, r%u\n", AT, rA_addr, rB_addr);
e7bd38
          vsx_matrix_ger( vbi, MATRIX_32BIT_FLOAT_GER,
e7bd38
@@ -35106,6 +35215,61 @@ static Bool dis_vsx_accumulator_prefix ( UInt prefix, UInt theInstr,
e7bd38
                          AT,
e7bd38
                          ( (MASKS << 9 ) | ( inst_prefix << 8 ) | XO ) );
e7bd38
          break;
e7bd38
+      case XVBF16GER2:
e7bd38
+         PMSK = IFIELD( prefix, 14, 2);
e7bd38
+         XMSK = IFIELD( prefix, 4, 4);
e7bd38
+         YMSK = IFIELD( prefix, 0, 4);
e7bd38
+         DIP("pmxvbf16ger2 %u,r%u, r%u\n", AT, rA_addr, rB_addr);
e7bd38
+         vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
e7bd38
+                         getVSReg( rA_addr ),
e7bd38
+                         getVSReg( rB_addr ),
e7bd38
+                         AT, ( (MASKS << 9 )
e7bd38
+                               | ( inst_prefix << 8 ) | XO ) );
e7bd38
+         break;
e7bd38
+      case XVBF16GER2PP:
e7bd38
+         PMSK = IFIELD( prefix, 14, 2);
e7bd38
+         XMSK = IFIELD( prefix, 4, 4);
e7bd38
+         YMSK = IFIELD( prefix, 0, 4);
e7bd38
+         DIP("pmxvbf16ger2pp %u,r%u, r%u\n", AT, rA_addr, rB_addr);
e7bd38
+         vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
e7bd38
+                         getVSReg( rA_addr ),
e7bd38
+                         getVSReg( rB_addr ),
e7bd38
+                         AT, ( (MASKS << 9 )
e7bd38
+                               | ( inst_prefix << 8 ) | XO ) );
e7bd38
+         break;
e7bd38
+      case XVBF16GER2PN:
e7bd38
+         PMSK = IFIELD( prefix, 14, 2);
e7bd38
+         XMSK = IFIELD( prefix, 4, 4);
e7bd38
+         YMSK = IFIELD( prefix, 0, 4);
e7bd38
+         DIP("pmxvbf16ger2pn %u,r%u, r%u\n", AT, rA_addr, rB_addr);
e7bd38
+         vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
e7bd38
+                         getVSReg( rA_addr ),
e7bd38
+                         getVSReg( rB_addr ),
e7bd38
+                         AT, ( (MASKS << 9 )
e7bd38
+                               | ( inst_prefix << 8 ) | XO ) );
e7bd38
+         break;
e7bd38
+      case XVBF16GER2NP:
e7bd38
+         PMSK = IFIELD( prefix, 14, 2);
e7bd38
+         XMSK = IFIELD( prefix, 4, 4);
e7bd38
+         YMSK = IFIELD( prefix, 0, 4);
e7bd38
+         DIP("pmxvbf16ger2np %u,r%u, r%u\n", AT, rA_addr, rB_addr);
e7bd38
+         vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
e7bd38
+                         getVSReg( rA_addr ),
e7bd38
+                         getVSReg( rB_addr ),
e7bd38
+                         AT, ( (MASKS << 9 )
e7bd38
+                               | ( inst_prefix << 8 ) | XO ) );
e7bd38
+         break;
e7bd38
+      case XVBF16GER2NN:
e7bd38
+         PMSK = IFIELD( prefix, 14, 2);
e7bd38
+         XMSK = IFIELD( prefix, 4, 4);
e7bd38
+         YMSK = IFIELD( prefix, 0, 4);
e7bd38
+         DIP("pmxvbf16ger2nn %u,r%u, r%u\n", AT, rA_addr, rB_addr);
e7bd38
+         vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
e7bd38
+                         getVSReg( rA_addr ),
e7bd38
+                         getVSReg( rB_addr ),
e7bd38
+                         AT, ( (MASKS << 9 )
e7bd38
+                               | ( inst_prefix << 8 ) | XO ) );
e7bd38
+         break;
e7bd38
       case XVF16GER2:
e7bd38
          PMSK = IFIELD( prefix, 14, 2);
e7bd38
          XMSK = IFIELD( prefix, 4, 4);
e7bd38
@@ -36181,6 +36345,11 @@ DisResult disInstr_PPC_WRK (
e7bd38
              (opc2 == XVI4GER8PP)     ||       // xvi4ger8pp
e7bd38
              (opc2 == XVI8GER4)       ||       // xvi8ger4
e7bd38
              (opc2 == XVI8GER4PP)     ||       // xvi8ger4pp
e7bd38
+             (opc2 == XVBF16GER2)     ||       // xvbf16ger2
e7bd38
+             (opc2 == XVBF16GER2PP)   ||       // xvbf16ger2pp
e7bd38
+             (opc2 == XVBF16GER2PN)   ||       // xvbf16ger2pn
e7bd38
+             (opc2 == XVBF16GER2NP)   ||       // xvbf16ger2np
e7bd38
+             (opc2 == XVBF16GER2NN)   ||       // xvbf16ger2nn
e7bd38
              (opc2 == XVF16GER2)      ||       // xvf16ger2
e7bd38
              (opc2 == XVF16GER2PP)    ||       // xvf16ger2pp
e7bd38
              (opc2 == XVF16GER2PN)    ||       // xvf16ger2pn
e7bd38
commit e09fdaf569b975717465ed8043820d0198d4d47d
e7bd38
Author: Carl Love <cel@us.ibm.com>
e7bd38
Date:   Fri Feb 26 16:05:12 2021 -0600
e7bd38
e7bd38
    PPC64: Reduced-Precision: Missing Integer-based Outer Product Operations
e7bd38
    
e7bd38
    Add support for:
e7bd38
    
e7bd38
    pmxvi16ger2 VSX Vector 16-bit Signed Integer GER (rank-2 update), Prefixed
e7bd38
       Masked
e7bd38
    pmxvi16ger2pp VSX Vector 16-bit Signed Integer GER (rank-2 update) (Positive
e7bd38
       multiply, Positive accumulate), Prefixed Masked
e7bd38
    pmxvi8ger4spp VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) with
e7bd38
       Saturation (Positive multiply, Positive accumulate), Prefixed Masked
e7bd38
    xvi16ger2 VSX Vector 16-bit Signed Integer GER (rank-2 update)
e7bd38
    xvi16ger2pp VSX Vector 16-bit Signed Integer GER (rank-2 update) (Positive
e7bd38
       multiply, Positive accumulate)
e7bd38
    xvi8ger4spp VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) with
e7bd38
       Saturation (Positive multiply, Positive accumulate)
e7bd38
e7bd38
diff --git a/VEX/priv/guest_ppc_helpers.c b/VEX/priv/guest_ppc_helpers.c
e7bd38
index 6bcee966d..d8131eb60 100644
e7bd38
--- a/VEX/priv/guest_ppc_helpers.c
e7bd38
+++ b/VEX/priv/guest_ppc_helpers.c
e7bd38
@@ -1446,16 +1446,16 @@ static UInt exts4( UInt src)
e7bd38
       return src & 0xF;        /* make sure high order bits are zero */
e7bd38
 }
e7bd38
 
e7bd38
-static UInt exts8( UInt src)
e7bd38
+static ULong exts8( UInt src)
e7bd38
 {
e7bd38
-   /* Input is an 8-bit value.  Extend bit 7 to bits [31:8] */
e7bd38
+   /* Input is an 8-bit value.  Extend bit 7 to bits [63:8] */
e7bd38
    if (( src >> 7 ) & 0x1)
e7bd38
-      return src | 0xFFFFFF00; /* sign bit is a 1, extend */
e7bd38
+      return src | 0xFFFFFFFFFFFFFF00ULL; /* sign bit is a 1, extend */
e7bd38
    else
e7bd38
       return src & 0xFF;        /* make sure high order bits are zero */
e7bd38
 }
e7bd38
 
e7bd38
-static UInt extz8( UInt src)
e7bd38
+static ULong extz8( UInt src)
e7bd38
 {
e7bd38
    /* Input is an 8-bit value.  Extend src on the left with zeros.  */
e7bd38
    return src & 0xFF;        /* make sure high order bits are zero */
e7bd38
@@ -1662,12 +1662,12 @@ void vsx_matrix_8bit_ger_dirty_helper( VexGuestPPC64State* gst,
e7bd38
                                        ULong srcB_hi, ULong srcB_lo,
e7bd38
                                        UInt masks_inst )
e7bd38
 {
e7bd38
-   UInt i, j, mask, sum, inst, acc_entry, prefix_inst;
e7bd38
+   UInt i, j, mask, inst, acc_entry, prefix_inst;
e7bd38
 
e7bd38
    UInt srcA_bytes[4][4];   /* word, byte */
e7bd38
    UInt srcB_bytes[4][4];   /* word, byte */
e7bd38
    UInt acc_word[4];
e7bd38
-   UInt prod0, prod1, prod2, prod3;
e7bd38
+   ULong prod0, prod1, prod2, prod3, sum;
e7bd38
    UInt result[4];
e7bd38
    UInt pmsk = 0;
e7bd38
    UInt xmsk = 0;
e7bd38
@@ -1742,10 +1742,13 @@ void vsx_matrix_8bit_ger_dirty_helper( VexGuestPPC64State* gst,
e7bd38
             sum = prod0 + prod1 + prod2 + prod3;
e7bd38
 
e7bd38
             if ( inst == XVI8GER4 )
e7bd38
-               result[j] = sum;
e7bd38
+               result[j] = chop64to32( sum );
e7bd38
 
e7bd38
             else if ( inst == XVI8GER4PP )
e7bd38
-               result[j] = sum + acc_word[j];
e7bd38
+               result[j] = chop64to32( sum + acc_word[j] );
e7bd38
+
e7bd38
+            else if ( inst == XVI8GER4SPP )
e7bd38
+               result[j] = clampS64toS32(sum + acc_word[j]);
e7bd38
 
e7bd38
          } else {
e7bd38
             result[j] = 0;
e7bd38
@@ -1821,7 +1824,7 @@ void vsx_matrix_16bit_ger_dirty_helper( VexGuestPPC64State* gst,
e7bd38
             else
e7bd38
                prod1 = exts16to64( srcA_word[i][1] )
e7bd38
                   * exts16to64( srcB_word[j][1] );
e7bd38
-            /* sum is UInt so the result is choped to 32-bits */
e7bd38
+
e7bd38
             sum = prod0 + prod1;
e7bd38
 
e7bd38
             if ( inst == XVI16GER2 )
e7bd38
@@ -1830,13 +1833,11 @@ void vsx_matrix_16bit_ger_dirty_helper( VexGuestPPC64State* gst,
e7bd38
             else if ( inst == XVI16GER2S )
e7bd38
                result[j] = clampS64toS32( sum );
e7bd38
 
e7bd38
-            else if ( inst == XVI16GER2PP ) {
e7bd38
+            else if ( inst == XVI16GER2PP )
e7bd38
                result[j] = chop64to32( sum + acc_word[j] );
e7bd38
-            }
e7bd38
 
e7bd38
-            else if ( inst == XVI16GER2SPP ) {
e7bd38
+            else if ( inst == XVI16GER2SPP )
e7bd38
                result[j] = clampS64toS32( sum + acc_word[j] );
e7bd38
-            }
e7bd38
 
e7bd38
          } else {
e7bd38
             result[j] = 0;
e7bd38
diff --git a/VEX/priv/guest_ppc_toIR.c b/VEX/priv/guest_ppc_toIR.c
e7bd38
index 20553a539..e54f0f389 100644
e7bd38
--- a/VEX/priv/guest_ppc_toIR.c
e7bd38
+++ b/VEX/priv/guest_ppc_toIR.c
e7bd38
@@ -5993,6 +5993,7 @@ static void vsx_matrix_ger ( const VexAbiInfo* vbi,
e7bd38
          break;
e7bd38
    case XVI4GER8PP:
e7bd38
    case XVI8GER4PP:
e7bd38
+   case XVI8GER4SPP:
e7bd38
    case XVI16GER2PP:
e7bd38
    case XVI16GER2SPP:
e7bd38
    case XVBF16GER2PP:
e7bd38
@@ -34983,6 +34984,12 @@ static Bool dis_vsx_accumulator_prefix ( UInt prefix, UInt theInstr,
e7bd38
                          getVSReg( rA_addr ), getVSReg( rB_addr ),
e7bd38
                          AT, ( ( inst_prefix << 8 ) | XO ) );
e7bd38
          break;
e7bd38
+      case XVI8GER4SPP:
e7bd38
+         DIP("xvi8ger4spp %u,r%u, r%u\n", AT, rA_addr, rB_addr);
e7bd38
+         vsx_matrix_ger( vbi, MATRIX_8BIT_INT_GER,
e7bd38
+                         getVSReg( rA_addr ), getVSReg( rB_addr ),
e7bd38
+                         AT, ( ( inst_prefix << 8 ) | XO ) );
e7bd38
+         break;
e7bd38
       case XVI16GER2S:
e7bd38
          DIP("xvi16ger2s %u,r%u, r%u\n", AT, rA_addr, rB_addr);
e7bd38
          vsx_matrix_ger( vbi, MATRIX_16BIT_INT_GER,
e7bd38
@@ -34995,6 +35002,19 @@ static Bool dis_vsx_accumulator_prefix ( UInt prefix, UInt theInstr,
e7bd38
                          getVSReg( rA_addr ), getVSReg( rB_addr ),
e7bd38
                          AT, ( ( inst_prefix << 8 ) | XO ) );
e7bd38
          break;
e7bd38
+      case XVI16GER2:
e7bd38
+         DIP("xvi16ger2 %u,r%u, r%u\n", AT, rA_addr, rB_addr);
e7bd38
+         vsx_matrix_ger( vbi, MATRIX_16BIT_INT_GER,
e7bd38
+                         getVSReg( rA_addr ), getVSReg( rB_addr ),
e7bd38
+                         AT, ( ( inst_prefix << 8 ) | XO ) );
e7bd38
+         break;
e7bd38
+      case XVI16GER2PP:
e7bd38
+         DIP("xvi16ger2pp %u,r%u, r%u\n", AT, rA_addr, rB_addr);
e7bd38
+         vsx_matrix_ger( vbi, MATRIX_16BIT_INT_GER,
e7bd38
+                         getVSReg( rA_addr ), getVSReg( rB_addr ),
e7bd38
+                         AT, ( ( inst_prefix << 8 ) | XO ) );
e7bd38
+         break;
e7bd38
+
e7bd38
       case XVF16GER2:
e7bd38
          DIP("xvf16ger2 %u,r%u, r%u\n", AT, rA_addr, rB_addr);
e7bd38
          vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,
e7bd38
@@ -35193,6 +35213,39 @@ static Bool dis_vsx_accumulator_prefix ( UInt prefix, UInt theInstr,
e7bd38
                          AT,
e7bd38
                          ( (MASKS << 9 ) | ( inst_prefix << 8 ) | XO ) );
e7bd38
          break;
e7bd38
+      case XVI8GER4SPP:
e7bd38
+         PMSK = IFIELD( prefix, 12, 4);
e7bd38
+         XMSK = IFIELD( prefix, 4, 4);
e7bd38
+         YMSK = IFIELD( prefix, 0, 4);
e7bd38
+         DIP("pmxvi8ger4spp %u,r%u, r%u,%u,%u,%u\n",
e7bd38
+             AT, rA_addr, rB_addr, XMSK, YMSK, PMSK);
e7bd38
+         vsx_matrix_ger( vbi, MATRIX_8BIT_INT_GER,
e7bd38
+                         getVSReg( rA_addr ), getVSReg( rB_addr ),
e7bd38
+                         AT,
e7bd38
+                         ( (MASKS << 9 ) | ( inst_prefix << 8 ) | XO ) );
e7bd38
+         break;
e7bd38
+      case XVI16GER2:
e7bd38
+         PMSK = IFIELD( prefix, 12, 4);
e7bd38
+         XMSK = IFIELD( prefix, 4, 4);
e7bd38
+         YMSK = IFIELD( prefix, 0, 4);
e7bd38
+         DIP("pmxvi16ger2 %u,r%u, r%u,%u,%u,%u\n",
e7bd38
+             AT, rA_addr, rB_addr, XMSK, YMSK, PMSK);
e7bd38
+         vsx_matrix_ger( vbi, MATRIX_16BIT_INT_GER,
e7bd38
+                         getVSReg( rA_addr ), getVSReg( rB_addr ),
e7bd38
+                         AT,
e7bd38
+                         ( (MASKS << 9 ) | ( inst_prefix << 8 ) | XO ) );
e7bd38
+         break;
e7bd38
+      case XVI16GER2PP:
e7bd38
+         PMSK = IFIELD( prefix, 12, 4);
e7bd38
+         XMSK = IFIELD( prefix, 4, 4);
e7bd38
+         YMSK = IFIELD( prefix, 0, 4);
e7bd38
+         DIP("pmxvi16ger2pp %u,r%u, r%u,%u,%u,%u\n",
e7bd38
+             AT, rA_addr, rB_addr, XMSK, YMSK, PMSK);
e7bd38
+         vsx_matrix_ger( vbi, MATRIX_16BIT_INT_GER,
e7bd38
+                         getVSReg( rA_addr ), getVSReg( rB_addr ),
e7bd38
+                         AT,
e7bd38
+                         ( (MASKS << 9 ) | ( inst_prefix << 8 ) | XO ) );
e7bd38
+         break;
e7bd38
       case XVI16GER2S:
e7bd38
          PMSK = IFIELD( prefix, 14, 2);
e7bd38
          XMSK = IFIELD( prefix, 4, 4);
e7bd38
@@ -36345,6 +36398,9 @@ DisResult disInstr_PPC_WRK (
e7bd38
              (opc2 == XVI4GER8PP)     ||       // xvi4ger8pp
e7bd38
              (opc2 == XVI8GER4)       ||       // xvi8ger4
e7bd38
              (opc2 == XVI8GER4PP)     ||       // xvi8ger4pp
e7bd38
+             (opc2 == XVI8GER4SPP)    ||       // xvi8ger4spp
e7bd38
+             (opc2 == XVI16GER2)      ||       // xvi16ger2
e7bd38
+             (opc2 == XVI16GER2PP)    ||       // xvi16ger2pp
e7bd38
              (opc2 == XVBF16GER2)     ||       // xvbf16ger2
e7bd38
              (opc2 == XVBF16GER2PP)   ||       // xvbf16ger2pp
e7bd38
              (opc2 == XVBF16GER2PN)   ||       // xvbf16ger2pn