You can also write them yourself, but I'd add a verifier that checks the output against scalar code, as it can be tricky to get right.
Intel had an article on the 3x8 transpose, but it no longer seems to exist, so I'll just post the pseudocode:
//xyz -> xxx void swizzle3_AoS_to_SoA(v8float &x, v8float &y, v8float &z) { v8float m14 = interleave_low_high<1, 2>(x, z); //swap low/high 128 bits v8float m03 = blend<0, 0, 0, 0, 1, 1, 1, 1>(x, y); //_mm256_blend_ps 1 cycle v8float m25 = blend<0, 0, 0, 0, 1, 1, 1, 1>(y, z); //shuffles are all 1 cycle __m256 xy = _mm256_shuffle_ps(m14, m25, _MM_SHUFFLE(2, 1, 3, 2)); // upper x's and y's __m256 yz = _mm256_shuffle_ps(m03, m14, _MM_SHUFFLE(1, 0, 2, 1)); // lower y's and z's v8float xo = _mm256_shuffle_ps(m03, xy, _MM_SHUFFLE(2, 0, 3, 0)); v8float yo = _mm256_shuffle_ps(yz, xy, _MM_SHUFFLE(3, 1, 2, 0)); v8float zo = _mm256_shuffle_ps(yz, m25, _MM_SHUFFLE(3, 0, 3, 1)); x = xo; y = yo; z = zo; }