Interesting Aspects of the Compilation Process of icgrep

Zhenan (Carl) Cui

301220129

zca37@sfu.ca

1. CPU feature detection

  • Uses LLVM's sys::getHostCPUFeatures to get the list of features supported by the host CPU
  • Checks for AVX and AVX2 support, together with the block size, to determine which kernel builder to use
// Queries LLVM for the host CPU's feature map and records the "avx" and "avx2"
// entries in a Features value.
//
// If sys::getHostCPUFeatures reports failure, the Features object is returned
// exactly as it was default-constructed.
Features getHostCPUFeatures() {
    Features detected;
    StringMap<bool> cpuFlags;
    const bool queried = sys::getHostCPUFeatures(cpuFlags);
    if (!queried) {
        // Detection failed: leave the flags at their default-constructed state.
        return detected;
    }
    detected.hasAVX = cpuFlags.lookup("avx");
    detected.hasAVX2 = cpuFlags.lookup("avx2");
    return detected;
}
// Chooses and allocates the IDISA kernel builder that matches the host CPU and
// the configured block size.
//
// When codegen::BlockSize is 0 (no override) it is set here to 256 if the host
// reports AVX2 and to 128 otherwise. A non-zero override must be a power of two
// and at least 64; any other value is passed to llvm::report_fatal_error.
//
// Builder selection:
//   BlockSize == 64             -> IDISA_I64_Builder
//   BlockSize >= 256 with AVX2  -> IDISA_AVX2_Builder
//   BlockSize >= 256 with AVX   -> IDISA_AVX_Builder
//   anything else               -> IDISA_SSE2_Builder (this includes BlockSize >= 256
//                                  on a host reporting neither AVX nor AVX2)
//
// C is forwarded to the builder's constructor. The builder is created with
// `new` and returned as a raw pointer.
KernelBuilder * GetIDISA_Builder(llvm::LLVMContext & C) {
    const auto cpu = getHostCPUFeatures();

    if (LLVM_LIKELY(codegen::BlockSize == 0)) {
        // No explicit BlockSize: derive it from the widest SIMD extension we use by default.
        codegen::BlockSize = cpu.hasAVX2 ? 256 : 128;
    } else {
        const bool isPowerOfTwo = (codegen::BlockSize & (codegen::BlockSize - 1)) == 0;
        if (!isPowerOfTwo || (codegen::BlockSize < 64)) {
            llvm::report_fatal_error("BlockSize must be a power of 2 and >=64");
        }
    }

    if (codegen::BlockSize == 64) {
        return new KernelBuilderImpl<IDISA_I64_Builder>(C, codegen::BlockSize, codegen::BlockSize);
    }

    // The AVX2 / AVX builders are only considered for block sizes of 256 and up.
    if (codegen::BlockSize >= 256) {
        if (cpu.hasAVX2) {
            return new KernelBuilderImpl<IDISA_AVX2_Builder>(C, codegen::BlockSize, codegen::BlockSize);
        }
        if (cpu.hasAVX) {
            return new KernelBuilderImpl<IDISA_AVX_Builder>(C, codegen::BlockSize, codegen::BlockSize);
        }
    }

    // Fallback for every remaining combination.
    return new KernelBuilderImpl<IDISA_SSE2_Builder>(C, codegen::BlockSize, codegen::BlockSize);
}

Under IR_Gen/idisa_target.cpp:

Even if hasAVX is true, the default BlockSize is still 128 unless AVX2 is also available, so on an AVX-only CPU the builder falls back to IDISA_SSE2_Builder

 1158 #if defined(__i386__) || defined(_M_IX86) || \
 1159     defined(__x86_64__) || defined(_M_X64)
 1160 bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
 1161   unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
 1162   unsigned MaxLevel;
 1163   union {
 1164     unsigned u[3];
 1165     char c[12];
 1166   } text;
 1167 
 1168   if (getX86CpuIDAndInfo(0, &MaxLevel, text.u + 0, text.u + 2, text.u + 1) ||
 1169       MaxLevel < 1)
 1170     return false;
 1171 
 1172   getX86CpuIDAndInfo(1, &EAX, &EBX, &ECX, &EDX);
 1173 
 1174   Features["cmov"]   = (EDX >> 15) & 1;
 1175   Features["mmx"]    = (EDX >> 23) & 1;
 1176   Features["sse"]    = (EDX >> 25) & 1;
 1177   Features["sse2"]   = (EDX >> 26) & 1;
 1178 
 1179   Features["sse3"]   = (ECX >>  0) & 1;
 1180   Features["pclmul"] = (ECX >>  1) & 1;
 1181   Features["ssse3"]  = (ECX >>  9) & 1;
 1182   Features["cx16"]   = (ECX >> 13) & 1;
 1183   Features["sse4.1"] = (ECX >> 19) & 1;
 1184   Features["sse4.2"] = (ECX >> 20) & 1;
 1185   Features["movbe"]  = (ECX >> 22) & 1;
 1186   Features["popcnt"] = (ECX >> 23) & 1;
 1187   Features["aes"]    = (ECX >> 25) & 1;
 1188   Features["rdrnd"]  = (ECX >> 30) & 1;
 1189 
 1190   // If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV
 1191   // indicates that the AVX registers will be saved and restored on context
 1192   // switch, then we have full AVX support.
 1193   bool HasAVXSave = ((ECX >> 27) & 1) && ((ECX >> 28) & 1) &&
 1194                     !getX86XCR0(&EAX, &EDX) && ((EAX & 0x6) == 0x6);
 1195   // AVX512 requires additional context to be saved by the OS.
 1196   bool HasAVX512Save = HasAVXSave && ((EAX & 0xe0) == 0xe0);
 1197 
 1198   Features["avx"]   = HasAVXSave;
 1199   Features["fma"]   = ((ECX >> 12) & 1) && HasAVXSave;
 1200   // Only enable XSAVE if OS has enabled support for saving YMM state.
 1201   Features["xsave"] = ((ECX >> 26) & 1) && HasAVXSave;
 1202   Features["f16c"]  = ((ECX >> 29) & 1) && HasAVXSave;

From http://llvm.org/doxygen/Host_8cpp_source.html

2. Kernels

Driver

KernelBuilder

Kernel

1. Kernel Type

2. Instantiate the kernel

3. Feed input stream

4. Produce output stream

  • The driver is responsible for maintaining StreamSetBuffers (inputs and outputs) and kernels
  • During instantiation, the driver uses the CPU feature detection described above to initialize the KernelBuilders
  • Driver builds kernel instance using the KernelBuilder
  • Kernels perform a transformation on StreamSetBuffers
  • Only new kernels need to be built, and the driver and the KernelBuilder can be reused
  • Also, it is easy to write and add new kernels to the pipeline without changing too much code

In grep_engine.cpp:

// Wires up the grep kernel pipeline on mGrepDriver: every buffer is registered with
// addBuffer, every kernel with addKernelInstance, and each kernel is connected to its
// input and output buffers with makeKernelCall.
//
// REs         regular expressions to match. Each element is overwritten in place with the
//             result of the transformation passes applied in the loop below.
//             NOTE(review): MatchResultsBufs[0] is read unconditionally after the loop, so an
//             empty vector looks like an out-of-bounds access — confirm callers never pass one.
// ByteStream  input buffer; fed to the S2P kernel (and to the line-feed kernel when
//             USE_DIRECT_LF_BUILDER is defined).
//
// Returns the pair (LineBreakStream, Matches), where Matches is the match buffer left after
// the optional merge, move-to-EOL, invert and max-count stages.
std::pair<StreamSetBuffer *, StreamSetBuffer *> GrepEngine::grepPipeline(std::vector<re::RE *> & REs, StreamSetBuffer * ByteStream) {
    auto & idb = mGrepDriver->getBuilder();
    const unsigned segmentSize = codegen::SegmentSize;
    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
    // TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be.
    // With a max-count limit the segment count is the larger of bufferSegments and
    // calculateMaxCountRate(idb); otherwise it is just bufferSegments.
    const unsigned baseBufferSize = segmentSize * (MaxCountFlag > 0 ? (std::max(bufferSegments, calculateMaxCountRate(idb))) : bufferSegments);
    const unsigned encodingBits = 8;

    StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);

    // With USE_DIRECT_LF_BUILDER the line-feed kernel reads ByteStream directly and is
    // registered before the S2P kernel; otherwise it reads BasisBits and is registered after it.
    #ifdef USE_DIRECT_LF_BUILDER
    kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, encodingBits);
    mGrepDriver->makeKernelCall(linefeedK, {ByteStream}, {LineFeedStream});
    #endif

    // S2P kernel: ByteStream -> BasisBits.
    StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), baseBufferSize);
    kernel::Kernel * s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
    mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});

    #ifndef USE_DIRECT_LF_BUILDER
    kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, encodingBits);
    mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
    #endif

    // Line-break kernel: (BasisBits, LineFeedStream) -> (LineBreakStream, CRLFStream).
    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    StreamSetBuffer * CRLFStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    kernel::Kernel * linebreakK = mGrepDriver->addKernelInstance<kernel::LineBreakKernelBuilder>(idb, encodingBits);
    mGrepDriver->makeKernelCall(linebreakK, {BasisBits, LineFeedStream}, {LineBreakStream, CRLFStream});

    // RequiredStreams_UTF8 kernel: BasisBits -> RequiredStreams.
    kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
    StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(3, 1), baseBufferSize);
    mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits}, {RequiredStreams});

    // One ICGrepKernel and one MatchResults buffer per regular expression.
    const auto n = REs.size();
    std::vector<StreamSetBuffer *> MatchResultsBufs(n);
    for(unsigned i = 0; i < n; ++i) {
        // Transformation passes; each result replaces REs[i] in the caller's vector.
        REs[i] = resolveModesAndExternalSymbols(REs[i]);
        REs[i] = excludeUnicodeLineBreak(REs[i]);
        REs[i] = regular_expression_passes(REs[i]);
// USE_MULTIPLEX_CC is defined unconditionally right here, so the #else branch below is
// never compiled.
#define USE_MULTIPLEX_CC
#ifdef USE_MULTIPLEX_CC
        const std::vector<const re::CC *> UnicodeSets = re::collectUnicodeSets(REs[i]);

        StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
        if (UnicodeSets.size() <= 1) {
            // At most one Unicode set: the match kernel is wired without a CharClasses input.
            kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i]);
            mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, CRLFStream, RequiredStreams}, {MatchResults});
            MatchResultsBufs[i] = MatchResults;
        } else {
            // Several Unicode sets: build a multiplexed alphabet, rewrite the RE against it,
            // add a CharClassesKernel (BasisBits -> CharClasses), and give the match kernel
            // CharClasses as an extra input.
            // NOTE(review): mpx is declared outside this function and is reassigned on every
            // iteration that reaches this branch, while earlier kernels were handed mpx.get().
            // If mpx is a single owning pointer, those earlier raw pointers would dangle — confirm.
            mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
            REs[i] = transformCCs(mpx.get(), REs[i]);
            std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
            // Read the size before mpx_basis is moved into the kernel below.
            auto numOfCharacterClasses = mpx_basis.size();
            StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
            kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
            mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});            
            kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], std::vector<cc::Alphabet *>{mpx.get()});
            mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, CRLFStream, RequiredStreams, CharClasses}, {MatchResults});
            MatchResultsBufs[i] = MatchResults;
        }
#else
        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i]);
        mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, CRLFStream, RequiredStreams}, {MatchResults});
        MatchResultsBufs[i] = MatchResults;
#endif
    }
    // With a single RE its buffer is used as-is; with more than one, a StreamsMerge kernel
    // takes all per-RE buffers as inputs and writes MergedResults.
    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
    if (REs.size() > 1) {
        MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, REs.size());
        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
    }
    StreamSetBuffer * Matches = MergedResults;

    // Optional stages. Each one takes the current Matches buffer as input and replaces
    // Matches with a freshly added output buffer.
    if (mMoveMatchesToEOL) {
        StreamSetBuffer * OriginalMatches = Matches;
        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
    }

    if (InvertMatchFlag) {
        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
        StreamSetBuffer * OriginalMatches = Matches;
        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
        mGrepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
    }
    if (MaxCountFlag > 0) {
        // UntilNkernel receives MaxCountFlag as its initial argument.
        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
        StreamSetBuffer * const AllMatches = Matches;
        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
    }
    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
}

The end

Made with Slides.com