Interesting Aspects of the Compilation Process of icgrep
Zhenan (Carl) Cui
301220129
zca37@sfu.ca
1. CPU feature detection
- Uses LLVM's sys::getHostCPUFeatures to get the list of features supported by the host CPU
- Checks for AVX and AVX2, together with the block size, to determine which kernel builder to use
// Asks LLVM for the host CPU's feature map and records whether "avx" and
// "avx2" are reported. When sys::getHostCPUFeatures() returns false, the
// result is returned exactly as it was default-constructed.
// NOTE(review): Features is declared elsewhere — confirm that its default
// construction clears hasAVX/hasAVX2; otherwise the failure path returns
// indeterminate flags.
Features getHostCPUFeatures() {
Features hostCPUFeatures;
StringMap<bool> features;
// Only read the map when LLVM reports that detection succeeded.
if (sys::getHostCPUFeatures(features)) {
hostCPUFeatures.hasAVX = features.lookup("avx");
hostCPUFeatures.hasAVX2 = features.lookup("avx2");
}
return hostCPUFeatures;
}
// Chooses and heap-allocates the KernelBuilderImpl specialization that matches
// the host CPU features and codegen::BlockSize.
//
// Selection, as implemented below:
//   BlockSize >= 256 and hasAVX2         -> IDISA_AVX2_Builder
//   BlockSize >= 256 and hasAVX (only)   -> IDISA_AVX_Builder
//   BlockSize == 64                      -> IDISA_I64_Builder
//   anything else                        -> IDISA_SSE2_Builder
// "Anything else" includes BlockSize 128 and also BlockSize >= 256 on a host
// that reports neither AVX2 nor AVX (the inner if/else-if has no final else,
// so control falls through to the last return).
//
// Side effect: when codegen::BlockSize is 0 it is overwritten with the chosen
// default. An invalid non-zero BlockSize ends in llvm::report_fatal_error.
// The result is created with `new`.
// NOTE(review): presumably the caller takes ownership of it — confirm.
KernelBuilder * GetIDISA_Builder(llvm::LLVMContext & C) {
const auto hostCPUFeatures = getHostCPUFeatures();
if (LLVM_LIKELY(codegen::BlockSize == 0)) { // No BlockSize override: use processor SIMD width
// Default is 256 only with AVX2; an AVX-only host still gets 128.
codegen::BlockSize = hostCPUFeatures.hasAVX2 ? 256 : 128;
}
// (x & (x - 1)) != 0 is true exactly when a non-zero x is not a power of two.
else if (((codegen::BlockSize & (codegen::BlockSize - 1)) != 0) || (codegen::BlockSize < 64)) {
llvm::report_fatal_error("BlockSize must be a power of 2 and >=64");
}
if (codegen::BlockSize >= 256) {
// AVX2 or AVX builders can only be used for BlockSize multiples of 256
if (hostCPUFeatures.hasAVX2) {
return new KernelBuilderImpl<IDISA_AVX2_Builder>(C, codegen::BlockSize, codegen::BlockSize);
} else if (hostCPUFeatures.hasAVX) {
return new KernelBuilderImpl<IDISA_AVX_Builder>(C, codegen::BlockSize, codegen::BlockSize);
}
} else if (codegen::BlockSize == 64) {
return new KernelBuilderImpl<IDISA_I64_Builder>(C, codegen::BlockSize, codegen::BlockSize);
}
// Fallback for every case not returned above.
return new KernelBuilderImpl<IDISA_SSE2_Builder>(C, codegen::BlockSize, codegen::BlockSize);
}
In IR_Gen/idisa_target.cpp:
Even if hasAVX is true, BlockSize still defaults to 128 unless hasAVX2 is also true, so the builder falls back to IDISA_SSE2_Builder
1158 #if defined(__i386__) || defined(_M_IX86) || \
1159 defined(__x86_64__) || defined(_M_X64)
1160 bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
1161 unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
1162 unsigned MaxLevel;
1163 union {
1164 unsigned u[3];
1165 char c[12];
1166 } text;
1167
1168 if (getX86CpuIDAndInfo(0, &MaxLevel, text.u + 0, text.u + 2, text.u + 1) ||
1169 MaxLevel < 1)
1170 return false;
1171
1172 getX86CpuIDAndInfo(1, &EAX, &EBX, &ECX, &EDX);
1173
1174 Features["cmov"] = (EDX >> 15) & 1;
1175 Features["mmx"] = (EDX >> 23) & 1;
1176 Features["sse"] = (EDX >> 25) & 1;
1177 Features["sse2"] = (EDX >> 26) & 1;
1178
1179 Features["sse3"] = (ECX >> 0) & 1;
1180 Features["pclmul"] = (ECX >> 1) & 1;
1181 Features["ssse3"] = (ECX >> 9) & 1;
1182 Features["cx16"] = (ECX >> 13) & 1;
1183 Features["sse4.1"] = (ECX >> 19) & 1;
1184 Features["sse4.2"] = (ECX >> 20) & 1;
1185 Features["movbe"] = (ECX >> 22) & 1;
1186 Features["popcnt"] = (ECX >> 23) & 1;
1187 Features["aes"] = (ECX >> 25) & 1;
1188 Features["rdrnd"] = (ECX >> 30) & 1;
1189
1190 // If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV
1191 // indicates that the AVX registers will be saved and restored on context
1192 // switch, then we have full AVX support.
1193 bool HasAVXSave = ((ECX >> 27) & 1) && ((ECX >> 28) & 1) &&
1194 !getX86XCR0(&EAX, &EDX) && ((EAX & 0x6) == 0x6);
1195 // AVX512 requires additional context to be saved by the OS.
1196 bool HasAVX512Save = HasAVXSave && ((EAX & 0xe0) == 0xe0);
1197
1198 Features["avx"] = HasAVXSave;
1199 Features["fma"] = ((ECX >> 12) & 1) && HasAVXSave;
1200 // Only enable XSAVE if OS has enabled support for saving YMM state.
1201 Features["xsave"] = ((ECX >> 26) & 1) && HasAVXSave;
1202 Features["f16c"] = ((ECX >> 29) & 1) && HasAVXSave;
From http://llvm.org/doxygen/Host_8cpp_source.html
2. Kernels
Driver
KernelBuilder
Kernel
1. Kernel Type
2. Instantiate the kernel
3. Feed input stream
4. Produce output stream
- The driver is responsible for maintaining StreamSetBuffers (inputs and outputs) and kernels
- When the driver is instantiated, it uses the CPU feature detection described above to initialize the KernelBuilder
- The driver builds kernel instances using the KernelBuilder
- Kernels perform a transformation on StreamSetBuffers
- Only new kernels need to be built, and the driver and the KernelBuilder can be reused
- Also, it is easy to write and add new kernels to the pipeline without changing too much code
In grep_engine.cpp:
// Registers, on mGrepDriver, the buffers and kernel calls that take ByteStream
// to a stream of matches for the given regular expressions.
//
// Parameters:
//   REs        - the expressions to match; each element is replaced in place by
//                its rewritten form (see the per-RE loop).
//   ByteStream - the input buffer consumed by the first kernel(s).
// Returns: the pair (LineBreakStream, Matches), where Matches is the output of
// the last stage that was added.
//
// NOTE(review): MatchResultsBufs[0] is read unconditionally after the loop, so
// this assumes REs is non-empty — confirm callers guarantee that.
std::pair<StreamSetBuffer *, StreamSetBuffer *> GrepEngine::grepPipeline(std::vector<re::RE *> & REs, StreamSetBuffer * ByteStream) {
auto & idb = mGrepDriver->getBuilder();
const unsigned segmentSize = codegen::SegmentSize;
const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
// TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be.
// With a max-count limit, the segment count is raised to at least calculateMaxCountRate(idb).
const unsigned baseBufferSize = segmentSize * (MaxCountFlag > 0 ? (std::max(bufferSegments, calculateMaxCountRate(idb))) : bufferSegments);
const unsigned encodingBits = 8;
StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
// With USE_DIRECT_LF_BUILDER the line-feed kernel reads ByteStream directly;
// otherwise (below) it reads BasisBits.
#ifdef USE_DIRECT_LF_BUILDER
kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, encodingBits);
mGrepDriver->makeKernelCall(linefeedK, {ByteStream}, {LineFeedStream});
#endif
// S2PKernel: ByteStream -> BasisBits (a stream set of encodingBits streams).
StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), baseBufferSize);
kernel::Kernel * s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
#ifndef USE_DIRECT_LF_BUILDER
kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, encodingBits);
mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
#endif
// Line-break kernel: (BasisBits, LineFeedStream) -> (LineBreakStream, CRLFStream).
StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
StreamSetBuffer * CRLFStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
kernel::Kernel * linebreakK = mGrepDriver->addKernelInstance<kernel::LineBreakKernelBuilder>(idb, encodingBits);
mGrepDriver->makeKernelCall(linebreakK, {BasisBits, LineFeedStream}, {LineBreakStream, CRLFStream});
// RequiredStreams_UTF8 kernel: BasisBits -> RequiredStreams (3 streams).
kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(3, 1), baseBufferSize);
mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits}, {RequiredStreams});
// One match-result buffer per regular expression.
const auto n = REs.size();
std::vector<StreamSetBuffer *> MatchResultsBufs(n);
for(unsigned i = 0; i < n; ++i) {
// Rewrite the expression in place before a kernel is built for it.
REs[i] = resolveModesAndExternalSymbols(REs[i]);
REs[i] = excludeUnicodeLineBreak(REs[i]);
REs[i] = regular_expression_passes(REs[i]);
// USE_MULTIPLEX_CC is defined right here, so the #else branch below is never compiled.
#define USE_MULTIPLEX_CC
#ifdef USE_MULTIPLEX_CC
const std::vector<const re::CC *> UnicodeSets = re::collectUnicodeSets(REs[i]);
StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
if (UnicodeSets.size() <= 1) {
// At most one Unicode set: the ICGrep kernel is built from the RE alone.
kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i]);
mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, CRLFStream, RequiredStreams}, {MatchResults});
MatchResultsBufs[i] = MatchResults;
} else {
// Several Unicode sets: build a multiplexed alphabet, rewrite the RE over it,
// add a CharClassesKernel (BasisBits -> CharClasses), and pass CharClasses to
// the ICGrep kernel as an extra input.
// NOTE(review): mpx is not declared in this function (presumably a member) and
// is reassigned on each iteration that reaches this branch, while the kernel
// created below is handed the raw mpx.get() pointer — confirm the alphabet
// outlives every kernel that references it when several REs take this path.
mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
REs[i] = transformCCs(mpx.get(), REs[i]);
std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
// Must be read before mpx_basis is moved into the kernel below.
auto numOfCharacterClasses = mpx_basis.size();
StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], std::vector<cc::Alphabet *>{mpx.get()});
mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, CRLFStream, RequiredStreams, CharClasses}, {MatchResults});
MatchResultsBufs[i] = MatchResults;
}
#else
StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i]);
mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, CRLFStream, RequiredStreams}, {MatchResults});
MatchResultsBufs[i] = MatchResults;
#endif
}
// A single RE uses its result buffer directly; more than one RE goes through a
// StreamsMerge kernel that takes every per-RE buffer as input.
StreamSetBuffer * MergedResults = MatchResultsBufs[0];
if (REs.size() > 1) {
MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, REs.size());
mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
}
// Optional post-processing stages. Each one reads the current Matches buffer
// and replaces Matches with a freshly added output buffer.
StreamSetBuffer * Matches = MergedResults;
if (mMoveMatchesToEOL) {
StreamSetBuffer * OriginalMatches = Matches;
kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
}
if (InvertMatchFlag) {
kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
StreamSetBuffer * OriginalMatches = Matches;
Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
mGrepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
}
if (MaxCountFlag > 0) {
// UntilNkernel receives MaxCountFlag as its initial argument.
kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
StreamSetBuffer * const AllMatches = Matches;
Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
}
return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
}
The end
cmpt 489 initial exploration on icgrep codebase
By carlcui
cmpt 489 initial exploration on icgrep codebase
- 471