// uses https://github.com/martinus/robin-hood-hashing // clang++ -std=c++17 -o w w.cc -W -Wall -Wextra -Wshadow -g -O3 -flto -lpthread -fuse-ld=lld #include #include #include #include #include #include "robin_hood.h" #include #include #include #include #include #include #include #include #include #include using namespace std; static bool isspace(char c) { return (c == ' ' || c=='\n' || c=='\t' || c=='\r'); } #ifdef __SSE__ #ifndef __SSE4_2__ // Unfortunately, using _mm_set1_epi8 in the loop will cause 4 loads. // The whole reason why we do this is so we can save loads. // So we cheat and declare explicit register variables that we will // initialize outside the call to l2. This disgusting hack will confuse // g++ 8.3 enough that it generates invalid assembler code. clang++ 9 // works. __m128i _space asm ("%xmm7"); __m128i _nl asm ("%xmm6"); __m128i _cr asm ("%xmm5"); __m128i _tab asm ("%xmm4"); #else __m128i mask asm("%xmm7"); #endif size_t l2(const char* s) { for (size_t r=0; ; r+=sizeof(__m128i)) { __m128i v=_mm_loadu_si128((__m128i*)(s+r)); #ifdef __SSE4_2__ unsigned int x = _mm_cmpistri(mask,v, _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_POSITIVE_POLARITY|_SIDD_LEAST_SIGNIFICANT); if (x < 16) return r + x; #else #if 0 // the version we really wanted: uint32_t x = _mm_movemask_epi8( _mm_or_si128( _mm_or_si128(_mm_cmpeq_epi8(v,_mm_set1_epi8(0x20)),_mm_cmpeq_epi8(v,_mm_set1_epi8(0x0a))), _mm_or_si128(_mm_cmpeq_epi8(v,_mm_set1_epi8(0x0d)),_mm_cmpeq_epi8(v,_mm_set1_epi8(0x09)))) ); #else // this code does an unsigned load of a 16-byte vector, // then does four element-wise vector compares, ORing the results // finally converts each byte into a bit in x and calls // __builtin_ffs to get gcc to generate a bit-scan instruction. uint32_t x = _mm_movemask_epi8( _mm_or_si128( _mm_or_si128(_mm_cmpeq_epi8(v,_space),_mm_cmpeq_epi8(v,_nl)), _mm_or_si128(_mm_cmpeq_epi8(v,_cr),_mm_cmpeq_epi8(v,_tab))) ); #endif // #if 0 if (x) return r + __builtin_ffs(x) - 1; #endif // #ifdef __SSE4_2__ } } #endif // #ifdef __SSE__ void tokenize(string_view sv,robin_hood::unordered_map* m) { size_t n=sv.size(),a,b; #ifdef __SSE__ #ifdef __SSE4_2__ static __m128i const _mask = { 0x200a0d09 }; mask = _mask; #else // to save on loads in l2, we pre-load these here, outside the loop. _space = _mm_set1_epi8(0x20); _nl = _mm_set1_epi8(0x0a); _cr = _mm_set1_epi8(0x0d); _tab = _mm_set1_epi8(0x09); #endif #endif for (size_t i=0; i(mmap(nullptr,ss.st_size+16,PROT_READ|PROT_WRITE,MAP_PRIVATE,0,0)); if (map==MAP_FAILED) { cerr << "could not mmap stdin!\n"; return 1; } map[ss.st_size]=' '; try { // shut up clang-tidy :-) // separate input into "threads" chunks, but make sure the // chunks don't split the input in the middle of a word size_t inc=ss.st_size/threads; // approximate chunk size vector ofs(threads); { // Adjust offsets in ofs[] from middle of word. // This code will fail on pathological inputs with ultra // long words (approaching chunk size) off_t n=inc; for (size_t i=0; i+1 slice; for (size_t i=0; i> m(threads); vector t; // phase 1: tokenize for (size_t i=1; i> temp(m[0].begin(), m[0].end()); sort(temp.begin(), temp.end(), [](auto const& a,auto const& b) { return b.second < a.second; } ); // phase 4: output vector output(4); { // Output is comparatively cheap, but we'll do it in multiple // threads anyway. size_t outslice = temp.size()/threads; for (size_t i=0; i