// Command wordcount prints how often each whitespace-separated token occurs
// in the file named by its first argument, most frequent first, one
// "COUNT TOKEN" pair per line.
//
// The input is memory-mapped and counted in parallel, one chunk per CPU.
// Setting the CPUPROFILE environment variable to a file name writes a CPU
// profile of the run to that file.
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
	"log"
	"os"
	"runtime"
	"runtime/pprof"
	"sort"
	"strconv"
	"sync"
	"syscall"
)

// shortTokenLen is the longest token, in bytes, that is counted under a
// packed uint64 key instead of a string key.
const shortTokenLen = 8

// wc pairs a token with the number of times it occurred in the input.
type wc struct {
	token string
	count int
}

// byCount implements sort.Interface. Entries are ordered by descending count;
// equal counts are ordered by ascending token so the output is reproducible.
type byCount []wc

func (a byCount) Len() int      { return len(a) }
func (a byCount) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a byCount) Less(i, j int) bool {
	if a[i].count != a[j].count {
		return a[i].count > a[j].count
	}
	return a[i].token < a[j].token
}

// partial holds the word counts gathered from one chunk of the input.
type partial struct {
	// packed counts tokens of at most shortTokenLen bytes, keyed by packToken.
	packed map[uint64]int
	// long counts every token longer than shortTokenLen bytes.
	long map[string]int
}

func main() {
	if len(os.Args) < 2 {
		log.Fatalf("usage: %s FILE", os.Args[0])
	}
	// All real work happens in run so that its deferred cleanup (profile
	// flush, unmap, close) executes before log.Fatal exits the process.
	if err := run(os.Args[1], os.Stdout); err != nil {
		log.Fatal(err)
	}
}

// run counts the tokens in the file at path and writes the sorted result to
// out. It returns the first error encountered while profiling, opening or
// mapping the file, or writing the output.
func run(path string, out io.Writer) error {
	if pr := os.Getenv("CPUPROFILE"); pr != "" {
		p, err := os.Create(pr)
		if err != nil {
			return fmt.Errorf("could not create CPU profile: %w", err)
		}
		defer p.Close()
		if err := pprof.StartCPUProfile(p); err != nil {
			return fmt.Errorf("could not start CPU profile: %w", err)
		}
		defer pprof.StopCPUProfile()
	}

	f, err := os.Open(path)
	if err != nil {
		return err
	}
	defer f.Close()

	st, err := f.Stat()
	if err != nil {
		return err
	}
	size := int(st.Size())
	if size == 0 {
		// mmap rejects zero-length mappings, and there is nothing to count.
		return nil
	}

	// Map exactly size bytes so every index into data is bounds-checked
	// against the real end of the file.
	data, err := syscall.Mmap(int(f.Fd()), 0, size, syscall.PROT_READ, syscall.MAP_SHARED)
	if err != nil {
		return err
	}
	// Safe to unmap on return: every token is copied out of the mapping
	// before it is stored. The unmap error is not actionable here.
	defer syscall.Munmap(data)

	workers := runtime.NumCPU()
	parts := countChunks(splitChunks(data, workers))
	return writeCounts(out, mergeCounts(parts), workers)
}

// splitChunks cuts data into at most n contiguous, non-empty chunks that
// together cover all of data. Each cut is moved forward to the next
// whitespace byte (or the end of data) so that no token spans two chunks.
func splitChunks(data []byte, n int) [][]byte {
	if n < 1 {
		n = 1
	}
	chunks := make([][]byte, 0, n)
	step := len(data) / n
	start := 0
	for part := 0; part < n; part++ {
		end := start + step
		// The last chunk takes the remainder; earlier cuts are clamped
		// because a long token may already have pushed start near the end.
		if part == n-1 || end > len(data) {
			end = len(data)
		}
		// Check the bound before indexing.
		for end < len(data) && !isspace(data[end]) {
			end++
		}
		if end > start {
			chunks = append(chunks, data[start:end])
		}
		start = end
	}
	return chunks
}

// countChunks runs countWords on every chunk concurrently and returns the
// results in chunk order.
func countChunks(chunks [][]byte) []partial {
	parts := make([]partial, len(chunks))
	var wg sync.WaitGroup
	for i, chunk := range chunks {
		wg.Add(1)
		go func(i int, chunk []byte) {
			defer wg.Done()
			// Each goroutine writes only its own slot, so no lock is needed.
			parts[i] = countWords(chunk)
		}(i, chunk)
	}
	wg.Wait()
	return parts
}

// countWords tallies every maximal run of non-whitespace bytes in data.
func countWords(data []byte) partial {
	p := partial{
		packed: make(map[uint64]int),
		long:   make(map[string]int),
	}
	pos := 0
	for {
		for pos < len(data) && isspace(data[pos]) {
			pos++
		}
		begin := pos
		for pos < len(data) && !isspace(data[pos]) {
			pos++
		}
		if pos == begin {
			// Only whitespace (or nothing) was left.
			return p
		}
		word := data[begin:pos]
		if len(word) > shortTokenLen {
			p.long[string(word)]++
		} else {
			p.packed[packToken(word)]++
		}
	}
}

// packToken stores a token of at most shortTokenLen bytes in a uint64,
// first byte most significant, padded on the right with zero bytes.
func packToken(word []byte) uint64 {
	var buf [shortTokenLen]byte
	copy(buf[:], word)
	return binary.BigEndian.Uint64(buf[:])
}

// unpackToken reverses packToken. The zero padding is stripped, so a token
// that itself ends in NUL bytes comes back without them; such tokens already
// share a key with their NUL-free form.
func unpackToken(key uint64) string {
	var buf [shortTokenLen]byte
	binary.BigEndian.PutUint64(buf[:], key)
	return string(bytes.TrimRight(buf[:], "\x00"))
}

// mergeCounts sums the per-chunk tallies and returns one entry per distinct
// token, sorted with byCount.
func mergeCounts(parts []partial) []wc {
	total := make(map[string]int)
	for _, p := range parts {
		for key, n := range p.packed {
			total[unpackToken(key)] += n
		}
		for token, n := range p.long {
			total[token] += n
		}
	}

	entries := make([]wc, 0, len(total))
	for token, n := range total {
		entries = append(entries, wc{token: token, count: n})
	}
	sort.Sort(byCount(entries))
	return entries
}

// writeCounts formats entries as "COUNT TOKEN\n" lines and writes them to out
// in order. Formatting is spread over the given number of goroutines, each
// handling one contiguous section; the sections are written sequentially.
func writeCounts(out io.Writer, entries []wc, workers int) error {
	if workers < 1 {
		workers = 1
	}
	results := make([]chan *bytes.Buffer, workers)
	step := len(entries) / workers
	start := 0
	for part := range results {
		end := start + step
		if part == workers-1 {
			end = len(entries)
		}
		// Capacity 1 lets a formatter finish even if writing stops early
		// because of an error, so no goroutine is left blocked.
		results[part] = make(chan *bytes.Buffer, 1)
		go func(section []wc, done chan<- *bytes.Buffer) {
			buf := &bytes.Buffer{}
			for _, it := range section {
				buf.WriteString(strconv.Itoa(it.count))
				buf.WriteByte(' ')
				buf.WriteString(it.token)
				buf.WriteByte('\n')
			}
			done <- buf
		}(entries[start:end], results[part])
		start = end
	}

	for _, ch := range results {
		if _, err := (<-ch).WriteTo(out); err != nil {
			return err
		}
	}
	return nil
}

// isspace reports whether b is a space, carriage return, tab, or newline.
func isspace(b byte) bool {
	switch b {
	case ' ', '\r', '\t', '\n':
		return true
	}
	return false
}