<[email protected]>
<http://www.gnu.org/licenses/>
namespace input {
bool check_memory_type(const std::string& memtype)
{
    if (memtype == "malloc") return true;
    if (memtype == "mmap") return true;
    if (memtype == "mmap_interleave") return true;
    if (memtype == "mmap_node0") return true;
    if (memtype == "mmap_segment") return true;
    std::cout << "Following --memory types are available:" << std::endl
              << "  malloc           use plain malloc() call (default)" << std::endl
              << "  mmap             use mmap() to allocate shared memory" << std::endl
              << "  mmap_interleave  use libnuma to interleave onto nodes" << std::endl
              << "  mmap_node0       pin memory to numa node 0" << std::endl
              << "  mmap_segment     segment characters equally onto all numa nodes" << std::endl
        ;
    
    return false;
}
void do_numa_segment(char* buff, size_t buffsize)
{
    int numNodes = g_numa_nodes;
    if (numNodes < 1) numNodes = 1;
    int numRealNodes = numa_num_configured_nodes();
    if (numRealNodes < 1) numRealNodes = 1;
    size_t segsize = (buffsize + numNodes-1) / numNodes;
    std::cout << "Segmenting string characters onto " << numNodes << " NUMA nodes, about "
              << segsize << " characters each." << std::endl;
    int pagesize = sysconf(_SC_PAGE_SIZE);
    segsize = segsize - segsize % pagesize;
    segsize += pagesize - (segsize % pagesize);
    assert(segsize % pagesize == 0);
    for (int n = 0; n < numNodes; ++n)
    {
        size_t offset = n * segsize;
        g_numa_chars.push_back(offset);
        
        size_t size = std::min(segsize, buffsize-offset);
        numa_tonode_memory(buff + offset, size, n % numRealNodes);
    }
    
    if (1)
    {
        for (size_t i = 0; i < g_numa_chars.size(); ++i)
        {
            size_t end = i == g_numa_chars.size()-1 ? buffsize : g_numa_chars[i+1];
            std::cout << "NUMA segment " << i << " = "
                      << "[" << g_numa_chars[i] << "," << end << ") = "
                      << (end - g_numa_chars[i]) << std::endl;
        }
    }
    numa_set_interleave_mask(numa_all_nodes_ptr);
    std::cout << "NUMA segmenting finished." << std::endl;
}
void free_stringdata()
{
    if (!g_string_databuff) return;
    if (gopt_memory_type == "mmap" ||
        gopt_memory_type == "mmap_interleave" ||
        gopt_memory_type == "mmap_node0" ||
        gopt_memory_type == "mmap_segment")
    {
        if (munmap(g_string_databuff, g_string_buffsize)) {
            std::cout << "Error unmapping string data memory: " << strerror(errno) << std::endl;
        }
    }
    else 
    {
        free(g_string_databuff);
    }
    g_string_databuff = NULL;
    numa_set_interleave_mask(numa_all_nodes_ptr);
}
char* allocate_stringdata(size_t size, const std::string& path)
{
    
    free_stringdata();
    
    g_string_buffsize = size + 2 + 8;
    std::cout << "Allocating " << size << " bytes in RAM, reading " << path << std::endl;
    char* stringdata;
    if (gopt_memory_type == "mmap" ||
        gopt_memory_type == "mmap_interleave" ||
        gopt_memory_type == "mmap_node0" ||
        gopt_memory_type == "mmap_segment")
    {
        stringdata = (char*)mmap(NULL, g_string_buffsize, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0);
        if (stringdata == MAP_FAILED) {
            std::cout << "Error allocating memory: " << strerror(errno) << std::endl;
            return NULL;
        }
        if (gopt_memory_type == "mmap_interleave")
        {
            numa_interleave_memory(stringdata, g_string_buffsize, numa_all_cpus_ptr);
            numa_set_interleave_mask(numa_all_nodes_ptr);
        }
        if (gopt_memory_type == "mmap_node0")
        {
            numa_tonode_memory(stringdata, g_string_buffsize, 0);
            numa_set_preferred(0);
        }
        if (gopt_memory_type == "mmap_segment")
        {
            do_numa_segment(stringdata, g_string_buffsize);
        }
    }
    else 
    {
        stringdata = (char*)malloc(g_string_buffsize);
    }
    
    g_string_databuff = stringdata;
    stringdata[0] = 0;
    stringdata[size+1] = 0;
    ++stringdata;
    g_string_data = stringdata;
    g_string_datasize = size;
    return stringdata;
}
void protect_stringdata()
{
    if (!g_string_databuff) return;
    if (gopt_memory_type == "mmap")
    {
        
        if (mprotect(g_string_databuff, g_string_buffsize, PROT_READ)) {
            std::cout << "Error protecting string data memory: " << strerror(errno) << std::endl;
        }
    }
}
std::string strip_datapath(const std::string& path)
{
    std::string::size_type slashpos = path.rfind('/');
    std::string name = (slashpos == std::string::npos ? path : path.substr(slashpos+1));
    if ( name.substr(name.size()-3,3) == ".gz" ||
         name.substr(name.size()-4,4) == ".bz2" ||
         name.substr(name.size()-3,3) == ".xz" ||
         name.substr(name.size()-4,4) == ".lzo" )
    {
        
        std::string::size_type dotpos = name.rfind('.');
        name.erase(dotpos);
        std::string::size_type dot2pos = name.rfind('.');
        name.erase(dot2pos);
    }
    
    if (!name.size()) name = path;
    return name;
}
bool load_plain(const std::string& path)
{
    FILE* file;
    size_t size = 0;
    if (!(file = fopen(path.c_str(), "r"))) {
        std::cout << "Cannot open " << path << ": " << strerror(errno) << std::endl;
        return false;
    }
    if (fseek(file,0,SEEK_END)) {
        std::cout << "Cannot seek in " << path << ": " << strerror(errno) << std::endl;
        fclose(file);
        return false;
    }
    size = ftell(file);
    rewind(file);
    
    if (gopt_inputsize && size > gopt_inputsize)
        size = gopt_inputsize;
    
    char* stringdata = allocate_stringdata(size, path);
    if (!stringdata) {
        fclose(file);
        return false;
    }
    
    g_string_count = 1;
    
    size_t rpos = 0;
    while ( rpos < size )
    {
        size_t batch = std::min<size_t>(8*1024*1024, size - rpos);
        if (batch + rpos > size) batch = size - rpos;
        ssize_t rb = fread(stringdata+rpos, sizeof(char), batch, file);
        if (rb < 0) {
            std::cout << "Cannot read from " << path << ": " << strerror(errno) << std::endl;
            fclose(file);
            return false;
        }
        
        if (!gopt_suffixsort)
        {
            for (size_t i = rpos; i < rpos + rb; ++i)
            {
                if (stringdata[i] == '\n' || stringdata[i] == 0) {
                    stringdata[i] = 0;
                    if (i+1 < size) g_string_count++;
                }
            }
        }
        rpos += rb;
    }
    if (gopt_suffixsort) g_string_count = size;
    
    stringdata[ size-1 ] = 0;
    
    for (size_t i = size; i < size+9; ++i)
        stringdata[i] = 0;
    fclose(file);
    g_dataname = strip_datapath(path);
    return true;
}
bool load_compressed(const std::string& path)
{
    if (path.size() < 4) return false;
    const char* decompressor = NULL;
    if ( path.substr(path.size()-3,3) == ".gz" )
        decompressor = "gzip";
    else if ( path.substr(path.size()-4,4) == ".bz2" )
        decompressor = "bzip2";
    else if ( path.substr(path.size()-3,3) == ".xz" )
        decompressor = "xz";
    else if ( path.substr(path.size()-4,4) == ".lzo" )
        decompressor = "lzop";
    if (!decompressor) return false;
    size_t size = 0;
    
    std::string::size_type i = path.rfind('.')-1;
    size_t v = 1;
    while ( isdigit(path[i]) ) {
        size += (path[i] - '0') * v;
        v *= 10; --i;
    }
    if (size == 0 || path[i] != '.') {
        std::cout << "Could not find decompressed size in filename " << path << std::endl;
        return false;
    }
    
    if (gopt_inputsize && size > gopt_inputsize)
        size = gopt_inputsize;
    
    int pipefd[2]; 
    if (pipe(pipefd) != 0) {
        std::cout << "Error creating pipe: " << strerror(errno) << std::endl;
        exit(-1);
    }
    pid_t pid = fork();
    if (pid == 0)
    {
        close(pipefd[0]); 
        dup2(pipefd[1], STDOUT_FILENO); 
        execlp(decompressor, decompressor, "-dc", path.c_str(), NULL);
        std::cout << "Pipe execution failed: " << strerror(errno) << std::endl;
        close(pipefd[1]); 
        exit(-1);
    }
    close(pipefd[1]); 
    
    char* stringdata = allocate_stringdata(size, path);
    if (!stringdata) {
        exit(-1);
    }
    g_string_count = 1;
    
    size_t rpos = 0;
    while ( rpos < size )
    {
        size_t batch = std::min<size_t>(8*1024*1024, size - rpos);
        if (batch + rpos > size) batch = size - rpos;
        ssize_t rb = read(pipefd[0], stringdata+rpos, batch);
        if (rb <= 0) {
            std::cout << "Error reading pipe: " << strerror(errno) << std::endl;
            close(pipefd[1]);
            exit(-1);
        }
        
        if (!gopt_suffixsort)
        {
            for (size_t i = rpos; i < rpos + rb; ++i)
            {
                if (stringdata[i] == '\n' || stringdata[i] == 0) {
                    stringdata[i] = 0;
                    if (i+1 < size) g_string_count++;
                }
            }
        }
        rpos += rb;
    }
    if (gopt_suffixsort) g_string_count = size;
    
    stringdata[ size-1 ] = 0;
    
    for (size_t i = size; i < size+9; ++i)
        stringdata[i] = 0;
    
    close(pipefd[1]);
    kill(pid, SIGTERM);
    int status;
    wait(&status);
    g_dataname = strip_datapath(path);
    return true;
}
bool generate_random(const std::string& path, const std::string& letters)
{
    if (!gopt_inputsize) {
        std::cout << "Random input size must be specified via '-s <size>'" << std::endl;
        return false;
    }
    size_t size = gopt_inputsize;
    
    char* stringdata = allocate_stringdata(size, path);
    if (!stringdata) {
        return false;
    }
    g_string_count = 0;
    LCGRandom rng(1234567);
    size_t slen = 0;
    for (size_t i = 0; i < size; ++i)
    {
        if (i == slen) { 
            g_string_count++;
            slen += (rng() % 3) + 16;
        }
        if (i+1 == slen) 
            stringdata[i] = 0;
        else
            stringdata[i] = letters[ (rng() / 100) % letters.size() ];
    }
    if (gopt_suffixsort) g_string_count = size;
    
    stringdata[ size-1 ] = 0;
    
    for (size_t i = size; i < size+9; ++i)
        stringdata[i] = 0;
    return true;
}
bool generate_sinha_randomASCII()
{
    if (!gopt_inputsize) {
        std::cout << "Random input size must be specified via '-s <size>'" << std::endl;
        return false;
    }
    size_t size = gopt_inputsize;
    
    char* stringdata = allocate_stringdata(size, "randomASCII");
    if (!stringdata) return false;
    g_string_count = 0;
    srandom(73802);
    size_t slen = (rand() % 20); 
    g_string_count++;
    for (size_t i = 0; i < size; ++i)
    {
        if (i == slen) 
        {
            stringdata[i] = 0;
            slen += 1 + (rand() % 20); 
            if (i+1 < size)
                g_string_count++;
        }
        else
        {
            int value = rand() % 127;
            if (value > 32 && value < 127)
                stringdata[i] = value;
            else
                i--;
        }
    }
    if (gopt_suffixsort) g_string_count = size;
    
    stringdata[ size-1 ] = 0;
    
    for (size_t i = size; i < size+9; ++i)
        stringdata[i] = 0;
    return true;
}
bool load_artifical(const std::string& path)
{
    if (path == "random2") {
        return generate_random("random2", "01");
    }
    else if (path == "random4") {
        return generate_random("random4", "ACGT");
    }
    else if (path == "random10") {
        return generate_random("random10", "0123456789");
    }
    else if (path == "random62") {
        return generate_random("random62", "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
    }
    else if (path == "random255")
    {
        std::string letters(255,0);
        for (int i = 0; i < 255; ++i) letters[i] = (char)(i+1);
        return generate_random("random255", letters);
    }
    else if (path == "randomASCII") {
        return generate_sinha_randomASCII();
    }
    else
        return false;
}
bool parse_filesize(const char* str, size_t& outsize)
{
    char* endptr;
    outsize = strtoul(str,&endptr,10);
    if (!endptr) return false;
    if ( *endptr == 0 || ( (*endptr == 'b' || *endptr == 'B') && *(endptr+1) == 0) )
        outsize *= 1;
    else if ( (*endptr == 'k' || *endptr == 'K') &&
              (*(endptr+1) == 0 || ( (*(endptr+1) == 'b' || *(endptr+1) == 'B') && *(endptr+2) == 0) ) )
        outsize *= 1024;
    else if ( (*endptr == 'm' || *endptr == 'M') &&
              (*(endptr+1) == 0 || ( (*(endptr+1) == 'b' || *(endptr+1) == 'B') && *(endptr+2) == 0) ) )
        outsize *= 1024*1024;
    else if ( (*endptr == 'g' || *endptr == 'G') &&
              (*(endptr+1) == 0 || ( (*(endptr+1) == 'b' || *(endptr+1) == 'B') && *(endptr+2) == 0) ) )
        outsize *= 1024*1024*1024;
    else
        return false;
    return true;
}
bool load(const std::string& path)
{
    double ts1 = omp_get_wtime();
    if (load_artifical(path)) {
        g_dataname = path;
    }
    else if (load_compressed(path)) {
    }
    else if (load_plain(path)) {
    }
    else {
        return false;
    }
    double ts2 = omp_get_wtime();
    std::cout << "Loaded input in " << ts2-ts1 << " sec with "
              << (g_string_datasize / (ts2-ts1) / 1024.0 / 1024.0) << " MiB/s" << std::endl;
    protect_stringdata();
    return true;
}
}