panthema / 2012 / 1119-eSAIS-Inducing-Suffix-and-LCP-Arrays-in-External-Memory / eSAIS-DC3-LCP-0.5.4 / src / esactest.cc (Download File)
/******************************************************************************
 * src/esactest.cc - Main program to test external memory suffix sorters
 *
 * Usage: ./esactest <input>
 *
 * This main program is designed to quickly test different external memory
 * suffix sorters on a wide variety of inputs. The <input> is specified on the
 * command line and can be:
 *
 * - plain data files located under $HOME/sac-corpus/
 *
 * - gzip compressed data files in $HOME/sac-corpus/, which are decompressed
 *   on-the-fly. These must be named datafile.123456.gz, where 123456 is the
 *   _decompressed_ size of the data file.
 *
 * - artificial inputs generated by functions in the input/ sub-directory.
 *
 * - verbatim strings can be processed as "simple/<string>"
 * 
 * Call the program without arguments for a list of available inputs. The input
 * is loaded into a stxxl vector and then processed by the external memory
 * suffix sorters that this main program was compiled with.
 *
 * Size of the input can be limited via the -s command line parameter.
 *
 ******************************************************************************
 * Copyright (C) 2012-2013 Timo Bingmann <tb@panthema.net>
 *
 * This program is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program.  If not, see <http://www.gnu.org/licenses/>.
 *****************************************************************************/

#ifndef LCP_CALC
#define LCP_CALC                1
#endif // LCP_CALC

#include <stdlib.h>
#include <string.h>

#include <assert.h>
#include <stdio.h>
#include <sys/stat.h>
#include <limits.h>
#include <inttypes.h>
#include <math.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <dirent.h>
#include <getopt.h>

#include <iostream>
#include <iomanip>
#include <sstream>
#include <fstream>
#include <vector>
#include <limits>
#include <stdexcept>
#include <algorithm>
#include <numeric>
#include <iterator>
#include <stack>
#include <set>
#include <map>
#include <ext/algorithm>
#include <parallel/algorithm>
#include <tr1/unordered_map>

#include <stxxl/bits/algo/sort.h>
#include <stxxl/bits/containers/priority_queue.h>
#include <stxxl/bits/containers/vector.h>
#include <stxxl/bits/containers/queue.h>
#include <stxxl/bits/containers/stack.h>
#include <stxxl/bits/containers/sorter.h>
#include <stxxl/bits/containers/sequence.h>
#include <stxxl/bits/io/iostats.h>
#include <stxxl/bits/mng/buf_istream.h>
#include <stxxl/bits/mng/buf_istream_reverse.h>
#include <stxxl/bits/mng/buf_ostream.h>
#include <stxxl/bits/stream/sort_stream.h>
#include <stxxl/bits/stream/stream.h>
#include <stxxl/bits/common/uint_types.h>

#include <omp.h>
#include <zlib.h>

// globally available information
std::string	dataname;

// run data generation inside a child process
bool gopt_forkrun = false;

// algorithms selected at commandline
std::vector<const char*> gopt_algorithm;

// size limit specified on commandline
stxxl::uint64 gopt_sizelimit = 0;

// write input string to file name matching name (for debugging artificial).
bool gopt_writeinput = false;

// write output suffix and lcp array (to file name matching input)
bool gopt_writeoutput = false;

// file name of statistics output
static const char* statsfile = "sac-runs1.txt";

#ifndef MEMLIMIT
// RAM used by external memory algorithms - DO NOT set this to the total memory
// available, other structure besides the external sorters need memory as
// well. Try 80% of available RAM.
static const size_t ram_use = 4*1024*1024*1024LLU;

#ifndef BLOCKSIZE
static const size_t block_size = 1024*1024;
#else
static const size_t block_size = BLOCKSIZE*1024;
#endif

#else

static const size_t ram_use = MEMLIMIT*1024*1024LLU;

#if MEMLIMIT <= 128
static const size_t block_size = 16*1024;
#elif MEMLIMIT <= 512
static const size_t block_size = 32*1024;
#else
static const size_t block_size = 64*1024;
#endif

#endif

#define CHECK_RESULT    1

// ****************************************************************************

#include "tools/debug.h"
#include "tools/timerseries.h"
#include "tools/statsfile.h"
#include "tools/lp_hash_table.h"
#include "tools/lcp.h"
#include "tools/losertree.h"
#include "tools/radixsort.h"
#include "tools/rmq_succinct.h"
#include "tools/rmq_stack.h"

//typedef uint32_t      offset_type;
typedef stxxl::uint40   offset_type;

// Maximum input size processed. This is only needed to calculate the maximum
// number of items in eSAIS's PQs, due to the PQ design. This can be grossly
// overstated.
//static const size_t max_input_size = 4*1024*1024*1024LLU;
static const size_t max_input_size = 80*1024*1024*1024LLU;

// globally available stats writer cache for algorithms to append other useful
// information.
static StatsCache       g_statscache;

#include "external/tuples.h"
#include "external/sacheck.h"
#include "external/lcp.h"

#include "external/skew3.h"
#include "external/esais.h"

#include "input/input.h"

// ****************************************************************************

// this selects the alphabet_type for SA construction algorithms

typedef InputASCII  InputType;
InputType           input;

typedef InputType::alphabet_type    alphabet_type;

typedef stxxl::VECTOR_GENERATOR<InputType::alphabet_type,4,8,block_size,
                                STXXL_DEFAULT_ALLOC_STRATEGY,
                                stxxl::random>::result      alphabet_vector_type;

typedef stxxl::VECTOR_GENERATOR<offset_type,4,8,block_size,
                                STXXL_DEFAULT_ALLOC_STRATEGY,
                                stxxl::random>::result      offset_vector_type;

namespace std {
namespace tr1 {

template <>
struct hash<stxxl::uint40> : public unary_function<stxxl::uint40, size_t>
{
    size_t operator()(const stxxl::uint40& v) const
    {
        return v.ull();
    }
};

} // namespace tr1
} // namespace std

/// helper to print out readable characters
template <typename alphabet_type>
static inline std::string strC(alphabet_type c)
{
    std::ostringstream oss;
    if (c < 128 && isalnum(c)) oss << '\'' << (char)c << '\'';
    else oss << (int)c;
    return oss.str();
}

template <typename InputType>
class AlgoRunner
{
public:

    template < template <typename StringContainer,typename SAContainer> class SACA >
    void run(alphabet_vector_type& string)
    {
        offset_vector_type      suffixarray (string.size());

#if LCP_CALC
        offset_vector_type      lcparray (string.size());
#endif // LCP_CALC

        g_statscache.clear();

        double ts1, ts2;

        // Measure io-accesses
        stxxl::stats* stxxlstats = stxxl::stats::get_instance();
        stxxl::stats_data stats1, stats2;

        {
            SACA<alphabet_vector_type, offset_vector_type> saca;

            if (!gopt_algorithm_match(saca.name().c_str())) {
                (std::cout << "Skipping disabled " << saca.name() << "!\n").flush();
                return;
            }

            (std::cout << "Running " << saca.name() << ":\n").flush();

            g_statscache >> "sacaname" << saca.name()
                         >> "dataname" << dataname
                         >> "size" << string.size();

            saca.prepare(string, suffixarray, InputType::K);
            assert( suffixarray.size() == string.size() );

            stats1 = *stxxlstats;
            g_stats = *stxxlstats;
            ts1 = omp_get_wtime();

#if LCP_CALC
            saca.run_lcp(string, suffixarray, lcparray, InputType::K);
#else // !LCP_CALC
            saca.run(string, suffixarray, InputType::K);
#endif // LCP_CALC

            ts2 = omp_get_wtime();
            stats2 = *stxxlstats;

            g_statscache >> "time" << (ts2-ts1);
        }

        stxxl::stats_data stats = stats2 - stats1;

        std::cout << std::fixed << std::setprecision(2) << (ts2-ts1);

        if (getenv("DEBUGOUTPUT") && *getenv("DEBUGOUTPUT") == '1')
        {
            std::cout << "\nResulting suffix array: \n";

            std::vector<alphabet_type> stringRAM (string.begin(), string.end());

            for (size_t i = 0; i < suffixarray.size(); ++i)
            {
                std::cout << i << " : " << suffixarray[i] << " : ";

                for (size_t j = 0; (size_t)suffixarray[i]+j < stringRAM.size() && j < 20; ++j)
                {
                    std::cout << strC(stringRAM[(size_t)suffixarray[i]+j]) << " ";
                }

                std::cout << "\n";
            }
        }

#if CHECK_RESULT
        (std::cout << " checking ").flush();

        if (! sacheck::check_sa_vectors(string, suffixarray) )
        {
            std::cout << "failed!" << std::endl;

            g_statscache >> "status" << "failed";

            StatsWriter out(statsfile);
            write_iostats(out, stats);
            out.append_statsmap(g_statscache);
        }
        else
        {
            std::cout << "SA-ok" << std::endl;

            // gcc hack to get 128-bit integers for summing LCPs
            typedef unsigned int uint128_t __attribute__((mode(TI)));

            if (getenv("LCP") && *getenv("LCP") == '1')
            {
                (std::cout << "Generating LCP array: ").flush();

                offset_vector_type lcpkasai;
                lcp::lcparray_stxxl_kasai(string, suffixarray, lcpkasai);

                offset_type maxlcp = 0;
                uint128_t sumlcp = 0;

                typedef stxxl::stream::vector_iterator2stream<offset_vector_type::const_iterator> lcpstream_type;

                offset_type i = 0;
                for ( lcpstream_type it(lcpkasai.begin(), lcpkasai.end()); !it.empty(); ++it, ++i )
                {
                    maxlcp = std::max<offset_type>(maxlcp, *it);
                    sumlcp += (uint128_t)*it;
                    std::cout << "SA[" << i << "] = " << suffixarray[i] << " - LCP[" << i << "] = " << *it << "\n";
                }

                g_statscache >> "maxlcp0" << maxlcp;
                g_statscache >> "avglcp0" << ((double)sumlcp / (double)string.size());

                std::cout << "maxlcp = " << maxlcp << ", avglcp = " << (double)sumlcp / (double)string.size() << "\n";
            }

#if LCP_CALC
            {
                (std::cout << "Generating LCP array: ").flush();

                offset_vector_type lcpkasai;
                lcp::lcparray_stxxl_kasai(string, suffixarray, lcpkasai);

                typedef stxxl::stream::vector_iterator2stream<offset_vector_type::const_iterator> lcpstream_type;

                offset_type maxlcp = 0;
                uint128_t sumlcp = 0;

                bool lcpgood = true;
                offset_type i = 0;
                for ( lcpstream_type it(lcpkasai.begin(), lcpkasai.end()); !it.empty(); ++it, ++i )
                {
                    if ( lcparray[i] != *it ) {
                        std::cout << "WRONG: SA[" << i << "] = " << suffixarray[i] << " - LCP[" << i << "] = " << lcparray[i] << " which should be " << *it << "\n";
                        lcpgood = false;
                    }
                    maxlcp = std::max<offset_type>(maxlcp, *it);
                    sumlcp += (uint128_t)*it;
                }
                std::cout << (lcpgood ? "LCP-ok" : "LCP-failed!") << "\n";

                std::cout << "maxlcp = " << maxlcp << ", sumlcp = " << (size_t)sumlcp << "\n";

                g_statscache >> "maxlcp0" << maxlcp;
                g_statscache >> "avglcp0" << ((double)sumlcp / (double)string.size());

                std::cout << "maxlcp = " << maxlcp << ", avglcp = " << (double)sumlcp / (double)string.size() << ", sumlcp = " << (size_t)sumlcp << "\n";
            }
#endif // LCP_CALC

            g_statscache >> "status" << "ok";
        }
#else
        std::cout << " skipping checks\n";
#endif // CHECK_RESULT

#if LCP_CALC && !CHECK_RESULT
        // if result unchecked, still calculate lcp statistics
        {
            typedef stxxl::stream::vector_iterator2stream<offset_vector_type::const_iterator> lcpstream_type;

            // gcc hack to get 128-bit integers for summing LCPs
            typedef unsigned int uint128_t __attribute__((mode(TI)));

            offset_type maxlcp = 0;
            uint128_t sumlcp = 0;

            for ( lcpstream_type it(lcparray.begin(), lcparray.end()); !it.empty(); ++it )
            {
                maxlcp = std::max<offset_type>(maxlcp, *it);
                sumlcp += (uint128_t)*it;
            }

            g_statscache >> "maxlcp0" << maxlcp;
            g_statscache >> "avglcp0" << ((double)sumlcp / (double)string.size());

            std::cout << "maxlcp = " << maxlcp << ", avglcp = " << (double)sumlcp / (double)string.size() << "\n";
        }
#endif
        // print timing data out to results file
        StatsWriter out(statsfile);
        write_iostats(out, stats);
        out.append_statsmap(g_statscache);

        // maybe write output to filesystem
        if (gopt_writeoutput)
        {
            std::string outfile = dataname + ".sa" + (char)('0' + sizeof(offset_type));
            std::ofstream out(outfile.c_str());

            std::cout << "Writing SA output to " << outfile << "\n";

            typedef stxxl::stream::vector_iterator2stream<offset_vector_type::const_iterator> stream_type;

            const unsigned int buffersize = 1024*1024;
            offset_vector_type::value_type outbuffer[buffersize];
            unsigned int bufferfill = 0;

            for ( stream_type s(suffixarray.begin(), suffixarray.end()); !s.empty(); ++s )
            {
                outbuffer[ bufferfill++ ] = *s;

                if (bufferfill == buffersize) {
                    out.write((char*)&outbuffer[0], sizeof(outbuffer[0]) * buffersize);
                    bufferfill = 0;
                }
            }

            if (bufferfill)
                out.write((char*)&outbuffer[0], sizeof(outbuffer[0]) * bufferfill);
        }

#if LCP_CALC
        // maybe write output to filesystem
        if (gopt_writeoutput)
        {
            std::string outfile = dataname + ".lcp" + (char)('0' + sizeof(offset_type));
            std::ofstream out(outfile.c_str());

            std::cout << "Writing LCP output to " << outfile << "\n";

            typedef stxxl::stream::vector_iterator2stream<offset_vector_type::const_iterator> stream_type;

            const unsigned int buffersize = 1024*1024;
            offset_vector_type::value_type outbuffer[buffersize];
            unsigned int bufferfill = 0;

            for ( stream_type s(lcparray.begin(), lcparray.end()); !s.empty(); ++s )
            {
                outbuffer[ bufferfill++ ] = *s;

                if (bufferfill == buffersize) {
                    out.write((char*)&outbuffer[0], sizeof(outbuffer[0]) * buffersize);
                    bufferfill = 0;
                }
            }

            if (bufferfill)
                out.write((char*)&outbuffer[0], sizeof(outbuffer[0]) * bufferfill);
        }
#endif // LCP_CALC
    }

    template < template <typename StringContainer,typename SAContainer> class SACA >
    void runF(alphabet_vector_type& string)
    {
        int pid = fork();
        if (pid == 0)
        {
            alarm(15*60);	// time limit in seconds

            run<SACA>(string);

            exit(0);
        }

        int status;
        pid_t ch = wait(&status);

        if (ch != pid) {
            std::cout << "Different child returned?!?" << std::endl;
            abort();
        }

        if (!WIFEXITED(status)) {
            std::cout << "Child terminated abnormally with signal " << WTERMSIG(status) << std::endl;

            // write out exit status information to results file

            std::string sacaname = SACA<alphabet_vector_type,offset_vector_type>().name();

            StatsWriter out(statsfile);
            out >> "sacaname" << sacaname
                >> "dataname" << dataname
                >> "size" << string.size();

            if (WTERMSIG(status) == SIGALRM)
            {
                out >> "status" << "timeout";
            }
            else if (WTERMSIG(status) == SIGSEGV)
            {
                out >> "status" << "segfault";
            }
            else if (WTERMSIG(status) == SIGABRT)
            {
                out >> "status" << "aborted";
            }
            else
            {
                out >> "status" << "SIG" << WTERMSIG(status);
            }
        }
        else {
            // std::cout << "Child terminated with code " << WEXITSTATUS(status) << std::endl;
        }
    }

    void write_iostats(StatsWriter& sw, const stxxl::stats_data& stats)
    {
#ifdef STXXL_PARALLEL_MODE_EXPLICIT
        static const bool parallel_stxxl = true;
#else
        static const bool parallel_stxxl = false;
#endif
        sw >> "ndisks" << stxxl::config::get_instance()->disks_number()
           >> "alphabet_type" << sizeof(alphabet_type)
           >> "offset_type" << sizeof(offset_type)
           >> "block_size" << block_size
           >> "ram_use" << ram_use
           >> "parallel_stxxl" << parallel_stxxl
           >> "writevolume" << stats.get_written_volume()
           >> "readvolume" << stats.get_read_volume()
           >> "iovolume" << (stats.get_read_volume() + stats.get_written_volume())
           >> "writes" << stats.get_writes()
           >> "reads" << stats.get_reads()
           >> "readtime" << stats.get_read_time()
           >> "writetime" << stats.get_write_time()
           >> "preadtime" << stats.get_pread_time()
           >> "pwritetime" << stats.get_pwrite_time()
           >> "piotime" << stats.get_pio_time()
           >> "iowaittime" << stats.get_io_wait_time()
           >> "maxalloc" << stxxl::block_manager::get_instance()->get_maximum_allocation();
    }

    static inline bool gopt_algorithm_match(const char* algoname)
    {
        if (!gopt_algorithm.size()) return true;

        // iterate over gopt_algorithm list as a filter
        for (size_t ai = 0; ai < gopt_algorithm.size(); ++ai) {
            if (strstr(algoname, gopt_algorithm[ai]) != NULL)
                return true;
        }

        return false;
    }
};

void print_usage(const char* prog)
{
    std::cout << "Usage: " << prog << " [options] <dataname>" << std::endl
              << "Options:" << std::endl
              << "  -a, --algo <match>          Run only algorithms containing this substring." << std::endl
              << "  -F, --fork                  Fork before running algorithm and load data within fork!" << std::endl
              << "  -s, --size <size>           Limit the input size to this number of characters." << std::endl
              << "      --writeinput            Write input string to file matching dataname." << std::endl
              << "  -W, --writeoutput           Write output SA/LCP to files matching dataname." << std::endl
              << std::endl
        ;
}

int main(int argc, char* argv[])
{
#ifdef DMALLOC
    /* set the 'high' flags */
    //dmalloc_debug_setup("debug=0x4f47d03,log=dmalloc.log");
#endif

    // parse command line parameters

    while (1)
    {
        static const struct option longopts[] = {
            { "algo",    required_argument,  0, 'a' },
            { "help",    no_argument,        0, 'h' },
            { "fork",    no_argument,        0, 'F' },
            { "writeinput", no_argument,     0, 1 },
            { "writeoutput", no_argument,    0, 'W' },
            { 0,0,0,0 },
        };

        int index;
        int argi = getopt_long(argc, argv, "hs:FWa:", longopts, &index);

        if (argi < 0) break;

        switch (argi)
        {
        case 'h':
            print_usage(argv[0]);
            print_datasource_list();
            return 0;

        case 'a':
            gopt_algorithm.push_back(optarg);
            std::cout << "Option -a: selecting algorithms containing " << optarg << std::endl;
            break;

        case 'F':
            gopt_forkrun = true;
            std::cout << "Option -F: forking before each algorithm run and loading data after fork." << std::endl;
            break;

        case 1:
            gopt_writeinput = true;
            std::cout << "Option --writeinput: writing input to file names matching dataname." << std::endl;
            break;

        case 'W':
            gopt_writeoutput = true;
            std::cout << "Option -W: writing output SA/LCP to file names matching input dataname." << std::endl;
            break;

        case 's':
            if (!stxxl::parse_SI_IEC_size(optarg, gopt_sizelimit)) {
                std::cout << "Option -s: invalid size parameter: " << optarg << std::endl;
                exit(EXIT_FAILURE);
            }
            std::cout << "Option -s: limiting input size to " << gopt_sizelimit << std::endl;
            break;

        default:
            std::cout << "Invalid parameter: " << argi << std::endl;
            print_usage(argv[0]);
            exit(EXIT_FAILURE);
        }
    }

    if (optind == argc) { // no input data parameter given
        print_usage(argv[0]);
        print_datasource_list();
        return 0;
    }

    // pass remaining command line parameters
    datasource_list_type datasource_list;
    if (!select_datasource_list(argc - optind,argv + optind,datasource_list))
        return 0;

    // run construction algorithms on all datasources specified

    for (datasource_list_type::const_iterator ds = datasource_list.begin();
         ds != datasource_list.end(); ++ds)
    {
        AlgoRunner<InputType> r;

        dataname = ds->name;

        int pid = gopt_forkrun ? fork() : 0;
        if (pid == 0)
        {
            alphabet_vector_type string;

            if (!input.get(*ds, string)) {
                std::cout << "Could not get input string.\n";
                return 10;
            }

#ifdef COMPILE_ALGORITHM

            r.run<COMPILE_ALGORITHM::SACA>(string);
            return 0;

#else

            //r.run<skew::SACA>(string);
            r.run<esais::SACA>(string);

            return 0;
#endif

            if (gopt_forkrun) return 0;
            else continue;
        }

        int status;
        pid_t ch = wait(&status);

        if (ch != pid) {
            std::cout << "Different child returned?!?" << std::endl;
            abort();
        }

        if (!WIFEXITED(status)) {
            std::cout << "Child terminated abnormally with signal " << WTERMSIG(status) << std::endl;
        }
        else {
            // std::cout << "Child terminated with code " << WEXITSTATUS(status) << std::endl;
            if (WEXITSTATUS(status) == 10) break;
        }

        return 0;
    }

    return 0;
}