Fast file line extraction with indexing

Narue 4 Tallied Votes 362 Views Share

This is an example of using a binary index file to speed up random access of lines in a file. The file is indexed one time, where the position of the start of each line is stored in an index file. Subsequent requests for a get the position from the index file and seek directly to it.

Included is the usual naive brute force search for lines to compare performance.

Ancient Dragon commented: Excellent :) +35
Clinton Portis commented: good code snippet... learned a few things. still trying to figure some things out. +6
#include <algorithm>
#include <ios>
#include <iostream>
#include <fstream>
#include <stdexcept>
#include <sstream>
#include <string>
#include <typeinfo>
#include <vector>
#include <cctype>
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <ctime>

namespace jsw {
    /**
        Simple implementation of Boost's lexical_cast to avoid dependencies
    */
    template <typename Target, typename Source>
    Target lexical_cast(Source arg)
    {
        std::stringstream interpreter;
        Target result;

        if (!(interpreter << arg && interpreter >> result))
            throw std::bad_cast();

        return result;
    }

    class timer {
        bool _running;
        time_t _start;
        time_t _stop;
        double _elapsed;
    public:
        timer() { start(); }

        void start()
        {
            _start = std::time(0);
            _running = true;
        }

        void stop()
        {
            _stop = std::time(0);
            _elapsed = std::difftime(_stop, _start);
            _running = false;
        }

        double elapsed() const
        {
            return _running ? std::difftime(std::time(0), _start) : _elapsed;
        }
    };
}

namespace {
    const std::string null_file = "dev/null"; // Display value when no file is opened

    struct lowercase {
        int operator()(int c) { return std::tolower(c); }
    };

    std::vector<std::string> split(std::string s);
    std::string commands();
    std::string generate_index_filename(const std::string& filename);
    std::string generate_index(std::istream& in, const std::string& filename);
    bool slow_search(std::istream& in, std::streamsize i, std::string& match);
    bool fast_search(std::istream& in, std::istream& index, std::streamsize i, std::string& match);
}

int main()
{
    std::string open_file = null_file;
    std::ifstream source, index;
    std::string line;
    bool ignore_slow;

    while (true) {
        std::cout<< open_file <<"> ";
            
        if (!getline(std::cin, line))
            break;

        std::transform(line.begin(), line.end(), line.begin(), lowercase());
        std::vector<std::string> cmd_parts = split(line);

        try {
            if (cmd_parts.at(0) == "exit")
                break;
            else if (cmd_parts.at(0) == "?")
                std::cout<< commands() <<'\n';
            else if (cmd_parts.at(0) == "open") {
                source.open(cmd_parts.at(1).c_str());

                if (source.is_open()) {
                    open_file = cmd_parts.at(1);
                    index.open(generate_index_filename(open_file).c_str());
                }
                else
                    std::cerr<<"Error opening file: " + std::string(std::strerror(errno)) <<'\n';

                ignore_slow = false; // Warn the first time about non-indexed searches
            }
            else if (cmd_parts.at(0) == "close") {
                source.clear();
                source.close();
                open_file = null_file;
            }
            else if (cmd_parts.at(0) == "index") {
                if (!source.is_open()) {
                    std::cerr<<"No file open\n";
                    continue;
                }

                index.open(generate_index(source, open_file).c_str(), std::ios::binary);

                if (!index.is_open())
                    std::cerr<<"Error generating index: " + std::string(std::strerror(errno)) <<'\n';
            }
            else if (cmd_parts.at(0) == "unindex") {
                if (!source.is_open()) {
                    std::cerr<<"No file open\n";
                    continue;
                }

                if (index.is_open()) {
                    index.clear();
                    index.close();
                }

                std::remove(generate_index_filename(open_file).c_str());
            }
            else if (cmd_parts.at(0) == "line") {
                if (!source.is_open()) {
                    std::cerr<<"No file open\n";
                    continue;
                }

                std::string match;
                jsw::timer timer;
                bool found;

                if (index.is_open()) {
                    timer.start();
                    found = fast_search(source, index, jsw::lexical_cast<std::streamsize>(cmd_parts.at(1)), match);
                    timer.stop();
                }
                else {
                    if (!ignore_slow) {
                        std::cout<< open_file <<" has not been indexed; search may be slow. Continue (y/n)? ";

                        if (!getline(std::cin, line) || line.empty() || std::tolower(line[0]) != 'y')
                            continue;

                        ignore_slow = true; // Stop warning to avoid annoying the user
                    }

                    timer.start();
                    found = slow_search(source, jsw::lexical_cast<std::streamsize>(cmd_parts.at(1)), match);
                    timer.stop();
                }

                if (found)
                    std::cout<<"Line #"<< cmd_parts.at(1) <<": '"<< match <<"'\n";
                else
                    std::cout<<"Line #"<< cmd_parts.at(1) <<" not found\n";

                std::cout<<"Search took "<< std::fixed << timer.elapsed() <<" seconds\n";
            }
            else
                std::cerr<<"Unrecognized command '"<< cmd_parts.at(0) <<"'\n";
        } catch (std::out_of_range&) {
            std::cerr<<"Expected command or parameter missing\n";
        } catch (std::bad_cast&) {
            std::cerr<<"Invalid value for expected parameter\n";
        }
    }
}

namespace {
    std::vector<std::string> split(std::string s)
    {
        std::vector<std::string> words;
        std::stringstream in(s);
        std::string word;

        while (in>> word)
            words.push_back(word);

        return words;
    }

    std::string commands()
    {
        return 
            "?                  -- Display help information\n"
            "exit               -- Exit the application\n"
            "open <file>        -- Open the specified file\n"
            "close              -- Close the currently open file\n"
            "index              -- Index the opened file by lines\n"
            "unindex            -- Remove index information for the current file\n"
            "line <line number> -- Display the specified line from the open file\n";
    }

    std::string generate_index_filename(const std::string& filename)
    {
        std::string index_filename;
    
        if (filename.find_last_of('.') != std::string::npos) {
            index_filename =
                filename.substr(0, filename.find_last_of('.')) + "_index" + 
                filename.substr(filename.find_last_of('.'), std::string::npos);
        }
        else
            index_filename = filename + "_index";

        return index_filename;
    }

    std::string generate_index(std::istream& in, const std::string& filename)
    {
        std::string index_filename = generate_index_filename(filename);
        std::string line;

        std::ofstream index(index_filename.c_str(), std::ios::binary);

        if (!index)
            return "";

        in.clear();
        in.seekg(0, std::ios::beg);

        do {
            std::istream::pos_type pos = in.tellg();

            index.write((char*)&pos, sizeof(std::istream::pos_type));
        } while (std::getline(in, line));

        return index_filename;
    }

    bool fast_search(std::istream& in, std::istream& index, std::streamsize i, std::string& match)
    {
        if (i == 0)
            return false;

        std::istream::pos_type saved_index;
        std::streamsize index_file_size;
        std::string line;

        in.clear();
        index.clear();

        index.seekg(0, std::ios::end);
        index_file_size = index.tellg() / sizeof(std::istream::pos_type);

        // Use a 1-based line match rather than 0-based
        if (i < index_file_size) {
            index.seekg((i - 1) * sizeof(std::istream::pos_type), std::ios::beg);
            index.read((char*)&saved_index, sizeof saved_index);
            in.seekg(saved_index, std::ios::beg);
            
            if (std::getline(in, line)) {
                match = line;
                return true;
            }
        }

        return false;
    }

    bool slow_search(std::istream& in, std::streamsize i, std::string& match)
    {
        if (i == 0)
            return false;

        std::string line;

        in.clear();
        in.seekg(0, std::ios::beg);

        // Use a 1-based line match rather than 0-based
        while (i > 0 && std::getline(in, line))
            --i;

        if (i == 0) {
            match = line;
            return true;
        }

        return false;
    }
}