I am trying to find a bug in my new and (theoretically) improved syntax highlighter program. I have made 3 versions now and have been able to debug them all with ease. Unfortunately my debugger does not do well with std containers as it shows ALL the information they contain and it can be hard to sift through it to find errors. Can anybody see where I went wrong in this code?

evenBetterMain.cpp: (since I already have main.cpp and betterMain.cpp working :P)

#include <iostream>     //for cin/cout
#include <string>       //for std::strings, obviously
#include <fstream>      //for file operations
#include <stdio.h>      //for integer parsing
using namespace std;
string readFile(string fileName)
{
    fstream file(fileName.c_str()); //open the file
    if (!file.is_open())            //check that it opened
        return "";
    string ret="";                  //initialize the return value
    while (!file.eof())             //loop through the whole file
        ret+=file.get();            //grab a character and append it to the return value
    file.close();                   //close the file
    return ret;
}
void writeFile(string fileName, string text)
{
    fstream file(fileName.c_str()); //open the file
    if (!file.is_open())            //check that it opened
        return;
    file<<text;                     //write the text
    file.close();                   //close the file
}
struct highlightFormatter
{
    string start;           //this is written at the start of the information
    string end;             //this is written at the end of the information
    string newLine;         //this is written at the end of each line of the information
    string code;            //this is written at the start of a segment labeled as code
    string comment;         //this is written at the start of a segment labeled as comment
    string documentation;   //this is written at the start of a documentation comment
    string preprocessor;    //this is written at the start of a preprocessor
    string str;             //this is written at the start of a string literal
    string chr;             //this is written at the start of a character literal
    string keyword;         //this is written at the start of a keyword
    string docKeyword;      //this is written at the start of a documentation keyword
    string op;              //this is written at the start of an operator
    string number;          //this is written at the start of a numerical literal
    string keywords;        //this stores all the words identified as keywords, seperated by commas
};
highlightFormatter readFormatFile(string filename)
{
    highlightFormatter ret;         //declare the return value
    fstream file(filename.c_str()); //open the file
    if (!file.is_open())            //check that the file opened
        return ret;
    //I am NOT commenting the following lines, they are self-explanatory
    getline(file,ret.start);
    getline(file,ret.end);
    getline(file,ret.newLine);
    getline(file,ret.code);
    getline(file,ret.comment);
    getline(file,ret.documentation);
    getline(file,ret.preprocessor);
    getline(file,ret.str);
    getline(file,ret.chr);
    getline(file,ret.keyword);
    getline(file,ret.docKeyword);
    getline(file,ret.op);
    getline(file,ret.number);
    getline(file,ret.keywords);
    file.close();
    return ret;
}
string strFromInt(int i)
{
    char ret[20];//I do not believe ints can be longer than 20 digits?
    sprintf(ret,"%d",i);
    return string(ret);
}
void replaceAll(string &str, char find, string replacement/*a #X indicates to add a number starting at X, #! is a new line*/)
{
    int location=0;
    do{
        location=str.find_first_of(find,location);
        if (location>=0)
            str.replace(location,1,replacement);
    }while(location!=-1);
    location=0;//now storing numbers
    for (int i=0; i<(int)str.length(); ++i)
    {
        if (str[i]=='#'&&i+1<(int)str.length()&&str[i+1]>='0'&&str[i+1]<='9')
            location=str[i+1]-'0';
        else if (str[i]=='#'&&i+1<(int)str.length()&&str[i+1]=='#')
            str.replace(i,2,strFromInt(location++));
        else if (str[i]=='#'&&i+1<(int)str.length()&&str[i+1]=='!')
            str.replace(i,2,"\n");
    }
}
bool isAlNum(char ch)
{
    return (ch>='A'&&ch<='Z')||
           (ch>='a'&&ch<='z')||
           (ch>='0'&&ch<='9');
}
bool strContains(string &str,string &search)
{
    return str.find(search)!=(size_t)-1;
}
string highlight(string inputText, highlightFormatter formatter)
{
    //Declaring some constants
    const char cCODE            ='0';
    const char cCOMMENT         ='c';
    const char cMLCOMMENT       ='C';
    const char cDOC             ='d';
    const char cMLDOC           ='D';
    const char cPREPROCESSOR    ='P';
    const char cSTRING          ='s';
    const char cCHAR            ='h';
    const char cKEYWORD         ='k';
    const char cDOCKEYWORD      ='K';
    const char cOPERATOR        ='o';
    const char cNUMBER          ='n';
    const char cENDL            ='\n';
    char state=cCODE;
    string format="";
    cout<<"Loading format string...\n";
    for (int i=0; i<(int)inputText.length(); ++i)//loop through the whole string
    {
        if (((i*100)/(inputText.length()))%5==0)
            cout<<(i*100)/inputText.length()<<"% formatted...\n";
        switch (inputText[i])
        {
            case '*'://weird character... endl comments and modifies them and is an operator.
            if ((state==cMLCOMMENT||state==cMLDOC)&&i+1<(int)inputText.length()&&inputText[i+1]=='/')
                state=cCODE;
            if (state==cCODE)
                format+=cOPERATOR;
            else
                format+=state;
            break;
            case '/'://Possible comment... possible operator
            if (state==cCHAR||state==cSTRING)//this is not to be parsed
            {
                format+=state;
                break;
            }
            if (i+1<(int)inputText.length()&&inputText[i+1]=='/')//check for inline comments
            {
                if (i+2<(int)inputText.length()&&(inputText[i+2]=='/'||inputText[i+2]=='!'))//check for inline documentation
                {
                    state=cDOC;
                    format+=state;
                    break;
                }
                else
                {
                    state=cCOMMENT;
                    format+=state;
                    break;
                }
            }
            else if (i+1<(int)inputText.length()&&inputText[i+1]=='*')//check for multiline comments
            {
                if (i+2<(int)inputText.length()&&(inputText[i+2]=='*'||inputText[i+2]=='!'))//check for multiline documentation
                {
                    state=cMLDOC;
                    format+=state;
                    break;
                }
                else
                {
                    state=cMLCOMMENT;
                    format+=state;
                    break;
                }
            }//else, treat as operator.
            case ','://OPERATORS
            case '<':case '>':case '=':case '|':case '%':case '^':case '&':
            case '(':case ')':case '+':case '-':case '~':case '.':
            case '{':case '}':case '[':case ']':case ':':case ';':case '!':
            if (state!=cPREPROCESSOR&&state!=cSTRING&&state!=cCHAR&&state!=cCOMMENT&&state!=cDOC&&state!=cMLCOMMENT&&state!=cMLDOC)
                format+=cOPERATOR;
            else
                format+=cPREPROCESSOR;
            break;
            case '#'://PREPROCESSOR?
            if (i>0&&inputText[i-1]=='\n')//I will not deal with indented preprocessor, yet.
            {
                state=cPREPROCESSOR;
                format+=cPREPROCESSOR;
            }
            else
            {
                format+=cPREPROCESSOR;
            }
            break;
            case '@'://dockeyword?
            if (state==cMLDOC||state==cDOC)
            {
                for (; i<(int)inputText.length()&&isAlNum(inputText[i]);++i)
                    format+=cDOCKEYWORD;
                --i;
            }
            break;
            case '\"'://string?
            if (state==cCODE)
            {
                format+=cSTRING;
                state=cSTRING;
            }
            else if (state==cSTRING)
            {
                format+=cSTRING;
                state=cCODE;
            }
            else
            {
                format+=state;
            }
            break;
            case '\''://character?
            if (state==cCODE)
            {
                format+=cCHAR;
                state=cCHAR;
            }
            else if (state==cCHAR)
            {
                format+=cCHAR;
                state=cCODE;
            }
            else
            {
                format+=state;
            }
            break;
            case '0':case '1':case '2':case '3':case '4':case '5':
            case '6':case '7':case '8':case '9'://NUMBERS!!!
            if (i<1||!isAlNum(inputText[i-1]))
            {
                for (; i<(int)inputText.length()&&isAlNum(inputText[i]); ++i)
                    format+=cNUMBER;
                --i;
            }
            else
            {
                format+=state;
            }
            break;
            case '\n'://newlines
            if (state!=cMLCOMMENT&&state!=cMLDOC)//check for multi-line comments
            {
                if (i>1&&inputText[i-1]=='\\'&&state==cPREPROCESSOR)//check for multi-line preprocessor
                {
                    format+=state;
                    break;
                }
                state=cCODE;
                format+=cENDL;
                break;
            }
            format+=cENDL;
            break;
            default://check for keyword
            if (state!=cCODE)
            {
                format+=state;
                break;
            }
            string thisWord="";
            for (;i<(int)inputText.length()&&isAlNum(inputText[i]);++i)
                thisWord+=inputText[i];
            --i;
            if (strContains(formatter.keywords,thisWord))
            {
                for (int ii=0; ii<(int)thisWord.length(); ++ii)
                    format+=cKEYWORD;
            }
            else
            {
                for (int ii=0; ii<(int)thisWord.length(); ++ii)
                    format+=state;
            }
        }
    }
    //now format should contain the format of inputText
    state=0;
    int offset=0;
    for (int i=0; i<(int)format.length(); ++i)
    {
        if (format[i]!=state)
        {
            state=format[i];
            inputText.insert(offset+i,1,-state);//plunk in an identifier
            ++offset;
        }
    }
    string ret=inputText;
    cout<<"Formatting for HTML...\n";
    replaceAll(ret,'&',"&amp;");
    replaceAll(ret,' ',"&nbsp;");
    replaceAll(ret,'>',"&gt;");
    replaceAll(ret,'<',"&lt;");
    replaceAll(ret,'\"',"&quot;");
    replaceAll(ret,-cCODE,formatter.code);
    replaceAll(ret,-cCOMMENT,formatter.comment);
    replaceAll(ret,-cMLCOMMENT,formatter.comment);
    replaceAll(ret,-cDOC,formatter.documentation);
    replaceAll(ret,-cMLDOC,formatter.documentation);
    replaceAll(ret,-cPREPROCESSOR,formatter.preprocessor);
    replaceAll(ret,-cSTRING,formatter.str);
    replaceAll(ret,-cCHAR,formatter.chr);
    replaceAll(ret,-cKEYWORD,formatter.keyword);
    replaceAll(ret,-cDOCKEYWORD,formatter.docKeyword);
    replaceAll(ret,-cNUMBER,formatter.number);
    replaceAll(ret,-cENDL,formatter.newLine);
    return formatter.start+ret+formatter.end;
}
int main(int argc, char *argv[])
{
    if (argc!=4)
    {
        //output help message
        cout<<"SyntaxHighlighter [input] [output] [specifiers]\n"<<
              "\tParses C++ code for HTML.\n"<<
              "\t\t[input] is the input filename.\n"<<
              "\t\t[output] is the output filename.\n"<<
              "\t\t[specifiers] is the specifiers filename.\n";
        return 1;
    }
    cout<<"Loading input file...\n";
    string text=readFile(argv[1]);
    cout<<"Loading format file...\n";
    highlightFormatter hf=readFormatFile(argv[3]);
    string tmp=highlight(text,hf);
    writeFile(argv[2],tmp);
    return 0;
}

formatters.txt: (so that you don't have to make one yourselves)

#2<table width="100%" border="0" cellspacing="0">#!<tr>#!<td width="2%">1.</td>#!<td width="98%" bgcolor="#FFFFFF"><div align="left">
</span></div></td>#!</tr></table><p>
</div></td></tr>#!<td width="2%">##.</td>#!<td bgcolor="#FFFFFFFF"><div align="left">
</span><span class="Code">
</span><span class="Comment">
</span><span class="DocComment">
</span><span class="Preprocessor">
</span><span class="String">
</span><span class="Character">
</span><span class="Keyword">
</span><span class="DocCommentKeyword">
</span><span class="Operator">
</span><span class="Constant">
asm auto bool break case catch char class const const_cast continue default delete do double dynamic_cast else enum explicit export extern false float for friend goto if inline int long mutable namespace new operator private protected public register reinterpret_cast restrict return short signed sizeof static static_cast struct switch template this throw true try typedef typeid typename union unsigned using virtual void volatile while int8_t uint8_t int16_t uint16_t int32_t uint32_t int64_t uint64_t int_least8_t uint_least8_t int_least16_t uint_least16_t int_least32_t uint_least32_t int_least64_t uint_least64_t int_fast8_t uint_fast8_t int_fast16_t uint_fast16_t int_fast32_t uint_fast32_t int_fast64_t uint_fast64_t intptr_t uintptr_t intmax_t uintmax_t wint_t wchar_t wctrans_t wctype_t size_t time_t and and_eq bitand bitor compl not not_eq or or_eq xor xor_eq complex imaginary _Complex _Imaginary _Bool _Pragma string wstring NULL

run.bat:

@echo off
SyntaxHighlighter.exe input.cpp output.txt formatters.txt
pause

input.cpp: Whatever you want to test it on.

Recommended Answers

All 8 Replies

It'd be helpful, as a start, if you gave an idea of what was going wrong. There is a lot of code there and even more context in your head that we are not privy to.

This program is designed to be run to turn an input file full of c++ code into an output file that contains that code parsed to be highlighted in an HTML document (or any other document for that matter) Basically I need to have it such that I insert strings before every change in syntax highlighting type. Here is another formatters.txt file that, were the program working, would illustrate the point better: (note that #! is parsed as a new line, ## is parsed as an incrementing counter and #X where X is 0-9 is parsed as setting that counter, also the last line is just a list of keywords)

#2START CODE#!(
)#!END CODE#!
)#!LINE # ##:(
)Code(
)Comment(
)DocumentationComment(
)Preprocessor(
)StringLiteral(
)CharacterLiteral(
)Keyword(
)DocumentationCommentKeyword(
)Operator(
)NumericLiteral(
asm auto bool break case catch char class const const_cast continue default delete do double dynamic_cast else enum explicit export extern false float for friend goto if inline int long mutable namespace new operator private protected public register reinterpret_cast restrict return short signed sizeof static static_cast struct switch template this throw true try typedef typeid typename union unsigned using virtual void volatile while int8_t uint8_t int16_t uint16_t int32_t uint32_t int64_t uint64_t int_least8_t uint_least8_t int_least16_t uint_least16_t int_least32_t uint_least32_t int_least64_t uint_least64_t int_fast8_t uint_fast8_t int_fast16_t uint_fast16_t int_fast32_t uint_fast32_t int_fast64_t uint_fast64_t intptr_t uintptr_t intmax_t uintmax_t wint_t wchar_t wctrans_t wctype_t size_t time_t and and_eq bitand bitor compl not not_eq or or_eq xor xor_eq complex imaginary _Complex _Imaginary _Bool _Pragma string wstring NULL

An example run would look like this:
input.cpp:

#include <iostream>//for cout
using namespace std;
///Main entry point @test!
int main()
{
    cout<<"Hello World"<<'\n';
    return 0;
}

output.txt:

START CODE()
LINE # 1:()Preprocessor(#include <iostream>)Comment(//for cout)
LINE # 2:()Keyword(using)Code( )Keyword(namespace)Code( std)Operator(;)
LINE # 3:()DocumentationComment(///Main entry point )DocumentationCommentKeyword(@test)DocumentationComment(!)
LINE # 4:()Keyword(int)Code( main)Operator(())
LINE # 5:()Operator({)
LINE # 6:()Code(    cout)Operator(<<)StringLiteral("Hello World")Operator(<<)CharacterLiteral('\n')Operator(;)
LINE # 7:()Code(    )Keyword(return)Code( )NumericLiteral(0)Operator(;)
LINE # 8:()Operator(})
END CODE

What I meant was that you say there is something wrong with the code but you do not give any hint as to where to look. For example, is it properly processing the input and giving bad output? Is there a crash at some point? Can you not even read the input? There are many stages here that could be the culprit - it would help to narrow the space for us.
Also, what, in particular, has changed from the two working copies that you have? That would be the perfect location to start your investigation.

Basically my code gets stuck at saying 0% formatted. I did a complete re-do from my previous version, but just for the sake of it here is my previous version:
bettermain.cpp: (not nearly as helpful and painfully difficult to modify, also glitchy)

#include <stdio.h>
#include <string>
using namespace std;
string readFile(string fname)
{
    FILE *file=fopen(fname.c_str(),"r");
    printf("Opening input file...\n");
    if (file==NULL)
        return "";
    string ret="";
    while (!feof(file))
        ret+=fgetc(file);
    fclose(file);
    return ret;
}
void writeFile(string fname, string text)
{
    FILE *file=fopen(fname.c_str(),"w");
    printf("Opening output file...\n");
    if (file==NULL)
        return;
    printf("Printing to file...\n");
    for (int i=0; text[i]; i++)
    {
        fputc(text[i],file);
        printf("%c",text[i]);
    }
    printf("Done printing to file...\n");
    fclose(file);
}
void replaceAll(string &text, char find, string replace)
{
    string ret="";
    for (int i=0; i<(int)text.length(); i++)
    {
        if (text[i]==find)
            ret+=replace;
        else
            ret+=text[i];
    }
    text.swap(ret);
}
int log(int x)
{
    int ret=0;
    while (x>0)
    {
        x/=10;
        ret++;
    }
    return ret;
}
void replaceAllInc(string &text, char find, int startindex)
{
    string ret="";
    int num=startindex;
    for (int i=0; i<(int)text.length(); i++)
    {
        if (text[i]==find)
        {
            char *temp=new char[log(num)+1];
            sprintf(temp,"%i",num++);
            ret+=temp;
            delete[]temp;
        }
        else
        {
            ret+=text[i];
        }
    }
    text.swap(ret);
}
string highlight(string itext)
{
    string keywords[]={
       "asm","auto","bool","break","case","catch",
       "char","class","const","const_cast","continue",
       "default","delete","do","double","dynamic_cast",
       "else","enum","explicit","export","extern",
       "false","float","for","friend","goto","if",
       "inline","int","long","mutable","namespace",
       "new","operator","private","protected","public",
       "register","reinterpret_cast","restrict","return",
       "short","signed","sizeof","static","static_cast",
       "struct","switch","template","this","throw",
       "true","try","typedef","typeid","typename",
       "union","unsigned","using","virtual","void",
       "volatile","while","int8_t","uint8_t","int16_t",
       "uint16_t","int32_t","uint32_t","int64_t","uint64_t",
       "int_least8_t","uint_least8_t","int_least16_t",
       "uint_least16_t","int_least32_t","uint_least32_t",
       "int_least64_t","uint_least64_t","int_fast8_t",
       "uint_fast8_t","int_fast16_t","uint_fast16_t",
       "int_fast32_t","uint_fast32_t","int_fast64_t",
       "uint_fast64_t","intptr_t","uintptr_t","intmax_t",
       "uintmax_t","wint_t","wchar_t","wctrans_t",
       "wctype_t","size_t","time_t","and","and_eq",
       "bitand","bitor","compl","not","not_eq","or",
       "or_eq","xor","xor_eq","complex","imaginary",
       "_Complex","_Imaginary","_Bool","_Pragma","|EOF"
    };
    string dockeywords[]={
        "a", "addindex", "addtogroup", "anchor", "arg", "attention",
        "author", "b", "brief", "bug", "c", "class", "code", "date",
        "def", "defgroup", "deprecated", "dontinclude", "e", "em",
        "endcode", "endhtmlonly", "endif", "endlatexonly", "endlink",
        "endverbatim", "enum", "example", "exception", "f$", "f[", "f]",
        "file", "fn", "hideinitializer", "htmlinclude", "htmlonly", "if",
        "image", "include", "ingroup", "internal", "invariant",
        "interface", "latexonly", "li", "line", "link", "mainpage", "name",
        "namespace", "nosubgrouping", "note", "overload", "p", "page",
        "par", "param", "post", "pre", "ref", "relates", "remarks", "return",
        "retval", "sa", "section", "see", "showinitializer", "since", "skip",
        "skipline", "struct", "subsection", "test", "throw", "todo",
        "typedef", "union", "until", "var", "verbatim", "verbinclude",
        "version", "warning", "weakgroup", "$", "@", "\"", "<", ">", "#",
        "{", "}", "|EOF"
    };
    string text=itext;
    string format="";
    #define CODE 1
    #define COMMENT 2
    #define MLCOMMENT 3
    #define DOC 4
    #define MLDOC 5
    #define PREPROCESSOR 6
    #define STR 7
    #define CHR 8
    #define ENDSTR 9
    #define ENDCHR 10
    #define KEYWORD 11
    #define DOCKEYWORD 12
    #define OPERATOR 13
    #define NUMBER 14
    #define ENDL 15
    printf("Loading format string...\n");
    char state=CODE;
    int kwdlen=0;
    for (int i=0; i<(int)text.length(); i++)
    {
        if (((i*1000)/(text.length()*10))%5==0)
            printf("%i%% formatted...\n",(i*100)/text.length());
        switch (state)
        {
            case KEYWORD:
                if (kwdlen-->=1)
                    break;
                state=CODE;
            case OPERATOR:
            case CODE:
            case ENDL:
            switch (text[i])
            {
                case '#':
                    if (i==0||text[i-1]=='\n')
                        state=PREPROCESSOR;
                    break;
                case '/':
                    if (text[i+1]=='/')
                    {
                        if (text[i+2]=='/'||text[i+2]=='!')
                            state=DOC;
                        else
                            state=COMMENT;
                    }
                    else if (text[i+1]=='*')
                    {
                        if (text[i+2]=='*'||text[i+2]=='!')
                            state=MLDOC;
                        else
                            state=MLCOMMENT;
                    }
                    else
                        state=OPERATOR;
                    break;
                case '1':case '2':case '3':case '4':case '5':
                case '6':case '7':case '8':case '9':case '0':
                    if (i<1)
                    {
                        state=NUMBER;
                        break;
                    }
                    switch (text[i-1])
                    {
                        case ' ':case ',':case '/':
                        case '<':case '>':case '=':case '|':case '%':case '^':case '&':
                        case '*':case '(':case ')':case '+':case '-':case '~':case '.':
                        case '{':case '}':case '[':case ']':case ':':case ';':case '!':
                            state=NUMBER;
                            break;
                    }
                    break;
                case '\"':
                    state=STR;
                    break;
                case '\'':
                    state=CHR;
                    break;
                case ',':
                case '<':case '>':case '=':case '|':case '%':case '^':case '&':
                case '*':case '(':case ')':case '+':case '-':case '~':case '.':
                case '{':case '}':case '[':case ']':case ':':case ';':case '!':
                    state=OPERATOR;
                    break;
                default://check for keyword
                    char val=(i<=0?' ':text[i-1]);
                    switch (val)//check that we are not in the middle of a word
                    {
                        case ' ':case '\n':
                        case ',':
                        case '<':case '>':case '=':case '|':case '%':case '^':case '&':
                        case '*':case '(':case ')':case '+':case '-':case '~':case '.':
                        case '{':case '}':case '[':case ']':case ':':case ';':case '!':
                            for (int ii=0; keywords[ii][0]!='|'; ii++)
                            {
                                bool hit=true;
                                for (int iii=0; keywords[ii][iii]&&text[i+iii]&&hit; iii++)
                                {
                                    if (text[i+iii]!=keywords[ii][iii])
                                    {
                                        hit=false;
                                        break;
                                    }
                                }
                                if (hit)
                                {
                                    state=KEYWORD;
                                    kwdlen=keywords[ii].length();
                                    break;
                                }
                            }
                        if (state!=KEYWORD)
                            state=CODE;
                        break;
                    }
                    break;
            }
            break;
            case MLCOMMENT:
            case MLDOC:
                if (text[i-1]=='/'&&text[i-2]=='*')
                    state=CODE;//cannot deal with /* at start of text!
                break;
            case DOC:
                if (text[i]=='@')
                {
                    for (int ii=0; dockeywords[ii][0]!='|'; ii++)
                    {
                        bool hit=true;
                        for (int iii=0; dockeywords[ii][iii]&&text[i+iii]&&hit; iii++)
                        {
                            if (text[i+iii]!=dockeywords[ii][iii])
                                hit=false;
                        }
                        if (hit)
                        {
                            state=DOCKEYWORD;
                            break;
                        }
                    }
                }
                break;
            case PREPROCESSOR:
                if (text[i]=='/')
                {
                    if (text[i+1]=='/')
                    {
                        if (text[i+2]=='/'||text[i+2]=='!')
                            state=DOC;
                        else
                            state=COMMENT;
                    }
                    else if (text[i+1]=='*')
                    {
                        if (text[i+2]=='*'||text[i+2]=='!')
                            state=MLDOC;
                        else
                            state=MLCOMMENT;
                    }
                }
                break;
            case STR:
                if (text[i]=='\"')
                    state=ENDSTR;
                break;
            case CHR:
                if (text[i]=='\'')
                    state=ENDCHR;
                break;
            case NUMBER:
                switch (text[i])
                {
                    case '<':
                    case ',':case '>':case '=':case '|':case '%':case '^':case '&':
                    case '*':case '(':case ')':case '/':case '+':case '-':case '~':
                    case '{':case '}':case '[':case ']':case ':':case ';':case '!':
                    state=OPERATOR;
                    break;
                    case ' ':
                    state=CODE;
                }
                break;

        }
        if (text[i]=='\n'&&state!=MLCOMMENT&&state!=MLDOC)
        {
            state=ENDL;
            format+=state;
        }
        else if (text[i]=='\n'&&(state==MLCOMMENT||state==MLDOC))
        {
            format+=ENDL;
        }
        else
        {
            format+=state;
        }
        if (state==ENDSTR)
            state=CODE;
        if (state==ENDCHR)
            state=CODE;
    }
    state=0;
    int offset=0;
    for (int i=0; i<(int)format.length(); i++)
    {
        if (format[i]!=state)
        {
            state=format[i];
            char *tmp=new char[2];
            sprintf(tmp,"%c",-state);
            text.insert(offset+i,tmp);
            delete[]tmp;
            offset++;
        }
    }
    printf("Formatting for HTML...\n");
    replaceAll(text,'&',"&amp;");
    replaceAll(text,' ',"&nbsp;");
    replaceAll(text,'>',"&gt;");
    replaceAll(text,'<',"&lt;");
    replaceAll(text,'\"',"&quot;");
    replaceAll(text,-CODE,"</span><span class=\"Code\">");
    replaceAll(text,-COMMENT,"</span><span class=\"Comment\">");
    replaceAll(text,-MLCOMMENT,"</span><span class=\"Comment\">");
    replaceAll(text,-DOC,"</span><span class=\"DocComment\">");
    replaceAll(text,-MLDOC,"</span><span class=\"DocComment\">");
    replaceAll(text,-PREPROCESSOR,"</span><span class=\"Preprocessor\">");
    replaceAll(text,-STR,"</span><span class=\"String\">");
    replaceAll(text,-CHR,"</span><span class=\"Character\">");
    replaceAll(text,-ENDSTR,"</span><span class=\"String\">");
    replaceAll(text,-ENDCHR,"</span><span class=\"Character\">");
    replaceAll(text,-KEYWORD,"</span><span class=\"Keyword\">");
    replaceAll(text,-DOCKEYWORD,"</span><span class=\"DocCommentKeyword\">");
    replaceAll(text,-OPERATOR,"</span><span class=\"Operator\">");
    replaceAll(text,-NUMBER,"</span><span class=\"Constant\">");
    replaceAll(text,-ENDL,"</span>");
    replaceAll(text,'\n',"</div></td></tr>\n<td>\a.</td>\n<td bgcolor=\"#FFFFFF\"><div align=\"left\">");
    replaceAllInc(text,'\a',2);
    string ret="<table width=\"100%%\" border=\"0\" cellspacing=\"0\">\n<tr>\n<td>1.</td>\n<td bgcolor=\"#FFFFFF\"><div align=\"left\">";//the start of the table
    ret+=text;//add the code
    ret+="</span></div></td>\n</tr>\n</table><p>";//the end of the table
    #undef CODE
    #undef COMMENT
    #undef MLCOMMENT
    #undef DOC
    #undef MLDOC
    #undef PREPROCESSOR
    #undef STR
    #undef CHR
    #undef KEYWORD
    #undef DOCKEYWORD
    #undef OPERATOR
    #undef NUMBER
    return ret;
}
int main(int argc, char *argv[])
{
    if (argc!=3)
    {
        printf("SyntaxHighlighter IFNAME OFNAME\n\tIFNAME is the name of the input file.\n\tOFNAME is the name of the output file.\nPRESS ENTER");
        getchar();
        return 1;
    }
    printf("Initializing...\n");
    string text=highlight(readFile(argv[1]));
    printf("Saving to file...\n");
    writeFile(argv[2],text);
    printf("PRESS ENTER");
    getchar();
    return 0;
}

The fact that you are moving i within the loop body it controls is not a good sign. Consider the following section:

if (i<1||!isAlNum(inputText[i-1]))
{
   for (; i<(int)inputText.length()&&isAlNum(inputText[i]); ++i)
      format+=cNUMBER;
   --i;
}

What happens if you try to consume a single digit surrounded by spaces? There may be similar problems at other locations - I stopped looking after this one.

I'd suggest you place some debug statements throughout your code (specifically at each case statement) to try and understand what you've written.

I have gotten my debugger to work and have a nearly perfect program working. The problem now is a single stray character at the end (I believe it has an ASCII value of -cENDL) here is the new code:

#include <iostream>     //for cin/cout
#include <string>       //for std::strings, obviously
#include <fstream>      //for file operations
#include <stdio.h>      //for integer parsing
using namespace std;
string readFile(string fileName)
{
    fstream file(fileName.c_str()); //open the file
    if (!file.is_open())            //check that it opened
        return "";
    string ret="";                  //initialize the return value
    while (!file.eof())             //loop through the whole file
        ret+=file.get();            //grab a character and append it to the return value
    file.close();                   //close the file
    return ret;
}
void writeFile(string fileName, string text)
{
    fstream file(fileName.c_str()); //open the file
    if (!file.is_open())            //check that it opened
        return;
    file<<text;                     //write the text
    file.close();                   //close the file
}
struct highlightFormatter
{
    string start;           //this is written at the start of the information
    string end;             //this is written at the end of the information
    string newLine;         //this is written at the end of each line of the information
    string code;            //this is written at the start of a segment labeled as code
    string comment;         //this is written at the start of a segment labeled as comment
    string documentation;   //this is written at the start of a documentation comment
    string preprocessor;    //this is written at the start of a preprocessor
    string str;             //this is written at the start of a string literal
    string chr;             //this is written at the start of a character literal
    string keyword;         //this is written at the start of a keyword
    string docKeyword;      //this is written at the start of a documentation keyword
    string op;              //this is written at the start of an operator
    string number;          //this is written at the start of a numerical literal
    string keywords;        //this stores all the words identified as keywords, seperated by commas
};
highlightFormatter readFormatFile(string filename)
{
    highlightFormatter ret;         //declare the return value
    fstream file(filename.c_str()); //open the file
    if (!file.is_open())            //check that the file opened
        return ret;
    //I am NOT commenting the following lines, they are self-explanatory
    getline(file,ret.start);
    getline(file,ret.end);
    getline(file,ret.newLine);
    getline(file,ret.code);
    getline(file,ret.comment);
    getline(file,ret.documentation);
    getline(file,ret.preprocessor);
    getline(file,ret.str);
    getline(file,ret.chr);
    getline(file,ret.keyword);
    getline(file,ret.docKeyword);
    getline(file,ret.op);
    getline(file,ret.number);
    getline(file,ret.keywords);
    file.close();
    return ret;
}
string strFromInt(int i)
{
    char ret[20];//I do not believe ints can be longer than 20 digits?
    sprintf(ret,"%d",i);
    return string(ret);
}
void replaceAll(string &str, char find, string replacement)
{
    int location=0;
    do {
        location=str.find_first_of(find,location);
        if (location>=0)
            str.replace(location,1,replacement);
    } while(location!=-1);
}
void formatNums(string &str)
{
    int location=0;//now storing numbers
    for (int i=0; i<(int)str.length(); ++i)
    {
        if (str[i]=='#'&&i+1<(int)str.length()&&str[i+1]>='0'&&str[i+1]<='9')
        {
            location=str[i+1]-'0';
            str.replace(i,2,"");
        }
        else if (str[i]=='#'&&i+1<(int)str.length()&&str[i+1]=='#')
            str.replace(i,2,strFromInt(location++));
        else if (str[i]=='#'&&i+1<(int)str.length()&&str[i+1]=='!')
            str.replace(i,2,"\n");
    }
}
bool isAlNum(char ch)
{
    return (ch>='A'&&ch<='Z')||
           (ch>='a'&&ch<='z')||
           (ch>='0'&&ch<='9');
}
bool strContains(string &str,string search)
{
    return str.find(search)!=(size_t)-1;
}
string highlight(string inputText, highlightFormatter formatter)
{
    //Declaring some constants
    const char cCODE            ='0';
    const char cCOMMENT         ='c';
    const char cMLCOMMENT       ='C';
    const char cDOC             ='d';
    const char cMLDOC           ='D';
    const char cPREPROCESSOR    ='P';
    const char cSTRING          ='s';
    const char cCHAR            ='h';
    const char cKEYWORD         ='k';
    const char cDOCKEYWORD      ='K';
    const char cOPERATOR        ='O';
    const char cNUMBER          ='n';
    const char cENDL            ='\n';
    char state=cCODE;
    string format="";
    cout<<"Loading format string...\n";
    for (int i=0; i<(int)inputText.length(); ++i)//loop through the whole string
    {
        if (((i*100)/(inputText.length()))%5==0)
            cout<<(i*100)/inputText.length()<<"% formatted...\n";
        switch (inputText[i])
        {
        case '*'://weird character... endl comments and modifies them and is an operator.
            if ((state==cMLCOMMENT||state==cMLDOC)&&i+1<(int)inputText.length()&&inputText[i+1]=='/')
                state=cCODE;
            if (state==cCODE)
                format+=cOPERATOR;
            else
                format+=state;
            break;
        case '/'://Possible comment... possible operator
            if (state==cCHAR||state==cSTRING)//this is not to be parsed
            {
                format+=state;
                break;
            }
            if (i+1<(int)inputText.length()&&inputText[i+1]=='/')//check for inline comments
            {
                if (i+2<(int)inputText.length()&&(inputText[i+2]=='/'||inputText[i+2]=='!'))//check for inline documentation
                {
                    state=cDOC;
                    format+=state;
                    break;
                }
                else
                {
                    state=cCOMMENT;
                    format+=state;
                    break;
                }
            }
            else if (i+1<(int)inputText.length()&&inputText[i+1]=='*')//check for multiline comments
            {
                if (i+2<(int)inputText.length()&&(inputText[i+2]=='*'||inputText[i+2]=='!'))//check for multiline documentation
                {
                    state=cMLDOC;
                    format+=state;
                    break;
                }
                else
                {
                    state=cMLCOMMENT;
                    format+=state;
                    break;
                }
            }//else, treat as operator.
        case ','://OPERATORS
        case '<':
        case '>':
        case '=':
        case '|':
        case '%':
        case '^':
        case '&':
        case '(':
        case ')':
        case '+':
        case '-':
        case '~':
        case '.':
        case '{':
        case '}':
        case '[':
        case ']':
        case ':':
        case ';':
        case '!':
            if (state==cCODE)
                format+=cOPERATOR;
            else
                format+=state;
            break;
        case '#'://PREPROCESSOR?
            if (state==cCODE)
                state=cPREPROCESSOR;
            format+=state;
            break;
        case '@'://dockeyword?
            if (state==cMLDOC||state==cDOC)
            {
                for (int ii=i; ii<(int)inputText.length()&&isAlNum(inputText[ii]); ++ii){
                    format+=cDOCKEYWORD;++i;}
            }
            else
            {
                format+=state;
            }
            break;
        case '\"'://string?
            if (state==cCODE)
            {
                format+=cSTRING;
                state=cSTRING;
            }
            else if (state==cSTRING)
            {
                format+=cSTRING;
                if (i==0||inputText[i-1]!='\\')
                    state=cCODE;
            }
            else
            {
                format+=state;
            }
            break;
        case '\''://character?
            if (state==cCODE)
            {
                format+=cCHAR;
                state=cCHAR;
            }
            else if (state==cCHAR)
            {
                format+=cCHAR;
                if (i==0||inputText[i-1]!='\\')
                    state=cCODE;
            }
            else
            {
                format+=state;
            }
            break;
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9'://NUMBERS!!!
            if (state!=cCODE)
            {
                format+=state;
                break;
            }
            if (i<1||!isAlNum(inputText[i-1]))
            {
                for (int ii=i; ii<(int)inputText.length()&&isAlNum(inputText[ii]); ++ii){
                    format+=cNUMBER;++i;}
                --i;
            }
            else
            {
                format+=state;
            }
            break;
        case '\n'://newlines
            format+=cENDL;
            if (state!=cMLCOMMENT&&state!=cMLDOC)//check for multi-line comments
            {

                if (i>1&&inputText[i-1]=='\\'&&state==cPREPROCESSOR)//check for multi-line preprocessor
                    break;
                state=cCODE;
                break;
            }
            break;
        default://check for keyword
            if (state!=cCODE||!isAlNum(inputText[i]))
            {
                format+=state;
                break;
            }
            //get the current word
            string thisWord="";
            for (int ii=i; ii<(int)inputText.length()&&isAlNum(inputText[ii]); ++ii)
                thisWord+=inputText[ii];
            i+=thisWord.length()-1;
            format.append(thisWord.length(),(strContains(formatter.keywords,thisWord+" ")?cKEYWORD:state));
        }
    }
    //now format should contain the format of inputText
    state=0;
    int offset=0;
    for (int i=0; i<(int)format.length(); ++i)
    {
        if (format[i]!=state)
        {
            state=format[i];
            inputText.insert(offset+i,1,-state);//plunk in an identifier
            ++offset;
        }
    }
    string ret=inputText;
    cout<<"Formatting for HTML...\n";
    replaceAll(ret,'&',"&amp;");
    replaceAll(ret,' ',"&nbsp;");
    replaceAll(ret,'>',"&gt;");
    replaceAll(ret,'<',"&lt;");
    replaceAll(ret,'\"',"&quot;");
    replaceAll(ret,-cCODE,formatter.code);
    replaceAll(ret,-cCOMMENT,formatter.comment);
    replaceAll(ret,-cMLCOMMENT,formatter.comment);
    replaceAll(ret,-cDOC,formatter.documentation);
    replaceAll(ret,-cMLDOC,formatter.documentation);
    replaceAll(ret,-cPREPROCESSOR,formatter.preprocessor);
    replaceAll(ret,-cSTRING,formatter.str);
    replaceAll(ret,-cCHAR,formatter.chr);
    replaceAll(ret,-cKEYWORD,formatter.keyword);
    replaceAll(ret,-cDOCKEYWORD,formatter.docKeyword);
    replaceAll(ret,-cNUMBER,formatter.number);
    replaceAll(ret,-cOPERATOR,formatter.op);
    replaceAll(ret,-cENDL,formatter.newLine);
    string retVal=formatter.start+ret+formatter.end;
    formatNums(retVal);
    return retVal;
}
int main(int argc, char *argv[])
{
    if (argc!=4)
    {
        //output help message
        cout<<"SyntaxHighlighter [input] [output] [specifiers]\n"<<
            "\tParses C++ code for HTML.\n"<<
            "\t\t[input] is the input filename.\n"<<
            "\t\t[output] is the output filename.\n"<<
            "\t\t[specifiers] is the specifiers filename.\n";
        return 1;
    }
    cout<<"Loading input file...\n";
    string text=readFile(argv[1]);
    cout<<"Loading format file...\n";
    highlightFormatter hf=readFormatFile(argv[3]);
    string tmp=highlight(text,hf);
    writeFile(argv[2],tmp);
    return 0;
}

here is formatters.txt:

start(
)end
)#!(
)Code(
)Comment(
)DocComment(
)Preprocessor(
)String(
)Character(
)Keyword(
)DocCommentKeyword(
)Operator(
)Constant(
asm auto bool break case catch char class const const_cast continue default delete do double dynamic_cast else enum explicit export extern false float for friend goto if inline int long mutable namespace new operator private protected public register reinterpret_cast restrict return short signed sizeof static static_cast struct switch template this throw true try typedef typeid typename union unsigned using virtual void volatile while int8_t uint8_t int16_t uint16_t int32_t uint32_t int64_t uint64_t int_least8_t uint_least8_t int_least16_t uint_least16_t int_least32_t uint_least32_t int_least64_t uint_least64_t int_fast8_t uint_fast8_t int_fast16_t uint_fast16_t int_fast32_t uint_fast32_t int_fast64_t uint_fast64_t intptr_t uintptr_t intmax_t uintmax_t wint_t wchar_t wctrans_t wctype_t size_t time_t and and_eq bitand bitor compl not not_eq or or_eq xor xor_eq complex imaginary _Complex _Imaginary _Bool _Pragma string wstring NULL

and here is input.cpp (everyone favourite program):

#include <iostream>
using namespace std;
int main()
{
    cout<<"Hello world!"<<endl;
    return 0;
}

Here is output.txt:

start()Preprocessor(#include&nbsp;&lt;iostream&gt;)
(
)Keyword(using)Code(&nbsp;)Keyword(namespace)Code(&nbsp;std)Operator(;)
(
)Keyword(int)Code(&nbsp;main)Operator(())
(
)Operator({)
(
)Code(&nbsp;&nbsp;&nbsp;&nbsp;cout)Operator(&lt;&lt;)String(&quot;Hello&nbsp;world!&quot;)Operator(&lt;&lt;)Code(endl)Operator(;)
(
)Code(&nbsp;&nbsp;&nbsp;&nbsp;)Keyword(return)Code(&nbsp;)Constant(0)Operator(;)
(
)Operator(})
(
)Code(ÿ)end

You can see the problem in that last line, but how do you fix it?

I wonder what the EOF character looks like if you print it out....

Thank you! I hate it when problems hide away in other functions!

Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.