//  FILE: CPPtoHTML.cpp
//    
//  DESCRIPTION: 
//  Create an HTML page from C or C++ code.
//
//  Note that this program was written using Rogue Wave's Tools.h++ 7.0.3
//  with Standard Template Library support.  Converting this code to use
//  the STL and String classes should not be difficult.
//
//  REV: 1.0
//  CREATED: 09/29/96   11:19:46
//  AUTHOR:  Mike Benzinger

// Disable warning about signed/unsigned mismatch.
#pragma warning( disable : 4018 )

// Disable warning about unary minus operator applied to unsigned type.
#pragma warning( disable : 4146 )

// Include the necessary header files.
#include <iostream>
#include <fstream>

#include <ctype.h>

#include <rw/cstring.h>
#include <rw/re.h>
#include <rw/tvordvec.h>

typedef RWTValOrderedVector<RWCString> StringVector;


//  createKeywordList
// 
//  DESCRIPTION:
//    Append every word that the converter highlights as a keyword to the
//    given list.  Besides C and C++ reserved words the table also holds
//    preprocessor directive names (define, ifdef, include, pragma, ...) and
//    a handful of other words such as NULL, uchar, uint, ulong and ushort.
// 
//  PARAMETERS:
//    keywordList - List that receives the words, appended in table order
// 
//  RETURNS:
//    Nothing

void
createKeywordList(StringVector& keywordList)
{
  // Table of highlighted words.  The order here is the order in which the
  // words are appended to the list.
  static const char* const keywords[] =
  {
    "asm",              "auto",             "bool",
    "break",            "case",             "catch",
    "char",             "class",            "const",
    "const_cast",       "continue",         "default",
    "define",           "delete",           "disable",
    "do",               "double",           "dynamic_cast",
    "else",             "enum",             "error",
    "explicit",         "extern",           "false",
    "float",            "for",              "friend",
    "goto",             "if",               "ifdef",
    "ifndef",           "include",          "inline",
    "int",              "long",             "mutable",
    "namespace",        "new",              "NULL",
    "operator",         "pragma",           "private",
    "protected",        "public",           "register",
    "reinterpret_cast", "return",           "short",
    "signed",           "sizeof",           "static",
    "static_cast",      "struct",           "switch",
    "template",         "this",             "throw",
    "true",             "try",              "typedef",
    "typeid",           "uchar",            "uint",
    "ulong",            "union",            "unsigned",
    "ushort",           "using",            "virtual",
    "void",             "volatile",         "warning",
    "wchar_t",          "while"
  };

  const int keywordCount = sizeof(keywords) / sizeof(keywords[0]);

  // Each entry is converted to an RWCString as it is inserted.
  for ( int k = 0; k < keywordCount; k++ )
    keywordList.insert(keywords[k]);
}

// HTML color values.  Every constant already contains its surrounding double
// quotes, so it can be placed directly after "COLOR=" inside a FONT tag.

// Full-intensity colors.
const RWCString black      ("\"#000000\"");
const RWCString blue       ("\"#0000FF\"");
const RWCString cyan       ("\"#00FFFF\"");
const RWCString green      ("\"#00FF00\"");
const RWCString magenta    ("\"#FF00FF\"");
const RWCString red        ("\"#FF0000\"");
const RWCString yellow     ("\"#FFFF00\"");
const RWCString white      ("\"#FFFFFF\"");

// Half-intensity ("dark") variants.
const RWCString darkBlue   ("\"#000080\"");
const RWCString darkCyan   ("\"#008080\"");
const RWCString darkGreen  ("\"#008000\"");
const RWCString darkMagenta("\"#800080\"");
const RWCString darkRed    ("\"#800000\"");
const RWCString darkYellow ("\"#808000\"");

// Grays.
const RWCString darkGray   ("\"#808080\"");
const RWCString lightGray  ("\"#C0C0C0\"");

//  startColor
// 
//  DESCRIPTION:
//    Build the opening HTML FONT tag that switches the text color.
// 
//  PARAMETERS:
//    color - Color value placed verbatim after "COLOR="; the color constants
//            in this file already include the surrounding double quotes
// 
//  RETURNS:
//    The tag text, i.e. "<FONT COLOR=" followed by color and ">"

RWCString
startColor(RWCString color)
{
  // Assemble the tag piece by piece.
  RWCString openTag("<FONT COLOR=");

  openTag += color;
  openTag += ">";

  return( openTag );
}


//  endColor
// 
//  DESCRIPTION:
//    Build the closing HTML FONT tag, which ends the color started by the
//    most recent startColor() tag.
// 
//  PARAMETERS:
//    None
// 
//  RETURNS:
//    The string "</FONT>"

RWCString
endColor()
{
  const RWCString closeTag("</FONT>");

  return( closeTag );
}


//  processToken
// 
//  DESCRIPTION:
//    Convert a completed token to its HTML form.  A token that appears in
//    the keyword list is wrapped in bold tags and colored blue; any other
//    token is passed through unchanged.  The token state is cleared so that
//    the caller can start collecting the next token.
// 
//  PARAMETERS:
//    token       - Token to be analyzed; set to the empty string on return
//    tokenFound  - Token-in-progress flag; set to false on return
//    keywordList - List of keywords
// 
//  RETURNS:
//    The HTML text for the token

RWCString
processToken(RWCString& token, bool& tokenFound, StringVector& keywordList)
{
  RWCString markup;

  if ( keywordList.contains(token) )
  {
    // Keyword: bold, blue.
    markup  = "<B>";
    markup += startColor(blue);
    markup += token;
    markup += endColor();
    markup += "</B>";
  }

  else
  {
    // Ordinary identifier: no decoration.
    markup = token;
  }

  // The token has been consumed; clear the caller's token state.
  tokenFound = false;
  token      = "";

  return( markup );
}


//  colorInclude
// 
//  DESCRIPTION:
//    Post-process the HTML generated for a line containing an #include
//    directive.  If the included header is in the list of known headers,
//    its name is turned into a hypertext link to "<name with the first '.'
//    replaced by '_'>.htm".  Otherwise, for the <header.h> form, the header
//    name is colored the same as a string.
// 
//  PARAMETERS:
//    htmlLine        - The HTML output line to be modified in place.
//    headerNamesList - Names of the headers that receive hypertext links.
//                      The lookup uses the lower-cased header name, so the
//                      entries are expected to be lower case.
// 
//  RETURNS:
//    Nothing

void
colorInclude(RWCString& htmlLine, StringVector& headerNamesList)
{
  // Extract the header name from the HTML line with the format <headerFile.hpp>.
  // Note that since the "<" symbol is a special character for HTML, they are all
  // converted to "&lt".
  //
  // NOTE(review): for <dir/file.h> the '/' is emitted inside its own FONT tag,
  // so the extraction below yields only the text before the first slash --
  // confirm whether such headers are expected to be linkable.
  RWCString headerName;

  // Offset of the include delimiter.  The header name always follows it, so
  // the later search for the name starts here rather than in the markup that
  // precedes the delimiter.
  size_t searchFrom = 0;

  if ( htmlLine.contains("&lt") )
  {
    searchFrom = htmlLine.index("&lt");

    // Strip everything up to and including the colored "<", then everything
    // from the coloring of the ">" onwards.
    headerName = htmlLine;

    headerName.replace(RWCRExpr(".*&lt;</FONT>"), "");
    headerName.replace(RWCRExpr("<FONT.*&gt;.*"), "");
  }

  // Otherwise check for a quoted include file.  Note, again, that since the
  // '"' symbol is a special character for HTML, they are all converted to
  // "&quot".

  else if ( htmlLine.contains("&quot") )
  {
    searchFrom = htmlLine.index("&quot");

    // Strip everything up to and including the opening quote, then
    // everything from the closing quote onwards.
    headerName = htmlLine;

    headerName.replace(RWCRExpr(".*>&quot;"), "");
    headerName.replace(RWCRExpr("&quot;<.*"), "");
  }

  // Remember the name exactly as it is spelled in the line.  The lower-case
  // copy is used only for the list lookup and the link target; the original
  // spelling is needed to locate the name in the line again.
  const RWCString displayName = headerName;

  headerName.toLower();

  // If a name was extracted and it is in the list, create a hypertext link.
  // The length check keeps a failed extraction from matching an empty list
  // entry.
  if ( headerName.length() > 0 && headerNamesList.contains(headerName) )
  {
    // Create the hypertext reference name.
    RWCString hyperTextHRef = headerName;

    hyperTextHRef.replace(RWCRExpr("\\."), "_");
    hyperTextHRef += ".htm";

    // Create the hypertext link.
    RWCString hyperTextLink;

    hyperTextLink = "<A HREF=\"" + hyperTextHRef + "\">" + displayName + "</A>";

    // Replace the name by position.  The name is matched as literal text,
    // not as a regular expression, so characters such as '.' or '+' in a
    // header name cannot change what gets replaced.
    const size_t namePos = htmlLine.index(displayName, searchFrom);

    if ( namePos != RW_NPOS )
      htmlLine.replace(namePos, displayName.length(), hyperTextLink);
  }

  // Otherwise, if this is a include file in the form <headerFile.hpp> then
  // change the color of the header name to red.
  else if ( htmlLine.contains("&lt") )
  {
    // Convert the dark red color used for symbols and operators to red.
    htmlLine.replace(RWCRExpr(darkRed), red, RWCString::all);

    // Since by default, the less than symbol is colored and the coloring is
    // turned off immediately afterwards, remove the </FONT> which halts the
    // red coloring so that it will continue coloring the name of the file.    
    htmlLine.replace(RWCRExpr("lt;</FONT>"), "lt;");

    // If there are slashes in the header name, then remove all coloring
    // HTML flags.  An example is #include <rw/cstring.h>.
    RWCRExpr removeSlashColor("<FONT COLOR=" + red + ">/</FONT>");
  
    htmlLine.replace(removeSlashColor, "/", RWCString::all);

    // Remove the font coloring from before the greater than symbol.
    htmlLine.replace(RWCRExpr("<FONT COLOR=" + red + ">&gt"), "&gt");
  }
}


//  main
// 
//  DESCRIPTION:
//    Main routine for converting C or C++ code to HTML.
//
//  PARAMETERS:
//    argc - Number of input arguments.
// 
//  RETURNS:
//    0

int
main(int argc, char *argv[])
{
  // Check to be sure that the input file name has been passed in.
  if ( argc < 2 )
  {
    cout << "Usage: " << argv[0] << " sourceFile [headerNamesFile] [background]" << endl;
    
    exit( 0 );
  }
  
  // Open the input file.
  ifstream cfile(argv[1]);

  if ( ! cfile )
  {
    cout << "Unable to open input file '" << argv[1] << "'" << endl;

    exit( 1 );
  }

  // If a header names file is present, load the list of header files for
  // which a hyper text link will be created.  Note that header names should
  // be specified one per line.
  StringVector headerNamesList;

  if ( argc >= 3 )
  {
    // Open the header names list file.
    ifstream hdrFile(argv[2]);

    if ( ! hdrFile )
    {
      cout << "Unable to open header names file '" << argv[2] << "'" << endl;

      exit( 1 );
    }

    // Input all header names, convert them to lower case and add them to
    // the list.
    RWCString headerFileName;

    while ( ! hdrFile.eof() )
    {
      hdrFile >> headerFileName;

      headerFileName.toLower();

      headerNamesList.insert(headerFileName);
    }
  }

  // If the background name exists, then read it for output in the body statement.
  RWCString background;

  if ( argc >= 4 )
  {
    background = argv[3];
  }
  
  // Create an order vector of RWCStrings and add all C and C++ keywords to it.
  StringVector keywordList;

  createKeywordList(keywordList);

  // Output the header HTML.
  cout << "<HTML>" << endl;

  // Strip the drive and directory off the file name.  In other words, the
  // following regular expression and replace function will transform the file
  // name "C:\ADIR\BDIR\CDIR\CODEFILE.CPP" to "CODEFILE.CPP".  Note that a
  // back slash is used as an escape character in both C++ strings and regular
  // expressions.  Therefore, in order to have a back slash interpreted as a
  // regular character, two back slashes are needed for the C++ string as well
  // as the regular expression.
  RWCString fileName = argv[1];
  
  fileName.replace(RWCRExpr(".:"), "");
  fileName.replace(RWCRExpr("\\\\(.+\\\\)*"), "");

  // Create the title for the HTML document.
  cout << "<HEAD>" << endl;
  cout << "<TITLE>" << fileName << "</TITLE>" << endl;
  cout << "</HEAD>" << endl;

  // Output the body statement and the background if any.
  cout << "<BODY ";

  if ( background.length() > 0 )
  {
    cout << "BACKGROUND=\"" << background << "\">" << endl;
  }

  else
  {
    cout << ">" << endl;
  }

  // Output a heading for the file, java script to print out the date when the
  // file was last modified and a horizontal rule.
  cout << "<CENTER><H1>" << fileName << "</H1></CENTER>" << endl;
  cout << "<SCRIPT LANGUAGE=\"JavaScript\">" << endl;
  cout << "<!--  Hide script contents from old browsers" << endl;
  cout << "document.write(\"Last Modified: \" + document.lastModified)" << endl;
  cout << "// End hiding of contents from old browsers  -->" << endl;
  cout << "</SCRIPT>" << endl;
  cout << "<HR SIZE=5>" << endl;

  // Indicate that the output text will be preformatted fixed-width text.
  cout << "<PRE>" << endl;

  // Define boolean flags for parsing the input string.
  bool inString     = false;
  bool inComment    = false;
  bool inEOLComment = false;
  bool inNumber     = false;
  bool escapeChar   = false;
  bool startString  = false;
  bool tokenFound   = false;

  // Define variables for the input line and tokens.  
  RWCString inputLine;
  RWCString token;

  // Process all lines in the file until end-of-file.
  while ( ! cfile.eof() )
  {
    // Read a line out of the input file.
    inputLine.readLine(cfile, FALSE);

    // Define the HTML output line and a flag indicating whether this line has
    // a preprocessor directive in it.
    RWCString htmlLine;

    bool preprocessor = false;

    // Loop over every character in the input line.
    for ( int i = 0; i < inputLine.length(); i++ )
    {
      // Extract the desired character.
      char chr = inputLine(i);

      // If a number is being processed and it is not part of a token, then
      // determine if the number has been completed.      
      if ( inNumber && ! tokenFound)
      {
        // If the current character is not a hexadecimal digit and it is not
        // an 'x' or 'X', then the number has been completed and its color
        // formatting will be turned off.
        if ( ! isxdigit(chr) && chr != 'x' && chr != 'X' )
        {
          htmlLine += endColor();

          inNumber = false;
        }
      }

      // Process the current character.
      switch ( chr )
      {
        // Catch all alphabetic characters and the underscore.  The underscore
        // is a legal character in variable names and keywords.
        case 'A': case 'a':
        case 'B': case 'b':
        case 'C': case 'c':
        case 'D': case 'd':
        case 'E': case 'e':
        case 'F': case 'f':
        case 'G': case 'g':
        case 'H': case 'h':
        case 'I': case 'i':
        case 'J': case 'j':
        case 'K': case 'k':
        case 'L': case 'l':
        case 'M': case 'm':
        case 'N': case 'n':
        case 'O': case 'o':
        case 'P': case 'p':
        case 'Q': case 'q':
        case 'R': case 'r':
        case 'S': case 's':
        case 'T': case 't':
        case 'U': case 'u':
        case 'V': case 'v':
        case 'W': case 'w':
        case 'X': case 'x':
        case 'Y': case 'y':
        case 'Z': case 'z':
        case '_':
          // If this character is not in a comment or a string, then add it to
          // the token string and indicate that a token has been found.
          if ( ! inComment && ! inString )
          {
            token += chr;
          
            tokenFound = true;
          }

          // Otherwise, simply add the character to the HTML output line.
          else
            htmlLine += chr;

          break;

        // Catch all numeric characters.           
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
          // If this character is not in a number, comment, string or token,
          // then add font coloring for a number to the HTML output line and
          // indicate that the start of a number has been found.
          if ( ! inNumber && ! inComment && ! inString && ! tokenFound)
          {
            htmlLine += startColor(darkMagenta);

            inNumber = true;
          }

          // If this is part of a token, then add it it to the token.  Otherwise
          // simply add it to the HTML output line.
          if ( tokenFound )          
            token += chr;
          else  
            htmlLine += chr;
          
          break;

        // Catch most of the C and C++ operators.
        case '&':
        case '<':
        case '>':
        case '(':
        case ')':
        case '{':
        case '}':
        case '[':
        case ']':
        case ';':
        case ':':
        case '!':
        case '%':
        case '^':
        case '*':
        case '-':
        case '+':
        case '=':
        case '|':
        case ',':
          // If a token was being built, process it and add it to the HTML
          // output line.
          if ( tokenFound )
            htmlLine += processToken(token, tokenFound, keywordList);

          // If the operator is not in a comment or a string, then turn the
          // coloring to dark red.          
          if ( ! inComment && ! inString )
             htmlLine += startColor(darkRed);

          // The '&', '<' and '>' are special symbols in HTML and must be
          // handled differently.  For all other operators, add them directly
          // to the HTML output line.          
          switch ( chr )
          {
            case '&':
              htmlLine += "&amp;";

              break;

            case '<':
              htmlLine += "&lt;";

              break;

            case '>':
              htmlLine += "&gt;";

              break;

            default:
              htmlLine += chr;
          }

          // If the operator is not in a comment or a string, then turn off
          // the coloring.          
          if ( ! inComment && ! inString )
            htmlLine += endColor();
         
          break;

        // Process a '/' which can be a division operator or the start or end
        // of a comment string.
        case '/':
          // If a token was being built, process it and add it to the HTML
          // output line.
          if ( tokenFound )
            htmlLine += processToken(token, tokenFound, keywordList);

          // Ensure that the character is not in a comment or a string.
          if ( ! inComment && ! inString )
          {
            // If the current character is not the last character in the line.
            if ( i < inputLine.length()-1 )
            {
              // If the next character is also a slash, then indicate that
              // this is a C++ style end-of-line comment.
              if ( inputLine(i+1) == '/' )
              {
                inComment    = true;
                inEOLComment = true;
              }

              // Otherwise, if the next character is an asterisk, then indicate
              // that this is a C style comment.
              else if ( inputLine(i+1) == '*' )
                inComment = true;
            }

            // If this is a comment, then set the color to dark green for
            // comments and italicize the text.
            if ( inComment )
              htmlLine += startColor(darkGreen) + "<I>";

            // Otherwise, color the division operator dark red.
            else
              htmlLine += startColor(darkRed);
          }

          // Add the character to the HTML output line.          
          htmlLine += chr;

          // If this is not in a string, then determine if this is the end of
          // a comment string.
          if ( ! inString )
          {
            // If the character is part of a comment.
            if ( inComment )
            {
              // If it is not in a C++ style end-of-line comment.
              if ( ! inEOLComment )
              {
                // If this is not the start of the line and the previous
                // character was an asterisk, then turn off the italicized
                // text, end the comment coloring and turn off the comment flag.
                if ( i > 0 && inputLine(i-1) == '*' )
                {
                  htmlLine += "</I>" + endColor();

                  inComment    = false;
                }
              }
            }

            // Otherwise this is a division operator, end operator coloring.
            else
              htmlLine += endColor();
          }

          break;

        // Process a quote or tick mark.
        case '\"':
        case '\'':
          // If a token was being built, process it and add it to the HTML
          // output line.
          if ( tokenFound )
            htmlLine += processToken(token, tokenFound, keywordList);

          // Determine if the character is escaped by checking to see if the
          // previous character was a backslash.
          if ( i > 0 && inputLine(i-1) == '\\' )
            escapeChar = true;
          else
            escapeChar = false;

          // If the previous character was an escape character, ensure that
          // the character prior to that was not a back slash also.  A double
          // backslash indicates a backslash inside a string.  If this is the
          // case, then the quote or tick are not escaped.
          if ( escapeChar && i > 1 && inputLine(i-2) == '\\' )
            escapeChar = false;

          // If the character is not in a comment or a string and it is not
          // escaped, then color the text read and indicate that a string has
          // been started.  Note that startString is used instead of inString.
          // This is done to avoid turning the string coloring off once the
          // character has been added to the HTML output string.
          if ( ! inComment && ! inString && ! escapeChar )
          {
            htmlLine += startColor(red);

            startString = true;
          }

          // Since the quote is a special HTML character and must be handled
          // differently.  Simply add the tick to the output line.
          if ( chr == '\"' )
            htmlLine += "&quot;";
          else
            htmlLine += "\'";

          // If this is not a comment and the character is part of a string
          // and it is not escaped, then turn off the quoted coloring.
          if ( ! inComment && inString && ! escapeChar )
          {
            htmlLine += endColor();

            inString = false;
          }

          // If a string has been started, then turn on the in string indicator.
          if ( startString )
          {
            startString = false;
            inString    = true;
          }
         
          break;

        case '#':
          // If a token was being built, process it and add it to the HTML
          // output line.
          if ( tokenFound )
            htmlLine += processToken(token, tokenFound, keywordList);

          // If the character is not in a comment or a string, then add the
          // pound sign with a blue color and bold it.
          if ( ! inComment && ! inString )
          {
            htmlLine += startColor(blue) + "<B>#</B>" + endColor();

            preprocessor = true;
          }

          // Otherwise, add the character to the HTML output line.
          else
            htmlLine += "#";

          break;

        // Catch all other characters.         
        default:
          // If a token was being built, process it and add it to the HTML
          // output line.
          if ( tokenFound )
            htmlLine += processToken(token, tokenFound, keywordList);

          // Add the character unedited to the HTML output line.
          htmlLine += chr;
      }
    }

    // If a number is being processed and it is not part of a token, then
    // terminate coloring.      
    if ( inNumber && ! tokenFound)
    {
      htmlLine += endColor();

      inNumber = false;
    }

    // If there are any unprocessed tokens, process them now.
    if ( tokenFound )
      htmlLine += processToken(token, tokenFound, keywordList);

    // If a C++ style end of line comment was started, then turn off the coloring
    // and italicizing in the HTML output line and indicate that no comments
    // are in effect.
    if ( inEOLComment )
    {
      htmlLine += "</I>" + endColor();

      inComment    = false;
      inEOLComment = false;
    }

    // If this line has an include preprocessor directive, then determine if
    // the system style includes should be colored as strings.
    if ( preprocessor && htmlLine.contains("include") )
      colorInclude(htmlLine, headerNamesList);

    // Output the HTML output line to standard out.
    cout << htmlLine << endl;
  }

  // End the HTML page by turning of preformatting and ending the page.
  cout << "</PRE>"  << endl;
  cout << "</BODY>" << endl;
  cout << "</HTML>" << endl;

  // Return to the system.
  return( 0 );
}
