/* ----------------------------------------------------------- *\
**  comp5.cpp -- Compress an ASCII text file.                  **
** ----------------------------------------------------------- **
** - A fast, efficient text compressor that can achieve up to  **
** about 33 percent reduction on most text files.              **
** - Lowercase characters are compressed 3 per word. Since     **
** there are 32 values in 5 bits, the program can pack the     **
** basic 26-character alphabet plus 6 special characters:      **
** <space>, comma, period, semicolon, hyphen, and quote.       **
** - If a value of 00 is detected, the next character is taken **
** as a literal representation if the next character has a     **
** value in excess of 127 (0x80); otherwise, the value         **
** represents a repeated character count of the next char.     **
** ----------------------------------------------------------- **
**   Copyright (c) 1993 by Chuck Guzis. All rights reserved.   **
\* ----------------------------------------------------------- */

#include  <stdio.h>
#include  <stdlib.h>
#include  <string.h>
#include  <ctype.h>

#define LINE_MAX 128          // Length of the longest line
#define DUPE_THRESHOLD 4      // Runs must be longer than this to be coded
typedef unsigned char UCHAR;  // Define a UCHAR
typedef unsigned int UINT;    // Define a UINT
long Comp_Count;              // How many bytes of compressed text?
long Orig_Count;              // How many bytes of original text?

//  Line buffers.  A byte above 127 is written as two bytes (a zero
//  escape plus the byte itself), so a compressed line can be up to
//  twice as long as its source; lbo[] is sized for that worst case.
UCHAR
  lbi[ LINE_MAX+1],           // Input line buffer
  lbo[ 2*LINE_MAX+1];         // Output line buffer
FILE *in;                     // Input file
FILE *out;                    // Output file

//  Our 5-bit alphabet: 32 symbols, one per 5-bit code.
//  NOTE(review): the last slot repeats the space already present
//  after 'z', while the file header lists a quote as the sixth
//  special character.  Confirm against the decompressor before
//  changing it, since the table defines the file format.
unsigned char Alphabet[] = "abcdefghijklmnopqrstuvwxyz ,.;- ";

//  An array for testing for the above.  main() fills it with the
//  1-based position of each alphabet symbol; 0 means "not in the
//  alphabet".
UCHAR AlphaMap[256];          // One byte per ASCII value

//  Function prototypes.
void main(int, char **);      // Main function
void Error(char *, char *);   // Error routine
void Compress(void);          // Compressor
int DupeCheck(UCHAR *);       // Scan for duplicate characters

void main( int argc, char *argv[])
{
  UCHAR
    *ch,                      // Pointer and short int used to
    i;                        //  build AlphaMap.
 
  in = NULL;
  out = NULL;
  Comp_Count = 0;
  Orig_Count = 0;

//  Open the files, give an error if problems arise.
  if ( argc != 3)
    Error( "Command form is - COMP5 <in-file> <out-file>\n", NULL);
  if ( !(in = fopen( argv[1], "r")) )
    Error( "Can\'t open %s.\n", argv[1]);
  if ( !(out = fopen( argv[2], "wb")) )
    Error( "Can\'t create %s.\n", argv[2]);

//  Construct the AlphaMap array.  This is done purely for speed, as
//  the memchr() function could be used to search the string.
  memset( AlphaMap, 0, sizeof( AlphaMap)); // Clear it to 0
  for ( ch = Alphabet, i = 1; *ch; ch++, i++)
    AlphaMap[ *ch] = (UCHAR) i;

//  Read the input file, compress and encode it.
  while( fgets( (char *)lbi, sizeof( lbi)-1, in))
  { // Read until eof
    Orig_Count += (strlen( (char *)lbi));   // Total original UCHARs
    Compress();
  } // Digest the file

//  Show some summary data.
  printf( "\n\n"
    "\tOriginal text size:\t\t%ld\n"
    "\tCompressed text length:\t\t%ld\n"
    "\tSavings:\t\t\t%ld\n",
    Orig_Count, Comp_Count, Orig_Count-Comp_Count);
  fclose(in);
  fclose( out);
  exit(0);
}  // End of main

//  Compress - Compress a line and do repeated byte encoding.
//  ---------------------------------------------------------
//  This is a two-pass operation. Internally, we store a flag in
//  lbi of 0x8000 + count. The next byte in lbi[] may contain
//  the repeated byte.

void Compress( void)
{
  register UINT k;
  UCHAR
    *ch1,               // Source pointer
    *ch2;               // Destination pointer

  for ( ch1 = lbi, ch2 = lbo; *ch1;)
  { // Scan the line
    if ( (k = DupeCheck( ch1)) > DUPE_THRESHOLD )
    { // Compression is okay
      *ch2++ = 0;
      *ch2++ = (UCHAR) k;
      *ch2++ = *ch1;            // Store the repeated string
      ch1 += k;
    }
    else
    { // See about characters > 127 -- quote them
      if ( *ch1 > 127)
      { // If needs quoting
        *ch2++ = 0;
        *ch2++ = *ch1++;
      }
      else
      {

//  See if there are three consecutive symbols that reside in
//  our 5-bit alphabet. Note that an end-of-line will fail the test
//  automatically, so it doesn't require checking.

    if ( AlphaMap[ *ch1] &&
         AlphaMap[ *(ch1+1)] &&
         AlphaMap[ *(ch1+2)])
    { // Bingo--got all three
      k = 0x8000 | ((AlphaMap[ *ch1]-1) << 10) |
                   ((AlphaMap[ *(ch1+1)]-1) << 5) |
                    (AlphaMap[ *(ch1+2)] -1);
      *ch2++ = (UCHAR) (k >> 8);   // Store first byte
      *ch2++ = (UCHAR) (k & 255);  // Store second byte
      ch1 += 3;  // Advance
    }
    else
     *ch2++ = *ch1++;  // Everything else
      } // If character below 128
    } // If not compressible string
  } // Scan the line
  Comp_Count += (ch2 - lbo);  // Update compressed count
  fwrite((char *)lbo, (size_t)(ch2 - lbo), sizeof( UCHAR), 
    out);  // Dump the line
  return;
} // Compress

//  DupeCheck - Return the number of duplicated characters.
//  -------------------------------------------------------
//  Counts how many times the byte at what[0] is repeated
//  back-to-back, the first occurrence included.  The scan stops
//  at the first different byte or at the terminating zero, so the
//  result is always at least 1.  The caller must pass a non-empty
//  string; the byte after what[0] is always examined.

int DupeCheck( UCHAR *what)
{
  UCHAR cref = *what;       // Reference character
  int k = 1;                // Length of the run so far

  while ( what[k] && what[k] == cref)
    k++;                    // One more copy of the reference
  return k;
} // DupeCheck

//  Error - Give an error and a message, then exit.
//  -----------------------------------------------
//  Closes whichever of the global files is open, prints the
//  message on stderr and exits with status 1; it never returns.
//  When str is given, msg is used as a printf format that takes
//  that one string.  When str is NULL, msg is printed exactly as
//  written, so a stray '%' in it cannot be taken for a conversion.

void Error( char *msg, char *str)
{
  if ( in)
    fclose(in);      // Close files if still open
  if ( out)
    fclose( out);
  if ( str)
    fprintf( stderr, msg, str);
  else
    fputs( msg, stderr);   // No argument, so no formatting
  exit(1);
} // Error
