|
CompressArithmetic |
|
import java.io.*;
/**
* CompressArithmetic.java<P>
*
* Mark Nelson<BR>
* March 8, 1996<BR>
* http://web2.airmail.net/markn<BR>
* **** moded by David Scott in May of 2001 to make bijective<BR>
* Ported to Java by Tim Tyler, July 2001 - http://mandala.co.uk/biac/<P>
*
* DESCRIPTION<BR>
* -----------<P>
*
* This program performs an order-0 adaptive arithmetic encoding<BR>
* function on an input file/stream, and sends the result to an<BR>
* output file or stream.<P>
*
* This program contains the source code from the 1987 CACM
* article by Witten, Neal, and Cleary. I have taken the
* source modules and combined them into this single file for
* ease of distribution and compilation. Other than that,
* the code is essentially unchanged.<P>
*
* This program takes two arguments: an input file and an output
* file. You can leave off one argument and send your output to
* stdout. Leave off two arguments and read your input from stdin
* as well.<P>
*
* This program accompanies my article "Data Compression with the
* Burrows-Wheeler Transform."<P>
*
* Build Instructions<BR>
* ------------------<P>
*
* Define the constant unix for UNIX or UNIX-like systems. The
* use of this constant turns off the code used to force the MS-DOS
* file system into binary mode. g++ already does this, and your
* UNIX C++ compiler might also.<P>
*
* Borland C++ 4.5 16 bit : bcc -w ari.cpp<BR>
* Borland C++ 4.5 32 bit : bcc32 -w ari.cpp<BR>
* Microsoft Visual C++ 1.52 : cl /W4 ari.cpp<BR>
* Microsoft Visual C++ 2.1 : cl /W4 ari.cpp<BR>
* g++ : g++ -o ari ari.cpp<P>
*
* Typical Use<BR>
* -----------<P>
*
* rle < raw-file | bwt | mtf | rle | ari > compressed-file<P>
*
*/
public class CompressArithmetic {
final static int No_of_chars = 256; /* Number of character symbols */
static FileOutputStream out = null;
// The number of symbols is 256 not 257 EOF is not needed
/* TRANSLATION TABLES BETWEEN CHARACTERS AND SYMBOL INDEXES. */
final static int No_of_symbols = No_of_chars; /* Total number of symbols */
static int[] char_to_index = new int[No_of_chars]; /* To index from character */
static char[] index_to_char = new char[No_of_symbols+1]; /* To character from index */
/* ADAPTIVE SOURCE MODEL */
static long[] freq = new long[No_of_symbols+1]; /* Symbol frequencies */
static long[] cum_freq = new long[No_of_symbols+1]; /* Cumulative symbol frequencies */
// only 1 to 256 used for the symbols
/* DECLARATIONS USED FOR ARITHMETIC ENCODING AND DECODING */
/* SIZE OF ARITHMETIC CODE VALUES. */
final static int Code_value_bits = 62; /* Number of bits in a code value */
/* BIT OUTPUT ROUTINES. */
/* THE BIT BUFFER. */
static int buffer; /* Bits buffered for output */
static int bits_to_go; /* Number of bits free in buffer */
static int zbuffer; /* Number of zero bytes buffered up */
static int lobuffer; /* one if z lo tail in effect */
/* INITIALIZE FOR BIT OUTPUT. */
static void start_outputing_bits() {
buffer = 0; /* Buffer is empty to start */
bits_to_go= 8; /* with. */
zbuffer = 0; /* number of zero bytes buffered up */
lobuffer = 0; /* one if possible cuttoff */
}
/* OUTPUT A BIT. */
static void output_bit(int bit) throws IOException
{ buffer >>= 1;
if (bit != 0) buffer |= 0x80; /* Put bit in top of buffer.*/
bits_to_go -= 1;
if (bits_to_go==0) { /* Output buffer if it is */
if (buffer != 0) {
if (lobuffer != 0) writeOut(0x40);
if (zbuffer > 0 && buffer == 0x40) lobuffer = 1;
if (buffer != 0x40) lobuffer = 0; /* 0x40 just because its cool */
for(;zbuffer > 0; zbuffer--) writeOut(0);
if (lobuffer != 1) writeOut(buffer);
}
else zbuffer++;
bits_to_go = 8;
buffer = 0;
}
}
/* FLUSH OUT THE LAST BITS. */
static void done_outputing_bits() throws IOException
{ /* write out remaining bits most of the time */
if (buffer != 0) {
if (bits_to_go != 0) buffer >>= bits_to_go;
if (lobuffer != 0) writeOut(0x40);
if (zbuffer > 0 && buffer == 0x40) lobuffer = 1;
if (buffer != 0x40) lobuffer = 0;
for(;zbuffer > 0; zbuffer--) writeOut(0);
if (lobuffer != 1) writeOut(buffer);
}
}
/* CURRENT STATE OF THE ENCODING. */
static long low, high, swape; /* Ends of the current code region */
static long bits_to_follow; /* Number of opposite bits to output after */
/* the next bit. */
/* OUTPUT BITS PLUS FOLLOWING OPPOSITE BITS. */
static void bit_plus_follow(int bit) throws IOException {
output_bit(bit); /* Output the bit. */
while(bits_to_follow>0) {
output_bit((bit == 0) ? 1 : 0); /* Output bits_to_follow */
bits_to_follow -= 1; /* opposite bits. Set */
} /* bits_to_follow to zero. */
}
/* ARITHMETIC ENCODING ALGORITHM. */
/* START ENCODING A STREAM OF SYMBOLS. */
static void start_encoding() {
low = 0; /* Full code range. */
high =(((long)1<<Code_value_bits)-1);
bits_to_follow = 0; /* No bits to follow next. */
}
static long dss(long r,long top,long bot) throws IOException {
long t1,t2; /* r could be 64 bits top and bot 32 bits */
/* top < bot */
if (bot == 0) fprintf(" not possible \n");
t1 = r/bot;
t2 = r -(t1*bot);
t1 = t1*top;
t2 = t2*top;
t2 = t2/bot;
return(t1 + t2);
}
/* ENCODE A SYMBOL. */
static void encode_symbol(int symbol,long cum_freq[]) throws IOException {
long range; /* Size of the current code region */
range =(long)(high-low)+1;
high = low + dss(range,cum_freq[symbol-1],cum_freq[0]) -1;
// (range*cum_freq[symbol-1])/cum_freq[0]-1;
low = low + dss(range, cum_freq[symbol],cum_freq[0]);
// (range*cum_freq[symbol])/cum_freq[0];
low ^=(((long)1<<Code_value_bits)-1); /* mapping so largest segment at bottom */
high ^=(((long)1<<Code_value_bits)-1); /* key is so symbol 2 will have at least */
swape = low; /* one "one bit" so bijective at end */
low = high;
high = swape;
for(;;) { /* Loop to output bits. */
if (high<(2*((((long)1<<Code_value_bits)-1)/4+1))) {
bit_plus_follow(0); /* Output 0 if in low half. */
}
else if (low>=(2*((((long)1<<Code_value_bits)-1)/4+1))) { /* Output 1 if in high half.*/
bit_plus_follow(1);
low -=(2*((((long)1<<Code_value_bits)-1)/4+1));
high -=(2*((((long)1<<Code_value_bits)-1)/4+1)); /* Subtract offset to top. */
}
else if (low>=((((long)1<<Code_value_bits)-1)/4+1) /* Output an opposite bit */
&& high<(3*((((long)1<<Code_value_bits)-1)/4+1))) { /* later if in middle half. */
bits_to_follow += 1;
low -=((((long)1<<Code_value_bits)-1)/4+1); /* Subtract offset to middle*/
high -=((((long)1<<Code_value_bits)-1)/4+1);
}
else
break; /* Otherwise exit loop. */
low = 2*low;
high = 2*high+1; /* Scale up code range. */
}
low ^=(((long)1<<Code_value_bits)-1);
high ^=(((long)1<<Code_value_bits)-1);
swape = low;
low = high;
high = swape;
}
/* FINISH ENCODING THE STREAM. */
static void done_encoding() throws IOException {
if (bits_to_follow > 0) { /* saving at least one bit here */
bit_plus_follow(1);
}
else
{
if (high !=(((long)1<<Code_value_bits)-1)) bit_plus_follow(1);
}
}
/* THE ADAPTIVE SOURCE MODEL */
/* INITIALIZE THE MODEL. */
static void start_model() {
int i;
for(i = 0; i<No_of_chars; i++) { /* Set up tables that */
char_to_index[i] = i+1; /* translate between symbol */
index_to_char[i+1] =(char) i; /* indexes and characters. */
}
for(i = 0; i<=No_of_symbols; i++) { /* Set up initial frequency */
freq[i] = 1; /* counts to be one for all */
cum_freq[i] = No_of_symbols-i; /* symbols. */
}
freq[0] = 0; /* Freq[0] must not be the */
} /* same as freq[1]. */
/* UPDATE THE MODEL TO ACCOUNT FOR A NEW SYMBOL. */
static void update_model(int symbol) throws IOException {
int i; /* New index for symbol */
for(i = symbol; freq[i]==freq[i-1]; i--) ; /* Find symbol's new index. */
if (i<symbol) {
int ch_i, ch_symbol;
ch_i = index_to_char[i]; /* Update the translation */
ch_symbol = index_to_char[symbol]; /* tables if the symbol has */
index_to_char[i] =(char) ch_symbol; /* moved. */
index_to_char[symbol] =(char) ch_i;
char_to_index[ch_i] = symbol;
char_to_index[ch_symbol] = i;
}
freq[i] ++; /* Increment the frequency */
while(i>0) { /* count for the symbol and */
i -= 1; /* update the cumulative */
cum_freq[i] ++; /* frequencies. */
}
}
static void fprintf(String s) {
System.out.println(s);
}
/*
static void fprintf(String s) {
Log.log(s);
}
static void putc(char c) {
Log.log("" + c);
}
static void putc(byte b) {
Log.log("" + b);
}
static void putc(int i) {
Log.log("" +(byte)i);
}
*/
static void writeOut(int i) throws IOException {
out.write((byte)i);
}
/**
* Compresses input_file and store the results in output_file
*/
public static void compress(String input_file, String output_file) throws IOException {
int zerf = 0;
int onef = 0;
FileInputStream in = null;
try {
in = new FileInputStream(input_file);
out = new FileOutputStream(output_file);
byte[] _array = new byte[1];
int rb;
start_model(); /* Set up other modules. */
start_outputing_bits();
start_encoding();
for(;;) { /* Loop through characters. */
int ch;
int symbol;
if ((rb = in.read(_array, 0, 1)) == -1) {/* Read the next character. */
break;
}
ch = _array[0] & 0xff;
symbol = char_to_index[ch]; /* Translate to an index. */
if (symbol == 1) zerf = 1;
if (zerf == 1 && symbol == 2) onef = 1;
if (symbol > 1) zerf = 0;
if (symbol > 2) onef = 0;
encode_symbol(symbol,cum_freq); /* Encode that symbol. */
update_model(symbol); /* Update the model. */
}
if ((zerf + onef) > 0) {
encode_symbol(2, cum_freq);
update_model(2);
}
// encode_symbol(EOF_symbol,cum_freq); /* Encode the EOF symbol. */
done_encoding(); /* Send the last few bits. */
done_outputing_bits();
in.close();
out.close();
}
catch(Exception e) {
fprintf("Error while processing file:");
e.printStackTrace();
}
}
/**
* Handles the Commmand-line interface to the compressor
*/
public static void main(String[] args) throws IOException {
String input_file = null;
String output_file = null;
fprintf("Bijective Arithmetic coding version May 30, 2001\n");
fprintf("Arithmetic coding from ");
if (args.length > 0) {
input_file = args[0];
fprintf(args[0]);
}
else {
fprintf("Error: No input file specified");
System.exit(0);
}
fprintf(" to ");
if (args.length > 1) {
output_file = args[1];
fprintf(args[1]);
}
else {
fprintf("Error: No output file specified");
System.exit(0);
}
fprintf("\n");
compress(input_file, output_file);
}
}
|
CompressArithmetic |
|