Hi I am working on a project on writing our own huffman coding. I am currently having trouble writing the binary 1's and 0's to an output file. It works with smaller input files but for very large files it does not write anything out to the output file. The method responsible for writing is the compress method. Any help would be appreciated. Thank you!

package proj3;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Scanner;
import java.util.PriorityQueue;

public class Project3 {

    //variables for PriorityQueue and Huffman Tree
    private static PriorityQueue<BinaryNode<Character>> queue;
    private static BinaryNode<Character> huffTree;
    private static Map<Character, String> table = new LinkedHashMap<Character, String>();

    /**
     * Method for creating Huffman Tree
     * @param counts Map that contains all characters and their frequencies
     * @return the Huffman Tree
     */
    public static BinaryNode<Character> makeTree(Map<Character, Integer> counts)
    {
        queue = new PriorityQueue<BinaryNode<Character>>();

        for(Character c : counts.keySet())
        {
            BinaryNode<Character> tree = new BinaryNode<Character>(c, counts.get(c), null, null);
            queue.add(tree);
        }

        while(!queue.isEmpty())
        {
            if(queue.size() >= 2)
            {   
                BinaryNode<Character> n1 = queue.remove();
                BinaryNode<Character> n2 = queue.remove();
                Integer weight = n1.getFreq() + n2.getFreq();
                huffTree = new BinaryNode<Character>(null, weight, n1, n2);
                queue.add(huffTree);
            }
            if(queue.size() == 1)
            {
                return queue.remove();
            }
        }
        return huffTree;
    }

    public static void encode(BinaryNode<Character> node, String s)
    {
        if(!node.isLeaf())
        {
            encode(node.getLeft(), s + "0");
            encode(node.getRight(), s + "1");
        }
        else
        {
            table.put(node.getElement(), s);
        }
    }

    public static void compress(String in, String out) throws IOException
    {
        try 
        {
            File outFile = new File(out);
            FileOutputStream compressedFile = new FileOutputStream(outFile);
            Scanner infile = new Scanner(new FileInputStream(in));
            while(infile.hasNext())
            {
                infile.useDelimiter("");
                String str = infile.next();
                Character ch = str.charAt(0);
                compressedFile.write(table.get(ch).getBytes());
            }
            compressedFile.write(table.get('^').getBytes());
            infile.close();
            compressedFile.flush();
            compressedFile.close();
        }
        catch (FileNotFoundException e) 
        {
            System.err.println("File not found.");
            e.printStackTrace();
        }
    }

    public static void decompress(String s)
    {

    }

    public static void printEncodings(Map<Character, String> m)
    {
        ArrayList<Character> chars = new ArrayList<Character>();

        System.out.println("Character Encodings");
        System.out.println("---------------------");
        for(Character c : m.keySet())
        {
            chars.add(c);
            Collections.sort(chars);
        }
        for(Character c : chars)
        {
            if((int) c == 10)
            {
                System.out.print("Newline" + "\t" + table.get(c));
            }
            else
            {
                System.out.println(c + "\t" + table.get(c));
            }
        }
        System.out.println();
        System.out.println("Total Characters: " + chars.size());
    }

    /**
     * Method for creating map with character and its frequencies
     * @param s the file name to be opened
     * @return the Map containing characters and frequencies
     */
    public static Map<Character, Integer> charCount(String s){

        Map<Character, Integer> counts = new LinkedHashMap<Character, Integer>();
        ArrayList<Character> chars = new ArrayList<Character>();

        try {
            Scanner file = new Scanner(new FileInputStream(s));
            while(file.hasNext()){
                file.useDelimiter("");
                String str = file.next();
                Character c = str.charAt(0);
                if(counts.containsKey(c)){
                    counts.put(c, counts.get(c) + 1);
                }
                else{
                    counts.put(c, 1);
                }
            }
            counts.put('^', 1);
            System.out.println("Character Frequencies");
            System.out.println("---------------------");
            for(Character c : counts.keySet())
            {
                chars.add(c);
                Collections.sort(chars);
            }
            for(Character c : chars)
            {
                if((int) c == 10)
                {
                    System.out.print("Newline" + "\t" + counts.get(c));
                }
                else
                {
                    System.out.println(c + "\t" + counts.get(c));
                }
            }
            System.out.println();
            System.out.println("Total characters: " + chars.size() + "\n");
            file.close();
        } 
        catch (FileNotFoundException e) {
            System.err.println("File not found.");
            System.exit(0);
        }
        return counts;
    }

    public static void main(String[] args){

        if(args.length != 3)
        {
            throw new IllegalArgumentException("Invalid number of arguments.");
        }
        encode(makeTree(charCount(args[0])), "");
        printEncodings(table);
        try {
            compress(args[0], args[1]);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

I can't find any problems with it, except that it is bad style to use static fields where local variables could be used, such as queue and huffTree. The compress method is also strangely named since the output file is much larger than the input file, so it's far from compression. Even with all of that, it seemed to work as intended when I tested it.

The compressed file is much larger? I thought I was compressing the characters so they take less bits using huffman encoding. Am I doing something wrong?

For example, when I tested your program it encoded 'A' as the string "1111000101", which is replacing a 1-character string with a 10-character string, a 1000% increase in size. If you are serious about outputting small files then you may want to break up the output into groups of 8 bits and encode each one as a single byte, so 'A' could be encoded as a little over one byte and "AAAA" would be exactly 5 bytes, and characters with smaller encodings would be packed in even tighter.

I'm sorry but I dont see how you are getting 1111000101 for 'A'. Is it by itself in a text file?

BTW, where is the BinaryNode generic class defined? This seems to be a crucial omission.

I think that the crux of the matter is that you have misunderstood what theString class's getBytes() method does. What it returns is an array of bytes containing the raw representation of the chracters themselves in the default chracter set, which is UTF-8. This means that when you write out the String "1111000101", it actually is writing two bytes per character of each digit.

What you actually want is something that takes the encoded String and converts it to the bits in single byte or a pair of bytes. The following should do the trick:

byte[] packEncodedChars(String[] bitStrings) {
    ArrayList<Byte> bits = new ArrayList<Byte>();
    byte target;
    int bitCount = 0;

    for (String s : bitStrings) {
        for (char bit : s.toCharArray()) {
            if (bit.equals("1") {
                target |= 1 << bitCount;
            }
            bitCount++;
            if (bitCount >= 8) {
                bitCount = 0;
                bits.append(target);
                target = 0;
            }
        }
    }
    byte[] bitstring = new byte[bits.Length()];

    for (int i = 0; i < bits.Length(); i++) {
        bitstring[i] = bits.get(i);
    }

    return bitstring;
}

Yes I can see now that my "compressed" file was much larger than the original. Your code works Schol-R-LEA but somehow it doesnt work for small files. Is there any way I can fix that?

I am trying to split the sequence of 1's and 0's (originally put in String format) into groups of 8 and return the corresponding value. Is this an acceptable way?

for(char bit : textbytes.toCharArray())
                {
                    if(bit == '1')
                    {
                        target |= 1 << bitCount;
                    }
                    bitCount++;
                    if(bitCount >= 8)
                    {
                        bitCount = 0;
                        bits.add(target);
                        target = 0;
                    }
                    else
                    {
                        bits.add(target);
                    }
                }
                byte[] bitstring = new byte[bits.size()];
                for(int i = 0; i < bits.size(); i ++)
                {
                    bitstring[i] = bits.get(i);
                    compressedFile.write(bitstring[i]);
                }

somehow it doesnt work for small files

You are likely to get help that comes quicker and is more useful if you explain your problem.

Why are you storing the bytes in bitstring? What use will they be after they have been written to compressedFile?

Why are you adding a byte to your bits list for every bit? That will result in an output file that has a byte for every '1' and every '0', which is just as bad for the output file size as your project was originally. If you want compression, the least you can do is have fewer bytes than you have bits.

tingwong: You say it doesn't work on smaller files? How small are we talking? Keep in mind that with Huffman encoding, there is a minimum size overhead in the form of the encoding table. Adaptive Huffman encoding doesn't have this limitation, but isn't as efficient overall (because you may be changing the encoding of a given character over time).

This article has been dead for over six months. Start a new discussion instead.