6.6 Tokenizing Byte Buffers

In this section we present an abstract Tokenizer implementation (see Example 2-8) for tokenizing the contents of a ByteBuffer. The subsections that follow include concrete subclasses for tokenizing characters read from a memory-mapped file or an arbitrary Channel. The ByteBufferTokenizer class in Example 6-6 extends the AbstractTokenizer class of Example 2-9. You may want to reread that example before starting in on this one.

As you recall, the AbstractTokenizer class has abstract methods it calls when it needs more characters to tokenize. The ByteBufferTokenizer class implements these methods to get more characters by using a CharsetDecoder to decode bytes from a ByteBuffer, but it defines and calls new abstract methods when it needs to get more bytes into the ByteBuffer.

As with AbstractTokenizer, the code for this class is a little dense; it is intended as a moderately advanced example. The most interesting thing to note about this example is the use of the CharsetDecoder. Notice how it is obtained from the Charset object, how its error behavior is configured, how the decode( ) method is called, and how the return values of that method are handled. It is useful to compare the use of the CharsetDecoder in this example with the decoding loop of Example 6-5.

Example 6-6. ByteBufferTokenizer.java

package je3.nio;
import java.nio.*;
import java.nio.charset.*;
import java.io.IOException;
import je3.classes.AbstractTokenizer;

/**
 * This is an abstract Tokenizer implementation for tokenizing ByteBuffers.
 * It implements the two abstract methods of AbstractTokenizer, but defines
 * two new abstract methods that subclasses must implement.  This class 
 * provides byte-to-character decoding but leaves it up to concrete subclasses
 * to provide the ByteBuffers to decode
 */
public abstract class ByteBufferTokenizer extends AbstractTokenizer {
    CharsetDecoder decoder;   // For converting bytes to characters
    CharBuffer chars;         // The characters we're working on
    ByteBuffer bytes;         // The bytes supplied by our subclass.

    // Initialize a decoder for the specified Charset, and tell our superclass
    // how big our buffer is (and thus what size tokens we can handle).
    protected ByteBufferTokenizer(Charset charset, int charBufferSize) {
        maximumTokenLength(charBufferSize);
        decoder = charset.newDecoder( );
        decoder.onMalformedInput(CodingErrorAction.IGNORE);
        decoder.onUnmappableCharacter(CodingErrorAction.IGNORE);
    }

    // Create the text[  ] array and set numChars. 
    // These two fields are defined by the superclass.
    // Our superclass needs characters in the text[  ] array.  We're going to
    // decode bytes into characters in a CharBuffer.  So we create a CharBuffer
    // that uses text[  ] as its backing array.
    protected void createBuffer(int bufferSize) {
        // Make sure AbstractTokenizer calls this method only once
        assert text == null;

        text = new char[bufferSize];   // Create the new buffer.
        chars = CharBuffer.wrap(text); // Wrap a char buffer around it.
        numChars = 0;                  // Say how much text it contains.
    }

    // Fill or refill the buffer.
    // See AbstractTokenizer.fillBuffer( ) for what this method must do.
    protected boolean fillBuffer( ) throws IOException {
        // Make sure AbstractTokenizer is upholding its end of the bargain
        assert text!=null && 0 <= tokenStart && tokenStart <= tokenEnd &&
            tokenEnd <= p && p <= numChars && numChars <= text.length;

        // First, shift already tokenized characters out of the buffer
        if (tokenStart > 0) {
            // Shift array contents in the text[  ] array.
            System.arraycopy(text, tokenStart, text, 0, numChars-tokenStart);
            // And update buffer indexes. These fields defined in superclass.
            tokenEnd -= tokenStart; 
            p -= tokenStart;
            numChars -= tokenStart;
            tokenStart = 0; 

            // Keep the CharBuffer in sync with the changes we made above.
            chars.position(p);
        }

        // If there is still no space in the char buffer, then we've
        // encountered a token too large for our buffer size.  
        // We could try to recover by creating a larger buffer, but
        // instead, we just throw an exception
        if (chars.remaining( ) == 0) 
            throw new IOException("Token too long at " + tokenLine( ) + ":" +
                                  tokenColumn( ));

        // Get more bytes if we don't have a buffer or if the buffer 
        // has been emptied
        if ((bytes == null || bytes.remaining( )==0) && hasMoreBytes( ))
            bytes = getMoreBytes( );

        // Now that we have room in the chars buffer and data in the bytes
        // buffer, we can decode some bytes into chars
        CoderResult result = decoder.decode(bytes, chars, !hasMoreBytes( ));

        // Get the index of the last valid character plus one.
        numChars = chars.position( );

        if (result == CoderResult.OVERFLOW) {
            // We've filled up the char buffer.  It wasn't full before, so
            // we know we got at least one new character.
            return true;
        }
        else if (result == CoderResult.UNDERFLOW) {
            // This means that we decoded all the bytes and have room left
            // in the char buffer.  Normally, this is fine.  But there is
            // a possibility that we didn't actually get any characters.
            if (numChars > p) return true;
            else { // We didn't get any new characters.  Figure out why.
                if (!hasMoreBytes( )) {
                    // If there are no more bytes to read, then we're at EOF
                    return false;
                }
                else {
                    // If there are still bytes remaining to read, then 
                    // we probably got part of a multi-byte sequence, and need
                    // more bytes before we can decode a character from it.
                    // Try again (recursively) to get some more bytes.
                    return fillBuffer( );
                }
            }
        }
        else {
            // We used CodingErrorAction.IGNORE for the CharsetDecoder, so
            // the decoding result should always be one of the above two.
            assert false : "Unexpected CoderResult: " + result;
            return false;
        }
    }

    /**
     * Determine if more bytes are available.
     * @return true if and only if more bytes are available for reading.
     */
    protected abstract boolean hasMoreBytes( );
    
    /**
     * Get a buffer of bytes for decoding and tokenizing.
     * Repeated calls to this method may create a new ByteBuffer, 
     * or may refill and return the same buffer each time.
     * @return a ByteBuffer with its position set to the first new byte, and
     *         its limit set to the index of the last new byte plus 1.
     *         The return value should never be null.  If no more bytes are
     *         available, return an empty buffer (with limit == position).
     */
    protected abstract ByteBuffer getMoreBytes( ) throws IOException;
}

6.6.1 Tokenizing Memory Mapped Files

Example 6-7 is a concrete subclass of Example 6-6. MappedFileTokenizer extends ByteBufferTokenizer to tokenize the contents of a given file, as represented by a FileChannel object. The implementation is particularly simple because refilling the byte buffer is simply a matter of memory mapping a new section of the file.

Example 6-7. MappedFileTokenizer.java

package je3.nio;
import java.io.*;
import java.nio.*;
import java.nio.channels.*;
import java.nio.charset.*;

/**
 * This class implements the Tokenizer interface for a FileChannel and Charset.
 * It extends ByteBufferTokenizer and uses FileChannel.map( ) to memory map the
 * contents of the file into a ByteBuffer.
 */
public class MappedFileTokenizer extends ByteBufferTokenizer {
    static final int DEFAULT_BUFFER_SIZE = 32*1024;
    FileChannel channel;    // The file we want to tokenize
    int byteBufferSize;     // What size chunks to map at a time
    long filesize;          // How big the file is
    long fileposition;      // Starting position of the next chunk

    // Construct a tokenizer for the specified FileChannel, assuming the
    // file contains text encoded using the specified Charset.
    public MappedFileTokenizer(FileChannel channel, Charset charset)
        throws IOException
    {
        this(channel, charset, DEFAULT_BUFFER_SIZE, DEFAULT_BUFFER_SIZE);
    }

    // Construct a tokenizer for the specified file and charset, additionally
    // specifying the size of the byte and character buffers to use.
    public MappedFileTokenizer(FileChannel channel, Charset charset,
                               int charBufferSize, int byteBufferSize)
        throws IOException
    {
        super(charset, charBufferSize); // Superclass handles charset and size
        this.channel = channel;
        this.byteBufferSize = byteBufferSize; 
        filesize = channel.size( );      // Get the length of the file
        fileposition = 0;               // And start at the beginning
    }

    // Return true if there are more bytes for us to return
    protected boolean hasMoreBytes( ) { return fileposition < filesize; }

    // Read the next chunk of bytes and return them.
    protected ByteBuffer getMoreBytes( ) throws IOException {
        // Return byteBufferSize bytes, or the number remaining in the file
        // if that is less
        long length = byteBufferSize;
        if (fileposition + length > filesize) length = filesize-fileposition;

        // Memory map the bytes into a buffer
        ByteBuffer buffer =
            channel.map(FileChannel.MapMode.READ_ONLY, fileposition, length);
        // Store the position of the next chunk
        fileposition += length;
        // And return the memory-mapped buffer of bytes.
        return buffer;
    }
}

6.6.2 Tokenizing Channels

Example 6-8 is another subclass of Example 6-6. ChannelTokenizer extends ByteBufferTokenizer to fill the byte buffer from an arbitrary ReadableByteChannel. The implementation is straightforward.

Example 6-8. ChannelTokenizer.java

package je3.nio;
import java.io.*;
import java.nio.*;
import java.nio.channels.*;
import java.nio.charset.*;

public class ChannelTokenizer extends ByteBufferTokenizer {
    static final int DEFAULT_BUFFER_SIZE = 32*1024;
    ReadableByteChannel channel;  // Where the bytes come from
    ByteBuffer buffer;            // Where we put those bytes
    boolean hasMoreBytes;         // Whether there are any more

    // Construct a ChannelTokenizer to tokenize the specified channel, 
    // decoding its bytes using the specified charset.
    public ChannelTokenizer(ReadableByteChannel channel, Charset charset) {
        this(channel, charset, DEFAULT_BUFFER_SIZE, DEFAULT_BUFFER_SIZE);
    }

    // Construct a ChannelTokenizer for the channel and charset, additionally
    // specifying the character and byte buffer sizes to use.
    public ChannelTokenizer(ReadableByteChannel channel, Charset charset,
                            int charBufferSize, int byteBufferSize)
    {
        super(charset, charBufferSize); // Superclass handles charset and size
        this.channel = channel;         // Remember the channel
        this.hasMoreBytes = true;       // Assume some bytes in the channel
        // Allocate the buffer we'll use to store bytes
        buffer = ByteBuffer.allocateDirect(byteBufferSize);
    }

    // Return false when we're at EOF and have returned all bytes.
    protected boolean hasMoreBytes( ) { return hasMoreBytes; }

    // Refill the buffer and return it
    protected ByteBuffer getMoreBytes( ) throws IOException {
        buffer.clear( );  // Clear the buffer; prepare to fill it.
        // Read a chunk of bytes
        int bytesRead = channel.read(buffer);
        // If we are at EOF, remember that for hasMoreBytes( )
        if (bytesRead == -1) hasMoreBytes = false;
        // Prepare the buffer to be drained and return it
        buffer.flip( );   // Set limit to position and position to 0
        return buffer;   // And return it.
    }
}

[ Team LiB ]