001    package org.biojava3.core.sequence.storage;
002    
003    import java.util.ArrayList;
004    import java.util.HashMap;
005    import java.util.List;
006    import java.util.Map;
007    import org.biojava3.core.sequence.AccessionID;
008    import org.biojava3.core.sequence.compound.NucleotideCompound;
009    import org.biojava3.core.sequence.template.CompoundSet;
010    import org.biojava3.core.sequence.template.Sequence;
011    
012    /**
013     * Implementation of the 2bit encoding. This will default to the following
014     * encodings:
015     *
016     * <ul>
017     * <li>0 - T</li>
018     * <li>1 - C</li>
019     * <li>2 - A</li>
020     * <li>3 - G</li>
021     * </ul>
022     *
023     * We also do not support case sensitive encodings therefore if you pass a
024     * lowercased a this will be treated as if it is an uppercase A and we will
025     * erase that information.
026     *
027     * @author ayates
028     */
029    public class TwoBitSequenceReader<C extends NucleotideCompound> extends BitSequenceReader<C> {
030    
031            public TwoBitSequenceReader(Sequence<C> sequence) {
032                    super(new TwoBitArrayWorker<C>(sequence), sequence.getAccession());
033            }
034    
035            public TwoBitSequenceReader(String sequence, CompoundSet<C> compoundSet) {
036                    this(sequence, compoundSet, new AccessionID("Unknown"));
037            }
038    
039            public TwoBitSequenceReader(String sequence, CompoundSet<C> compoundSet, AccessionID accession) {
040                    super(new TwoBitArrayWorker<C>(sequence, compoundSet), accession);
041            }
042    
043            public TwoBitSequenceReader(TwoBitArrayWorker<C> worker) {
044                    super(worker, new AccessionID("unknown"));
045            }
046    
047            public TwoBitSequenceReader(TwoBitArrayWorker<C> worker, AccessionID accession) {
048                    super(worker, accession);
049            }
050    
051            /**
052             * Extension of the BitArrayWorker which provides the 2bit implementation
053             * code. This is intended to work with the 4 basic nucelotide types. If you
054             * require a different version of the encoding used here then extend
055             * and override as required.
056             *
057             * @param <C> Must extend NucleotideCompound
058             */
059            public static class TwoBitArrayWorker<C extends NucleotideCompound> extends BitArrayWorker<C> {
060    
061                    public TwoBitArrayWorker(CompoundSet<C> compoundSet, int length) {
062                            super(compoundSet, length);
063                    }
064    
065                    public TwoBitArrayWorker(CompoundSet<C> compoundSet, int[] sequence) {
066                            super(compoundSet, sequence);
067                    }
068    
069                    public TwoBitArrayWorker(Sequence<C> sequence) {
070                            super(sequence);
071                    }
072    
073                    public TwoBitArrayWorker(String sequence, CompoundSet<C> compoundSet) {
074                            super(sequence, compoundSet);
075                    }
076    
077                    /**
078                     * Masking value used for extracting the right most 2 bits from a byte
079                     */
080                    private final static byte MASK = (byte) ((int) Math.pow(2, 0) | (int) Math.pow(2, 1));
081    
082                    @Override
083                    protected byte bitMask() {
084                            return MASK;
085                    }
086    
087                    @Override
088                    protected int compoundsPerDatatype() {
089                            return 16;
090                    }
091    
092                    /**
093                     * Returns a Map which encodes TCAG into positions 0,1,2,3.
094                     */
095                    @Override
096                    @SuppressWarnings("serial")
097                    protected Map<C, Integer> generateCompoundsToIndex() {
098                            final CompoundSet<C> cs = getCompoundSet();
099                            return new HashMap<C, Integer>() {
100    
101                                    {
102                                            put(cs.getCompoundForString("T"), 0);
103                                            put(cs.getCompoundForString("C"), 1);
104                                            put(cs.getCompoundForString("A"), 2);
105                                            put(cs.getCompoundForString("G"), 3);
106                                            put(cs.getCompoundForString("t"), 0);
107                                            put(cs.getCompoundForString("c"), 1);
108                                            put(cs.getCompoundForString("a"), 2);
109                                            put(cs.getCompoundForString("g"), 3);
110                                    }
111                            };
112                    }
113    
114                    /**
115                     * Returns a List which encodes TCAG into positions 0,1,2,3.
116                     */
117                    @Override
118                    protected List<C> generateIndexToCompounds() {
119                            CompoundSet<C> cs = getCompoundSet();
120                            List<C> result = new ArrayList<C>();
121                            result.add( cs.getCompoundForString("T"));
122    
123    
124                            result.add( cs.getCompoundForString("C"));
125                            result.add( cs.getCompoundForString("A"));
126                            result.add( cs.getCompoundForString("G"));
127                            return result;
128                    }
129            }
130    
131    }