001    package org.biojava3.core.sequence.storage;
002    
003    import java.util.Iterator;
004    import java.util.List;
005    import java.util.Map;
006    
007    import org.biojava3.core.sequence.AccessionID;
008    import org.biojava3.core.sequence.compound.NucleotideCompound;
009    import org.biojava3.core.sequence.template.Compound;
010    import org.biojava3.core.sequence.template.CompoundSet;
011    import org.biojava3.core.sequence.template.SequenceMixin;
012    import org.biojava3.core.sequence.template.ProxySequenceReader;
013    import org.biojava3.core.sequence.template.Sequence;
014    import org.biojava3.core.sequence.template.SequenceView;
015    
016    /**
017     * An implementation of the popular bit encodings. This class provides the
018     * Sequence view over what is actually carried out in the {@link BitArrayWorker}
019     * instances. These are the objects that carry out array storage as well as
020     * indexing into those arrays. New bit encodings can be written by extending
021     * this class and a worker class. There are a number of issues with this
022     * type of storage engine:
023     *
024     * <ul>
025     * <li>We can only support a finite number of {@link Compound}s; 2 bit allows no N compounds</li>
026     * <li>For real savings you must read the sequence in using your own
027     * Reader and a {@link BitArrayWorker} instance</li>
028     * </ul>
029     *
030     * @author ayates
031     *
032     * @param <C> Type of compound; must extend {@link NucleotideCompound}
033     */
034    public class BitSequenceReader<C extends Compound> implements ProxySequenceReader<C> {
035    
036        private final AccessionID accession;
037        private final BitArrayWorker<C> worker;
038    
039        /**
040         * Instance which allows you to supply a different @{BitArrayWorker}
041         * object.
042         */
043        public BitSequenceReader(BitArrayWorker<C> worker, AccessionID accession) {
044            this.accession = accession;
045            this.worker = worker;
046        }
047    
048        /**
049         * Class is immutable & so this is unsupported
050         */
051        
052        public void setCompoundSet(CompoundSet<C> compoundSet) {
053            throw new UnsupportedOperationException("Cannot reset the CompoundSet; object is immutable");
054        }
055    
056        /**
057         * Class is immutable & so this is unsupported
058         */
059        
060        public void setContents(String sequence) {
061            throw new UnsupportedOperationException(getClass().getSimpleName() + " is an immutable data structure; cannot reset contents");
062        }
063    
064        /**
065         * Counts the number of times a compound appears in this sequence store
066         */
067        
068        public int countCompounds(C... compounds) {
069            return SequenceMixin.countCompounds(this, compounds);
070        }
071    
072        
073        public AccessionID getAccession() {
074            return accession;
075        }
076    
077        /**
078         * Returns this Sequence store as a List
079         */
080        
081        public List<C> getAsList() {
082            return SequenceMixin.toList(this);
083        }
084    
085        /**
086         * Returns the compound at the specified biological index
087         */
088        
089        public C getCompoundAt(int position) {
090            return worker.getCompoundAt(position);
091        }
092    
093        /**
094         * Returns the compound set backing this store
095         */
096        
097        public CompoundSet<C> getCompoundSet() {
098            return worker.getCompoundSet();
099        }
100    
101        /**
102         * Returns the first occurrence of the given compound in this store; performs
103         * a linear search
104         */
105        
106        public int getIndexOf(C compound) {
107            return SequenceMixin.indexOf(this, compound);
108        }
109    
110        /**
111         * Returns the last occurrence of the given compound in this store; performs
112         * a linear search
113         */
114        
115        public int getLastIndexOf(C compound) {
116            return SequenceMixin.lastIndexOf(this, compound);
117        }
118    
119        /**
120         * Returns the length of the sequence
121         */
122        
123        public int getLength() {
124            return worker.getLength();
125        }
126    
127        /**
128         * Returns the sequence as a String
129         */
130        
131        public String getSequenceAsString() {
132            return SequenceMixin.toStringBuilder(this).toString();
133        }
134    
135        /**
136         * Returns a sub sequence view
137         */
138        public SequenceView<C> getSubSequence(final int start, final int end) {
139            return SequenceMixin.createSubSequence(this, start, end);
140        }
141    
142        /**
143         * Provides basic iterable access to this class
144         */
145        
146        public Iterator<C> iterator() {
147            return SequenceMixin.createIterator(this);
148        }
149        
150        public SequenceView<C> getSubSequence(Integer start, Integer end) {
151            return getSubSequence((int) start, (int) end);
152        }
153    
154        @Override
155        public SequenceView<C> getInverse() {
156            return SequenceMixin.inverse(this);
157        }
158    
159        /**
160         * The logic of working with a bit has been separated out into this class
161         * to help developers create the bit data structures without having to
162         * put the code into an intermediate format and to also use the format
163         * without the need to copy this code.
164         *
165         * This class behaves just like a {@link Sequence} without the interface
166         *
167         * @author ayates
168         *
169         * @param <C> The {@link Compound} to use
170         */
171        public static abstract class BitArrayWorker<C extends Compound> {
172    
173            private final CompoundSet<C> compoundSet;
174            private final int length;
175            private final int[] sequence;
176            private transient List<C> indexToCompoundsLookup = null;
177            private transient Map<C, Integer> compoundsToIndexLookup = null;
178            public static final int BYTES_PER_INT = 32;
179    
180            public BitArrayWorker(Sequence<C> sequence) {
181                this(sequence.getCompoundSet(), sequence.getLength());
182                populate(sequence);
183            }
184    
185            public BitArrayWorker(String sequence, CompoundSet<C> compoundSet) {
186                this(compoundSet, sequence.length());
187                populate(sequence);
188            }
189    
190            public BitArrayWorker(CompoundSet<C> compoundSet, int length) {
191                this.compoundSet = compoundSet;
192                this.length = length;
193                this.sequence = new int[seqArraySize(length)];
194            }
195    
196            public BitArrayWorker(CompoundSet<C> compoundSet, int[] sequence) {
197                this.compoundSet = compoundSet;
198                this.sequence = sequence;
199                this.length = sequence.length;
200            }
201    
202            /**
203             * This method should return the bit mask to be used to extract the
204             * bytes you are interested in working with. See solid implementations
205             * on how to create these
206             */
207            protected abstract byte bitMask();
208    
209            /**
210             * Should return the maximum amount of compounds we can encode per int
211             */
212            protected abstract int compoundsPerDatatype();
213    
214            /**
215             * Should return the inverse information that {@link #generateCompoundsToIndex() }
216             * returns i.e. if the Compound C returns 1 from compoundsToIndex then we
217             * should find that compound here in position 1
218             */
219            protected abstract List<C> generateIndexToCompounds();
220    
221            /**
222             * Returns what the value of a compound is in the backing bit storage i.e.
223             * in 2bit storage the value 0 is encoded as 00 (in binary).
224             */
225            protected abstract Map<C, Integer> generateCompoundsToIndex();
226    
227            /**
228             * Returns how many bits are used to represent a compound e.g. 2 if using
229             * 2bit encoding.
230             */
231            protected int bitsPerCompound() {
232                return BYTES_PER_INT / compoundsPerDatatype();
233            }
234    
235            public int seqArraySize(int length) {
236                return (int) Math.ceil((double) length / (double) compoundsPerDatatype());
237            }
238    
239            /**
240             * Loops through the Compounds in a Sequence and passes them onto
241             * {@link #setCompoundAt(Compound, int)}
242             */
243            public void populate(Sequence<C> sequence) {
244                int position = 1;
245                for (C c : sequence) {
246                    setCompoundAt(c, position++);
247                }
248            }
249    
250            /**
251             * Loops through the chars in a String and passes them onto
252             * {@link #setCompoundAt(char, int)}
253             */
254            public void populate(String sequence) {
255                for (int index = 0; index < getLength(); index++) {
256                    setCompoundAt(sequence.charAt(index), index + 1);
257                }
258            }
259    
260            /**
261             * Converts from char to Compound and sets it at the given biological index
262             */
263            public void setCompoundAt(char base, int position) {
264                C compound = getCompoundSet().getCompoundForString(Character.toString(base));
265                setCompoundAt(compound, position);
266            }
267    
268            /**
269             * Sets the compound at the specified biological index
270             */
271            public void setCompoundAt(C compound, int position) {
272                int arrayIndex = biologicalIndexToArrayIndex(position);
273                int currentInt = sequence[arrayIndex];
274                int shiftBy = shiftBy(position);
275                Integer integerValue = getCompoundsToIndexLookup().get(compound);
276    
277                //If we got nothing then throw an error as it's wrong
278                if (integerValue == null) {
279                    processUnknownCompound(compound, position);
280                }
281    
282                int shiftedValue = integerValue << shiftBy;
283    
284                sequence[arrayIndex] = currentInt | shiftedValue;
285            }
286    
287            /**
288             * Returns the compound at the specified biological index
289             */
290            public C getCompoundAt(int position) {
291                //Avoids asking for something which is not encoded by a bit-pair
292                if (position > getLength()) {
293                    throw new IllegalArgumentException(position + " is greater than length. Cannot access this position");
294                }
295                //Just stops us from using 0 indexing
296                if (position < 1) {
297                    throw new IllegalArgumentException(position + " is less than 1; you must use biological indexing (indexing from 1)");
298                }
299    
300                int arrayIndex = biologicalIndexToArrayIndex(position);
301                int currentByte = sequence[arrayIndex];
302                int shiftBy = shiftBy(position);
303                int shifted = (int) (currentByte >>> shiftBy);
304                int masked = (int) (shifted & bitMask());
305    
306                //If we could encode 4 compounds then our max masked value is 3
307                if (masked > (compoundsPerDatatype() - 1)) {
308                    throw new IllegalStateException("Got a masked value of " + masked + "; do not understand values greater than " + (compoundsPerDatatype() - 1));
309                }
310                return getIndexToCompoundsLookup().get(masked);
311            }
312    
313            /**
314             * Since bit encoding only supports a finite number of bases
315             * it is more than likely when processing sequence you will encounter a
316             * compound which is not covered by the encoding e.g. N in a 2bit sequence.
317             * You can override this to convert the unknown base into one you can
318             * process or store locations of unknown bases for a level of post processing
319             * in your subclass.
320             *
321             * @param compound Compound process
322             * @return Byte representation of the compound
323             * @throws IllegalStateException Done whenever this method is invoked
324             */
325            protected byte processUnknownCompound(C compound, int position) throws IllegalStateException {
326                throw new IllegalStateException("Do not know how to translate the compound " + compound + " to a " + bitsPerCompound() + "bit representation");
327            }
328    
329            /**
330             * Returns a list of compounds the index position of which is used
331             * to translate from the byte representation into a compound.
332             */
333            protected List<C> getIndexToCompoundsLookup() {
334                if (indexToCompoundsLookup == null) {
335                    indexToCompoundsLookup = generateIndexToCompounds();
336                }
337                return indexToCompoundsLookup;
338            }
339    
340            /**
341             * Returns a map which converts from compound to an integer representation
342             */
343            protected Map<C, Integer> getCompoundsToIndexLookup() {
344                if (compoundsToIndexLookup == null) {
345                    compoundsToIndexLookup = generateCompoundsToIndex();
346                }
347                return compoundsToIndexLookup;
348            }
349    
350            /**
351             * Converting a biological index to the int which is used to store that
352             * position's data.
353             *
354             * <ul>
355             * <li>Assuming 2bit encoding using the 17 base in a sequence</li>
356             * <li>Convert into 0 index; 17 - 1 = 16</li>
357             * <li>Divide by the number of compounds per int; 16 / 16</li>
358             * <li>Floor the value; floor(1) = 1</li>
359             * </ul>
360             *
361             * Running this for position 13
362             *
363             * <ul>
364             * <li>13 - 1 = 12</li>
365             * <li>12 / 16 = 0.75</li>
366             * <li>floor(0.75) = 0</li>
367             * </ul>
368             */
369            private int biologicalIndexToArrayIndex(int index) {
370                return ((index - 1) / compoundsPerDatatype());
371            }
372    
373            /**
374             * Convert from bio to 0 index, remainder & then multiply by 2
375             * <ul>
376             * <li>Using 2bit encoding and working with position 19</li>
377             * <li>19 is the 3rd position in the second int</li>
378             * <li>Means a shift of 4 into that int to get the right data out</li>
379             * <li>Also must convert into the non-bio index</li>
380             * <li>19 - 1 = 18</li>
381             * <li>18 % compoundsPerDatatype() (16) = 2</li>
382             * <li>2 * bits per compound (2) = 4</li>
383             * </ul>
384             */
385            private byte shiftBy(int index) {
386                return (byte) (((index - 1) % compoundsPerDatatype()) * bitsPerCompound());
387            }
388    
389            /**
390             * Returns the compound set backing this store
391             */
392            public CompoundSet<C> getCompoundSet() {
393                return compoundSet;
394            }
395    
396            public int getLength() {
397                return length;
398            }
399        }
400    }