001 package org.biojava3.core.sequence.storage;
002
003 import java.util.Iterator;
004 import java.util.List;
005 import java.util.Map;
006
007 import org.biojava3.core.sequence.AccessionID;
008 import org.biojava3.core.sequence.compound.NucleotideCompound;
009 import org.biojava3.core.sequence.template.Compound;
010 import org.biojava3.core.sequence.template.CompoundSet;
011 import org.biojava3.core.sequence.template.SequenceMixin;
012 import org.biojava3.core.sequence.template.ProxySequenceReader;
013 import org.biojava3.core.sequence.template.Sequence;
014 import org.biojava3.core.sequence.template.SequenceView;
015
016 /**
017 * An implementation of the popular bit encodings. This class provides the
018 * Sequence view over what is actually carried out in the {@link BitArrayWorker}
019 * instances. These are the objects that carry out array storage as well as
020 * indexing into those arrays. New bit encodings can be written by extending
021 * this class and a worker class. There are a number of issues with this
022 * type of storage engine:
023 *
024 * <ul>
025 * <li>We can only support a finite number of {@link Compound}s; 2 bit allows no N compounds</li>
026 * <li>For real savings you must read the sequence in using your own
027 * Reader and a {@link BitArrayWorker} instance</li>
028 * </ul>
029 *
030 * @author ayates
031 *
032 * @param <C> Type of compound; must extend {@link NucleotideCompound}
033 */
034 public class BitSequenceReader<C extends Compound> implements ProxySequenceReader<C> {
035
036 private final AccessionID accession;
037 private final BitArrayWorker<C> worker;
038
039 /**
040 * Instance which allows you to supply a different @{BitArrayWorker}
041 * object.
042 */
043 public BitSequenceReader(BitArrayWorker<C> worker, AccessionID accession) {
044 this.accession = accession;
045 this.worker = worker;
046 }
047
048 /**
049 * Class is immutable & so this is unsupported
050 */
051
052 public void setCompoundSet(CompoundSet<C> compoundSet) {
053 throw new UnsupportedOperationException("Cannot reset the CompoundSet; object is immutable");
054 }
055
056 /**
057 * Class is immutable & so this is unsupported
058 */
059
060 public void setContents(String sequence) {
061 throw new UnsupportedOperationException(getClass().getSimpleName() + " is an immutable data structure; cannot reset contents");
062 }
063
064 /**
065 * Counts the number of times a compound appears in this sequence store
066 */
067
068 public int countCompounds(C... compounds) {
069 return SequenceMixin.countCompounds(this, compounds);
070 }
071
072
073 public AccessionID getAccession() {
074 return accession;
075 }
076
077 /**
078 * Returns this Sequence store as a List
079 */
080
081 public List<C> getAsList() {
082 return SequenceMixin.toList(this);
083 }
084
085 /**
086 * Returns the compound at the specified biological index
087 */
088
089 public C getCompoundAt(int position) {
090 return worker.getCompoundAt(position);
091 }
092
093 /**
094 * Returns the compound set backing this store
095 */
096
097 public CompoundSet<C> getCompoundSet() {
098 return worker.getCompoundSet();
099 }
100
101 /**
102 * Returns the first occurrence of the given compound in this store; performs
103 * a linear search
104 */
105
106 public int getIndexOf(C compound) {
107 return SequenceMixin.indexOf(this, compound);
108 }
109
110 /**
111 * Returns the last occurrence of the given compound in this store; performs
112 * a linear search
113 */
114
115 public int getLastIndexOf(C compound) {
116 return SequenceMixin.lastIndexOf(this, compound);
117 }
118
119 /**
120 * Returns the length of the sequence
121 */
122
123 public int getLength() {
124 return worker.getLength();
125 }
126
127 /**
128 * Returns the sequence as a String
129 */
130
131 public String getSequenceAsString() {
132 return SequenceMixin.toStringBuilder(this).toString();
133 }
134
135 /**
136 * Returns a sub sequence view
137 */
138 public SequenceView<C> getSubSequence(final int start, final int end) {
139 return SequenceMixin.createSubSequence(this, start, end);
140 }
141
142 /**
143 * Provides basic iterable access to this class
144 */
145
146 public Iterator<C> iterator() {
147 return SequenceMixin.createIterator(this);
148 }
149
150 public SequenceView<C> getSubSequence(Integer start, Integer end) {
151 return getSubSequence((int) start, (int) end);
152 }
153
154 @Override
155 public SequenceView<C> getInverse() {
156 return SequenceMixin.inverse(this);
157 }
158
159 /**
160 * The logic of working with a bit has been separated out into this class
161 * to help developers create the bit data structures without having to
162 * put the code into an intermediate format and to also use the format
163 * without the need to copy this code.
164 *
165 * This class behaves just like a {@link Sequence} without the interface
166 *
167 * @author ayates
168 *
169 * @param <C> The {@link Compound} to use
170 */
171 public static abstract class BitArrayWorker<C extends Compound> {
172
173 private final CompoundSet<C> compoundSet;
174 private final int length;
175 private final int[] sequence;
176 private transient List<C> indexToCompoundsLookup = null;
177 private transient Map<C, Integer> compoundsToIndexLookup = null;
178 public static final int BYTES_PER_INT = 32;
179
180 public BitArrayWorker(Sequence<C> sequence) {
181 this(sequence.getCompoundSet(), sequence.getLength());
182 populate(sequence);
183 }
184
185 public BitArrayWorker(String sequence, CompoundSet<C> compoundSet) {
186 this(compoundSet, sequence.length());
187 populate(sequence);
188 }
189
190 public BitArrayWorker(CompoundSet<C> compoundSet, int length) {
191 this.compoundSet = compoundSet;
192 this.length = length;
193 this.sequence = new int[seqArraySize(length)];
194 }
195
196 public BitArrayWorker(CompoundSet<C> compoundSet, int[] sequence) {
197 this.compoundSet = compoundSet;
198 this.sequence = sequence;
199 this.length = sequence.length;
200 }
201
202 /**
203 * This method should return the bit mask to be used to extract the
204 * bytes you are interested in working with. See solid implementations
205 * on how to create these
206 */
207 protected abstract byte bitMask();
208
209 /**
210 * Should return the maximum amount of compounds we can encode per int
211 */
212 protected abstract int compoundsPerDatatype();
213
214 /**
215 * Should return the inverse information that {@link #generateCompoundsToIndex() }
216 * returns i.e. if the Compound C returns 1 from compoundsToIndex then we
217 * should find that compound here in position 1
218 */
219 protected abstract List<C> generateIndexToCompounds();
220
221 /**
222 * Returns what the value of a compound is in the backing bit storage i.e.
223 * in 2bit storage the value 0 is encoded as 00 (in binary).
224 */
225 protected abstract Map<C, Integer> generateCompoundsToIndex();
226
227 /**
228 * Returns how many bits are used to represent a compound e.g. 2 if using
229 * 2bit encoding.
230 */
231 protected int bitsPerCompound() {
232 return BYTES_PER_INT / compoundsPerDatatype();
233 }
234
235 public int seqArraySize(int length) {
236 return (int) Math.ceil((double) length / (double) compoundsPerDatatype());
237 }
238
239 /**
240 * Loops through the Compounds in a Sequence and passes them onto
241 * {@link #setCompoundAt(Compound, int)}
242 */
243 public void populate(Sequence<C> sequence) {
244 int position = 1;
245 for (C c : sequence) {
246 setCompoundAt(c, position++);
247 }
248 }
249
250 /**
251 * Loops through the chars in a String and passes them onto
252 * {@link #setCompoundAt(char, int)}
253 */
254 public void populate(String sequence) {
255 for (int index = 0; index < getLength(); index++) {
256 setCompoundAt(sequence.charAt(index), index + 1);
257 }
258 }
259
260 /**
261 * Converts from char to Compound and sets it at the given biological index
262 */
263 public void setCompoundAt(char base, int position) {
264 C compound = getCompoundSet().getCompoundForString(Character.toString(base));
265 setCompoundAt(compound, position);
266 }
267
268 /**
269 * Sets the compound at the specified biological index
270 */
271 public void setCompoundAt(C compound, int position) {
272 int arrayIndex = biologicalIndexToArrayIndex(position);
273 int currentInt = sequence[arrayIndex];
274 int shiftBy = shiftBy(position);
275 Integer integerValue = getCompoundsToIndexLookup().get(compound);
276
277 //If we got nothing then throw an error as it's wrong
278 if (integerValue == null) {
279 processUnknownCompound(compound, position);
280 }
281
282 int shiftedValue = integerValue << shiftBy;
283
284 sequence[arrayIndex] = currentInt | shiftedValue;
285 }
286
287 /**
288 * Returns the compound at the specified biological index
289 */
290 public C getCompoundAt(int position) {
291 //Avoids asking for something which is not encoded by a bit-pair
292 if (position > getLength()) {
293 throw new IllegalArgumentException(position + " is greater than length. Cannot access this position");
294 }
295 //Just stops us from using 0 indexing
296 if (position < 1) {
297 throw new IllegalArgumentException(position + " is less than 1; you must use biological indexing (indexing from 1)");
298 }
299
300 int arrayIndex = biologicalIndexToArrayIndex(position);
301 int currentByte = sequence[arrayIndex];
302 int shiftBy = shiftBy(position);
303 int shifted = (int) (currentByte >>> shiftBy);
304 int masked = (int) (shifted & bitMask());
305
306 //If we could encode 4 compounds then our max masked value is 3
307 if (masked > (compoundsPerDatatype() - 1)) {
308 throw new IllegalStateException("Got a masked value of " + masked + "; do not understand values greater than " + (compoundsPerDatatype() - 1));
309 }
310 return getIndexToCompoundsLookup().get(masked);
311 }
312
313 /**
314 * Since bit encoding only supports a finite number of bases
315 * it is more than likely when processing sequence you will encounter a
316 * compound which is not covered by the encoding e.g. N in a 2bit sequence.
317 * You can override this to convert the unknown base into one you can
318 * process or store locations of unknown bases for a level of post processing
319 * in your subclass.
320 *
321 * @param compound Compound process
322 * @return Byte representation of the compound
323 * @throws IllegalStateException Done whenever this method is invoked
324 */
325 protected byte processUnknownCompound(C compound, int position) throws IllegalStateException {
326 throw new IllegalStateException("Do not know how to translate the compound " + compound + " to a " + bitsPerCompound() + "bit representation");
327 }
328
329 /**
330 * Returns a list of compounds the index position of which is used
331 * to translate from the byte representation into a compound.
332 */
333 protected List<C> getIndexToCompoundsLookup() {
334 if (indexToCompoundsLookup == null) {
335 indexToCompoundsLookup = generateIndexToCompounds();
336 }
337 return indexToCompoundsLookup;
338 }
339
340 /**
341 * Returns a map which converts from compound to an integer representation
342 */
343 protected Map<C, Integer> getCompoundsToIndexLookup() {
344 if (compoundsToIndexLookup == null) {
345 compoundsToIndexLookup = generateCompoundsToIndex();
346 }
347 return compoundsToIndexLookup;
348 }
349
350 /**
351 * Converting a biological index to the int which is used to store that
352 * position's data.
353 *
354 * <ul>
355 * <li>Assuming 2bit encoding using the 17 base in a sequence</li>
356 * <li>Convert into 0 index; 17 - 1 = 16</li>
357 * <li>Divide by the number of compounds per int; 16 / 16</li>
358 * <li>Floor the value; floor(1) = 1</li>
359 * </ul>
360 *
361 * Running this for position 13
362 *
363 * <ul>
364 * <li>13 - 1 = 12</li>
365 * <li>12 / 16 = 0.75</li>
366 * <li>floor(0.75) = 0</li>
367 * </ul>
368 */
369 private int biologicalIndexToArrayIndex(int index) {
370 return ((index - 1) / compoundsPerDatatype());
371 }
372
373 /**
374 * Convert from bio to 0 index, remainder & then multiply by 2
375 * <ul>
376 * <li>Using 2bit encoding and working with position 19</li>
377 * <li>19 is the 3rd position in the second int</li>
378 * <li>Means a shift of 4 into that int to get the right data out</li>
379 * <li>Also must convert into the non-bio index</li>
380 * <li>19 - 1 = 18</li>
381 * <li>18 % compoundsPerDatatype() (16) = 2</li>
382 * <li>2 * bits per compound (2) = 4</li>
383 * </ul>
384 */
385 private byte shiftBy(int index) {
386 return (byte) (((index - 1) % compoundsPerDatatype()) * bitsPerCompound());
387 }
388
389 /**
390 * Returns the compound set backing this store
391 */
392 public CompoundSet<C> getCompoundSet() {
393 return compoundSet;
394 }
395
396 public int getLength() {
397 return length;
398 }
399 }
400 }