001 package org.biojava3.core.sequence.storage;
002
003 import java.util.ArrayList;
004 import java.util.HashMap;
005 import java.util.List;
006 import java.util.Map;
007 import org.biojava3.core.sequence.AccessionID;
008 import org.biojava3.core.sequence.compound.NucleotideCompound;
009 import org.biojava3.core.sequence.template.CompoundSet;
010 import org.biojava3.core.sequence.template.Sequence;
011
012 /**
013 * Implementation of the 2bit encoding. This will default to the following
014 * encodings:
015 *
016 * <ul>
017 * <li>0 - T</li>
018 * <li>1 - C</li>
019 * <li>2 - A</li>
020 * <li>3 - G</li>
021 * </ul>
022 *
023 * We also do not support case sensitive encodings therefore if you pass a
024 * lowercased a this will be treated as if it is an uppercase A and we will
025 * erase that information.
026 *
027 * @author ayates
028 */
029 public class TwoBitSequenceReader<C extends NucleotideCompound> extends BitSequenceReader<C> {
030
031 public TwoBitSequenceReader(Sequence<C> sequence) {
032 super(new TwoBitArrayWorker<C>(sequence), sequence.getAccession());
033 }
034
035 public TwoBitSequenceReader(String sequence, CompoundSet<C> compoundSet) {
036 this(sequence, compoundSet, new AccessionID("Unknown"));
037 }
038
039 public TwoBitSequenceReader(String sequence, CompoundSet<C> compoundSet, AccessionID accession) {
040 super(new TwoBitArrayWorker<C>(sequence, compoundSet), accession);
041 }
042
043 public TwoBitSequenceReader(TwoBitArrayWorker<C> worker) {
044 super(worker, new AccessionID("unknown"));
045 }
046
047 public TwoBitSequenceReader(TwoBitArrayWorker<C> worker, AccessionID accession) {
048 super(worker, accession);
049 }
050
051 /**
052 * Extension of the BitArrayWorker which provides the 2bit implementation
053 * code. This is intended to work with the 4 basic nucelotide types. If you
054 * require a different version of the encoding used here then extend
055 * and override as required.
056 *
057 * @param <C> Must extend NucleotideCompound
058 */
059 public static class TwoBitArrayWorker<C extends NucleotideCompound> extends BitArrayWorker<C> {
060
061 public TwoBitArrayWorker(CompoundSet<C> compoundSet, int length) {
062 super(compoundSet, length);
063 }
064
065 public TwoBitArrayWorker(CompoundSet<C> compoundSet, int[] sequence) {
066 super(compoundSet, sequence);
067 }
068
069 public TwoBitArrayWorker(Sequence<C> sequence) {
070 super(sequence);
071 }
072
073 public TwoBitArrayWorker(String sequence, CompoundSet<C> compoundSet) {
074 super(sequence, compoundSet);
075 }
076
077 /**
078 * Masking value used for extracting the right most 2 bits from a byte
079 */
080 private final static byte MASK = (byte) ((int) Math.pow(2, 0) | (int) Math.pow(2, 1));
081
082 @Override
083 protected byte bitMask() {
084 return MASK;
085 }
086
087 @Override
088 protected int compoundsPerDatatype() {
089 return 16;
090 }
091
092 /**
093 * Returns a Map which encodes TCAG into positions 0,1,2,3.
094 */
095 @Override
096 @SuppressWarnings("serial")
097 protected Map<C, Integer> generateCompoundsToIndex() {
098 final CompoundSet<C> cs = getCompoundSet();
099 return new HashMap<C, Integer>() {
100
101 {
102 put(cs.getCompoundForString("T"), 0);
103 put(cs.getCompoundForString("C"), 1);
104 put(cs.getCompoundForString("A"), 2);
105 put(cs.getCompoundForString("G"), 3);
106 put(cs.getCompoundForString("t"), 0);
107 put(cs.getCompoundForString("c"), 1);
108 put(cs.getCompoundForString("a"), 2);
109 put(cs.getCompoundForString("g"), 3);
110 }
111 };
112 }
113
114 /**
115 * Returns a List which encodes TCAG into positions 0,1,2,3.
116 */
117 @Override
118 protected List<C> generateIndexToCompounds() {
119 CompoundSet<C> cs = getCompoundSet();
120 List<C> result = new ArrayList<C>();
121 result.add( cs.getCompoundForString("T"));
122
123
124 result.add( cs.getCompoundForString("C"));
125 result.add( cs.getCompoundForString("A"));
126 result.add( cs.getCompoundForString("G"));
127 return result;
128 }
129 }
130
131 }