001 /*
002 * To change this template, choose Tools | Templates
003 * and open the template in the editor.
004 */
005 package org.biojava3.core.sequence.storage;
006
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.biojava3.core.sequence.AccessionID;
import org.biojava3.core.sequence.template.Compound;
import org.biojava3.core.sequence.template.CompoundSet;
import org.biojava3.core.sequence.template.Sequence;
017
018 /**
019 *
020 * Four bit encoding of the bit formats. This can support up to 16 compounds
021 * from a compound set. To allow us to support the redundant set of Nucleotide
022 * compounds this class will use case-insensitive encoding. The values assigned
023 * to these compounds is also done at runtime; if you want a predictable
024 * ordering then override and use your own encodings. However all
025 * encodings are calculated using lexographical ordering of the compounds
026 * so if a CompoundSet does not change then this encoding should not cauuse
027 * a problem.
028 *
029 * @author ayates
030 */
031 public class FourBitSequenceReader<C extends Compound> extends BitSequenceReader<C> {
032
033 public FourBitSequenceReader(Sequence<C> sequence) {
034 super(new FourBitArrayWorker<C>(sequence), sequence.getAccession());
035 }
036
037 public FourBitSequenceReader(String sequence, CompoundSet<C> compoundSet) {
038 this(sequence, compoundSet, new AccessionID("Unknown"));
039 }
040
041 public FourBitSequenceReader(String sequence, CompoundSet<C> compoundSet, AccessionID accession) {
042 super(new FourBitArrayWorker<C>(sequence, compoundSet), accession);
043 }
044
045 public FourBitSequenceReader(FourBitArrayWorker<C> worker) {
046 super(worker, new AccessionID("unknown"));
047 }
048
049 public FourBitSequenceReader(FourBitArrayWorker<C> worker, AccessionID accession) {
050 super(worker, accession);
051 }
052
053 /**
054 * A four bit per compound implementation of the bit array worker code. This
055 * version can handle upto 16 compounds but this does mean that its ability
056 * to compress a normal sequence is halved (compared to the 1/4 performance
057 * seen with the 2bit workers).
058 *
059 * @param <C> Must extend NucleotideCompound
060 */
061 public static class FourBitArrayWorker<C extends Compound> extends BitArrayWorker<C> {
062
063 public FourBitArrayWorker(CompoundSet<C> compoundSet, int length) {
064 super(compoundSet, length);
065 }
066
067 public FourBitArrayWorker(CompoundSet<C> compoundSet, int[] sequence) {
068 super(compoundSet, sequence);
069 }
070
071 public FourBitArrayWorker(Sequence<C> sequence) {
072 super(sequence);
073 }
074
075 public FourBitArrayWorker(String sequence, CompoundSet<C> compoundSet) {
076 super(sequence, compoundSet);
077 }
078 /**
079 * Masking value used for extracting the right most 2 bits from a byte
080 */
081 private final static byte MASK = (byte) ((int) Math.pow(2, 0) | (int) Math.pow(2, 1) | (int) Math.pow(2, 2) | (int) Math.pow(2, 3));
082
083
084 protected byte bitMask() {
085 return MASK;
086 }
087
088
089 protected int compoundsPerDatatype() {
090 return 8;
091 }
092
093 /**
094 * Returns a Map which encodes the contents of CompoundSet. This
095 * version is case-insensitive i.e. C and c both encode for the same
096 * position. We sort lexigraphically so if the compound set has
097 * not changed then neither will this.
098 */
099
100 protected Map<C, Integer> generateCompoundsToIndex() {
101 final CompoundSet<C> cs = getCompoundSet();
102 Map<C, Integer> map = new HashMap<C, Integer>();
103 int index = 0;
104 for (C currentCompound : sortedCompounds(cs)) {
105 C upperCasedCompound = getOptionalUpperCasedCompound(currentCompound, cs);
106
107 //if it has the uppercased compound then set this
108 //compounds' value to that one
109 if (map.containsKey(upperCasedCompound)) {
110 map.put(currentCompound, map.get(upperCasedCompound));
111 } else {
112 map.put(currentCompound, index++);
113 }
114 }
115
116 return map;
117 }
118
119 private C getOptionalUpperCasedCompound(C currentCompound, CompoundSet<C> cs) {
120 C upperCasedCompound = null;
121 String upperCasedString = cs.getStringForCompound(currentCompound).toUpperCase();
122 if (cs.getCompoundForString(upperCasedString) == null) {
123 upperCasedCompound = currentCompound;
124 } else {
125 upperCasedCompound = cs.getCompoundForString(upperCasedString);
126 }
127 return upperCasedCompound;
128 }
129
130 private List<C> sortedCompounds(final CompoundSet<C> cs) {
131 List<C> compounds = new ArrayList<C>(cs.getAllCompounds());
132 Collections.sort(compounds, new Comparator<C>() {
133
134
135 public int compare(C o1, C o2) {
136 String s1 = cs.getStringForCompound(o1);
137 String s2 = cs.getStringForCompound(o2);
138 return s1.compareTo(s2);
139 }
140 });
141 return compounds;
142 }
143
144 /**
145 * Returns a List which reverse encodes the Compound, Integer map
146 */
147
148 protected List<C> generateIndexToCompounds() {
149 CompoundSet<C> cs = getCompoundSet();
150 Map<C, Integer> lookup = getCompoundsToIndexLookup();
151 Map<Integer, C> tempMap = new HashMap<Integer, C>();
152 //First get the reverse lookup working
153 for (C compound : lookup.keySet()) {
154 C upperCasedCompound = getOptionalUpperCasedCompound(compound, cs);
155 Integer pos = lookup.get(upperCasedCompound);
156 tempMap.put(pos, upperCasedCompound);
157 }
158
159 //Then populate the results by going back through the sorted integer keys
160 List<C> compounds = new ArrayList<C>();
161 List<Integer> keys = new ArrayList<Integer>(tempMap.keySet());
162 Collections.sort(keys);
163 for (Integer key : keys) {
164 compounds.add(tempMap.get(key));
165 }
166
167 return compounds;
168 }
169 }
170 }