001    /*
002     * To change this template, choose Tools | Templates
003     * and open the template in the editor.
004     */
005    package org.biojava3.core.sequence.storage;
006    
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.biojava3.core.sequence.AccessionID;
import org.biojava3.core.sequence.template.Compound;
import org.biojava3.core.sequence.template.CompoundSet;
import org.biojava3.core.sequence.template.Sequence;
017    
018    /**
019     *
020     * Four bit encoding of the bit formats. This can support up to 16 compounds
021     * from a compound set. To allow us to support the redundant set of Nucleotide
022     * compounds this class will use case-insensitive encoding. The values assigned
023     * to these compounds is also done at runtime; if you want a predictable
024     * ordering then override and use your own encodings. However all
025     * encodings are calculated using lexographical ordering of the compounds
026     * so if a CompoundSet does not change then this encoding should not cauuse
027     * a problem.
028     *
029     * @author ayates
030     */
031    public class FourBitSequenceReader<C extends Compound> extends BitSequenceReader<C> {
032    
033        public FourBitSequenceReader(Sequence<C> sequence) {
034            super(new FourBitArrayWorker<C>(sequence), sequence.getAccession());
035        }
036    
037        public FourBitSequenceReader(String sequence, CompoundSet<C> compoundSet) {
038            this(sequence, compoundSet, new AccessionID("Unknown"));
039        }
040    
041        public FourBitSequenceReader(String sequence, CompoundSet<C> compoundSet, AccessionID accession) {
042            super(new FourBitArrayWorker<C>(sequence, compoundSet), accession);
043        }
044    
045        public FourBitSequenceReader(FourBitArrayWorker<C> worker) {
046            super(worker, new AccessionID("unknown"));
047        }
048    
049        public FourBitSequenceReader(FourBitArrayWorker<C> worker, AccessionID accession) {
050            super(worker, accession);
051        }
052    
053        /**
054         * A four bit per compound implementation of the bit array worker code. This
055         * version can handle upto 16 compounds but this does mean that its ability
056         * to compress a normal sequence is halved (compared to the 1/4 performance
057         * seen with the 2bit workers).
058         *
059         * @param <C> Must extend NucleotideCompound
060         */
061        public static class FourBitArrayWorker<C extends Compound> extends BitArrayWorker<C> {
062    
063            public FourBitArrayWorker(CompoundSet<C> compoundSet, int length) {
064                super(compoundSet, length);
065            }
066    
067            public FourBitArrayWorker(CompoundSet<C> compoundSet, int[] sequence) {
068                super(compoundSet, sequence);
069            }
070    
071            public FourBitArrayWorker(Sequence<C> sequence) {
072                super(sequence);
073            }
074    
075            public FourBitArrayWorker(String sequence, CompoundSet<C> compoundSet) {
076                super(sequence, compoundSet);
077            }
078            /**
079             * Masking value used for extracting the right most 2 bits from a byte
080             */
081            private final static byte MASK = (byte) ((int) Math.pow(2, 0) | (int) Math.pow(2, 1) | (int) Math.pow(2, 2) | (int) Math.pow(2, 3));
082    
083            
084            protected byte bitMask() {
085                return MASK;
086            }
087    
088            
089            protected int compoundsPerDatatype() {
090                return 8;
091            }
092    
093            /**
094             * Returns a Map which encodes the contents of CompoundSet. This
095             * version is case-insensitive i.e. C and c both encode for the same
096             * position. We sort lexigraphically so if the compound set has
097             * not changed then neither will this.
098             */
099            
100            protected Map<C, Integer> generateCompoundsToIndex() {
101                final CompoundSet<C> cs = getCompoundSet();
102                Map<C, Integer> map = new HashMap<C, Integer>();
103                int index = 0;
104                for (C currentCompound : sortedCompounds(cs)) {
105                    C upperCasedCompound = getOptionalUpperCasedCompound(currentCompound, cs);
106    
107                    //if it has the uppercased compound then set this
108                    //compounds' value to that one
109                    if (map.containsKey(upperCasedCompound)) {
110                        map.put(currentCompound, map.get(upperCasedCompound));
111                    } else {
112                        map.put(currentCompound, index++);
113                    }
114                }
115    
116                return map;
117            }
118    
119            private C getOptionalUpperCasedCompound(C currentCompound, CompoundSet<C> cs) {
120                C upperCasedCompound = null;
121                String upperCasedString = cs.getStringForCompound(currentCompound).toUpperCase();
122                if (cs.getCompoundForString(upperCasedString) == null) {
123                    upperCasedCompound = currentCompound;
124                } else {
125                    upperCasedCompound = cs.getCompoundForString(upperCasedString);
126                }
127                return upperCasedCompound;
128            }
129    
130            private List<C> sortedCompounds(final CompoundSet<C> cs) {
131                List<C> compounds = new ArrayList<C>(cs.getAllCompounds());
132                Collections.sort(compounds, new Comparator<C>() {
133    
134                    
135                    public int compare(C o1, C o2) {
136                        String s1 = cs.getStringForCompound(o1);
137                        String s2 = cs.getStringForCompound(o2);
138                        return s1.compareTo(s2);
139                    }
140                });
141                return compounds;
142            }
143    
144            /**
145             * Returns a List which reverse encodes the Compound, Integer map
146             */
147            
148            protected List<C> generateIndexToCompounds() {
149                CompoundSet<C> cs = getCompoundSet();
150                Map<C, Integer> lookup = getCompoundsToIndexLookup();
151                Map<Integer, C> tempMap = new HashMap<Integer, C>();
152                //First get the reverse lookup working
153                for (C compound : lookup.keySet()) {
154                    C upperCasedCompound = getOptionalUpperCasedCompound(compound, cs);
155                    Integer pos = lookup.get(upperCasedCompound);
156                    tempMap.put(pos, upperCasedCompound);
157                }
158    
159                //Then populate the results by going back through the sorted integer keys
160                List<C> compounds = new ArrayList<C>();
161                List<Integer> keys = new ArrayList<Integer>(tempMap.keySet());
162                Collections.sort(keys);
163                for (Integer key : keys) {
164                    compounds.add(tempMap.get(key));
165                }
166    
167                return compounds;
168            }
169        }
170    }