001    package org.biojava3.core.sequence.template;
002    
003    import java.io.IOException;
004    import java.util.ArrayList;
005    import java.util.Collections;
006    import java.util.HashMap;
007    import java.util.Iterator;
008    import java.util.List;
009    import java.util.Map;
010    
011    import org.biojava3.core.sequence.compound.NucleotideCompound;
012    import org.biojava3.core.sequence.storage.ArrayListSequenceReader;
013    import org.biojava3.core.sequence.views.ReversedSequenceView;
014    import org.biojava3.core.util.CRC64Checksum;
015    
016    import java.util.NoSuchElementException;
017    import org.biojava3.core.sequence.views.ComplementSequenceView;
018    import org.biojava3.core.sequence.views.WindowedSequence;
019    
020    /**
021     * Provides a set of static methods to be used as static imports when needed
022     * across multiple Sequence implementations but inheritance gets in the way.
023     *
024     * It also provides a place to put utility methods whose application can
025     * be to a single class of Sequence e.g. {@link NucleotideCompound}
026     * {@link Sequence}; or to any Sequence e.g. looking for the
027     * {@link #getComposition(Sequence)} or {@link #getDistribution(Sequence)}
028     * for any type of Sequence.
029     *
030     * All of these methods assume that you can use the {@link Iterable} interface
031     * offered by the implementations of {@link Sequence} to provide all the
032     * compounds that implementation allows you to see. Since sequence should know
033     * nothing about its backing stores (apart from calling out to it) this should
034     * be true.
035     *
036     * @author ayates
037     */
038    public class SequenceMixin {
039    
040        /**
041         * For the given vargs of compounds this method counts the number of
042         * times those compounds appear in the given sequence
043         *
044         * @param sequence The {@link Sequence} to perform the count on
045         * @param compounds The compounds to look for
046         * @param <C> The type of compound we are looking for
047         * @return The number of times the given compounds appear in this Sequence
048         */
049        public static <C extends Compound> int countCompounds(
050                Sequence<C> sequence, C... compounds) {
051            int count = 0;
052            Map<C, Integer> compositon = getComposition(sequence);
053            for (C compound : compounds) {
054                if(compositon.containsKey(compound)) {
055                    count = compositon.get(compound) + count;
056                }
057            }
058            return count;
059        }
060    
061        /**
062         * Returns the count of GC in the given sequence
063         *
064         * @param sequence The {@link NucleotideCompound} {@link Sequence} to perform
065         * the GC analysis on
066         * @return The number of GC compounds in the sequence
067         */
068        public static int countGC(Sequence<NucleotideCompound> sequence) {
069            CompoundSet<NucleotideCompound> cs = sequence.getCompoundSet();
070            NucleotideCompound G = cs.getCompoundForString("G");
071            NucleotideCompound C = cs.getCompoundForString("C");
072            NucleotideCompound g = cs.getCompoundForString("g");
073            NucleotideCompound c = cs.getCompoundForString("c");
074            return countCompounds(sequence, G, C, g, c);
075        }
076    
077        /**
078         * Returns the count of AT in the given sequence
079         *
080         * @param sequence The {@link NucleotideCompound} {@link Sequence} to perform
081         * the AT analysis on
082         * @return The number of AT compounds in the sequence
083         */
084        public static int countAT(Sequence<NucleotideCompound> sequence) {
085            CompoundSet<NucleotideCompound> cs = sequence.getCompoundSet();
086            NucleotideCompound A = cs.getCompoundForString("A");
087            NucleotideCompound T = cs.getCompoundForString("T");
088            NucleotideCompound a = cs.getCompoundForString("a");
089            NucleotideCompound t = cs.getCompoundForString("t");
090            return countCompounds(sequence, A, T, a, t);
091        }
092    
093        /**
094         * Analogous to {@link #getComposition(Sequence)} but returns the
095         * distribution of that {@link Compound} over the given sequence.
096         *
097         * @param <C> The type of compound to look for
098         * @param sequence The type of sequence to look over
099         * @return Returns the decimal fraction of the compounds in the given
100         * sequence. Any compound not in the Map will return a fraction of 0.
101         */
102        public static <C extends Compound> Map<C, Double> getDistribution(Sequence<C> sequence) {
103            Map<C, Double> results = new HashMap<C, Double>();
104            Map<C, Integer> composition = getComposition(sequence);
105            double length = (double) sequence.getLength();
106            for (Map.Entry<C, Integer> entry : composition.entrySet()) {
107                double dist = entry.getValue().doubleValue() / length;
108                results.put(entry.getKey(), dist);
109            }
110            return results;
111        }
112    
113        /**
114         * Does a linear scan over the given Sequence and records the number of
115         * times each base appears. The returned map will return 0 if a compound
116         * is asked for and the Map has no record of it.
117         *
118         * @param <C> The type of compound to look for
119         * @param sequence The type of sequence to look over
120         * @return Counts for the instances of all compounds in the sequence
121         */
122        public static <C extends Compound> Map<C, Integer> getComposition(Sequence<C> sequence) {
123            Map<C, Integer> results = new HashMap<C, Integer>();
124    
125            for (C currentCompound : sequence) {
126                Integer currentInteger = results.get(currentCompound);
127                if ( currentInteger == null)
128                    currentInteger = 0;
129                currentInteger++;
130                results.put(currentCompound, currentInteger);
131            }
132            return results;
133        }
134    
135        /**
136         * Used as a way of sending a Sequence to a writer without the cost of
137         * converting to a full length String and then writing the data out
138         *
139         * @param <C> Type of compound
140         * @param writer The writer to send data to
141         * @param sequence The sequence to write out
142         * @throws IOException Thrown if we encounter a problem
143         */
144        public static <C extends Compound> void write(Appendable appendable, Sequence<C> sequence) throws IOException {
145            for(C compound: sequence) {
146                appendable.append(compound.toString());
147            }
148        }
149    
150        /**
151         * For the given Sequence this will return a {@link StringBuilder} object
152         * filled with the results of {@link Compound#toString()}. Does not
153         * used {@link #write(java.lang.Appendable, org.biojava3.core.sequence.template.Sequence) }
154         * because of its {@link IOException} signature.
155         */
156        public static <C extends Compound> StringBuilder toStringBuilder(Sequence<C> sequence) {
157            StringBuilder sb = new StringBuilder(sequence.getLength());
158            for (C compound : sequence) {
159                sb.append(compound.toString());
160            }
161            return sb;
162        }
163    
164        /**
165         * Shortcut to {@link #toStringBuilder(org.biojava3.core.sequence.template.Sequence)}
166         * which calls toString() on the resulting object.
167         */
168        public static <C extends Compound> String toString(Sequence<C> sequence) {
169            return toStringBuilder(sequence).toString();
170        }
171    
172        /**
173         * For the given {@link Sequence} this will return a {@link List} filled with
174         * the Compounds of that {@link Sequence}.
175         */
176        public static <C extends Compound> List<C> toList(Sequence<C> sequence) {
177            List<C> list = new ArrayList<C>(sequence.getLength());
178            for (C compound : sequence) {
179                list.add(compound);
180            }
181            return list;
182        }
183    
184        /**
185         * Performs a linear search of the given Sequence for the given compound.
186         * Once we find the compound we return the position.
187         */
188        public static <C extends Compound> int indexOf(Sequence<C> sequence,
189                C compound) {
190            int index = 1;
191            for (C currentCompound : sequence) {
192                if (currentCompound.equals(compound)) {
193                    return index;
194                }
195                index++;
196            }
197            return 0;
198        }
199    
200        /**
201         * Performs a reversed linear search of the given Sequence by wrapping
202         * it in a {@link ReversedSequenceView} and passing it into
203         * {@link #indexOf(Sequence, Compound)}. We then inverse the index coming
204         * out of it.
205         */
206        public static <C extends Compound> int lastIndexOf(Sequence<C> sequence,
207                C compound) {
208            int index = indexOf(new ReversedSequenceView<C>(sequence), compound);
209            return (sequence.getLength() - index)+1;
210        }
211    
212        /**
213         * Creates a simple sequence iterator which moves through a sequence going
214         * from 1 to the length of the Sequence. Modification of the Sequence is not
215         * allowed.
216         */
217        public static <C extends Compound> Iterator<C> createIterator(
218                Sequence<C> sequence) {
219            return new SequenceIterator<C>(sequence);
220        }
221    
222        /**
223         * Creates a simple sub sequence view delimited by the given start and end.
224         */
225        public static <C extends Compound> SequenceView<C> createSubSequence(
226                Sequence<C> sequence, int start, int end) {
227            return new SequenceProxyView<C>(sequence, start, end);
228        }
229    
230        /**
231         * Implements sequence shuffling by first materializing the given
232         * {@link Sequence} into a {@link List}, applying
233         * {@link Collections#shuffle(List)} and then returning the shuffled
234         * elements in a new instance of {@link SequenceBackingStore} which behaves
235         * as a {@link Sequence}.
236         */
237        public static <C extends Compound> Sequence<C> shuffle(Sequence<C> sequence) {
238            List<C> compounds = sequence.getAsList();
239            Collections.shuffle(compounds);
240            return new ArrayListSequenceReader<C>(compounds,
241                    sequence.getCompoundSet());
242        }
243    
244        /**
245         * Performs a simple CRC64 checksum on any given sequence.
246         */
247        public static <C extends Compound> String checksum(Sequence<C> sequence) {
248            CRC64Checksum checksum = new CRC64Checksum();
249            for (C compound : sequence) {
250                checksum.update(compound.getShortName());
251            }
252            return checksum.toString();
253        }
254    
255        /**
256         * Produces kmers of the specified size e.g. ATGTGA returns two views which
257         * have ATG TGA
258         *
259         * @param <C> Compound to use
260         * @param sequence Sequence to build from
261         * @param kmer Kmer size
262         * @return The list of non-overlapping K-mers
263         */
264        public static <C extends Compound> List<SequenceView<C>> nonOverlappingKmers(Sequence<C> sequence, int kmer) {
265            List<SequenceView<C>> l = new ArrayList<SequenceView<C>>();
266            WindowedSequence<C> w = new WindowedSequence<C>(sequence, kmer);
267            for(SequenceView<C> view: w) {
268                l.add(view);
269            }
270            return l;
271        }
272    
273        /**
274         * Used to generate overlapping k-mers such i.e. ATGTA will give rise to
275         * ATG, TGT & GTA
276         *
277         * @param <C> Compound to use
278         * @param sequence Sequence to build from
279         * @param kmer Kmer size
280         * @return The list of overlapping K-mers
281         */
282        public static <C extends Compound> List<SequenceView<C>> overlappingKmers(Sequence<C> sequence, int kmer) {
283            List<SequenceView<C>> l = new ArrayList<SequenceView<C>>();
284            List<Iterator<SequenceView<C>>> windows
285                    = new ArrayList<Iterator<SequenceView<C>>>();
286    
287            for(int i=1; i<=kmer; i++) {
288                if(i == 1) {
289                    windows.add(new WindowedSequence<C>(sequence, kmer).iterator());
290                }
291                else {
292                    SequenceView<C> sv = sequence.getSubSequence(i, sequence.getLength());
293                    windows.add(new WindowedSequence<C>(sv, kmer).iterator());
294                }
295            }
296    
297            OUTER: while(true) {
298                for(int i=0; i<kmer; i++) {
299                    Iterator<SequenceView<C>> iterator = windows.get(i);
300                    boolean breakLoop=true;
301                    if(iterator.hasNext()) {
302                        l.add(iterator.next());
303                        breakLoop = false;
304                    }
305                    if(breakLoop) {
306                        break OUTER;
307                    }
308                }
309            }
310            return l;
311        }
312    
313        /**
314         * A method which attempts to do the right thing when is comes to a
315         * reverse/reverse complement
316         *
317         * @param <C> The type of compound
318         * @param sequence The input sequence
319         * @return The inverted sequence which is optionally complemented
320         */
321        @SuppressWarnings("unchecked")
322        public static <C extends Compound> SequenceView<C> inverse(Sequence<C> sequence) {
323            SequenceView<C> reverse = new ReversedSequenceView<C>(sequence);
324            if(sequence.getCompoundSet().isComplementable()) {
325                return new ComplementSequenceView(reverse);
326            }
327            return reverse;
328        }
329    
330        /**
331         * A basic sequence iterator which iterates over the given Sequence by
332         * biological index. This assumes your sequence supports random access
333         * and performs well when doing these operations.
334         *
335         * @author ayates
336         *
337         * @param <C> Type of compound to return
338         */
339        public static class SequenceIterator<C extends Compound>
340                implements Iterator<C> {
341    
342            private final Sequence<C> sequence;
343            private final int length;
344            private int currentPosition = 0;
345    
346            public SequenceIterator(Sequence<C> sequence) {
347                this.sequence = sequence;
348                this.length = sequence.getLength();
349            }
350    
351    
352            public boolean hasNext() {
353                return (currentPosition < length);
354            }
355    
356         
357            public C next() {
358                if(!hasNext()) {
359                    throw new NoSuchElementException("Exhausted sequence of elements");
360                }
361                return sequence.getCompoundAt(++currentPosition);
362            }
363    
364            public void remove() {
365                throw new UnsupportedOperationException("Cannot remove() on a SequenceIterator");
366            }
367        }
368    }