001 package org.biojava3.core.sequence.template;
002
003 import java.io.IOException;
004 import java.util.ArrayList;
005 import java.util.Collections;
006 import java.util.HashMap;
007 import java.util.Iterator;
008 import java.util.List;
009 import java.util.Map;
010
011 import org.biojava3.core.sequence.compound.NucleotideCompound;
012 import org.biojava3.core.sequence.storage.ArrayListSequenceReader;
013 import org.biojava3.core.sequence.views.ReversedSequenceView;
014 import org.biojava3.core.util.CRC64Checksum;
015
016 import java.util.NoSuchElementException;
017 import org.biojava3.core.sequence.views.ComplementSequenceView;
018 import org.biojava3.core.sequence.views.WindowedSequence;
019
020 /**
021 * Provides a set of static methods to be used as static imports when needed
022 * across multiple Sequence implementations but inheritance gets in the way.
023 *
024 * It also provides a place to put utility methods whose application can
025 * be to a single class of Sequence e.g. {@link NucleotideCompound}
026 * {@link Sequence}; or to any Sequence e.g. looking for the
027 * {@link #getComposition(Sequence)} or {@link #getDistribution(Sequence)}
028 * for any type of Sequence.
029 *
030 * All of these methods assume that you can use the {@link Iterable} interface
031 * offered by the implementations of {@link Sequence} to provide all the
032 * compounds that implementation allows you to see. Since sequence should know
033 * nothing about its backing stores (apart from calling out to it) this should
034 * be true.
035 *
036 * @author ayates
037 */
038 public class SequenceMixin {
039
040 /**
041 * For the given vargs of compounds this method counts the number of
042 * times those compounds appear in the given sequence
043 *
044 * @param sequence The {@link Sequence} to perform the count on
045 * @param compounds The compounds to look for
046 * @param <C> The type of compound we are looking for
047 * @return The number of times the given compounds appear in this Sequence
048 */
049 public static <C extends Compound> int countCompounds(
050 Sequence<C> sequence, C... compounds) {
051 int count = 0;
052 Map<C, Integer> compositon = getComposition(sequence);
053 for (C compound : compounds) {
054 if(compositon.containsKey(compound)) {
055 count = compositon.get(compound) + count;
056 }
057 }
058 return count;
059 }
060
061 /**
062 * Returns the count of GC in the given sequence
063 *
064 * @param sequence The {@link NucleotideCompound} {@link Sequence} to perform
065 * the GC analysis on
066 * @return The number of GC compounds in the sequence
067 */
068 public static int countGC(Sequence<NucleotideCompound> sequence) {
069 CompoundSet<NucleotideCompound> cs = sequence.getCompoundSet();
070 NucleotideCompound G = cs.getCompoundForString("G");
071 NucleotideCompound C = cs.getCompoundForString("C");
072 NucleotideCompound g = cs.getCompoundForString("g");
073 NucleotideCompound c = cs.getCompoundForString("c");
074 return countCompounds(sequence, G, C, g, c);
075 }
076
077 /**
078 * Returns the count of AT in the given sequence
079 *
080 * @param sequence The {@link NucleotideCompound} {@link Sequence} to perform
081 * the AT analysis on
082 * @return The number of AT compounds in the sequence
083 */
084 public static int countAT(Sequence<NucleotideCompound> sequence) {
085 CompoundSet<NucleotideCompound> cs = sequence.getCompoundSet();
086 NucleotideCompound A = cs.getCompoundForString("A");
087 NucleotideCompound T = cs.getCompoundForString("T");
088 NucleotideCompound a = cs.getCompoundForString("a");
089 NucleotideCompound t = cs.getCompoundForString("t");
090 return countCompounds(sequence, A, T, a, t);
091 }
092
093 /**
094 * Analogous to {@link #getComposition(Sequence)} but returns the
095 * distribution of that {@link Compound} over the given sequence.
096 *
097 * @param <C> The type of compound to look for
098 * @param sequence The type of sequence to look over
099 * @return Returns the decimal fraction of the compounds in the given
100 * sequence. Any compound not in the Map will return a fraction of 0.
101 */
102 public static <C extends Compound> Map<C, Double> getDistribution(Sequence<C> sequence) {
103 Map<C, Double> results = new HashMap<C, Double>();
104 Map<C, Integer> composition = getComposition(sequence);
105 double length = (double) sequence.getLength();
106 for (Map.Entry<C, Integer> entry : composition.entrySet()) {
107 double dist = entry.getValue().doubleValue() / length;
108 results.put(entry.getKey(), dist);
109 }
110 return results;
111 }
112
113 /**
114 * Does a linear scan over the given Sequence and records the number of
115 * times each base appears. The returned map will return 0 if a compound
116 * is asked for and the Map has no record of it.
117 *
118 * @param <C> The type of compound to look for
119 * @param sequence The type of sequence to look over
120 * @return Counts for the instances of all compounds in the sequence
121 */
122 public static <C extends Compound> Map<C, Integer> getComposition(Sequence<C> sequence) {
123 Map<C, Integer> results = new HashMap<C, Integer>();
124
125 for (C currentCompound : sequence) {
126 Integer currentInteger = results.get(currentCompound);
127 if ( currentInteger == null)
128 currentInteger = 0;
129 currentInteger++;
130 results.put(currentCompound, currentInteger);
131 }
132 return results;
133 }
134
135 /**
136 * Used as a way of sending a Sequence to a writer without the cost of
137 * converting to a full length String and then writing the data out
138 *
139 * @param <C> Type of compound
140 * @param writer The writer to send data to
141 * @param sequence The sequence to write out
142 * @throws IOException Thrown if we encounter a problem
143 */
144 public static <C extends Compound> void write(Appendable appendable, Sequence<C> sequence) throws IOException {
145 for(C compound: sequence) {
146 appendable.append(compound.toString());
147 }
148 }
149
150 /**
151 * For the given Sequence this will return a {@link StringBuilder} object
152 * filled with the results of {@link Compound#toString()}. Does not
153 * used {@link #write(java.lang.Appendable, org.biojava3.core.sequence.template.Sequence) }
154 * because of its {@link IOException} signature.
155 */
156 public static <C extends Compound> StringBuilder toStringBuilder(Sequence<C> sequence) {
157 StringBuilder sb = new StringBuilder(sequence.getLength());
158 for (C compound : sequence) {
159 sb.append(compound.toString());
160 }
161 return sb;
162 }
163
164 /**
165 * Shortcut to {@link #toStringBuilder(org.biojava3.core.sequence.template.Sequence)}
166 * which calls toString() on the resulting object.
167 */
168 public static <C extends Compound> String toString(Sequence<C> sequence) {
169 return toStringBuilder(sequence).toString();
170 }
171
172 /**
173 * For the given {@link Sequence} this will return a {@link List} filled with
174 * the Compounds of that {@link Sequence}.
175 */
176 public static <C extends Compound> List<C> toList(Sequence<C> sequence) {
177 List<C> list = new ArrayList<C>(sequence.getLength());
178 for (C compound : sequence) {
179 list.add(compound);
180 }
181 return list;
182 }
183
184 /**
185 * Performs a linear search of the given Sequence for the given compound.
186 * Once we find the compound we return the position.
187 */
188 public static <C extends Compound> int indexOf(Sequence<C> sequence,
189 C compound) {
190 int index = 1;
191 for (C currentCompound : sequence) {
192 if (currentCompound.equals(compound)) {
193 return index;
194 }
195 index++;
196 }
197 return 0;
198 }
199
200 /**
201 * Performs a reversed linear search of the given Sequence by wrapping
202 * it in a {@link ReversedSequenceView} and passing it into
203 * {@link #indexOf(Sequence, Compound)}. We then inverse the index coming
204 * out of it.
205 */
206 public static <C extends Compound> int lastIndexOf(Sequence<C> sequence,
207 C compound) {
208 int index = indexOf(new ReversedSequenceView<C>(sequence), compound);
209 return (sequence.getLength() - index)+1;
210 }
211
212 /**
213 * Creates a simple sequence iterator which moves through a sequence going
214 * from 1 to the length of the Sequence. Modification of the Sequence is not
215 * allowed.
216 */
217 public static <C extends Compound> Iterator<C> createIterator(
218 Sequence<C> sequence) {
219 return new SequenceIterator<C>(sequence);
220 }
221
222 /**
223 * Creates a simple sub sequence view delimited by the given start and end.
224 */
225 public static <C extends Compound> SequenceView<C> createSubSequence(
226 Sequence<C> sequence, int start, int end) {
227 return new SequenceProxyView<C>(sequence, start, end);
228 }
229
230 /**
231 * Implements sequence shuffling by first materializing the given
232 * {@link Sequence} into a {@link List}, applying
233 * {@link Collections#shuffle(List)} and then returning the shuffled
234 * elements in a new instance of {@link SequenceBackingStore} which behaves
235 * as a {@link Sequence}.
236 */
237 public static <C extends Compound> Sequence<C> shuffle(Sequence<C> sequence) {
238 List<C> compounds = sequence.getAsList();
239 Collections.shuffle(compounds);
240 return new ArrayListSequenceReader<C>(compounds,
241 sequence.getCompoundSet());
242 }
243
244 /**
245 * Performs a simple CRC64 checksum on any given sequence.
246 */
247 public static <C extends Compound> String checksum(Sequence<C> sequence) {
248 CRC64Checksum checksum = new CRC64Checksum();
249 for (C compound : sequence) {
250 checksum.update(compound.getShortName());
251 }
252 return checksum.toString();
253 }
254
255 /**
256 * Produces kmers of the specified size e.g. ATGTGA returns two views which
257 * have ATG TGA
258 *
259 * @param <C> Compound to use
260 * @param sequence Sequence to build from
261 * @param kmer Kmer size
262 * @return The list of non-overlapping K-mers
263 */
264 public static <C extends Compound> List<SequenceView<C>> nonOverlappingKmers(Sequence<C> sequence, int kmer) {
265 List<SequenceView<C>> l = new ArrayList<SequenceView<C>>();
266 WindowedSequence<C> w = new WindowedSequence<C>(sequence, kmer);
267 for(SequenceView<C> view: w) {
268 l.add(view);
269 }
270 return l;
271 }
272
273 /**
274 * Used to generate overlapping k-mers such i.e. ATGTA will give rise to
275 * ATG, TGT & GTA
276 *
277 * @param <C> Compound to use
278 * @param sequence Sequence to build from
279 * @param kmer Kmer size
280 * @return The list of overlapping K-mers
281 */
282 public static <C extends Compound> List<SequenceView<C>> overlappingKmers(Sequence<C> sequence, int kmer) {
283 List<SequenceView<C>> l = new ArrayList<SequenceView<C>>();
284 List<Iterator<SequenceView<C>>> windows
285 = new ArrayList<Iterator<SequenceView<C>>>();
286
287 for(int i=1; i<=kmer; i++) {
288 if(i == 1) {
289 windows.add(new WindowedSequence<C>(sequence, kmer).iterator());
290 }
291 else {
292 SequenceView<C> sv = sequence.getSubSequence(i, sequence.getLength());
293 windows.add(new WindowedSequence<C>(sv, kmer).iterator());
294 }
295 }
296
297 OUTER: while(true) {
298 for(int i=0; i<kmer; i++) {
299 Iterator<SequenceView<C>> iterator = windows.get(i);
300 boolean breakLoop=true;
301 if(iterator.hasNext()) {
302 l.add(iterator.next());
303 breakLoop = false;
304 }
305 if(breakLoop) {
306 break OUTER;
307 }
308 }
309 }
310 return l;
311 }
312
313 /**
314 * A method which attempts to do the right thing when is comes to a
315 * reverse/reverse complement
316 *
317 * @param <C> The type of compound
318 * @param sequence The input sequence
319 * @return The inverted sequence which is optionally complemented
320 */
321 @SuppressWarnings("unchecked")
322 public static <C extends Compound> SequenceView<C> inverse(Sequence<C> sequence) {
323 SequenceView<C> reverse = new ReversedSequenceView<C>(sequence);
324 if(sequence.getCompoundSet().isComplementable()) {
325 return new ComplementSequenceView(reverse);
326 }
327 return reverse;
328 }
329
330 /**
331 * A basic sequence iterator which iterates over the given Sequence by
332 * biological index. This assumes your sequence supports random access
333 * and performs well when doing these operations.
334 *
335 * @author ayates
336 *
337 * @param <C> Type of compound to return
338 */
339 public static class SequenceIterator<C extends Compound>
340 implements Iterator<C> {
341
342 private final Sequence<C> sequence;
343 private final int length;
344 private int currentPosition = 0;
345
346 public SequenceIterator(Sequence<C> sequence) {
347 this.sequence = sequence;
348 this.length = sequence.getLength();
349 }
350
351
352 public boolean hasNext() {
353 return (currentPosition < length);
354 }
355
356
357 public C next() {
358 if(!hasNext()) {
359 throw new NoSuchElementException("Exhausted sequence of elements");
360 }
361 return sequence.getCompoundAt(++currentPosition);
362 }
363
364 public void remove() {
365 throw new UnsupportedOperationException("Cannot remove() on a SequenceIterator");
366 }
367 }
368 }