001 /*
002 * BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence. This should
006 * be distributed with the code. If you do not have a copy,
007 * see:
008 *
009 * http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors. These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 * http://www.biojava.org/
019 *
020 * Created on DATE
021 *
022 */
023
024 package org.biojava3.core.sequence;
025
026 import java.util.ArrayList;
027 import java.util.Collections;
028 import java.util.List;
029
030 import org.biojava3.core.sequence.io.util.IOUtils;
031 import org.biojava3.core.sequence.template.Compound;
032 import org.biojava3.core.sequence.template.CompoundSet;
033 import org.biojava3.core.sequence.template.LightweightProfile;
034 import org.biojava3.core.sequence.template.Sequence;
035
036 /**
037 * Implements a minimal data structure for reading and writing a sequence alignment. The full {@code Profile} data
038 * structure in the alignment module provides additional functionality.
039 *
040 * @author Scooter Willis
041 * @author Mark Chapman
042 */
043 public class MultipleSequenceAlignment<S extends Sequence<C>, C extends Compound> implements LightweightProfile<S, C> {
044
045 private List<S> sequences = new ArrayList<S>();
046 private Integer length = null;
047
048 /**
049 * A sequence that has been aligned to other sequences will have inserts.
050 * @param sequence
051 */
052 public void addAlignedSequence(S sequence){
053 if(length == null){
054 length = sequence.getLength();
055 }
056 if(sequence.getLength() != length){
057 throw new IllegalArgumentException(sequence.getAccession() + " length = " + sequence.getLength() +
058 " not equal to MSA length = " + length);
059 }
060 sequences.add(sequence);
061 }
062
063 /**
064 * Remove a sequence
065 * @param sequence
066 * @return
067 */
068 public boolean removeAlignedSequence(S sequence){
069 return sequences.remove(sequence);
070 }
071 //methods for LightweightProfile
072
073 /**
074 * Uses bioIndex starting at 1 instead of 0
075 * @param listIndex
076 * @return
077 */
078
079
080 @Override
081 public S getAlignedSequence(int listIndex) {
082 return sequences.get(listIndex - 1);
083 }
084
085 /**
086 * Get the list of sequences
087 * @return
088 */
089 @Override
090 public List<S> getAlignedSequences() {
091 return Collections.unmodifiableList(sequences);
092 }
093
094 /**
095 * Get a list of compounds at a sequence position
096 * @param alignmentIndex
097 * @return
098 */
099 @Override
100 public List<C> getCompoundsAt(int alignmentIndex) {
101 List<C> column = new ArrayList<C>();
102 for (S s : sequences) {
103 column.add(s.getCompoundAt(alignmentIndex));
104 }
105 return Collections.unmodifiableList(column);
106 }
107
108 /**
109 * Get the Compounds defined in the first sequence
110 * @return
111 */
112 @Override
113 public CompoundSet<C> getCompoundSet() {
114 return sequences.get(0).getCompoundSet();
115 }
116
117 /**
118 * Get the length of the MSA where it is assumed that
119 * all sequence position
120 * @return
121 */
122 @Override
123 public int getLength() {
124 return length;
125 }
126
127 /**
128 * Get the number of sequences in the MSA
129 * @return
130 */
131 @Override
132 public int getSize() {
133 return sequences.size();
134 }
135
136 /**
137 * Get a string representation of the MSA with a fixed width
138 * @param width
139 * @return
140 */
141 @Override
142 public String toString(int width) {
143 return toString(width, null, IOUtils.getIDFormat(sequences), true, true, true, false);
144 }
145
146 /**
147 * Support for different MSA formats
148 * @param format
149 * @return
150 */
151 @Override
152 public String toString(StringFormat format) {
153 switch (format) {
154 case ALN:
155 case CLUSTALW:
156 default:
157 return toString(60, String.format("CLUSTAL W MSA from BioJava%n%n"), IOUtils.getIDFormat(sequences) +
158 " ", true, false, true, false);
159 case FASTA:
160 return toString(60, null, ">%s%n", false, false, false, false);
161 case GCG:
162 case MSF:
163 return toString(50, IOUtils.getGCGHeader(sequences), IOUtils.getIDFormat(sequences), true, false, false,
164 false);
165 case PDBWEB:
166 return toString(60, null, "%s", true, false, true, true);
167 }
168 }
169
170 /**
171 * String representation of the MSA
172 * @return
173 */
174
175 @Override
176 public String toString() {
177 return toString(getLength(), null, null, false, false, false, false);
178 }
179
180 // helper methods
181
182 /**
183 * Helper method that does all the formating work
184 * @param width
185 * @param header
186 * @param idFormat
187 * @param interlaced
188 * @param aligIndices
189 * @param aligConservation
190 * @param webDisplay
191 * @return
192 */
193 // creates formatted String
194 private String toString(int width, String header, String idFormat, boolean interlaced, boolean aligIndices,
195 boolean aligConservation, boolean webDisplay) {
196
197 // TODO handle circular alignments
198 StringBuilder s = (header == null) ? new StringBuilder() : new StringBuilder(header);
199
200 if (webDisplay && sequences.size() == 2) {
201 s.append("<div><pre>");
202 }
203
204 width = Math.max(1, width);
205 if (interlaced) {
206 String aligIndFormat = "%-" + Math.max(1, width / 2) + "d %" + Math.max(1, width - (width / 2) - 1) +
207 "d%n";
208 for (int i = 0; i < getLength(); i += width) {
209 int start = i + 1, end = Math.min(getLength(), i + width);
210 if (i > 0) {
211 s.append(String.format("%n"));
212 }
213 if (aligIndices) {
214 if (end < i + width) {
215 int line = end - start + 1;
216 aligIndFormat = "%-" + Math.max(1, line / 2) + "d %" + Math.max(1, line - (line / 2) - 1) +
217 "d%n";
218 }
219 if (idFormat != null) {
220 s.append(String.format(idFormat, ""));
221 }
222 s.append(String.format(aligIndFormat, start, end));
223 }
224 int counter = 0;
225 for (S as : sequences) {
226 counter++;
227 if (webDisplay && sequences.size() == 2) {
228 printSequenceAlignmentWeb(s, counter, idFormat, start, end);
229 } else {
230 if (idFormat != null) {
231 s.append(String.format(idFormat, as.getAccession()));
232 }
233 s.append(as.getSubSequence(start, end).getSequenceAsString());
234 s.append(String.format("%n"));
235 }
236 if (aligConservation && sequences.size() == 2 && counter == 1) {
237 printConservation(s, idFormat, start, end, webDisplay);
238 }
239 }
240 }
241 } else {
242 for (S as : sequences) {
243 if (idFormat != null) {
244 s.append(String.format(idFormat, as.getAccession()));
245 }
246 for (int i = 0; i < getLength(); i += width) {
247 int start = i + 1, end = Math.min(getLength(), i + width);
248 s.append(as.getSubSequence(start, end).getSequenceAsString());
249 s.append(String.format("%n"));
250 }
251 }
252 }
253
254 if (webDisplay && aligConservation && sequences.size() == 2) {
255 s.append(IOUtils.getPDBLegend());
256 }
257 return s.toString();
258 }
259
260 /**
261 *
262 * @param s
263 * @param counter
264 * @param idFormat
265 * @param start
266 * @param end
267 */
268 private void printSequenceAlignmentWeb(StringBuilder s, int counter, String idFormat, int start, int end) {
269 S as = sequences.get(counter - 1), seq1 = sequences.get(0), seq2 = sequences.get(1);
270
271 if (idFormat != null) {
272 s.append(String.format(idFormat, as.getAccession()));
273 }
274
275 String mySeq = as.getSubSequence(start, end).getSequenceAsString();
276 String s1 = seq1.getSubSequence(start, end).getSequenceAsString();
277 String s2 = seq2.getSubSequence(start, end).getSequenceAsString();
278 CompoundSet<C> cs = getCompoundSet();
279
280 for (int i = 0; i < s1.length(); i++) {
281 if (i >= s2.length() || i >= mySeq.length())
282 break;
283 char c1 = s1.charAt(i);
284 char c2 = s2.charAt(i);
285 char c = mySeq.charAt(i);
286 s.append(IOUtils.getPDBCharacter(true, c1, c2, cs.compoundsEquivalent(seq1.getCompoundAt(i),
287 seq2.getCompoundAt(i)), c));
288 }
289
290 s.append(String.format("%n"));
291 }
292
293 /**
294 *
295 * @param s
296 * @param idFormat
297 * @param start
298 * @param end
299 * @param webDisplay
300 */
301 private void printConservation(StringBuilder s, String idFormat, int start, int end, boolean webDisplay) {
302 S seq1 = sequences.get(0), seq2 = sequences.get(1);
303
304 if (idFormat != null) {
305 AccessionID ac1 = sequences.get(0).getAccession();
306 String id1 = (ac1 == null) ? "null" : ac1.getID();
307 id1 = id1.replaceAll(".", " ");
308 s.append(String.format(idFormat, id1));
309 }
310
311 String s1 = seq1.getSubSequence(start, end).getSequenceAsString();
312 String s2 = seq2.getSubSequence(start, end).getSequenceAsString();
313 CompoundSet<C> cs = getCompoundSet();
314
315 for (int i = 0; i < s1.length(); i++) {
316 if (i >= s2.length())
317 break;
318 char c1 = s1.charAt(i);
319 char c2 = s2.charAt(i);
320 s.append(IOUtils.getPDBConservation(webDisplay, c1, c2, cs.compoundsEquivalent(seq1.getCompoundAt(i),
321 seq2.getCompoundAt(i))));
322 }
323
324 s.append(String.format("%n"));
325 }
326
327 }