001    /*
002     *                    BioJava development code
003     *
004     * This code may be freely distributed and modified under the
005     * terms of the GNU Lesser General Public Licence.  This should
006     * be distributed with the code.  If you do not have a copy,
007     * see:
008     *
009     *      http://www.gnu.org/copyleft/lesser.html
010     *
011     * Copyright for this code is held jointly by the individual
012     * authors.  These should be listed in @author doc comments.
013     *
014     * For more information on the BioJava project and its aims,
015     * or to join the biojava-l mailing list, visit the home page
016     * at:
017     *
018     *      http://www.biojava.org/
019     *
020     * Created on DATE
021     *
022     */
023    
024    package org.biojava3.core.sequence;
025    
026    import java.util.ArrayList;
027    import java.util.Collections;
028    import java.util.List;
029    
030    import org.biojava3.core.sequence.io.util.IOUtils;
031    import org.biojava3.core.sequence.template.Compound;
032    import org.biojava3.core.sequence.template.CompoundSet;
033    import org.biojava3.core.sequence.template.LightweightProfile;
034    import org.biojava3.core.sequence.template.Sequence;
035    
036    /**
037     * Implements a minimal data structure for reading and writing a sequence alignment.  The full {@code Profile} data
038     * structure in the alignment module provides additional functionality.
039     *
040     * @author Scooter Willis
041     * @author Mark Chapman
042     */
043    public class MultipleSequenceAlignment<S extends Sequence<C>, C extends Compound> implements LightweightProfile<S, C> {
044    
045        private List<S> sequences = new ArrayList<S>();
046        private Integer length = null;
047    
048        /**
049         * A sequence that has been aligned to other sequences will have inserts. 
050         * @param sequence
051         */
052        public void addAlignedSequence(S sequence){
053            if(length == null){
054                length = sequence.getLength();
055            }
056            if(sequence.getLength() != length){
057                throw new IllegalArgumentException(sequence.getAccession() + " length = " + sequence.getLength() +
058                        " not equal to MSA length = " + length);
059            }
060            sequences.add(sequence);
061        }
062    
063        /**
064         * Remove a sequence
065         * @param sequence
066         * @return
067         */
068        public boolean removeAlignedSequence(S sequence){
069            return sequences.remove(sequence);
070        }
071    //methods for LightweightProfile
072    
073        /**
074         * Uses bioIndex starting at 1 instead of 0
075         * @param listIndex
076         * @return
077         */
078         
079    
080        @Override
081        public S getAlignedSequence(int listIndex) {
082            return sequences.get(listIndex - 1);
083        }
084    
085        /**
086         * Get the list of sequences
087         * @return
088         */
089        @Override
090        public List<S> getAlignedSequences() {
091            return Collections.unmodifiableList(sequences);
092        }
093    
094        /**
095         * Get a list of compounds at a sequence position
096         * @param alignmentIndex
097         * @return
098         */
099        @Override
100        public List<C> getCompoundsAt(int alignmentIndex) {
101            List<C> column = new ArrayList<C>();
102            for (S s : sequences) {
103                column.add(s.getCompoundAt(alignmentIndex));
104            }
105            return Collections.unmodifiableList(column);
106        }
107    
108        /**
109         * Get the Compounds defined in the first sequence
110         * @return
111         */
112        @Override
113        public CompoundSet<C> getCompoundSet() {
114            return sequences.get(0).getCompoundSet();
115        }
116    
117        /**
118         * Get the length of the MSA where it is assumed that
119         * all sequence position
120         * @return
121         */
122        @Override
123        public int getLength() {
124            return length;
125        }
126    
127        /**
128         * Get the number of sequences in the MSA
129         * @return
130         */
131        @Override
132        public int getSize() {
133            return sequences.size();
134        }
135    
136        /**
137         * Get a string representation of the MSA with a fixed width
138         * @param width
139         * @return
140         */
141        @Override
142        public String toString(int width) {
143            return toString(width, null, IOUtils.getIDFormat(sequences), true, true, true, false);
144        }
145    
146        /**
147         * Support for different MSA formats
148         * @param format
149         * @return
150         */
151        @Override
152        public String toString(StringFormat format) {
153            switch (format) {
154            case ALN:
155            case CLUSTALW:
156            default:
157                return toString(60, String.format("CLUSTAL W MSA from BioJava%n%n"), IOUtils.getIDFormat(sequences) +
158                        "   ", true, false, true, false);
159            case FASTA:
160                return toString(60, null, ">%s%n", false, false, false, false);
161            case GCG:
162            case MSF:
163                return toString(50, IOUtils.getGCGHeader(sequences), IOUtils.getIDFormat(sequences), true, false, false,
164                        false);
165            case PDBWEB:
166                return toString(60, null, "%s", true, false, true, true);
167            }
168        }
169    
170        /**
171         * String representation of the MSA
172         * @return
173         */
174    
175        @Override
176        public String toString() {
177            return toString(getLength(), null, null, false, false, false, false);
178        }
179    
180        // helper methods
181    
182        /**
183         * Helper method that does all the formating work
184         * @param width
185         * @param header
186         * @param idFormat
187         * @param interlaced
188         * @param aligIndices
189         * @param aligConservation
190         * @param webDisplay
191         * @return
192         */
193        // creates formatted String
194        private String toString(int width, String header, String idFormat, boolean interlaced, boolean aligIndices,
195                boolean aligConservation, boolean webDisplay) {
196    
197            // TODO handle circular alignments
198            StringBuilder s = (header == null) ? new StringBuilder() : new StringBuilder(header);
199    
200            if (webDisplay && sequences.size() == 2) {
201                s.append("<div><pre>");
202            }
203    
204            width = Math.max(1, width);
205            if (interlaced) {
206                String aligIndFormat = "%-" + Math.max(1, width / 2) + "d %" + Math.max(1, width - (width / 2) - 1) +
207                        "d%n";
208                for (int i = 0; i < getLength(); i += width) {
209                    int start = i + 1, end = Math.min(getLength(), i + width);
210                    if (i > 0) {
211                        s.append(String.format("%n"));
212                    }
213                    if (aligIndices) {
214                        if (end < i + width) {
215                            int line = end - start + 1;
216                            aligIndFormat = "%-" + Math.max(1, line / 2) + "d %" + Math.max(1, line - (line / 2) - 1) +
217                                    "d%n";
218                        }
219                        if (idFormat != null) {
220                            s.append(String.format(idFormat, ""));
221                        }
222                        s.append(String.format(aligIndFormat, start, end));
223                    }
224                    int counter = 0;
225                    for (S as : sequences) {
226                        counter++;
227                        if (webDisplay && sequences.size() == 2) {
228                            printSequenceAlignmentWeb(s, counter, idFormat, start, end);
229                        } else {
230                            if (idFormat != null) {
231                                s.append(String.format(idFormat, as.getAccession()));
232                            }
233                            s.append(as.getSubSequence(start, end).getSequenceAsString());
234                            s.append(String.format("%n"));
235                        }
236                        if (aligConservation && sequences.size() == 2 && counter == 1) {
237                            printConservation(s, idFormat, start, end, webDisplay);
238                        }
239                    }
240                }
241            } else {
242                for (S as : sequences) {
243                    if (idFormat != null) {
244                        s.append(String.format(idFormat, as.getAccession()));
245                    }
246                    for (int i = 0; i < getLength(); i += width) {
247                        int start = i + 1, end = Math.min(getLength(), i + width);
248                        s.append(as.getSubSequence(start, end).getSequenceAsString());
249                        s.append(String.format("%n"));
250                    }
251                }
252            }
253    
254            if (webDisplay && aligConservation && sequences.size() == 2) {
255                s.append(IOUtils.getPDBLegend());
256            }
257            return s.toString();
258        }
259    
260        /**
261         *
262         * @param s
263         * @param counter
264         * @param idFormat
265         * @param start
266         * @param end
267         */
268        private void printSequenceAlignmentWeb(StringBuilder s, int counter, String idFormat, int start, int end) {
269            S as = sequences.get(counter - 1), seq1 = sequences.get(0), seq2 = sequences.get(1);
270    
271            if (idFormat != null) {
272                s.append(String.format(idFormat, as.getAccession()));
273            }
274    
275            String mySeq = as.getSubSequence(start, end).getSequenceAsString();
276            String s1 = seq1.getSubSequence(start, end).getSequenceAsString();
277            String s2 = seq2.getSubSequence(start, end).getSequenceAsString();
278            CompoundSet<C> cs = getCompoundSet();
279    
280            for (int i = 0; i < s1.length(); i++) {
281                if (i >= s2.length() || i >= mySeq.length())
282                    break;
283                char c1 = s1.charAt(i);
284                char c2 = s2.charAt(i);
285                char c = mySeq.charAt(i);
286                s.append(IOUtils.getPDBCharacter(true, c1, c2, cs.compoundsEquivalent(seq1.getCompoundAt(i),
287                        seq2.getCompoundAt(i)), c));
288            }
289    
290            s.append(String.format("%n"));
291        }
292    
293        /**
294         *
295         * @param s
296         * @param idFormat
297         * @param start
298         * @param end
299         * @param webDisplay
300         */
301        private void printConservation(StringBuilder s, String idFormat, int start, int end, boolean webDisplay) {
302            S seq1 = sequences.get(0), seq2 = sequences.get(1);
303    
304            if (idFormat != null) {
305                AccessionID ac1 = sequences.get(0).getAccession();
306                String id1 = (ac1 == null) ? "null" : ac1.getID();
307                id1 = id1.replaceAll(".", " ");
308                s.append(String.format(idFormat, id1));
309            }
310    
311            String s1 = seq1.getSubSequence(start, end).getSequenceAsString();
312            String s2 = seq2.getSubSequence(start, end).getSequenceAsString();
313            CompoundSet<C> cs = getCompoundSet();
314    
315            for (int i = 0; i < s1.length(); i++) {
316                if (i >= s2.length())
317                    break;
318                char c1 = s1.charAt(i);
319                char c2 = s2.charAt(i);
320                s.append(IOUtils.getPDBConservation(webDisplay, c1, c2, cs.compoundsEquivalent(seq1.getCompoundAt(i),
321                        seq2.getCompoundAt(i))));
322            }
323    
324            s.append(String.format("%n"));
325        }
326    
327    }