001    package org.biojava3.core.sequence.edits;
002    
003    import java.util.ArrayList;
004    import java.util.List;
005    
006    import org.biojava3.core.sequence.BasicSequence;
007    import org.biojava3.core.sequence.storage.JoiningSequenceReader;
008    import org.biojava3.core.sequence.template.Compound;
009    import org.biojava3.core.sequence.template.Sequence;
010    
011    /**
012     * Interface for carrying out edit operations on a Sequence. The 3 major
013     * methods of Editing are supported
014     *
015     * <ul>
016     * <li>Insertion</li>
017     * <li>Deletion</li>
018     * <li>Substitution</li>
019     * </ul>
020     *
021     * The interface is provided so end users can use either our implementations,
022     * which attempt to create views of Sequences in an edited form rather than
023     * fully-realised edited Sequences, or their own.
024     *
025     * @author ayates
026     * @param <C> The type of compound to edit
027     */
028    public interface Edit<C extends Compound> {
029    
/**
 * Applies this edit to the given Sequence and returns the result.
 *
 * @param sequence the Sequence to be edited
 * @return the Sequence produced by the edit
 */
030        Sequence<C> edit(Sequence<C> sequence);
031    
032        /**
033         * Abstract class which defines all edit operations as a call to discover
034         * what 5' and 3' ends of an editing Sequence should be joined together
035         * with a target Sequence. These ends can be of 0 length but conceptionally
036         * they can still exist.
037         */
038        public static abstract class AbstractEdit<C extends Compound> implements Edit<C> {
039    
040            /**
041             * Should return the 5-prime end of the given Sequence according to
042             * the edit. An empty Sequence is valid.
043             */
044            protected abstract Sequence<C> getFivePrime(Sequence<C> editingSequence);
045    
046            /**
047             * Should return the 3-prime end of the given Sequence according to
048             * the edit. An empty Sequence is valid.
049             */
050            protected abstract Sequence<C> getThreePrime(Sequence<C> editingSequence);
051    
052          
053            public Sequence<C> edit(Sequence<C> editingSequence) {
054                Sequence<C> targetSequence = getTargetSequence(editingSequence);
055                List<Sequence<C>> sequences = new ArrayList<Sequence<C>>();
056    
057                sequences.add(getFivePrime(editingSequence));
058                sequences.add(targetSequence);
059                sequences.add(getThreePrime(editingSequence));
060    
061                return new JoiningSequenceReader<C>(sequences);
062            }
063            private int start = -1;
064            private int end = -1;
065            private String stringSequence;
066            private Sequence<C> sequence;
067    
068            public AbstractEdit(int start) {
069                this.start = start;
070            }
071    
072            public AbstractEdit(int start, int end) {
073                this.start = start;
074                this.end = end;
075            }
076    
077            protected void setStringSequence(String stringSequence) {
078                this.stringSequence = stringSequence;
079            }
080    
081            protected void setSequence(Sequence<C> sequence) {
082                this.sequence = sequence;
083            }
084    
085            /**
086             * Returns the Sequence which is our edit.
087             *
088             * @param editingSequence Asked for in-case we need to do String to
089             * Sequence conversion so we need a CompoundSet which is given
090             * by the Sequence we are editing
091             * @return The Sequence<C> object we wish to insert
092             */
093            public Sequence<C> getTargetSequence(Sequence<C> editingSequence) {
094                if (sequence == null && stringSequence != null) {
095                    sequence = new BasicSequence<C>(
096                            stringSequence, editingSequence.getCompoundSet());
097                }
098                return sequence;
099            }
100    
101            /**
102             * Returns an empty sequence with the given compound set of the editing
103             * sequence
104             */
105            protected Sequence<C> getEmptySequence(Sequence<C> editingSequence) {
106                return new BasicSequence<C>("", editingSequence.getCompoundSet());
107            }
108    
109            public int getStart() {
110                return start;
111            }
112    
113            public int getEnd() {
114                return end;
115            }
116        }
117    
118        /**
119         * Implementation which allows for the deletion of bases from a Sequence
120         */
121        public static class Delete<C extends Compound> extends AbstractEdit<C> {
122    
123            public Delete(int position) {
124                this(position, position);
125            }
126    
127            public Delete(int start, int end) {
128                super(start, end);
129                setStringSequence("");
130            }
131    
132            protected int getRealStart() {
133                return getStart() - 1;
134            }
135    
136            protected int getRealEnd() {
137                return getEnd() + 1;
138            }
139    
140            @Override
141            protected Sequence<C> getFivePrime(Sequence<C> editingSequence) {
142                int start = getRealStart();
143                if (start == 0) {
144                    return getEmptySequence(editingSequence);
145                }
146                return editingSequence.getSubSequence(1, start);
147            }
148    
149            @Override
150            protected Sequence<C> getThreePrime(Sequence<C> editingSequence) {
151                int end = getRealEnd();
152                if (end > editingSequence.getLength()) {
153                    return getEmptySequence(editingSequence);
154                }
155                return editingSequence.getSubSequence(end, editingSequence.getLength());
156            }
157        }
158    
159        /**
160         * Edit implementation which allows us to insert a base at any position
161         * in a Sequence. Specifying 1 base is used to insert at the start and
162         * end of a Sequence. If you wish to carry out an in-sequence insertion
163         * then you specify the flanking base positions e.g.
164         *
165         * <pre>
166         *   ACTG insert TT @ position 1   : TTACGT
167         *   ACTG insert TT @ position 2,3 : ACTTGT
168         *   ACTG insert A  @ position 4   : ACGTA
169         * </pre>
170         *
171         * The code will raise exceptions if you attempt a single base edit
172         * with an insertion.
173         */
174        public static class Insert<C extends Compound> extends AbstractEdit<C> {
175    
176            private final boolean singlePosition;
177    
178            public Insert(String sequence, int position) {
179                super(position, position);
180                this.singlePosition = true;
181                setStringSequence(sequence);
182            }
183    
184            public Insert(Sequence<C> sequence, int position) {
185                super(position, position);
186                this.singlePosition = true;
187                setSequence(sequence);
188            }
189    
190            public Insert(String sequence, int start, int stop) {
191                super(start, stop);
192                this.singlePosition = false;
193                setStringSequence(sequence);
194            }
195    
196            public Insert(Sequence<C> sequence, int start, int stop) {
197                super(start, stop);
198                this.singlePosition = false;
199                setSequence(sequence);
200            }
201    
202            @Override
203            protected Sequence<C> getFivePrime(Sequence<C> editingSequence) {
204                if (singlePosition) {
205                    if (getStart() == 1) {
206                        return getEmptySequence(editingSequence);
207                    } else if (getEnd() == editingSequence.getLength()) {
208                        return editingSequence;
209                    } else {
210                        throw new IllegalStateException("Given one position to "
211                                + "insert at but this is not the start or end "
212                                + "of the Sequence; cannot support this");
213                    }
214                }
215                return editingSequence.getSubSequence(1, getStart());
216            }
217    
218            @Override
219            protected Sequence<C> getThreePrime(Sequence<C> editingSequence) {
220                if (singlePosition) {
221                    if (getStart() == 1) {
222                        return editingSequence;
223                    } else if (getEnd() == editingSequence.getLength()) {
224                        return getEmptySequence(editingSequence);
225                    } else {
226                        throw new IllegalStateException("Given one position to "
227                                + "insert at but this is not the start or end "
228                                + "of the Sequence; cannot support this");
229                    }
230                }
231                return editingSequence.getSubSequence(getEnd(), editingSequence.getLength());
232            }
233        }
234    
235        /**
236         * Allows for the substitution of bases into an existing Sequence. This
237         * allows us to do edits like:
238         *
239         * <pre>
240         *    Sub TT @ position 2
241         *    AAAA -> ATTA
242         * </pre>
243         *
244         * We do not support
245         *
246         * Edits do not require the length of the insertion but do rely on the
247         * presence of a CompoundSet to parse a String (if given) which means
248         * the eventual length of a Sequence is a lazy operation.
249         */
250        public static class Substitute<C extends Compound> extends AbstractEdit<C> {
251    
252            public Substitute(String sequence, int position) {
253                super(position);
254                setStringSequence(sequence);
255            }
256    
257            public Substitute(Sequence<C> sequence, int position) {
258                super(position);
259                setSequence(sequence);
260            }
261    
262            /**
263             * Must use this rather than the no-args getEnd as this can return
264             * -1 and the length of a sub is dependent on the length of the
265             * Sequence; we cannot assume 1:1 mapping between characters in a
266             * String and the number of compounds we will have to insert.
267             */
268            public int getEnd(Sequence<C> sequence) {
269                if (getEnd() == -1) {
270                    int start = getStart();
271                    int length = getTargetSequence(sequence).getLength();
272                    return (start + length) - 1;
273                }
274                return getEnd();
275            }
276    
277            @Override
278            protected Sequence<C> getFivePrime(Sequence<C> editingSequence) {
279                int start = getStart();
280                if (start == 1) {
281                    return getEmptySequence(editingSequence);
282                }
283                return editingSequence.getSubSequence(1, start - 1);
284            }
285    
286            @Override
287            protected Sequence<C> getThreePrime(Sequence<C> editingSequence) {
288                int end = getEnd(editingSequence);
289                if (end > editingSequence.getLength()) {
290                    throw new IndexOutOfBoundsException(end +
291                            " is greater than the max index of " +
292                            "the editing sequence (" +
293                            editingSequence.getLength());
294                } else if (end == editingSequence.getLength()) {
295                    return getEmptySequence(editingSequence);
296                }
297                return editingSequence.getSubSequence(end + 1, editingSequence.getLength());
298            }
299        }
300    }