001 package org.biojava3.core.sequence.edits;
002
003 import java.util.ArrayList;
004 import java.util.List;
005
006 import org.biojava3.core.sequence.BasicSequence;
007 import org.biojava3.core.sequence.storage.JoiningSequenceReader;
008 import org.biojava3.core.sequence.template.Compound;
009 import org.biojava3.core.sequence.template.Sequence;
010
/**
 * Interface for carrying out edit operations on a Sequence. The three major
 * kinds of edit are supported:
 *
 * <ul>
 * <li>Insertion</li>
 * <li>Deletion</li>
 * <li>Substitution</li>
 * </ul>
 *
 * The interface is provided so end users can either use our implementations,
 * which attempt to create views of Sequences in an edited form rather than
 * fully-realised edited Sequences, or supply their own.
 *
 * @author ayates
 * @param <C> The type of compound to edit
 */
028 public interface Edit<C extends Compound> {
029
/**
 * Applies this edit to the given Sequence and returns the result.
 *
 * @param sequence The Sequence to edit
 * @return The edited form of the given Sequence
 */
030 Sequence<C> edit(Sequence<C> sequence);
031
032 /**
033 * Abstract class which defines all edit operations as a call to discover
034 * what 5' and 3' ends of an editing Sequence should be joined together
035 * with a target Sequence. These ends can be of 0 length but conceptionally
036 * they can still exist.
037 */
038 public static abstract class AbstractEdit<C extends Compound> implements Edit<C> {
039
040 /**
041 * Should return the 5-prime end of the given Sequence according to
042 * the edit. An empty Sequence is valid.
043 */
044 protected abstract Sequence<C> getFivePrime(Sequence<C> editingSequence);
045
046 /**
047 * Should return the 3-prime end of the given Sequence according to
048 * the edit. An empty Sequence is valid.
049 */
050 protected abstract Sequence<C> getThreePrime(Sequence<C> editingSequence);
051
052
053 public Sequence<C> edit(Sequence<C> editingSequence) {
054 Sequence<C> targetSequence = getTargetSequence(editingSequence);
055 List<Sequence<C>> sequences = new ArrayList<Sequence<C>>();
056
057 sequences.add(getFivePrime(editingSequence));
058 sequences.add(targetSequence);
059 sequences.add(getThreePrime(editingSequence));
060
061 return new JoiningSequenceReader<C>(sequences);
062 }
063 private int start = -1;
064 private int end = -1;
065 private String stringSequence;
066 private Sequence<C> sequence;
067
068 public AbstractEdit(int start) {
069 this.start = start;
070 }
071
072 public AbstractEdit(int start, int end) {
073 this.start = start;
074 this.end = end;
075 }
076
077 protected void setStringSequence(String stringSequence) {
078 this.stringSequence = stringSequence;
079 }
080
081 protected void setSequence(Sequence<C> sequence) {
082 this.sequence = sequence;
083 }
084
085 /**
086 * Returns the Sequence which is our edit.
087 *
088 * @param editingSequence Asked for in-case we need to do String to
089 * Sequence conversion so we need a CompoundSet which is given
090 * by the Sequence we are editing
091 * @return The Sequence<C> object we wish to insert
092 */
093 public Sequence<C> getTargetSequence(Sequence<C> editingSequence) {
094 if (sequence == null && stringSequence != null) {
095 sequence = new BasicSequence<C>(
096 stringSequence, editingSequence.getCompoundSet());
097 }
098 return sequence;
099 }
100
101 /**
102 * Returns an empty sequence with the given compound set of the editing
103 * sequence
104 */
105 protected Sequence<C> getEmptySequence(Sequence<C> editingSequence) {
106 return new BasicSequence<C>("", editingSequence.getCompoundSet());
107 }
108
109 public int getStart() {
110 return start;
111 }
112
113 public int getEnd() {
114 return end;
115 }
116 }
117
118 /**
119 * Implementation which allows for the deletion of bases from a Sequence
120 */
121 public static class Delete<C extends Compound> extends AbstractEdit<C> {
122
123 public Delete(int position) {
124 this(position, position);
125 }
126
127 public Delete(int start, int end) {
128 super(start, end);
129 setStringSequence("");
130 }
131
132 protected int getRealStart() {
133 return getStart() - 1;
134 }
135
136 protected int getRealEnd() {
137 return getEnd() + 1;
138 }
139
140 @Override
141 protected Sequence<C> getFivePrime(Sequence<C> editingSequence) {
142 int start = getRealStart();
143 if (start == 0) {
144 return getEmptySequence(editingSequence);
145 }
146 return editingSequence.getSubSequence(1, start);
147 }
148
149 @Override
150 protected Sequence<C> getThreePrime(Sequence<C> editingSequence) {
151 int end = getRealEnd();
152 if (end > editingSequence.getLength()) {
153 return getEmptySequence(editingSequence);
154 }
155 return editingSequence.getSubSequence(end, editingSequence.getLength());
156 }
157 }
158
159 /**
160 * Edit implementation which allows us to insert a base at any position
161 * in a Sequence. Specifying 1 base is used to insert at the start and
162 * end of a Sequence. If you wish to carry out an in-sequence insertion
163 * then you specify the flanking base positions e.g.
164 *
165 * <pre>
166 * ACTG insert TT @ position 1 : TTACGT
167 * ACTG insert TT @ position 2,3 : ACTTGT
168 * ACTG insert A @ position 4 : ACGTA
169 * </pre>
170 *
171 * The code will raise exceptions if you attempt a single base edit
172 * with an insertion.
173 */
174 public static class Insert<C extends Compound> extends AbstractEdit<C> {
175
176 private final boolean singlePosition;
177
178 public Insert(String sequence, int position) {
179 super(position, position);
180 this.singlePosition = true;
181 setStringSequence(sequence);
182 }
183
184 public Insert(Sequence<C> sequence, int position) {
185 super(position, position);
186 this.singlePosition = true;
187 setSequence(sequence);
188 }
189
190 public Insert(String sequence, int start, int stop) {
191 super(start, stop);
192 this.singlePosition = false;
193 setStringSequence(sequence);
194 }
195
196 public Insert(Sequence<C> sequence, int start, int stop) {
197 super(start, stop);
198 this.singlePosition = false;
199 setSequence(sequence);
200 }
201
202 @Override
203 protected Sequence<C> getFivePrime(Sequence<C> editingSequence) {
204 if (singlePosition) {
205 if (getStart() == 1) {
206 return getEmptySequence(editingSequence);
207 } else if (getEnd() == editingSequence.getLength()) {
208 return editingSequence;
209 } else {
210 throw new IllegalStateException("Given one position to "
211 + "insert at but this is not the start or end "
212 + "of the Sequence; cannot support this");
213 }
214 }
215 return editingSequence.getSubSequence(1, getStart());
216 }
217
218 @Override
219 protected Sequence<C> getThreePrime(Sequence<C> editingSequence) {
220 if (singlePosition) {
221 if (getStart() == 1) {
222 return editingSequence;
223 } else if (getEnd() == editingSequence.getLength()) {
224 return getEmptySequence(editingSequence);
225 } else {
226 throw new IllegalStateException("Given one position to "
227 + "insert at but this is not the start or end "
228 + "of the Sequence; cannot support this");
229 }
230 }
231 return editingSequence.getSubSequence(getEnd(), editingSequence.getLength());
232 }
233 }
234
235 /**
236 * Allows for the substitution of bases into an existing Sequence. This
237 * allows us to do edits like:
238 *
239 * <pre>
240 * Sub TT @ position 2
241 * AAAA -> ATTA
242 * </pre>
243 *
244 * We do not support
245 *
246 * Edits do not require the length of the insertion but do rely on the
247 * presence of a CompoundSet to parse a String (if given) which means
248 * the eventual length of a Sequence is a lazy operation.
249 */
250 public static class Substitute<C extends Compound> extends AbstractEdit<C> {
251
252 public Substitute(String sequence, int position) {
253 super(position);
254 setStringSequence(sequence);
255 }
256
257 public Substitute(Sequence<C> sequence, int position) {
258 super(position);
259 setSequence(sequence);
260 }
261
262 /**
263 * Must use this rather than the no-args getEnd as this can return
264 * -1 and the length of a sub is dependent on the length of the
265 * Sequence; we cannot assume 1:1 mapping between characters in a
266 * String and the number of compounds we will have to insert.
267 */
268 public int getEnd(Sequence<C> sequence) {
269 if (getEnd() == -1) {
270 int start = getStart();
271 int length = getTargetSequence(sequence).getLength();
272 return (start + length) - 1;
273 }
274 return getEnd();
275 }
276
277 @Override
278 protected Sequence<C> getFivePrime(Sequence<C> editingSequence) {
279 int start = getStart();
280 if (start == 1) {
281 return getEmptySequence(editingSequence);
282 }
283 return editingSequence.getSubSequence(1, start - 1);
284 }
285
286 @Override
287 protected Sequence<C> getThreePrime(Sequence<C> editingSequence) {
288 int end = getEnd(editingSequence);
289 if (end > editingSequence.getLength()) {
290 throw new IndexOutOfBoundsException(end +
291 " is greater than the max index of " +
292 "the editing sequence (" +
293 editingSequence.getLength());
294 } else if (end == editingSequence.getLength()) {
295 return getEmptySequence(editingSequence);
296 }
297 return editingSequence.getSubSequence(end + 1, editingSequence.getLength());
298 }
299 }
300 }