001 /*
002 * BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence. This should
006 * be distributed with the code. If you do not have a copy,
007 * see:
008 *
009 * http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors. These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 * http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 *
022 * @author Richard Holland
023 * @author Scooter Willis
024 *
025 */
026 package org.biojava3.core.sequence.loader;
027
028 import java.io.File;
029 import java.io.RandomAccessFile;
030 import java.util.ArrayList;
031 import java.util.Iterator;
032 import java.util.List;
033
034 import org.biojava3.core.sequence.template.SequenceProxyView;
035 import org.biojava3.core.sequence.template.Compound;
036 import org.biojava3.core.exceptions.CompoundNotFoundError;
037 import org.biojava3.core.exceptions.FileAccessError;
038 import org.biojava3.core.sequence.AccessionID;
039 import org.biojava3.core.sequence.Strand;
040
041 import org.biojava3.core.sequence.io.template.SequenceParserInterface;
042 import org.biojava3.core.sequence.storage.SequenceAsStringHelper;
043 import org.biojava3.core.sequence.template.CompoundSet;
044 import org.biojava3.core.sequence.template.ProxySequenceReader;
045 import org.biojava3.core.sequence.template.SequenceMixin;
046 import org.biojava3.core.sequence.template.SequenceView;
047
048 /**
049 * This class represents the storage container of a sequence stored in a fasta file where
050 * the initial parsing of the file we store the offset and length of the sequence. When a call
051 * is made to any method that needs sequence data then the file will be opened and the sequence
052 * loaded. This class could be improved by using the hints or a some algorithm that indicates
053 * the sequence data once loaded should stay loaded. Could keep track of the last time sequence
054 * data was loaded and then after X amount of time clear the contents to free up memory.
055 *
056 *
057 * @author Scooter Willis <willishf at gmail dot com>
058 * @param <C>
059 */
060 public class SequenceFileProxyLoader<C extends Compound> implements ProxySequenceReader<C> {
061
062 SequenceParserInterface sequenceParser;
063 private CompoundSet<C> compoundSet;
064 private List<C> parsedCompounds = new ArrayList<C>();
065 File file;
066 long sequenceStartIndex = -1;
067 int sequenceLength = -1;
068 private boolean initialized = false;
069
070 /**
071 *
072 * @param file The file where the sequence will be found
073 * @param sequenceParser The parser to use to load the sequence
074 * @param sequenceStartIndex The file offset to the start of the sequence
075 * @param sequenceLength The length of the sequence
076 * @param compoundSet
077 */
078 public SequenceFileProxyLoader(File file, SequenceParserInterface sequenceParser, long sequenceStartIndex, int sequenceLength, CompoundSet<C> compoundSet) {
079 this.sequenceParser = sequenceParser;
080 this.file = file;
081 this.sequenceStartIndex = sequenceStartIndex;
082 this.sequenceLength = sequenceLength;
083 setCompoundSet(compoundSet);
084 }
085
086 /**
087 *
088 * @param compoundSet
089 */
090 public void setCompoundSet(CompoundSet<C> compoundSet) {
091 this.compoundSet = compoundSet;
092 }
093
094 /**
095 * Load the sequence
096 * @return
097 */
098 private boolean init() {
099 try {
100 RandomAccessFile randomAccessFile = new RandomAccessFile(file, "r");
101 randomAccessFile.seek(sequenceStartIndex);
102 String sequence = sequenceParser.getSequence(randomAccessFile, sequenceLength);
103 setContents(sequence);
104 } catch (Exception e) {
105 throw new FileAccessError("Error accessing " + file + " offset=" + sequenceStartIndex + " sequenceLength=" + sequenceLength + " " + e.toString());
106 }
107 return true;
108 }
109
110 /**
111 *
112 * @param sequence
113 */
114 public void setContents(String sequence) {
115 // Horrendously inefficient - pretty much the way the old BJ did things.
116 // TODO Should be optimised.
117 this.parsedCompounds.clear();
118 for (int i = 0; i < sequence.length();) {
119 String compoundStr = null;
120 C compound = null;
121 for (int compoundStrLength = 1; compound == null && compoundStrLength <= compoundSet.getMaxSingleCompoundStringLength(); compoundStrLength++) {
122 compoundStr = sequence.substring(i, i + compoundStrLength);
123 compound = compoundSet.getCompoundForString(compoundStr);
124 }
125 if (compound == null) {
126 throw new CompoundNotFoundError(compoundStr);
127 } else {
128 i += compoundStr.length();
129 }
130 this.parsedCompounds.add(compound);
131 }
132
133 setInitialized(true);
134 }
135
136 /**
137 *
138 * @return
139 */
140 public int getLength() {
141 return sequenceLength;
142 }
143
144 /**
145 *
146 * @param position
147 * @return
148 */
149 public C getCompoundAt(int position) {
150 if (this.isInitialized() == false) {
151 init();
152 }
153 return this.parsedCompounds.get(position - 1);
154 }
155
156 /**
157 *
158 * @param compound
159 * @return
160 */
161 public int getIndexOf(C compound) {
162 if (this.isInitialized() == false) {
163 init();
164 }
165 return this.parsedCompounds.indexOf(compound) + 1;
166 }
167
168 /**
169 *
170 * @param compound
171 * @return
172 */
173 public int getLastIndexOf(C compound) {
174 if (this.isInitialized() == false) {
175 init();
176 }
177 return this.parsedCompounds.lastIndexOf(compound) + 1;
178 }
179
180 /**
181 *
182 * @return
183 */
184 public String toString() {
185 if (this.isInitialized() == false) {
186 init();
187 }
188 return getSequenceAsString();
189 }
190
191 /**
192 *
193 * @return
194 */
195 public String getSequenceAsString() {
196 return getSequenceAsString(1, getLength(), Strand.POSITIVE);
197 }
198
199 /**
200 *
201 * @param bioBegin
202 * @param bioEnd
203 * @param strand
204 * @return
205 */
206 public String getSequenceAsString(Integer bioBegin, Integer bioEnd, Strand strand) {
207
208 if (this.isInitialized() == false) {
209 init();
210 }
211 SequenceAsStringHelper<C> sequenceAsStringHelper = new SequenceAsStringHelper<C>();
212 return sequenceAsStringHelper.getSequenceAsString(this.parsedCompounds, compoundSet, bioBegin, bioEnd, strand);
213 }
214
215 /**
216 *
217 * @return
218 */
219 public List<C> getAsList() {
220 if (this.isInitialized() == false) {
221 init();
222 }
223 return this.parsedCompounds;
224
225 }
226
227 /**
228 *
229 * @param bioBegin
230 * @param bioEnd
231 * @return
232 */
233 public SequenceView<C> getSubSequence(final Integer bioBegin, final Integer bioEnd) {
234 if (this.isInitialized() == false) {
235 init();
236 }
237 return new SequenceProxyView<C>(SequenceFileProxyLoader.this, bioBegin, bioEnd);
238 }
239
240 /**
241 *
242 * @return
243 */
244 public Iterator<C> iterator() {
245 if (this.isInitialized() == false) {
246 init();
247 }
248 return this.parsedCompounds.iterator();
249 }
250
251 /**
252 *
253 * @return
254 */
255 public CompoundSet<C> getCompoundSet() {
256 return compoundSet;
257 }
258
259 /**
260 * @return the initialized
261 */
262 public boolean isInitialized() {
263 return initialized;
264 }
265
266 /**
267 * @param initialized the initialized to set
268 */
269 public void setInitialized(boolean initialized) {
270 this.initialized = initialized;
271 }
272
273 /**
274 *
275 * @return
276 */
277 public AccessionID getAccession() {
278 throw new UnsupportedOperationException("Not supported yet.");
279 }
280
281 /**
282 *
283 * @param compounds
284 * @return
285 */
286 public int countCompounds(C... compounds) {
287 return SequenceMixin.countCompounds(this, compounds);
288 }
289
290 /**
291 *
292 * @return
293 */
294 @Override
295 public SequenceView<C> getInverse() {
296 return SequenceMixin.inverse(this);
297 }
298 }