001 /*
002 * BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence. This should
006 * be distributed with the code. If you do not have a copy,
007 * see:
008 *
009 * http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors. These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 * http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022 package org.biojava3.core.sequence.io;
023
024 import java.io.File;
025 import java.util.List;
026
027 import org.biojava3.core.sequence.ProteinSequence;
028 import org.biojava3.core.sequence.compound.AminoAcidCompound;
029 import org.biojava3.core.sequence.io.template.SequenceCreatorInterface;
030 import org.biojava3.core.sequence.loader.SequenceFileProxyLoader;
031 import org.biojava3.core.sequence.template.AbstractSequence;
032 import org.biojava3.core.sequence.template.CompoundSet;
033 import org.biojava3.core.sequence.template.ProxySequenceReader;
034
035 /**
036 * This class is a good example of using the SequenceCreatorInterface where during parsing of the stream
037 * the sequence and the offset index are passed to create a Protein sequence that will be loaded in lazily.
038 * This way you can load very large fasta files and store accession id and delay loading the sequence to save
039 * memory. The index is the file stream offset so when a ProteinSequence has a call to getSequence() the
040 * SequenceFileProxyLoader will open the file and offset to the index and retrieve the sequence.
041 *
042 * Same approach can be used for genome sequence data stored in a local fasta file, in a database or via http
043 * interface to a remote server
044 *
045 * @author Scooter Willis <willishf at gmail dot com>
046 */
047 public class FileProxyProteinSequenceCreator implements
048 SequenceCreatorInterface<AminoAcidCompound> {
049
050 CompoundSet<AminoAcidCompound> compoundSet = null;
051 File fastaFile = null;
052
053 /**
054 * Need File so that we can store full path name in SequenceFileProxyLoader for Random File access as a quick read
055 * @param fastaFile
056 * @param compoundSet
057 */
058 public FileProxyProteinSequenceCreator(File fastaFile,
059 CompoundSet<AminoAcidCompound> compoundSet) {
060 this.compoundSet = compoundSet;
061 this.fastaFile = fastaFile;
062 }
063
064 /**
065 * Even though we are passing in the sequence we really only care about the length of the sequence and the offset
066 * index in the fasta file.
067 * @param sequence
068 * @param index
069 * @return
070 */
071
072 public AbstractSequence<AminoAcidCompound> getSequence(String sequence,
073 long index) {
074 SequenceFileProxyLoader<AminoAcidCompound> sequenceFileProxyLoader = new SequenceFileProxyLoader<AminoAcidCompound>(
075 fastaFile, new FastaSequenceParser(), index, sequence.length(),
076 compoundSet);
077 return new ProteinSequence(sequenceFileProxyLoader, compoundSet);
078 }
079
080 /**
081 * Should be able to extend the same concept to a remote URL call or database connection. Not supported yet
082 * @param proxyLoader
083 * @param index
084 * @return
085 */
086 public AbstractSequence<AminoAcidCompound> getSequence(
087 ProxySequenceReader<AminoAcidCompound> proxyLoader, long index) {
088 throw new UnsupportedOperationException("Not supported yet.");
089 }
090
091 /**
092 * Not sure of use case and currently not supported
093 * @param list
094 * @return
095 */
096 public AbstractSequence<AminoAcidCompound> getSequence(
097 List<AminoAcidCompound> list) {
098 throw new UnsupportedOperationException("Not supported yet.");
099 }
100 }