001    /*
002     *                    BioJava development code
003     *
004     * This code may be freely distributed and modified under the
005     * terms of the GNU Lesser General Public Licence.  This should
006     * be distributed with the code.  If you do not have a copy,
007     * see:
008     *
009     *      http://www.gnu.org/copyleft/lesser.html
010     *
011     * Copyright for this code is held jointly by the individual
012     * authors.  These should be listed in @author doc comments.
013     *
014     * For more information on the BioJava project and its aims,
015     * or to join the biojava-l mailing list, visit the home page
016     * at:
017     *
018     *      http://www.biojava.org/
019     *
020     * Created on 01-21-2010
021     */
022    package org.biojava3.core.sequence.io;
023    
024    import java.io.File;
025    import java.util.List;
026    
027    import org.biojava3.core.sequence.ProteinSequence;
028    import org.biojava3.core.sequence.compound.AminoAcidCompound;
029    import org.biojava3.core.sequence.io.template.SequenceCreatorInterface;
030    import org.biojava3.core.sequence.loader.SequenceFileProxyLoader;
031    import org.biojava3.core.sequence.template.AbstractSequence;
032    import org.biojava3.core.sequence.template.CompoundSet;
033    import org.biojava3.core.sequence.template.ProxySequenceReader;
034    
035    /**
036     * This class is a good example of using the SequenceCreatorInterface where during parsing of the stream
037     * the sequence and the offset index are passed to create a Protein sequence that will be loaded in lazily.
038     * This way you can load very large fasta files and store accession id and delay loading the sequence to save
039     * memory. The index is the file stream offset so when a ProteinSequence has a call to getSequence() the
040     * SequenceFileProxyLoader will open the file and offset to the index and retrieve the sequence.
041     *
042     * Same approach can be used for genome sequence data stored in a local fasta file, in a database or via http
043     * interface to a remote server
044     *
045     * @author Scooter Willis <willishf at gmail dot com>
046     */
047    public class FileProxyProteinSequenceCreator implements
048            SequenceCreatorInterface<AminoAcidCompound> {
049    
050        CompoundSet<AminoAcidCompound> compoundSet = null;
051        File fastaFile = null;
052    
053        /**
054         * Need File so that we can store full path name in SequenceFileProxyLoader for Random File access as a quick read
055         * @param fastaFile
056         * @param compoundSet
057         */
058        public FileProxyProteinSequenceCreator(File fastaFile,
059                CompoundSet<AminoAcidCompound> compoundSet) {
060            this.compoundSet = compoundSet;
061            this.fastaFile = fastaFile;
062        }
063    
064        /**
065         * Even though we are passing in the sequence we really only care about the length of the sequence and the offset
066         * index in the fasta file.
067         * @param sequence
068         * @param index
069         * @return
070         */
071    
072        public AbstractSequence<AminoAcidCompound> getSequence(String sequence,
073                long index) {
074            SequenceFileProxyLoader<AminoAcidCompound> sequenceFileProxyLoader = new SequenceFileProxyLoader<AminoAcidCompound>(
075                    fastaFile, new FastaSequenceParser(), index, sequence.length(),
076                    compoundSet);
077            return new ProteinSequence(sequenceFileProxyLoader, compoundSet);
078        }
079    
080        /**
081         * Should be able to extend the same concept to a remote URL call or database connection. Not supported yet
082         * @param proxyLoader
083         * @param index
084         * @return
085         */
086        public AbstractSequence<AminoAcidCompound> getSequence(
087                ProxySequenceReader<AminoAcidCompound> proxyLoader, long index) {
088            throw new UnsupportedOperationException("Not supported yet.");
089        }
090    
091        /**
092         * Not sure of use case and currently not supported
093         * @param list
094         * @return
095         */
096        public AbstractSequence<AminoAcidCompound> getSequence(
097                List<AminoAcidCompound> list) {
098            throw new UnsupportedOperationException("Not supported yet.");
099        }
100    }