001    /*
002     *                    BioJava development code
003     *
004     * This code may be freely distributed and modified under the
005     * terms of the GNU Lesser General Public Licence.  This should
006     * be distributed with the code.  If you do not have a copy,
007     * see:
008     *
009     *      http://www.gnu.org/copyleft/lesser.html
010     *
011     * Copyright for this code is held jointly by the individual
012     * authors.  These should be listed in @author doc comments.
013     *
014     * For more information on the BioJava project and its aims,
015     * or to join the biojava-l mailing list, visit the home page
016     * at:
017     *
018     *      http://www.biojava.org/
019     *
020     * Created on 01-21-2010
021     */
022    package org.biojava3.core.sequence.io;
023    
024    import java.io.File;
025    import java.io.FileInputStream;
026    import java.io.InputStream;
027    import java.io.InputStreamReader;
028    import java.util.LinkedHashMap;
029    
030    import org.biojava3.core.sequence.ProteinSequence;
031    import org.biojava3.core.sequence.compound.AminoAcidCompound;
032    import org.biojava3.core.sequence.compound.AminoAcidCompoundSet;
033    import org.biojava3.core.sequence.io.template.FastaHeaderParserInterface;
034    import org.biojava3.core.sequence.io.template.SequenceCreatorInterface;
035    import org.biojava3.core.sequence.template.Compound;
036    import org.biojava3.core.sequence.template.Sequence;
037    
038    /**
039     * Use FastaReaderHelper as an example of how to use this class where FastaReaderHelper should be the
040     * primary class used to read Fasta files
041     * @author Scooter Willis <willishf at gmail dot com>
042     */
043    public class FastaReader<S extends Sequence<?>, C extends Compound> {
044    
045        SequenceCreatorInterface<C> sequenceCreator;
046        FastaHeaderParserInterface<S,C> headerParser;
047        BufferedReaderBytesRead br;
048        InputStreamReader isr;
049        FileInputStream fi = null;
050    
051        /**
052         * If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about
053         * local file offsets for quick reads. InputStreams does not give you the name of the stream to access quickly via file seek. A seek in
054         * an inputstream is forced to read all the data so you don't gain anything.
055         * @param br
056         * @param headerParser
057         * @param sequenceCreator
058         */
059        public FastaReader(InputStream is, FastaHeaderParserInterface<S,C> headerParser, SequenceCreatorInterface<C> sequenceCreator) {
060            this.headerParser = headerParser;
061            isr = new InputStreamReader(is);
062            this.br = new BufferedReaderBytesRead(isr);
063            this.sequenceCreator = sequenceCreator;
064        }
065    
066        /**
067         * If you are going to use the FileProxyProteinSequenceCreator then you need to use this constructor because we need details about
068         * the location of the file.
069         * @param file
070         * @param headerParser
071         * @param sequenceCreator
072         * @throws Exception
073         */
074        public FastaReader(File file, FastaHeaderParserInterface<S,C> headerParser, SequenceCreatorInterface<C> sequenceCreator) throws Exception {
075            this.headerParser = headerParser;
076            fi = new FileInputStream(file);
077            isr = new InputStreamReader(fi);
078            this.br = new BufferedReaderBytesRead(isr);
079            this.sequenceCreator = sequenceCreator;
080        }
081    
082        /**
083         * The parsing is done in this method
084         * @return
085         * @throws Exception
086         */
087        @SuppressWarnings("unchecked")
088        public LinkedHashMap<String,S> process() throws Exception {
089            LinkedHashMap<String,S> sequences = new LinkedHashMap<String,S>();
090    
091    
092            String line = "";
093            String header = "";
094            StringBuilder sb = new StringBuilder();
095            int maxSequenceLength = -1;
096            long fileIndex = 0;
097            long sequenceIndex = 0;
098            boolean keepGoing = true;
099            do {
100                line = line.trim(); // nice to have but probably not needed
101                if (line.length() != 0) {
102                    if (line.startsWith(">")) {
103                        if (sb.length() > 0) {
104                        //    System.out.println("Sequence index=" + sequenceIndex);
105                            S sequence = (S)sequenceCreator.getSequence(sb.toString(), sequenceIndex);
106                            headerParser.parseHeader(header, sequence);
107                            sequences.put(sequence.getAccession().getID(),sequence);
108                            if (maxSequenceLength < sb.length()) {
109                                maxSequenceLength = sb.length();
110                            }
111                            sb = new StringBuilder(maxSequenceLength);
112                        }
113                        header = line.substring(1);
114                    } else if (line.startsWith(";")) {
115                    } else {
116                        //mark the start of the sequence with the fileIndex before the line was read
117                        if(sb.length() == 0){
118                            sequenceIndex = fileIndex;
119                        }
120                        sb.append(line);
121                    }
122                }
123                fileIndex = br.getBytesRead();
124                line = br.readLine();
125                if (line == null) {
126                //    System.out.println("Sequence index=" + sequenceIndex + " " + fileIndex );
127                    S sequence = (S)sequenceCreator.getSequence(sb.toString(), sequenceIndex);
128                    headerParser.parseHeader(header, sequence);
129                    sequences.put(sequence.getAccession().getID(),sequence);
130                    keepGoing = false;
131                }
132            } while (keepGoing);
133            br.close();
134            isr.close();
135            //If stream was created from File object then we need to close it
136            if (fi != null) {
137                fi.close();
138            }
139            return sequences;
140        }
141    
142        public static void main(String[] args) {
143            try {
144                String inputFile = "src/test/resources/PF00104_small.fasta";
145                FileInputStream is = new FileInputStream(inputFile);
146    
147                FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<ProteinSequence, AminoAcidCompound>(is, new GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
148                LinkedHashMap<String,ProteinSequence> proteinSequences = fastaReader.process();
149                is.close();
150    
151    
152                System.out.println(proteinSequences);
153    
154                File file = new File(inputFile);
155                FastaReader<ProteinSequence,AminoAcidCompound> fastaProxyReader = new FastaReader<ProteinSequence,AminoAcidCompound>(file, new GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound>(), new FileProxyProteinSequenceCreator(file, AminoAcidCompoundSet.getAminoAcidCompoundSet()));
156                LinkedHashMap<String,ProteinSequence> proteinProxySequences = fastaProxyReader.process();
157    
158                for(String key : proteinProxySequences.keySet()){
159                    ProteinSequence proteinSequence = proteinProxySequences.get(key);
160                    System.out.println(key);
161                    if(key.equals("Q98SJ1_CHICK/15-61")){
162                        int dummy = 1;
163                    }
164                    System.out.println(proteinSequence.toString());
165    
166                }
167    
168            } catch (Exception e) {
169                e.printStackTrace();
170            }
171        }
172    }