001 /*
002 * BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence. This should
006 * be distributed with the code. If you do not have a copy,
007 * see:
008 *
009 * http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors. These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 * http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022 package org.biojava3.core.sequence.io;
023
024 import java.io.File;
025 import java.io.FileInputStream;
026 import java.io.InputStream;
027 import java.io.InputStreamReader;
028 import java.util.LinkedHashMap;
029
030 import org.biojava3.core.sequence.ProteinSequence;
031 import org.biojava3.core.sequence.compound.AminoAcidCompound;
032 import org.biojava3.core.sequence.compound.AminoAcidCompoundSet;
033 import org.biojava3.core.sequence.io.template.FastaHeaderParserInterface;
034 import org.biojava3.core.sequence.io.template.SequenceCreatorInterface;
035 import org.biojava3.core.sequence.template.Compound;
036 import org.biojava3.core.sequence.template.Sequence;
037
038 /**
039 * Use FastaReaderHelper as an example of how to use this class where FastaReaderHelper should be the
040 * primary class used to read Fasta files
041 * @author Scooter Willis <willishf at gmail dot com>
042 */
043 public class FastaReader<S extends Sequence<?>, C extends Compound> {
044
045 SequenceCreatorInterface<C> sequenceCreator;
046 FastaHeaderParserInterface<S,C> headerParser;
047 BufferedReaderBytesRead br;
048 InputStreamReader isr;
049 FileInputStream fi = null;
050
051 /**
052 * If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about
053 * local file offsets for quick reads. InputStreams does not give you the name of the stream to access quickly via file seek. A seek in
054 * an inputstream is forced to read all the data so you don't gain anything.
055 * @param br
056 * @param headerParser
057 * @param sequenceCreator
058 */
059 public FastaReader(InputStream is, FastaHeaderParserInterface<S,C> headerParser, SequenceCreatorInterface<C> sequenceCreator) {
060 this.headerParser = headerParser;
061 isr = new InputStreamReader(is);
062 this.br = new BufferedReaderBytesRead(isr);
063 this.sequenceCreator = sequenceCreator;
064 }
065
066 /**
067 * If you are going to use the FileProxyProteinSequenceCreator then you need to use this constructor because we need details about
068 * the location of the file.
069 * @param file
070 * @param headerParser
071 * @param sequenceCreator
072 * @throws Exception
073 */
074 public FastaReader(File file, FastaHeaderParserInterface<S,C> headerParser, SequenceCreatorInterface<C> sequenceCreator) throws Exception {
075 this.headerParser = headerParser;
076 fi = new FileInputStream(file);
077 isr = new InputStreamReader(fi);
078 this.br = new BufferedReaderBytesRead(isr);
079 this.sequenceCreator = sequenceCreator;
080 }
081
082 /**
083 * The parsing is done in this method
084 * @return
085 * @throws Exception
086 */
087 @SuppressWarnings("unchecked")
088 public LinkedHashMap<String,S> process() throws Exception {
089 LinkedHashMap<String,S> sequences = new LinkedHashMap<String,S>();
090
091
092 String line = "";
093 String header = "";
094 StringBuilder sb = new StringBuilder();
095 int maxSequenceLength = -1;
096 long fileIndex = 0;
097 long sequenceIndex = 0;
098 boolean keepGoing = true;
099 do {
100 line = line.trim(); // nice to have but probably not needed
101 if (line.length() != 0) {
102 if (line.startsWith(">")) {
103 if (sb.length() > 0) {
104 // System.out.println("Sequence index=" + sequenceIndex);
105 S sequence = (S)sequenceCreator.getSequence(sb.toString(), sequenceIndex);
106 headerParser.parseHeader(header, sequence);
107 sequences.put(sequence.getAccession().getID(),sequence);
108 if (maxSequenceLength < sb.length()) {
109 maxSequenceLength = sb.length();
110 }
111 sb = new StringBuilder(maxSequenceLength);
112 }
113 header = line.substring(1);
114 } else if (line.startsWith(";")) {
115 } else {
116 //mark the start of the sequence with the fileIndex before the line was read
117 if(sb.length() == 0){
118 sequenceIndex = fileIndex;
119 }
120 sb.append(line);
121 }
122 }
123 fileIndex = br.getBytesRead();
124 line = br.readLine();
125 if (line == null) {
126 // System.out.println("Sequence index=" + sequenceIndex + " " + fileIndex );
127 S sequence = (S)sequenceCreator.getSequence(sb.toString(), sequenceIndex);
128 headerParser.parseHeader(header, sequence);
129 sequences.put(sequence.getAccession().getID(),sequence);
130 keepGoing = false;
131 }
132 } while (keepGoing);
133 br.close();
134 isr.close();
135 //If stream was created from File object then we need to close it
136 if (fi != null) {
137 fi.close();
138 }
139 return sequences;
140 }
141
142 public static void main(String[] args) {
143 try {
144 String inputFile = "src/test/resources/PF00104_small.fasta";
145 FileInputStream is = new FileInputStream(inputFile);
146
147 FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<ProteinSequence, AminoAcidCompound>(is, new GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
148 LinkedHashMap<String,ProteinSequence> proteinSequences = fastaReader.process();
149 is.close();
150
151
152 System.out.println(proteinSequences);
153
154 File file = new File(inputFile);
155 FastaReader<ProteinSequence,AminoAcidCompound> fastaProxyReader = new FastaReader<ProteinSequence,AminoAcidCompound>(file, new GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound>(), new FileProxyProteinSequenceCreator(file, AminoAcidCompoundSet.getAminoAcidCompoundSet()));
156 LinkedHashMap<String,ProteinSequence> proteinProxySequences = fastaProxyReader.process();
157
158 for(String key : proteinProxySequences.keySet()){
159 ProteinSequence proteinSequence = proteinProxySequences.get(key);
160 System.out.println(key);
161 if(key.equals("Q98SJ1_CHICK/15-61")){
162 int dummy = 1;
163 }
164 System.out.println(proteinSequence.toString());
165
166 }
167
168 } catch (Exception e) {
169 e.printStackTrace();
170 }
171 }
172 }