001 /*
002 * BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence. This should
006 * be distributed with the code. If you do not have a copy,
007 * see:
008 *
009 * http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors. These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 * http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022 package org.biojava3.core.sequence.io;
023
024 import java.util.ArrayList;
025
026 import org.biojava3.core.sequence.AccessionID;
027 import org.biojava3.core.sequence.DataSource;
028 import org.biojava3.core.sequence.ProteinSequence;
029 import org.biojava3.core.sequence.compound.AminoAcidCompound;
030 import org.biojava3.core.sequence.io.template.FastaHeaderParserInterface;
031 import org.biojava3.core.sequence.template.AbstractSequence;
032 import org.biojava3.core.sequence.template.Compound;
033 import org.biojava3.core.sequence.template.AbstractSequence.AnnotationType;
034
035 /**
036 * The default fasta header parser where some headers are well defined based on the source
037 * database which allows us to set the source of the protein sequence and the identifier
038 * that can be used in future implementations to load features from external sources
039 *
040 * If the user has a custom header with local data then they can create their own implementation
041 * of a FastaHeaderParserInterface
042 *
043 * GenBank gi|gi-number|gb|accession|locus
044 * ENA Data Library gi|gi-number|emb|accession|locus
045 * DDBJ, DNA Database of Japan gi|gi-number|dbj|accession|locus
046 * NBRF PIR pir||entry
047 * Protein Research Foundation prf||name
048 * SWISS-PROT sp|accession|name
049 * Brookhaven Protein Data Bank (1) pdb|entry|chain
050 * Brookhaven Protein Data Bank (2) entry:chain|PDBID|CHAIN|SEQUENCE
051 * PDB EBI PDB:1ECY_A mol:protein length:142 ECOTIN
052 * Patents pat|country|number
053 * GenInfo Backbone Id bbs|number
054 * General database identifier gnl|database|identifier
055 * NCBI Reference Sequence ref|accession|locus
056 * Local Sequence identifier lcl|identifier
057 *
058 * @author Scooter Willis <willishf at gmail dot com>
059 */
060 public class GenericFastaHeaderParser<S extends AbstractSequence<C>, C extends Compound> implements FastaHeaderParserInterface<S,C> {
061
062 /**
063 * Parse out the components where some have a | and others do not
064 * @param header
065 * @return
066 */
067 private String[] getHeaderValues(String header) {
068 String[] data = new String[0];
069 ArrayList<String> values = new ArrayList<String>();
070 StringBuffer sb = new StringBuffer();
071 if(header.indexOf("length=") != -1){
072 data = new String[1];
073 int index = header.indexOf("length=");
074 data[0] = header.substring(0, index).trim();
075 // System.out.println("accession=" + data[0]);
076 return data;
077 } else if (header.startsWith("PDB:") == false) {
078 for (int i = 0; i < header.length(); i++) {
079 if (header.charAt(i) == '|') {
080 values.add(sb.toString());
081 sb = new StringBuffer();
082 } else if (i == header.length() - 1) {
083 sb.append(header.charAt(i));
084 values.add(sb.toString());
085 } else {
086 sb.append(header.charAt(i));
087 }
088
089 data = new String[values.size()];
090 values.toArray(data);
091 }
092 } else {
093 data = header.split(" ");
094 }
095 return data;
096 }
097
098 /**
099 * Parse the header and set the values in the sequence
100 * @param header
101 * @param sequence
102 */
103 public void parseHeader(String header, S sequence) {
104 //uniptrot
105 // tr|Q0TET7|Q0TET7_ECOL5 Putative uncharacterized protein OS=Escherichia coli O6:K15:H31 (strain 536 / UPEC) GN=ECP_2553 PE=4 SV=1
106 sequence.setOriginalHeader(header);
107 String[] data = getHeaderValues(header);
108
109 if (data.length == 1) {
110 sequence.setAccession(new AccessionID(data[0]));
111 } else if (data[0].equalsIgnoreCase("sp") || data[0].equalsIgnoreCase("tr")) {
112 if (data[0].equalsIgnoreCase("sp")) {
113 sequence.setAnnotationType(AnnotationType.CURATED);
114 } else {
115 sequence.setAnnotationType(AnnotationType.PREDICTED);
116 }
117
118 sequence.setAccession(new AccessionID(data[1], DataSource.UNIPROT));
119 if (data.length > 1) {
120 sequence.setDescription(data[2]);
121 }
122
123 } else if (data[0].equalsIgnoreCase("gi")) {
124 DataSource giSource = DataSource.UNKNOWN;
125 if (data.length >= 3) {
126 if (data[2].equalsIgnoreCase("gb")) {
127 giSource = DataSource.GENBANK;
128 } else if (data[2].equalsIgnoreCase("emb")) {
129 giSource = DataSource.ENA;
130 } else if (data[2].equalsIgnoreCase("dbj")) {
131 giSource = DataSource.DDBJ;
132 }
133 sequence.setAccession(new AccessionID(data[3], giSource));
134 } else {
135 sequence.setAccession(new AccessionID(header, giSource));
136 }
137 } else if (data[0].equalsIgnoreCase("pir")) {
138 sequence.setAccession(new AccessionID(data[2], DataSource.NBRF));
139 } else if (data[0].equalsIgnoreCase("prf")) {
140 sequence.setAccession(new AccessionID(data[2], DataSource.PRF));
141 } else if (data[0].equalsIgnoreCase("pdb")) {
142 sequence.setAccession(new AccessionID(data[1] + ":" + data[2], DataSource.PDB1));
143 } else if (data[0].startsWith("PDB")) {
144 String[] pdbe = data[0].split(" ");
145 String[] pdbaccession = pdbe[0].split(":");
146 sequence.setAccession(new AccessionID(pdbaccession[1], DataSource.PDBe));
147 } else if (data[0].indexOf(":") != -1 && data.length > 1 && data[1].equals("PDBID")) {
148 sequence.setAccession(new AccessionID(data[0], DataSource.PDB2));
149 } else if (data[0].equalsIgnoreCase("pat")) {
150 sequence.setAccession(new AccessionID(data[2], DataSource.PATENTS));
151 } else if (data[0].equalsIgnoreCase("bbs")) {
152 sequence.setAccession(new AccessionID(data[1], DataSource.GENINFO));
153 } else if (data[0].equalsIgnoreCase("gnl")) {
154 sequence.setAccession(new AccessionID(data[2], DataSource.GENERAL));
155 } else if (data[0].equalsIgnoreCase("ref")) {
156 sequence.setAccession(new AccessionID(data[1], DataSource.NCBI));
157 } else if (data[0].equalsIgnoreCase("lcl")) {
158 sequence.setAccession(new AccessionID(data[1], DataSource.LOCAL));
159 } else {
160 sequence.setAccession(new AccessionID(data[0])); // avoid the common problem of picking up all the comments original header in getOriginalHeader
161 }
162
163
164 }
165
166 /**
167 *
168 * @param args
169 */
170 public static void main(String[] args) {
171
172 System.out.println("parseHeader");
173 String header = "";
174 ProteinSequence sequence = new ProteinSequence("");
175 GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound> instance =
176 new GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound>();
177
178 header = "gi|gi-number|gb|accession|locus";
179 instance.parseHeader(header, sequence);
180 System.out.println("accession" + "=" + sequence.getAccession());
181 System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.GENBANK);
182
183 header = "gi|gi-number|emb|accession|locus";
184 instance.parseHeader(header, sequence);
185 System.out.println("accession" + "=" + sequence.getAccession());
186 System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.ENA);
187
188 header = "gi|gi-number|dbj|accession|locus";
189 instance.parseHeader(header, sequence);
190 System.out.println("accession" + "=" + sequence.getAccession());
191 System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.DDBJ);
192
193 header = "pir||entry";
194 instance.parseHeader(header, sequence);
195 System.out.println("entry" + "=" + sequence.getAccession());
196 System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.NBRF);
197
198 header = "prf||name";
199 instance.parseHeader(header, sequence);
200 System.out.println("name" + "=" + sequence.getAccession());
201 System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.PRF);
202
203 header = "sp|accession|name";
204 instance.parseHeader(header, sequence);
205 System.out.println("accession" + "=" + sequence.getAccession());
206 System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.UNIPROT);
207
208 header = "pdb|entry|chain";
209 instance.parseHeader(header, sequence);
210 System.out.println("entry:chain" + "=" + sequence.getAccession());
211 System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.PDB1);
212
213 header = "entry:chain|PDBID|CHAIN|SEQUENCE";
214 instance.parseHeader(header, sequence);
215 System.out.println("entry:chain" + "=" + sequence.getAccession());
216 System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.PDB2);
217 header = "PDB:1ECY_A mol:protein length:142 ECOTIN";
218 instance.parseHeader(header, sequence);
219 System.out.println("1ECY_A" + "=" + sequence.getAccession());
220 System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.PDBe);
221
222 header = "pat|country|number";
223 instance.parseHeader(header, sequence);
224 System.out.println("number" + "=" + sequence.getAccession());
225 System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.PATENTS);
226
227 header = "bbs|number";
228 instance.parseHeader(header, sequence);
229 System.out.println("number" + "=" + sequence.getAccession());
230 System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.GENINFO);
231
232 header = "gnl|database|identifier";
233 instance.parseHeader(header, sequence);
234 System.out.println("identifier" + "=" + sequence.getAccession());
235 System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.GENERAL);
236
237 header = "ref|accession|locus";
238
239 instance.parseHeader(header, sequence);
240 System.out.println("accession" + "=" + sequence.getAccession());
241 System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.NCBI);
242
243 header = "lcl|identifier";
244 instance.parseHeader(header, sequence);
245 System.out.println("identifier" + "=" + sequence.getAccession());
246 System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.LOCAL);
247 }
248 }