001    /*
002     *                    BioJava development code
003     *
004     * This code may be freely distributed and modified under the
005     * terms of the GNU Lesser General Public Licence.  This should
006     * be distributed with the code.  If you do not have a copy,
007     * see:
008     *
009     *      http://www.gnu.org/copyleft/lesser.html
010     *
011     * Copyright for this code is held jointly by the individual
012     * authors.  These should be listed in @author doc comments.
013     *
014     * For more information on the BioJava project and its aims,
015     * or to join the biojava-l mailing list, visit the home page
016     * at:
017     *
018     *      http://www.biojava.org/
019     *
020     * Created on 01-21-2010
021     */
022    package org.biojava3.core.sequence.io;
023    
024    import java.util.ArrayList;
025    
026    import org.biojava3.core.sequence.AccessionID;
027    import org.biojava3.core.sequence.DataSource;
028    import org.biojava3.core.sequence.ProteinSequence;
029    import org.biojava3.core.sequence.compound.AminoAcidCompound;
030    import org.biojava3.core.sequence.io.template.FastaHeaderParserInterface;
031    import org.biojava3.core.sequence.template.AbstractSequence;
032    import org.biojava3.core.sequence.template.Compound;
033    import org.biojava3.core.sequence.template.AbstractSequence.AnnotationType;
034    
035    /**
036     * The default fasta header parser where some headers are well defined based on the source
037     * database which allows us to set the source of the protein sequence and the identifier
038     * that can be used in future implementations to load features from external sources
039     * 
040     * If the user has a custom header with local data then they can create their own implementation
041     * of a FastaHeaderParserInterface
042     *
043     * GenBank                           gi|gi-number|gb|accession|locus
044     * ENA Data Library                 gi|gi-number|emb|accession|locus
045     * DDBJ, DNA Database of Japan       gi|gi-number|dbj|accession|locus
046     * NBRF PIR                          pir||entry
047     * Protein Research Foundation       prf||name
048     * SWISS-PROT                        sp|accession|name
049     * Brookhaven Protein Data Bank (1)  pdb|entry|chain
050     * Brookhaven Protein Data Bank (2)  entry:chain|PDBID|CHAIN|SEQUENCE
051     * PDB EBI                           PDB:1ECY_A mol:protein length:142  ECOTIN
052     * Patents                           pat|country|number
053     * GenInfo Backbone Id               bbs|number
054     * General database identifier       gnl|database|identifier
055     * NCBI Reference Sequence           ref|accession|locus
056     * Local Sequence identifier         lcl|identifier
057     *
058     * @author Scooter Willis <willishf at gmail dot com>
059     */
060    public class GenericFastaHeaderParser<S extends AbstractSequence<C>, C extends Compound> implements FastaHeaderParserInterface<S,C> {
061    
062        /**
063         * Parse out the components where some have a | and others do not
064         * @param header
065         * @return
066         */
067        private String[] getHeaderValues(String header) {
068            String[] data = new String[0];
069            ArrayList<String> values = new ArrayList<String>();
070            StringBuffer sb = new StringBuffer();
071            if(header.indexOf("length=") != -1){
072                data = new String[1];
073                int index = header.indexOf("length=");
074                data[0] = header.substring(0, index).trim();
075        //        System.out.println("accession=" + data[0]);
076                return data;
077            } else if (header.startsWith("PDB:") == false) {
078                for (int i = 0; i < header.length(); i++) {
079                    if (header.charAt(i) == '|') {
080                        values.add(sb.toString());
081                        sb = new StringBuffer();
082                    } else if (i == header.length() - 1) {
083                        sb.append(header.charAt(i));
084                        values.add(sb.toString());
085                    } else {
086                        sb.append(header.charAt(i));
087                    }
088    
089                    data = new String[values.size()];
090                    values.toArray(data);
091                }
092            } else {
093                data = header.split(" ");
094            }
095            return data;
096        }
097    
098        /**
099         * Parse the header and set the values in the sequence
100         * @param header
101         * @param sequence
102         */
103        public void parseHeader(String header, S sequence) {
104            //uniptrot
105            // tr|Q0TET7|Q0TET7_ECOL5 Putative uncharacterized protein OS=Escherichia coli O6:K15:H31 (strain 536 / UPEC) GN=ECP_2553 PE=4 SV=1
106            sequence.setOriginalHeader(header);
107            String[] data = getHeaderValues(header);
108    
109            if (data.length == 1) {
110                sequence.setAccession(new AccessionID(data[0]));
111            } else  if (data[0].equalsIgnoreCase("sp") || data[0].equalsIgnoreCase("tr")) {
112                if (data[0].equalsIgnoreCase("sp")) {
113                    sequence.setAnnotationType(AnnotationType.CURATED);
114                } else {
115                    sequence.setAnnotationType(AnnotationType.PREDICTED);
116                }
117    
118                sequence.setAccession(new AccessionID(data[1], DataSource.UNIPROT));
119                if (data.length > 1) {
120                    sequence.setDescription(data[2]);
121                }
122    
123            } else if (data[0].equalsIgnoreCase("gi")) {
124                DataSource giSource = DataSource.UNKNOWN;
125                if (data.length >= 3) {
126                    if (data[2].equalsIgnoreCase("gb")) {
127                        giSource = DataSource.GENBANK;
128                    } else if (data[2].equalsIgnoreCase("emb")) {
129                        giSource = DataSource.ENA;
130                    } else if (data[2].equalsIgnoreCase("dbj")) {
131                        giSource = DataSource.DDBJ;
132                    }
133                    sequence.setAccession(new AccessionID(data[3], giSource));
134                } else {
135                    sequence.setAccession(new AccessionID(header, giSource));
136                }
137            } else if (data[0].equalsIgnoreCase("pir")) {
138                sequence.setAccession(new AccessionID(data[2], DataSource.NBRF));
139            } else if (data[0].equalsIgnoreCase("prf")) {
140                sequence.setAccession(new AccessionID(data[2], DataSource.PRF));
141            } else if (data[0].equalsIgnoreCase("pdb")) {
142                sequence.setAccession(new AccessionID(data[1] + ":" + data[2], DataSource.PDB1));
143            } else if (data[0].startsWith("PDB")) {
144                String[] pdbe = data[0].split(" ");
145                String[] pdbaccession = pdbe[0].split(":");
146                sequence.setAccession(new AccessionID(pdbaccession[1], DataSource.PDBe));
147            } else if (data[0].indexOf(":") != -1 && data.length > 1 && data[1].equals("PDBID")) {
148                sequence.setAccession(new AccessionID(data[0], DataSource.PDB2));
149            } else if (data[0].equalsIgnoreCase("pat")) {
150                sequence.setAccession(new AccessionID(data[2], DataSource.PATENTS));
151            } else if (data[0].equalsIgnoreCase("bbs")) {
152                sequence.setAccession(new AccessionID(data[1], DataSource.GENINFO));
153            } else if (data[0].equalsIgnoreCase("gnl")) {
154                sequence.setAccession(new AccessionID(data[2], DataSource.GENERAL));
155            } else if (data[0].equalsIgnoreCase("ref")) {
156                sequence.setAccession(new AccessionID(data[1], DataSource.NCBI));
157            } else if (data[0].equalsIgnoreCase("lcl")) {
158                sequence.setAccession(new AccessionID(data[1], DataSource.LOCAL));
159            } else {
160                sequence.setAccession(new AccessionID(data[0])); // avoid the common problem of picking up all the comments original header in getOriginalHeader
161            }
162    
163    
164        }
165    
166        /**
167         * 
168         * @param args
169         */
170        public static void main(String[] args) {
171    
172            System.out.println("parseHeader");
173            String header = "";
174            ProteinSequence sequence = new ProteinSequence("");
175            GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound> instance =
176              new GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound>();
177    
178            header = "gi|gi-number|gb|accession|locus";
179            instance.parseHeader(header, sequence);
180            System.out.println("accession" + "=" + sequence.getAccession());
181            System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.GENBANK);
182    
183            header = "gi|gi-number|emb|accession|locus";
184            instance.parseHeader(header, sequence);
185            System.out.println("accession" + "=" + sequence.getAccession());
186            System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.ENA);
187    
188            header = "gi|gi-number|dbj|accession|locus";
189            instance.parseHeader(header, sequence);
190            System.out.println("accession" + "=" + sequence.getAccession());
191            System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.DDBJ);
192    
193            header = "pir||entry";
194            instance.parseHeader(header, sequence);
195            System.out.println("entry" + "=" + sequence.getAccession());
196            System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.NBRF);
197    
198            header = "prf||name";
199            instance.parseHeader(header, sequence);
200            System.out.println("name" + "=" + sequence.getAccession());
201            System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.PRF);
202    
203            header = "sp|accession|name";
204            instance.parseHeader(header, sequence);
205            System.out.println("accession" + "=" + sequence.getAccession());
206            System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.UNIPROT);
207    
208            header = "pdb|entry|chain";
209            instance.parseHeader(header, sequence);
210            System.out.println("entry:chain" + "=" + sequence.getAccession());
211            System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.PDB1);
212    
213            header = "entry:chain|PDBID|CHAIN|SEQUENCE";
214            instance.parseHeader(header, sequence);
215            System.out.println("entry:chain" + "=" + sequence.getAccession());
216            System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.PDB2);
217            header = "PDB:1ECY_A mol:protein length:142  ECOTIN";
218            instance.parseHeader(header, sequence);
219            System.out.println("1ECY_A" + "=" + sequence.getAccession());
220            System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.PDBe);
221    
222            header = "pat|country|number";
223            instance.parseHeader(header, sequence);
224            System.out.println("number" + "=" + sequence.getAccession());
225            System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.PATENTS);
226    
227            header = "bbs|number";
228            instance.parseHeader(header, sequence);
229            System.out.println("number" + "=" + sequence.getAccession());
230            System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.GENINFO);
231    
232            header = "gnl|database|identifier";
233            instance.parseHeader(header, sequence);
234            System.out.println("identifier" + "=" + sequence.getAccession());
235            System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.GENERAL);
236    
237            header = "ref|accession|locus";
238    
239            instance.parseHeader(header, sequence);
240            System.out.println("accession" + "=" + sequence.getAccession());
241            System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.NCBI);
242    
243            header = "lcl|identifier";
244            instance.parseHeader(header, sequence);
245            System.out.println("identifier" + "=" + sequence.getAccession());
246            System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.LOCAL);
247        }
248    }