001    /*
002     *                    BioJava development code
003     *
004     * This code may be freely distributed and modified under the
005     * terms of the GNU Lesser General Public Licence.  This should
006     * be distributed with the code.  If you do not have a copy,
007     * see:
008     *
009     *      http://www.gnu.org/copyleft/lesser.html
010     *
011     * Copyright for this code is held jointly by the individual
012     * authors.  These should be listed in @author doc comments.
013     *
014     * For more information on the BioJava project and its aims,
015     * or to join the biojava-l mailing list, visit the home page
016     * at:
017     *
018     *      http://www.biojava.org/
019     *
020     * Created on DATE
021     *
022     */
023    package org.biojava3.core.sequence;
024    
025    import java.util.ArrayList;
026    import java.util.Collections;
027    import java.util.LinkedHashMap;
028    import java.util.logging.Logger;
029    
030    import org.biojava3.core.sequence.transcription.TranscriptionEngine;
031    
032    /**
033     * This is the sequence if you want to go from a gene sequence to a protein sequence. Need to start with a
034     * ChromosomeSequence then getting a GeneSequence and then a TranscriptSequence
035     * @author Scooter Willis
036     */
037    public class TranscriptSequence extends DNASequence {
038    
039        private static final Logger log = Logger.getLogger(TranscriptSequence.class.getName());
040        private final ArrayList<CDSSequence> cdsSequenceList = new ArrayList<CDSSequence>();
041        private final LinkedHashMap<String, CDSSequence> cdsSequenceHashMap = new LinkedHashMap<String, CDSSequence>();
042        private StartCodonSequence startCodonSequence = null;
043        private StopCodonSequence stopCodonSequence = null;
044        private GeneSequence parentGeneSequence = null;
045    
046        /**
047         *
048         * @param parentDNASequence
049         * @param begin
050         * @param end inclusive of end
051         */
052        public TranscriptSequence(GeneSequence parentDNASequence, int begin, int end) {
053            setParentSequence(parentDNASequence);
054            this.parentGeneSequence = parentDNASequence;
055            setBioBegin(begin);
056            setBioEnd(end);
057    
058        }
059    
060        /**
061         * @return the strand
062         */
063        public Strand getStrand() {
064            return parentGeneSequence.getStrand();
065        }
066    
067        /**
068         * Remove a CDS or coding sequence from the transcript sequence
069         * @param accession
070         * @return
071         */
072        public CDSSequence removeCDS(String accession) {
073            for (CDSSequence cdsSequence : cdsSequenceList) {
074                if (cdsSequence.getAccession().getID().equals(accession)) {
075                    cdsSequenceList.remove(cdsSequence);
076                    cdsSequenceHashMap.remove(accession);
077                    return cdsSequence;
078                }
079            }
080            return null;
081        }
082    
083        /**
084         * Get the CDS sequences that have been added to the TranscriptSequences
085         * @return
086         */
087        public LinkedHashMap<String, CDSSequence> getCDSSequences() {
088            return cdsSequenceHashMap;
089        }
090    
091        /**
092         * Add a Coding Sequence region with phase to the transcript sequence
093         * @param accession
094         * @param begin
095         * @param end
096         * @param phase 0,1,2
097         * @return
098         */
099        public CDSSequence addCDS(AccessionID accession, int begin, int end, int phase) throws Exception {
100            if (cdsSequenceHashMap.containsKey(accession.getID())) {
101                throw new Exception("Duplicate accesion id " + accession.getID());
102            }
103            CDSSequence cdsSequence = new CDSSequence(this, begin, end, phase); //sense should be the same as parent
104            cdsSequence.setAccession(accession);
105            cdsSequenceList.add(cdsSequence);
106            Collections.sort(cdsSequenceList, new CDSComparator());
107            cdsSequenceHashMap.put(accession.getID(), cdsSequence);
108            return cdsSequence;
109        }
110    
111        /**
112         * http://www.sequenceontology.org/gff3.shtml
113         * http://biowiki.org/~yam/bioe131/GFF.ppt
114         * @return
115         */
116        /**
117         * Return a list of protein sequences based on each CDS sequence
118         * where the phase shift between two CDS sequences is assigned to the
119         * CDS sequence that starts the triplet. This can be used to map
120         * a CDS/exon region of a protein sequence back to the DNA sequence
121         * If you have a protein sequence and a predicted gene you can take the
122         * predict CDS protein sequences and align back to the protein sequence.
123         * If you have errors in mapping the predicted protein CDS regions to
124         * an the known protein sequence then you can identify possible errors
125         * in the prediction
126         *
127         * @return
128         */
129        public ArrayList<ProteinSequence> getProteinCDSSequences() {
130            ArrayList<ProteinSequence> proteinSequenceList = new ArrayList<ProteinSequence>();
131            for (int i = 0; i < cdsSequenceList.size(); i++) {
132                CDSSequence cdsSequence = cdsSequenceList.get(i);
133                String codingSequence = cdsSequence.getCodingSequence();
134                //          System.out.println("CDS " + getStrand() + " "  + cdsSequence.getPhase() + "=" + codingSequence);
135                if (this.getStrand() == Strand.NEGATIVE) {
136                    if (cdsSequence.phase == 1) {
137                        codingSequence = codingSequence.substring(1, codingSequence.length());
138                    } else if (cdsSequence.phase == 2) {
139                        codingSequence = codingSequence.substring(2, codingSequence.length());
140                    }
141                    if (i < cdsSequenceList.size() - 1) {
142                        CDSSequence nextCDSSequence = cdsSequenceList.get(i + 1);
143                        if (nextCDSSequence.phase == 1) {
144                            String nextCodingSequence = nextCDSSequence.getCodingSequence();
145                            codingSequence = codingSequence + nextCodingSequence.substring(0, 1);
146                        } else if (nextCDSSequence.phase == 2) {
147                            String nextCodingSequence = nextCDSSequence.getCodingSequence();
148                            codingSequence = codingSequence + nextCodingSequence.substring(0, 2);
149                        }
150                    }
151                } else {
152                    if (cdsSequence.phase == 1) {
153                        codingSequence = codingSequence.substring(1, codingSequence.length());
154                    } else if (cdsSequence.phase == 2) {
155                        codingSequence = codingSequence.substring(2, codingSequence.length());
156                    }
157                    if (i < cdsSequenceList.size() - 1) {
158                        CDSSequence nextCDSSequence = cdsSequenceList.get(i + 1);
159                        if (nextCDSSequence.phase == 1) {
160                            String nextCodingSequence = nextCDSSequence.getCodingSequence();
161                            codingSequence = codingSequence + nextCodingSequence.substring(0, 1);
162                        } else if (nextCDSSequence.phase == 2) {
163                            String nextCodingSequence = nextCDSSequence.getCodingSequence();
164                            codingSequence = codingSequence + nextCodingSequence.substring(0, 2);
165                        }
166                    }
167                }
168    
169    
170                //    System.out.println(codingSequence);
171                DNASequence dnaCodingSequence = new DNASequence(codingSequence.toString().toUpperCase());
172                RNASequence rnaCodingSequence = dnaCodingSequence.getRNASequence(TranscriptionEngine.getDefault());
173                ProteinSequence proteinSequence = rnaCodingSequence.getProteinSequence(TranscriptionEngine.getDefault());
174                proteinSequence.setAccession(new AccessionID(cdsSequence.getAccession().getID()));
175                proteinSequence.setParentDNASequence(cdsSequence, 1, cdsSequence.getLength());
176                proteinSequenceList.add(proteinSequence);
177            }
178            return proteinSequenceList;
179        }
180    
181        /**
182         * Get the stitched together CDS sequences then maps to the cDNA
183         * @return
184         */
185        public DNASequence getDNACodingSequence() {
186            StringBuilder sb = new StringBuilder();
187            for (CDSSequence cdsSequence : cdsSequenceList) {
188                sb.append(cdsSequence.getCodingSequence());
189            }
190            DNASequence dnaSequence = new DNASequence(sb.toString().toUpperCase());
191            dnaSequence.setAccession(new AccessionID(this.getAccession().getID()));
192            return dnaSequence;
193        }
194    
195        /**
196         * Get the protein sequence
197         * @return
198         */
199        public ProteinSequence getProteinSequence() {
200            return getProteinSequence(TranscriptionEngine.getDefault());
201        }
202    
203        /**
204         * Get the protein sequence with user defined TranscriptEngine
205         * @param engine
206         * @return
207         */
208        public ProteinSequence getProteinSequence(TranscriptionEngine engine) {
209            DNASequence dnaCodingSequence = getDNACodingSequence();
210            RNASequence rnaCodingSequence = dnaCodingSequence.getRNASequence(engine);
211            ProteinSequence proteinSequence = rnaCodingSequence.getProteinSequence(engine);
212            proteinSequence.setAccession(new AccessionID(this.getAccession().getID()));
213            return proteinSequence;
214        }
215    
216        /**
217         * @return the startCodonSequence
218         */
219        public StartCodonSequence getStartCodonSequence() {
220            return startCodonSequence;
221        }
222    
223        /**
224         * @param startCodonSequence the startCodonSequence to set
225         */
226        public void addStartCodonSequence(AccessionID accession, int begin, int end) {
227            this.startCodonSequence = new StartCodonSequence(this, begin, end);
228            startCodonSequence.setAccession(accession);
229        }
230    
231        /**
232         * @return the stopCodonSequence
233         */
234        public StopCodonSequence getStopCodonSequence() {
235            return stopCodonSequence;
236        }
237    
238        /**
239         * @param stopCodonSequence the stopCodonSequence to set
240         */
241        public void addStopCodonSequence(AccessionID accession, int begin, int end) {
242            this.stopCodonSequence = new StopCodonSequence(this, begin, end);
243            stopCodonSequence.setAccession(accession);
244        }
245    }