001    /*
002     *                    BioJava development code
003     *
004     * This code may be freely distributed and modified under the
005     * terms of the GNU Lesser General Public Licence.  This should
006     * be distributed with the code.  If you do not have a copy,
007     * see:
008     *
009     *      http://www.gnu.org/copyleft/lesser.html
010     *
011     * Copyright for this code is held jointly by the individual
012     * authors.  These should be listed in @author doc comments.
013     *
014     * For more information on the BioJava project and its aims,
015     * or to join the biojava-l mailing list, visit the home page
016     * at:
017     *
018     *      http://www.biojava.org/
019     *
020     * Created on DATE
021     *
022     */
023    package org.biojava3.core.sequence;
024    
025    import java.util.ArrayList;
026    import java.util.Collections;
027    import java.util.LinkedHashMap;
028    import java.util.logging.Logger;
029    import org.biojava3.core.sequence.compound.NucleotideCompound;
030    import org.biojava3.core.sequence.template.CompoundSet;
031    
032    /**
033     *
034     * @author Scooter Willis
035     */
036    public class GeneSequence extends DNASequence {
037    
038        private final LinkedHashMap<String, TranscriptSequence> transcriptSequenceHashMap = new LinkedHashMap<String, TranscriptSequence>();
039        private static final Logger log = Logger.getLogger(GeneSequence.class.getName());
040        private final LinkedHashMap<String, IntronSequence> intronSequenceHashMap = new LinkedHashMap<String, IntronSequence>();
041        private final LinkedHashMap<String, ExonSequence> exonSequenceHashMap = new LinkedHashMap<String, ExonSequence>();
042        private final ArrayList<IntronSequence> intronSequenceList = new ArrayList<IntronSequence>();
043        private final ArrayList<ExonSequence> exonSequenceList = new ArrayList<ExonSequence>();
044        boolean intronAdded = false; // need to deal with the problem that typically introns are not added when validating the list and adding in introns as the regions not included in exons
045        private Strand strand = Strand.UNDEFINED;
046        private ChromosomeSequence chromosomeSequence;
047    
048        /**
049         * A class that keeps track of the details of a GeneSequence which is difficult to properly model. Two important concepts that is difficult
050         * to make everything flexible but still work. You can have GFF features that only describe Exons or Exons/Introns or CDS regions and one
051         * or more Transcriptions. You can have exon sequences but that does not imply transcription to the actual protein.
052         *
053         * The GeneSequence will keep track of Exons and Introns but to get a Protein sequence you need to start with a
054         * TranscriptSequence and then add CDS sequences.
055         *
056         * This is also a key class in the biojava-3-genome module for reading and writing GFF3 files
057         *
058         * @param parentDNASequence
059         * @param begin
060         * @param end inclusive of end
061         * @param strand force a gene to have strand and transcription sequence will inherit
062         */
063        public GeneSequence(ChromosomeSequence parentSequence, int begin, int end, Strand strand) {
064            chromosomeSequence = parentSequence;
065            setParentSequence(parentSequence);
066            setBioBegin(begin);
067            setBioEnd(end);
068            setStrand(strand);
069        }
070    
071         /**
072          * The parent ChromosomeSequence which contains the actual DNA sequence data
073          * @return
074          */
075        public ChromosomeSequence getParentChromosomeSequence() {
076            return chromosomeSequence;
077        }
078    
079        /**
080         * Once everything has been added to the gene sequence where you might have added exon sequences only then you
081         * can infer the intron sequences and add them. You may also have the case where you only added one or more
082         * TranscriptSequences and from that you can infer the exon sequences and intron sequences.
083         * Currently not implement
084         */
085        public void validate() {
086            ExonComparator exonComparator = new ExonComparator();
087            //sort based on start position and sense;
088            Collections.sort(exonSequenceList, exonComparator);
089            if (intronAdded) {
090                log.severe(this.getAccession() + " has introns added which will not be handled properly trying to fill in introns gaps from validate method");
091            }
092    
093    
094            //    log.severe("Add in support for building introns based on added exons");
095    
096        }
097    
098        /**
099         * A gene should have Strand
100         * @return the strand
101         */
102        public Strand getStrand() {
103            return strand;
104        }
105    
106        /**
107         * @param strand the strand to set
108         */
109        public void setStrand(Strand strand) {
110            this.strand = strand;
111        }
112    
113        /**
114         * Get the transcript sequence by accession
115         * @param accession
116         * @return
117         */
118        public TranscriptSequence getTranscript(String accession) {
119            return transcriptSequenceHashMap.get(accession);
120        }
121    
122        /**
123         * Get the collection of transcription sequences assigned to this gene
124         * @return
125         */
126        public LinkedHashMap<String, TranscriptSequence> getTranscripts() {
127            return transcriptSequenceHashMap;
128        }
129    
130        /**
131         * Remove the transcript sequence from the gene
132         * @param accession
133         * @return
134         */
135        public TranscriptSequence removeTranscript(String accession) {
136    
137    
138            return transcriptSequenceHashMap.remove(accession);
139        }
140    
141        /**
142         * Add a transcription sequence to a gene which describes a ProteinSequence
143         * @param accession
144         * @param begin
145         * @param end
146         * @return
147         * @throws Exception If the accession id is already used
148         */
149        public TranscriptSequence addTranscript(AccessionID accession, int begin, int end) throws Exception {
150            if (transcriptSequenceHashMap.containsKey(accession.getID())) {
151                throw new Exception("Duplicate accesion id " + accession.getID());
152            }
153            TranscriptSequence transcriptSequence = new TranscriptSequence(this, begin, end);
154            transcriptSequence.setAccession(accession);
155            transcriptSequenceHashMap.put(accession.getID(), transcriptSequence);
156            return transcriptSequence;
157        }
158    
159        /**
160         * Remove the intron by accession
161         * @param accession
162         * @return
163         */
164        public IntronSequence removeIntron(String accession) {
165            for (IntronSequence intronSequence : intronSequenceList) {
166                if (intronSequence.getAccession().getID().equals(accession)) {
167                    intronSequenceList.remove(intronSequence);
168                    intronSequenceHashMap.remove(accession);
169                    return intronSequence;
170                }
171            }
172            return null;
173        }
174    
175        /**
176         * Add an Intron Currently used to mark an IntronSequence as a feature
177         * @param accession
178         * @param begin
179         * @param end
180         * @return
181         */
182        public IntronSequence addIntron(AccessionID accession, int begin, int end) throws Exception {
183            if (intronSequenceHashMap.containsKey(accession.getID())) {
184                throw new Exception("Duplicate accesion id " + accession.getID());
185            }
186            intronAdded = true;
187            IntronSequence intronSequence = new IntronSequence(this, begin, end); // working off the assumption that intron frame is always 0 or doesn't matter and same sense as parent
188            intronSequence.setAccession(accession);
189            intronSequenceList.add(intronSequence);
190            intronSequenceHashMap.put(accession.getID(), intronSequence);
191            return intronSequence;
192        }
193    
194        /**
195         * Remove the exon sequence
196         * @param accession
197         * @return
198         */
199        public ExonSequence removeExon(String accession) {
200            for (ExonSequence exonSequence : exonSequenceList) {
201                if (exonSequence.getAccession().getID().equals(accession)) {
202                    exonSequenceList.remove(exonSequence);
203                    exonSequenceHashMap.remove(accession);
204                    validate();
205                    return exonSequence;
206                }
207            }
208            return null;
209        }
210    
211        /**
212         * Add an ExonSequence mainly used to mark as a feature
213         * @param accession
214         * @param begin
215         * @param end
216         * @return
217         */
218        public ExonSequence addExon(AccessionID accession, int begin, int end) throws Exception {
219            if (exonSequenceHashMap.containsKey(accession.getID())) {
220                throw new Exception("Duplicate accesion id " + accession.getID());
221            }
222    
223            ExonSequence exonSequence = new ExonSequence(this, begin, end); //sense should be the same as parent
224            exonSequence.setAccession(accession);
225            exonSequenceList.add(exonSequence);
226            exonSequenceHashMap.put(accession.getID(), exonSequence);
227            validate();
228            return exonSequence;
229        }
230    
231        /**
232         * Get the exons as an ArrayList
233         * @return
234         */
235        public ArrayList<ExonSequence> getExonSequences(){
236            return exonSequenceList;
237        }
238    
239        /**
240         * Get the introns as an ArrayList
241         * @return
242         */
243        public ArrayList<IntronSequence> getIntronSequences(){
244            return intronSequenceList;
245        }
246    
247        /**
248         * Try to give method clarity where you want a DNASequence coding in the 5' to 3' direction
249         * Returns the DNASequence representative of the 5' and 3' reading based on strand
250         * @return
251         */
252    
253        public DNASequence getSequence5PrimeTo3Prime() {
254            String sequence = getSequenceAsString(this.getBioBegin(), this.getBioEnd(), this.getStrand());
255            if (getStrand() == Strand.NEGATIVE) {
256                //need to take complement of sequence because it is negative and we are returning the gene sequence from the opposite strand
257                StringBuilder b = new StringBuilder(getLength());
258                CompoundSet<NucleotideCompound> compoundSet = this.getCompoundSet();
259                for (int i = 0; i < sequence.length(); i++) {
260                    String nucleotide = sequence.charAt(i) + "";
261                    NucleotideCompound nucleotideCompound = compoundSet.getCompoundForString(nucleotide);
262                    b.append(nucleotideCompound.getComplement().getShortName());
263                }
264                sequence = b.toString();
265            }
266            DNASequence dnaSequence = new DNASequence(sequence.toUpperCase());
267            dnaSequence.setAccession(new AccessionID(this.getAccession().getID()));
268            return dnaSequence;
269        }
270    }