001 /*
002 * BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence. This should
006 * be distributed with the code. If you do not have a copy,
007 * see:
008 *
009 * http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors. These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 * http://www.biojava.org/
019 *
020 * Created on DATE
021 *
022 */
023 package org.biojava3.core.sequence;
024
025 import java.util.ArrayList;
026 import java.util.Collections;
027 import java.util.LinkedHashMap;
028 import java.util.logging.Logger;
029 import org.biojava3.core.sequence.compound.NucleotideCompound;
030 import org.biojava3.core.sequence.template.CompoundSet;
031
032 /**
033 *
034 * @author Scooter Willis
035 */
036 public class GeneSequence extends DNASequence {
037
038 private final LinkedHashMap<String, TranscriptSequence> transcriptSequenceHashMap = new LinkedHashMap<String, TranscriptSequence>();
039 private static final Logger log = Logger.getLogger(GeneSequence.class.getName());
040 private final LinkedHashMap<String, IntronSequence> intronSequenceHashMap = new LinkedHashMap<String, IntronSequence>();
041 private final LinkedHashMap<String, ExonSequence> exonSequenceHashMap = new LinkedHashMap<String, ExonSequence>();
042 private final ArrayList<IntronSequence> intronSequenceList = new ArrayList<IntronSequence>();
043 private final ArrayList<ExonSequence> exonSequenceList = new ArrayList<ExonSequence>();
044 boolean intronAdded = false; // need to deal with the problem that typically introns are not added when validating the list and adding in introns as the regions not included in exons
045 private Strand strand = Strand.UNDEFINED;
046 private ChromosomeSequence chromosomeSequence;
047
048 /**
049 * A class that keeps track of the details of a GeneSequence which is difficult to properly model. Two important concepts that is difficult
050 * to make everything flexible but still work. You can have GFF features that only describe Exons or Exons/Introns or CDS regions and one
051 * or more Transcriptions. You can have exon sequences but that does not imply transcription to the actual protein.
052 *
053 * The GeneSequence will keep track of Exons and Introns but to get a Protein sequence you need to start with a
054 * TranscriptSequence and then add CDS sequences.
055 *
056 * This is also a key class in the biojava-3-genome module for reading and writing GFF3 files
057 *
058 * @param parentDNASequence
059 * @param begin
060 * @param end inclusive of end
061 * @param strand force a gene to have strand and transcription sequence will inherit
062 */
063 public GeneSequence(ChromosomeSequence parentSequence, int begin, int end, Strand strand) {
064 chromosomeSequence = parentSequence;
065 setParentSequence(parentSequence);
066 setBioBegin(begin);
067 setBioEnd(end);
068 setStrand(strand);
069 }
070
071 /**
072 * The parent ChromosomeSequence which contains the actual DNA sequence data
073 * @return
074 */
075 public ChromosomeSequence getParentChromosomeSequence() {
076 return chromosomeSequence;
077 }
078
079 /**
080 * Once everything has been added to the gene sequence where you might have added exon sequences only then you
081 * can infer the intron sequences and add them. You may also have the case where you only added one or more
082 * TranscriptSequences and from that you can infer the exon sequences and intron sequences.
083 * Currently not implement
084 */
085 public void validate() {
086 ExonComparator exonComparator = new ExonComparator();
087 //sort based on start position and sense;
088 Collections.sort(exonSequenceList, exonComparator);
089 if (intronAdded) {
090 log.severe(this.getAccession() + " has introns added which will not be handled properly trying to fill in introns gaps from validate method");
091 }
092
093
094 // log.severe("Add in support for building introns based on added exons");
095
096 }
097
098 /**
099 * A gene should have Strand
100 * @return the strand
101 */
102 public Strand getStrand() {
103 return strand;
104 }
105
106 /**
107 * @param strand the strand to set
108 */
109 public void setStrand(Strand strand) {
110 this.strand = strand;
111 }
112
113 /**
114 * Get the transcript sequence by accession
115 * @param accession
116 * @return
117 */
118 public TranscriptSequence getTranscript(String accession) {
119 return transcriptSequenceHashMap.get(accession);
120 }
121
122 /**
123 * Get the collection of transcription sequences assigned to this gene
124 * @return
125 */
126 public LinkedHashMap<String, TranscriptSequence> getTranscripts() {
127 return transcriptSequenceHashMap;
128 }
129
130 /**
131 * Remove the transcript sequence from the gene
132 * @param accession
133 * @return
134 */
135 public TranscriptSequence removeTranscript(String accession) {
136
137
138 return transcriptSequenceHashMap.remove(accession);
139 }
140
141 /**
142 * Add a transcription sequence to a gene which describes a ProteinSequence
143 * @param accession
144 * @param begin
145 * @param end
146 * @return
147 * @throws Exception If the accession id is already used
148 */
149 public TranscriptSequence addTranscript(AccessionID accession, int begin, int end) throws Exception {
150 if (transcriptSequenceHashMap.containsKey(accession.getID())) {
151 throw new Exception("Duplicate accesion id " + accession.getID());
152 }
153 TranscriptSequence transcriptSequence = new TranscriptSequence(this, begin, end);
154 transcriptSequence.setAccession(accession);
155 transcriptSequenceHashMap.put(accession.getID(), transcriptSequence);
156 return transcriptSequence;
157 }
158
159 /**
160 * Remove the intron by accession
161 * @param accession
162 * @return
163 */
164 public IntronSequence removeIntron(String accession) {
165 for (IntronSequence intronSequence : intronSequenceList) {
166 if (intronSequence.getAccession().getID().equals(accession)) {
167 intronSequenceList.remove(intronSequence);
168 intronSequenceHashMap.remove(accession);
169 return intronSequence;
170 }
171 }
172 return null;
173 }
174
175 /**
176 * Add an Intron Currently used to mark an IntronSequence as a feature
177 * @param accession
178 * @param begin
179 * @param end
180 * @return
181 */
182 public IntronSequence addIntron(AccessionID accession, int begin, int end) throws Exception {
183 if (intronSequenceHashMap.containsKey(accession.getID())) {
184 throw new Exception("Duplicate accesion id " + accession.getID());
185 }
186 intronAdded = true;
187 IntronSequence intronSequence = new IntronSequence(this, begin, end); // working off the assumption that intron frame is always 0 or doesn't matter and same sense as parent
188 intronSequence.setAccession(accession);
189 intronSequenceList.add(intronSequence);
190 intronSequenceHashMap.put(accession.getID(), intronSequence);
191 return intronSequence;
192 }
193
194 /**
195 * Remove the exon sequence
196 * @param accession
197 * @return
198 */
199 public ExonSequence removeExon(String accession) {
200 for (ExonSequence exonSequence : exonSequenceList) {
201 if (exonSequence.getAccession().getID().equals(accession)) {
202 exonSequenceList.remove(exonSequence);
203 exonSequenceHashMap.remove(accession);
204 validate();
205 return exonSequence;
206 }
207 }
208 return null;
209 }
210
211 /**
212 * Add an ExonSequence mainly used to mark as a feature
213 * @param accession
214 * @param begin
215 * @param end
216 * @return
217 */
218 public ExonSequence addExon(AccessionID accession, int begin, int end) throws Exception {
219 if (exonSequenceHashMap.containsKey(accession.getID())) {
220 throw new Exception("Duplicate accesion id " + accession.getID());
221 }
222
223 ExonSequence exonSequence = new ExonSequence(this, begin, end); //sense should be the same as parent
224 exonSequence.setAccession(accession);
225 exonSequenceList.add(exonSequence);
226 exonSequenceHashMap.put(accession.getID(), exonSequence);
227 validate();
228 return exonSequence;
229 }
230
231 /**
232 * Get the exons as an ArrayList
233 * @return
234 */
235 public ArrayList<ExonSequence> getExonSequences(){
236 return exonSequenceList;
237 }
238
239 /**
240 * Get the introns as an ArrayList
241 * @return
242 */
243 public ArrayList<IntronSequence> getIntronSequences(){
244 return intronSequenceList;
245 }
246
247 /**
248 * Try to give method clarity where you want a DNASequence coding in the 5' to 3' direction
249 * Returns the DNASequence representative of the 5' and 3' reading based on strand
250 * @return
251 */
252
253 public DNASequence getSequence5PrimeTo3Prime() {
254 String sequence = getSequenceAsString(this.getBioBegin(), this.getBioEnd(), this.getStrand());
255 if (getStrand() == Strand.NEGATIVE) {
256 //need to take complement of sequence because it is negative and we are returning the gene sequence from the opposite strand
257 StringBuilder b = new StringBuilder(getLength());
258 CompoundSet<NucleotideCompound> compoundSet = this.getCompoundSet();
259 for (int i = 0; i < sequence.length(); i++) {
260 String nucleotide = sequence.charAt(i) + "";
261 NucleotideCompound nucleotideCompound = compoundSet.getCompoundForString(nucleotide);
262 b.append(nucleotideCompound.getComplement().getShortName());
263 }
264 sequence = b.toString();
265 }
266 DNASequence dnaSequence = new DNASequence(sequence.toUpperCase());
267 dnaSequence.setAccession(new AccessionID(this.getAccession().getID()));
268 return dnaSequence;
269 }
270 }