001    /*
002     *                    BioJava development code
003     *
004     * This code may be freely distributed and modified under the
005     * terms of the GNU Lesser General Public Licence.  This should
006     * be distributed with the code.  If you do not have a copy,
007     * see:
008     *
009     *      http://www.gnu.org/copyleft/lesser.html
010     *
011     * Copyright for this code is held jointly by the individual
012     * authors.  These should be listed in @author doc comments.
013     *
014     * For more information on the BioJava project and its aims,
015     * or to join the biojava-l mailing list, visit the home page
016     * at:
017     *
018     *      http://www.biojava.org/
019     *
020     * Created on 01-21-2010
021     */
022    package org.biojava3.core.sequence.transcription;
023    
024    import java.util.ArrayList;
025    import java.util.Arrays;
026    import java.util.HashMap;
027    import java.util.List;
028    import java.util.Map;
029    
030    import org.biojava3.core.sequence.RNASequence;
031    import org.biojava3.core.sequence.compound.AminoAcidCompound;
032    import org.biojava3.core.sequence.compound.NucleotideCompound;
033    import org.biojava3.core.sequence.io.template.SequenceCreatorInterface;
034    import org.biojava3.core.sequence.template.AbstractCompoundTranslator;
035    import org.biojava3.core.sequence.template.CompoundSet;
036    import org.biojava3.core.sequence.template.Sequence;
037    import org.biojava3.core.sequence.template.SequenceView;
038    import org.biojava3.core.sequence.transcription.Table.Codon;
039    import org.biojava3.core.sequence.views.WindowedSequence;
040    
041    /**
042     * Takes a {@link Sequence} of {@link NucleotideCompound} which should
043     * represent an RNA sequence ({@link RNASequence} is good for this) and returns
044     * a list of {@link Sequence} which hold {@link AminoAcidCompound}. The
045     * translator can also trim stop codons as well as changing any valid
046     * start codon to an initiating met.
047     *
048     * @author ayates
049     */
050    public class RNAToAminoAcidTranslator extends AbstractCompoundTranslator<NucleotideCompound, AminoAcidCompound> {
051    
052        private final boolean trimStops;
053        private final boolean initMetOnly;
054        private final Map<Table.CaseInsensitiveTriplet, Codon> quickLookup;
055        private final Map<AminoAcidCompound, List<Codon>> aminoAcidToCodon;
056        //Cheeky lookup which uses a hashing value; key is to switch to using this all the time
057        private final Codon[] codonArray = new Codon[64000];
058        private final AminoAcidCompound unknownAminoAcidCompound;
059        private final boolean translateNCodons;
060    
061        public RNAToAminoAcidTranslator(
062                SequenceCreatorInterface<AminoAcidCompound> creator,
063                CompoundSet<NucleotideCompound> nucleotides, CompoundSet<Codon> codons,
064                CompoundSet<AminoAcidCompound> aminoAcids, Table table,
065                boolean trimStops, boolean initMetOnly, boolean translateNCodons) {
066    
067            super(creator, nucleotides, aminoAcids);
068            this.trimStops = trimStops;
069            this.initMetOnly = initMetOnly;
070            this.translateNCodons = translateNCodons;
071    
072            quickLookup = new HashMap<Table.CaseInsensitiveTriplet, Codon>(codons.getAllCompounds().size());
073            aminoAcidToCodon = new HashMap<AminoAcidCompound, List<Codon>>();
074    
075            List<Codon> codonList = table.getCodons(nucleotides, aminoAcids);
076            for (Codon codon : codonList) {
077                quickLookup.put(codon.getTriplet(), codon);
078                codonArray[codon.getTriplet().intValue()] = codon;
079                
080                List<Codon> codonL = aminoAcidToCodon.get(codon.getAminoAcid());
081                if ( codonL == null){
082                    codonL = new ArrayList<Codon>();
083                    aminoAcidToCodon.put(codon.getAminoAcid(), codonL);
084                }
085                codonL.add(codon);
086                
087            }
088    
089            unknownAminoAcidCompound = aminoAcids.getCompoundForString("X");
090        }
091    
092        /**
093         * Performs the core conversion of RNA to Peptide. It does this by walking
094         * a windowed version of the given sequence. Any trailing DNA base pairs
095         * are ignored according to the specification of {@link WindowedSequence}.
096         */
097        @Override
098        public List<Sequence<AminoAcidCompound>> createSequences(
099                Sequence<NucleotideCompound> originalSequence) {
100    
101            List<List<AminoAcidCompound>> workingList = new ArrayList<List<AminoAcidCompound>>();
102            
103            Iterable<SequenceView<NucleotideCompound>> iter =
104                new WindowedSequence<NucleotideCompound>(originalSequence, 3);
105                    
106            for (SequenceView<NucleotideCompound> element : iter) {
107                AminoAcidCompound aminoAcid;
108    
109                int i =1;
110                Table.CaseInsensitiveTriplet triplet = new Table.CaseInsensitiveTriplet(
111                  element.getCompoundAt(i++), element.getCompoundAt(i++), element.getCompoundAt(i++));
112    
113                Codon target;
114    
115                int arrayIndex = triplet.intValue();
116                //So long as we're within range then access
117                if(arrayIndex > -1 && arrayIndex < codonArray.length) {
118                    target = codonArray[arrayIndex];
119                    aminoAcid = target.getAminoAcid();
120                }
121                //Otherwise we have to use the Map
122                else {
123                    target = quickLookup.get(triplet);
124                    aminoAcid = target.getAminoAcid();
125                }
126                
127                if(aminoAcid == null && translateNCodons()) {
128                    aminoAcid = unknownAminoAcidCompound;
129                }
130                addCompoundsToList(Arrays.asList(aminoAcid), workingList);
131            }
132    
133            postProcessCompoundLists(workingList);
134    
135            return workingListToSequences(workingList);
136        }
137    
138        /**
139         * Performs the trimming of stop codons and the conversion of a valid start
140         * amino acid to M
141         */
142        @Override
143        protected void postProcessCompoundLists(
144                List<List<AminoAcidCompound>> compoundLists) {
145            for (List<AminoAcidCompound> compounds : compoundLists) {
146                if (initMetOnly) {
147                    initMet(compounds);
148                }
149                if (trimStops) {
150                    trimStop(compounds);
151                }
152            }
153        }
154    
155        private void initMet(List<AminoAcidCompound> sequence) {
156            AminoAcidCompound initMet = getToCompoundSet().getCompoundForString("M");
157            AminoAcidCompound start = sequence.get(0);
158            boolean isStart = false;
159            for (Codon c : aminoAcidToCodon.get(start)) {
160                if (c.isStart()) {
161                    isStart = true;
162                    break;
163                }
164            }
165    
166            if (isStart) {
167                sequence.set(0, initMet);
168            }
169        }
170    
171        /**
172         * Imperfect code. Checks the last amino acid to see if a codon could
173         * have translated a stop for it. Left in for the moment
174         */
175        protected void trimStop(List<AminoAcidCompound> sequence) {
176            AminoAcidCompound stop = sequence.get(sequence.size() - 1);
177            boolean isStop = false;
178            for (Codon c : aminoAcidToCodon.get(stop)) {
179                if (c.isStop()) {
180                    isStop = true;
181                    break;
182                }
183            }
184    
185            if (isStop) {
186                sequence.remove(sequence.size() - 1);
187            }
188        }
189    
190        /**
191         * Indicates if we want to force exact translation of compounds or not i.e.
192         * those with internal N RNA bases. This will cause a translation to an
193         * X amino acid
194         */
195        public boolean translateNCodons() {
196            return translateNCodons;
197        }
198    }