001    /*
002     *                    BioJava development code
003     *
004     * This code may be freely distributed and modified under the
005     * terms of the GNU Lesser General Public Licence.  This should
006     * be distributed with the code.  If you do not have a copy,
007     * see:
008     *
009     *      http://www.gnu.org/copyleft/lesser.html
010     *
011     * Copyright for this code is held jointly by the individual
012     * authors.  These should be listed in @author doc comments.
013     *
014     * For more information on the BioJava project and its aims,
015     * or to join the biojava-l mailing list, visit the home page
016     * at:
017     *
018     *      http://www.biojava.org/
019     *
020     * Created on 01-21-2010
021     */
022    
023    package org.biojava3.core.sequence.compound;
024    
025    import java.util.ArrayList;
026    import java.util.Collections;
027    import java.util.HashMap;
028    import java.util.HashSet;
029    import java.util.List;
030    import java.util.Map;
031    import java.util.Set;
032    
033    import org.biojava3.core.exceptions.CompoundNotFoundError;
034    import org.biojava3.core.sequence.template.CompoundSet;
035    import org.biojava3.core.sequence.template.Sequence;
036    
037    /**
038     * Set of proteinogenic amino acids.  Molecular weights are recorded in daltons (Da) as residues of a chain; monomers
039     * outside of a chain would likely have an additional mass of 18.01524 Da contributed by an associated water molecule.
040     *
041     * Currently we have different symbols to handle inserts so not as clean as it should be
042     *
043     * @author Richard Holland
044     * @author Scooter Willis
045     * @author Mark Chapman
046     */
047    public class AminoAcidCompoundSet implements CompoundSet<AminoAcidCompound> {
048    
049        private final Map<String, AminoAcidCompound> aminoAcidCompoundCache = new HashMap<String, AminoAcidCompound>();
050        private final Map<AminoAcidCompound, Set<AminoAcidCompound>> equivalentsCache =
051                new HashMap<AminoAcidCompound, Set<AminoAcidCompound>>();
052    
053        public AminoAcidCompoundSet() {
054            aminoAcidCompoundCache.put("A", new AminoAcidCompound(this, "A", "Ala", "Alanine", 71.0788f));
055            aminoAcidCompoundCache.put("R", new AminoAcidCompound(this, "R", "Arg", "Arginine", 156.1875f));
056            aminoAcidCompoundCache.put("N", new AminoAcidCompound(this, "N", "Asn", "Asparagine", 114.1039f));
057            aminoAcidCompoundCache.put("D", new AminoAcidCompound(this, "D", "Asp", "Aspartic acid", 115.0886f));
058            aminoAcidCompoundCache.put("C", new AminoAcidCompound(this, "C", "Cys", "Cysteine", 103.1388f));
059            aminoAcidCompoundCache.put("E", new AminoAcidCompound(this, "E", "Glu", "Glutamic acid", 129.1155f));
060            aminoAcidCompoundCache.put("Q", new AminoAcidCompound(this, "Q", "Gln", "Glutamine", 128.1307f));
061            aminoAcidCompoundCache.put("G", new AminoAcidCompound(this, "G", "Gly", "Glycine", 57.0519f));
062            aminoAcidCompoundCache.put("H", new AminoAcidCompound(this, "H", "His", "Histidine", 137.1411f));
063            aminoAcidCompoundCache.put("I", new AminoAcidCompound(this, "I", "Ile", "Isoleucine", 113.1594f));
064            aminoAcidCompoundCache.put("L", new AminoAcidCompound(this, "L", "Leu", "Leucine", 113.1594f));
065            aminoAcidCompoundCache.put("K", new AminoAcidCompound(this, "K", "Lys", "Lysine", 128.1741f));
066            aminoAcidCompoundCache.put("M", new AminoAcidCompound(this, "M", "Met", "Methionine", 131.1986f));
067            aminoAcidCompoundCache.put("F", new AminoAcidCompound(this, "F", "Phe", "Phenylalanine", 147.1766f));
068            aminoAcidCompoundCache.put("P", new AminoAcidCompound(this, "P", "Pro", "Proline", 97.1167f));
069            aminoAcidCompoundCache.put("S", new AminoAcidCompound(this, "S", "Ser", "Serine", 87.0782f));
070            aminoAcidCompoundCache.put("T", new AminoAcidCompound(this, "T", "Thr", "Threonine", 101.1051f));
071            aminoAcidCompoundCache.put("W", new AminoAcidCompound(this, "W", "Trp", "Tryptophan", 186.2132f));
072            aminoAcidCompoundCache.put("Y", new AminoAcidCompound(this, "Y", "Tyr", "Tyrosine", 163.1760f));
073            aminoAcidCompoundCache.put("V", new AminoAcidCompound(this, "V", "Val", "Valine", 99.1326f));
074            aminoAcidCompoundCache.put("B", new AminoAcidCompound(this, "B", "Asx", "Asparagine or Aspartic acid", null));
075            aminoAcidCompoundCache.put("Z", new AminoAcidCompound(this, "Z", "Glx", "Glutamine or Glutamic acid", null));
076            aminoAcidCompoundCache.put("J", new AminoAcidCompound(this, "J", "Xle", "Leucine or Isoleucine", null));
077            aminoAcidCompoundCache.put("X", new AminoAcidCompound(this, "X", "Xaa", "Unspecified", null));
078            aminoAcidCompoundCache.put("-", new AminoAcidCompound(this, "-", "---", "Unspecified", null));
079            aminoAcidCompoundCache.put(".", new AminoAcidCompound(this, ".", "...", "Unspecified", null));
080            aminoAcidCompoundCache.put("_", new AminoAcidCompound(this, "_", "___", "Unspecified", null));
081            aminoAcidCompoundCache.put("*", new AminoAcidCompound(this, "*", "***", "Stop", null));
082    
083            //Selenocysteine - this is encoded by UGA with the presence
084            //of a SECIS element (SElenoCysteine Insertion Sequence) in the mRNA
085            //and is a post-translation modification
086            aminoAcidCompoundCache.put("U", new AminoAcidCompound(this, "U", "Sec", "Selenocysteine", 150.0388f));
087    
088            //Pyrrolysine is encoded by UAG in mRNA (normally Amber stop codon) which is translated to
089            //this amino acid under the presence of pylT which creates an anti-codon CUA & pylS
090            //which then does the actual conversion to Pyl.
091            aminoAcidCompoundCache.put("O", new AminoAcidCompound(this, "O", "Pyl", "Pyrrolysine", 255.3172f));
092        }
093    
094        public String getStringForCompound(AminoAcidCompound compound) {
095            return compound.toString();
096        }
097    
098        public AminoAcidCompound getCompoundForString(String string) {
099            if (string.length() == 0) {
100                return null;
101            }
102            if (string.length() > this.getMaxSingleCompoundStringLength()) {
103                throw new IllegalArgumentException("String supplied ("+string+") is too long. Max is "+getMaxSingleCompoundStringLength());
104            }
105            return this.aminoAcidCompoundCache.get(string);
106        }
107    
108        public int getMaxSingleCompoundStringLength() {
109            return 1;
110        }
111    
112    
113        public boolean isCompoundStringLengthEqual() {
114            return true;
115        }
116    
117        private final static AminoAcidCompoundSet aminoAcidCompoundSet = new AminoAcidCompoundSet();
118    
119        public static AminoAcidCompoundSet getAminoAcidCompoundSet() {
120            return aminoAcidCompoundSet;
121        }
122    
123        public boolean compoundsEquivalent(AminoAcidCompound compoundOne, AminoAcidCompound compoundTwo) {
124            Set<AminoAcidCompound> equivalents = getEquivalentCompounds(compoundOne);
125            return (equivalents == null) ? false : equivalents.contains(compoundTwo);
126        }
127    
128        public Set<AminoAcidCompound> getEquivalentCompounds(AminoAcidCompound compound) {
129            if (equivalentsCache.isEmpty()) {
130                // most compounds are equivalent to themselves alone
131                for (AminoAcidCompound c : aminoAcidCompoundCache.values()) {
132                    equivalentsCache.put(c, Collections.singleton(c));
133                }
134                // ambiguous Asparagine or Aspartic acid
135                addAmbiguousEquivalents("N", "D", "B");
136                // ambiguous Glutamine or Glutamic acid
137                addAmbiguousEquivalents("E", "Q", "Z");
138                // ambiguous Leucine or Isoleucine
139                addAmbiguousEquivalents("I", "L", "J");
140                // ambiguous gaps
141                AminoAcidCompound gap1, gap2, gap3;
142                Set<AminoAcidCompound> gaps = new HashSet<AminoAcidCompound>();
143                gaps.add(gap1 = aminoAcidCompoundCache.get("-"));
144                gaps.add(gap2 = aminoAcidCompoundCache.get("."));
145                gaps.add(gap3 = aminoAcidCompoundCache.get("_"));
146                equivalentsCache.put(gap1, gaps);
147                equivalentsCache.put(gap2, gaps);
148                equivalentsCache.put(gap3, gaps);
149                // X is never equivalent, even to itself
150                equivalentsCache.put(aminoAcidCompoundCache.get("X"), new HashSet<AminoAcidCompound>());
151            }
152            return equivalentsCache.get(compound);
153        }
154    
155        // helper method to initialize the equivalent sets for 2 amino acid compounds and their ambiguity compound
156        private void addAmbiguousEquivalents(String one, String two, String either) {
157            Set<AminoAcidCompound> equivalents;
158            AminoAcidCompound cOne, cTwo, cEither;
159    
160            equivalents = new HashSet<AminoAcidCompound>();
161            equivalents.add(cOne = aminoAcidCompoundCache.get(one));
162            equivalents.add(cTwo = aminoAcidCompoundCache.get(two));
163            equivalents.add(cEither = aminoAcidCompoundCache.get(either));
164            equivalentsCache.put(cEither, equivalents);
165    
166            equivalents = new HashSet<AminoAcidCompound>();
167            equivalents.add(cOne);
168            equivalents.add(cEither);
169            equivalentsCache.put(cOne, equivalents);
170    
171            equivalents = new HashSet<AminoAcidCompound>();
172            equivalents.add(cTwo);
173            equivalents.add(cEither);
174            equivalentsCache.put(cTwo, equivalents);
175        }
176    
177        public boolean hasCompound(AminoAcidCompound compound) {
178            return aminoAcidCompoundCache.containsValue(compound);
179        }
180    
181        // TODO throwing an error seems unnecessary, should this return a boolean instead? maybe rename to isValidSequence?
182        public void verifySequence(Sequence<AminoAcidCompound> sequence) throws CompoundNotFoundError {
183            for (AminoAcidCompound compound : sequence) {
184                if (!hasCompound(compound)) {
185                    throw new CompoundNotFoundError("Compound (" + compound + ") not found in AminoAcidCompoundSet.");
186                }
187            }
188        }
189    
190        public List<AminoAcidCompound> getAllCompounds() {
191            return new ArrayList<AminoAcidCompound>(aminoAcidCompoundCache.values());
192        }
193    
194    
195        public boolean isComplementable() {
196            return false;
197        }
198    }