001    /*
002     *                    BioJava development code
003     *
004     * This code may be freely distributed and modified under the
005     * terms of the GNU Lesser General Public Licence.  This should
006     * be distributed with the code.  If you do not have a copy,
007     * see:
008     *
009     *      http://www.gnu.org/copyleft/lesser.html
010     *
011     * Copyright for this code is held jointly by the individual
012     * authors.  These should be listed in @author doc comments.
013     *
014     * For more information on the BioJava project and its aims,
015     * or to join the biojava-l mailing list, visit the home page
016     * at:
017     *
018     *      http://www.biojava.org/
019     *
020     * Created on 01-21-2010
021     */
022    package org.biojava3.core.sequence.io;
023    
024    import java.io.InputStream;
025    import java.util.ArrayList;
026    import java.util.Arrays;
027    import java.util.HashMap;
028    import java.util.List;
029    import java.util.Map;
030    
031    import org.biojava3.core.exceptions.ParserException;
032    import org.biojava3.core.sequence.compound.AminoAcidCompound;
033    import org.biojava3.core.sequence.compound.NucleotideCompound;
034    import org.biojava3.core.sequence.io.util.ClasspathResource;
035    import org.biojava3.core.sequence.io.util.IOUtils;
036    import org.biojava3.core.sequence.template.AbstractCompoundSet;
037    import org.biojava3.core.sequence.template.CompoundSet;
038    import org.biojava3.core.sequence.transcription.Table;
039    
040    
041    /**
042     * Available translations
043     *
044     * <ul>
045     * <li>1 - UNIVERSAL</li>
046     * <li>2 - VERTEBRATE_MITOCHONDRIAL</li>
047     * <li>3 - YEAST_MITOCHONDRIAL</li>
048     * <li>4 - MOLD_MITOCHONDRIAL</li>
049     * <li>5 - INVERTEBRATE_MITOCHONDRIAL</li>
050     * <li>6 - CILIATE_NUCLEAR</li>
051     * <li>9 - ECHINODERM_MITOCHONDRIAL</li>
052     * <li>10 - EUPLOTID_NUCLEAR</li>
053     * <li>11 - BACTERIAL</li>
054     * <li>12 - ALTERNATIVE_YEAST_NUCLEAR</li>
055     * <li>13 - ASCIDIAN_MITOCHONDRIAL</li>
056     * <li>14 - FLATWORM_MITOCHONDRIAL</li>
057     * <li>15 - BLEPHARISMA_MACRONUCLEAR</li>
058     * <li>16 - 2CHLOROPHYCEAN_MITOCHONDRIAL</li>
059     * <li>21 - TREMATODE_MITOCHONDRIAL</li>
060     * <li>23 - SCENEDESMUS_MITOCHONDRIAL</li>
061     * </ul>
062     *
063     * Taken from <a
064     * href="http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c"
065     * >NCBI</a> with slight modification and put into the classpath resource.
066     *
067     * Takes in an ID, name, amino acid string and the locations of amino acids
068     * which acts as start codons in the translation table. You can give the 3 codon
069     * position strings that correspond to the amino acid string or if you are using
070     * the default IUPAC codes you can use the hardcoded ones which are consistent
071     * amongst all <a
072     * href="http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c"> codon
073     * tables</a>.
074     *
075     * The generated {@link IUPACTable} objects do not parse the data further until
076     * requested so if you do not use a translation table your only penalty is the
077     * loading of the IUPAC data from the classpath.
078     *
079     * @author Andy Yates
080     */
081    public class IUPACParser {
082    
083      private static class IOD {
084        public static final IUPACParser INSTANCE = new IUPACParser();
085      }
086    
087      public static IUPACParser getInstance() {
088        return IOD.INSTANCE;
089      }
090    
091      public static final String      IUPAC_LOCATION = "org/biojava3/core/sequence/iupac.txt";
092    
093      private InputStream              is;
094      private List<IUPACTable>         tables;
095      private Map<String, IUPACTable>  nameLookup;
096      private Map<Integer, IUPACTable> idLookup;
097    
098      /**
099       * Default version and uses the classpath based IUPAC table
100       */
101      public IUPACParser() {
102        //use the preCache version to make sure we don't keep a IO handle open
103        is = new ClasspathResource(IUPAC_LOCATION, true).getInputStream();
104      }
105    
106      /**
107       * Allows you to specify a different IUPAC table.
108       */
109      public IUPACParser(InputStream is) {
110        this.is = is;
111      }
112    
113      /**
114       * Returns a list of all available IUPAC tables
115       */
116      public List<IUPACTable> getTables() {
117        if (tables == null) {
118          tables = parseTables();
119        }
120        return tables;
121      }
122    
123      /**
124       * Returns a table by its name
125       */
126      public IUPACTable getTable(String name) {
127        populateLookups();
128        return nameLookup.get(name);
129      }
130    
131      /**
132       * Returns a table by its identifier i.e. 1 means universal codon tables
133       */
134      public IUPACTable getTable(Integer id) {
135        populateLookups();
136        return idLookup.get(id);
137      }
138    
139      private void populateLookups() {
140        if(nameLookup == null) {
141          nameLookup = new HashMap<String, IUPACTable>();
142          idLookup = new HashMap<Integer, IUPACTable>();
143          for(IUPACTable t: getTables()) {
144            nameLookup.put(t.getName(), t);
145            idLookup.put(t.getId(), t);
146          }
147        }
148      }
149    
150      private List<IUPACTable> parseTables() {
151        List<IUPACTable> localTables = new ArrayList<IUPACTable>();
152        List<String> lines = IOUtils.getList(is);
153        Integer id = null;
154        String name, aa, starts, baseone, basetwo, basethree;
155        name = aa = starts = baseone = basetwo = basethree = null;
156        for (String line : lines) {
157          if (line.equalsIgnoreCase("//")) {
158            localTables.add(new IUPACTable(name, id, aa, starts, baseone, basetwo,
159                basethree));
160            name = aa = starts = baseone = basetwo = basethree = null;
161            id = null;
162          }
163          else {
164            String[] keyValue = line.split("\\s*=\\s*");
165            if (keyValue[0].equals("AAs")) {
166              aa = keyValue[1];
167            }
168            else if (keyValue[0].equals("Starts")) {
169              starts = keyValue[1];
170            }
171            else if (keyValue[0].equals("Base1")) {
172              baseone = keyValue[1];
173            }
174            else if (keyValue[0].equals("Base2")) {
175              basetwo = keyValue[1];
176            }
177            else if (keyValue[0].equals("Base3")) {
178              basethree = keyValue[1];
179            }
180            else {
181              name = keyValue[0];
182              id = Integer.parseInt(keyValue[1]);
183            }
184          }
185        }
186    
187        return localTables;
188      }
189    
190      /**
191       * Holds the concept of a codon table from the IUPAC format
192       *
193       * @author Andy Yates
194       */
195      public static class IUPACTable implements Table {
196    
197        private final Integer      id;
198        private final String       name;
199        private final String       aminoAcidString;
200        private final String       startCodons;
201        private final String       baseOne;
202        private final String       baseTwo;
203        private final String       baseThree;
204    
205        private final List<Codon>  codons    = new ArrayList<Codon>();
206        private CompoundSet<Codon> compounds = null;
207    
208        public IUPACTable(String name, int id, String aminoAcidString,
209            String startCodons, String baseOne, String baseTwo, String baseThree) {
210          this.aminoAcidString = aminoAcidString;
211          this.startCodons = startCodons;
212          this.name = name;
213          this.id = id;
214          this.baseOne = baseOne;
215          this.baseTwo = baseTwo;
216          this.baseThree = baseThree;
217        }
218    
219        /**
220         * Constructor which uses the basic IUPAC codon table format. Useful
221         * if you need to specify your own IUPAC table with minimal
222         * definitions from your side.
223         */
224        public IUPACTable(String name, Integer id, String aminoAcidString,
225            String startCodons) {
226          this(name, id, aminoAcidString, startCodons,
227              "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
228              "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
229              "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG");
230        }
231    
232        public Integer getId() {
233          return id;
234        }
235    
236        public String getName() {
237          return name;
238        }
239    
240        /**
241         * Returns true if the given compound was a start codon in this
242         * codon table. This will report true if the compound could ever have
243         * been a start codon.
244         *
245         * @throws IllegalStateException Thrown if
246         * {@link #getCodons(CompoundSet, CompoundSet)} was not called first.
247         */
248            @Override
249        public boolean isStart(AminoAcidCompound compound) throws IllegalStateException {
250          if(this.codons.isEmpty()) {
251            throw new IllegalStateException("Codons are empty; please request getCodons() fist before asking this");
252          }
253          for(Codon codon: codons) {
254            //Only check if the codon was a start codon and then ask if the compound was encoded by it
255            if(codon.isStart()) {
256              if(codon.getAminoAcid().equalsIgnoreCase(compound)) {
257                return true;
258              }
259            }
260          }
261          return false;
262        }
263    
264        /**
265         * Returns a list of codons where the source and target compounds
266         * are the same as those given by the parameters.
267         *
268         * @param nucleotides The nucleotide set to use when building BioJava 
269         * representations of codons
270         * @param aminoAcids The target amino acid compounds objects
271         */
272            @Override
273        public List<Codon> getCodons(CompoundSet<NucleotideCompound> nucelotides,
274            CompoundSet<AminoAcidCompound> aminoAcids) {
275    
276          if (this.codons.isEmpty()) {
277            List<String> aminoAcidStrings = aminoAcids();
278            List<String> startCodonStrings = startCodons();
279            List<List<String>> codonStrings = codonStrings();
280    
281            for (int i = 0; i < aminoAcidStrings.size(); i++) {
282    
283              List<String> codonString    = codonStrings.get(i);
284              NucleotideCompound one      = getCompound(codonString, 0, nucelotides);
285              NucleotideCompound two      = getCompound(codonString, 1, nucelotides);
286              NucleotideCompound three    = getCompound(codonString, 2, nucelotides);
287              boolean start               = ("M".equals(startCodonStrings.get(i)));
288              boolean stop                = ("*".equals(aminoAcidStrings.get(i)));
289              AminoAcidCompound aminoAcid = aminoAcids
290                  .getCompoundForString(aminoAcidStrings.get(i));
291              codons.add(new Codon(new CaseInsensitiveTriplet(one, two, three), aminoAcid, start, stop));
292            }
293          }
294    
295          return codons;
296        }
297    
298        private NucleotideCompound getCompound(List<String> compounds,
299            int position, CompoundSet<NucleotideCompound> nucelotides) {
300          String compound = compounds.get(position);
301          NucleotideCompound returnCompound = nucelotides
302              .getCompoundForString(compound);
303          if (returnCompound == null) {
304            if ("T".equalsIgnoreCase(compound)) {
305                returnCompound = nucelotides.getCompoundForString("U");
306            }
307            else {
308              throw new ParserException("Cannot find a compound for string "
309                  + compound);
310            }
311          }
312          return returnCompound;
313        }
314    
315        /**
316         * Returns the compound set of codons
317         */
318        public CompoundSet<Codon> getCodonCompoundSet(
319            final CompoundSet<NucleotideCompound> rnaCompounds,
320            final CompoundSet<AminoAcidCompound> aminoAcidCompounds) {
321          if (compounds == null) {
322            compounds = new AbstractCompoundSet<Codon>() {
323              {
324                for (Codon c : getCodons(rnaCompounds, aminoAcidCompounds)) {
325                  addCompound(c);
326                }
327              }
328            };
329          }
330          return compounds;
331        }
332    
333        private List<List<String>> codonStrings() {
334          List<List<String>> codons = new ArrayList<List<String>>();
335          for (int i = 0; i < baseOne.length(); i++) {
336            List<String> codon = Arrays.asList(Character
337                .toString(baseOne.charAt(i)),
338                Character.toString(baseTwo.charAt(i)), Character.toString(baseThree
339                    .charAt(i)));
340            codons.add(codon);
341          }
342          return codons;
343        }
344    
345        private List<String> aminoAcids() {
346          return split(aminoAcidString);
347        }
348    
349        private List<String> startCodons() {
350          return split(startCodons);
351        }
352    
353        private List<String> split(String string) {
354          List<String> split = new ArrayList<String>();
355          for (int i = 0; i < string.length(); i++) {
356            split.add(Character.toString(string.charAt(i)));
357          }
358          return split;
359        }
360      }
361    }